#relation exist, continue analysis.completion = .5 analysis.save() # segment first = True for segment in distilled['segments']: if len(segment[0]) > 128: #print "[warning] sample 'segment' will be truncated:", segment[0] continue try: s = Segment.objects.get( content=segment[0][:128], language=d.language) except: s = Segment( content=segment[0][:128], stemmed=re.sub("\s+", ' ', " ".join(segment[1])[:128] ), language=d.language ) try: s.save() except Exception, e: #logger.warning("%s / %s FAILED distill document '%s' [%s], mimetype %s with Exception: %s" % ( i, total_count, d.title, d.id, d.mime_type, e) ) #print "[warning] unable to save segment:", segment[0][:128] continue try: sd = Document_Segment.objects.get( document=d, segment=s ) except: sd = Document_Segment( document=d, segment=s, tf=segment[2] ) sd.save() # relationship exist #if first: #print "[info] sample 'segment' saved:", s.id, s.content, ", stem:", s.stemmed ,", tf:", sd.tf
def decant(corpus, routine, settings, ref_completion=1.0):
    """Run the pattern ("PT") analysis over the documents of a corpus.

    For every document the text is extracted with ``textify`` and passed to
    ``distill``.  The resulting keywords are stored as ``Tag`` rows of type
    "keyword" linked through ``Document_Tag``; the resulting segments are
    stored as ``Segment`` rows linked through ``Document_Segment``, and each
    stem of a segment becomes a ``Concept`` linked through
    ``Segment_Concept``.

    The ``Analysis`` row of type "PT" for the corpus is reused when it
    exists.  If it already points at a document, processing resumes with
    the documents whose id is greater than that one, falling back to the
    whole corpus when none are left.

    Args:
        corpus: corpus object; its ``id`` and ``name`` are read.
        routine: routine the analysis is attached to; also handed to
            ``log_routine`` for progress reporting.
        settings: object exposing ``MEDIA_ROOT`` (passed to ``textify``).
        ref_completion: factor applied to the completion ratio reported
            through ``log_routine``.

    Raises:
        Exception: when ``textify`` returns ``False`` for a document; the
            analysis status is set to "ERR" before raising.
    """
    from sven.anta.models import Analysis, Routine, Segment, Segment_Concept, Document, Document_Segment, Document_Tag, Tag, Concept
    from sven.anta.utils import textify

    print("[info] starting pattern analysis on corpus: %s %s" % (corpus.id, corpus.name))
    log_routine(routine, entry="[info] starting pattern analysis on corpus: %s" % corpus.id)

    # Reuse the current analysis, if any; any other lookup error propagates.
    try:
        analysis = Analysis.objects.get(corpus=corpus, type="PT")
    except Analysis.DoesNotExist:
        analysis = Analysis(corpus=corpus, type="PT", start_date=datetime.now(), status="CRE")
        analysis.save()

    routine.analysis.add(analysis)
    routine.save()

    total_count = Document.objects.filter(corpus=corpus).count()

    # Select the documents to process (resume after the last handled one).
    if analysis.document is None:
        documents = Document.objects.filter(corpus__id=corpus.id)
        # An empty corpus must not crash on documents[0].
        head = list(documents[:1])
        if head:
            analysis.document = head[0]
    else:
        documents = Document.objects.filter(corpus__id=corpus.id, id__gt=analysis.document.id)
        if documents.count() == 0:
            documents = Document.objects.filter(corpus__id=corpus.id)

    # Pending status for the current analysis.
    analysis.status = "PN"
    analysis.save()

    # Loop-invariant patterns, compiled once.
    whitespace_re = re.compile(r"\s+")
    numeric_re = re.compile(r"[\d\-\.]+")

    # i counts the documents handled in this run, starting at 1.
    for i, d in enumerate(documents, 1):
        # Record the current document so a later run can resume from it.
        analysis.document = d
        analysis.save()
        log_routine(routine, completion=ref_completion * i / total_count)

        print("[info] document mimetype: %s" % (d.mime_type,))

        textified = textify(d, settings.MEDIA_ROOT)
        if textified == False:
            analysis.status = "ERR"
            analysis.save()
            raise Exception("error in textify function")
        textified = textified.replace("%20", " ")

        analysis.completion = 0.0
        analysis.save()

        # Stopwords for the document language.
        if d.language == "NL":
            stopwords = NL_STOPWORDS
        elif d.language == "EN":
            stopwords = EN_STOPWORDS
        else:
            stopwords = []

        print("[info] document language: %s" % (d.language,))
        print("[info] analysis started on doc  %s ' %s ' %s file: %s" % (d.id, d.title, d.language.lower(), textified))

        # Distill the text, excluding the given stopwords.
        distilled = distill(filename=textified, language=d.language.lower(), stopwords=stopwords)

        analysis.completion = .1
        analysis.save()

        # Attach keywords to the document as tags.
        for keyword in distilled['keywords']:
            candidate = keyword[1]
            try:
                t = Tag.objects.get(name=candidate, type="keyword")
            except Exception:
                # Any lookup failure falls back to creating the tag, as the
                # original bare except did.
                # todo lemma version of a word according to language
                t = Tag(name=candidate, type="keyword")
                try:
                    t.save()
                except Exception:
                    print("[warning] unable to save as tag: %s" % (candidate,))
                    continue

            try:
                td = Document_Tag(document=d, tag=t)
                td.save()
            except Exception:
                # relation exists
                continue

        analysis.completion = .5
        analysis.save()

        # Segments: only the first stored one (and its concepts) is echoed.
        first = True
        for segment in distilled['segments']:
            if len(segment[0]) > 128:
                # NOTE: the message says "truncated", but the segment is
                # actually skipped.
                print("[warning] sample 'segment' will be truncated: %s" % (segment[0],))
                continue

            content = segment[0][:128]
            try:
                s = Segment.objects.get(content=content, language=d.language)
            except Exception:
                s = Segment(
                    content=content,
                    stemmed=whitespace_re.sub(' ', " ".join(segment[1])[:128]),
                    language=d.language,
                )
                try:
                    s.save()
                except Exception:
                    print("[warning] unable to save segment: %s" % (content,))
                    continue

            try:
                sd = Document_Segment.objects.get(document=d, segment=s)
            except Exception:
                sd = Document_Segment(document=d, segment=s, tf=segment[2])
                sd.save()

            if first:
                print("[info] sample 'segment' saved: %s %s , stem: %s , tf: %s" % (s.id, s.content, s.stemmed, sd.tf))

            # Save each stem as a concept and attach it to the segment.
            for stem in segment[1]:
                # Ignore numbers: strip digits, dashes and dots.
                stem = numeric_re.sub("", stem)
                if len(stem) < 2:
                    continue

                try:
                    c = Concept.objects.get(content=stem, language=d.language)
                except Exception:
                    try:
                        c = Concept(content=stem, language=d.language)
                        c.save()
                    except Exception as e:
                        print("[warning] unable to save concept: %s, exception: %s" % (stem, e))
                        continue

                try:
                    sc = Segment_Concept.objects.get(segment=s, concept=c)
                except Exception:
                    sc = Segment_Concept(segment=s, concept=c)
                    sc.save()

                if first:
                    print("[info] sample 'concept' saved: %s %s" % (c.id, c.content))

            first = False

        print("[info] analysis ended on doc %s ' %s '" % (d.id, d.title))