def importcsv(routine, csvfile, column="stemmed"):
    print """
    ========================================
    ---- IMPORT SEGMENTS FROM REFINE CSV ----
    ========================================
    """
    log_routine(routine, entry="importcsv started", completion=0)
    transaction.commit()

    rows = list(csvfile)
    totalrows = len(rows)

    for i, row in enumerate(rows):
        # update stemmed_refined cell
        try:
            s = Segment.objects.get(id=row["segment_id"])
            buffer_stemmed = s.stemmed
            s.stemmed = row["concept"]
            s.stemmed_refined = buffer_stemmed
            s.save()
        except Exception, e:
            # print "segment id %s was not found!" % row['segment_id']
            close_routine(routine, error="Exception: %s %s" % (e, row), status="ERR")
            transaction.commit()
            return

        if i % 25 == 0:
            log_routine(routine, entry="importcsv at line: %s" % i, completion=i / float(totalrows))
            print i, i / float(totalrows)
            transaction.commit()

    # completed import csv
    # @todo: we need to reintegrate similarity( corpus, routine )
    log_routine(routine, entry="completed importcsv at line: %s" % i, completion=1.0)
    close_routine(routine, status="OK")
    transaction.commit()
def clean(corpus, routine):
    print """
    ========================
    ---- CLEAN SEGMENTS ----
    ========================
    """
    # change routine type
    routine.type = "CLEAN"
    routine.save()
    transaction.commit()

    number_of_links = Document_Segment.objects.filter(document__corpus=corpus).count()
    loops = int(math.ceil(number_of_links / 25.0))

    # print "corpus: %s" % corpus.json()
    # print "number_of_links: %s" % number_of_links
    # print "loops: %s" % loops

    try:
        # delete document-segment links (and their segments) in batches of 25
        for i in range(0, loops):
            for j in Document_Segment.objects.filter(document__corpus=corpus)[0:25]:
                j.segment.delete()
                j.delete()
            log_routine(routine, completion=float(i) / loops)
            transaction.commit()
        log_routine(routine, completion=1.0)
    except Exception, e:
        close_routine(routine, error="Found Wally at line 149, Exception: %s" % e, status="ERR")
    transaction.commit()
def tf(corpus, routine, completion_start=0.0, completion_score=1.0):
    print """
    ======================
    ---- TF, RELOADED ----
    ======================
    """
    # change routine type
    routine.type = "TF"
    routine.save()
    transaction.commit()

    # get percentage info:
    number_of_documents_segments = Document_Segment.objects.count()
    logger.info("number_of_documents_segments: %s" % number_of_documents_segments)

    if number_of_documents_segments == 0:
        logger.error("TF: not enough segments in your corpus. Try 'standard' routine first...")
        close_routine(routine, error="Not enough segments in your corpus. Try 'standard' routine first...", status="ERR")
        transaction.commit()
        return

    current_segment = 0

    for d in Document.objects.filter(corpus=corpus):
        # print "document: %s" % d
        number_of_stems_per_document = Document_Segment.objects.filter(
            document=d, segment__status='IN'
        ).values('segment__stemmed').distinct().count()
        # print "number_of_stems_per_document: %s" % number_of_stems_per_document
        logger.info("document '%s' [%s], number of 'pseudo-stems': %s" % (d.title, d.id, number_of_stems_per_document))

        for ds in Document_Segment.objects.filter(document=d, segment__status='IN'):
            # count aliases (segments sharing the same stemmed version)
            number_of_aliases = Document_Segment.objects.filter(
                document=d, segment__status='IN', segment__stemmed=ds.segment.stemmed
            ).count()

            ds.tf = float(number_of_aliases) / number_of_stems_per_document
            ds.save()

            if number_of_aliases > 1:
                # print just some lines
                pass
                # print ds.segment.content, ds.segment.stemmed, number_of_aliases, ds.tf

            if current_segment % 25 == 0:
                logger.info("document '%s' [%s], completion: %s" % (d.title, d.id, float(current_segment) / number_of_documents_segments))
                log_routine(routine, completion=completion_start + (float(current_segment) / number_of_documents_segments) * (completion_score - completion_start))
                transaction.commit()

            current_segment = current_segment + 1

        logger.info("document '%s' [%s] completed!" % (d.title, d.id))

    if completion_score == 1.0:
        logger.info("closing routine, task completed")
        close_routine(routine, error="", status="OK")
        transaction.commit()
def tf(corpus, routine, completion_start=0.0, completion_score=1.0):
    print """
    ======================
    ---- TF, RELOADED ----
    ======================
    """
    # change routine type
    routine.type = "TF"
    routine.save()
    transaction.commit()

    # get percentage info:
    number_of_documents_segments = Document_Segment.objects.count()
    print "number_of_documents_segments: %s" % number_of_documents_segments

    if number_of_documents_segments == 0:
        close_routine(routine, error="Not enough segments in your corpus. Try 'standard' routine first...", status="ERR")
        transaction.commit()
        return

    current_segment = 0

    for d in Document.objects.filter(corpus=corpus):
        print "document: %s" % d
        number_of_stems_per_document = (
            Document_Segment.objects.filter(document=d).values("segment__stemmed").distinct().count()
        )
        print "number_of_stems_per_document: %s" % number_of_stems_per_document

        for ds in Document_Segment.objects.filter(document=d):
            # count aliases (segments sharing the same stemmed version)
            number_of_aliases = Document_Segment.objects.filter(document=d, segment__stemmed=ds.segment.stemmed).count()

            ds.tf = float(number_of_aliases) / number_of_stems_per_document
            ds.save()

            if number_of_aliases > 1:
                # print just some lines
                print ds.segment.content, ds.segment.stemmed, number_of_aliases, ds.tf

            if current_segment % 25 == 0:
                log_routine(
                    routine,
                    completion=completion_start + (float(current_segment) / number_of_documents_segments) * (completion_score - completion_start),
                )
                transaction.commit()

            current_segment = current_segment + 1

    if completion_score == 1.0:
        close_routine(routine, error="", status="OK")
        transaction.commit()
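# --- illustration only ----------------------------------------------------
# A minimal, self-contained sketch of the term-frequency weighting computed by
# tf() above, kept outside the Django ORM. The helper name and the toy data
# are assumptions added for illustration; the ratio itself (aliases of a stem
# over the number of distinct stems in the document) mirrors the loop above.
def _tf_sketch(stems):
    # stems: list of stemmed segments belonging to a single document,
    # e.g. ["bank", "bank", "crisis"] -> {"bank": 1.0, "crisis": 0.5}
    distinct = float(len(set(stems)))
    return dict((s, stems.count(s) / distinct) for s in set(stems))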
def importcsv(routine, csvfile, corpus, column="stemmed"):
    print """
    ========================================
    ---- IMPORT SEGMENTS FROM REFINE CSV ----
    ========================================
    """
    log_routine(routine, entry="importcsv started", completion=0)
    transaction.commit()

    rows = list(csvfile)
    totalrows = len(rows)
    logger.info("[corpus:%s] import csv, total rows: %s" % (corpus.name, totalrows))

    for i, row in enumerate(rows):
        try:
            segment_id = row['segment_id']
            concept = row['concept']
            status = 'IN' if row['status'] == 'IN' else 'OUT'
        except KeyError, e:
            logger.error("KeyError exception: %s" % e)
            close_routine(routine, error="Exception: %s %s" % (e, row), status="ERR")
            transaction.rollback()
            return

        # update stemmed_refined cell
        try:
            s = Segment.objects.get(id=segment_id)
            # update all the similar segments with the brand new segment
            Segment.objects.filter(stemmed=s.stemmed, documents__corpus=corpus).update(stemmed=concept, status=status)
            # buffer_stemmed = s.stemmed
            # s.stemmed = concept
            # s.status = status
            # s.stemmed_refined = buffer_stemmed
            # s.save()
        except Segment.DoesNotExist, e:
            logger.error("[corpus:%s] import csv, row %s raised exception: %s" % (corpus.name, i, e))
            # print "segment id %s was not found!" % row['segment_id']
            close_routine(routine, error="Exception: %s %s" % (e, row), status="ERR")
            transaction.commit()
            return
        except IndexError, e:
            logger.error(
                "[corpus:%s] import csv, row %s raised IndexError (segment does not belong to the given corpus?): %s"
                % (corpus.name, i, e)
            )
            transaction.commit()
            return

        if i % 25 == 0:
            log_routine(routine, entry="importcsv at line: %s" % i, completion=i / float(totalrows))
            # print i, i / float(totalrows)
            transaction.commit()

    # completed import csv
    # @todo: we need to reintegrate similarity( corpus, routine )
    logger.info("[corpus:%s] import csv completed, total rows: %s" % (corpus.name, totalrows))
    log_routine(routine, entry="completed importcsv at line: %s" % i, completion=1.0)
    close_routine(routine, status="OK")
    transaction.commit()
@transaction.commit_manually
def similarity(corpus, routine, completion_start=0.0, completion_score=1.0):
    print """
    ====================
    ---- SIMILARITY ----
    ====================
    """
    log_routine(
        routine,
        entry="similarity measurement started successfully",
        completion=0.0,
        completion_start=completion_start,
        completion_score=completion_score,
    )

    # 1. get number of 'IN' documents
    number_of_documents = Document.objects.filter(corpus=corpus, status="IN").count()
    logger.info("[%s:%s] SIMILARITY on %s 'IN' documents" % (corpus.name, corpus.id, number_of_documents))

    # 2. dictionary where keys are the ids of corpus documents and values are the lists of their stemmed segments
    documents = {}

    # out some information
    # print "[info] corpus:", corpus.json()
    # print "[info] document in corpus:", number_of_documents

    # get the list of every stemmed segment inside each document.
    # the distance algorithm will work on "stemmed" documents!
    # @todo: verify that PATTERN distance measurement works well with this algorithm.
    cursor = connection.cursor()
    cursor.execute(
        """
        SELECT count(*) FROM anta_document_segment ds
        JOIN anta_segment s ON ds.segment_id = s.id
        JOIN anta_document d ON ds.document_id = d.id
        WHERE d.corpus_id = %s AND s.status='IN'
        """,
        [corpus.id],
    )
    number_of_stems = cursor.fetchone()[0]

    number_of_groups = cursor.execute(
        """
        SELECT s.stemmed, d.id as document_id FROM anta_document_segment ds
        JOIN anta_segment s ON ds.segment_id = s.id
        JOIN anta_document d ON ds.document_id = d.id
        WHERE d.corpus_id = %s AND d.status='IN' AND s.status='IN'
        """,
        [corpus.id],
    )  # we do not need ORDER BY d.id, ds.id
    logger.info("[%s:%s] %s document_segment found" % (corpus.name, corpus.id, number_of_groups))

    log_routine(
        routine,
        entry="similarity measurement started",
        completion=0.1,
        completion_start=completion_start,
        completion_score=completion_score,
    )

    # @todo: improve remapping
    for row in cursor.fetchall():
        stemmed, document_id = row
        # print document_id
        if document_id not in documents:
            documents[document_id] = []
        documents[document_id].append(stemmed)

    logger.info("[%s:%s] creating documents for Pattern" % (corpus.name, corpus.id))

    # translate corpus id
    pattern_id_translation = {}

    # reformat each document into a PATTERN compatible document: join space separated stemmed segment values.
    for d in documents:
        documents[d] = pvDocument(" ".join(documents[d]))
        log_routine(
            routine,
            entry="join stemmed segments",
            completion=0.15,
            completion_start=completion_start,
            completion_score=completion_score,
        )
        pattern_id_translation[documents[d].id] = d

    # print "[info] document with segments in corpus:", len(pattern_id_translation)
    logger.info("[%s:%s] %s documents created for Pattern" % (corpus.name, corpus.id, len(pattern_id_translation)))

    # store documents in a Pattern corpus.
    c = pvCorpus(documents.values())

    # compute and save similarities
    for counter, d in enumerate(documents):
        # print counter, "neighbors of", documents[d]
        neighbors = c.neighbors(documents[d], top=number_of_documents)
        if len(neighbors) == 0:
            logger.warning("no neighbors for document: %s" % pattern_id_translation[documents[d].id])
            continue

        # print "%s neighbors found for document: %s" % (len(neighbors), pattern_id_translation[documents[d].id])
        logger.info(
            "[%s:%s] %s neighbors found for document: %s, completion: %s"
            % (
                corpus.name,
                corpus.id,
                len(neighbors),
                pattern_id_translation[documents[d].id],
                (counter / float(number_of_documents)),
            )
        )

        for n in neighbors:
            alpha_id = pattern_id_translation[documents[d].id]
            omega_id = pattern_id_translation[n[1].id]
            cosine_similarity = n[0]
            try:
                dist = Distance.objects.get(alpha__id=alpha_id, omega__id=omega_id)
                # print "[info] distance exists ( %s - %s ), old value: %s, difference: %s" % (alpha_id, omega_id, dist.cosine_similarity, (dist.cosine_similarity - cosine_similarity))
            except Distance.DoesNotExist, e:
                # print e
                dist = Distance(alpha_id=alpha_id, omega_id=omega_id)
                # print "[info] create Distance object", dist.id, cosine_similarity
            # a distance now exists between these two documents
            dist.cosine_similarity = cosine_similarity
            dist.save()
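# --- illustration only ----------------------------------------------------
# A hedged sketch of how the stored Distance rows could be read back once
# similarity() has run. It assumes only the fields used above (alpha, omega,
# cosine_similarity); the helper name and the "top 5" default are illustrative
# additions, not part of the original module.
def _closest_documents_sketch(document_id, limit=5):
    return (
        Distance.objects.filter(alpha__id=document_id)
        .order_by("-cosine_similarity")
        .values_list("omega__id", "cosine_similarity")[:limit]
    )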
def tfidf(corpus, routine, completion_start=0.0, completion_score=1.0, column="stemmed"):
    print """
    ===========================
    ---- TFIDF COMPUTATION ----
    ===========================
    """
    # routine.type = "TFIDF"
    # routine.save()

    # 1. get number of 'IN' documents
    number_of_documents = Document.objects.filter(corpus=corpus, status="IN").count()
    logger.info("[%s:%s] TFIDF on %s 'IN' documents" % (corpus.name, corpus.id, number_of_documents))

    # 2. get all languages in corpus
    number_of_languages = Document.objects.values("language").distinct().count()

    # 3. get GLOBAL number of segments (aka stems: segments grouped by their stemmed version;
    #    they do not need to be in the same corpus!)
    number_of_stems = (
        Segment.objects.values("stemmed", "language").annotate(Count("stemmed"), Count("language")).count()
    )
    logger.info("[%s:%s] %s segment groups" % (corpus.name, corpus.id, number_of_stems))
    # SELECT COUNT(*), stemmed FROM anta_segment GROUP BY stemmed

    # out some information
    # print "[info] column:", column
    # print "[info] corpus:", corpus.json()
    # print "[info] document in corpus:", number_of_documents
    # print "[info] stems in corpus (grouped by stemmed, language):", number_of_stems

    cursor = connection.cursor()

    # global counter (all languages cycle)
    current_stem = 0

    # 4. for each language, perform a tfidf
    for i in (
        Document.objects.filter(corpus=corpus, status="IN")
        .values("language")
        .annotate(num_document=Count("language"))
        .distinct()
    ):
        # print "[info] language info: ", i
        logger.info("[%s:%s] language %s" % (corpus.name, corpus.id, i["language"]))
        language = i["language"]

        # count tfidf groups
        # SELECT COUNT(*), stemmed FROM anta_segment WHERE language="EN" GROUP BY stemmed
        stem_count = Segment.objects.filter(language=language).values("stemmed").annotate(Count("stemmed")).count()

        # check length. If it's 0, exit with error....
        if stem_count == 0:
            logger.error(
                "[%s:%s] not enough segments in your corpus with language %s"
                % (corpus.name, corpus.id, i["language"])
            )
            close_routine(routine, error="Not enough segments in your corpus. Try standard routine first...", status="ERR")
            transaction.commit()
            return

        # 5. for each segment in this language...
        number_of_groups = cursor.execute(
            """
            SELECT COUNT( DISTINCT ds.document_id ) as distribution, s.language, s.stemmed
            FROM `anta_document_segment` ds
            JOIN anta_segment s ON ds.segment_id = s.id
            JOIN anta_document d ON d.id = ds.document_id
            WHERE d.corpus_id = %s AND s.language = %s AND s.status = 'IN'
            GROUP BY s.stemmed
            ORDER BY distribution DESC, stemmed ASC""",
            [corpus.id, language],
        )
        logger.info("[%s:%s] distribution query executed for language '%s'" % (corpus.name, corpus.id, language))

        language_cursor = 0.0

        for row in dictfetchall(cursor):
            # increment global and per-language counters (stats)
            current_stem = current_stem + 1
            language_cursor += 1

            # store tfidf inside each segment-document relationship
            try:
                dss = Document_Segment.objects.filter(segment__stemmed=row["stemmed"], segment__language=language)
                df = float(row["distribution"]) / number_of_documents
                # print float(current_stem) / number_of_stems * 100.0, row[column], row['distribution'], df
            except Exception, e:
                logger.exception("[%s:%s] unhandled exception ..." % (corpus.name, corpus.id))
                # print e
                close_routine(routine, error="Found Wally at line 281 of metrics.py, Exception: %s" % e, status="ERR")
                transaction.commit()
                return

            for ds in dss:
                ds.tfidf = ds.tf * math.log(1 / df)
                ds.save()

            if current_stem % 75 == 0:
                try:
                    completion = completion_start + (float(current_stem) / number_of_stems) * (
                        completion_score - completion_start
                    )
                    logger.info(
                        "[%s:%s] language completion: %s"
                        % (corpus.name, corpus.id, (float(current_stem) / number_of_stems))
                    )
                    log_routine(routine, completion=completion)
                    # save percentage and commit transaction
                    transaction.commit()
                except Exception, e:
                    logger.exception("[%s:%s] unhandled exception ..." % (corpus.name, corpus.id))
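# --- illustration only ----------------------------------------------------
# A small numeric sketch of the weighting applied in tfidf() above; the toy
# numbers are assumptions added for illustration. With 10 'IN' documents and a
# stem distributed over 4 of them, df = 4/10, so each related document-segment
# gets tfidf = tf * log(1/df) = tf * log(10/4) (natural log, as math.log above).
def _idf_sketch(number_of_documents, distribution):
    # mirrors the df/idf computation inside the tfidf() loop
    df = float(distribution) / number_of_documents
    return math.log(1 / df)
# e.g. _idf_sketch(10, 4) ~= 0.916, so tf = 0.05 would give tfidf ~= 0.046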
def decant(corpus, routine, ref_completion=1.0):
    # path = settings.MEDIA_ROOT + options.corpus
    # print NL_STOPWORDS
    # get document corpus
    print "[info] starting pattern analysis on corpus:", corpus.id, corpus.name
    log_routine(routine, entry="[info] starting pattern analysis on corpus: %s %s" % (corpus.id, corpus.name))

    # debug guard left in place: decant() currently stops here
    raise Exception("stop here")
    log_routine(routine, entry="after exception raised")
    return

    # create or resume analysis
    try:
        analysis = Analysis.objects.get(corpus=corpus, type="PT")
        print "[info] analysis:", analysis.id, "found, current document:", analysis.document
        if analysis.status == "ERR":
            print "[info] analysis blocked:", analysis.id, "with ERR status, force restart.."
        elif analysis.status != "OK":
            print "[warning] analysis:", analysis.id, "not completed, status:", analysis.status
            print "[info] restart analysis."
    except Analysis.DoesNotExist:
        analysis = Analysis(corpus=corpus, type="PT", start_date=datetime.utcnow(), status="CRE")
        analysis.save()
        print "[info] analysis created:", analysis.id, "at", analysis.start_date

    print "[info] analysis status:", analysis.id, "at", analysis.status

    if analysis.document is None:
        documents = Document.objects.filter(corpus__id=corpus.id)
        print "[info] analysis document is None, then start from first available document"
    else:
        documents = Document.objects.filter(corpus__id=corpus.id, id__gt=analysis.document.id)
        if documents.count() == 0:
            documents = Document.objects.filter(corpus__id=corpus.id)

    # pending status for current analysis
    analysis.status = "PN"
    analysis.save()

    for d in documents:
        print
        print " document ", d.id
        print " ---------------------------------------"
        print
        # a = Analysis( document=d, )
        print "[info] document mimetype:", d.mime_type

        textified = textify(d, settings.MEDIA_ROOT)
        if textified == False:
            print "[error] while textify file ", d.id, d.title
            analysis.status = "ERR"
            analysis.save()
            return

        textified = textified.replace("%20", " ")

        # update analysis with current valid document
        analysis.document = d
        analysis.save()

        # load stopwords for document d language
        if d.language == "NL":
            stopwords = NL_STOPWORDS
        else:
            stopwords = EN_STOPWORDS

        print "[info] document language:", d.language
        print "[info] analysis started on doc ", d.id, "'", d.title, "'", d.language.lower(), "file:", textified

        # start distill analysis, excluding the given stopwords
        distilled = distill(filename=textified, language=d.language.lower(), stopwords=stopwords)

        # append keywords as tags for the document
        for k in distilled['keywords']:
            # print k
            candidate = k[1]
            # get tag
            try:
                t = Tag.objects.get(name=candidate, type="keyword")
            except:
                # todo: lemma version of a word according to language
                t = Tag(name=candidate, type="keyword")
                try:
                    t.save()
                except:
                    print "[warning] unable to save as tag:", candidate
                    continue
            # set tag-document relation
            try:
                td = Document_Tag(document=d, tag=t)
                td.save()
            except:
                # relation already exists, continue
                pass

        # segments
        first = True
        for segment in distilled['segments']:
            if len(segment[0]) > 128:
                print "[warning] 'segment' longer than 128 chars, skipping:", segment[0]
                continue
            try:
                s = Segment.objects.get(content=segment[0][:128], language=d.language)
            except:
                s = Segment(content=segment[0][:128], stemmed=re.sub(r"\s+", ' ', " ".join(segment[1])[:128]), language=d.language)
                try:
                    s.save()
                except:
                    print "[warning] unable to save segment:", segment[0][:128]
                    continue
            try:
                sd = Document_Segment.objects.get(document=d, segment=s)
            except:
                # relationship does not exist yet
                sd = Document_Segment(document=d, segment=s, tf=segment[2])
                sd.save()
            if first:
                print "[info] sample 'segment' saved:", s.id, s.content, ", stem:", s.stemmed, ", tf:", sd.tf

            # save concepts and attach them to the segment
            for k in segment[1]:
                # ignore numbers
                k = re.sub(r"[\d\-\.]+", "", k)
                if len(k) < 2:
                    continue
                try:
                    c = Concept.objects.get(content=k, language=d.language)
                except:
                    try:
                        c = Concept(content=k, language=d.language)
                        c.save()
                    except:
                        print "[warning] unable to save concept: ", k
                        continue
                try:
                    sc = Segment_Concept.objects.get(segment=s, concept=c)
                except:
                    sc = Segment_Concept(segment=s, concept=c)
                    sc.save()
                if first:
                    print "[info] sample 'concept' saved:", c.id, c.content
                    first = False

        print "[info] analysis ended on doc", d.id, "'", d.title, "'"

    print "[info] analysis completed on corpus:", corpus.id
    analysis.status = "OK"
    analysis.end_date = datetime.utcnow()
    analysis.save()
@transaction.commit_manually
def similarity(corpus, routine, completion_start=0.0, completion_score=1.0):
    print """
    ====================
    ---- SIMILARITY ----
    ====================
    """
    log_routine(
        routine,
        entry="similarity measurement started successfully",
        completion=0.0,
        completion_start=completion_start,
        completion_score=completion_score,
    )
    transaction.commit()

    # 1. get number of documents
    number_of_documents = Document.objects.filter(corpus=corpus).count()

    # 2. dictionary where keys are the ids of corpus documents and values are the lists of their stemmed segments
    documents = {}

    # out some information
    print "[info] corpus:", corpus.json()
    print "[info] document in corpus:", number_of_documents

    # get the list of every stemmed segment inside each document.
    # the distance algorithm will work on "stemmed" documents!
    # @todo: verify that PATTERN distance measurement works well with this algorithm.
    cursor = connection.cursor()
    cursor.execute(
        """
        SELECT count(*) FROM anta_document_segment ds
        JOIN anta_segment s ON ds.segment_id = s.id
        JOIN anta_document d ON ds.document_id = d.id
        WHERE d.corpus_id = %s
        """,
        [corpus.id],
    )
    number_of_stems = cursor.fetchone()[0]

    cursor.execute(
        """
        SELECT s.stemmed, d.id as document_id FROM anta_document_segment ds
        JOIN anta_segment s ON ds.segment_id = s.id
        JOIN anta_document d ON ds.document_id = d.id
        WHERE d.corpus_id = %s
        """,
        [corpus.id],
    )  # we do not need ORDER BY d.id, ds.id

    log_routine(
        routine,
        entry="similarity measurement started",
        completion=0.1,
        completion_start=completion_start,
        completion_score=completion_score,
    )

    # @todo: improve remapping
    for row in cursor.fetchall():
        stemmed, document_id = row
        # print document_id
        if document_id not in documents:
            documents[document_id] = []
        documents[document_id].append(stemmed)

    # translate corpus id
    pattern_id_translation = {}

    # reformat each document into a PATTERN compatible document: join space separated stemmed segment values.
    for d in documents:
        documents[d] = pvDocument(" ".join(documents[d]))
        log_routine(
            routine,
            entry="join stemmed segments",
            completion=0.15,
            completion_start=completion_start,
            completion_score=completion_score,
        )
        pattern_id_translation[documents[d].id] = d

    # store documents in a Pattern corpus.
    c = pvCorpus(documents.values())

    # compute and save similarities
    for counter, d in enumerate(documents):
        print counter
        for n in c.neighbors(documents[d], top=number_of_documents):
            alpha_id = pattern_id_translation[documents[d].id]
            omega_id = pattern_id_translation[n[1].id]
            cosine_similarity = n[0]
            try:
                dist = Distance.objects.get(alpha__id=alpha_id, omega__id=omega_id)
                print "[info] distance exists ( %s - %s ), old value: %s, difference: %s" % (
                    alpha_id,
                    omega_id,
                    dist.cosine_similarity,
                    (dist.cosine_similarity - cosine_similarity),
                )
            except Exception, e:
                print e
                dist = Distance(alpha_id=alpha_id, omega_id=omega_id)
                print "[info] create Distance object", dist.id, cosine_similarity
            # a distance now exists between these two documents
            dist.cosine_similarity = cosine_similarity
            dist.save()

        log_routine(
            routine,
            entry="neighbors computation",
            completion=(counter + 1.0) / number_of_documents,
            completion_start=completion_start,
            completion_score=completion_score,
        )
        transaction.commit()