def clean(corpus, routine): print """ ======================== ---- CLEAN SEGMENTS ---- ======================== """ # change routine type routine.type = "CLEAN" routine.save() transaction.commit() number_of_links = Document_Segment.objects.filter(document__corpus=corpus).count() loops = int(math.ceil(number_of_links / 25.0)) # print "corpus: %s" % corpus.json() # print "number_of_links: %s" % number_of_links # print "loops: %s" % loops try: # manually change for i in range(0, loops): for j in Document_Segment.objects.filter(document__corpus=corpus)[0:25]: j.segment.delete() j.delete() log_routine(routine, completion=float(i) / loops) transaction.commit() log_routine(routine, completion=1.0) except Exception, e: close_routine(routine, error="Ho trovato Wally 149, Exception: %s" % e, status="ERR") transaction.commit()
def importcsv(routine, csvfile, column="stemmed"): print """ ======================================== ---- IMPORT SEGMENTS ROM REFINE CSV ---- ======================================== """ log_routine(routine, entry="importcsv started", completion=0) transaction.commit() rows = list(csvfile) totalrows = len(rows) for i, row in enumerate(rows): # update stemmed_refined cell try: s = Segment.objects.get(id=row["segment_id"]) buffer_stemmed = s.stemmed s.stemmed = row["concept"] s.stemmed_refined = buffer_stemmed s.save() except Exception, e: # print " segemnt id %s was not found!" % row['segment_id'] close_routine(routine, error="Exception: %s %s" % (e, row), status="ERR") transaction.commit() return if i % 25 == 0: log_routine(routine, entry="importcsv at line: %s" % i, completion=i / float(totalrows)) print i, i / float(totalrows) transaction.commit()
def tf_tfidf(corpus, routine):
    """Run the full TF -> TFIDF -> similarity pipeline on *corpus*.

    Each stage is handed a slice of the 0..1 completion range; the pipeline
    stops at the first stage that leaves the routine in status "ERR".
    An empty corpus closes the routine immediately.
    """
    doc_total = Document.objects.filter(corpus=corpus).count()

    # nothing to compute on an empty corpus: close and bail out
    if not doc_total:
        return close_routine(routine, error="tfidf routine: no document found", status="OK")

    logger.info("starting TF computation on corpus '%s' [%s], %s documents" % (corpus.name, corpus.id, doc_total))
    tf(corpus=corpus, routine=routine, completion_start=0.0, completion_score=0.4)
    logger.info("TF computation done")
    if routine.status == "ERR":
        logger.error("TF computation failed on corpus '%s' [%s]" % (corpus.name, corpus.id))
        return
    logger.info("TF computation done on corpus '%s' [%s]" % (corpus.name, corpus.id))

    logger.info("starting TFIDF computation on corpus '%s' [%s], %s documents" % (corpus.name, corpus.id, doc_total))
    tfidf(corpus=corpus, routine=routine, completion_start=0.4, completion_score=0.4)
    if routine.status == "ERR":
        logger.error("TFIDF computation failed on corpus '%s' [%s]" % (corpus.name, corpus.id))
        return
    logger.info("TFIDF computation done on corpus '%s' [%s]" % (corpus.name, corpus.id))

    logger.info("starting SIMILARITY computation on corpus '%s' [%s], %s documents" % (corpus.name, corpus.id, doc_total))
    similarity(corpus=corpus, routine=routine, completion_start=0.8, completion_score=0.2)
    logger.info("SIMILARITY computation done on corpus '%s' [%s]" % (corpus.name, corpus.id))

    close_routine(routine, error="", status="OK")
def tf( corpus, routine, completion_start=0.0, completion_score=1.0 ): print """ ====================== ---- TF, RELOADED ---- ====================== """ # change routine type routine.type = "TF" routine.save() transaction.commit() # get percentage info: number_of_documents_segments = Document_Segment.objects.count() logger.info( "number_of_documents_segments: %s" % number_of_documents_segments ) if number_of_documents_segments == 0: logger.error( "TF Not enought segments in your corpus. Try 'standard' routine first..." ) close_routine( routine, error="Not enought segments in your corpus. Try 'standard' routine first...", status="ERR") transaction.commit() return current_segment = 0 for d in Document.objects.filter(corpus=corpus): # print "document: %s" % d number_of_stems_per_document = Document_Segment.objects.filter( document=d, segment__status='IN' ).values('segment__stemmed').distinct().count() # print "number_of_stems_per_document: %s" % number_of_stems_per_document logger.info( "document '%s' [%s], number of 'pseudo-stems': %s" % (d.title, d.id, number_of_stems_per_document) ) for ds in Document_Segment.objects.filter( document=d, segment__status='IN'): # count alliases( segment with same stemmed version ) number_of_aliases = Document_Segment.objects.filter( document=d, segment__status='IN', segment__stemmed=ds.segment.stemmed ).count() ds.tf = float(number_of_aliases) / number_of_stems_per_document ds.save() if number_of_aliases > 1: # print just some lines pass # print ds.segment.content, ds.segment.stemmed, number_of_aliases, ds.tf if current_segment % 25 == 0: logger.info( "document '%s' [%s], completion: %s" % (d.title, d.id, ( float(current_segment) / number_of_documents_segments ) ) ) log_routine( routine, completion = completion_start + ( float(current_segment) / number_of_documents_segments ) * (completion_score-completion_start) ) transaction.commit() current_segment = current_segment + 1 logger.info( "document '%s' [%s] completed!" 
% (d.title, d.id ) ) if completion_score == 1.0: logger.info( "closing routine, task completed" ) close_routine( routine, error="", status="OK" ) transaction.commit()
def standard(corpus, routine):
    """Kick off the standard pipeline: bail out on an empty corpus, then decant."""
    doc_total = Document.objects.filter(corpus=corpus).count()

    # without documents there is nothing to distill
    if not doc_total:
        return close_routine(routine, error="No document found", status="ERR")

    # 1. distiller.decant (tf computation)
    try:
        decant(corpus=corpus, routine=routine, settings=settings, ref_completion=0.5)
    except Exception as e:
        return close_routine(routine, error="Exception: %s" % e, status="ERR")
def tf(corpus, routine, completion_start=0.0, completion_score=1.0): print """ ====================== ---- TF, RELOADED ---- ====================== """ # change routine type routine.type = "TF" routine.save() transaction.commit() # get percentage info: number_of_documents_segments = Document_Segment.objects.count() print "number_of_documents_segments: %s" % number_of_documents_segments if number_of_documents_segments == 0: close_routine( routine, error="Not enought segments in your corpus. Try 'standard' routine first...", status="ERR" ) transaction.commit() return current_segment = 0 for d in Document.objects.filter(corpus=corpus): print "document: %s" % d number_of_stems_per_document = ( Document_Segment.objects.filter(document=d).values("segment__stemmed").distinct().count() ) print "number_of_stems_per_document: %s" % number_of_stems_per_document for ds in Document_Segment.objects.filter(document=d): # count alliases( segment with same stemmed version ) number_of_aliases = Document_Segment.objects.filter(document=d, segment__stemmed=ds.segment.stemmed).count() ds.tf = float(number_of_aliases) / number_of_stems_per_document ds.save() if number_of_aliases > 1: # print just some lines print ds.segment.content, ds.segment.stemmed, number_of_aliases, ds.tf if current_segment % 25 == 0: log_routine( routine, completion=completion_start + (float(current_segment) / number_of_documents_segments) * (completion_score - completion_start), ) transaction.commit() current_segment = current_segment + 1 if completion_score == 1.0: close_routine(routine, error="", status="OK") transaction.commit()
def standard(corpus, routine):
    """Open the standard routine on *corpus*: guard against an empty corpus, then decant."""
    doc_total = Document.objects.filter(corpus=corpus).count()
    logger.info("opening standard routine, corpus: '%s' [%s], %s documents" % (corpus.name, corpus.id, doc_total))

    # an empty corpus is not treated as a failure here: the routine is
    # closed with status "OK" but carries an explanatory error message
    if not doc_total:
        logger.error("routine closed, not enough documents")
        return close_routine(routine, error="standard routine: No document found", status="OK")

    # 1. distiller.decant (tf computation)
    try:
        decant(corpus=corpus, routine=routine, settings=settings, ref_completion=0.5)
    except Exception as e:
        logger.exception("Exception: %s" % e)
        return close_routine(routine, error="Exception: %s" % e, status="ERR")
def importcsv( routine, csvfile, corpus, column="stemmed" ):
    """Import refined segments from an OpenRefine CSV export into *corpus*.

    Each row must provide 'segment_id', 'concept' and 'status'; every
    segment of the corpus sharing the matched segment's stemmed value is
    bulk-updated to the new concept/status in one query.

    NOTE(review): no success-path close_routine()/final commit is visible in
    this chunk -- presumably the caller (or code past this view) finalizes
    the routine; confirm.
    """
    print """
    ========================================
    ---- IMPORT SEGMENTS ROM REFINE CSV ----
    ========================================
    """
    log_routine( routine, entry="importcsv started", completion=0 );
    transaction.commit()

    # materialize the reader so the total is known for completion reporting
    rows = list(csvfile)
    totalrows = len(rows)
    logger.info( "[corpus:%s] import csv, total rows: %s" % (corpus.name, totalrows) )

    for i, row in enumerate(rows):
        # pull the expected columns; any status other than 'IN' becomes 'OUT'
        try:
            segment_id = row['segment_id']
            concept = row['concept']
            status = 'IN' if row['status'] == 'IN' else 'OUT'
        except KeyError,e:
            # a malformed row aborts the whole import and rolls back
            logger.error( "KeyError exception: %s" % e )
            close_routine( routine, error="Exception: %s %s" % (e,row), status="ERR" )
            transaction.rollback()
            return
        # update stemmed_refined cell
        try:
            s = Segment.objects.get(id=segment_id)
            # update all the similar segment with the brand new segment
            Segment.objects.filter( stemmed=s.stemmed, documents__corpus=corpus ).update( stemmed=concept, status=status )
            # buffer_stemmed = s.stemmed
            # s.stemmed = concept
            # s.status = status
            # s.stemmed_refined = buffer_stemmed
            # s.save()
        except Segment.DoesNotExist, e:
            # unknown segment id: abort, committing the routine's ERR state
            logger.error( "[corpus:%s] import csv, row %s raised exception: %s" % (corpus.name, i,e) )
            #print " segemnt id %s was not found!" % row['segment_id']
            close_routine( routine, error="Exception: %s %s" % (e,row), status="ERR" )
            transaction.commit()
            return
def tf_tfidf(corpus, routine):
    """Run TF, TFIDF and similarity in sequence, stopping at the first failed stage."""
    doc_total = Document.objects.filter(corpus=corpus).count()
    if not doc_total:
        return close_routine(routine, error="No document found", status="ERR")

    # each stage owns a slice of the 0..1 completion range
    tf(corpus=corpus, routine=routine, completion_start=0.0, completion_score=0.4)
    if routine.status == "ERR":
        return

    tfidf(corpus=corpus, routine=routine, completion_start=0.4, completion_score=0.4)
    if routine.status == "ERR":
        return

    similarity(corpus=corpus, routine=routine, completion_start=0.8, completion_score=0.2)
    close_routine(routine, error="", status="OK")
def tfidf(corpus, routine, completion_start=0.0, completion_score=1.0, column="stemmed"): print """ =========================== ---- TFIDF COMPUTATION ---- =========================== """ # routine.type = "TFIDF" # routine.save() # 1. get number of document number_of_documents = Document.objects.filter(corpus=corpus, status="IN").count() logger.info("[%s:%s] TFIDF on %s 'IN' documents" % (corpus.name, corpus.id, number_of_documents)) # 2. get all languages in corpus number_of_languages = Document.objects.values("language").distinct().count() # 3. get GLOBAL number of segments (aka stems, segments grouped by their stemmed version: they do not need to be in the same corpus!!!) number_of_stems = ( Segment.objects.values("stemmed", "language").annotate(Count("stemmed"), Count("language")).count() ) logger.info("[%s:%s] %s segment groups" % (corpus.name, corpus.id, number_of_stems)) # SELECT COUNT(*), stemmed FROM anta_segment GROUP BY stemmed # out some information # print "[info] column:",column # print "[info] corpus:",corpus.json() # print "[info] document in corpus:",number_of_documents # print "[info] stems in corpus (grouped by stemmed, language):",number_of_stems cursor = connection.cursor() # global counter (all languages cycle) current_stem = 0 # 3. for each language, perform a tfidf for i in ( Document.objects.filter(corpus=corpus, status="IN") .values("language") .annotate(num_document=Count("language")) .distinct() ): # print "[info] language info: ",i logger.info("[%s:%s] language %s" % (corpus.name, corpus.id, i["language"])) language = i["language"] # count tfidf group # SELECT COUNT(*), stemmed FROM anta_segment WHERE language="EN" GROUP BY stemmed stem_count = Segment.objects.filter(language=language).values("stemmed").annotate(Count("stemmed")).count() # check length. If it's 0, exit with error.... 
if stem_count == 0: logger.error( "[%s:%s] not enought segments in your corpus with language %s" % (corpus.name, corpus.id, i["language"]) ) close_routine( routine, error="Not enought segments in your corpus. Try standard routine first...", status="ERR" ) transaction.commit() return # 5. for each segment in this language... number_of_groups = cursor.execute( """ SELECT COUNT( DISTINCT ds.document_id ) as distribution, s.language, s.stemmed FROM `anta_document_segment` ds JOIN anta_segment s ON ds.segment_id = s.id JOIN anta_document d ON d.id = ds.document_id WHERE d.corpus_id = %s AND s.language = %s AND s.status = 'IN' GROUP BY s.stemmed ORDER BY distribution DESC, stemmed ASC""", [corpus.id, language], ) logger.info("[%s:%s] distribution query executed for language '%s'" % (corpus.name, corpus.id, language)) language_cursor = 0.0 for row in dictfetchall(cursor): # increment global runner (stats) current_stem = current_stem + 1 language_cursor + 1 # store tfidf inside each segment-document relationships try: dss = Document_Segment.objects.filter(segment__stemmed=row["stemmed"], segment__language=language) df = float(row["distribution"]) / number_of_documents # print float(current_stem) / number_of_stems * 100.0, row[ column ], row['distribution'], df except Exception, e: logger.exception("[%s:%s] uhandled exception ..." 
% (corpus.name, corpus.id)) # print e close_routine( routine, error="Ho trovato Wally alla liena 281 di metrics.py, Exception: %s" % e, status="ERR" ) transaction.commit() return for ds in dss: ds.tfidf = ds.tf * math.log(1 / df) ds.save() if current_stem % 75 == 0: try: completion = completion_start + (float(current_stem) / number_of_stems) * ( completion_score - completion_start ) logger.info( "[%s:%s] language completion: %s" % (corpus.name, corpus.id, (float(current_stem) / number_of_stems)) ) log_routine(routine, completion=completion) # save percentage and commit transaction transaction.commit() except Exception, e: logger.exception("[%s:%s] uhandled exception ..." % (corpus.name, corpus.id))
) if number_of_documents == 0: logger.error("routine closed, not enough documents") return close_routine(routine, error="standard routine: No document found", status="OK") # 1. distiller.decant (tf computation ) try: decant(corpus=corpus, routine=routine, settings=settings, ref_completion=0.5) except Exception, e: logger.exception("Exception: %s" % e) return close_routine(routine, error="Exception: %s" % e, status="ERR") # 2. get all languages in corpus tf_tfidf(corpus=corpus, routine=routine) close_routine(routine, error="", status="OK") @transaction.commit_manually def entities_alchemy(corpus, routine): print """ ================================= ---- ENTITIES VIA ALCHEMYAPI ---- ================================= """ from services import alchemy number_of_documents = Document.objects.filter(corpus=corpus).count() # print "[info] number_of_documents:",number_of_documents # print "[info] alchmy apikey:", settings.ALCHEMY_API_KEY