def similarity(corpus, routine, completion_start=0.0, completion_score=1.0):
    """Measure cosine similarity between the 'IN' documents of ``corpus``.

    The stemmed values of the 'IN' segments attached to each 'IN' document
    are joined into one space-separated text, wrapped in a ``pvDocument`` and
    gathered in a ``pvCorpus``. For every document, each neighbour returned by
    the corpus is stored as a ``Distance`` row keyed by the ordered
    ``alpha``/``omega`` document ids; an existing row has its
    ``cosine_similarity`` overwritten, a missing one is created.

    NOTE(review): ``similarity`` is defined a second time further down; if
    both definitions live in the same module the later one rebinds the name.

    Args:
        corpus: object exposing ``id`` and ``name``; selects the documents.
        routine: passed through to ``log_routine`` for progress entries.
        completion_start: forwarded unchanged to ``log_routine``.
        completion_score: forwarded unchanged to ``log_routine``.

    Returns:
        None. Results are written through ``Distance.save()``.
    """
    print(
        """
    ====================
    ---- SIMILARITY ----
    ====================
    """
    )

    log_routine(
        routine,
        entry="similarity measurement started succesfully",
        completion=0.0,
        completion_start=completion_start,
        completion_score=completion_score,
    )

    # 1. number of 'IN' documents; also used as the `top` limit for neighbours.
    number_of_documents = Document.objects.filter(corpus=corpus, status="IN").count()
    logger.info(
        "[%s:%s] SIMILARITY on %s 'IN' documents",
        corpus.name,
        corpus.id,
        number_of_documents,
    )

    # 2. every stemmed segment of every document. The distance algorithm
    #    works on "stemmed" documents; no ORDER BY is needed.
    # @todo: verify that PATTERN distance measurement works well with this algorithm.
    cursor = connection.cursor()
    cursor.execute(
        """
        SELECT s.stemmed, d.id as document_id
        FROM anta_document_segment ds
            JOIN anta_segment s ON ds.segment_id = s.id
            JOIN anta_document d ON ds.document_id = d.id
        WHERE d.corpus_id = %s AND d.status='IN' AND s.status='IN'
        """,
        [corpus.id],
    )
    rows = cursor.fetchall()

    # Count the fetched rows ourselves: the return value of cursor.execute()
    # is not defined by the DB-API, so it cannot be relied on as a row count.
    logger.info(
        "[%s:%s] %s document_segment found", corpus.name, corpus.id, len(rows)
    )

    log_routine(
        routine,
        entry="similarity measurement started",
        completion=0.1,
        completion_start=completion_start,
        completion_score=completion_score,
    )

    # 3. group the stems by document id.
    # @todo: improve remapping
    documents = {}
    for stemmed, document_id in rows:
        documents.setdefault(document_id, []).append(stemmed)

    logger.info("[%s:%s] creating documents for Pattern", corpus.name, corpus.id)

    # 4. turn each list of stems into a Pattern document and remember which
    #    database id each Pattern document id stands for.
    pattern_id_translation = {}
    for document_id in documents:
        pattern_document = pvDocument(" ".join(documents[document_id]))
        documents[document_id] = pattern_document
        log_routine(
            routine,
            entry="join stemmed segments",
            completion=0.15,
            completion_start=completion_start,
            completion_score=completion_score,
        )
        pattern_id_translation[pattern_document.id] = document_id

    # Log how many documents were created, not the whole translation table.
    logger.info(
        "[%s:%s] %s documents created for Pattern",
        corpus.name,
        corpus.id,
        len(pattern_id_translation),
    )

    # 5. store the documents in a Pattern corpus.
    c = pvCorpus(documents.values())

    # 6. compute and save similarities.
    for counter, document_id in enumerate(documents):
        pattern_document = documents[document_id]
        alpha_id = pattern_id_translation[pattern_document.id]

        # Computed once and reused for both the emptiness check and the loop.
        neighbors = c.neighbors(pattern_document, top=number_of_documents)
        if not neighbors:
            logger.warning("no neighbors for document: %s", alpha_id)
            continue

        logger.info(
            "[%s:%s] %s neighbors found for document: %s, completion: %s",
            corpus.name,
            corpus.id,
            len(neighbors),
            alpha_id,
            counter / float(number_of_documents),
        )

        # Each neighbour is a (cosine similarity, Pattern document) pair.
        for cosine_similarity, neighbor in neighbors:
            omega_id = pattern_id_translation[neighbor.id]
            try:
                dist = Distance.objects.get(alpha__id=alpha_id, omega__id=omega_id)
            except Distance.DoesNotExist:
                dist = Distance(alpha_id=alpha_id, omega_id=omega_id)

            dist.cosine_similarity = cosine_similarity
            dist.save()
def similarity(corpus, routine, completion_start=0.0, completion_score=1.0):
    """Measure cosine similarity between all documents of ``corpus``.

    The stemmed values of the segments attached to each document are joined
    into one space-separated text, wrapped in a ``pvDocument`` and gathered in
    a ``pvCorpus``. For every document, each neighbour returned by the corpus
    is stored as a ``Distance`` row keyed by the ordered ``alpha``/``omega``
    document ids; an existing row has its ``cosine_similarity`` overwritten, a
    missing one is created. Progress is reported through ``log_routine`` and
    the work is committed with ``transaction.commit()``.

    NOTE(review): ``similarity`` is also defined earlier; if both definitions
    live in the same module this later one is the one that stays bound.

    Args:
        corpus: object exposing ``id`` and ``json()``; selects the documents.
        routine: passed through to ``log_routine`` for progress entries.
        completion_start: forwarded unchanged to ``log_routine``.
        completion_score: forwarded unchanged to ``log_routine``.

    Returns:
        None. Results are written through ``Distance.save()``.
    """
    print(
        """
    ====================
    ---- SIMILARITY ----
    ====================
    """
    )

    log_routine(
        routine,
        entry="similarity measurement started succesfully",
        completion=0.0,
        completion_start=completion_start,
        completion_score=completion_score,
    )
    transaction.commit()

    # 1. number of documents; also used as the `top` limit for neighbours.
    number_of_documents = Document.objects.filter(corpus=corpus).count()

    print("[info] corpus: %s" % (corpus.json(),))
    print("[info] document in corpus: %s" % (number_of_documents,))

    # 2. every stemmed segment of every document. The distance algorithm
    #    works on "stemmed" documents; no ORDER BY is needed.
    # @todo: verify that PATTERN distance measurement works well with this algorithm.
    cursor = connection.cursor()
    cursor.execute(
        """
        SELECT s.stemmed, d.id as document_id
        FROM anta_document_segment ds
            JOIN anta_segment s ON ds.segment_id = s.id
            JOIN anta_document d ON ds.document_id = d.id
        WHERE d.corpus_id = %s
        """,
        [corpus.id],
    )

    log_routine(
        routine,
        entry="similarity measurement started",
        completion=0.1,
        completion_start=completion_start,
        completion_score=completion_score,
    )

    # 3. group the stems by document id.
    # @todo: improve remapping
    documents = {}
    for stemmed, document_id in cursor.fetchall():
        documents.setdefault(document_id, []).append(stemmed)

    # 4. turn each list of stems into a Pattern document and remember which
    #    database id each Pattern document id stands for.
    pattern_id_translation = {}
    for document_id in documents:
        pattern_document = pvDocument(" ".join(documents[document_id]))
        documents[document_id] = pattern_document
        log_routine(
            routine,
            entry="join stemmed segments",
            completion=0.15,
            completion_start=completion_start,
            completion_score=completion_score,
        )
        pattern_id_translation[pattern_document.id] = document_id

    # 5. store the documents in a Pattern corpus.
    c = pvCorpus(documents.values())

    # 6. compute and save similarities.
    for counter, document_id in enumerate(documents):
        print(counter)
        pattern_document = documents[document_id]
        alpha_id = pattern_id_translation[pattern_document.id]

        # Each neighbour is a (cosine similarity, Pattern document) pair.
        for cosine_similarity, neighbor in c.neighbors(
            pattern_document, top=number_of_documents
        ):
            omega_id = pattern_id_translation[neighbor.id]

            # Only a missing row may trigger creation. Any other failure
            # (several matching rows, a database error) must surface instead
            # of silently adding a duplicate Distance.
            try:
                dist = Distance.objects.get(alpha__id=alpha_id, omega__id=omega_id)
            except Distance.DoesNotExist as e:
                print(e)
                dist = Distance(alpha_id=alpha_id, omega_id=omega_id)
                print(
                    "[info] create Distance object %s %s"
                    % (dist.id, cosine_similarity)
                )
            else:
                old_value = dist.cosine_similarity
                # Subtracting from None would raise; report None instead.
                if old_value is None:
                    difference = None
                else:
                    difference = old_value - cosine_similarity
                print(
                    "[info] distantce exists ( %s - %s ), old value: %s, difference: %s"
                    % (alpha_id, omega_id, old_value, difference)
                )

            dist.cosine_similarity = cosine_similarity
            dist.save()

        log_routine(
            routine,
            entry="neighbors computation",
            completion=(counter + 1.0) / number_of_documents,
            completion_start=completion_start,
            completion_score=completion_score,
        )
        # Commit this document's distances together with its progress entry.
        # NOTE(review): the original layout left it unclear whether the commit
        # belonged inside or after this loop; doing both covers either reading.
        transaction.commit()

    # Final commit, reached even when no document had segments.
    transaction.commit()