def compareDocs(jobId, doc1Key, doc2Key):
    """Score the similarity of two stored docs and record strong matches.

    Both docs are fetched through ``DocManager.get``. When the
    ``FEEDTAG_LANG`` tag of each doc equals ``LANG_ENGLISH`` the score comes
    from ``computeEnglishDocsSimScore``; otherwise it comes from
    ``computeDocSimScoreUsingEntities``. If both docs carry a
    ``FEEDTAG_LOCALE`` tag and the two values differ, 0.4 is subtracted from
    the score and the result is floored at 0. A score strictly greater than
    ``SIMSCORE_MIN_THRESHOLD`` is written through
    ``DistanceTableManager.addEntry``.

    Args:
        jobId: Job identifier; concatenated into the log context, so it
            must be a ``str``.
        doc1Key: Key of the first doc; used for lookup, logging and as part
            of the distance-table entry. Must be a ``str``.
        doc2Key: Key of the second doc; same usage as ``doc1Key``.

    Returns:
        None. The outcome is visible only through log lines and the
        optional distance-table entry.
    """
    # NOTE(review): this file contains a later definition of compareDocs
    # that replaces this one at import time -- confirm which is intended.
    jobInfo = (
        "Doc1 id: " + doc1Key + " Doc2 id: " + doc2Key
        + ". Job id: " + jobId
    )
    logger.info("Started comparing docs. %s", jobInfo)

    store = DocManager()
    firstDoc = store.get(doc1Key)
    secondDoc = store.get(doc2Key)

    # NOTE(review): the language tag is indexed without a membership check,
    # unlike the locale tag below -- presumably every doc has one; confirm.
    # The second doc's tag is only inspected when the first one is English.
    bothEnglish = all(
        doc.tags[FEEDTAG_LANG] == LANG_ENGLISH
        for doc in (firstDoc, secondDoc)
    )
    if not bothEnglish:
        score = computeDocSimScoreUsingEntities(firstDoc, secondDoc)
        logger.info("Comparing using entities. %s", jobInfo)
    else:
        score = computeEnglishDocsSimScore(firstDoc, secondDoc)
        logger.info("Comparing using shingles. %s", jobInfo)

    # The locale penalty applies only when both docs declare a locale.
    if FEEDTAG_LOCALE in firstDoc.tags and FEEDTAG_LOCALE in secondDoc.tags:
        if firstDoc.tags[FEEDTAG_LOCALE] != secondDoc.tags[FEEDTAG_LOCALE]:
            logger.info(
                "The two docs are from different locations. Adding penalty. %s",
                jobInfo)
            reduced = score - 0.4
            # Never let the penalty push the score below zero.
            score = 0 if reduced < 0 else reduced

    logger.info("Comparision score: %s. %s", str(score), jobInfo)

    if score > SIMSCORE_MIN_THRESHOLD:
        DistanceTableManager().addEntry(doc1Key, doc2Key, score)
        logger.info("Added comparision score to distances table. %s", jobInfo)

    logger.info("Completed comparing docs. %s", jobInfo)
def compareDocs(jobId, doc1Key, doc2Key):
    """Compare two stored docs and record their score when it is high enough.

    Both docs are fetched through ``DocManager.get`` and handed, together
    with the log context string, to ``getDocComparisionScore``. A score
    strictly greater than ``SIMSCORE_MIN_THRESHOLD`` is written through
    ``DistanceTableManager.addEntry``.

    Args:
        jobId: Job identifier; concatenated into the log context, so it
            must be a ``str``.
        doc1Key: Key of the first doc; used for lookup, logging and as part
            of the distance-table entry. Must be a ``str``.
        doc2Key: Key of the second doc; same usage as ``doc1Key``.

    Returns:
        None. The outcome is visible only through log lines and the
        optional distance-table entry.
    """
    # NOTE(review): an earlier definition of compareDocs exists in this
    # file; this one replaces it at import time -- confirm that is intended.
    jobInfo = (
        "Doc1 id: " + doc1Key + " Doc2 id: " + doc2Key
        + ". Job id: " + jobId
    )
    logger.info("Started comparing docs. %s", jobInfo)

    store = DocManager()
    firstDoc = store.get(doc1Key)
    secondDoc = store.get(doc2Key)

    # Scoring is delegated to a helper that is not defined in this block.
    score = getDocComparisionScore(jobInfo, firstDoc, secondDoc)

    if score > SIMSCORE_MIN_THRESHOLD:
        DistanceTableManager().addEntry(doc1Key, doc2Key, score)
        logger.info("Added comparision score to distances table. %s", jobInfo)

    logger.info("Completed comparing docs. %s", jobInfo)