def getCandidateDocsThroughClusters(jobId):
    """
    Enqueue compare-docs jobs for cluster members and their close docs.

    For every current cluster holding more than one doc, collects the docs
    the distance table reports as close to any member, then pairs each
    member with each of those close docs. A pair for which
    _tryGetDocDistance raises KeyError (and whose two ids differ) gets a
    JOB_COMPAREDOCS worker job; every other pair is logged as already
    compared.

    Parameters:
    jobId -- id of the job running this step. It is concatenated into the
             log prefix, so it has to be a string.
    """
    jobInfo = "Job id: " + jobId
    distanceTableManager = DistanceTableManager()
    clusterManager = ClusterManager()
    jobManager = MinerJobManager()

    distances = distanceTableManager.getDistanceMatrix()
    logger.info("Got the distance matrix. %s.", jobInfo)

    clusters = list(clusterManager.getCurrentClusters())
    logger.info("Got the clusters. %s.", jobInfo)

    for cluster in clusters:
        # A single-doc cluster is not expanded into candidate pairs.
        if len(cluster) <= 1:
            continue

        # Accumulate into a set: de-duplicates as we go instead of growing
        # a list by repeated concatenation and de-duplicating afterwards.
        closeDocs = set()
        for doc in cluster:
            closeDocs.update(distanceTableManager.getCloseDocs(doc))

        for doc1, doc2 in itertools.product(cluster, closeDocs):
            try:
                _tryGetDocDistance(distances, doc1, doc2)
            except KeyError:
                # No distance recorded for this pair yet.
                if doc1 == doc2:
                    continue
                job = WorkerJob(
                    JOB_COMPAREDOCS,
                    {
                        JOBARG_COMPAREDOCS_DOC1ID: doc1,
                        JOBARG_COMPAREDOCS_DOC2ID: doc2
                    })
                jobManager.enqueueJob(job)
                logger.info(
                    "Put compare docs job with jobid: %s. doc1: %s. doc2: %s. %s",
                    job.jobId, doc1, doc2, jobInfo)
            else:
                logger.info(
                    "Docs %s and %s already compared. %s",
                    doc1, doc2, jobInfo)
def cleanUpDistanceTable(jobId):
    """
    Enqueue cleanup jobs for docs in the distance table that are stale.

    A doc counts as stale when it appears in a distance-table entry but is
    not among the docs returned by ClusterManager.getCurrentDocs(). One
    JOB_CLEANUPDOC worker job is enqueued per distinct stale doc, and the
    number of distinct stale docs is logged at the end.

    Parameters:
    jobId -- id of the job running this step. It is concatenated into the
             log prefix, so it has to be a string.
    """
    jobInfo = "Job id: " + jobId
    distanceTableManager = DistanceTableManager()
    clusterManager = ClusterManager()
    jobManager = MinerJobManager()

    # Built once as a set so each membership test below is O(1) rather
    # than a scan over the whole doc list.
    currentDocs = set(clusterManager.getCurrentDocs())

    staleDocs = set()
    for entry in distanceTableManager.getEntries():
        # entry[0] and entry[1] are the two doc ids of a table entry.
        # At most one doc is recorded per entry: the second id is only
        # examined when the first one is current.
        # NOTE(review): when both ids are stale the second is picked up
        # only if it also shows up in another entry - confirm intended.
        if entry[0] not in currentDocs:
            staleDocs.add(entry[0])
        elif entry[1] not in currentDocs:
            staleDocs.add(entry[1])

    for docKey in staleDocs:
        job = WorkerJob(JOB_CLEANUPDOC, {JOBARG_CLEANUPDOC_DOCID: docKey})
        jobManager.enqueueJob(job)
        logging.info(
            "Put cleanup doc job with id %s for docId: %s. %s",
            job.jobId, docKey, jobInfo)

    logging.info(
        "Number of stale entries in distances table: %i. %s",
        len(staleDocs), jobInfo)
def compareDocs(jobId, doc1Key, doc2Key):
    """
    Score how similar two docs are and store the score if it is high enough.

    Both docs are loaded through DocManager. When both carry the English
    language tag they are scored with computeEnglishDocsSimScore, otherwise
    with computeDocSimScoreUsingEntities. If both docs have a locale tag and
    the locales differ, 0.4 is subtracted and the result is floored at 0.
    A score above SIMSCORE_MIN_THRESHOLD is written to the distance table.

    Parameters:
    jobId -- id of the job running this comparison (log messages only).
    doc1Key, doc2Key -- keys of the two docs; they are concatenated into
                        the log prefix, so they have to be strings.

    NOTE(review): another definition of compareDocs follows later in this
    file; if both live at module level the later one replaces this one.
    Confirm which is intended.
    """
    logPrefix = "Doc1 id: " + doc1Key + " Doc2 id: " + doc2Key \
        + ". Job id: " + jobId
    logger.info("Started comparing docs. %s", logPrefix)

    docStore = DocManager()
    firstDoc = docStore.get(doc1Key)
    secondDoc = docStore.get(doc2Key)

    bothEnglish = (firstDoc.tags[FEEDTAG_LANG] == LANG_ENGLISH) and \
        (secondDoc.tags[FEEDTAG_LANG] == LANG_ENGLISH)
    if bothEnglish:
        score = computeEnglishDocsSimScore(firstDoc, secondDoc)
        logger.info("Comparing using shingles. %s", logPrefix)
    else:
        score = computeDocSimScoreUsingEntities(firstDoc, secondDoc)
        logger.info("Comparing using entities. %s", logPrefix)

    # The locale values are only read once both tags are known to exist.
    localesDiffer = FEEDTAG_LOCALE in firstDoc.tags and \
        FEEDTAG_LOCALE in secondDoc.tags and \
        firstDoc.tags[FEEDTAG_LOCALE] != secondDoc.tags[FEEDTAG_LOCALE]
    if localesDiffer:
        logger.info(
            "The two docs are from different locations. Adding penalty. %s",
            logPrefix)
        # Apply the penalty without letting the score drop below zero.
        score = max(score - 0.4, 0)

    logger.info("Comparision score: %s. %s", str(score), logPrefix)

    if score > SIMSCORE_MIN_THRESHOLD:
        DistanceTableManager().addEntry(doc1Key, doc2Key, score)
        logger.info(
            "Added comparision score to distances table. %s", logPrefix)

    logger.info("Completed comparing docs. %s", logPrefix)
def cleanUpDocDistances(jobId, docId):
    """
    Ask the distance table to clean up a single doc.

    Delegates to DistanceTableManager.cleanUpDoc(docId) and logs before and
    after the call.

    Parameters:
    jobId -- id of the job running this cleanup (log messages only).
    docId -- id of the doc handed to cleanUpDoc. Both ids are concatenated
             into the log prefix, so they have to be strings.
    """
    logPrefix = "Doc id: " + docId + ". Job id: " + jobId

    logger.info("Started cleaning up doc distances. %s.", logPrefix)
    table = DistanceTableManager()
    table.cleanUpDoc(docId)
    logger.info("Completed cleaning up doc distances. %s.", logPrefix)
def _isDuplicateArticle(docKey, docsAdded):
    """
    Tell whether docKey duplicates any doc that has already been added.

    Returns True as soon as one doc in docsAdded has a distance-table value
    against docKey that is greater than DOC_DUPLICATION_THRESHOLD; returns
    False when none does (including when docsAdded is empty).

    Parameters:
    docKey -- key of the doc being checked.
    docsAdded -- iterable of keys of the docs to check against.
    """
    table = DistanceTableManager()
    # any() stops at the first match, so no further lookups are made.
    return any(
        table.getDistance(docKey, otherDoc) > DOC_DUPLICATION_THRESHOLD
        for otherDoc in docsAdded)
def compareDocs(jobId, doc1Key, doc2Key):
    """
    Compare two docs and record their score in the distance table.

    Loads both docs through DocManager, obtains their score from
    getDocComparisionScore, and adds a distance-table entry only when the
    score is greater than SIMSCORE_MIN_THRESHOLD.

    Parameters:
    jobId -- id of the job running this comparison (log messages only).
    doc1Key, doc2Key -- keys of the two docs; they are concatenated into
                        the log prefix, so they have to be strings.
    """
    logPrefix = "Doc1 id: " + doc1Key + " Doc2 id: " + doc2Key \
        + ". Job id: " + jobId
    logger.info("Started comparing docs. %s", logPrefix)

    docStore = DocManager()
    firstDoc = docStore.get(doc1Key)
    secondDoc = docStore.get(doc2Key)

    similarity = getDocComparisionScore(logPrefix, firstDoc, secondDoc)
    if similarity > SIMSCORE_MIN_THRESHOLD:
        DistanceTableManager().addEntry(doc1Key, doc2Key, similarity)
        logger.info(
            "Added comparision score to distances table. %s", logPrefix)

    logger.info("Completed comparing docs. %s", logPrefix)
def clusterDocs(jobId):
    """
    Re-cluster the current docs and store the result.

    Fetches the distance matrix and the current clusters, runs
    clusterHierarchical over them, then hands the cluster list to
    ClusterManager.putCurrentClusters. Each step is logged.

    Parameters:
    jobId -- id of the job running this step. It is concatenated into the
             log prefix, so it has to be a string.
    """
    logPrefix = "Job id: " + jobId
    logger.info("Started clustering docs. %s.", logPrefix)

    distanceTable = DistanceTableManager()
    clusterStore = ClusterManager()

    distanceMatrix = distanceTable.getDistanceMatrix()
    logger.info("Got the distance matrix. %s.", logPrefix)

    currentClusters = list(clusterStore.getCurrentClusters())
    logger.info("Got the clusters. %s.", logPrefix)

    logger.info("Started clustering. %s.", logPrefix)
    # The return value is not used; the same list object is stored below,
    # so clusterHierarchical presumably updates it in place.
    clusterHierarchical(logPrefix, currentClusters, distanceMatrix)
    logger.info("Finished clustering. %s.", logPrefix)

    clusterStore.putCurrentClusters(currentClusters)
    logger.info("Put the computed clusters. %s.", logPrefix)