def getCandidateDocsThroughClusters(jobId):
    """Queue compare-docs jobs for uncompared docs near each multi-doc cluster.

    For every current cluster with more than one member, collects the docs
    returned by ``DistanceTableManager.getCloseDocs`` for each member and
    pairs every member with every collected doc. A pair for which
    ``_tryGetDocDistance`` raises ``KeyError`` is treated as not yet compared
    and, unless both ids are equal, a ``JOB_COMPAREDOCS`` worker job is
    enqueued for it.

    Args:
        jobId: Id of the job running this step. Must be a ``str`` because it
            is concatenated into the suffix appended to every log message.
    """
    jobInfo = "Job id: " + jobId
    distanceTableManager = DistanceTableManager()
    clusterManager = ClusterManager()
    jobManager = MinerJobManager()

    distances = distanceTableManager.getDistanceMatrix()
    logger.info("Got the distance matrix. %s.", jobInfo)

    clusters = list(clusterManager.getCurrentClusters())
    logger.info("Got the clusters. %s.", jobInfo)

    for cluster in clusters:
        # A single-doc cluster yields no candidate pairs.
        if len(cluster) <= 1:
            continue

        # Accumulate straight into a set: this de-duplicates as we go and
        # avoids re-copying a growing list on every iteration.
        closeDocs = set()
        for doc in cluster:
            closeDocs.update(distanceTableManager.getCloseDocs(doc))

        for doc1, doc2 in itertools.product(cluster, closeDocs):
            # Keep the try body down to the one call whose KeyError is
            # interpreted as "no distance recorded for this pair".
            try:
                _tryGetDocDistance(distances, doc1, doc2)
            except KeyError:
                # Never schedule a doc for comparison against itself.
                if doc1 != doc2:
                    job = WorkerJob(
                        JOB_COMPAREDOCS,
                        {
                            JOBARG_COMPAREDOCS_DOC1ID: doc1,
                            JOBARG_COMPAREDOCS_DOC2ID: doc2,
                        })
                    jobManager.enqueueJob(job)
                    logger.info(
                        "Put compare docs job with jobid: %s. "
                        "doc1: %s. doc2: %s. %s",
                        job.jobId, doc1, doc2, jobInfo)
            else:
                logger.info(
                    "Docs %s and %s already compared. %s",
                    doc1, doc2, jobInfo)
def clusterDocs(jobId):
    """Run hierarchical clustering over the current clusters and store them.

    Fetches the distance matrix and the current clusters, passes both to
    ``clusterHierarchical``, and then writes the same cluster list back via
    ``ClusterManager.putCurrentClusters``. Progress is logged at each stage.

    NOTE(review): the return value of ``clusterHierarchical`` is ignored, so
    it presumably updates the cluster list in place -- confirm.

    Args:
        jobId: Id of the job running this step. Must be a ``str`` because it
            is concatenated into the suffix appended to every log message.
    """
    jobTag = "Job id: " + jobId
    logger.info("Started clustering docs. %s.", jobTag)

    # Build both managers up front, before any data is fetched.
    distanceTable = DistanceTableManager()
    clusterStore = ClusterManager()

    # Inputs: pairwise distances and the clusters as they currently stand.
    distanceMatrix = distanceTable.getDistanceMatrix()
    logger.info("Got the distance matrix. %s.", jobTag)

    currentClusters = list(clusterStore.getCurrentClusters())
    logger.info("Got the clusters. %s.", jobTag)

    # Clustering step.
    logger.info("Started clustering. %s.", jobTag)
    clusterHierarchical(jobTag, currentClusters, distanceMatrix)
    logger.info("Finished clustering. %s.", jobTag)

    # Write the cluster list back to the store.
    clusterStore.putCurrentClusters(currentClusters)
    logger.info("Put the computed clusters. %s.", jobTag)