예제 #1
0
def getCandidateDocsThroughClusters(jobId):
    """Enqueue compare-docs jobs for pairs suggested by the current clusters.

    For every cluster holding more than one doc, collects the docs that
    ``DistanceTableManager.getCloseDocs`` reports for each member. Each
    (cluster doc, close doc) pair is then looked up with
    ``_tryGetDocDistance``; when that lookup raises ``KeyError`` and the two
    docs differ, a ``JOB_COMPAREDOCS`` worker job is enqueued for the pair.

    Args:
        jobId: Identifier of the running job (a string); used only to tag
            log messages.
    """
    jobInfo = "Job id: " + jobId
    distanceTableManager = DistanceTableManager()
    clusterManager = ClusterManager()
    jobManager = MinerJobManager()

    distances = distanceTableManager.getDistanceMatrix()
    logger.info("Got the distance matrix. %s.", jobInfo)

    clusters = list(clusterManager.getCurrentClusters())
    logger.info("Got the clusters. %s.", jobInfo)

    for cluster in clusters:
        # Single-doc (or empty) clusters yield no candidates.
        if len(cluster) <= 1:
            continue

        # Accumulate into a set: de-duplicates as we go and avoids the
        # repeated list copies that ``closeDocs = closeDocs + ...`` makes.
        closeDocs = set()
        for doc in cluster:
            closeDocs.update(distanceTableManager.getCloseDocs(doc))

        for doc1, doc2 in itertools.product(cluster, closeDocs):
            try:
                # Only the lookup is guarded; KeyError is treated as
                # "this pair has no recorded distance yet".
                _tryGetDocDistance(distances, doc1, doc2)
            except KeyError:
                # Never schedule a comparison of a doc with itself.
                if doc1 == doc2:
                    continue
                job = WorkerJob(
                    JOB_COMPAREDOCS, {
                        JOBARG_COMPAREDOCS_DOC1ID: doc1,
                        JOBARG_COMPAREDOCS_DOC2ID: doc2
                    })
                jobManager.enqueueJob(job)
                logger.info(
                    "Put compare docs job with jobid: %s. doc1: %s. doc2: %s. %s",
                    job.jobId, doc1, doc2, jobInfo)
            else:
                logger.info("Docs %s and %s already compared. %s", doc1,
                            doc2, jobInfo)
예제 #2
0
def cleanUpDistanceTable(jobId):
    """Enqueue cleanup jobs for docs in the distance table that are stale.

    A doc id is considered stale when it appears as ``entry[0]`` or
    ``entry[1]`` of a distance-table entry but is not among the docs
    returned by ``ClusterManager.getCurrentDocs``. One ``JOB_CLEANUPDOC``
    worker job is enqueued per distinct stale doc id.

    Args:
        jobId: Identifier of the running job (a string); used only to tag
            log messages.
    """
    jobInfo = "Job id: " + jobId
    distanceTableManager = DistanceTableManager()
    clusterManager = ClusterManager()
    jobManager = MinerJobManager()

    # A set gives constant-time membership tests; the original scanned a
    # list for every entry.
    currentDocs = set(clusterManager.getCurrentDocs())

    staleDocs = set()
    for entry in distanceTableManager.getEntries():
        # Check BOTH ids of the entry. The previous ``elif`` skipped
        # entry[1] whenever entry[0] was already stale, so a stale doc that
        # only ever appeared next to another stale doc was missed.
        for docKey in (entry[0], entry[1]):
            if docKey not in currentDocs:
                staleDocs.add(docKey)

    for docKey in staleDocs:
        job = WorkerJob(JOB_CLEANUPDOC, {JOBARG_CLEANUPDOC_DOCID: docKey})
        jobManager.enqueueJob(job)
        logging.info("Put cleanup doc job with id %s for docId: %s. %s",
                     job.jobId, docKey, jobInfo)

    logging.info("Number of stale entries in distances table: %i. %s",
                 len(staleDocs), jobInfo)
예제 #3
0
def compareDocs(jobId, doc1Key, doc2Key):
    """Score the similarity of two docs and record it when high enough.

    Both docs are fetched through ``DocManager``. When both carry
    ``LANG_ENGLISH`` under ``FEEDTAG_LANG`` the score comes from
    ``computeEnglishDocsSimScore``; otherwise from
    ``computeDocSimScoreUsingEntities``. If both docs have a
    ``FEEDTAG_LOCALE`` tag and the values differ, 0.4 is subtracted from
    the score, floored at 0. A score above ``SIMSCORE_MIN_THRESHOLD`` is
    written to the distance table.

    Args:
        jobId: Identifier of the running job (a string); used in logs.
        doc1Key: Key of the first doc (a string).
        doc2Key: Key of the second doc (a string).
    """
    jobInfo = ("Doc1 id: " + doc1Key + " Doc2 id: " + doc2Key
               + ". Job id: " + jobId)
    logger.info("Started comparing docs. %s", jobInfo)

    docStore = DocManager()
    firstDoc = docStore.get(doc1Key)
    secondDoc = docStore.get(doc2Key)

    # Pick the scoring strategy from the language tags of both docs.
    bothEnglish = (firstDoc.tags[FEEDTAG_LANG] == LANG_ENGLISH
                   and secondDoc.tags[FEEDTAG_LANG] == LANG_ENGLISH)
    if bothEnglish:
        score = computeEnglishDocsSimScore(firstDoc, secondDoc)
        logger.info("Comparing using shingles. %s", jobInfo)
    else:
        score = computeDocSimScoreUsingEntities(firstDoc, secondDoc)
        logger.info("Comparing using entities. %s", jobInfo)

    # The penalty applies only when both docs declare a locale and the
    # declared locales disagree.
    localesDiffer = (
        FEEDTAG_LOCALE in firstDoc.tags
        and FEEDTAG_LOCALE in secondDoc.tags
        and firstDoc.tags[FEEDTAG_LOCALE] != secondDoc.tags[FEEDTAG_LOCALE])
    if localesDiffer:
        logger.info(
            "The two docs are from different locations. Adding penalty. %s",
            jobInfo)
        # Subtract the penalty without letting the score go negative.
        score = max(score - 0.4, 0)
    logger.info("Comparision score: %s. %s", str(score), jobInfo)

    if score > SIMSCORE_MIN_THRESHOLD:
        DistanceTableManager().addEntry(doc1Key, doc2Key, score)
        logger.info("Added comparision score to distances table. %s", jobInfo)

    logger.info("Completed comparing docs. %s", jobInfo)
예제 #4
0
def cleanUpDocDistances(jobId, docId):
    """Ask the distance table manager to clean up one doc.

    Delegates to ``DistanceTableManager.cleanUpDoc`` and logs before and
    after the call.

    Args:
        jobId: Identifier of the running job (a string); used in logs.
        docId: Id of the doc to clean up (a string).
    """
    logTag = "Doc id: " + docId + ". Job id: " + jobId
    logger.info("Started cleaning up doc distances. %s.", logTag)

    DistanceTableManager().cleanUpDoc(docId)

    logger.info("Completed cleaning up doc distances. %s.", logTag)
예제 #5
0
def _isDuplicateArticle(docKey, docsAdded):
  """Report whether docKey duplicates any doc in docsAdded.

  Returns True as soon as ``DistanceTableManager.getDistance(docKey, other)``
  exceeds ``DOC_DUPLICATION_THRESHOLD`` for some ``other`` in ``docsAdded``;
  returns False when no doc does (including when ``docsAdded`` is empty).
  """
  distanceTable = DistanceTableManager()

  # any() stops at the first doc over the threshold, like an early return.
  return any(
      distanceTable.getDistance(docKey, other) > DOC_DUPLICATION_THRESHOLD
      for other in docsAdded)
예제 #6
0
def compareDocs(jobId, doc1Key, doc2Key):
    """Score two docs and record the score when it is high enough.

    Both docs are fetched through ``DocManager`` and scored with
    ``getDocComparisionScore``. A score above ``SIMSCORE_MIN_THRESHOLD`` is
    written to the distance table via ``DistanceTableManager.addEntry``.

    Args:
        jobId: Identifier of the running job (a string); used in logs.
        doc1Key: Key of the first doc (a string).
        doc2Key: Key of the second doc (a string).
    """
    jobInfo = ("Doc1 id: " + doc1Key + " Doc2 id: " + doc2Key
               + ". Job id: " + jobId)
    logger.info("Started comparing docs. %s", jobInfo)

    docStore = DocManager()
    firstDoc = docStore.get(doc1Key)
    secondDoc = docStore.get(doc2Key)

    similarity = getDocComparisionScore(jobInfo, firstDoc, secondDoc)

    # Only scores above the threshold are persisted.
    if similarity > SIMSCORE_MIN_THRESHOLD:
        DistanceTableManager().addEntry(doc1Key, doc2Key, similarity)
        logger.info("Added comparision score to distances table. %s", jobInfo)

    logger.info("Completed comparing docs. %s", jobInfo)
예제 #7
0
def clusterDocs(jobId):
    """Run hierarchical clustering on the current clusters and store them.

    Loads the distance matrix and the current clusters, passes both to
    ``clusterHierarchical``, then hands the same cluster list back to
    ``ClusterManager.putCurrentClusters``. Each step is logged.

    Args:
        jobId: Identifier of the running job (a string); used in logs.
    """
    jobInfo = "Job id: " + jobId
    logger.info("Started clustering docs. %s.", jobInfo)

    distanceTable = DistanceTableManager()
    clusterStore = ClusterManager()

    distanceMatrix = distanceTable.getDistanceMatrix()
    logger.info("Got the distance matrix. %s.", jobInfo)

    currentClusters = list(clusterStore.getCurrentClusters())
    logger.info("Got the clusters. %s.", jobInfo)

    logger.info("Started clustering. %s.", jobInfo)
    # The return value is ignored; the list passed in is what gets stored
    # below (presumably updated in place by the call; confirm against
    # clusterHierarchical).
    clusterHierarchical(jobInfo, currentClusters, distanceMatrix)
    logger.info("Finished clustering. %s.", jobInfo)

    clusterStore.putCurrentClusters(currentClusters)
    logger.info("Put the computed clusters. %s.", jobInfo)