예제 #1
0
def compareDocs(jobId, doc1Key, doc2Key):
    """Compare two stored docs and record their similarity score.

    Both docs are fetched through ``DocManager``. When both carry the
    English language tag they are scored with
    ``computeEnglishDocsSimScore``; otherwise
    ``computeDocSimScoreUsingEntities`` is used. If both docs have a locale
    tag and the locales differ, a fixed penalty is subtracted and the result
    is clamped at zero. Scores above ``SIMSCORE_MIN_THRESHOLD`` are written
    through ``DistanceTableManager.addEntry``.

    Args:
        jobId: Identifier of the job; used only in log messages.
        doc1Key: Key of the first doc, passed to ``DocManager.get``.
        doc2Key: Key of the second doc, passed to ``DocManager.get``.

    Returns:
        None. The result is only logged and, when above the threshold,
        stored in the distances table.
    """
    # %-formatting instead of "+" so that non-string ids (e.g. an integer
    # job id) do not raise TypeError; string ids produce the same text.
    jobInfo = "Doc1 id: %s Doc2 id: %s. Job id: %s" % (
        doc1Key, doc2Key, jobId)
    logger.info("Started comparing docs. %s", jobInfo)

    docManager = DocManager()
    doc1 = docManager.get(doc1Key)
    doc2 = docManager.get(doc2Key)

    # NOTE(review): the language tag is read without a membership check,
    # unlike the locale tag below - confirm every doc carries it.
    bothEnglish = (doc1.tags[FEEDTAG_LANG] == LANG_ENGLISH
                   and doc2.tags[FEEDTAG_LANG] == LANG_ENGLISH)
    if bothEnglish:
        score = computeEnglishDocsSimScore(doc1, doc2)
        logger.info("Comparing using shingles. %s", jobInfo)
    else:
        score = computeDocSimScoreUsingEntities(doc1, doc2)
        logger.info("Comparing using entities. %s", jobInfo)

    # Penalise only when both docs state a locale and the locales differ.
    localePenalty = 0.4
    if FEEDTAG_LOCALE in doc1.tags and FEEDTAG_LOCALE in doc2.tags and \
            doc1.tags[FEEDTAG_LOCALE] != doc2.tags[FEEDTAG_LOCALE]:
        logger.info(
            "The two docs are from different locations. Adding penalty. %s",
            jobInfo)
        # Never let the penalty push the score below zero.
        score = max(score - localePenalty, 0)
    logger.info("Comparision score: %s. %s", score, jobInfo)

    if score > SIMSCORE_MIN_THRESHOLD:
        distanceTableManager = DistanceTableManager()
        distanceTableManager.addEntry(doc1Key, doc2Key, score)
        logger.info("Added comparision score to distances table. %s", jobInfo)

    logger.info("Completed comparing docs. %s", jobInfo)
예제 #2
0
def compareDocs(jobId, doc1Key, doc2Key):
    """Compare two stored docs and record their similarity score.

    Both docs are fetched through ``DocManager`` and scored by
    ``getDocComparisionScore`` (defined outside this block). Scores above
    ``SIMSCORE_MIN_THRESHOLD`` are written through
    ``DistanceTableManager.addEntry``.

    Args:
        jobId: Identifier of the job; used only in the log context string.
        doc1Key: Key of the first doc, passed to ``DocManager.get``.
        doc2Key: Key of the second doc, passed to ``DocManager.get``.

    Returns:
        None. The result is only stored in the distances table when it is
        above the threshold.
    """
    # %-formatting instead of "+" so that non-string ids (e.g. an integer
    # job id) do not raise TypeError; string ids produce the same text.
    jobInfo = "Doc1 id: %s Doc2 id: %s. Job id: %s" % (
        doc1Key, doc2Key, jobId)
    logger.info("Started comparing docs. %s", jobInfo)

    docManager = DocManager()
    doc1 = docManager.get(doc1Key)
    doc2 = docManager.get(doc2Key)

    # The context string is handed to the helper as well, presumably so its
    # own log lines carry the same ids - verify against the helper.
    score = getDocComparisionScore(jobInfo, doc1, doc2)

    if score > SIMSCORE_MIN_THRESHOLD:
        distanceTableManager = DistanceTableManager()
        distanceTableManager.addEntry(doc1Key, doc2Key, score)
        logger.info("Added comparision score to distances table. %s", jobInfo)

    logger.info("Completed comparing docs. %s", jobInfo)