Example #1
def getCandidateDocsThroughClusters(jobId):
    jobInfo = "Job id: " + jobId
    distanceTableManager = DistanceTableManager()
    clusterManager = ClusterManager()
    jobManager = MinerJobManager()

    distances = distanceTableManager.getDistanceMatrix()
    logger.info("Got the distance matrix. %s.", jobInfo)

    clusters = list(clusterManager.getCurrentClusters())
    logger.info("Got the clusters. %s.", jobInfo)

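    # For every multi-doc cluster, gather each member's close docs and queue
    # compare jobs only for pairs missing from the distance matrix.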
    for cluster in clusters:
        if len(cluster) > 1:
            closeDocs = []
            for doc in cluster:
                closeDocs = closeDocs + distanceTableManager.getCloseDocs(doc)
            closeDocs = list(set(closeDocs))

            for (doc1, doc2) in itertools.product(cluster, closeDocs):
                try:
                    _tryGetDocDistance(distances, doc1, doc2)
                    logging.info("Docs %s and %s already compared. %s", doc1,
                                 doc2, jobInfo)
                except KeyError:
                    if doc1 != doc2:
                        job = WorkerJob(
                            JOB_COMPAREDOCS, {
                                JOBARG_COMPAREDOCS_DOC1ID: doc1,
                                JOBARG_COMPAREDOCS_DOC2ID: doc2
                            })
                        jobManager.enqueueJob(job)
                        logging.info(
                            "Put compare docs job with jobid: %s. doc1: %s. doc2: %s. %s",
                            job.jobId, doc1, doc2, jobInfo)
Example #2
def pushLinkJobs():
    """
    Cleanup old links in the links table.
    Run this job periodically.
    """

    jobManager = MinerJobManager()
    linkManager = LinkManager()

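    # Back off if the job queue already has a large backlog.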
    if jobManager.count() > 100:
        logging.info("Skipping. Too many jobs queued already!!")
        return

    logging.info("Getting unprocessed links.")
    links = linkManager.getUnprocessedLinks()

    nLinks = 0
    for linkId in links:
        processLinkJob = WorkerJob(
            JOB_PROCESSLINK,
            { JOBARG_PROCESSLINK_LINKID : linkId})
        jobManager.enqueueJob(processLinkJob)
        logging.info(
            "Process link job with jobId '%s' put for linkId: %s.",
            processLinkJob.jobId,
            linkId)
        nLinks = nLinks + 1

    logging.info("Number of process link jobs  were: %i", nLinks)
Example #3
def cleanUpDistanceTable(jobId):
    jobInfo = "Job id: " + jobId
    distanceTableManager = DistanceTableManager()
    clusterManager = ClusterManager()
    jobManager = MinerJobManager()

    docList = list(clusterManager.getCurrentDocs())
    distances = list(distanceTableManager.getEntries())

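    # Collect docs that appear in the distance table but are no longer part of
    # the current working set.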
    staleDocs = []
    for entry in distances:
        if entry[0] not in docList:
            staleDocs.append(entry[0])
        elif entry[1] not in docList:
            staleDocs.append(entry[1])
    staleDocs = list(set(staleDocs))

    for docKey in staleDocs:
        job = WorkerJob(JOB_CLEANUPDOC, {JOBARG_CLEANUPDOC_DOCID: docKey})
        jobManager.enqueueJob(job)
        logging.info("Put cleanup doc job with id %s for docId: %s. %s",
                     job.jobId, docKey, jobInfo)

    logging.info("Number of stale entries in distances table: %i. %s",
                 len(staleDocs), jobInfo)
Example #4
def pushFeedJobs():
    """
    Push feed processing jobs to job queue.
    """

    jobManager = MinerJobManager()
    feedManager = FeedManager()

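    # Skip this run if too many jobs are already queued.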
    if jobManager.count() > 50:
        logging.info("Skipping. Too many jobs queued already!!")
        return

    logging.info("Getting stale  feeds.")
    staleFeeds = feedManager.getStaleFeeds()

    nStaleFeeds = 0
    for feed in staleFeeds:
        processFeedJob = WorkerJob(JOB_PROCESSFEED,
                                   {JOBARG_PROCESSFEED_FEEDID: feed})
        jobManager.enqueueJob(processFeedJob)
        logging.info("Process feed job put for feedId: %s. Job id: %s", feed,
                     processFeedJob.jobId)
        nStaleFeeds = nStaleFeeds + 1

    logging.info("Number of stale feeds are: %i", nStaleFeeds)
Example #5
def deepCleanStaleDocs():
    """
    Puts cleanup doc jobs for stale entries in the shingles table.
    Run this job rarely.
    """

    docManager = DocManager()
    jobManager = MinerJobManager()
    shingleTableManager = ShingleTableManager()
    docsToBeCleanedUp = []

    logging.info("Started scanning the shingle table")
    scanResults = shingleTableManager.scan()

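    # A shingle entry is stale if its doc can no longer be fetched from the
    # doc store.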
    for entry in scanResults:
        try:
            docManager.get(entry[0])
        except S3ResponseError:
            staleDocId = entry[0]
            staleShingle = entry[1]
            logging.info("Stale entry found -> docId: %s, shingle: %s",
                         staleDocId, staleShingle)

            if staleDocId not in docsToBeCleanedUp:
                docsToBeCleanedUp.append(staleDocId)
                job = WorkerJob(JOB_CLEANUPDOC,
                                {JOBARG_CLEANUPDOC_DOCID: staleDocId})
                jobManager.enqueueJob(job)
                logging.info("Put cleanup doc job for docId: %s. Job id: %s",
                             staleDocId, job.jobId)

    logging.info("Number of stale docs deleted were: %i",
                 len(list(docsToBeCleanedUp)))
Example #6
def putComareDocJobs(docId, matches, docAndJobId):
    jobManager = MinerJobManager()

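    # Queue a compare docs job for every matched doc other than the doc itself.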
    for match in matches:
        if match != docId:
            job = WorkerJob(JOB_COMPAREDOCS, {
                JOBARG_COMPAREDOCS_DOC1ID: docId,
                JOBARG_COMPAREDOCS_DOC2ID: match
            })
            jobManager.enqueueJob(job)
            logging.info(
                "Put compare docs job with jobid: %s. compared docId: %s. %s",
                job.jobId, match, docAndJobId)
Example #7
def parseDoc(jobId, docId):
    docAndJobId = "Doc id: " + docId + ". Job id: " + jobId
    logger.info("Started parsing doc. %s.", docAndJobId)

    docManager = DocManager()
    doc = docManager.get(docId)

    # compute and put shingles
    if doc.tags[FEEDTAG_LANG] == LANG_ENGLISH:
        shingles = th.getShingles(getDocEnglishSummaryText(doc), 3, 3)
        shingles = shingles + th.getShingles(getDocEnglishContent(doc), 3, 3)
        logger.info("Completed getting shingles. %s.", docAndJobId)
        shingles = list(set(shingles))
        logger.info("Number of unique shingles are %i. %s.", len(shingles),
                    docAndJobId)

        shingleTableManager = ShingleTableManager()
        shingleTableManager.addEntries(docId, shingles)
        logger.info("Added shingles to shingle table. %s.", docAndJobId)

    # compute and put entities
    entities = th.getEntities(getDocEnglishTitle(doc)) + \
        th.getEntities(getDocEnglishSummaryText(doc)) + \
        th.getEntities(getDocEnglishContent(doc))
    entities = list(set(entities))
    logger.info("Completed getting entities. %s.", docAndJobId)
    logger.info("Number of unique entities are %i. %s.", len(entities),
                docAndJobId)

    entityTableManager = EntityTableManager()
    entityTableManager.addEntries(docId, entities)
    logger.info("Added entities to entity table. %s.", docAndJobId)

    # store entity weights in the doc
    entityWeights = {}
    for entity in entities:
        entityWeight = entityTableManager.getEntityWeight(entity)
        entityWeights[entity] = entityWeight
    doc.tags[DOCTAG_ENTITY_WEIGHTS] = json.dumps(entityWeights)
    docManager.put(doc)
    logger.info("Added entity weights to doc. %s.", docAndJobId)

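    # queue a follow-up job to find candidate docs for comparison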
    job = WorkerJob(JOB_GETCANDIDATEDOCS,
                    {JOBARG_GETCANDIDATEDOCS_DOCID: docId})
    jobManager = MinerJobManager()
    jobManager.enqueueJob(job)
    logging.info("Put get candidate doc job with jobId: %s. %s", job.jobId,
                 docAndJobId)

    logger.info("Completed parsing doc. %s.", docAndJobId)
Example #8
def _putNewLinks(feedAndJobId, linksToAdd):
  linkManager = LinkManager()
  jobManager = MinerJobManager()
  latestPubTime = 0

  for link in linksToAdd:
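    # skip links that already exist in the links table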
    try:
      existingLink = linkManager.get(link.id)
      logger.info(
        "Link with id '%s' already exists. Not processing it. %s",
        link.id,
        feedAndJobId)
      continue
    except:
      pass

    linkManager.put(link)
    logger.info(
        "Put link with id '%s' in links database. %s.",
        link.id,
        feedAndJobId)

    if latestPubTime < link.tags[LINKTAG_PUBTIME]:
      latestPubTime = link.tags[LINKTAG_PUBTIME]

  return latestPubTime
Example #9
def archiveStaleDocs():
  """
  Remove the docs from the current working set.
  Run this job periodically.
  """

  clusterManager = ClusterManager()
  jobManager = MinerJobManager()

  logging.info("Archiving old clusters.")
  staleClusters = clusterManager.archiveOldClusters()

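  # queue a cleanup doc job for every doc in the archived clusters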
  for cluster in staleClusters:
    for docKey in cluster:
      job = WorkerJob(JOB_CLEANUPDOC, { JOBARG_CLEANUPDOC_DOCID : docKey})
      jobManager.enqueueJob(job)
      logging.info(
        "Put cleanup doc job for docId: %s. Job id: %s",
        docKey,
        job.jobId)

  logging.info("Archived old clusters and cleaned up docs in them from working set.")
Example #10
def cleanUpDoc(jobId, docId):
    docAndJobId = "Doc id: " + docId + ". Job id: " + jobId
    logger.info("Started cleaning up doc. %s.", docAndJobId)

    jobManager = MinerJobManager()

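    # fan out cleanup jobs for the doc's shingles, entities and distance entries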
    job = WorkerJob(JOB_CLEANUPDOCSHINGLES,
                    {JOBARG_CLEANUPDOCSHINGLES_DOCID: docId})
    jobManager.enqueueJob(job)
    logging.info("Put cleanup doc shingles job. %s.", docAndJobId)

    job = WorkerJob(JOB_CLEANUPDOCENTITIES,
                    {JOBARG_CLEANUPDOCENTITIES_DOCID: docId})
    jobManager.enqueueJob(job)
    logging.info("Put cleanup doc entities job. %s.", docAndJobId)

    job = WorkerJob(JOB_CLEANUPDOCDISTANCES,
                    {JOBARG_CLEANUPDOCDISTANCES_DOCID: docId})
    jobManager.enqueueJob(job)
    logging.info("Put cleanup doc distances job. %s.", docAndJobId)
Example #11
def processLink(jobId, linkId):
    """
  Processes a link(takes as input the linkId)

  Steps:
  1. get link from database
  2. get publisher for that link from database
  3. get html for that link
  4. process that html to generate doc
  5. save that doc in docstore.
  6. update the link's is processed tag.
  """

    linkAndJobId = "Link id: " + linkId + ". Job id: " + jobId
    logger.info("Started processing link. %s.", linkAndJobId)

    # get the link
    linkManager = LinkManager()
    link = linkManager.get(linkId)
    logger.info("Got link from database. %s.", linkAndJobId)

    # get the publisher
    publisherManager = PublisherManager()
    publisher = publisherManager.get(link.tags[TAG_PUBLISHER])
    logger.info("Got publisher from database. Publisher id: %s. %s.",
                link.tags[TAG_PUBLISHER], linkAndJobId)

    # get html for the link
    processingResult = _processHtmlForLink(jobId, link, publisher)
    if not processingResult[0]:
        logger.warning("No text extracted for the link. %s.", linkAndJobId)

    # generate corresponding doc
    doc = Doc(_getDocKey(link), processingResult[0], link.tags)
    doc.tags[TAG_IMAGES] = processingResult[1]
    doc.tags[DOCTAG_URL] = linkId
    doc.tags[TAG_PUBLISHER_DETAILS] = _getPublisherDetails(publisher)
    doc = _addTranslationTags(jobId, doc)
    doc = _addSummaryIfNotPresent(doc)
    doc.tags[LINKTAG_HIGHLIGHTS] = _getDocHighlights(doc)

    # save the doc
    docManager = DocManager()
    docManager.put(doc)
    logger.info("Document generated and saved for link. Doc key %s. %s.",
                doc.key, linkAndJobId)

    # update the doc key in links table
    link.tags[LINKTAG_DOCKEY] = doc.key
    linkManager.put(link)

    # put parse doc job
    parseDocJob = WorkerJob(JOB_PARSEDOC, {JOBARG_PARSEDOC_DOCID: doc.key})
    jobManager = MinerJobManager()
    jobManager.enqueueJob(parseDocJob)
    logger.info("Parse doc job with with jobId '%s' put. %s.",
                parseDocJob.jobId, linkAndJobId)

    if FEEDTAG_DO_NOT_CLUSTER not in doc.tags:
        newCluster = Cluster([doc.key])
        processNewClusterJob = WorkerJob(
            JOB_PROCESSNEWCLUSTER,
            {JOBARG_PROCESSNEWCLUSTER_CLUSTER: list(newCluster)})
        clusterJobManager = ClusterJobManager()
        clusterJobManager.enqueueJob(processNewClusterJob)
        logging.info(
            "Put process new cluster job for new doc. Cluster id: %s. %s",
            newCluster.id, linkAndJobId)

    # update the link
    link.tags[LINKTAG_ISPROCESSED] = 'true'
    linkManager.put(link)
    logger.info("Link updated after being successfully processed. %s.",
                linkAndJobId)

    logger.info("Completed processing link. %s.", linkAndJobId)