Example #1
def compareDocs(jobId, doc1Key, doc2Key):
    jobInfo = "Doc1 id: " + doc1Key + " Doc2 id: " + doc2Key \
              + ". Job id: " + jobId
    logger.info("Started comparing docs. %s", jobInfo)

    docManager = DocManager()
    doc1 = docManager.get(doc1Key)
    doc2 = docManager.get(doc2Key)

    score = 0
    if (doc1.tags[FEEDTAG_LANG] == LANG_ENGLISH) and \
        (doc2.tags[FEEDTAG_LANG] == LANG_ENGLISH):
        score = computeEnglishDocsSimScore(doc1, doc2)
        logger.info("Comparing using shingles. %s", jobInfo)
    else:
        score = computeDocSimScoreUsingEntities(doc1, doc2)
        logger.info("Comparing using entities. %s", jobInfo)

    if FEEDTAG_LOCALE in doc1.tags and FEEDTAG_LOCALE in doc2.tags and \
        doc1.tags[FEEDTAG_LOCALE] != doc2.tags[FEEDTAG_LOCALE]:

        logger.info(
            "The two docs are from different locations. Adding penalty. %s",
            jobInfo)
        score = score - 0.4
        if score < 0:
            score = 0
    logger.info("Comparision score: %s. %s", str(score), jobInfo)

    if score > SIMSCORE_MIN_THRESHOLD:
        distanceTableManager = DistanceTableManager()
        distanceTableManager.addEntry(doc1Key, doc2Key, score)
        logger.info("Added comparision score to distances table. %s", jobInfo)

    logger.info("Completed comparing docs. %s", jobInfo)
Example #2
def deepCleanStaleDocs():
    """
    Puts cleanup doc jobs for stale entries in shingles table
    Run this job rarely.
    """

    docManager = DocManager()
    jobManager = MinerJobManager()
    shingleTableManager = ShingleTableManager()
    docsToBeCleanedUp = []

    logging.info("Started scanning the shingle table")
    scanResults = shingleTableManager.scan()

    for entry in scanResults:
        try:
            docManager.get(entry[0])
        except S3ResponseError:
            staleDocId = entry[0]
            staleShingle = entry[1]
            logging.info("Stale entry found -> docId: %s, shingle: %s",
                         staleDocId, staleShingle)

            if staleDocId not in docsToBeCleanedUp:
                docsToBeCleanedUp.append(staleDocId)
                job = WorkerJob(JOB_CLEANUPDOC,
                                {JOBARG_CLEANUPDOC_DOCID: staleDocId})
                jobManager.enqueueJob(job)
                logging.info("Put cleanup doc job for docId: %s. Job id: %s",
                             staleDocId, job.jobId)

    logging.info("Number of stale docs deleted were: %i",
                 len(list(docsToBeCleanedUp)))
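Two notes on the snippet above: S3ResponseError is assumed to come from boto 2 (i.e. the doc store raises it when a key is missing), and docsToBeCleanedUp is only ever used for membership tests, so a set is the more idiomatic container; a minimal sketch of the same loop under those assumptions:

from boto.exception import S3ResponseError  # assumed: boto 2, S3-backed doc store

docsToBeCleanedUp = set()
for entry in shingleTableManager.scan():
    try:
        docManager.get(entry[0])
    except S3ResponseError:
        if entry[0] not in docsToBeCleanedUp:
            docsToBeCleanedUp.add(entry[0])  # O(1) membership vs. O(n) on a list
            # enqueue the cleanup job here, exactly once per stale doc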
Example #3
    def __init__(self):
        """
        Instantiates a new instance of ClusterManager class

        """

        self.clusterTableManager = ClusterTableManager()
        self.docManager = DocManager()
Example #4
  def process(self):
    """
    Processes the cluster to include metadata of consisting documents,
    and overall metadata of cluster like category, location, feeds, etc.
    """

    self.categories = []
    self.countries = []
    self.locales = []
    self.publishers = []
    self.languages = []
    self.articles = [] # contains non-duplicate articles
    self.duplicates = []
    self.lastPubTime = 0

    docManager = DocManager()
    docsAdded = []
    for docKey in super(Cluster, self).__iter__():
      doc = docManager.get(docKey)

      if not _isDuplicateArticle(docKey, docsAdded):
        self.articles.append({
          'title': doc.tags.get(LINKTAG_TITLE, ""),
          'publisher': doc.tags.get(TAG_PUBLISHER_DETAILS, ""),
          'link': doc.tags.get(DOCTAG_URL, "#"),
          'summaryText': doc.tags.get(LINKTAG_SUMMARYTEXT, ""),
          'images': _getImagesForDoc(doc),
          'lang': doc.tags.get(FEEDTAG_LANG, ""),
          'publishedOn': doc.tags.get(LINKTAG_PUBTIME, 0)
        })
        docsAdded.append(docKey)
      else:
        self.duplicates.append(docKey)

      if doc.tags.get(FEEDTAG_CATEGORY):
        self.categories.append(doc.tags[FEEDTAG_CATEGORY])
      if doc.tags.get(FEEDTAG_COUNTRY):
        self.countries.append(doc.tags[FEEDTAG_COUNTRY])
      if doc.tags.get(FEEDTAG_LOCALE):
        self.locales.append(doc.tags[FEEDTAG_LOCALE])
      if doc.tags.get(TAG_PUBLISHER):
        self.publishers.append(doc.tags[TAG_PUBLISHER])
      if doc.tags.get(FEEDTAG_LANG):
        self.languages.append(doc.tags[FEEDTAG_LANG])
      if doc.tags.get(LINKTAG_PUBTIME, 0) > self.lastPubTime:
        self.lastPubTime = doc.tags.get(LINKTAG_PUBTIME)

    nArticles = len(self.articles) + len(self.duplicates)
    # remove duplicates
    self.categories = _removeDuplicatesAndOutliers(self.categories, nArticles)
    self.countries = _removeDuplicatesAndOutliers(self.countries, nArticles)
    self.locales = _removeDuplicatesAndOutliers(self.locales, nArticles)
    self.publishers = list(set(self.publishers))
    self.languages = list(set(self.languages))
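_removeDuplicatesAndOutliers is referenced but not shown in this example; a plausible sketch, assuming it dedupes the values and drops any value attached to too small a fraction of the cluster's articles (the 0.25 cutoff is an assumption, not from the source):

from collections import Counter

def _removeDuplicatesAndOutliers(values, nArticles, minFraction=0.25):
    # Hypothetical: keep each distinct value only if it appears on at
    # least minFraction of the cluster's articles.
    counts = Counter(values)
    return [value for value, count in counts.items()
            if count >= minFraction * nArticles]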
Example #5
def parseDoc(jobId, docId):
    docAndJobId = "Doc id: " + docId + ". Job id: " + jobId
    logger.info("Started parsing doc. %s.", docAndJobId)

    docManager = DocManager()
    doc = docManager.get(docId)

    # compute and put shingles
    if doc.tags[FEEDTAG_LANG] == LANG_ENGLISH:
        shingles = th.getShingles(getDocEnglishSummaryText(doc), 3, 3)
        shingles = shingles + th.getShingles(getDocEnglishContent(doc), 3, 3)
        logger.info("Completed getting shingles. %s.", docAndJobId)
        shingles = list(set(shingles))
        logger.info("Number of unique shingles are %i. %s.", len(shingles),
                    docAndJobId)

        shingleTableManager = ShingleTableManager()
        shingleTableManager.addEntries(docId, shingles)
        logger.info("Added shingles to shingle table. %s.", docAndJobId)

    # compute and put entities
    entities = th.getEntities(getDocEnglishTitle(doc)) + \
        th.getEntities(getDocEnglishSummaryText(doc)) + \
        th.getEntities(getDocEnglishContent(doc))
    entities = list(set(entities))
    logger.info("Completed getting entities. %s.", docAndJobId)
    logger.info("Number of unique entities are %i. %s.", len(entities),
                docAndJobId)

    entityTableManager = EntityTableManager()
    entityTableManager.addEntries(docId, entities)
    logger.info("Added entities to entity table. %s.", docAndJobId)

    # store entity weights in the doc
    entityWeights = {}
    for entity in entities:
        entityWeight = entityTableManager.getEntityWeight(entity)
        entityWeights[entity] = entityWeight
    doc.tags[DOCTAG_ENTITY_WEIGHTS] = json.dumps(entityWeights)
    docManager.put(doc)
    logger.info("Added entity weights to doc. %s.", docAndJobId)

    job = WorkerJob(JOB_GETCANDIDATEDOCS,
                    {JOBARG_GETCANDIDATEDOCS_DOCID: docId})
    jobManager = MinerJobManager()
    jobManager.enqueueJob(job)
    logging.info("Put get candidate doc job with jobId: %s. %s", job.jobId,
                 docAndJobId)

    logger.info("Completed parsing doc. %s.", docAndJobId)
Example #6
def compareDocs(jobId, doc1Key, doc2Key):
    jobInfo = "Doc1 id: " + doc1Key + " Doc2 id: " + doc2Key \
              + ". Job id: " + jobId
    logger.info("Started comparing docs. %s", jobInfo)

    docManager = DocManager()
    doc1 = docManager.get(doc1Key)
    doc2 = docManager.get(doc2Key)

    score = getDocComparisionScore(jobInfo, doc1, doc2)

    if score > SIMSCORE_MIN_THRESHOLD:
        distanceTableManager = DistanceTableManager()
        distanceTableManager.addEntry(doc1Key, doc2Key, score)
        logger.info("Added comparision score to distances table. %s", jobInfo)

    logger.info("Completed comparing docs. %s", jobInfo)
Example #7
import Queue
from threading import Thread


def _getDocsInParallel(docKeys):
  que = Queue.Queue()
  threads_list = list()
  docManager = DocManager()
  for docKey in docKeys:
    t = Thread(
      target=lambda q, arg1: q.put(docManager.get(arg1)),
      args=(que, docKey))
    t.start()
    threads_list.append(t)

  for t in threads_list:
    t.join()

  docs = list()
  while not que.empty():
    docs.append(que.get())

  return docs
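A usage sketch (the keys are hypothetical); note that docs come back in completion order rather than in the order of docKeys, and DocManager.get is assumed to be safe to call from multiple threads:

docs = _getDocsInParallel(['doc-key-1', 'doc-key-2', 'doc-key-3'])
returnedKeys = [doc.key for doc in docs]  # completion order, not input order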
Example #8
class ClusterManager:
    """
    Manage clusters stored in cloud.
    """
    def __init__(self):
        """
        Instantiates a new instance of ClusterManager class

        """

        self.clusterTableManager = ClusterTableManager()
        self.docManager = DocManager()
        self.processedClusterStore = ProcessedClusterStore()

    def getProcessedCluster(self, cluster):
        return self.processedClusterStore.getProcessedCluster(cluster)

    def processNewCluster(self, cluster):
        cluster.isCurrent = 'true'
        cluster = self.processedClusterStore.processAndSaveCluster(cluster)
        self.clusterTableManager.addCluster(cluster)

    def __getProcessedClusterArticles(self, cluster):
        cluster = self.getProcessedCluster(cluster)
        return cluster.articles

    def __getClusterResponse(self, cluster, filters=None):
        articles = self.__getProcessedClusterArticles(
            self.__filterDocsInCluster(cluster, filters))

        title = articles[0]['title']
        description = articles[0]['title'] + " - " + \
            articles[0]['publisher'][PUBLISHER_DETAILS_NAME] + "."
        if len(articles) > 1:
            description += " " + articles[1]['title'] + " - " + \
              articles[1]['publisher'][PUBLISHER_DETAILS_NAME] + "."

        return {
            "articles": articles,
            "title": title,
            "description": description,
            "locales": cluster.locales,
            "languages": cluster.languages,
            "importance": self.__computeClusterRankingScore(cluster)
        }

    def __computeClusterRankingScore(self, cluster):
        return (0.3 * (len(cluster) - len(cluster.duplicates))) + \
            (0.7 * len(cluster.publishers))

    def __sortClustersByImportance(self, clusters):
        clusterList = list(clusters)
        clusterList.sort(key=self.__computeClusterRankingScore, reverse=True)
        return clusterList

    def __filterClusters(self, clusterList, filters):
        if not filters:
            return clusterList

        if CLUSTERS_FILTER_LANGUAGES in filters:
            clusterList = [
                cluster for cluster in clusterList
                if not set(filters[CLUSTERS_FILTER_LANGUAGES]).isdisjoint(
                    cluster.languages)]

        return clusterList

    def __filterDocsInCluster(self, cluster, filters):
        if not filters:
            return cluster

        filteredDocs = []

        for docKey in cluster:
            isDocAllowed = True
            doc = self.docManager.get(docKey)

            if CLUSTERS_FILTER_LANGUAGES in filters:
                if doc.tags[FEEDTAG_LANG] not in filters[
                        CLUSTERS_FILTER_LANGUAGES]:
                    isDocAllowed = False

            if isDocAllowed:
                filteredDocs.append(docKey)

        return Cluster(filteredDocs)

    def __constructQueryResponse(self, clusters, skip, top, filters=None):
        response = []
        clusterList = list(clusters)
        clusterList = self.__filterClusters(clusterList, filters)
        clusterList = self.__sortClustersByImportance(clusterList)

        for cluster in clusterList[skip:(skip + top)]:
            try:
                response.append(self.__getClusterResponse(cluster, filters))

            except Exception:
                logging.exception(
                    "Could not construct query response for cluster id %s",
                    cluster.id)
                continue

        return response
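As a worked example of the ranking formula above: a cluster of 5 documents, 1 of them a duplicate, spread across 3 distinct publishers scores 0.3 * (5 - 1) + 0.7 * 3 = 3.3, so publisher diversity outweighs raw cluster size.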
Example #9
def processLink(jobId, linkId):
    """
  Processes a link(takes as input the linkId)

  Steps:
  1. get link from database
  2. get publisher for that link from database
  3. get html for that link
  4. process that html to generate doc
  5. save that doc in docstore.
  6. update the link's is processed tag.
  """

    linkAndJobId = "Link id: " + linkId + ". Job id: " + jobId
    logger.info("Started processing link. %s.", linkAndJobId)

    # get the link
    linkManager = LinkManager()
    link = linkManager.get(linkId)
    logger.info("Got link from database. %s.", linkAndJobId)

    # get the publisher
    publisherManager = PublisherManager()
    publisher = publisherManager.get(link.tags[TAG_PUBLISHER])
    logger.info("Got publisher from database. Publisher id: %s. %s.",
                link.tags[TAG_PUBLISHER], linkAndJobId)

    # get html for the link
    processingResult = _processHtmlForLink(jobId, link, publisher)
    if not processingResult[0]:
        logger.warning("No text extracted for the link. %s.", linkAndJobId)

    # generate corresponding doc
    doc = Doc(_getDocKey(link), processingResult[0], link.tags)
    doc.tags[TAG_IMAGES] = processingResult[1]
    doc.tags[DOCTAG_URL] = linkId
    doc.tags[TAG_PUBLISHER_DETAILS] = _getPublisherDetails(publisher)
    doc = _addTranslationTags(jobId, doc)
    doc = _addSummaryIfNotPresent(doc)
    doc.tags[LINKTAG_HIGHLIGHTS] = _getDocHighlights(doc)

    # save the doc
    docManager = DocManager()
    docManager.put(doc)
    logger.info("Document generated and saved for link. Doc key %s. %s.",
                doc.key, linkAndJobId)

    # update the doc key in the links table
    link.tags[LINKTAG_DOCKEY] = doc.key
    linkManager.put(link)

    # put parse doc job
    parseDocJob = WorkerJob(JOB_PARSEDOC, {JOBARG_PARSEDOC_DOCID: doc.key})
    jobManager = MinerJobManager()
    jobManager.enqueueJob(parseDocJob)
    logger.info("Parse doc job with with jobId '%s' put. %s.",
                parseDocJob.jobId, linkAndJobId)

    if FEEDTAG_DO_NOT_CLUSTER not in doc.tags:
        newCluster = Cluster([doc.key])
        processNewClusterJob = WorkerJob(
            JOB_PROCESSNEWCLUSTER,
            {JOBARG_PROCESSNEWCLUSTER_CLUSTER: list(newCluster)})
        clusterJobManager = ClusterJobManager()
        clusterJobManager.enqueueJob(processNewClusterJob)
        logger.info(
            "Put process new cluster job for new doc. Cluster id: %s. %s",
            newCluster.id, linkAndJobId)

    # update the link
    link.tags[LINKTAG_ISPROCESSED] = 'true'
    linkManager.put(link)
    logger.info("Link updated after being successfully processed. %s.",
                linkAndJobId)

    logger.info("Completed processing link. %s.", linkAndJobId)
Example #10
class ClusterManager:
    """
    Manage clusters stored in cloud.
    """
    def __init__(self):
        """
        Instantiates a new instance of ClusterManager class

        """

        self.clusterTableManager = ClusterTableManager()
        self.docManager = DocManager()

    def processNewCluster(self, cluster):
        cluster.process()
        cluster.isCurrent = 'true'
        self.clusterTableManager.addCluster(cluster)

    def __getProcessedClusterArticles(self, cluster):
        cluster.process()
        return cluster.articles

    def __computeClusterRankingScore(self, cluster):
        return (0.4 * (len(cluster) - len(cluster.duplicates))) + \
            (0.6 * len(cluster.publishers))

    def __filterClusters(self, clusterList, filters):
        if not filters:
            return clusterList

        if CLUSTERS_FILTER_LANGUAGES in filters:
            clusterList = [
                cluster for cluster in clusterList
                if not set(filters[CLUSTERS_FILTER_LANGUAGES]).isdisjoint(
                    cluster.languages)]

        return clusterList

    def __filterDocsInCluster(self, cluster, filters):
        if not filters:
            return cluster

        filteredDocs = []

        for docKey in cluster:
            isDocAllowed = True
            doc = self.docManager.get(docKey)

            if CLUSTERS_FILTER_LANGUAGES in filters:
                if doc.tags[FEEDTAG_LANG] not in filters[
                        CLUSTERS_FILTER_LANGUAGES]:
                    isDocAllowed = False

            if isDocAllowed:
                filteredDocs.append(docKey)

        return Cluster(filteredDocs)

    def __constructQueryResponse(self, clusters, skip, top, filters=None):
        response = []
        clusterList = list(clusters)
        clusterList = self.__filterClusters(clusterList, filters)
        clusterList.sort(key=self.__computeClusterRankingScore, reverse=True)

        for cluster in clusterList[skip:(skip + top)]:
            try:
                response.append({
                    "articles":
                    self.__getProcessedClusterArticles(
                        self.__filterDocsInCluster(cluster, filters)),
                    "importance":
                    self.__computeClusterRankingScore(cluster)
                })

            except Exception:
                logging.exception(
                    "Could not construct query response for cluster id %s",
                    cluster.id)
                continue

        return response