def test_basic():
    """Smoke-test the topic classification client.

    Checks one real headline for an exact label, then exercises the
    client with gibberish and an empty string (return values unchecked;
    these calls only verify the service does not raise).
    """
    headline = "Pentagon might propose ground troops for Syria"
    assert client.classify(headline) == "U.S."
    print('test_basic passed!')
    # Robustness probes: no assertions, just make sure classify() survives.
    gibberish = "alksjdfl;kj aslkdjflkaj alksjd falksdjf lkasdjflkasd flkas jsldkfj alskdf "
    client.classify(gibberish)
    client.classify("")
示例#2
0
def handle_message(msg):
    print "handle_message from dedupe queue"
    if msg is None or not isinstance(msg, dict):
        system_log_client.logger.warn("Message is broken!")
        # print "Message is broken!"
        return
    #stop using str to convert from unicode to encoded text/bytes
    task = msg
    text = str(task['text'].encode('utf-8'))

    if text is None:
        return

    # get all recent news based on publishedAt
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    recent_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if recent_news_list is not None and len(recent_news_list) > 0:
        documents = [
            str(news['text'].encode('utf-8')) for news in recent_news_list
        ]
        documents.insert(0, text)

        # Calculate similarity matrix
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        print pairwise_sim.A
        rows, _ = pairwise_sim.shape

        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # Duplicated news. Ignore.
                # print "Duplicated news. Ignore."
                system_log_client.logger.warn("Duplicated news. Ignore.")
                return
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news
    # title = task['title']
    # if title is None:
    topic = news_topic_modeling_service_client.classify(task['description'])
    task['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']},
                                    task,
                                    upsert=True)
示例#3
0
def handle_message(msg):
    """Save deduplicated news into MongoDB.

    Fix: the original guard `if msg is not isinstance(msg, dict)`
    compared msg by identity against a boolean, so it never rejected
    None or non-dict messages; replaced with an explicit check.
    """
    if msg is None or not isinstance(msg, dict):
        logging.error("[news_deduper] news is not dict")
        return

    task = msg
    text = task['text']
    if text is None:
        logging.error("[news_deduper] text attribute is none")
        return

    # get all recent news based on publishedAt (same-day window)
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    database = mongodb_client.get_db()
    same_day_news_list = list(database[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    # if it is not the first news of the day
    if same_day_news_list is not None and len(same_day_news_list) > 0:
        # get main content; incoming text goes first so column 0 of the
        # similarity matrix compares it against every stored news
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        # calculate similarity matrix
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        rows, _ = pairwise_sim.shape

        # scan first column except first element
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                logging.info("[news_deduper] ignore deduplicated news")
                return

    # store publishedAt as a datetime so MongoDB range queries work
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # classify news by title when available
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic

    logging.info("[news_deduper] save news into MongoDB, title = %s", title)
    database[NEWS_TABLE_NAME].replace_one({'digest': task['digest']},
                                          task,
                                          upsert=True)
示例#4
0
def handle_message(msg):
    """Deduplicate one news task and upsert it into MongoDB.

    Drops the message when it is malformed or when its TF-IDF cosine
    similarity to any same-day stored news exceeds
    SAME_NEWS_SIMILARITY_THRESHOLD; otherwise classifies it and
    replaces/inserts it keyed on 'digest'.
    """
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    #     text = str(task['text'])
    text = task['text'].encode('utf-8')
    # NOTE(review): dead check — .encode() never returns None, and a
    # None 'text' would already have raised above.
    if text is None:
        return

    # get all recent news based on publishedAt
    # (same-day window: [midnight, midnight + 1 day))
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [
            news['text'].encode('utf-8') for news in same_day_news_list
        ]
        # Incoming text first: column 0 of the similarity matrix then
        # holds its similarity to every stored same-day news.
        documents.insert(0, text)

        # Calculate similarity matrix
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        print pairwise_sim.A

        rows, _ = pairwise_sim.shape

        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # Duplicated news. Ignore.
                print "Duplicated news. Ignore."
                return
    # Store publishedAt as a datetime so MongoDB range queries work.
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news
    title = task['title']
    description = task['description']
    print "========== description start ========="
    print task['description']
    print "========== description end ========="
    # NOTE(review): guards on title but classifies description — confirm
    # this mismatch is intended.
    if title is not None:
        topic = news_topic_modeling_service_client.classify(description)
        task['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']},
                                    task,
                                    upsert=True)
示例#5
0
def handle_message(msg):
    """Deduplicate a news task, classify it, store it, and emit a metric.

    Fixes: the None-check on text now runs before .encode() (encode on
    None raised AttributeError and the old post-encode check was dead),
    and the graphite metric is sent only when a 'class' was actually
    assigned (title may be None, in which case task['class'] previously
    raised KeyError after the news was already saved).
    """
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    # Bail out before encoding: .encode() on None would raise.
    if task['text'] is None:
        return
    text = task['text'].encode('utf8')

    # Same-day window: [midnight of publishedAt, next midnight).
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day,
                                               0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'].encode('utf8') for news in same_day_news_list]
        # Incoming text first: column 0 of the similarity matrix then
        # compares it against every stored same-day news.
        documents.insert(0, text)

        # calculate similarity matrix
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        rows, _ = pairwise_sim.shape

        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                logger.debug("News_deduper : Duplicated news. Ignore")
                return

    # need to transfer string to datetime format when storing in MongoDB
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # classify news by title when available
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic

    # if there is the same news, then replace
    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)

    # Send the metrics to graphite (only when a class was assigned).
    if 'class' in task:
        metrics = 'news.' + task['source'] + '.' + task['class'].split(' ')[0]
        graphite.send(metrics, 1)
def handle_message(msg):
  """Deduplicate a news message (with tokenizing/stemming) and save it.

  Compares the incoming text against all news stored for the same day
  using TF-IDF over tokenized/stemmed documents; duplicates are dropped,
  new news is classified (description, falling back to title) and
  upserted by digest.
  """
  if not isinstance(msg, dict):
    print('message is broken')
    return

  text = msg['text']
  if text is None:
    logger.warning('text is none')
    return

  # Same-day window: [midnight of publishedAt, next midnight).
  published_at = parser.parse(msg['publishedAt'])
  published_at_day_begin = datetime.datetime(published_at.year, published_at.month, published_at.day, 0, 0, 0, 0)
  published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

  db = mongodb_client.get_db()
  same_day_news_list = list(db[NEWS_TABLE_NAME].find({
      'publishedAt': {
        '$gte': published_at_day_begin,
        '$lt': published_at_day_end
      }
    }))

  if same_day_news_list is not None and len(same_day_news_list) > 0:
    documents = [news['text'] for news in same_day_news_list]
    # Incoming text first: column 0 of the similarity matrix then
    # compares it against every stored same-day news.
    documents.insert(0, text)

    # add tokenizer and stemmer
    tfidf = TfidfVectorizer(tokenizer = tokenize)
    processed_document = process_document(documents)
    tfs = tfidf.fit_transform(processed_document)
    pairwise_sim = tfs * tfs.T
    # logger.debug("Pairwise Sim:%s", str(pairwise_sim))
    rows, _ = pairwise_sim.shape
    # Track the highest similarity seen, for the log line below.
    maxSim = -1
    for row in range(1, rows):
      if pairwise_sim[row, 0] > maxSim:
          maxSim = pairwise_sim[row, 0]
      if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
        logger.info("Duplicated news. Ignore. Similarity: %.6f" %pairwise_sim[row, 0])
        return
    logger.info("Find one new news. Largest similarity: %.6f among %d news", maxSim, len(same_day_news_list))
  else:
    logger.info("Find one new news. No same day news.")

  # Store publishedAt as a datetime so MongoDB range queries work.
  msg['publishedAt'] = parser.parse(msg['publishedAt'])

  # Classify by description, falling back to title when absent.
  description = msg['description']
  if description is None:
    description = msg['title']

  topic = news_topic_modeling_service_client.classify(description)
  # for every new news, it will add the label class before saved into db
  msg['class'] = topic

  # update or insert
  db[NEWS_TABLE_NAME].replace_one({ 'digest': msg['digest'] }, msg, upsert=True)
示例#7
0
def handle_message(msg):
    """Drop near-duplicate news and upsert the rest into MongoDB.

    A message is ignored when it is malformed, lacks text, or its TF-IDF
    cosine similarity to any same-day stored news exceeds the threshold.
    Surviving news is classified by topic (description, falling back to
    title) and saved keyed on its digest.
    """
    if msg is None or not isinstance(msg, dict):
        logger.warning('message is broken')
        return

    text = msg['text']
    if text is None:
        return

    # Same-day window: [midnight of publishedAt, midnight of next day).
    when = parser.parse(msg['publishedAt'])
    day_start = datetime.datetime(when.year, when.month, when.day, 0, 0, 0, 0)
    day_end = day_start + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    candidates = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {'$gte': day_start, '$lt': day_end}
    }))

    if candidates:
        # Incoming text goes first, so column 0 of the similarity matrix
        # compares it against every stored candidate.
        documents = [text] + [news['text'] for news in candidates]

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        logger.debug("Pairwise Sim:%s", str(pairwise_sim))

        num_rows = pairwise_sim.shape[0]
        for idx in range(1, num_rows):
            if pairwise_sim[idx, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                logger.info("Duplicated news. Ignore.")
                return

    # Store publishedAt as a datetime so MongoDB range queries work.
    msg['publishedAt'] = parser.parse(msg['publishedAt'])

    description = msg['description']
    if description is None:
        description = msg['title']

    # before storing news to database, call news_topic_modeling_service to give it a topic
    msg['class'] = news_topic_modeling_service_client.classify(description)

    db[NEWS_TABLE_NAME].replace_one({'digest': msg['digest']},
                                    msg,
                                    upsert=True)
def handle_message(msg):
    """Deduplicate a news message against a ±1-day window and store it.

    Unlike the same-day variants, the candidate window here spans from
    one day before to one day after publishedAt. Duplicates (TF-IDF
    cosine similarity above the threshold) are dropped; surviving news
    is classified and upserted by digest.
    """
    if msg is None or not isinstance(msg, dict):
        logger.warning('message is broken')
        return

    text = msg['text']
    if text is None:
        return

    # get all recent news based on publishedAt.
    # parser converts the date string to a datetime
    published_at = parser.parse(msg['publishedAt'])
    published_at_begin = published_at - datetime.timedelta(days=1)
    published_at_end = published_at + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    # $gte: greater or equal, $lt: less than
    recent_news_list = list(db[NEWS_TABLE_NAME].find(
        {'publishedAt': {
            '$gte': published_at_begin,
            '$lt': published_at_end
        }}))

    # same TF-IDF pairwise-similarity approach as tf_idf_test.py
    if recent_news_list is not None and len(recent_news_list) > 0:
        documents = [news['text'] for news in recent_news_list]
        # Incoming text first so column 0 compares it to each candidate.
        documents.insert(0, text)

        # Calculate similarity matrix.
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        logger.debug("Pairwise Sim:%s", str(pairwise_sim))
        # only the row count is needed; the column count is discarded
        rows, _ = pairwise_sim.shape

        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                logger.info('Duplicated news. Ignore.')
                return

    # Store publishedAt as a datetime so MongoDB range queries work.
    msg['publishedAt'] = parser.parse(msg['publishedAt'])

    # Classify news by description, falling back to full text.
    description = msg['description']
    if description is None:
        description = msg['text']

    topic = news_topic_modeling_service_client.classify(description)
    msg['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': msg['digest']},
                                    msg,
                                    upsert=True)
示例#9
0
def handle_message(msg):
    """Deduplicate a news task against a ±1-day window and store it.

    Prints diagnostics at each step, classifies by title, and upserts
    the surviving task keyed on its digest.
    """
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = str(task['text'])
    # NOTE(review): dead check — str() never returns None.
    if text is None:
        return

    # Get recent news from mongodb
    # Window is [publishedAt - 1 day, publishedAt + 1 day).
    published_at = parser.parse(task['publishedAt'])
    print(published_at)
    published_at_day_begin = published_at - datetime.timedelta(days=1)
    print(published_at_day_begin)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=2)
    print(published_at_day_end)

    db = mongodb_client.get_db()
    recent_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))
    print(len(recent_news_list))

    if recent_news_list is not None and len(recent_news_list) > 0:
        documents = [str(news['text']) for news in recent_news_list]
        # Incoming text first so column 0 compares it to each candidate.
        documents.insert(0, text)

        # Calculate similarity
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(documents)

        pairwise_sim = X * X.T
        print(pairwise_sim.A)
        rows, _ = pairwise_sim.shape

        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                # Dupilicated news
                print("Dupilicated news, ignore")
                return

    # Store publishedAt as a datetime so MongoDB range queries work.
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news by title when available
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']},
                                    task,
                                    upsert=True)
示例#10
0
def handle_message(msg):
    """Deduplicate a news task against same-day news and store it."""
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task['text']
    if text is None:
        return

    # Same-day window: [midnight of publishedAt, next midnight).
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)
    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        print pairwise_sim

        rows, _ = pairwise_sim.shape

        # Skip row 0 (the incoming news itself); if any other entry in
        # column 0 exceeds the threshold, the article is a duplicate.
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                print "Duplicated news. Ignore."
                return

    # MongoDB date-range queries need a datetime, so convert the
    # publishedAt string before storing.
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # classify news
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic

    # upsert=True: replace when the digest already exists, insert otherwise
    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']},
                                    task,
                                    upsert=True)
示例#11
0
def handle_message(msg):
    #if msg is None or not isinstance(msg, dict):
    #return

    task = msg
    text = task['text']
    if text is None:
        #print 'how are you'
        return

    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)
    print 'hello'
    db = mongo_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))
    print 'how are you'
    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        print pairwise_sim

        rows, _ = pairwise_sim.shape

        for row in range(1, rows):
            if pairwise_sim[row, 0] > config['news_deduper'][
                    'SAME_NEWS_SIMILARITY_THRESHOLD']:
                print "Duplicated news. Ignore."
                return
    print 'what about'
    task['publishedAt'] = parser.parse(task['publishedAt'])
    #Classified news
    print 'title is title'
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic
    print 'what is wrong'
    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']},
                                    task,
                                    upsert=True)
示例#12
0
    def handle_messages(self, msg):
        """Deduplicate one message and save it; return True when stored.

        Returns False for broken messages and for news whose TF-IDF
        similarity to a same-day article in self.db[self.collection]
        exceeds self.sameNewsThreshold.
        """
        print "handle message from dedupe queue"
        if msg is None or not isinstance(msg, dict):
            print "message is broken"
            return False

        task = msg
        text = str(task['text'].encode('utf-8'))
        # NOTE(review): dead check — str() never returns None.
        if text is None:
            return False

        # Get all recent news published on the same day as this one.
        published_at = parser.parse(task['publishedAt'])
        published_at_day_begin = datetime.datetime(published_at.year, published_at.month, published_at.day, 0, 0, 0, 0)
        published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)
        recent_news_list = list(self.db[self.collection].find({'publishedAt':{'$gte':published_at_day_begin, '$lt': published_at_day_end}}))
        print "get recent news list"
        if recent_news_list is not None and len(recent_news_list) > 0:
            documents = [str(news['text'].encode('ascii', 'ignore')) for news in recent_news_list]
            # Incoming text first so column 0 compares it to each candidate.
            documents.insert(0, text)

            # calculate similarity matrix
            tfidf = TfidfVectorizer().fit_transform(documents)
            pairwise_sim = tfidf * tfidf.T
            print pairwise_sim.A
            rows, _ = pairwise_sim.shape

            for row in range(1, rows):
                if pairwise_sim[row, 0] > self.sameNewsThreshold:
                    # Duplicated news. Ignore.
                    print "Duplicated news. Ignore."
                    return False
        # Store publishedAt as a datetime so MongoDB range queries work.
        task['publishedAt'] = parser.parse(task['publishedAt'])
        title = task['title'].encode('ascii', 'ignore')
        source = task['source'].encode('ascii')
        url = task['url'].encode('ascii')
        print title
        print source
        print url
        if title is not None:
            # Prefer the source/url-based classifier; fall back to the
            # ML topic-modeling service when it yields nothing.
            topic = classifier.classify(source, url)
            if topic is not None:
                print "Get topic %s by url" % topic
            else:
                topic = news_topic_modeling_service_client.classify(title)
                print "Learn topic %s by ml" % topic
            task['class'] = topic

        self.db[self.collection].replace_one({'digest': task['digest']}, task, upsert=True)
        return True
示例#13
0
def handle_message(msg):
    """Deduplicate a news task against same-day news and upsert it."""
    if msg is None or not isinstance(msg, dict) :
        return

    task = msg
    text = task['text']
    if text is None:
        return

    # Get all recent news based on publishedAt (same-day window)
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year, published_at.month, published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    # convert the cursor returned from mongodb to list
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({'publishedAt': {'$gte': published_at_day_begin,'$lt': published_at_day_end}}))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [str(news['text']) for news in same_day_news_list]
        # Incoming text first so column 0 compares it to each candidate.
        documents.insert(0, text)

        # calculate similarity matrix
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T
        print(pairwise_sim.A)

        rows, _ = pairwise_sim.shape

        # Skip row 0 (the news itself); any other similarity in column 0
        # above the threshold marks the article as a duplicate.
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                print("Duplicated news. Ignore.")
                return

    # Before storing into MongoDB, convert the published time to a
    # datetime so the news can later be queried by publishedAt.
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news by title when available
    title = task['title']
    if title is not None:
        topic = news_topic_modeling_service_client.classify(title)
        task['class'] = topic

    # upsert=True: if there is no news found, then insert; otherwise replace
    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']}, task, upsert=True)
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        return
    task = msg
    text = task['text']
    if text is None:
        return

    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            'lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        print pairwise_sim

        rows, _ = pairwise_sim.shape

        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                print 'Duplicated news. Ignore.'
                return
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news
    description = task['description']
    if description is not None:
        topic = news_topic_modeling_service_client.classify(description)
        task['class'] = topic

    db[NEWS_TABLE_NAME].replace_one({'digest': task['digest']},
                                    task,
                                    upsert=True)
示例#15
0
def handle_message(msg):
  """Deduplicate a news message against a ±1-day window and store it.

  Fix: the duplicate check ran only when recent_news_list was None
  (`is None and len(...) > 0` can never be true — and would raise
  TypeError if it were reached), so deduplication was effectively
  disabled; the condition is now `is not None`.
  """
  if msg is None or not isinstance(msg, dict):
    logger.warning('message is broken')
    return

  text = msg['text']
  if text is None:
    return

  # get all recent news based on publishedAt (±1-day window)
  published_at = parser.parse(msg['publishedAt'])
  published_at_begin = published_at - datetime.timedelta(days=1)
  published_at_end = published_at + datetime.timedelta(days=1)

  db = mongodb_client.get_db()
  recent_news_list = list(db[NEWS_TABLE_NAME].find({'publishedAt': {'$gte': published_at_begin, '$lt': published_at_end}}))

  if recent_news_list is not None and len(recent_news_list) > 0:
    # Incoming text first: column 0 of the similarity matrix then
    # compares it against every stored recent news.
    documents = [news['text'] for news in recent_news_list]
    documents.insert(0, text)

    tfidf = TfidfVectorizer().fit_transform(documents)
    pairwise_sim = tfidf * tfidf.T

    logger.debug("Pairwise Sim:%s", str(pairwise_sim))

    rows, _ = pairwise_sim.shape

    for row in range(1, rows):
      if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
          # duplicate news, skip
          logger.info("Duplicated news. Ignore.")
          return

  # Store publishedAt as a datetime so MongoDB range queries work.
  msg['publishedAt'] = parser.parse(msg['publishedAt'])

  # Classify news by description, falling back to title.
  description = msg['description']
  if description is None:
    description = msg['title']

  topic = news_topic_modeling_service_client.classify(description)
  msg['class'] = topic

  db[NEWS_TABLE_NAME].replace_one({'digest': msg['digest']}, msg, upsert=True)
示例#16
0
def handle_message(message):
    """Deduplicate a news message against same-day news and upsert it."""
    if message is None or not isinstance(message, dict):
        print 'Invalid message!'
        return
    task = message
    text = task['text']

    if text is None:
        return

    # Same-day window: [midnight of publishedAt, next midnight).
    published_at = parser.parse(task['publishedAt'])
    published_start_time = datetime.datetime(published_at.year, published_at.month, published_at.day)
    published_end_time = published_start_time + datetime.timedelta(days = 1)

    db = mongodb_client.get_db()
    same_day_news_list = list(db[DB_COLLECTION_NAME].find({'publishedAt': {'$gte': published_start_time, '$lt': published_end_time}}))

    if same_day_news_list is not None and len(same_day_news_list) > 0:

        documents = [news['text'] for news in same_day_news_list]
        # Incoming text first so column 0 compares it to each candidate.
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        print pairwise_sim

        rows, _ = pairwise_sim.shape

        for row in range(1, rows):
            if pairwise_sim[row, 0] > DUPLICATE_THRESHOLD:
                print 'Duplicate news!'

                return

    # Store publishedAt as a datetime so MongoDB range queries work.
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify by title, falling back to description when title is None.
    title = task['title']
    if title is None:
        title = task['description']
    topic = news_topic_modeling_service_client.classify(title)
    task['class'] = topic

    db[DB_COLLECTION_NAME].replace_one({'digest': task['digest']}, task, upsert = True)
def handle_message(msg):
    """Deduplicate a news message against a ±1-day window and store it.

    Fix: validate the incoming message before subscripting it — a None
    or non-dict payload previously raised TypeError on msg['text'],
    unlike the sibling handlers which all guard against broken messages.
    """
    if msg is None or not isinstance(msg, dict):
        return

    text = msg['text']

    if text is None:
        return

    # Get all recent news based on publishedAt (±1-day window)
    published_at = parser.parse(msg['publishedAt'])
    published_at_begin = published_at - datetime.timedelta(days=1)
    published_at_end = published_at + datetime.timedelta(days=1)

    db = mongodb_client.get_db()
    recent_news_list = list(db[NEWS_TABLE_NAME].find({'publishedAt': {'$gte':published_at_begin, '$lt':published_at_end}}))

    if recent_news_list is not None and len(recent_news_list) > 0:
        # Incoming text first: column 0 of the similarity matrix then
        # compares it against every stored recent news.
        documents = [news['text'] for news in recent_news_list]
        documents.insert(0, text)

        # Calculate similarity matrix
        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        logger.debug('Pairwise sim:%s', str(pairwise_sim))

        rows, _ = pairwise_sim.shape

        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                logger.info('Duplicated news. Ignore.')
                return

    # Store publishedAt as a datetime so MongoDB range queries work.
    msg['publishedAt'] = parser.parse(msg['publishedAt'])

    # Classify news by description, falling back to full text.
    description = msg['description']
    if description is None:
        description = msg['text']

    topic = news_topic_modeling_service_client.classify(description)
    msg['class'] = topic
    db[NEWS_TABLE_NAME].replace_one({'digest':msg['digest']}, msg, upsert=True)
示例#18
0
import os
import sys

# import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongodb_client
import news_topic_modeling_service_client as modeling_client

# modeling classes of all news in mongodb
if __name__ == "__main__":
    # Backfill: walk every stored news document and attach a topic
    # 'class' to those that lack one, classifying the description
    # (falling back to the title when description is None).
    db = mongodb_client.get_db()
    cursor = db['news'].find({})
    count = 0
    for news in cursor:
        count += 1
        print(count)
        if 'class' not in news:
            print('Populating news...')
            description = news['description']
            if description is None:
                description = news['title']
            topic = modeling_client.classify(description)
            news['class'] = topic
            # Upsert keyed on digest so the document is replaced in place.
            db['news'].replace_one({'digest': news['digest']}, news, upsert=True)
示例#19
0
def test_basic():
    """Verify the classifier returns some topic for a real headline."""
    headline = "Pentagon might propose ground troops for Syria"
    assert client.classify(headline) is not None
    print('test_basic passed!')
def test_basic():
    """Classify the bare title "Technology" and expect the exact label."""
    label = client.classify("Technology")
    print(label)
    assert label == "Politics & Government"
    print("basic test passed!")
def test_basic():
    """Classify a tech headline and expect the exact expected label."""
    headline = "Microsoft reveals its new web service platform name: nana"
    assert client.classify(headline) == "U.S."
    print('test_basic passed!')
示例#22
0
def handle_message(msg):
    """Dedupe one incoming news message against same-day articles,
    classify it, and upsert it into MongoDB keyed by digest.

    msg: dict with at least 'text', 'publishedAt', 'digest', 'title',
         'description' and 'source'. Broken payloads are logged and dropped.
    """
    if msg is None or not isinstance(msg, dict):
        logging.error('news_monitor: message from news_to_dedupe is broken')
        return

    task = msg
    # Guard BEFORE encoding: None has no .encode(), so the original
    # post-encode None check could never fire (it would raise first).
    if task.get('text') is None:
        return
    text = task['text'].encode('utf-8')

    # Same-day window [midnight, next midnight) where duplicates may appear.
    published_at = parser.parse(task['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    # Fetch suspected duplicates published in that window.
    db = mongodb_client.get_db()
    same_day_news_list = list(db[news_table_name].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    # Ignore the message if it is too similar to any same-day article.
    if same_day_news_list:
        documents = [
            news['text'].encode('utf-8') for news in same_day_news_list
        ]
        # Candidate text goes first so row 0's column gives its similarity
        # to every stored article.
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T

        print('pairwise_sim calculated')

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > similarity_threshold:
                logging.info('news_deduper: news ignored as duplicated')
                return

    # Store the parsed datetime so Mongo range queries ($gte/$lt) work.
    task['publishedAt'] = parser.parse(task['publishedAt'])

    # Classify news: build one lowercase, punctuation-free, stopword-free
    # string from title + description + source. Fields may be None in the
    # feed, so default them to '' instead of crashing on concatenation.
    title = task['title'] or ''
    desc = task['description'] or ''
    src = task['source'] or ''
    info = title + ' ' + desc + ' ' + src
    remove_punctuation_map = dict(
        (ord(char), None) for char in string.punctuation)
    info = info.translate(remove_punctuation_map).lower()
    # Hoist the stopword list into a set once; the original re-read the
    # corpus and scanned a list for every token.
    stop_words = set(stopwords.words('english'))
    info_filtered_tokens = [
        word for word in word_tokenize(info) if word not in stop_words
    ]
    info_filtered_line = ' '.join(info_filtered_tokens)

    # Only ask the modeling service when there is actual text left
    # (the original `is not None` test was always true for a join result).
    if info_filtered_line:
        topic = news_topic_modeling_service_client.classify(info_filtered_line)
        task['class'] = topic

    # Single upsert keyed by digest. The original wrote the document twice
    # (once before classification, once after); one write suffices.
    db[news_table_name].replace_one({'digest': task['digest']},
                                    task,
                                    upsert=True)
    logging.info('news_deduper: news classified and obtained')
def test_basic():
    # A Syria headline should be labeled "World"; report the actual
    # label on failure.
    predicted = client.classify("Pentagon might propose ground troops for Syria")
    assert predicted == "World", 'but topic is %s' % predicted
    print('test_basic passed!')
def test_basic():
    # Smoke test: the classifier should return *some* label for a headline.
    label = client.classify("Baby orangutans rescued from pet trade")
    assert label is not None
    print(label)
    print('test_basic passed!')
示例#25
0
import os
import sys

# import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongodb_client
import news_topic_modeling_service_client

if __name__ == '__main__':
    # Backfill: label every news document that is still missing a class.
    db = mongodb_client.get_db()
    cursor = db['news'].find({})
    count = 0
    # Fix: `for news not in cursor` is a SyntaxError; iterate directly.
    for news in cursor:
        count += 1
        print(count)
        # Fix: the original tested `'class' in news`, which re-classified
        # only already-labeled documents and skipped the unlabeled ones.
        if 'class' not in news:
            print('Populating classes...')
            description = news['description']
            if description is None:
                description = news['title']
            topic = news_topic_modeling_service_client.classify(description)
            news['class'] = topic
            db['news'].replace_one({'digest': news['digest']}, news, upsert=True)
示例#26
0
import os
import sys

# import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongodb_client
import news_topic_modeling_service_client

if __name__ == '__main__':
    # Backfill: label every test-news document still missing a class.
    db = mongodb_client.get_db()
    cursor = db['test-news'].find({})
    count = 0
    for news in cursor:
        count += 1
        print(count)
        # Fix: the original condition ('class' in news) was flagged with a
        # TODO and skipped exactly the documents that needed populating.
        if 'class' not in news:
            print('Populating classes...')
            title = news['title']
            if title is None:
                title = news['description']
            topic = news_topic_modeling_service_client.classify(title)
            news['class'] = topic
            db['test-news'].replace_one({'digest': news['digest']},
                                        news,
                                        upsert=True)
示例#27
0
def test_basic():
    # A Syria headline should map to this label under the current model.
    result = client.classify("Pentagon might propose ground troops for Syria")
    assert result == "Politics & Government"
    print('test_basic passed!')
示例#28
0
def handle_message(msg):
    """Deduplicate an incoming news message against same-day articles,
    classify it, and upsert it into MongoDB keyed by digest."""
    if not isinstance(msg, dict):  # rejects None and any non-dict payload
        logger.warning('message is broken')
        return

    text = msg['text']
    if text is None:
        return

    # Compute the same-day window [midnight, next midnight) of the
    # article's publish date.
    published_at = parser.parse(msg['publishedAt'])
    published_at_day_begin = datetime.datetime(published_at.year,
                                               published_at.month,
                                               published_at.day, 0, 0, 0, 0)
    published_at_day_end = published_at_day_begin + datetime.timedelta(days=1)

    # Fetch every article published inside that window. Note: MongoDB must
    # store publishedAt as a datetime (not a string) for $gte/$lt to
    # compare correctly.
    db = mongodb_client.get_db()
    same_day_news_list = list(db[NEWS_TABLE_NAME].find({
        'publishedAt': {
            '$gte': published_at_day_begin,
            '$lt': published_at_day_end
        }
    }))

    if same_day_news_list is not None and len(same_day_news_list) > 0:
        documents = [news['text'] for news in same_day_news_list]

        # The candidate text goes first, so row 0's entries are its
        # similarities to every stored article.
        documents.insert(0, text)

        tfidf = TfidfVectorizer().fit_transform(documents)
        pairwise_sim = tfidf * tfidf.T  # N x N cosine-similarity matrix

        logger.debug("Pairwise Sim:%s", str(pairwise_sim))

        rows, _ = pairwise_sim.shape
        for row in range(1, rows):
            if pairwise_sim[row, 0] > SAME_NEWS_SIMILARITY_THRESHOLD:
                logger.info("Duplicated news. Ignore.")
                return

    # Not a duplicate: store the parsed timestamp, classify, and persist.
    msg['publishedAt'] = parser.parse(msg['publishedAt'])

    # Bug fix: the key was misspelled 'discription', raising KeyError on
    # every message that survived dedup.
    description = msg['description']
    if description is None:
        description = msg['title']

    topic = news_topic_modeling_service_client.classify(description)
    msg['class'] = topic

    # upsert=True: insert if the digest is new, replace if it exists.
    print('saving to mongoDB')
    try:
        db[NEWS_TABLE_NAME].replace_one({'digest': msg['digest']},
                                        msg,
                                        upsert=True)
    except Exception as e:
        logger.warning(e)
示例#29
0
import os
import sys
# classify news in mongodb to a specific class
# import common package in parent directory
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))

import mongodb_client
import news_topic_modeling_service_client

if __name__ == '__main__':
    # Backfill pass: classify every unlabeled document found in 'news'.
    db = mongodb_client.get_db()
    count = 0
    for news in db['news'].find({}):
        count += 1
        print(count)
        if 'class' not in news:
            print('Populating classes...')
            text = news['description']
            if text is None:
                text = news['title']
            news['class'] = news_topic_modeling_service_client.classify(text)
            # NOTE(review): reads from 'news' but writes to 'news-test' --
            # confirm the collection mismatch is intentional.
            db['news-test'].replace_one({'digest': news['digest']}, news, upsert=True)
def test_basic():
    # A Syria headline is expected to be labeled "World".
    label = client.classify("Syria chemical attack 'fabricated' - Assad")
    assert label == "World"
    print('test_basic passed!')
示例#31
0
def test_basic():
    # No assertion on the exact label; just show what the classifier returns.
    headline = "Pentagon might propose ground troops for Syria"
    print(client.classify(headline))
def test_basic():
    # A Syria headline should map to this label under the current model.
    expected = "Politics & Government"
    topic = client.classify("Pentagon might propose ground troops for Syria")
    assert topic == expected
    print('test_basic passed!')