示例#1
0
 def evaluate(self):
     """Run the loaded classifier over stored news and keep the positives.

     Loads the checkpoint named by ``config["evaluation"]["load"]``, puts the
     model in eval mode and feeds it the ``summery`` field of every document
     matched by ``config["evaluation"]["query"]`` in the source collection.
     When the arg-max class of the logits is ``1`` the document is inserted
     into the destination collection together with the output of the
     ``sentiment-analysis`` pipeline for the same summary.

     A failure on a single document is logged and the scan continues. If
     the server-side cursor is lost (``CursorNotFound``) the scan is
     restarted, skipping exactly the documents that were already handled.
     Progress is printed every 500 documents.
     """
     import torch  # function-scope import: only used for the no-grad guard below

     LoggerHelper.info("Evaluation Started...")
     nlp = pipeline('sentiment-analysis')
     eval_config = self.config["evaluation"]
     self.load_model(eval_config["load"])
     self.model.eval()
     self.timer.start()
     db = Mongo()
     news_collection = db.create_collection(eval_config["collection"])
     news_filtered = db.create_collection(eval_config["destination"],
                                          NewsOrganizer.get_index_models())
     count = 0
     processed = 0
     while True:
         # Opened outside the try so the finally clause can always close it;
         # a no_cursor_timeout cursor is never reaped by the server.
         cursor = news_collection.find(eval_config["query"],
                                       no_cursor_timeout=True).skip(processed)
         try:
             for news in cursor:
                 try:
                     summery = news.get('summery')
                     b_input_ids, b_input_mask = self.reader.get_one_news(summery)
                     b_input_ids = b_input_ids.to(self.device)
                     b_input_mask = b_input_mask.to(self.device)
                     # Inference only: do not build an autograd graph.
                     with torch.no_grad():
                         outputs = self.model(b_input_ids,
                                              token_type_ids=None,
                                              attention_mask=b_input_mask)
                     logits = outputs[0].detach().cpu().numpy()  # Move result to CPU
                     result = np.argmax(logits, axis=1).flatten()
                     if result[0] == 1:
                         # Sentiment is only stored for kept documents, so
                         # only compute it for them.
                         sentiment = nlp(summery)
                         # NOTE(review): if this is a PyMongo collection,
                         # `insert` is deprecated in favour of `insert_one`.
                         news_filtered.insert({
                             "_id": news.get('_id'),
                             "title": news.get('title'),
                             "summery": summery,
                             "article": news.get('article'),
                             "url": news.get('url'),
                             "category": news.get('category'),
                             "price_after_minute": news.get('price_after_minute'),
                             "price_after_hour": news.get('price_after_hour'),
                             "price_after_day": news.get('price_after_day'),
                             "sentiment": sentiment,
                             "price_before": news.get('price_before'),
                             "wiki_relatedness": news.get('wiki_relatedness'),
                             "tweet_count": news.get('tweet_count'),
                             "tweet_percentage": news.get('tweet_percentage'),
                             "date": news.get('date'),
                             "authors": news.get('authors'),
                             "comment": news.get('comment'),
                             "price_effect": news.get('price_effect')
                         })
                 except Exception as exception:
                     # Best effort: one bad document must not stop the scan.
                     Logger().get_logger().error(type(exception).__name__, exc_info=True)
                     traceback.print_exc()
                 count = count + 1
                 if count % 500 == 0:
                     print(count)
                 processed += 1
             break
         except CursorNotFound:
             # `processed` already equals the number of handled documents,
             # so the retry resumes at the first unhandled one.
             print("Lost cursor. Retry with skip")
         finally:
             cursor.close()
     self.timer.stop(time_for="Evaluation")
示例#2
0
    def dnn_organizer(self, collection="Product", key="BRTUSD"):
        """Copy queried news into the destination collection with prices.

        Every document matched by ``config["database"]["query"]`` in the
        source collection is inserted into the destination collection with
        its ``title``, ``summery`` and ``article`` run through
        ``PreProcessing.preprocess`` and with four price fields attached
        (``price_before``, ``price_after_minute``, ``price_after_hour``,
        ``price_after_day``).

        Args:
            collection: Passed to the price helpers as the collection to
                read prices from.
            key: Passed to the price helpers as the price key.

        A failure on a single document (price lookup, preprocessing, missing
        ``authors`` or the insert itself) is logged and the loop continues.
        """
        db = Mongo()
        pre_processing = PreProcessing()
        database_config = self.config["database"]
        news_collection = db.create_collection(database_config["collection"])
        news_filtered = db.create_collection(database_config["destination"],
                                             NewsOrganizer.get_index_models())

        for news in news_collection.find(database_config["query"]):
            try:
                # Price lookups are inside the try so that one failing
                # lookup skips this document instead of aborting the run,
                # as the wiki/tweets organizer already does.
                date = news.get('date')
                before = self.get_price_before_date(db, collection, key, date)
                minute = self.get_price_at_date(db, collection, key, date)
                hour = self.get_price_at_date(db, collection, key, date,
                                              minutes=60)
                day = self.get_price_at_date(db, collection, key, date,
                                             add_day=True)
                news_filtered.insert({
                    "_id": news.get('_id'),
                    "title": pre_processing.preprocess(news.get('title')),
                    "summery": pre_processing.preprocess(news.get('summery')),
                    "article": pre_processing.preprocess(news.get('article')),
                    "url": news.get('url'),
                    "category": news.get('category'),
                    "price_after_minute": minute,
                    "price_after_hour": hour,
                    "price_after_day": day,
                    "price_before": before,
                    "date": date,
                    # Indexing (not .get): a document without authors is
                    # rejected via the except branch below.
                    "authors": news['authors']
                })
            except Exception as exception:
                Logger().get_logger().error(type(exception).__name__,
                                            exc_info=True)
                traceback.print_exc()
示例#3
0
    def organize(self):
        """Copy usable documents from ``News`` into ``FilteredNews``.

        For each document in ``News`` the article, date and summary are
        extracted through the ``NewsOrganizer`` helpers. A document whose
        article is ``None``, whose article is empty, whose date is falsy or
        whose summary is falsy is not copied; its ``_id`` is appended to the
        matching rejection file named in the config (``Article_None``,
        ``Article_Empty``, ``Date_None``, ``Summery_None``).

        A failure while building or inserting a document is logged and the
        loop continues.
        """
        db = Mongo()
        news_collection = db.create_collection("News")
        news_filtered = db.create_collection("FilteredNews",
                                             NewsOrganizer.get_index_models())

        def rejection_file(name):
            # The rejection files used to be read from config["log"] for the
            # article checks and config["Log"] for the date/summary checks,
            # so one of the two pairs raised KeyError. Accept either casing.
            for section in ("log", "Log"):
                try:
                    return self.config[section][name]
                except KeyError:
                    continue
            raise KeyError(name)

        for news in news_collection.find():
            article = NewsOrganizer.get_article(news)
            if article is None:
                FileHelper.append_to_file(rejection_file("Article_None"),
                                          news["_id"])
                continue
            if article == "":
                FileHelper.append_to_file(rejection_file("Article_Empty"),
                                          news["_id"])
                continue
            date = NewsOrganizer.get_date(news)
            if not date:
                FileHelper.append_to_file(rejection_file("Date_None"),
                                          news["_id"])
                continue
            summery = NewsOrganizer.get_summery(news)
            if not summery:
                FileHelper.append_to_file(rejection_file("Summery_None"),
                                          news["_id"])
                continue
            try:
                news_filtered.insert({
                    "title": NewsOrganizer.get_title(news),
                    "summery": summery,
                    "category": NewsOrganizer.get_category(news),
                    "date": date,
                    "article": article,
                    # Indexing (not .get): a document missing any of these
                    # raw fields is rejected via the except branch below.
                    "url": news['URL'],
                    "canonical_link": news['Canonical_Link'],
                    "authors": news['Authors']
                })
            except Exception as exception:
                Logger().get_logger().error(type(exception).__name__,
                                            exc_info=True)
                traceback.print_exc()
示例#4
0
    def collect(self):
        """Print a month-by-month table of ``FilteredNews`` counts per category.

        Prints a tab-separated header with ``self.categories``, then one row
        per calendar month starting in January of ``START_YEAR`` and
        continuing while the end of the window lies before ``END_YEAR``.
        Each row holds ``year.month``, the total number of documents dated
        in that month and the count for each category.

        Category names are lower-cased by the aggregation, so entries of
        ``self.categories`` must be lower case to match.
        """
        socket.setdefaulttimeout(120)  # 120 seconds
        db = Mongo()

        start = datetime(self.START_YEAR, 1, 1, 0, 0, 0, 0)
        end = datetime(self.START_YEAR, 1 + 1, 1, 0, 0, 0, 0)
        collection = db.create_collection("FilteredNews")
        print("\t", end='\t')
        for category in self.categories:
            print(category, end='\t')
        print()
        while end.year < self.END_YEAR:
            # The total used to filter on 'RSS_Date' while the per-category
            # aggregation filtered on 'date'; both now use the same field so
            # the total and the category cells describe the same documents.
            month_filter = {'date': {'$gte': start, '$lt': end}}
            # NOTE(review): Cursor.count is deprecated in newer PyMongo
            # releases in favour of Collection.count_documents.
            count = collection.find(month_filter).count(False)
            # Get Category Count: one document mapping category -> count.
            result = collection.aggregate([
                {'$match': month_filter},
                {"$group": {"_id": {"$toLower": "$category"},
                            "count": {"$sum": 1}}},
                {"$group": {"_id": None,
                            "counts": {"$push": {"k": "$_id", "v": "$count"}}}},
                {"$replaceRoot": {"newRoot": {"$arrayToObject": "$counts"}}},
            ])
            print(str(start.year) + "." + str(start.month) + " \t " + str(count), end='\t')
            list_result = list(result)
            # A month without documents yields no aggregation output; still
            # print a zero per category so the row lines up with the header.
            per_category = list_result[0] if list_result else {}
            for category in self.categories:
                print(per_category.get(category, '0'), end='\t')
            print()
            start = Statistics.add_one_month(start)
            end = Statistics.add_one_month(end)
示例#5
0
 def dnn_organizer_for_dnn_filtered_news(self):
     """Merge DNN-filtered news with prices and their original text record.

     For every document matched by ``config["dnnfiltered"]["query"]`` in the
     source collection, looks up four prices for its ``date`` (hard-coded
     ``"Product"`` / ``"BRTUSD"``) and fetches the record for the same URL
     from ``config["dnnfiltered"]["text_collection"]`` through
     ``get_news_for_link``. The merged document is inserted into the
     destination collection; fields suffixed ``_o`` and the metadata fields
     come from the fetched record (all ``None`` when no record is found).

     A failure on a single document is logged and the scan continues. If
     the server-side cursor is lost (``CursorNotFound``) the scan is
     restarted, skipping exactly the documents that were already handled.
     Progress is printed every 500 documents.
     """
     db = Mongo()
     settings = self.config["dnnfiltered"]
     collection = settings["text_collection"]
     news_collection = db.create_collection(settings["collection"])
     news_filtered = db.create_collection(settings["destination"],
                                          NewsOrganizer.get_index_models())
     count = 0
     processed = 0
     while True:
         # Opened outside the try so the finally clause can always close it;
         # a no_cursor_timeout cursor is never reaped by the server.
         cursor = news_collection.find(settings["query"],
                                       no_cursor_timeout=True).skip(processed)
         try:
             for news in cursor:
                 try:
                     url = news.get('url')
                     date = news.get('date')
                     before = self.get_price_before_date(
                         db, "Product", "BRTUSD", date)
                     minute = self.get_price_at_date(
                         db, "Product", "BRTUSD", date)
                     hour = self.get_price_at_date(
                         db, "Product", "BRTUSD", date, minutes=60)
                     day = self.get_price_at_date(
                         db, "Product", "BRTUSD", date, add_day=True)
                     info = self.get_news_for_link(db,
                                                   collection,
                                                   url,
                                                   fields=None)
                     if info is None:
                         info = {}
                     news_filtered.insert({
                         "_id": news.get('_id'),
                         "title": news.get('title'),
                         "title_o": info.get('title'),
                         # Was news.get('title'): copy-paste slip that stored
                         # the title in the summary field.
                         "summery": news.get('summery'),
                         "summery_o": info.get('summery'),
                         "article": news.get('article'),
                         "article_o": info.get('article'),
                         "url": url,
                         "category": info.get('category'),
                         "price_after_minute": minute,
                         "price_after_hour": hour,
                         "price_after_day": day,
                         "price_before": before,
                         "wiki_relatedness": info.get('wiki_relatedness'),
                         "tweet_count": info.get('tweet_count'),
                         "tweet_percentage": info.get('tweet_percentage'),
                         "date": date,
                         "authors": info.get('authors'),
                         "comment": info.get('comment'),
                         "wiki_relatedness_nor": info.get('wiki_relatedness_nor'),
                         "tweet_count_nor": info.get('tweet_count_nor'),
                         "price_effect": info.get('price_effect')
                     })
                 except Exception as exception:
                     # Best effort: one bad document must not stop the scan.
                     Logger().get_logger().error(type(exception).__name__,
                                                 exc_info=True)
                     traceback.print_exc()
                 count = count + 1
                 if count % 500 == 0:
                     print(count)
                 processed += 1
             break
         except CursorNotFound:
             # `processed` already equals the number of handled documents,
             # so the retry resumes at the first unhandled one.
             print("Lost cursor. Retry with skip")
         finally:
             cursor.close()
示例#6
0
 def dnn_organizer_with_wiki_tweets(self,
                                    collection="Product",
                                    key="BRTUSD",
                                    name="Brent Crude"):
     """Copy queried news with prices, wiki relatedness and tweet popularity.

     Every document matched by ``config["database"]["query"]`` in the source
     collection is inserted into the destination collection with its text
     fields preprocessed, four price fields, the wiki similarity of its
     summary (``wiki_relatedness``) and the tweet total/percentage returned
     by ``get_popularity_from_elastic_search``.

     Args:
         collection: Passed to the price helpers as the collection to read
             prices from.
         key: Passed to the price helpers as the price key.
         name: Passed to ``WikiForecast.get_similarity`` as ``title``.

     A failure on a single document is logged and the scan continues. If
     the server-side cursor is lost (``CursorNotFound``) the scan is
     restarted, skipping exactly the documents that were already handled.
     Progress is printed every 500 documents.
     """
     db = Mongo()
     pre_processing = PreProcessing()
     database_config = self.config["database"]
     news_collection = db.create_collection(database_config["collection"])
     news_filtered = db.create_collection(database_config["destination"],
                                          NewsOrganizer.get_index_models())
     wiki_forecast = WikiForecast()
     twitter_forecast = TwitterForecast()
     if self.config["elasticSearch"]["enableTag"]:
         tags = twitter_forecast.get_pre_defined_tags()
     else:
         tags = {"tags": []}
     count = 0
     processed = 0
     while True:
         # Opened outside the try so the finally clause can always close it;
         # a no_cursor_timeout cursor is never reaped by the server.
         cursor = news_collection.find(database_config["query"],
                                       no_cursor_timeout=True).skip(processed)
         try:
             for news in cursor:
                 try:
                     summery = pre_processing.preprocess(news.get('summery'))
                     summery_similarity = wiki_forecast.get_similarity(
                         summery, title=name)
                     date = news.get('date')
                     title = pre_processing.preprocess(news.get('title'))
                     before = self.get_price_before_date(
                         db, collection, key, date)
                     minute = self.get_price_at_date(
                         db, collection, key, date)
                     hour = self.get_price_at_date(
                         db, collection, key, date, minutes=60)
                     day = self.get_price_at_date(
                         db, collection, key, date, add_day=True)
                     # NOTE(review): assumes preprocess() returns a list so
                     # it can be concatenated with the tag list — confirm.
                     total, percentage = twitter_forecast.get_popularity_from_elastic_search(
                         date,
                         title + tags["tags"],
                         pre_processing,
                         maxsize=self.config["elasticSearch"]["maxSize"])
                     news_filtered.insert({
                         "_id": news.get('_id'),
                         "title": title,
                         # Reuse the summary preprocessed above instead of
                         # preprocessing the same text a second time.
                         "summery": summery,
                         "article": pre_processing.preprocess(news.get('article')),
                         "url": news.get('url'),
                         "category": news.get('category'),
                         "price_after_minute": minute,
                         "price_after_hour": hour,
                         "price_after_day": day,
                         "price_before": before,
                         "wiki_relatedness": summery_similarity,
                         "tweet_count": total,
                         "tweet_percentage": percentage,
                         "date": date,
                         # Indexing (not .get): a document without authors
                         # is rejected via the except branch below.
                         "authors": news['authors']
                     })
                 except Exception as exception:
                     # Best effort: one bad document must not stop the scan.
                     Logger().get_logger().error(type(exception).__name__,
                                                 exc_info=True)
                     traceback.print_exc()
                 count = count + 1
                 if count % 500 == 0:
                     print(count)
                 processed += 1
             break
         except CursorNotFound:
             # `processed` already equals the number of handled documents,
             # so the retry resumes at the first unhandled one.
             print("Lost cursor. Retry with skip")
         finally:
             cursor.close()