def evaluate(self):
    """Classify news summaries with the fine-tuned model and keep positives.

    Loads the configured checkpoint, runs every document matched by the
    evaluation query through the model, and copies documents whose predicted
    class is 1 into the destination collection, together with a HuggingFace
    sentiment-analysis score for the summary. When MongoDB drops the cursor
    (CursorNotFound), resumes from the last processed offset via skip().
    """
    LoggerHelper.info("Evaluation Started...")
    nlp = pipeline('sentiment-analysis')
    self.load_model(self.config["evaluation"]["load"])
    self.model.eval()  # inference mode (disables dropout etc.)
    self.timer.start()
    db = Mongo()
    news_collection = db.create_collection(self.config["evaluation"]["collection"])
    news_filtered = db.create_collection(self.config["evaluation"]["destination"],
                                         NewsOrganizer.get_index_models())
    count = 0
    processed = 0
    while True:
        try:
            cursor = news_collection.find(self.config["evaluation"]["query"],
                                          no_cursor_timeout=True).skip(processed)
            for news in cursor:
                try:
                    summery = news.get('summery')
                    b_input_ids, b_input_mask = self.reader.get_one_news(summery)
                    b_input_ids = b_input_ids.to(self.device)
                    b_input_mask = b_input_mask.to(self.device)
                    outputs = self.model(b_input_ids, token_type_ids=None,
                                         attention_mask=b_input_mask)
                    logits = outputs[0].detach().cpu().numpy()  # Move result to CPU
                    result = np.argmax(logits, axis=1).flatten()
                    if result[0] == 1:
                        # BUG FIX: the original referenced `sentiment` in the
                        # insert below while its assignment was commented out,
                        # so every insert raised NameError (silently logged by
                        # the handler below) and nothing was ever written.
                        sentiment = nlp(summery)
                        news_filtered.insert({
                            "_id": news.get('_id'),
                            "title": news.get('title'),
                            "summery": news.get('summery'),
                            "article": news.get('article'),
                            "url": news.get('url'),
                            "category": news.get('category'),
                            "price_after_minute": news.get('price_after_minute'),
                            "price_after_hour": news.get('price_after_hour'),
                            "price_after_day": news.get('price_after_day'),
                            "sentiment": sentiment,
                            "price_before": news.get('price_before'),
                            "wiki_relatedness": news.get('wiki_relatedness'),
                            "tweet_count": news.get('tweet_count'),
                            "tweet_percentage": news.get('tweet_percentage'),
                            "date": news.get('date'),
                            "authors": news.get('authors'),
                            "comment": news.get('comment'),
                            "price_effect": news.get('price_effect')
                        })
                except Exception as exception:
                    Logger().get_logger().error(type(exception).__name__, exc_info=True)
                    traceback.print_exc()
                count = count + 1
                if count % 500 == 0:
                    print(count)
                processed += 1
            cursor.close()
            break
        except CursorNotFound:
            # Server timed the cursor out; reissue the query skipping what we did.
            processed += 1
            print("Lost cursor. Retry with skip")
    self.timer.stop(time_for="Evaluation")
def dnn_organizer(self, collection="Product", key="BRTUSD"):
    """Copy queried news into the destination collection with cleaned text.

    Each document gets preprocessed title/summary/article text plus the
    product price just before publication and at +1 minute / +1 hour /
    +1 day afterwards. Insert failures are logged and skipped.
    """
    db = Mongo()
    cleaner = PreProcessing()
    source = db.create_collection(self.config["database"]["collection"])
    destination = db.create_collection(self.config["database"]["destination"],
                                       NewsOrganizer.get_index_models())
    for doc in source.find(self.config["database"]["query"]):
        published = doc.get('date')
        price_before = self.get_price_before_date(db, collection, key, published)
        price_minute = self.get_price_at_date(db, collection, key, published)
        price_hour = self.get_price_at_date(db, collection, key, published, minutes=60)
        price_day = self.get_price_at_date(db, collection, key, published, add_day=True)
        try:
            destination.insert({
                "_id": doc.get('_id'),
                "title": cleaner.preprocess(doc.get('title')),
                "summery": cleaner.preprocess(doc.get('summery')),
                "article": cleaner.preprocess(doc.get('article')),
                "url": doc.get('url'),
                "category": doc.get('category'),
                "price_after_minute": price_minute,
                "price_after_hour": price_hour,
                "price_after_day": price_day,
                "price_before": price_before,
                "date": published,
                "authors": doc['authors']
            })
        except Exception as exception:
            Logger().get_logger().error(type(exception).__name__, exc_info=True)
            traceback.print_exc()
def organize(self):
    """Filter raw "News" documents into "FilteredNews".

    Documents missing an article, date or summary are skipped and their
    _id is appended to the corresponding log file from config; everything
    else is inserted as a normalized record. Insert failures are logged
    and iteration continues.
    """
    db = Mongo()
    news_collection = db.create_collection("News")
    news_filtered = db.create_collection("FilteredNews", NewsOrganizer.get_index_models())
    for news in news_collection.find():
        article = NewsOrganizer.get_article(news)
        if article is None:
            FileHelper.append_to_file(self.config["log"]["Article_None"], news["_id"])
            continue
        if article == "":
            FileHelper.append_to_file(self.config["log"]["Article_Empty"], news["_id"])
            continue
        date = NewsOrganizer.get_date(news)
        if not date:
            # NOTE(review): config key casing differs from the article checks
            # above ("Log" here vs "log" there) -- confirm the config file
            # really defines both sections, otherwise one path raises KeyError.
            FileHelper.append_to_file(self.config["Log"]["Date_None"], news["_id"])
            continue
        summery = NewsOrganizer.get_summery(news)
        if not summery:
            FileHelper.append_to_file(self.config["Log"]["Summery_None"], news["_id"])
            continue
        try:
            news_filtered.insert({
                "title": NewsOrganizer.get_title(news),
                "summery": summery,
                "category": NewsOrganizer.get_category(news),
                "date": date,
                "article": article,
                "url": news['URL'],
                "canonical_link": news['Canonical_Link'],
                "authors": news['Authors']
            })
        except Exception as exception:
            Logger().get_logger().error(type(exception).__name__, exc_info=True)
            traceback.print_exc()
def collect(self):
    """Print a month-by-month table of filtered-news counts per category.

    Walks one-month windows from START_YEAR until END_YEAR, printing the
    total document count for the window followed by one column per known
    category (0 when a category has no documents that month).
    """
    socket.setdefaulttimeout(120)  # 120 seconds
    db = Mongo()
    start = datetime(self.START_YEAR, 1, 1, 0, 0, 0, 0)
    end = datetime(self.START_YEAR, 2, 1, 0, 0, 0, 0)
    collection = db.create_collection("FilteredNews")
    # Header row: blank corner cell, then one column per category.
    print("\t", end='\t')
    for label in self.categories:
        print(label, end='\t')
    print()
    while end.year < self.END_YEAR:
        # NOTE(review): the total uses 'RSS_Date' while the per-category
        # aggregation below matches on 'date' -- confirm this is intentional.
        total = collection.find({'RSS_Date': {'$gte': start, '$lt': end}}).count(False)
        stages = [
            {'$match': {'date': {'$gte': start, '$lt': end}, }},
            # Count documents per lower-cased category...
            {"$group": {"_id": {"$toLower": "$category"}, "count": {"$sum": 1}}},
            # ...then fold the groups into a single {category: count} document.
            {"$group": {"_id": None,
                        "counts": {"$push": {"k": "$_id", "v": "$count"}}}},
            {"$replaceRoot": {"newRoot": {"$arrayToObject": "$counts"}}},
        ]
        rows = list(collection.aggregate(stages))
        print(str(start.year) + "." + str(start.month) + " \t " + str(total), end='\t')
        for row in rows:
            for label in self.categories:
                if label in row:
                    print(row[label], end='\t')
                else:
                    print('0', end='\t')
        print()
        start = Statistics.add_one_month(start)
        end = Statistics.add_one_month(end)
def dnn_organizer_for_dnn_filtered_news(self):
    """Enrich DNN-filtered news with product prices and original metadata.

    For every document matched by the configured query, looks up Brent
    prices around the publication date and the original article record
    (by URL) from the text collection, then inserts the merged document
    into the destination collection. Resumes via skip() when MongoDB
    drops the cursor.
    """
    db = Mongo()
    collection = self.config["dnnfiltered"]["text_collection"]
    news_collection = db.create_collection(self.config["dnnfiltered"]["collection"])
    news_filtered = db.create_collection(self.config["dnnfiltered"]["destination"],
                                         NewsOrganizer.get_index_models())
    count = 0
    processed = 0
    while True:
        try:
            cursor = news_collection.find(self.config["dnnfiltered"]["query"],
                                          no_cursor_timeout=True).skip(processed)
            for news in cursor:
                try:
                    url = news.get('url')
                    date = news.get('date')
                    before = self.get_price_before_date(db, "Product", "BRTUSD", date)
                    minute = self.get_price_at_date(db, "Product", "BRTUSD", date)
                    hour = self.get_price_at_date(db, "Product", "BRTUSD", date, minutes=60)
                    day = self.get_price_at_date(db, "Product", "BRTUSD", date, add_day=True)
                    info = self.get_news_for_link(db, collection, url, fields=None)
                    if info is None:
                        info = {}  # original article not found; *_o fields become None
                    news_filtered.insert({
                        "_id": news.get('_id'),
                        "title": news.get('title'),
                        "title_o": info.get('title'),
                        # BUG FIX: the original stored news.get('title') here
                        # (copy/paste slip); every other organizer in this file
                        # stores the document's own summary in "summery".
                        "summery": news.get('summery'),
                        "summery_o": info.get('summery'),
                        "article": news.get('article'),
                        "article_o": info.get('article'),
                        "url": url,
                        "category": info.get('category'),
                        "price_after_minute": minute,
                        "price_after_hour": hour,
                        "price_after_day": day,
                        "price_before": before,
                        "wiki_relatedness": info.get('wiki_relatedness'),
                        "tweet_count": info.get('tweet_count'),
                        "tweet_percentage": info.get('tweet_percentage'),
                        "date": date,
                        "authors": info.get('authors'),
                        "comment": info.get('comment'),
                        "wiki_relatedness_nor": info.get('wiki_relatedness_nor'),
                        "tweet_count_nor": info.get('tweet_count_nor'),
                        "price_effect": info.get('price_effect')
                    })
                except Exception as exception:
                    Logger().get_logger().error(type(exception).__name__, exc_info=True)
                    traceback.print_exc()
                count = count + 1
                if count % 500 == 0:
                    print(count)
                processed += 1
            cursor.close()
            break
        except CursorNotFound:
            # Server timed the cursor out; reissue the query skipping what we did.
            processed += 1
            print("Lost cursor. Retry with skip")
def dnn_organizer_with_wiki_tweets(self, collection="Product", key="BRTUSD", name="Brent Crude"):
    """Copy queried news with prices plus Wikipedia and Twitter signals.

    For every matched document: preprocesses the text, computes similarity
    of the summary to the named Wikipedia topic, looks up prices around the
    publication date, and queries ElasticSearch for tweet popularity of the
    title terms (plus optional predefined tags). Resumes via skip() when
    MongoDB drops the cursor.
    """
    db = Mongo()
    pre_processing = PreProcessing()
    news_collection = db.create_collection(self.config["database"]["collection"])
    news_filtered = db.create_collection(self.config["database"]["destination"],
                                         NewsOrganizer.get_index_models())
    wiki_forecast = WikiForecast()
    twitter_forecast = TwitterForecast()
    if self.config["elasticSearch"]["enableTag"]:
        tags = twitter_forecast.get_pre_defined_tags()
    else:
        tags = {"tags": []}
    count = 0
    processed = 0
    while True:
        try:
            cursor = news_collection.find(self.config["database"]["query"],
                                          no_cursor_timeout=True).skip(processed)
            for news in cursor:
                try:
                    summery = pre_processing.preprocess(news.get('summery'))
                    summery_similarity = wiki_forecast.get_similarity(summery, title=name)
                    date = news.get('date')
                    # NOTE(review): `title + tags["tags"]` below suggests
                    # preprocess() returns a token list -- confirm.
                    title = pre_processing.preprocess(news.get('title'))
                    before = self.get_price_before_date(db, collection, key, date)
                    minute = self.get_price_at_date(db, collection, key, date)
                    hour = self.get_price_at_date(db, collection, key, date, minutes=60)
                    day = self.get_price_at_date(db, collection, key, date, add_day=True)
                    total, percentage = twitter_forecast.get_popularity_from_elastic_search(
                        date, title + tags["tags"], pre_processing,
                        maxsize=self.config["elasticSearch"]["maxSize"])
                    news_filtered.insert({
                        "_id": news.get('_id'),
                        "title": title,
                        # PERF: reuse the summary preprocessed above instead of
                        # running preprocess() a second time on the same text
                        # (assumes preprocess is deterministic).
                        "summery": summery,
                        "article": pre_processing.preprocess(news.get('article')),
                        "url": news.get('url'),
                        "category": news.get('category'),
                        "price_after_minute": minute,
                        "price_after_hour": hour,
                        "price_after_day": day,
                        "price_before": before,
                        "wiki_relatedness": summery_similarity,
                        "tweet_count": total,
                        "tweet_percentage": percentage,
                        "date": date,
                        "authors": news['authors']
                    })
                except Exception as exception:
                    Logger().get_logger().error(type(exception).__name__, exc_info=True)
                    traceback.print_exc()
                count = count + 1
                if count % 500 == 0:
                    print(count)
                processed += 1
            cursor.close()
            break
        except CursorNotFound:
            # Server timed the cursor out; reissue the query skipping what we did.
            processed += 1
            print("Lost cursor. Retry with skip")