def save_to_db(self):
    mongo = Mongo()
    try:
        mongo.insert(self.row)
    except Exception:
        print(self.row)
        Logger().get_logger().error('Insert Error', exc_info=True)
def collect(self):
    socket.setdefaulttimeout(120)  # 120 seconds
    db = Mongo()
    start = datetime(self.START_YEAR, 1, 1, 0, 0, 0, 0)
    end = datetime(self.START_YEAR, 1 + 1, 1, 0, 0, 0, 0)
    collection = db.create_collection("FilteredNews")
    print("\t", end='\t')
    for category in self.categories:
        print(category, end='\t')
    print()
    while end.year < self.END_YEAR:
        count = collection.find({'RSS_Date': {'$gte': start, '$lt': end}}).count(False)
        # Get Category Count
        result = collection.aggregate([
            {'$match': {'date': {'$gte': start, '$lt': end}}},
            {'$group': {'_id': {'$toLower': '$category'}, 'count': {'$sum': 1}}},
            {'$group': {'_id': None, 'counts': {'$push': {'k': '$_id', 'v': '$count'}}}},
            {'$replaceRoot': {'newRoot': {'$arrayToObject': '$counts'}}}
        ])
        print(str(start.year) + "." + str(start.month) + " \t " + str(count), end='\t')
        list_result = list(result)
        for item in list_result:
            for category in self.categories:
                if category in item:
                    print(item[category], end='\t')
                else:
                    print('0', end='\t')
        print()
        start = Statistics.add_one_month(start)
        end = Statistics.add_one_month(end)
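# Statistics.add_one_month is used above but not shown here. A minimal sketch of
# what it is assumed to do (advance the window by one month, rolling the year
# over after December); the actual helper in the project may differ.
def add_one_month(date):
    if date.month == 12:
        return date.replace(year=date.year + 1, month=1)
    return date.replace(month=date.month + 1)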
def collect(self):
    sites = self.read_website_collection()
    socket.setdefaulttimeout(120)  # 120 seconds
    db = Mongo()
    count = 0
    for info in sites:
        (site, category) = info.split(" ")
        siteHistory = archivecdx.Listing(site,
                                         fl=["original", "timestamp", "digest", "statuscode"],
                                         filter=["statuscode:200"])
        print("Size of List :" + str(len(siteHistory.listing)))
        for history in siteHistory:
            timestamp = datetime.strptime(history.timestamp, "%Y%m%d%H%M%S")
            link = 'http://web.archive.org/web/%sid_/%s' % (history.timestamp, history.original)
            print('(%d) - Archive Link : %s - %s' % (count, link, str(datetime.today())))
            # if site == "http://feeds.bbci.co.uk/news/business/rss.xml":
            #     if history.timestamp in self.Pass_List:  # Control
            #         continue
            try:
                d = feedparser.parse(link)
            except Exception as exception:
                print("FeedParser Timeout ?")
                Logger().get_logger().error(type(exception).__name__, exc_info=True)
                continue  # skip this snapshot: no parsed feed is available
            newslist = []
            for post in d.entries:
                try:
                    count = count + 1
                    if db.already_exists(post.link):
                        continue
                    if post.published_parsed:
                        try:
                            dt = datetime.fromtimestamp(mktime(post.published_parsed))
                        except AttributeError:
                            dt = ''
                    else:
                        dt = ''
                    article = Article(post.link)
                    newslist.append(
                        News.RssNews(title=post.title,
                                     time=dt,
                                     summery=post.summary,
                                     category=category,
                                     tags='',
                                     url=post.link,
                                     iaurl=('http://web.archive.org/web/%sid_/%s'
                                            % (history.timestamp, post.link)),
                                     article=article))
                except Exception as exception:
                    Logger().get_logger().error(type(exception).__name__, exc_info=True)
            pool = NewsPool()
            pool.set(newslist)
            pool.join()
def parse_currency(currency_key, directory, name):  # Type : 1 - Currency
    print("Currency")
    col = Mongo().create_collection("Currency", FDC.get_index_models())
    with open(directory) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        print(currency_key)
        hour = -1
        fd = None
        for row in csv_reader:
            if len(row) < 2:  # Check Data
                continue
            add_value = 0
            if currency_key == "EURUSD":
                date = DateHelper.str2date(row[0])
                add_value = -1
            else:
                date = DateHelper.str2date(row[0] + row[1])
            if hour != date.hour:
                hour = date.hour
                if fd is not None:
                    try:
                        col.insert(fd.get_currency())
                    except Exception:
                        Logger().get_logger().error('Insert Error', exc_info=True)
                fd = FinancialData(name, currency_key, date,
                                   row[FDLocations.Currency_Open.value + add_value],
                                   row[FDLocations.Currency_High.value + add_value],
                                   row[FDLocations.Currency_Low.value + add_value],
                                   row[FDLocations.Currency_Close.value + add_value])
            else:
                fd.add(row[FDLocations.Currency_High.value + add_value],
                       row[FDLocations.Currency_Low.value + add_value],
                       row[FDLocations.Currency_Close.value + add_value])
        if fd is not None:  # flush the last accumulated hour, which the loop never inserts
            try:
                col.insert(fd.get_currency())
            except Exception:
                Logger().get_logger().error('Insert Error', exc_info=True)
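# A minimal usage sketch for the parser above; the CSV path and display name are
# placeholders, and the CSV layout is assumed to match the FDLocations column map
# defined elsewhere in the project.
parse_currency("EURUSD", "data/EURUSD_2019.csv", "Euro / US Dollar")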
def parse_index_datetime(currency_key, directory, name, interval):  # Type : 4 - Index
    col = Mongo().create_collection("Index")
    with open(directory) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        print(currency_key)
        hour = -1
        hour_count = 0
        fd = None
        for row in csv_reader:
            if len(row) < 2:  # Check Data
                continue
            date = DateHelper.str2date(row[0] + row[1])
            if hour != date.hour:
                hour = date.hour
                hour_count = 0
                if fd is not None:
                    print(fd)
                    try:
                        col.insert(fd.get_index())
                    except Exception:
                        Logger().get_logger().error('Insert Error', exc_info=True)
                fd = FinancialData(name, currency_key, date,
                                   row[FDLocations.IndexDateTime_Open.value],
                                   row[FDLocations.IndexDateTime_High.value],
                                   row[FDLocations.IndexDateTime_Low.value],
                                   row[FDLocations.IndexDateTime_Close.value])
            else:
                fd.add(row[FDLocations.IndexDateTime_High.value],
                       row[FDLocations.IndexDateTime_Low.value],
                       row[FDLocations.IndexDateTime_Close.value])
            hour_count += 1
            line_count += 1
        if fd is not None:  # flush the last accumulated hour, which the loop never inserts
            try:
                col.insert(fd.get_index())
            except Exception:
                Logger().get_logger().error('Insert Error', exc_info=True)
        print(f'Processed {line_count} lines.')
def evaluate(self):
    LoggerHelper.info("Evaluation Started...")
    nlp = pipeline('sentiment-analysis')
    self.load_model(self.config["evaluation"]["load"])
    self.model.eval()
    self.timer.start()
    db = Mongo()
    news_collection = db.create_collection(self.config["evaluation"]["collection"])
    news_filtered = db.create_collection(self.config["evaluation"]["destination"],
                                         NewsOrganizer.get_index_models())
    count = 0
    processed = 0
    while True:
        try:
            cursor = news_collection.find(self.config["evaluation"]["query"],
                                          no_cursor_timeout=True).skip(processed)
            for news in cursor:
                try:
                    summery = news.get('summery')
                    b_input_ids, b_input_mask = self.reader.get_one_news(summery)
                    b_input_ids, b_input_mask = b_input_ids.to(self.device), b_input_mask.to(self.device)
                    outputs = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
                    logits = outputs[0].detach().cpu().numpy()  # Move result to CPU
                    result = np.argmax(logits, axis=1).flatten()
                    sentiment = nlp(summery)  # sentiment score stored with the document below
                    if result[0] == 1:
                        news_filtered.insert({
                            "_id": news.get('_id'),
                            "title": news.get('title'),
                            "summery": news.get('summery'),
                            "article": news.get('article'),
                            "url": news.get('url'),
                            "category": news.get('category'),
                            "price_after_minute": news.get('price_after_minute'),
                            "price_after_hour": news.get('price_after_hour'),
                            "price_after_day": news.get('price_after_day'),
                            "sentiment": sentiment,
                            "price_before": news.get('price_before'),
                            "wiki_relatedness": news.get('wiki_relatedness'),
                            "tweet_count": news.get('tweet_count'),
                            "tweet_percentage": news.get('tweet_percentage'),
                            "date": news.get('date'),
                            "authors": news.get('authors'),
                            "comment": news.get('comment'),
                            "price_effect": news.get('price_effect')
                        })
                except Exception as exception:
                    Logger().get_logger().error(type(exception).__name__, exc_info=True)
                    traceback.print_exc()
                count = count + 1
                if count % 500 == 0:
                    print(count)
                processed += 1
            cursor.close()
            break
        except CursorNotFound:
            processed += 1
            print("Lost cursor. Retry with skip")
    self.timer.stop(time_for="Evaluation")
def dnn_organizer(self, collection="Product", key="BRTUSD"):
    db = Mongo()
    pre_processing = PreProcessing()
    news_collection = db.create_collection(self.config["database"]["collection"])
    news_filtered = db.create_collection(self.config["database"]["destination"],
                                         NewsOrganizer.get_index_models())
    for news in news_collection.find(self.config["database"]["query"]):
        date = news.get('date')
        before = self.get_price_before_date(db, collection, key, date)
        minute = self.get_price_at_date(db, collection, key, date)
        hour = self.get_price_at_date(db, collection, key, date, minutes=60)
        day = self.get_price_at_date(db, collection, key, date, add_day=True)
        try:
            news_filtered.insert({
                "_id": news.get('_id'),
                "title": pre_processing.preprocess(news.get('title')),
                "summery": pre_processing.preprocess(news.get('summery')),
                "article": pre_processing.preprocess(news.get('article')),
                "url": news.get('url'),
                "category": news.get('category'),
                "price_after_minute": minute,
                "price_after_hour": hour,
                "price_after_day": day,
                "price_before": before,
                "date": date,
                "authors": news['authors']
            })
        except Exception as exception:
            Logger().get_logger().error(type(exception).__name__, exc_info=True)
            traceback.print_exc()
def organize(self):
    db = Mongo()
    news_collection = db.create_collection("News")
    news_filtered = db.create_collection("FilteredNews", NewsOrganizer.get_index_models())
    for news in news_collection.find():
        article = NewsOrganizer.get_article(news)
        if article is None:
            FileHelper.append_to_file(self.config["log"]["Article_None"], news["_id"])
            continue
        if article == "":
            FileHelper.append_to_file(self.config["log"]["Article_Empty"], news["_id"])
            continue
        date = NewsOrganizer.get_date(news)
        if not date:
            FileHelper.append_to_file(self.config["Log"]["Date_None"], news["_id"])
            continue
        summery = NewsOrganizer.get_summery(news)
        if not summery:
            FileHelper.append_to_file(self.config["Log"]["Summery_None"], news["_id"])
            continue
        try:
            news_filtered.insert({
                "title": NewsOrganizer.get_title(news),
                "summery": summery,
                "category": NewsOrganizer.get_category(news),
                "date": date,
                "article": article,
                "url": news['URL'],
                "canonical_link": news['Canonical_Link'],
                "authors": news['Authors']
            })
        except Exception as exception:
            Logger().get_logger().error(type(exception).__name__, exc_info=True)
            traceback.print_exc()
class WikiRecorder(object):
    def __init__(self, collection_name="Wiki"):
        self.col = Mongo().create_collection(collection_name, WikiRecorder.get_index_models())
        self.preprocessor = PreProcessing()
        self.config = WikiRecorder.get_config()
        self.total = 0

    def collect_all(self):
        name_list = self.config["Wiki"]["Corporations"]
        for cor_name in name_list:
            self.collect(cor_name)

    def collect(self, title, page_id=None):
        page = Wikipedia.get_page(title, pageid=page_id)
        title = page.original_title
        title_p = self.preprocessor.preprocess(title)
        summary = page.summary
        summary_p = self.preprocessor.preprocess(summary)
        content = page.content
        page_id = page.pageid
        data = {
            'title': title,
            'title_p': title_p,
            'summary': summary,
            'summary_p': summary_p,
            'content': content,
            'page_id': page_id
        }
        print(data)
        try:
            self.col.insert(data)
        except Exception as exception:
            Logger().get_logger().error(type(exception).__name__, exc_info=True)

    @staticmethod
    def get_index_models():
        return [
            IndexModel("title", name="index_title"),
            IndexModel("page_id", name="index_page_id")
        ]

    @staticmethod
    def get_config():
        pwd = os.path.dirname(os.path.abspath(__file__))
        return json.load(open(pwd + '/config.json', 'r'))
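# A minimal usage sketch for WikiRecorder, assuming config.json provides a
# ["Wiki"]["Corporations"] list of page titles; the single-page call uses a
# title that also appears elsewhere in this project.
recorder = WikiRecorder()
recorder.collect_all()            # record every configured corporation page
recorder.collect("Brent Crude")   # or record a single page by title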
def calculate_distance_for_tweet(info, input):
    skip = info["skip"]
    get = info["to"]
    date = info["date"]
    title = info["news_title"]
    db = Mongo(test=2)
    pre = PreProcessing()
    tweets = WordEmbedding.get_tweets_before_date(db, date).skip(skip).limit(get)
    tweetcount = 0
    count = 0
    print(get)
    vector = WordEmbedding.get_vector_list(title)
    for tweet in tweets:
        tweetcount += 1
        try:
            cosine = WordEmbedding.cosine_distance_word_embedding_with_vector(
                vector, pre.preprocess(tweet["tweet_text"]))
            percentage = round((1 - cosine) * 100, 2)
        except Exception as exception:
            print("Exception")
            percentage = 0
        if percentage > 80:
            count += 1
            if tweet["tweet_user_verified"]:
                count += 1
    print("count " + str(count))
    return count
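# A small worked example of the similarity threshold used above (values are
# illustrative): a cosine distance of 0.15 becomes round((1 - 0.15) * 100, 2)
# = 85.0, which passes the > 80 check; a distance of 0.30 becomes 70.0 and is
# not counted.
for cosine in (0.15, 0.30):
    print(cosine, round((1 - cosine) * 100, 2))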
def collect(self):
    db = Mongo()
    conn = sqlite3.connect(self.SQL_LOCATION)
    c = conn.cursor()
    c.execute('SELECT title, author, date, publication, category, digital, section, url FROM longform')
    line_count = 0
    date_count = 0
    newslist = []
    for row in c:
        url = row[self.Url]
        date = DateHelper.str2date(row[self.Date])
        title = row[self.Title]
        if url == "" or url is None or date == "":  # Is There Url Or Date
            continue
        if db.is_title_url_exists(title, url):
            continue
        allUrls = FileCollector.extract_url_from_text(url)
        article = Article(allUrls[1])
        category = row[self.Category]
        section = row[self.Section]
        newslist.append(
            News.RssNews(title=title,
                         time=date,
                         summery='',
                         category=FileCollector.get_category(category, section),
                         tags='',
                         url=allUrls[1],
                         iaurl=allUrls[0],
                         article=article))
        print(line_count)
        if len(newslist) == 20:
            pool = NewsPool()
            pool.set(newslist)
            pool.join()
            newslist = []
        line_count += 1
    print(f'\t{line_count}')
    print(f'\t{len(newslist)}')
def parse_stock(currency_key, directory, name, interval):  # Type : 3 - Stock
    print("Stock")
    col = Mongo().create_collection("Stock", FDC.get_index_models())
    with open(directory) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        print(currency_key)
        for row in csv_reader:
            if len(row) < 2:  # Check Data
                continue
            date = DateHelper.str2date(row[0])
            if interval == 60:
                fd = FinancialData(name, currency_key, date,
                                   row[FDLocations.Stock_Open.value],
                                   row[FDLocations.Stock_High.value],
                                   row[FDLocations.Stock_Low.value],
                                   row[FDLocations.Stock_Close.value],
                                   row[FDLocations.Stock_Volume.value],
                                   row[FDLocations.Stock_Trade.value],
                                   row[FDLocations.Stock_Avg.value])
                col.insert(fd.get_stock())
            else:
                print("Not Handled !!!")
class TaDataReader(object):
    # LSTM Applied On Sequential Data - It unrolls, In the Sequence Dimension
    # Batch Size :
    # Sequence Length : Memorize (Hidden and Cell State)
    def __init__(self, config, batch_size, sequence_length):
        self.db = Mongo()
        self.configs = config
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.clear_data()
        self.__test_cursor = None
        self.__train_cursor = None

    def fetch_train_data(self):
        self.__train_cursor = self.db.get_data(self.configs['db'],
                                               self.configs['train_query'],
                                               self.configs['train_query_fields'])
        self.__train_cursor.batch_size(self.batch_size * self.sequence_length)  # DB To Local Length

    def fetch_test_data(self):
        self.__test_cursor = self.db.get_data(self.configs['db'],
                                              self.configs['test_query'],
                                              self.configs['test_query_fields'])
        self.__test_cursor.batch_size(self.batch_size * self.sequence_length)  # DB To Local Length

    def get_train_count(self):
        if self.__train_cursor is None:
            self.fetch_train_data()
        return self.__train_cursor.count()

    def get_train_data(self):
        self.__train_cursor.rewind()
        self.clear_data()
        batch_count = 0
        sequence_count = 0
        for row in self.__train_cursor:
            self.__x_sequence.append(np.asarray([row["Open"]], dtype=np.float32))
            self.__y_sequence.append(np.asarray([row["Open"]], dtype=np.float32))  # row["High"]
            sequence_count += 1
            if sequence_count % (self.sequence_length + 1) == 0:
                self.__x_sequence.pop()      # x keeps the first sequence_length points
                self.__y_sequence.pop(0)     # y is shifted one step ahead of x
                self.x.append(np.asarray(self.__x_sequence, dtype=np.float32))
                self.y.append(np.asarray(self.__y_sequence, dtype=np.float32))
                self.clear_sequence()
                batch_count += 1
                if batch_count % self.batch_size == 0:
                    yield np.asarray(self.x, dtype=np.float32), np.asarray(self.y, dtype=np.float32)
                    self.clear_data()

    def get_test_count(self):
        if self.__test_cursor is None:
            self.fetch_test_data()
        return self.__test_cursor.count()

    def get_test_data(self):
        self.__test_cursor.rewind()
        self.clear_data()
        batch_count = 0
        sequence_count = 0
        for row in self.__test_cursor:
            self.__x_sequence.append(np.asarray([row["Open"]], dtype=np.float32))
            self.__y_sequence.append(np.asarray([row["Open"]], dtype=np.float32))  # row["High"]
            sequence_count += 1
            if sequence_count % (self.sequence_length + 1) == 0:
                self.__x_sequence.pop()
                self.__y_sequence.pop(0)
                self.x.append(np.asarray(self.__x_sequence, dtype=np.float32))
                self.y.append(np.asarray(self.__y_sequence, dtype=np.float32))
                self.clear_sequence()
                batch_count += 1
                if batch_count % self.batch_size == 0:
                    yield np.asarray(self.x, dtype=np.float32), np.asarray(self.y, dtype=np.float32)
                    self.clear_data()

    def clear_data(self):
        self.x = []
        self.y = []
        self.clear_sequence()

    def clear_sequence(self):
        self.__x_sequence = []
        self.__y_sequence = []
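# A minimal training-loop sketch for TaDataReader. The config dict is assumed to
# provide the 'db', 'train_query', and 'train_query_fields' keys used above; the
# batch size, sequence length, and train_step callable are placeholders.
reader = TaDataReader(config, batch_size=32, sequence_length=50)
reader.fetch_train_data()
for x_batch, y_batch in reader.get_train_data():
    # x_batch and y_batch have shape (batch_size, sequence_length, 1); y is the
    # same "Open" series shifted one step ahead of x.
    train_step(x_batch, y_batch)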
def get_wiki(collection="Wiki", title="Brent Crude"):
    db = Mongo()
    query = {"title": title}
    fields = {"summary_p": 1, "_id": 0}
    return db.get_data_one(collection, query, fields)
def __init__(self):
    self.config = self.__get_config()
    self.db = Mongo()
class TweetRecorder(object):
    def __init__(self, directory="/Users/kaaneksen/Desktop/Master Project/Twitter/02",
                 collection_name="Tweet"):
        self.directory = directory
        self.col = Mongo().create_collection(collection_name, TweetRecorder.get_index_models())
        self.total = 0

    def load_all_tweets_in_directory(self, directory=None):
        """Walk all files in the directory and load all tweets into MongoDB."""
        files_processed = 0
        if directory is None:
            directory = self.directory
        for root, dirs, files in os.walk(directory):
            for file in files:
                files_processed += 1
                filename = os.path.join(root, file)
                if not filename.endswith('.bz2'):
                    continue
                print('Starting work on file ' + str(files_processed) + ': ' + filename)
                self.handle_file(filename)
                if files_processed % 20 == 0:
                    print("Total Tweets Processed : {}".format(self.total))

    def handle_file(self, filename):
        """Take a filename and load all of its tweets into MongoDB."""
        tweets = TweetRecorder.load_bz2_json(filename)
        tweet_dicts = []
        tweets_saved = 0
        for tweet in tweets:
            # Extract the relevant fields and place them in the database
            tweet_dict, tweets_saved = TweetRecorder.load_tweet(tweet, tweets_saved)
            if tweet_dict:
                tweet_dicts.append(tweet_dict)
        self.total = self.total + len(tweet_dicts)
        try:
            self.col.insert_many(tweet_dicts, ordered=False, bypass_document_validation=True)
        except Exception:
            Logger().get_logger().error('Insert Error - Twitter', exc_info=True)
        return True

    @staticmethod
    def load_bz2_json(filename):
        """Take a bz2 filename and return the tweets as a list of tweet dictionaries."""
        with open(filename, "rb") as bz2_file:
            data = bz2_file.read()
        lines = bz2.decompress(data).decode("utf-8").split("\n")
        tweets = []
        for line in lines:
            try:
                if line == "":
                    continue
                tweets.append(json.loads(line))
            except Exception:
                # Lenient on purpose: with millions of tweets, most errors are due to encoding
                continue
        return tweets

    @staticmethod
    def load_tweet(tweet, tweets_saved):
        """Take a tweet (dictionary) and convert it to the document stored in MongoDB."""
        try:
            tweet_lang = tweet['lang']
            data = {
                '_id': tweet['id'],
                'tweet_text': tweet['text'],
                'tweet_location': tweet['coordinates'],
                'tweet_created_at': datetime.strptime(tweet['created_at'],
                                                      '%a %b %d %H:%M:%S +0000 %Y'),
                'tweet_entities': tweet['entities'],
                'tweet_replay_to_tweet': tweet['in_reply_to_status_id'],
                'tweet_replay_to_user': tweet['in_reply_to_user_id'],
                'tweet_user_id': tweet['user']['id'],
                'tweet_user_lang': tweet['user']['lang'],
                'tweet_user_name': tweet['user']['name'],
                'tweet_user_time_zone': tweet['user']['time_zone'],
                'tweet_user_followers_count': tweet['user']['followers_count'],
                'tweet_user_verified': tweet['user']['verified'],
                'tweet_user_all_tweet_count': tweet['user']['statuses_count']
            }
            if tweet_lang != "en":
                return {}, tweets_saved
            else:
                tweets_saved += 1
                return data, tweets_saved
        except KeyError:
            return {}, tweets_saved

    @staticmethod
    def get_index_models():
        return [IndexModel("tweet_created_at", name="index_date"),
                IndexModel("tweet_replay_to_tweet", name="index_replay_to"),
                IndexModel("tweet_user_id", name="index_user_id")]
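# A minimal usage sketch for TweetRecorder: point it at a directory tree of
# .bz2 Twitter archive files and let it bulk-insert the English tweets. The
# directory path below is a placeholder.
recorder = TweetRecorder(directory="/data/twitter/2019-02", collection_name="Tweet")
recorder.load_all_tweets_in_directory()
print("Total tweets stored: {}".format(recorder.total))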
def dnn_organizer_with_wiki_tweets(self, collection="Product", key="BRTUSD", name="Brent Crude"):
    db = Mongo()
    pre_processing = PreProcessing()
    news_collection = db.create_collection(self.config["database"]["collection"])
    news_filtered = db.create_collection(self.config["database"]["destination"],
                                         NewsOrganizer.get_index_models())
    wiki_forecast = WikiForecast()
    twitter_forecast = TwitterForecast()
    if self.config["elasticSearch"]["enableTag"]:
        tags = twitter_forecast.get_pre_defined_tags()
    else:
        tags = {"tags": []}
    count = 0
    processed = 0
    while True:
        try:
            cursor = news_collection.find(self.config["database"]["query"],
                                          no_cursor_timeout=True).skip(processed)
            for news in cursor:
                try:
                    summery = pre_processing.preprocess(news.get('summery'))
                    summery_similarity = wiki_forecast.get_similarity(summery, title=name)
                    date = news.get('date')
                    title = pre_processing.preprocess(news.get('title'))
                    before = self.get_price_before_date(db, collection, key, date)
                    minute = self.get_price_at_date(db, collection, key, date)
                    hour = self.get_price_at_date(db, collection, key, date, minutes=60)
                    day = self.get_price_at_date(db, collection, key, date, add_day=True)
                    total, percentage = twitter_forecast.get_popularity_from_elastic_search(
                        date, title + tags["tags"], pre_processing,
                        maxsize=self.config["elasticSearch"]["maxSize"])
                    news_filtered.insert({
                        "_id": news.get('_id'),
                        "title": title,
                        "summery": pre_processing.preprocess(news.get('summery')),
                        "article": pre_processing.preprocess(news.get('article')),
                        "url": news.get('url'),
                        "category": news.get('category'),
                        "price_after_minute": minute,
                        "price_after_hour": hour,
                        "price_after_day": day,
                        "price_before": before,
                        "wiki_relatedness": summery_similarity,
                        "tweet_count": total,
                        "tweet_percentage": percentage,
                        "date": date,
                        "authors": news['authors']
                    })
                except Exception as exception:
                    Logger().get_logger().error(type(exception).__name__, exc_info=True)
                    traceback.print_exc()
                count = count + 1
                if count % 500 == 0:
                    print(count)
                processed += 1
            cursor.close()
            break
        except CursorNotFound:
            processed += 1
            print("Lost cursor. Retry with skip")
def dnn_organizer_for_dnn_filtered_news(self):
    db = Mongo()
    collection = self.config["dnnfiltered"]["text_collection"]
    news_collection = db.create_collection(self.config["dnnfiltered"]["collection"])
    news_filtered = db.create_collection(self.config["dnnfiltered"]["destination"],
                                         NewsOrganizer.get_index_models())
    count = 0
    processed = 0
    while True:
        try:
            cursor = news_collection.find(self.config["dnnfiltered"]["query"],
                                          no_cursor_timeout=True).skip(processed)
            for news in cursor:
                try:
                    url = news.get('url')
                    date = news.get('date')
                    before = self.get_price_before_date(db, "Product", "BRTUSD", date)
                    minute = self.get_price_at_date(db, "Product", "BRTUSD", date)
                    hour = self.get_price_at_date(db, "Product", "BRTUSD", date, minutes=60)
                    day = self.get_price_at_date(db, "Product", "BRTUSD", date, add_day=True)
                    info = self.get_news_for_link(db, collection, url, fields=None)
                    if info is None:
                        info = {}
                    news_filtered.insert({
                        "_id": news.get('_id'),
                        "title": news.get('title'),
                        "title_o": info.get('title'),
                        "summery": news.get('title'),
                        "summery_o": info.get('summery'),
                        "article": news.get('article'),
                        "article_o": info.get('article'),
                        "url": url,
                        "category": info.get('category'),
                        "price_after_minute": minute,
                        "price_after_hour": hour,
                        "price_after_day": day,
                        "price_before": before,
                        "wiki_relatedness": info.get('wiki_relatedness'),
                        "tweet_count": info.get('tweet_count'),
                        "tweet_percentage": info.get('tweet_percentage'),
                        "date": date,
                        "authors": info.get('authors'),
                        "comment": info.get('comment'),
                        "wiki_relatedness_nor": info.get('wiki_relatedness_nor'),
                        "tweet_count_nor": info.get('tweet_count_nor'),
                        "price_effect": info.get('price_effect')
                    })
                except Exception as exception:
                    Logger().get_logger().error(type(exception).__name__, exc_info=True)
                    traceback.print_exc()
                count = count + 1
                if count % 500 == 0:
                    print(count)
                processed += 1
            cursor.close()
            break
        except CursorNotFound:
            processed += 1
            print("Lost cursor. Retry with skip")
class NewsDnnBaseDataReader(object):
    DictDataTerm = {'Train': 1, 'Validate': 2, 'Test': 3}
    DictDataType = {'News': 1, 'Wiki': 2, 'WikiAndTweet': 3}
    ArticleMinSize = 10

    # LSTM Applied On Sequential Data - It unrolls, In the Sequence Dimension
    # Batch Size :
    # Sequence Length : Memorize (Hidden and Cell State) -> Article Size
    def __init__(self, config, batch_size, sequence_length, word_emb_enabled=True):
        self.db = Mongo()
        self.configs = config
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.clear_data()
        if word_emb_enabled:
            self.word_embedding = WordEmbedding(path=self.configs["wordEmbedding"]["path"])
        self.__test_cursor = None
        self.test_count = 0
        self.__train_cursor = None
        self.train_count = 0
        self.__validate_cursor = None
        self.validate_count = 0
        self.max_min = None

    '''
        Data Fetch
    '''
    def fetch_data(self, fetch_type=1):
        if fetch_type == NewsDnnBaseDataReader.DictDataTerm["Train"]:
            self.__train_cursor = self.db.get_data(self.configs['database']['name'],
                                                   self.configs['database']['train']['query'],
                                                   self.configs['database']['fields'],
                                                   notimeout=True)
            if self.configs['database']['sort'] is not None:
                self.__train_cursor = self.__train_cursor.sort(
                    ListHelper.convert_dict_list(self.configs['database']['sort']))
        elif fetch_type == NewsDnnBaseDataReader.DictDataTerm["Validate"]:
            self.__validate_cursor = self.db.get_data(self.configs['database']['name'],
                                                      self.configs['database']['validate']['query'],
                                                      self.configs['database']['fields'],
                                                      notimeout=True)
            if self.configs['database']['sort'] is not None:
                self.__validate_cursor = self.__validate_cursor.sort(
                    ListHelper.convert_dict_list(self.configs['database']['sort']))
        elif fetch_type == NewsDnnBaseDataReader.DictDataTerm["Test"]:
            self.__test_cursor = self.db.get_data(self.configs['database']['name'],
                                                  self.configs['database']['test']['query'],
                                                  self.configs['database']['fields'],
                                                  notimeout=True)
            if self.configs['database']['sort'] is not None:
                self.__test_cursor = self.__test_cursor.sort(
                    ListHelper.convert_dict_list(self.configs['database']['sort']))
        else:
            LoggerHelper.critical('Unable To Fetch')

    '''
        Get Count
    '''
    def get_count(self, fetch_type=1):
        if fetch_type == NewsDnnBaseDataReader.DictDataTerm["Train"]:
            if self.__train_cursor is None:
                self.fetch_data(NewsDnnBaseDataReader.DictDataTerm["Train"])
            self.train_count = self.__train_cursor.count()
            return self.train_count
        elif fetch_type == NewsDnnBaseDataReader.DictDataTerm["Validate"]:
            if self.__validate_cursor is None:
                self.fetch_data(NewsDnnBaseDataReader.DictDataTerm["Validate"])
            self.validate_count = self.__validate_cursor.count()
            return self.validate_count
        elif fetch_type == NewsDnnBaseDataReader.DictDataTerm["Test"]:
            if self.__test_cursor is None:
                self.fetch_data(NewsDnnBaseDataReader.DictDataTerm["Test"])
            self.test_count = self.__test_cursor.count()
            return self.test_count
        else:
            LoggerHelper.critical('Unable To Fetch')

    '''
        Get Data
    '''
    def get_data(self, fetch_type=1, data_type=1):
        if fetch_type == NewsDnnBaseDataReader.DictDataTerm["Train"]:
            cursor = self.__train_cursor
        elif fetch_type == NewsDnnBaseDataReader.DictDataTerm["Validate"]:
            cursor = self.__validate_cursor
        elif fetch_type == NewsDnnBaseDataReader.DictDataTerm["Test"]:
            cursor = self.__test_cursor
        else:
            LoggerHelper.critical('Unable To Get Cursor (Check Fetch Type)')
            return None
        cursor.rewind()
        self.clear_data()
        if data_type == NewsDnnBaseDataReader.DictDataType["News"]:
            return self.get_data_news(cursor)
        elif data_type == NewsDnnBaseDataReader.DictDataType["Wiki"]:
            return self.get_data_wiki(cursor)
        elif data_type == NewsDnnBaseDataReader.DictDataType["WikiAndTweet"]:
            return self.get_data_wiki_and_tweet(cursor)
        else:
            LoggerHelper.critical('Unknown Data Type (data_type)')
            return None

    '''
        Get Max Min
    '''
    def get_max_min(self):
        data = {}
        for field in self.configs['database']['max_min']['fields']:
            fields = {field: 1, "_id": 0}
            min = self.db.get_data_one(self.configs['database']['name'],
                                       self.configs['database']['max_min']['query'],
                                       fields=fields,
                                       sort=[(field, +1)])
            max = self.db.get_data_one(self.configs['database']['name'],
                                       self.configs['database']['max_min']['query'],
                                       fields=fields,
                                       sort=[(field, -1)])
            data[field] = {"max": max, "min": min}
        self.max_min = data
        return data

    '''
        NEWS
    '''
    def get_data_news(self, cursor):
        batch_count = 0
        price_start = self.configs["database"]["price"]["start"]
        price_end = self.configs["database"]["price"]["end"]
        for row in cursor:
            embedded_article = self.word_embedding.get_weight_matrix(row["article"])
            if len(embedded_article) < NewsDnnBaseDataReader.ArticleMinSize:
                continue
            self.x.append(self.pad_embedded_article(embedded_article))
            self.y.append(NewsDnnBaseDataReader.get_classification(
                row[price_start], row[price_end],
                self.configs['database']['price']['buffer_percent']))
            batch_count = batch_count + 1
            if batch_count % self.batch_size == 0:
                yield np.asarray(self.x, dtype=np.float32), np.asarray(self.y, dtype=np.float32)
                self.clear_data()

    '''
        WIKI
    '''
    def get_data_wiki(self, cursor):
        batch_count = 0
        price_start = self.configs["database"]["price"]["start"]
        price_end = self.configs["database"]["price"]["end"]
        wiki_column = self.configs['options']['wiki']['wiki_column']
        for row in cursor:
            embedded_article = self.word_embedding.get_weight_matrix_all(
                article=row["article"],
                wiki=row[wiki_column],
                wiki_multiply_factors=self.configs['options']['wiki']['multiply_factors'])
            if len(embedded_article) < NewsDnnBaseDataReader.ArticleMinSize:
                continue
            self.x.append(self.pad_embedded_article(embedded_article))
            self.y.append(NewsDnnBaseDataReader.get_classification(
                row[price_start], row[price_end],
                self.configs['database']['price']['buffer_percent']))
            batch_count = batch_count + 1
            if batch_count % self.batch_size == 0:
                yield np.asarray(self.x, dtype=np.float32), np.asarray(self.y, dtype=np.float32)
                self.clear_data()

    '''
        WIKI & TWEET
    '''
    def get_data_wiki_and_tweet(self, cursor):
        batch_count = 0
        price_start = self.configs["database"]["price"]["start"]
        price_end = self.configs["database"]["price"]["end"]
        wiki_column = self.configs['options']['wiki']['wiki_column']
        tweet_column = self.configs['options']['twitter']['tweet_column']
        wiki_multiply_factors = self.configs['options']['wiki']['multiply_factors']
        tweet_multiply_factors = self.configs['options']['twitter']['multiply_factors']
        for row in cursor:
            embedded_article = self.word_embedding.get_weight_matrix_all(
                article=row["article"],
                wiki=row[wiki_column],
                wiki_multiply_factors=wiki_multiply_factors,
                tweet=row[tweet_column],
                tweet_multiply_factors=tweet_multiply_factors)
            if len(embedded_article) < NewsDnnBaseDataReader.ArticleMinSize:
                continue
            # Article
            self.x.append(self.pad_embedded_article(embedded_article))
            # Price
            self.y.append(NewsDnnBaseDataReader.get_classification(
                row[price_start], row[price_end],
                self.configs['database']['price']['buffer_percent']))
            batch_count = batch_count + 1
            if batch_count % self.batch_size == 0:
                yield np.asarray(self.x, dtype=np.float32), np.asarray(self.y, dtype=np.float32)
                self.clear_data()

    '''
        HELPER METHODS
    '''
    def pad_embedded_article(self, embedded_article):
        # Calculate Difference
        padding_difference = embedded_article.shape[0] - self.sequence_length
        if padding_difference == 0:
            return embedded_article
        if padding_difference >= 0:  # Trim down to sequence_length rows
            return embedded_article[:-padding_difference]
        else:  # Add Padding
            return np.pad(embedded_article, ((abs(padding_difference), 0), (0, 0)), 'constant')

    def clear_data(self):
        self.x = []
        self.y = []

    @staticmethod
    def get_classification(start, end, buffer_percent):
        diff = float(start["Open"]) - float(end["Open"])
        total = (float(start["Open"]) + float(end["Open"])) / 2  # average of the two prices
        percentage = (diff / total) * 100
        if percentage > buffer_percent:
            return 2  # Increase
        elif percentage < -buffer_percent:
            return 1  # Decrease
        else:
            return 0  # Same Value
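# A small worked example of the three-way labelling above (values are
# illustrative): with buffer_percent = 0.3, a move from Open 60.00 to Open 60.50
# gives diff = -0.50, average = 60.25, and percentage of roughly -0.83, which is
# below -0.3 and therefore labelled 1; moves inside the +/-0.3% band are
# labelled 0.
label = NewsDnnBaseDataReader.get_classification({"Open": "60.00"}, {"Open": "60.50"}, 0.3)
print(label)  # -> 1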