def load_config():
    LoggerHelper.info("Loading Config...")
    pwd = os.path.dirname(os.path.abspath(__file__))
    if platform.system() == "Windows":
        Config.add_config_ini('%s\\initialization\\main_w.ini' % pwd)
    else:
        Config.add_config_ini('%s/initialization/main.ini' % pwd)
    LoggerHelper.info("Config is loaded.")
def get_network_input_size(self):
    size = self.config["wordEmbedding"]["size"]
    if self.config["options"]["wiki"]["enabled"]:
        size = size + self.config["options"]["wiki"]["multiply_factors"]
    if self.config["options"]["twitter"]["enabled"]:
        size = size + self.config["options"]["twitter"]["multiply_factors"]
    LoggerHelper.info("Network Input Size: " + str(size))
    return size
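# For illustration: a minimal config fragment that get_network_input_size()
# would consume. The values below are hypothetical; only the key names come
# from the code above. Note that despite the name "multiply_factors", the
# value is added to the word-embedding size, one extra feature per enabled
# option.
example_config = {
    "wordEmbedding": {"size": 100},
    "options": {
        "wiki": {"enabled": True, "multiply_factors": 1},
        "twitter": {"enabled": True, "multiply_factors": 1},
    },
}
# With this config the computed input size would be 100 + 1 + 1 = 102,
# matching the input_size=102 default of the CNN model further below.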
def calculate_hidden_size(self):
    # Rule-of-thumb heuristic: hidden size ~= N_samples / (alpha * (N_in + N_out)),
    # with alpha as a scaling factor (here 5) to guard against overfitting.
    samples_in_training_data = 116100
    scaling_factor = 5
    input_neurons = self.input_size
    output_neurons = self.output_size
    size = int(samples_in_training_data / (scaling_factor * (input_neurons + output_neurons)))
    LoggerHelper.info('Calculated hidden size is ' + str(size))
    if size == 0:
        LoggerHelper.error('Calculated hidden size is 0; falling back to 2')
        return 2
    else:
        return size
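# A quick worked example of the heuristic above, assuming the 102-dimensional
# input and 3-class output seen elsewhere in this repo:
# 116100 / (5 * (102 + 3)) = 116100 / 525 ≈ 221, so the LSTM would get a
# hidden size of 221. The result only truncates to 0 (triggering the fallback
# of 2) when N_in + N_out exceeds 116100 / 5 = 23220.
assert int(116100 / (5 * (102 + 3))) == 221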
def save_model(self):
    # Serialize the trained model and optimizer to disk with torch.save
    # (pickle-based, not JSON). The checkpoint stores the live instances --
    # the original stored freshly constructed NewsDnnGeneralModel() and
    # optim.Adam(...) objects, whose state dicts would mismatch whenever the
    # hyperparameters differ from the defaults.
    save_file_name = self.get_save_file_name()
    checkpoint = {
        'model': self.model,
        'model_state_dict': self.model.state_dict(),
        'optimizer': self.optimizer,
        'optimizer_state_dict': self.optimizer.state_dict()
    }
    torch.save(checkpoint, save_file_name)
    LoggerHelper.info("Model Saved to disk")
def load_model(self, path):
    checkpoint = torch.load(path)
    self.model = checkpoint['model']
    self.model.load_state_dict(checkpoint['model_state_dict'])
    self.optimizer = checkpoint['optimizer']
    self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    LoggerHelper.info("**Model Info**"
                      + "\nbatch_size : " + str(self.reader.batch_size)
                      + "\nsequence_length : " + str(self.reader.sequence_length)
                      + "\ninput_size : " + str(self.model.input_size)
                      + "\nnum_layers : " + str(self.model.num_layers)
                      + "\ndrop_prob : " + str(self.model.drop_prob)
                      + "\nlr : " + str(self.model.lr))
    LoggerHelper.info("Model loaded from disk")
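# A minimal, self-contained sketch of the checkpoint round trip used by the
# save_model()/load_model() pair above, with a toy nn.Linear standing in for
# NewsDnnGeneralModel. All names here are illustrative, not from the repo.
import torch
import torch.nn as nn
import torch.optim as optim

toy_model = nn.Linear(4, 2)
toy_optimizer = optim.Adam(toy_model.parameters(), lr=0.001)
torch.save({
    'model': toy_model,
    'model_state_dict': toy_model.state_dict(),
    'optimizer': toy_optimizer,
    'optimizer_state_dict': toy_optimizer.state_dict(),
}, 'toy_checkpoint.pt')

# Unpickling whole module objects needs weights_only=False on PyTorch >= 2.6;
# older versions (as this repo appears to use) accept plain torch.load(path).
restored = torch.load('toy_checkpoint.pt', weights_only=False)
model = restored['model']
model.load_state_dict(restored['model_state_dict'])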
def get_news_type(dnn_type):
    dnn_type = dnn_type.strip()
    if dnn_type == "CNN":
        return NewsCnnMain()
    elif dnn_type == "RNN":
        return NewsDnnGeneralMain()
    elif dnn_type == "TA":
        return TaMain()
    elif dnn_type == "PriceRNN":
        return PriceRnnMain()
    elif dnn_type == "CATE":
        return NewsCateMain()
    else:  # Default to RNN
        LoggerHelper.error("DNN type (" + dnn_type + ") is not found. Default RNN (NewsDnnGeneralMain) is used.")
        return NewsDnnGeneralMain()
def get_count(self, fetch_type=1):
    if fetch_type == NewsDnnBaseDataReader.DictDataTerm["Train"]:
        if self.__train_cursor is None:
            self.fetch_data(NewsDnnBaseDataReader.DictDataTerm["Train"])
        self.train_count = self.__train_cursor.count()
        return self.train_count
    elif fetch_type == NewsDnnBaseDataReader.DictDataTerm["Validate"]:
        if self.__validate_cursor is None:
            self.fetch_data(NewsDnnBaseDataReader.DictDataTerm["Validate"])
        self.validate_count = self.__validate_cursor.count()
        return self.validate_count
    elif fetch_type == NewsDnnBaseDataReader.DictDataTerm["Test"]:
        if self.__test_cursor is None:
            self.fetch_data(NewsDnnBaseDataReader.DictDataTerm["Test"])
        self.test_count = self.__test_cursor.count()
        return self.test_count
    else:
        LoggerHelper.critical('Unable To Fetch (unknown fetch_type: ' + str(fetch_type) + ')')
        return None
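# Side note, hedged: Cursor.count() was deprecated in pymongo 3.7 and removed
# in 4.0. If this reader is ever run against a modern pymongo, the equivalent
# is to count on the collection with the same filter the cursor was built
# from, e.g.:
#
#     count = collection.count_documents(query_filter)
#
# where collection and query_filter are whatever fetch_data() uses internally
# (not shown in this snippet).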
def test(self):
    LoggerHelper.info("Test Started...")
    self.timer.start()
    df = pandas.DataFrame(columns=['Accuracy', 'Test Accuracy', 'Mean Test Loss'])
    # Tracking variables
    val_losses = []
    predictions, true_labels = [], []
    test_set = self.reader.get_data(NewsCateDataReader.DictDataTerm["Test"],
                                    NewsCateDataReader.DictDataType[
                                        self.config["options"]["network_type"]])
    self.model.eval()
    accuracy = 0
    for batch in test_set:
        # Add batch to GPU
        batch = tuple(t.to(self.device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of test sentences.
        label, acc = self.calculate_accuracy(logits, label_ids)
        accuracy += acc
        # Store predictions and true labels
        predictions.append(label)
        true_labels.append(label_ids)
    scores = self.calculate_scores(predictions, true_labels)
    df = self.log_test(df, accuracy, self.test_count, val_losses, scores)
    Export.append_df_to_excel(df, self.current_date)
    self.timer.stop(time_for="Test")
def get_data(self, fetch_type=1, data_type=1):
    if fetch_type == NewsDnnBaseDataReader.DictDataTerm["Train"]:
        cursor = self.__train_cursor
    elif fetch_type == NewsDnnBaseDataReader.DictDataTerm["Validate"]:
        cursor = self.__validate_cursor
    elif fetch_type == NewsDnnBaseDataReader.DictDataTerm["Test"]:
        cursor = self.__test_cursor
    else:
        LoggerHelper.critical('Unable To Get Cursor (Check Fetch Type)')
        return None
    cursor.rewind()
    self.clear_data()
    if data_type == NewsDnnBaseDataReader.DictDataType["News"]:
        return self.get_data_news(cursor)
    elif data_type == NewsDnnBaseDataReader.DictDataType["Wiki"]:
        return self.get_data_wiki(cursor)
    elif data_type == NewsDnnBaseDataReader.DictDataType["WikiAndTweet"]:
        return self.get_data_wiki_and_tweet(cursor)
    else:
        LoggerHelper.critical('Unknown Data Type (data_type)')
        return None
def validate(self, df, epoch, losses):
    LoggerHelper.info("Validation Started...")
    # Get validation loss
    val_losses = []
    predictions, true_labels = [], []
    self.model.eval()
    accuracy = 0
    steps = 0
    validate_set = self.reader.get_data(NewsCateDataReader.DictDataTerm["Validate"],
                                        NewsCateDataReader.DictDataType[
                                            self.config["options"]["network_type"]])
    for batch in validate_set:  # Evaluate data for one epoch
        # Add batch to GPU
        batch = tuple(t.to(self.device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():  # Do not compute or store gradients
            outputs = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of validation sentences.
        label, tmp_eval_accuracy = self.calculate_accuracy(logits, label_ids)
        # Accumulate the total accuracy.
        accuracy += tmp_eval_accuracy
        # Track the number of batches
        steps += 1
        # Store predictions and true labels
        predictions.append(label)
        true_labels.append(label_ids)
    # Report the final accuracy for this validation run.
    LoggerHelper.info("Accuracy: {0:.2f}".format(accuracy / steps))
    scores = self.calculate_scores(predictions, true_labels)
    self.model.train()  # Reset to train mode after iterating through validation data
    return self.log_validate_without_loss(df, epoch, 0, self.validate_count, scores)
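# The calculate_accuracy() helper used above is not part of this snippet. A
# plausible implementation (shown as a free function for brevity; this is an
# assumption, not the repo's verified code) returns the per-sample argmax
# predictions plus the fraction of correct predictions in the batch:
import numpy as np

def calculate_accuracy(logits, label_ids):
    # Predicted class = index of the highest logit for each sample
    predicted = np.argmax(logits, axis=1)
    labels_flat = label_ids.flatten()
    accuracy = np.sum(predicted == labels_flat) / len(labels_flat)
    return predicted, accuracy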
def __init__(self, input_size=1, hidden=None, n_layers=2, drop_prob=0.2, lr=0.001,
             training_data_size=100000, output_size=3, use_gpu=True):
    super().__init__()
    self.should_use_gpu = use_gpu
    self.input_size = input_size
    self.training_data_size = training_data_size
    self.num_layers = n_layers
    self.drop_prob = drop_prob
    self.lr = lr
    self.output_size = output_size
    if hidden is None:
        self.hidden = self.calculate_hidden_size()
    else:
        self.hidden = hidden
    self.lstm = nn.LSTM(
        self.input_size,      # Expected features in the input
        self.hidden,          # Features in the hidden state
        self.num_layers,      # Number of stacked LSTM layers
        bias=True,            # Whether to use bias weights
        dropout=drop_prob,    # Dropout on the outputs of each LSTM layer except the last
        batch_first=True,     # Input and output tensors are provided as (batch, seq, feature)
        bidirectional=False)  # Unidirectional LSTM
    # Additional Dropout Layer
    self.dropout = nn.Dropout(drop_prob)
    # Fully-Connected Output Layer
    self.fc = nn.Linear(self.hidden, output_size)
    # LogSoftmax output layer (despite the `sig` name, this is not a sigmoid)
    self.sig = nn.LogSoftmax(dim=1)
    # Check GPU Usage
    self.can_use_gpu = torch.cuda.is_available()
    if self.can_use_gpu:
        if self.should_use_gpu:
            LoggerHelper.info('Training on GPU!')
        else:
            LoggerHelper.info('GPU usage is disabled by config.json')
    else:
        LoggerHelper.info('No GPU available, training on CPU; consider making n_epochs very small.')
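# The forward pass is not included in this snippet. Based on the layers
# defined above (LSTM -> dropout -> fully-connected -> LogSoftmax), a typical
# forward() would look like the sketch below; the shape handling is an
# assumption, not the repo's verified code.
def forward(self, x, hidden):
    # x: (batch, seq, feature) because batch_first=True
    lstm_out, hidden = self.lstm(x, hidden)
    out = self.dropout(lstm_out)
    out = self.fc(out[:, -1, :])  # classify from the last time step
    return self.sig(out), hidden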
def __init__(self, input_size=102, n_layers=2, drop_prob=0.2, n_filters=100,
             filter_sizes=[3, 4, 5], lr=0.001, output_size=3, use_gpu=True):
    super().__init__()
    self.should_use_gpu = use_gpu
    self.input_size = input_size
    self.output_size = output_size
    self.num_layers = n_layers
    self.drop_prob = drop_prob
    self.lr = lr
    # 1D Convolution Layers (one branch per filter size)
    self.conv_0 = nn.Conv1d(in_channels=input_size, out_channels=n_filters, kernel_size=filter_sizes[0])
    self.conv_1 = nn.Conv1d(in_channels=input_size, out_channels=n_filters, kernel_size=filter_sizes[1])
    self.conv_2 = nn.Conv1d(in_channels=input_size, out_channels=n_filters, kernel_size=filter_sizes[2])
    # Additional Dropout Layer
    self.dropout = nn.Dropout(drop_prob)
    # Fully-Connected Output Layer (concatenated pooled features from all filter sizes)
    self.fc = nn.Linear(len(filter_sizes) * n_filters, output_size)
    # Softmax output layer (despite the `sig` name, this is not a sigmoid)
    self.sig = nn.Softmax(dim=1)
    # Check GPU Usage
    self.can_use_gpu = torch.cuda.is_available()
    if self.can_use_gpu:
        if self.should_use_gpu:
            LoggerHelper.info('Training on GPU!')
        else:
            LoggerHelper.info('GPU usage is disabled by config.json')
    else:
        LoggerHelper.info('No GPU available, training on CPU; consider making n_epochs very small.')
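# forward() is not shown in this snippet. The layer shapes above (three Conv1d
# branches feeding a Linear of size len(filter_sizes) * n_filters) match the
# classic Kim (2014) text-CNN pattern: convolve, max-pool over time, then
# concatenate. A sketch under that assumption:
import torch
import torch.nn.functional as F

def forward(self, x):
    # x: (batch, input_size, seq_len) -- Conv1d expects channels = input_size
    conved = [F.relu(conv(x)) for conv in (self.conv_0, self.conv_1, self.conv_2)]
    # Max-pool each branch over the remaining time dimension
    pooled = [F.max_pool1d(c, c.shape[2]).squeeze(2) for c in conved]
    cat = self.dropout(torch.cat(pooled, dim=1))  # (batch, 3 * n_filters)
    return self.sig(self.fc(cat))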
LoggerHelper.info("News Stock Prediction is ended.") # WordEmbedding(path=Config.word_embedding.path) # news_dnn = NewsDnnMain(epochs=int(Config.training.epochs), # batch_size=int(Config.training.batch_size), # seq_length=int(Config.training.sequence_length), # lr=float(Config.training.lr))3 if args.statistics: LoggerHelper.info("Starting Statistic Collection Mode...") Statistics().collect() LoggerHelper.info("Statistic Collection is ended...") if args.test: LoggerHelper.info("Starting Test Mode...") TransformersTest.sentiment_analysis_test() LoggerHelper.info("Test Mode is ended...") if args.webservice: web_manager = WebManager() web_manager.add_static_files() web_manager.add_news_root() web_manager.run() if __name__ == "__main__": try: main() except Exception as exception: LoggerHelper.error("Ex: " + str(exception)) LoggerHelper.error(traceback.format_exc()) exit()
def main():
    # Load Config
    load_config()
    # Load arguments
    args = load_arg()
    if args.fdc:
        LoggerHelper.info("Starting Financial Data Collector Mode...")
        fdc = FDC()
        fdc.collect()
        LoggerHelper.info("Financial Data Collector Mode is ended.")
    if args.wiki:
        LoggerHelper.info("Starting Wikipedia Load Mode...")
        wiki = WikiRecorder()
        wiki.collect_all()
        LoggerHelper.info("Wikipedia Load Mode is ended.")
    if args.organize:
        LoggerHelper.info("Starting News Organizer Mode...")
        collector = NewsOrganizer()
        collector.dnn_organizer_for_dnn_filtered_news()
        LoggerHelper.info("News Organizer Mode is ended.")
    if args.ind:
        LoggerHelper.info("Starting Indicators Collector Mode...")
        ind_collector = IndicatorsCollector()
        if args.ind == "zip":
            ind_collector.collect_from_zip()
        else:
            ind_collector.collect()
        LoggerHelper.info("Indicators Collector Mode is ended.")
    if args.news is not None:
        LoggerHelper.info("Starting Stock Prediction Mode...")
        news_dnn = get_news_type(args.news)
        news_dnn.train(print_every=int(Config.training.print_every))
        news_dnn.test()
        LoggerHelper.info("News Stock Prediction is ended.")
        # WordEmbedding(path=Config.word_embedding.path)
        # news_dnn = NewsDnnMain(epochs=int(Config.training.epochs),
        #                        batch_size=int(Config.training.batch_size),
        #                        seq_length=int(Config.training.sequence_length),
        #                        lr=float(Config.training.lr))
    if args.statistics:
        LoggerHelper.info("Starting Statistic Collection Mode...")
        Statistics().collect()
        LoggerHelper.info("Statistic Collection is ended.")
    if args.test:
        LoggerHelper.info("Starting Test Mode...")
        TransformersTest.sentiment_analysis_test()
        LoggerHelper.info("Test Mode is ended.")
    if args.webservice:
        web_manager = WebManager()
        web_manager.add_static_files()
        web_manager.add_news_root()
        web_manager.run()


if __name__ == "__main__":
    try:
        main()
    except Exception as exception:
        LoggerHelper.error("Ex: " + str(exception))
        LoggerHelper.error(traceback.format_exc())
        exit()
def get_network_input_size(self):
    size = 1
    LoggerHelper.info("Network Input Size: " + str(size))
    return size
def stop(self, time_for=None):
    end_dt = dt.datetime.now()
    if time_for is None:
        LoggerHelper.info('Time taken: %s' % (end_dt - self.start_dt))
    else:
        LoggerHelper.info('Time taken for ' + time_for + ' : %s' % (end_dt - self.start_dt))
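# Usage as seen in the training loop below: construct (optionally with
# start=True), then stop() with an optional label.
#
#     epoch_timer = Timer()
#     epoch_timer.start()
#     ...                                 # timed work
#     epoch_timer.stop(time_for="Epoch")  # logs "Time taken for Epoch : ..."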
def train(self, print_every=20):
    df = pandas.DataFrame(columns=['Epoch', 'Step', 'Train Mean Loss Cumulative',
                                   'Train Accuracy', 'Val Mean Loss', 'Val Accuracy'])
    self.timer.start()
    self.model.train()  # Set mode of model
    losses = []
    train_set = self.reader.get_data(fetch_type=NewsCateDataReader.DictDataTerm["Train"],
                                     data_type=NewsCateDataReader.DictDataType[
                                         self.config["options"]["network_type"]])
    for e in range(self.epochs):
        print(self.config["options"]["network_type"])
        print(NewsCateDataReader.DictDataType[self.config["options"]["network_type"]])
        self.model.train()  # Set to Train Mode
        total_loss_for_epoch = 0
        epoch_timer = Timer()
        epoch_timer.start()
        for step, batch in enumerate(train_set):  # For each batch of training data...
            # Progress update every `print_every` batches.
            if step % print_every == 0:
                # Report progress.
                print('  Batch {:>5,} of {:>5,}.'.format(step, len(train_set)))
            # Get Data
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_labels = batch[2].to(self.device)
            # Clear previously accumulated gradients
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            self.model.zero_grad()
            # Perform a forward pass (evaluate the model on this training batch).
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = self.model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 labels=b_labels)
            loss = outputs[0]
            total_loss_for_epoch += loss.item()
            # Perform a backward pass to calculate the gradients.
            loss.backward()
            # Clip gradients to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            # Update parameters based on their gradients, the learning rate, etc.
            self.optimizer.step()
            # Update the learning rate.
            self.scheduler.step()
        # Calculate the average loss over the training data.
        avg_train_loss = total_loss_for_epoch / len(train_set)
        # Store the loss value for plotting the learning curve.
        losses.append(avg_train_loss)
        LoggerHelper.info("  Average training loss: {0:.2f}".format(avg_train_loss))
        epoch_timer.stop(time_for="Epoch")
        timer = Timer(start=True)
        df = self.validate(df, e, losses)
        timer.stop(time_for="Validate")
        self.model.train()
    self.timer.stop(time_for="Train")
    self.save_model()
    self.current_date = DateHelper.get_current_date()
    Export.append_df_to_excel(df, self.current_date)
    Export.append_df_to_excel(self.get_info(), self.current_date)
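# The optimizer and scheduler construction is not part of this snippet. Given
# the per-batch scheduler.step() above, a common setup for BERT fine-tuning
# would be the sketch below; the hyperparameters and the names model,
# train_set and epochs are assumptions, not the repo's verified code.
import torch.optim as optim
from transformers import get_linear_schedule_with_warmup

optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_set) * epochs)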
async def __random_news_handler(self, request):
    request = await request.json()
    print(request)
    default = self.get_news_data(self.db, self.defaultCollection, request['object_id'])
    if default is None:
        res = {'isError': True, 'Message': "Object Is Not Found."}
        res = JSONEncoder().encode(res)
        return web.json_response(res)
    else:
        try:
            self.toCollection.insert({
                "_id": default["_id"],
                "title": default["title"],
                "summery": default["summery"],
                # The original mapped default['authors'] here, which looks like
                # a copy-paste slip; "article" is the intended source field.
                "article": default["article"],
                "url": default["url"],
                "category": request["categories"],
                "price_after_minute": default["price_after_minute"],
                "price_after_hour": default["price_after_hour"],
                "price_after_day": default["price_after_day"],
                "price_before": default["price_before"],
                "wiki_relatedness": default["wiki_relatedness"],
                "tweet_count": default["tweet_count"],
                "tweet_percentage": default["tweet_percentage"],
                "wiki_relatedness_nor": default["wiki_relatedness_nor"],
                "tweet_count_nor": default["tweet_count_nor"],
                "date": default["date"],
                "authors": default["authors"],
                "comment": request['comment'],
                "price_effect": request['effect']
            })
            default['is_controlled'] = True
            default['is_incorrect'] = False
            self.record_one_field(self.defaultCollection, default)
            res = {'isError': False, 'Message': "Success"}
        except Exception as exception:
            res = {'isError': True, 'Message': "Insert Error. Please inform the Admin"}
            LoggerHelper.error(type(exception).__name__)
            LoggerHelper.error("Ex: " + str(exception))
            LoggerHelper.error(traceback.format_exc())
        res = JSONEncoder().encode(res)
        return web.json_response(res)
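# For illustration, a request body this handler expects would carry at least
# the fields read above; all values here are hypothetical:
#
#     {
#         "object_id": "5e1f...c9",
#         "categories": "earnings",
#         "comment": "clear positive guidance",
#         "effect": "up"
#     }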
def save_model(self):
    save_file_name = self.get_save_file_name()
    FileHelper.create_path_if_not_exists(save_file_name)
    self.model.save_pretrained(save_file_name)  # Save model weights + config
    self.reader.tokenizer.save_pretrained(save_file_name)  # Save tokenizer vocab/config
    LoggerHelper.info("Model Saved to disk")
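# Reloading a model saved this way uses the matching from_pretrained() calls.
# The concrete classes below are an assumption (the training loop above
# suggests a BERT sequence classifier, but the exact classes are not shown in
# this snippet):
from transformers import BertForSequenceClassification, BertTokenizer

model = BertForSequenceClassification.from_pretrained(save_file_name)
tokenizer = BertTokenizer.from_pretrained(save_file_name)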