def parse_currency(currency_key, directory, name):  # Type : 1 - Currency
    print("Currency")
    col = Mongo().create_collection("Currency", FDC.get_index_models())
    with open(directory) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        print(currency_key)
        hour = -1
        fd = None
        for row in csv_reader:
            if len(row) < 2:  # Check Data
                continue
            add_value = 0
            if currency_key == "EURUSD":
                date = DateHelper.str2date(row[0])
                add_value = -1
            else:
                date = DateHelper.str2date(row[0] + row[1])
            if hour != date.hour:  # New hour: flush the previous bar and start a new one
                hour = date.hour
                if fd is not None:
                    try:
                        col.insert(fd.get_currency())
                    except Exception:
                        Logger().get_logger().error('Insert Error', exc_info=True)
                fd = FinancialData(name, currency_key, date,
                                   row[FDLocations.Currency_Open.value + add_value],
                                   row[FDLocations.Currency_High.value + add_value],
                                   row[FDLocations.Currency_Low.value + add_value],
                                   row[FDLocations.Currency_Close.value + add_value])
            else:
                fd.add(row[FDLocations.Currency_High.value + add_value],
                       row[FDLocations.Currency_Low.value + add_value],
                       row[FDLocations.Currency_Close.value + add_value])
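
# Hedged usage sketch (not part of the original module): the CSV path and the
# display name below are hypothetical placeholders. For "EURUSD" files the
# timestamp sits in a single column, otherwise date and time come from the
# first two columns; rows are rolled up per hour into FinancialData before
# being inserted into the "Currency" collection.
def _example_parse_currency():
    parse_currency("EURUSD", "data/EURUSD_1min.csv", "Euro / US Dollar")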
def sentiment_analysis_test(date=None, hashtags=None):
    nlp = pipeline('sentiment-analysis')
    if date is None:
        date = DateHelper.str2date("2015-05-12T16:07:40Z")
    if hashtags is None:
        hashtags = ["oil", "crude", "crude oil"]
    tweets = TwitterForecast.get_tweets_before_date_from_elastic_search(
        date, hashtags, days=5, maxsize=10000)
    total_tweets = tweets["hits"]["total"]["value"]
    if total_tweets == 0:
        print("No Tweet Found")
    else:
        for es_tweet in tweets["hits"]["hits"]:
            tweet = es_tweet["_source"]
            try:
                text = tweet["tweet_text"].replace("\n", "")
                username = tweet['tweet_user_name']
                sentiment = nlp(text)[0]
                if sentiment['score'] > 0.98:
                    if tweet["tweet_user_verified"]:
                        print('[%s-%s] - %s (%s)' % (u"\U0001F44D", sentiment['label'], text, username))
                    else:
                        print('[%s] - %s (%s)' % (sentiment['label'], text, username))
            except Exception as exception:
                print(exception)
                traceback.print_exc()
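
# Hedged usage sketch: both arguments are optional and default to the values
# hard-coded inside sentiment_analysis_test. The date string and hashtag list
# below are hypothetical examples, reusing DateHelper.str2date as above.
def _example_sentiment_analysis_test():
    sentiment_analysis_test(date=DateHelper.str2date("2015-05-12T16:07:40Z"),
                            hashtags=["oil", "crude", "crude oil"])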
def parse_index_datetime(currency_key, directory, name, interval):  # Type : 4 - Index
    col = Mongo().create_collection("Index")
    with open(directory) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        print(currency_key)
        hour = -1
        hour_count = 0
        fd = None
        for row in csv_reader:
            if len(row) < 2:  # Check Data
                continue
            date = DateHelper.str2date(row[0] + row[1])
            if hour != date.hour:  # New hour: flush the previous bar and start a new one
                hour = date.hour
                hour_count = 0
                if fd is not None:
                    print(fd)
                    try:
                        col.insert(fd.get_index())
                    except Exception:
                        Logger().get_logger().error('Insert Error', exc_info=True)
                fd = FinancialData(name, currency_key, date,
                                   row[FDLocations.IndexDateTime_Open.value],
                                   row[FDLocations.IndexDateTime_High.value],
                                   row[FDLocations.IndexDateTime_Low.value],
                                   row[FDLocations.IndexDateTime_Close.value])
            else:
                fd.add(row[FDLocations.IndexDateTime_High.value],
                       row[FDLocations.IndexDateTime_Low.value],
                       row[FDLocations.IndexDateTime_Close.value])
            hour_count += 1
            line_count += 1
        print(f'Processed {line_count} lines.')
async def __price_handler(self, request):
    request = await request.json()
    date = DateHelper.str2date(request['news_date'])
    info = self.get_price_before_date(self.db, request['collection'],
                                      request['key'], date, request['range'])
    date_list = []
    open_list = []
    high_list = []
    low_list = []
    close_list = []
    volume_list = []
    for a in info:
        date_list.append(str(a.get('Date')))
        open_list.append(a.get('Open'))
        high_list.append(a.get('High'))
        low_list.append(a.get('Low'))
        close_list.append(a.get('Close'))
        volume_list.append(a.get('Volume'))
    res = {
        'Title': request['collection'] + " - " + request['key'],
        'PriceDate': date_list,
        'OpenPrice': open_list,
        'HighPrice': high_list,
        'LowPrice': low_list,
        'ClosePrice': close_list,
        'Volume': volume_list
    }
    res = JSONEncoder().encode(res)
    return web.json_response(res)
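
# Hedged sketch of the JSON body __price_handler expects, inferred from the
# keys read above ('news_date', 'collection', 'key', 'range'); the concrete
# values are hypothetical placeholders.
EXAMPLE_PRICE_REQUEST = {
    "news_date": "2015-05-12T16:07:40Z",  # parsed with DateHelper.str2date
    "collection": "Currency",             # Mongo collection to query
    "key": "EURUSD",                      # instrument key within the collection
    "range": 24                           # look-back window passed to get_price_before_date
}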
def get_date(news):
    date = news['Date']
    rss_date = news['RSS_Date']
    selected_date = rss_date
    if date:
        if DateHelper.is_time_of_date_exist(date):
            try:
                if date > rss_date:
                    selected_date = rss_date
                else:
                    selected_date = date
            except Exception:
                selected_date = date
    elif rss_date:
        selected_date = rss_date
    else:
        try:
            metadata = news['Meta_Data'].get("pubdate")
            if metadata:
                return DateHelper.str2date(metadata)
            else:
                html = news['HTML']
                sub_index = html.find('publishDate')
                print(news['Meta_Data']["pubdate"])
                if sub_index > 0:
                    date = html[sub_index:(sub_index + 100)]
                    result = re.search('publishDate":"(.*?)",', date)
                    if result:
                        print(result.group(1))
                        selected_date = DateHelper.str2date(result.group(1))
                    else:
                        return None
                else:
                    return None
        except Exception:
            return None
    return selected_date
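
# Hedged sketch of the news document get_date works on, inferred only from the
# keys accessed above ('Date', 'RSS_Date', 'Meta_Data', 'HTML'); the field
# values are hypothetical placeholders.
EXAMPLE_NEWS_DOC = {
    "Date": None,                                      # article date, may be missing
    "RSS_Date": DateHelper.str2date("2015-05-12T16:07:40Z"),
    "Meta_Data": {"pubdate": "2015-05-12T16:07:40Z"},
    "HTML": '... "publishDate":"2015-05-12T16:07:40Z", ...'
}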
def collect(self):
    db = Mongo()
    conn = sqlite3.connect(self.SQL_LOCATION)
    c = conn.cursor()
    c.execute(
        'SELECT title, author, date, publication, category, digital, section, url FROM longform'
    )
    line_count = 0
    date_count = 0
    newslist = []
    for row in c:
        url = row[self.Url]
        date = DateHelper.str2date(row[self.Date])
        title = row[self.Title]
        if url == "" or url is None or date == "":  # Is There Url Or Date
            continue
        if db.is_title_url_exists(title, url):
            continue
        allUrls = FileCollector.extract_url_from_text(url)
        article = Article(allUrls[1])
        category = row[self.Category]
        section = row[self.Section]
        newslist.append(
            News.RssNews(title=title,
                         time=date,
                         summery='',
                         category=FileCollector.get_category(category, section),
                         tags='',
                         url=allUrls[1],
                         iaurl=allUrls[0],
                         article=article))
        print(line_count)
        if len(newslist) == 20:
            pool = NewsPool()
            pool.set(newslist)
            pool.join()
            newslist = []
        line_count += 1
    print(f'\t{line_count}')
    print(f'\t{len(newslist)}')
def __init__(self, config, epochs=None, batch_size=None, seq_length=None,
             use_gpu=None, lr=None, hidden_size=None):
    self.config = config
    if epochs is None:
        self.epochs = config["networkConfig"]["epochs"]
    else:
        self.epochs = epochs
    if batch_size is None:
        self.batch_size = config["networkConfig"]["batch_size"]
    else:
        self.batch_size = batch_size
    if seq_length is None:
        self.seq_length = self.config["networkConfig"]["sequence_length"]
    else:
        self.seq_length = seq_length
    if use_gpu is None:
        self.use_gpu = self.config["networkConfig"]["useGPU"]
    else:
        self.use_gpu = use_gpu
    if hidden_size is None:
        if self.config["networkConfig"]["hidden_size"] < 0:
            self.hidden_size = None
        else:
            self.hidden_size = self.config["networkConfig"]["hidden_size"]
    else:
        self.hidden_size = hidden_size
    if lr is None:
        self.lr = self.config["networkConfig"]["learning_rate"]
    else:
        self.lr = lr
    self.timer = Timer()
    self.current_date = DateHelper.get_current_date()
    self.criterion = self.load_criterion()
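
# Hedged sketch of the minimal config this constructor reads, based only on
# the keys accessed above; the values are hypothetical placeholders. A
# negative hidden_size means the hidden size is left unset (None).
EXAMPLE_NETWORK_CONFIG = {
    "networkConfig": {
        "epochs": 10,
        "batch_size": 32,
        "sequence_length": 50,
        "useGPU": True,
        "hidden_size": -1,       # negative -> self.hidden_size = None
        "learning_rate": 0.001
    }
}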
def parse_stock(currency_key, directory, name, interval):  # Type : 3 - Stock
    print("Stock")
    col = Mongo().create_collection("Stock", FDC.get_index_models())
    with open(directory) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        print(currency_key)
        for row in csv_reader:
            if len(row) < 2:  # Check Data
                continue
            date = DateHelper.str2date(row[0])
            if interval == 60:
                fd = FinancialData(name, currency_key, date,
                                   row[FDLocations.Stock_Open.value],
                                   row[FDLocations.Stock_High.value],
                                   row[FDLocations.Stock_Low.value],
                                   row[FDLocations.Stock_Close.value],
                                   row[FDLocations.Stock_Volume.value],
                                   row[FDLocations.Stock_Trade.value],
                                   row[FDLocations.Stock_Avg.value])
                col.insert(fd.get_stock())
            else:
                print("Not Handled !!!")
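
# Hedged usage sketch for parse_stock; the ticker, CSV path, and display name
# are hypothetical placeholders. Only 60-minute files are handled; any other
# interval falls through to the "Not Handled !!!" branch above.
def _example_parse_stock():
    parse_stock("AAPL", "data/AAPL_60min.csv", "Apple Inc.", interval=60)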
def train(self, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    """ Training a network

        Arguments
        ---------
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    """
    df = pandas.DataFrame(
        columns=['Epoch', 'Step', 'Last Train Loss', 'Mean Test Loss'])
    self.timer.start()
    self.model.train()
    if self.model.train_on_gpu:
        self.model.cuda()
    counter = 0
    h = None
    for e in range(self.epochs):
        if h is None:  # initialize hidden state
            h = self.model.init_hidden(self.reader.batch_size)
        for x, y in self.reader.get_train_data():  # get_batches(data, batch_size, seq_length)
            counter += 1
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            if self.model.train_on_gpu:
                inputs, targets = inputs.cuda(), targets.cuda()
            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])
            # zero accumulated gradients
            self.model.zero_grad()
            # get the output from the model
            output, h = self.model(inputs, h)  # Input Should Be 3-Dimensional: seq_len, batch, input_size
            # calculate the loss and perform back propagation
            loss = self.criterion(
                output,
                targets.view(self.reader.batch_size * self.reader.sequence_length))
            loss.backward()
            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(self.model.parameters(), clip)
            self.optimizer.step()
            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = self.model.init_hidden(self.reader.batch_size)
                val_losses = []
                self.model.eval()
                for x, y in self.reader.get_test_data():  # get_batches(val_data, batch_size, seq_length)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)
                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])
                    inputs, targets = x, y
                    if self.model.train_on_gpu:
                        inputs, targets = inputs.cuda(), targets.cuda()
                    output, val_h = self.model(inputs, val_h)
                    val_loss = self.criterion(
                        output,
                        targets.view(self.reader.batch_size * self.reader.sequence_length))
                    val_losses.append(val_loss.item())
                self.model.train()  # reset to train mode after iterating through validation data
                print("Epoch: {}/{}...".format(e + 1, self.epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))
                df = df.append(
                    {
                        'Epoch': "{}/{}".format(e + 1, self.epochs),
                        'Step': counter,
                        'Last Train Loss': loss.item(),
                        'Mean Test Loss': np.mean(val_losses)
                    },
                    ignore_index=True)
    self.timer.stop()
    self.save_model()
    date = DateHelper.get_current_date()
    Export.append_df_to_excel(df, date)
    Export.append_df_to_excel(self.get_info(), date)
def train(self, clip=5, val_frac=0.1, print_every=20):
    """ Training a network

        Arguments
        ---------
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    """
    df = pandas.DataFrame(
        columns=['Epoch', 'Step', 'Last Train Loss', 'Mean Test Loss'])
    self.timer.start()
    self.model.train()
    if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
        self.model.cuda()
    counter = 0
    h = None
    for e in range(self.epochs):
        h = self.model.init_hidden(self.reader.batch_size)
        print(self.config["options"]["network_type"])
        print(NewsDnnGeneralDataReader.DictDataType[self.config["options"]["network_type"]])
        # Batch Loop
        for x, y in self.reader.get_data(
                fetch_type=NewsDnnGeneralDataReader.DictDataTerm["Train"],
                data_type=NewsDnnGeneralDataReader.DictDataType[
                    self.config["options"]["network_type"]]):
            counter += 1
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
                inputs, targets = inputs.cuda(), targets.cuda()
            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])
            # zero accumulated gradients
            self.model.zero_grad()
            # get the output from the model
            output, h = self.model(inputs, h)  # Input Should Be 3-Dimensional: seq_len, batch, input_size
            # calculate the loss and perform back propagation
            loss = self.criterion(output.squeeze(), targets.long())
            loss.backward()
            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(self.model.parameters(), clip)
            self.optimizer.step()
            # Validate
            if counter % print_every == 0:
                timer = Timer()
                timer.start()
                df = self.validate(df, e, counter, loss)
                timer.stop(time_for="Validate")
                self.model.train()
    self.timer.stop(time_for="Train")
    self.save_model()
    self.current_date = DateHelper.get_current_date()
    Export.append_df_to_excel(df, self.current_date)
    Export.append_df_to_excel(self.get_info(), self.current_date)
def train(self, clip=5, val_frac=0.1, print_every=20):
    """ Training a network

        Arguments
        ---------
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    """
    df = pandas.DataFrame(columns=['Epoch', 'Step',
                                   'Train Mean Loss Cumulative', 'Train Accuracy',
                                   'Val Mean Loss', 'Val Accuracy'])
    self.timer.start()
    self.model.train()
    if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
        self.model.cuda()
    counter = 0
    for e in range(self.epochs):
        print(self.config["options"]["network_type"])
        print(NewsDnnBaseDataReader.DictDataType[
                  self.config["options"]["network_type"]])
        train_accuracy = 0
        losses = []
        # Batch Loop
        for x, y in self.reader.get_data(
                fetch_type=NewsDnnBaseDataReader.DictDataTerm["Train"],
                data_type=NewsDnnBaseDataReader.DictDataType[
                    self.config["options"]["network_type"]]):
            counter += 1
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
                inputs, targets = inputs.cuda(), targets.cuda()
            # zero accumulated gradients
            self.optimizer.zero_grad()  # self.model.zero_grad()
            # get the output from the model
            output = self.model(inputs)  # Input Should Be 3-Dimensional: seq_len, batch, input_size
            # calculate the loss and perform back propagation
            loss = self.criterion(output, targets.long())
            loss.backward()
            losses.append(loss.item())
            train_accuracy += self.calculate_accuracy(output, targets)
            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(self.model.parameters(), clip)
            self.optimizer.step()
            # Validate In Steps
            if counter % print_every == 0:
                timer = Timer()
                timer.start()
                df = self.validate(df, e, counter, losses, train_accuracy, print_every)
                train_accuracy = 0  # Clear Train Accuracy
                timer.stop(time_for="Validate")
                self.model.train()
    self.timer.stop(time_for="Train")
    self.save_model()
    self.current_date = DateHelper.get_current_date()
    Export.append_df_to_excel(df, self.current_date)
    Export.append_df_to_excel(self.get_info(), self.current_date)
def train(self, print_every=20):
    df = pandas.DataFrame(columns=['Epoch', 'Step',
                                   'Train Mean Loss Cumulative', 'Train Accuracy',
                                   'Val Mean Loss', 'Val Accuracy'])
    self.timer.start()
    self.model.train()  # Set mode of model
    losses = []
    train_set = self.reader.get_data(
        fetch_type=NewsCateDataReader.DictDataTerm["Train"],
        data_type=NewsCateDataReader.DictDataType[
            self.config["options"]["network_type"]])
    for e in range(self.epochs):
        print(self.config["options"]["network_type"])
        print(NewsCateDataReader.DictDataType[
                  self.config["options"]["network_type"]])
        self.model.train()  # Set to Train Mode
        total_loss_for_epoch = 0
        epoch_timer = Timer()
        epoch_timer.start()
        for step, batch in enumerate(train_set):  # For each batch of training data...
            # Progress update every `print_every` batches.
            if step % print_every == 0:
                # Report progress.
                print(' Batch {:>5,} of {:>5,}.'.format(step, len(train_set)))
            # Get Data
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_labels = batch[2].to(self.device)
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            self.model.zero_grad()
            # Perform a forward pass (evaluate the model on this training batch).
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = self.model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 labels=b_labels)
            loss = outputs[0]
            total_loss_for_epoch += loss.item()
            # Perform a backward pass to calculate the gradients.
            loss.backward()
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            # Update parameters using the computed gradients; the optimizer decides
            # how they are modified based on their gradients, the learning rate, etc.
            self.optimizer.step()
            # Update the learning rate.
            self.scheduler.step()
        # Calculate the average loss over the training data.
        avg_train_loss = total_loss_for_epoch / len(train_set)
        # Store the loss value for plotting the learning curve.
        losses.append(avg_train_loss)
        LoggerHelper.info(" Average training loss: {0:.2f}".format(avg_train_loss))
        epoch_timer.stop(time_for="Epoch")
        timer = Timer(start=True)
        df = self.validate(df, e, losses)
        timer.stop(time_for="Validate")
        self.model.train()
    self.timer.stop(time_for="Train")
    self.save_model()
    self.current_date = DateHelper.get_current_date()
    Export.append_df_to_excel(df, self.current_date)
    Export.append_df_to_excel(self.get_info(), self.current_date)