def __init__(self, config, epochs=None, batch_size=None, seq_length=None,
             use_gpu=None, lr=None, hidden_size=None):
    self.config = config
    if epochs is None:
        self.epochs = config["networkConfig"]["epochs"]
    else:
        self.epochs = epochs
    if batch_size is None:
        self.batch_size = config["networkConfig"]["batch_size"]
    else:
        self.batch_size = batch_size
    if seq_length is None:
        self.seq_length = self.config["networkConfig"]["sequence_length"]
    else:
        self.seq_length = seq_length
    if use_gpu is None:
        self.use_gpu = self.config["networkConfig"]["useGPU"]
    else:
        self.use_gpu = use_gpu
    if hidden_size is None:
        if self.config["networkConfig"]["hidden_size"] < 0:
            self.hidden_size = None
        else:
            self.hidden_size = self.config["networkConfig"]["hidden_size"]
    else:
        self.hidden_size = hidden_size
    if lr is None:
        self.lr = self.config["networkConfig"]["learning_rate"]
    else:
        self.lr = lr
    self.timer = Timer()
    self.current_date = DateHelper.get_current_date()
    self.criterion = self.load_criterion()
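# A minimal sketch of the "networkConfig" block this constructor falls back to when its
# keyword arguments are left as None. Only the keys referenced above are shown; the
# values and any other sections of the config file are assumptions.
example_config = {
    "networkConfig": {
        "epochs": 10,
        "batch_size": 32,
        "sequence_length": 50,
        "useGPU": True,
        "hidden_size": -1,        # a negative value means "use the model default" (hidden_size = None)
        "learning_rate": 0.001
    }
}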
class WordEmbedding(object):
    Words = {}

    def __init__(self, path=None, vector_size=100, word_processing=None):
        self.timer = Timer()
        self.manager = multiprocessing.Manager()
        WordEmbedding.Words = self.manager.dict()
        self.vector_size = vector_size
        if path is None:
            self.path = 'glove.6B.100d.txt'
        else:
            self.path = path
        if word_processing is None:
            self.word_processing = PreProcessing()
        else:
            self.word_processing = word_processing
        self.__read_embeddings()

    def __read_embeddings(self):
        WordEmbedding.Words = self.manager.dict()
        self.timer.start()
        with open(self.path, 'r', encoding="utf-8") as f:
            for line in f:
                values = line.split()
                word = values[0]
                if self.word_processing.is_stop_word_or_punctuation(word):
                    continue
                vector = np.asarray(values[1:], dtype=np.float32)
                WordEmbedding.Words[word] = vector
        self.timer.stop(time_for='Word Embedding Loading')

    def __read_embeddings_gensim(self):
        self.timer.start()
        WordEmbedding.Words = gensim.models.KeyedVectors.load_word2vec_format(
            self.path, binary=True)
        self.timer.stop(time_for='Word Embedding Loading')

    @staticmethod
    def vec(w):
        return WordEmbedding.Words[w]

    @staticmethod
    def find_closest_embeddings(embedding):
        return sorted(WordEmbedding.Words.keys(),
                      key=lambda word: spatial.distance.cosine(
                          WordEmbedding.Words[word], embedding))

    @staticmethod
    def multi_cosine_distance_word_embedding(count, date, news_title):
        cpu_count = int(multiprocessing.cpu_count() / 2)
        p = multiprocessing.Pool(cpu_count)
        numbers = list()
        total = int(count / cpu_count)
        for a in range(cpu_count):
            if a == cpu_count - 1:
                info = {
                    "skip": total * a,
                    "to": (total + (count % cpu_count)),
                    "date": date,
                    "news_title": news_title
                }
                numbers.append(info)
            else:
                info = {
                    "skip": total * a,
                    "to": total,
                    "date": date,
                    "news_title": news_title
                }
                numbers.append(info)
        result = p.map(WordEmbedding.calculate_distance_for_tweet, numbers)
        p.close()
        p.join()
        return sum(result)

    @staticmethod
    def calculate_distance_for_tweet(info):
        skip = info["skip"]
        get = info["to"]
        date = info["date"]
        title = info["news_title"]
        db = Mongo(test=2)
        pre = PreProcessing()
        tweets = WordEmbedding.get_tweets_before_date(
            db, date).skip(skip).limit(get)
        tweetcount = 0
        count = 0
        print(get)
        vector = WordEmbedding.get_vector_list(title)
        for tweet in tweets:
            tweetcount += 1
            try:
                cosine = WordEmbedding.cosine_distance_word_embedding_with_vector(
                    vector, pre.preprocess(tweet["tweet_text"]))
                percentage = round((1 - cosine) * 100, 2)
            except Exception as exception:
                print("Exception: {}".format(exception))
                percentage = 0
            if percentage > 80:
                count += 1
                if tweet["tweet_user_verified"]:
                    count += 1
        print("count" + str(count))
        return count

    @staticmethod
    def get_tweets_before_date(db, date, collection="Tweet", days=5):
        start = date - timedelta(days=days)
        end = date
        query = {"tweet_created_at": {"$gte": start, "$lt": end}}
        fields = {
            "tweet_text": 1,
            "tweet_user_fallowers_count": 1,
            "tweet_user_verified": 1,
            "tweet_created_at": 1,
            "_id": 0
        }
        return db.get_data(collection, query, fields).sort([("tweet_created_at", 1)])

    @staticmethod
    def cosine_distance_word_embedding_with_vector(vector, s2):
        vector2 = WordEmbedding.get_vector_list(s2)
        if vector2 is np.NaN:
            return 0.99
        else:
            mean = np.mean(vector, axis=0)
            mean2 = np.mean(vector2, axis=0)
            cosine = spatial.distance.cosine(mean, mean2)
            return cosine

    @staticmethod
    def cosine_distance_word_embedding(s1, s2):
        try:
            vector_1 = np.mean(WordEmbedding.get_vector_list(s1), axis=0)
            vector_2 = np.mean(WordEmbedding.get_vector_list(s2), axis=0)
        except Exception:
            return 0.99
        cosine = spatial.distance.cosine(vector_1, vector_2)
        return cosine

    @staticmethod
    def get_vector_list(paragraph):
        word_to_vector_list = []
        for word in paragraph:
            if word in WordEmbedding.Words:
                word_to_vector_list.append(WordEmbedding.vec(word))
        if len(word_to_vector_list) == 0:
            return np.NaN
        return word_to_vector_list

    def _similarity_query(self, word_vec, number):
        # Rank all embedded words by cosine similarity to word_vec.
        # https://github.com/maciejkula/glove-python/blob/749494290fdfd24379dcc2e244c583ee61808634/glove/glove.py#L273
        # https://stats.stackexchange.com/questions/242863/how-does-python-glove-compute-most-similar
        words = list(WordEmbedding.Words.keys())
        words_matrix = np.asarray([WordEmbedding.Words[w] for w in words])
        dst = (np.dot(words_matrix, word_vec)
               / np.linalg.norm(words_matrix, axis=1)
               / np.linalg.norm(word_vec))
        word_ids = np.argsort(-dst)
        return [(words[x], dst[x]) for x in word_ids[:number]]

    def get_weight_matrix(self, article):
        vocabulary_size = len(article)
        embedding_matrix = np.zeros((vocabulary_size, self.vector_size), dtype=np.double)
        for index in range(vocabulary_size):
            word = article[index]
            embedding_vector = WordEmbedding.Words.get(word)
            if embedding_vector is not None:
                embedding_matrix[index] = embedding_vector
        return embedding_matrix

    def get_weight_matrix_with_wiki_tweet(self, article, wiki, tweet):
        vocabulary_size = len(article)
        embedding_matrix = np.zeros((vocabulary_size, self.vector_size + 2), dtype=np.double)
        for index in range(vocabulary_size):
            word = article[index]
            embedding_vector = WordEmbedding.Words.get(word)
            if embedding_vector is not None:
                # Append the wiki value (scaled by 1/100) and the tweet value to the word vector
                embedding_matrix[index] = np.append(embedding_vector, [wiki / 100, tweet])
        return embedding_matrix

    def get_weight_matrix_with_wiki(self, article, wiki):
        vocabulary_size = len(article)
        embedding_matrix = np.zeros((vocabulary_size, self.vector_size + 1), dtype=np.double)
        for index in range(vocabulary_size):
            word = article[index]
            embedding_vector = WordEmbedding.Words.get(word)
            if embedding_vector is not None:
                # Add Wiki Info
                embedding_matrix[index] = np.append(embedding_vector, wiki / 100)
        return embedding_matrix

    def get_weight_matrix_all(self, article, wiki=None, wiki_multiply_factors=0,
                              tweet=None, tweet_multiply_factors=0):
        vocabulary_size = len(article)
        vector_size = self.vector_size + wiki_multiply_factors + tweet_multiply_factors
        embedding_matrix = np.zeros((vocabulary_size, vector_size), dtype=np.double)
        for index in range(vocabulary_size):
            word = article[index]
            embedding_vector = WordEmbedding.Words.get(word)
            if embedding_vector is not None:
                if wiki is not None:
                    wiki_array = np.full(wiki_multiply_factors, wiki / 100)
                    embedding_vector = np.append(embedding_vector, wiki_array)
                if tweet is not None:
                    tweet_array = np.full(tweet_multiply_factors, tweet)
                    embedding_vector = np.append(embedding_vector, tweet_array)
                embedding_matrix[index] = embedding_vector
        return embedding_matrix
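# A minimal usage sketch of WordEmbedding (not part of the original code), assuming a
# local 'glove.6B.100d.txt' file and that PreProcessing.preprocess() returns a list of
# tokens, which is what the methods above expect. The example sentences are illustrative.
def word_embedding_usage_example():
    embedding = WordEmbedding(path='glove.6B.100d.txt', vector_size=100)
    pre = PreProcessing()
    title = pre.preprocess("Oil prices rise after supply cuts")
    tweet = pre.preprocess("Crude oil climbs as producers cut output")
    distance = WordEmbedding.cosine_distance_word_embedding(title, tweet)
    print("Similarity: {:.2f}%".format((1 - distance) * 100))
    # Build a (len(title) x vector_size) weight matrix for a tokenized text
    matrix = embedding.get_weight_matrix(title)
    print(matrix.shape)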
class TaMain(object):
    """
    Initializer

    Arguments
    ---------
    epochs: Number of epochs to train
    batch_size: Number of mini-sequences per mini-batch, aka batch size
    seq_length: Number of character steps per mini-batch
    """

    def __init__(self, epochs, batch_size, seq_length):
        self.epochs = epochs
        self.config = self.get_config()
        self.model: TaModel = TaModel()
        self.reader = TaDataReader(self.config['data'], batch_size, seq_length)
        self.timer = Timer()
        # Network Information
        self.criterion = nn.MSELoss()  # nn.CrossEntropyLoss() - nn.NLLLoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.003)
        print(self.reader.get_train_count())
        print(self.reader.get_test_count())

    def train(self, lr=0.001, clip=5, val_frac=0.1, print_every=10):
        """
        Training a network

        Arguments
        ---------
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
        """
        df = pandas.DataFrame(
            columns=['Epoch', 'Step', 'Last Train Loss', 'Mean Test Loss'])
        self.timer.start()
        self.model.train()
        if self.model.train_on_gpu:
            self.model.cuda()
        counter = 0
        h = None
        for e in range(self.epochs):
            if h is None:
                # initialize hidden state
                h = self.model.init_hidden(self.reader.batch_size)
            for x, y in self.reader.get_train_data():  # get_batches(data, batch_size, seq_length)
                counter += 1
                inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
                if self.model.train_on_gpu:
                    inputs, targets = inputs.cuda(), targets.cuda()
                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                h = tuple([each.data for each in h])
                # zero accumulated gradients
                self.model.zero_grad()
                # get the output from the model
                output, h = self.model(inputs, h)  # Input should be 3-dimensional: seq_len, batch, input_size
                # calculate the loss and perform back propagation
                loss = self.criterion(
                    output,
                    targets.view(self.reader.batch_size * self.reader.sequence_length))
                loss.backward()
                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                nn.utils.clip_grad_norm_(self.model.parameters(), clip)
                self.optimizer.step()
                # loss stats
                if counter % print_every == 0:
                    # Get validation loss
                    val_h = self.model.init_hidden(self.reader.batch_size)
                    val_losses = []
                    self.model.eval()
                    for x, y in self.reader.get_test_data():  # get_batches(val_data, batch_size, seq_length)
                        x, y = torch.from_numpy(x), torch.from_numpy(y)
                        # Creating new variables for the hidden state, otherwise
                        # we'd backprop through the entire training history
                        val_h = tuple([each.data for each in val_h])
                        inputs, targets = x, y
                        if self.model.train_on_gpu:
                            inputs, targets = inputs.cuda(), targets.cuda()
                        output, val_h = self.model(inputs, val_h)
                        val_loss = self.criterion(
                            output,
                            targets.view(self.reader.batch_size * self.reader.sequence_length))
                        val_losses.append(val_loss.item())
                    self.model.train()  # reset to train mode after iterating through validation data
                    print("Epoch: {}/{}...".format(e + 1, self.epochs),
                          "Step: {}...".format(counter),
                          "Loss: {:.4f}...".format(loss.item()),
                          "Val Loss: {:.4f}".format(np.mean(val_losses)))
                    df = df.append(
                        {
                            'Epoch': "{}/{}".format(e + 1, self.epochs),
                            'Step': counter,
                            'Last Train Loss': loss.item(),
                            'Mean Test Loss': np.mean(val_losses)
                        },
                        ignore_index=True)
        self.timer.stop()
        self.save_model()
        date = DateHelper.get_current_date()
        Export.append_df_to_excel(df, date)
        Export.append_df_to_excel(self.get_info(), date)

    def test(self):
        # Test the network
        for data in self.reader.get_test_data():
            # Format Data
            print(data)

    def get_info(self):
        info = pandas.DataFrame(columns=[
            'Database', 'Key', 'Batch Size', 'Sequence Length', 'Input Size',
            'Hidden', 'Number of Layers', 'Dropout Prob', 'Learning Rate'
        ])
        info = info.append(
            {
                'Database': self.config["data"]["db"],
                'Key': self.config["data"]["train_query"]["Key"],
                'Batch Size': self.reader.batch_size,
                'Sequence Length': self.reader.sequence_length,
                'Input Size': self.model.input_size,
                'Hidden': self.model.hidden,
                'Number of Layers': self.model.num_layers,
                'Dropout Prob': self.model.drop_prob,
                'Learning Rate': self.model.lr
            },
            ignore_index=True)
        return info

    def get_save_file_name(self):
        # Build a timestamped checkpoint path
        save_file_name = os.path.join(
            self.config["model"]["save_dir"],
            '%s-e%s(%s-%s).pth' % (dt.datetime.now().strftime('%d%m%Y-%H%M%S'),
                                   str(self.epochs),
                                   self.config["data"]["db"],
                                   self.config["data"]["train_query"]["Key"]))
        return save_file_name

    def save_model(self):
        # Serialize model and optimizer state to a .pth checkpoint
        save_file_name = self.get_save_file_name()
        checkpoint = {
            'model': TaModel(),
            'model_state_dict': self.model.state_dict(),
            'optimizer': optim.Adam(self.model.parameters(), lr=0.003),
            'optimizer_state_dict': self.optimizer.state_dict()
        }
        torch.save(checkpoint, save_file_name)
        print("Model Saved to disk")

    def load_model(self, path):
        checkpoint = torch.load(path)
        self.model = checkpoint['model']
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer = checkpoint['optimizer']
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        print("Model loaded from disk")

    @staticmethod
    def get_config():
        pwd = os.path.dirname(os.path.abspath(__file__))
        return json.load(open(pwd + '/config.json', 'r'), cls=DateTimeDecoder)
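# A minimal usage sketch of TaMain (not part of the original code), assuming config.json
# sits next to the module and provides valid 'data' and 'model' sections; the epoch,
# batch and sequence values are illustrative only.
def ta_main_usage_example():
    ta = TaMain(epochs=20, batch_size=32, seq_length=50)
    ta.train(print_every=10)  # validates every 10 steps, exports losses to Excel, saves a .pth checkpoint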
def train(self, clip=5, val_frac=0.1, print_every=20):
    """
    Training a network

    Arguments
    ---------
    clip: gradient clipping
    val_frac: Fraction of data to hold out for validation
    print_every: Number of steps for printing training and validation loss
    """
    df = pandas.DataFrame(
        columns=['Epoch', 'Step', 'Last Train Loss', 'Mean Test Loss'])
    self.timer.start()
    self.model.train()
    if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
        self.model.cuda()
    counter = 0
    h = None
    for e in range(self.epochs):
        h = self.model.init_hidden(self.reader.batch_size)
        print(self.config["options"]["network_type"])
        print(NewsDnnGeneralDataReader.DictDataType[self.config["options"]["network_type"]])
        # Batch Loop
        for x, y in self.reader.get_data(
                fetch_type=NewsDnnGeneralDataReader.DictDataTerm["Train"],
                data_type=NewsDnnGeneralDataReader.DictDataType[
                    self.config["options"]["network_type"]]):
            counter += 1
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
                inputs, targets = inputs.cuda(), targets.cuda()
            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])
            # zero accumulated gradients
            self.model.zero_grad()
            # get the output from the model
            output, h = self.model(inputs, h)  # Input should be 3-dimensional: seq_len, batch, input_size
            # calculate the loss and perform back propagation
            loss = self.criterion(output.squeeze(), targets.long())
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(self.model.parameters(), clip)
            self.optimizer.step()
            # Validate
            if counter % print_every == 0:
                timer = Timer()
                timer.start()
                df = self.validate(df, e, counter, loss)
                timer.stop(time_for="Validate")
                self.model.train()
    self.timer.stop(time_for="Train")
    self.save_model()
    self.current_date = DateHelper.get_current_date()
    Export.append_df_to_excel(df, self.current_date)
    Export.append_df_to_excel(self.get_info(), self.current_date)
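# `self.model.init_hidden(batch_size)` is called above but not defined in this excerpt.
# A minimal sketch of the usual LSTM pattern, assuming the model exposes `num_layers`
# and `hidden_size` attributes (those names are assumptions, not taken from this code):
def init_hidden(self, batch_size):
    # Create zeroed hidden and cell states on the same device/dtype as the model weights
    weight = next(self.parameters()).data
    hidden = (weight.new_zeros(self.num_layers, batch_size, self.hidden_size),
              weight.new_zeros(self.num_layers, batch_size, self.hidden_size))
    return hidden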
def train(self, clip=5, val_frac=0.1, print_every=20):
    """
    Training a network

    Arguments
    ---------
    clip: gradient clipping
    val_frac: Fraction of data to hold out for validation
    print_every: Number of steps for printing training and validation loss
    """
    df = pandas.DataFrame(columns=['Epoch', 'Step', 'Train Mean Loss Cumulative',
                                   'Train Accuracy', 'Val Mean Loss', 'Val Accuracy'])
    self.timer.start()
    self.model.train()
    if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
        self.model.cuda()
    counter = 0
    for e in range(self.epochs):
        print(self.config["options"]["network_type"])
        print(NewsDnnBaseDataReader.DictDataType[
            self.config["options"]["network_type"]])
        train_accuracy = 0
        losses = []
        # Batch Loop
        for x, y in self.reader.get_data(
                fetch_type=NewsDnnBaseDataReader.DictDataTerm["Train"],
                data_type=NewsDnnBaseDataReader.DictDataType[
                    self.config["options"]["network_type"]]):
            counter += 1
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            if self.model.can_use_gpu and self.config["networkConfig"]["useGPU"]:
                inputs, targets = inputs.cuda(), targets.cuda()
            # zero accumulated gradients
            self.optimizer.zero_grad()  # self.model.zero_grad()
            # get the output from the model
            output = self.model(inputs)  # Input should be 3-dimensional: seq_len, batch, input_size
            # calculate the loss and perform back propagation
            loss = self.criterion(output, targets.long())
            loss.backward()
            losses.append(loss.item())
            train_accuracy += self.calculate_accuracy(output, targets)
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(self.model.parameters(), clip)
            self.optimizer.step()
            # Validate In Steps
            if counter % print_every == 0:
                timer = Timer()
                timer.start()
                df = self.validate(df, e, counter, losses, train_accuracy, print_every)
                train_accuracy = 0  # Clear Train Accuracy
                timer.stop(time_for="Validate")
                self.model.train()
    self.timer.stop(time_for="Train")
    self.save_model()
    self.current_date = DateHelper.get_current_date()
    Export.append_df_to_excel(df, self.current_date)
    Export.append_df_to_excel(self.get_info(), self.current_date)
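# `self.calculate_accuracy(output, targets)` is used above but not shown in this excerpt.
# A minimal sketch, assuming `output` holds per-class logits of shape (batch, num_classes)
# and that the running total is a count of correct predictions; the real helper may
# return a fraction instead:
def calculate_accuracy(self, output, targets):
    predictions = torch.argmax(output, dim=1)                # predicted class per example
    correct = (predictions == targets.long()).sum().item()   # number of correct predictions
    return correct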
def train(self, print_every=20):
    df = pandas.DataFrame(columns=['Epoch', 'Step', 'Train Mean Loss Cumulative',
                                   'Train Accuracy', 'Val Mean Loss', 'Val Accuracy'])
    self.timer.start()
    self.model.train()  # Set mode of model
    losses = []
    train_set = self.reader.get_data(
        fetch_type=NewsCateDataReader.DictDataTerm["Train"],
        data_type=NewsCateDataReader.DictDataType[
            self.config["options"]["network_type"]])
    for e in range(self.epochs):
        print(self.config["options"]["network_type"])
        print(NewsCateDataReader.DictDataType[
            self.config["options"]["network_type"]])
        self.model.train()  # Set to Train Mode
        total_loss_for_epoch = 0
        epoch_timer = Timer()
        epoch_timer.start()
        # For each batch of training data...
        for step, batch in enumerate(train_set):
            # Progress update every `print_every` batches.
            if step % print_every == 0:
                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_set)))
            # Get Data
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_labels = batch[2].to(self.device)
            # Clear any previously accumulated gradients
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            self.model.zero_grad()
            # Perform a forward pass (evaluate the model on this training batch).
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = self.model(b_input_ids,
                                 token_type_ids=None,
                                 attention_mask=b_input_mask,
                                 labels=b_labels)
            loss = outputs[0]
            total_loss_for_epoch += loss.item()
            # Perform a backward pass to calculate the gradients.
            loss.backward()
            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            # Update parameters: the optimizer dictates how they are
            # modified based on their gradients, the learning rate, etc.
            self.optimizer.step()
            # Update the learning rate.
            self.scheduler.step()
        # Calculate the average loss over the training data.
        avg_train_loss = total_loss_for_epoch / len(train_set)
        # Store the loss value for plotting the learning curve.
        losses.append(avg_train_loss)
        LoggerHelper.info("  Average training loss: {0:.2f}".format(avg_train_loss))
        epoch_timer.stop(time_for="Epoch")
        timer = Timer(start=True)
        df = self.validate(df, e, losses)
        timer.stop(time_for="Validate")
        self.model.train()
    self.timer.stop(time_for="Train")
    self.save_model()
    self.current_date = DateHelper.get_current_date()
    Export.append_df_to_excel(df, self.current_date)
    Export.append_df_to_excel(self.get_info(), self.current_date)
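# `self.device`, `self.optimizer` and `self.scheduler` are used above but created
# elsewhere. A minimal sketch of a typical setup for fine-tuning
# BertForSequenceClassification; the AdamW hyperparameters and the use of
# get_linear_schedule_with_warmup are assumptions, not taken from this code.
import torch
from transformers import AdamW, get_linear_schedule_with_warmup

def setup_optimization(self, train_set):
    # Pick GPU when available, otherwise CPU, and move the model there
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.model.to(self.device)
    self.optimizer = AdamW(self.model.parameters(), lr=2e-5, eps=1e-8)
    # One scheduler step per batch: batches per epoch * number of epochs
    total_steps = len(train_set) * self.epochs
    self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
                                                     num_warmup_steps=0,
                                                     num_training_steps=total_steps)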