def main():
    from sklearn.preprocessing import LabelEncoder
    import pickle

    print("-" * 20 + " " + config.WORD2VEC_MODEL + " embedding model is loading " + "-" * 20)
    os.makedirs("preprocessed_data", exist_ok=True)

    # Load the cached word vectors if present; otherwise build them and cache to disk.
    try:
        with open("preprocessed_data/word.vec", 'rb') as file:
            word2vec = pickle.load(file)
        wordvec_index = word2vec['wordvec_index']
        word_vectors = word2vec['word_vectors']
    except (OSError, KeyError, pickle.UnpicklingError):
        wordvec_index, word_vectors = load_word2vec()
        word2vec = {
            "wordvec_index": wordvec_index,
            "word_vectors": word_vectors
        }
        with open("preprocessed_data/word.vec", 'wb') as file:
            pickle.dump(word2vec, file)

    train_set = preprocessing(config.TRAIN_DATA_PATH, wordvec_index)
    test_set = preprocessing(config.TEST_DATA_PATH, wordvec_index)

    # Fit the label encoder on the union of train and test labels so that classes
    # appearing in only one split are still encoded consistently (fitting twice in
    # a row would discard the first split's classes).
    lbe = LabelEncoder()
    lbe.fit(np.concatenate([train_set['labels'], test_set['labels']]))
    train_set['labels'] = np.array(lbe.transform(train_set['labels']), dtype='int32')
    test_set['labels'] = np.array(lbe.transform(test_set['labels']), dtype='int32')

    print("train:\n\ttoken_mat.shape: {}, pos_mat1.shape: {}, pos_mat2.shape: {}".format(
        train_set['token_mat'].shape, train_set['pos_mat1'].shape, train_set['pos_mat2'].shape))
    print("test:\n\ttoken_mat.shape: {}, pos_mat1.shape: {}, pos_mat2.shape: {}".format(
        test_set['token_mat'].shape, test_set['pos_mat1'].shape, test_set['pos_mat2'].shape))
    print("data in train/test: {}/{}".format(len(train_set['labels']), len(test_set['labels'])))
    print("There are {} classes in dataset".format(len(lbe.classes_)))

    data = {
        "wordvec_index": wordvec_index,
        "word_vectors": word_vectors,
        "train_set": train_set,
        "test_set": test_set,
        "label_encoder": lbe
    }
    with open('preprocessed_data/data.pkl', 'wb') as file:
        pickle.dump(data, file)
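# Minimal follow-up sketch (not from the original source): downstream code can
# reload the cache written by main() above. The dictionary keys are exactly the
# ones main() pickles; everything else here is illustrative.
import pickle

with open("preprocessed_data/data.pkl", "rb") as file:
    data = pickle.load(file)

train_set = data["train_set"]          # dict with 'token_mat', 'pos_mat1', 'pos_mat2', 'labels'
test_set = data["test_set"]
label_encoder = data["label_encoder"]  # fitted sklearn LabelEncoder
print("classes:", list(label_encoder.classes_))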
def predict(config, text, code, model=None, embedding_input=None):
    if model is None:
        model = load_model(config, code)
    preprocessed = preprocess_text(text)
    if embedding_input is None:
        # Look up a vector for every in-vocabulary token and stack them into a tensor.
        embedding = []
        word_model = load_word2vec(config.embeddings_model)
        for word in preprocessed.split(' '):
            if word in word_model.wv.index2word:
                vec = word_model.wv[word]
                embedding.append(vec)
        embedding_input = Variable(
            torch.Tensor(np_sentence_to_list(embedding)))
    pred = model(embedding_input)
    pred_label = pred.data.max(1)[1].numpy()[0]
    pred_char = get_char_for_binary(code, pred_label)
    return pred_char
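# Illustrative usage sketch (not part of the original source). Only the
# predict(config, text, code, ...) signature above is taken from the code;
# load_config() and the `code` value are hypothetical placeholders for whatever
# configuration object and class code the surrounding project uses.
cfg = load_config()  # hypothetical helper returning the project's config object
predicted_char = predict(cfg, "example input sentence to classify", code=0)
print("prediction:", predicted_char)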
def main(passed_args=None):
    parser = argparse.ArgumentParser(
        description="train a neural network on tweets against prices")
    parser.add_argument(
        "--word2vec",
        "-w",
        dest="word2vec",
        action="store_true",
        default=False,
        help="toggle this option if you are obtaining the dataset using word2vec",
    )
    parser.add_argument(
        "--tune",
        "-t",
        dest="tuning",
        action="store_true",
        default=False,
        help="toggle this option if you are tuning hyperparameters",
    )
    parser.add_argument(
        "--rnn",
        "-r",
        dest="train_rnn",
        action="store_true",
        default=False,
        help="toggle this option to train the rnn",
    )
    parser.add_argument(
        "--predict",
        "-d",
        dest="predict",
        action="store_true",
        default=False,
        help="toggle this option if you are making predictions",
    )
    parser.add_argument(
        "--markowitz",
        "-m",
        dest="markowitz",
        action="store_true",
        default=False,
        help="toggle this option if you are doing Markowitz portfolio optimisation",
    )
    parser.add_argument(
        "--glove",
        "-g",
        dest="glove",
        action="store_true",
        default=False,
        help="toggle this option if you are obtaining the dataset using glove",
    )
    parser.add_argument(
        "--metrics",
        "-f",
        dest="metrics",
        action="store_true",
        default=False,
        help="toggle this option if you are evaluating the metrics",
    )
    args = parser.parse_args(passed_args)

    if args.word2vec:
        # prepare Word2Vec model
        if not os.path.exists(PATH_TO_WORD2VEC):
            w2v.train_word2vec()
        # prepare all data required
        prices = d.load_prices()
        w2v_model = w2v.load_word2vec()
        for stock in stock_universe:
            d.get_return_by_stock(stock, prices)
            d.load_tweets_by_stock(stock)
            w2v.get_padded_embeddings(stock, w2v_model)
        sys.exit()

    if args.glove:
        # prepare all data required
        prices = d.load_prices()
        w2v_model = w2v.load_glove_model(
            path_to_glove="~/Downloads/GloVe-1.2/glove.twitter.27B.50d.txt",
            path_to_output="./temp/glove_pretrained_w2vformat.txt",
        )
        for stock in stock_universe:
            d.get_return_by_stock(stock, prices)
            d.load_tweets_by_stock(stock)
            w2v.get_padded_embeddings(
                stock,
                w2v_model,
                path_to_output="./temp/padded_embeddings/glove_pretrained",
            )
        sys.exit()

    if args.tuning:
        hyperparam_list = get_hyperparam_list(NN_HYPERPARAM_DICT)
        best_hyperparam_list = []
        for stock in stock_universe:
            print(stock)
            x = pd.read_pickle("temp/padded_embeddings/glove_pretrained/pickle/" +
                               stock + ".pickle")
            y = pd.read_pickle("temp/returns/pickle/" + stock + ".pickle")
            torch_dataset = nn.get_tensor_dataset(x, y)
            # Accumulate (hyperparam, final validation loss) pairs across the whole
            # sweep, then keep the best-performing setting for this stock.
            tuning_list = []
            for hyperparam in hyperparam_list:
                train_set, _ = nn.train_test_split(torch_dataset,
                                                   hyperparam["TEST_SIZE"])
                train_set, validation_set = nn.train_test_split(
                    train_set, hyperparam["VALIDATION_SIZE"])
                _, _, validation_losses = nn.train_nn(train_set, validation_set,
                                                      hyperparam)
                tuning_list.append((hyperparam, validation_losses[-1]))
            tuning_list.sort(key=operator.itemgetter(1))
            best_hyperparam = tuning_list[0][0]
            best_hyperparam_list.append((stock, best_hyperparam))
        with open("./temp/best-hyperparam-glove-pretrained.txt", "wb") as f:
            pickle.dump(best_hyperparam_list, f)
        print(best_hyperparam_list)
        sys.exit()

    if args.predict:
        if os.path.exists("./temp/best-hyperparam-glove.txt"):
            with open("./temp/best-hyperparam-glove.txt", "rb") as f:
                best_hyperparam_list = pickle.load(f)
            best_hyperparam_dict = dict(best_hyperparam_list)
            for stock in stock_universe:
                hyperparam = best_hyperparam_dict[stock]
                x = pd.read_pickle("temp/padded_embeddings/glove/pickle/" + stock +
                                   ".pickle")
                y = pd.read_pickle("temp/returns/pickle/" + stock + ".pickle")
                torch_dataset = nn.get_tensor_dataset(x, y)
                _, test_set = nn.train_test_split(torch_dataset,
                                                  hyperparam["TEST_SIZE"])
                results = nn.predict_nn(test_set, "temp/nn/glove/" + stock + ".pth")
                results_df = pd.DataFrame(results)
                results_df.columns = ["y", "pred", "loss"]
                if not os.path.exists("./output/glove"):
                    os.makedirs("./output/glove")
                results_df.to_csv("./output/glove/" + stock + ".csv")
        sys.exit()

    if args.train_rnn:
        eval_only = True
        hyperparam_list = get_hyperparam_list(RNN_HYPERPARAM_DICT)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        for hyperparam in hyperparam_list:
            for stock in stock_universe:
                print(stock)
                returns = pd.read_pickle("temp/returns/pickle/" + stock + ".pickle")
                returns = nn.normalise(
                    torch.tensor(np.stack(returns.values, axis=0), device=device))
                vectorised_seq, vocab = rnn.get_vectorised_seq_by_stock(stock)
                input_size = len(vocab)
                encoder, feedforward, results = rnn.train_rnn(
                    vectorised_seq,
                    returns,
                    input_size,
                    hyperparam,
                    eval_only=eval_only,
                    path_to_encoder="temp/rnn/encoder/" + stock + ".pth",
                    path_to_feedforward="temp/rnn/feedforward/" + stock + ".pth",
                )
                if not eval_only:
                    if not os.path.exists("temp/rnn"):
                        os.makedirs("temp/rnn/encoder")
                        os.makedirs("temp/rnn/feedforward")
                    torch.save(encoder.state_dict(),
                               "temp/rnn/encoder/" + stock + ".pth")
                    torch.save(
                        feedforward.state_dict(),
                        "temp/rnn/feedforward/" + stock + ".pth",
                    )
                results_df = pd.DataFrame(results)
                results_df.columns = ["returns", "pred", "loss"]
                if not os.path.exists("./output/rnn"):
                    os.makedirs("./output/rnn")
                results_df.to_csv("./output/rnn/" + stock + ".csv")
        sys.exit()

    if args.markowitz:
        model_dict = {
            "dtm": "purple",
            "tfidf": "pink",
            "word2vec": "black",
            "glove": "blue",
            "glove_pretrained": "green",
            "rnn": "orange",
            "actual": "red",
        }
        mean_var_dict = d.get_etf_mean_var()
        p.plot_frontier_with_points(model_dict, mean_var_dict)
        # p.plot_frontier(model_dict)
        sys.exit()

    if args.metrics:
        models = [
            "rnn", "glove", "glove_pretrained", "word2vec", "dtm", "tfidf"
        ]
        for model in models:
            me.get_metrics_summary(model)
        sys.exit()

    # default: train the feedforward network on GloVe embeddings with the best
    # hyperparameters found during tuning
    if os.path.exists("./temp/best-hyperparam-glove.txt"):
        with open("./temp/best-hyperparam-glove.txt", "rb") as f:
            best_hyperparam_list = pickle.load(f)
        best_hyperparam_dict = dict(best_hyperparam_list)
        for stock in stock_universe:
            print(stock)
            hyperparam = best_hyperparam_dict[stock]
            x = pd.read_pickle("temp/padded_embeddings/glove/pickle/" + stock +
                               ".pickle")
            y = pd.read_pickle("temp/returns/pickle/" + stock + ".pickle")
            torch_dataset = nn.get_tensor_dataset(x, y)
            train_set, test_set = nn.train_test_split(torch_dataset,
                                                      hyperparam["TEST_SIZE"])
            model, _, _ = nn.train_nn(train_set, test_set, hyperparam)
            if not os.path.exists("temp/nn/glove"):
                os.makedirs("temp/nn/glove")
            torch.save(model.state_dict(), "temp/nn/glove/" + stock + ".pth")
    sys.exit()
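# Illustrative invocation sketch (not from the original source). main() accepts
# an explicit argument list, so each pipeline stage above can be driven
# programmatically as well as from the shell. Note that every branch ends with
# sys.exit(), so each stage is intended to run as a separate invocation.
if __name__ == "__main__":
    main(["--glove"])  # e.g. build GloVe-based padded embeddings and per-stock returns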
def CNN():
    # Load word2vec
    W_in, W_out = w2v.load_word2vec(num_review, numwords, mode, dimension)
    print(gd.get_splited_reviews(datas, w2i)[0])
max_length = 150
review_pad = pad_sequences(sequences, maxlen=max_length)

# train/test split
print('Step9: train and test set generation...')
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(review_pad,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=0)

# word2vec embedding matrix
print('Step10: Generating word2vec embedding matrix...')
num_words = len(tokenizer_word_index) + 1
embedding_matrix_w2v = word2vec.load_word2vec(
    working_directory + '/' + 'embeddings' + '/' + 'embeddings_w2v.txt',
    tokenizer_word_index=tokenizer_word_index,
    EMBEDDING_DIM=EMBEDDING_DIM)

# training the LSTM model with word2vec embeddings
print('Step11: designing lstm+w2v model...')
w2v_lstm = designing_network.model_architecture_word2vec(
    embedding_matrix_w2v,
    num_words,
    EMBEDDING_DIM=EMBEDDING_DIM,
    max_length=max_length)
w2v_lstm, history = designing_network.fit_network(w2v_lstm, X_train, X_test,
                                                  y_train, y_test)
designing_network.save_network_model(w2v_lstm,
                                     modelname='w2v_lstm',
                                     directory=model_directory)
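# Minimal sketch (assumption: designing_network.model_architecture_word2vec builds
# something along these lines). It shows the standard Keras pattern for wiring a
# precomputed word2vec embedding matrix into an Embedding layer that feeds an LSTM;
# layer sizes and the output layer are illustrative only.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

def example_w2v_lstm(embedding_matrix, num_words, embedding_dim, max_length):
    model = Sequential([
        Embedding(num_words, embedding_dim,
                  weights=[embedding_matrix],  # initialise from word2vec vectors
                  input_length=max_length,
                  trainable=False),            # keep the pretrained vectors frozen
        LSTM(64),
        Dense(1, activation="sigmoid"),
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model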
                    help='No. of hidden units [128]')
parser.add_argument('--decay', default=1.0, type=float,
                    help='Learning rate decay [1.0]')
parser.add_argument('--dropout', default=0.0, type=float,
                    help='Probability of dropping [0.0]')
args = parser.parse_args()

n_epochs = args.epochs
batch_size = args.batch
lr = args.lr
n_hidden = args.hidden
decay = args.decay
dropout = args.dropout
dataset = args.dataset

train_filename = '../data/{}/train'.format(dataset)
val_filename = '../data/{}/dev.out'.format(dataset)
out_filename = '../data/{}/dev.p5.out'.format(dataset)
test_filename = '../data/{}/test.in'.format(dataset)
test_out_filename = '../data/{}/test.p5.out'.format(dataset)
word2vec_dir = 'weights/word2vec/{}'.format(dataset)
w2v_W, w2v_U = load_word2vec(word2vec_dir)

main()