def preprare_data_for_processing(min_occurrences, use_cache_for_train,
                                 use_cache_for_test, duration, sentiment_method):
    training_data = None
    testing_data = None
    print("Loading data...")

    if duration is not None:
        if os.path.isfile(test_data_word2vec_file_name) and os.path.isfile(test_data_bow_file_name):
            os.remove(test_data_word2vec_file_name)
            os.remove(test_data_bow_file_name)
        testing_data, word2vec_testing_data = preprocess(
            "data/one_month_clean_test_data_with_prices.csv", True, min_occurrences,
            test_data_bow_file_name, test_data_word2vec_file_name, duration)

    if not os.path.isfile("data/BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    if use_cache_for_train:
        print("Reading the processed files")
        train_data_initializer_obj = DataInitializer()
        train_data_initializer_obj.initialize(
            None,
            cache_bow_output=train_data_bow_file_name,
            cache_word2vec_output=train_data_word2vec_file_name)
        training_data = train_data_initializer_obj.data_model
        word2vec_training_data = train_data_initializer_obj.word2vec_data
    else:
        print("Preprocessing data...")
        training_data, word2vec_training_data = preprocess(
            "data/one_month_clean_data_with_prices.csv", False, min_occurrences,
            train_data_bow_file_name, train_data_word2vec_file_name,
            sentiment_method=sentiment_method)

    if use_cache_for_test:
        test_data_initializer_obj = DataInitializer()
        test_data_initializer_obj.initialize(
            None,
            cache_bow_output=test_data_bow_file_name,
            cache_word2vec_output=test_data_word2vec_file_name)
        word2vec_testing_data = test_data_initializer_obj.word2vec_data
        testing_data = test_data_initializer_obj.data_model
        print("Loaded from cached files...")
    else:
        testing_data, word2vec_testing_data = preprocess(
            "data/one_month_clean_test_data_with_prices.csv", True, min_occurrences,
            test_data_bow_file_name, test_data_word2vec_file_name,
            sentiment_method=sentiment_method)
        print("Data preprocessed & cached...")

    return training_data, word2vec_training_data, testing_data, word2vec_testing_data
def ingest():
    seed = 1000
    data = DataInitializer()
    data.initialize("data/train.csv")
    data = DataCleaning(data)
    data.cleanup(DataCleaner())
    data = Sentiments(data)
    data.sentiment_analysis_by_text()
    data = data.processed_data[['sentiment', 'text']]
    print('dataset loaded with shape', data.shape)
    print("Distribution of sentiments: ",
          pd.Series(data["sentiment"]).value_counts())
    # data["sentiment"] = data["sentiment"].map(codes)
    return data
def preprare_data(min_occurrences, use_cache, duration):
    training_data = None
    testing_data = None
    print("Loading data...")

    if duration is not None:
        if os.path.isfile(test_data_word2vec_file_name) and os.path.isfile(test_data_bow_file_name):
            os.remove(test_data_word2vec_file_name)
            os.remove(test_data_bow_file_name)
        testing_data, word2vec_testing_data = preprocess(
            "data/clean_test.csv", True, min_occurrences,
            test_data_bow_file_name, test_data_word2vec_file_name, duration)

    if not os.path.isfile("data/BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    if use_cache:
        train_data_initializer_obj = DataInitializer()
        train_data_initializer_obj.initialize(
            None,
            from_cached_bow=train_data_bow_file_name,
            from_cached_word2vec=train_data_word2vec_file_name)
        training_data = train_data_initializer_obj.data_model
        word2vec_training_data = train_data_initializer_obj.word2vec_data

        test_data_initializer_obj = DataInitializer()
        test_data_initializer_obj.initialize(
            None,
            from_cached_bow=test_data_bow_file_name,
            from_cached_word2vec=test_data_word2vec_file_name)
        word2vec_testing_data = test_data_initializer_obj.word2vec_data
        testing_data = test_data_initializer_obj.data_model
        print("Loaded from cached files...")
    else:
        print("Preprocessing data...")
        training_data, word2vec_training_data = preprocess(
            "data/clean_train.csv", False, min_occurrences,
            train_data_bow_file_name, train_data_word2vec_file_name)
        testing_data, word2vec_testing_data = preprocess(
            "data/clean_test.csv", True, min_occurrences,
            test_data_bow_file_name, test_data_word2vec_file_name)
        print("Data preprocessed & cached...")

    return training_data, word2vec_training_data, testing_data, word2vec_testing_data
def preprocess(data_path, is_testing, min_occurrences=5, cache_bow_output=None,
               cache_word2vec_output=None, duration=None):
    if duration:
        data = DataInitializer()
        data.initialize(data_path, is_testing, duration=duration)
    else:
        data = DataInitializer()
        data.initialize(data_path, is_testing)

    # download BTC price data only if it has not been cached yet
    if not os.path.isfile("data/BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data)
    data.sentiment_analysis_by_text()
    print("First five rows with sentiment: ", data.processed_data.head())

    if is_testing:
        data.processed_data.to_csv("data/clean_test_with_sentiments.csv",
                                   sep=',', encoding='utf-8', index=False)
        # os.remove(data_path)
    else:
        data.processed_data.to_csv("data/clean_train_with_sentiments.csv",
                                   sep=',', encoding='utf-8', index=False)
        # os.remove(data_path)

    data = DataTokenize(data)
    data.tokenize()
    data.stem()

    data = WordList(data)
    data.build_wordlist(min_occurrences=min_occurrences)
    word2vec_data = data

    data = BagOfWords(data.processed_data, data.wordlist, is_testing)
    data.build_data_model()
    print("data model head: ", data.data_model.head(5))

    """ Word 2 vec """
    word2vec = Word2VecProvider()
    # REPLACE PATH TO THE FILE
    word2vec.load("../twitter/data/glove.twitter.27B.200d.txt")

    word2vec_data = RedditData(word2vec_data)
    word2vec_data.build_final_model(word2vec)
    word2vec_data_model = word2vec_data.data_model
    if "index" in word2vec_data_model.columns:
        word2vec_data_model.drop("index", axis=1, inplace=True)
    word2vec_data_model.dropna(axis=0, inplace=True)
    word2vec_data_model.reset_index(inplace=True)
    word2vec_data_model.index = word2vec_data_model['timestamp_ms']
    print("final word2vec data model: \n", word2vec_data_model.head(), "\n")

    """ Tokenizing the data """
    texts = []
    sentiments = []
    tokenized_data = pd.DataFrame()
    for text in data.processed_data["summary"]:
        texts.append(text)
    for sentiment in data.processed_data["sentiment"]:
        sentiments.append(sentiment)
    print("texts: ", texts[0:5])

    tokenizer = Tokenizer(num_words=20000)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=200)
    print("\n\n##################################################\npadded sequence head: \n",
          padded_sequences[0:5])
    print("\n####################################################\n padded sequence length \n",
          len(padded_sequences))

    if not is_testing:
        data = Plotting(data)
        data.plot()

    if cache_bow_output is not None:
        data.data_model.to_csv(cache_bow_output, index=False, float_format="%.6f")
        word2vec_data_model.to_csv(cache_word2vec_output, index=False, float_format="%.6f")

    with open('sequences', 'wb') as fp:
        pickle.dump(padded_sequences, fp)
    with open('sentiments', 'wb') as fp:
        pickle.dump(sentiments, fp)

    return data.data_model, word2vec_data_model
def main():
    m = 5
    use_cache = os.path.isfile(train_data_bow_file_name) and os.path.isfile(
        test_data_bow_file_name) and os.path.isfile(
        train_data_word2vec_file_name) and os.path.isfile(
        test_data_word2vec_file_name)

    print("Preparing data with min_occurrences=" + str(m))
    training_data, word2vec_training_data, testing_data, word2vec_testing_data = preprare_data(
        m, use_cache, duration=None)
    log("********************************************************")
    log("Validating for {0} min_occurrences:".format(m))

    if use_cache:
        col_names = ["author", "title", "timestamp_ms", "summary", "sentiment", "sentiment_score"]
        data = DataInitializer()
        data.initialize("data/clean_train_with_sentiments.csv", col_names=col_names)
        print("printing head:\n*******************************\n")
        data.processed_data = data.processed_data.reset_index(drop=True)
        # data.processed_data.rename(columns={"author": "timestamp_ms", "timestamp_ms", "summary"})
        print(data.processed_data.head())
        original_data = data.processed_data
        data.data_model = pd.read_csv(train_data_bow_file_name)
        data.wordlist = pd.read_csv("data/wordlist.csv")
        data = Plotting(data)
        data.plot()

    """ Naive Bayes """
    print("***************************************************\n"
          "FOR NAIVE BAYES:\n"
          "***************************************************\n")
    print("testing_data shape: ", testing_data.shape)
    print("testing_data head: ", testing_data.head())
    X_train, X_test, y_train, y_test = train_test_split(
        training_data.iloc[:, 1:], training_data.iloc[:, 0],
        train_size=0.7, stratify=training_data.iloc[:, 0], random_state=seed)
    if use_test_data:
        X_train = training_data.iloc[:, 1:]
        y_train = training_data.iloc[:, 0]
        X_test = testing_data.iloc[:, 1:]
        y_test = testing_data.iloc[:, 0]
    precision, recall, accuracy, f1 = Classification.test_classifier(
        X_train, y_train, X_test, y_test, BernoulliNB())
    # nb_acc = Classification.cv(BernoulliNB(), training_data.iloc[:, 1:], training_data.iloc[:, 0])

    """ Random Forest """
    print("***************************************************\n"
          "FOR RANDOM FORESTS:\n"
          "***************************************************\n")
    X_train, X_test, y_train, y_test = train_test_split(
        training_data.iloc[:, 1:], training_data.iloc[:, 0],
        train_size=0.7, stratify=training_data.iloc[:, 0], random_state=seed)
    if use_test_data:
        X_train = training_data.iloc[:, 1:]
        y_train = training_data.iloc[:, 0]
        X_test = testing_data.iloc[:, 1:]
        y_test = testing_data.iloc[:, 0]
    precision, recall, accuracy, f1 = Classification.test_classifier(
        X_train, y_train, X_test, y_test,
        RandomForestClassifier(random_state=seed, n_estimators=403, n_jobs=-1))
    # rf_acc = Classification.cv(RandomForestClassifier(n_estimators=403, n_jobs=-1, random_state=seed),
    #                            training_data.iloc[:, 1:], training_data.iloc[:, 0])

    """ Word2Vec + Random Forest """
    print("***************************************************\n"
          "FOR WORD2VEC WITH RANDOM FORESTS:\n"
          "***************************************************\n")
    X_train, X_test, y_train, y_test = train_test_split(
        word2vec_training_data.iloc[:, 2:], word2vec_training_data.iloc[:, 1],
        train_size=0.7, stratify=word2vec_training_data.iloc[:, 1], random_state=seed)
    # word2vec_training_data.drop(columns=['index'], inplace=True)
    # word2vec_testing_data.drop(columns=['index'], inplace=True)
    print("word2vec_training_data.columns: ", word2vec_training_data.columns)
    if use_test_data:
        X_train = word2vec_training_data.iloc[:, 3:]
        y_train = word2vec_training_data.iloc[:, 1]
        X_test = word2vec_testing_data.iloc[:, 3:]
        y_test = word2vec_testing_data.iloc[:, 1]
    precision, recall, accuracy, f1 = Classification.test_classifier(
        X_train, y_train, X_test, y_test,
        RandomForestClassifier(n_estimators=403, n_jobs=-1, random_state=seed))

    print("***************************\n")
    print("For Regression\n")
    print("***************************\n")
    print("first five rows: ", word2vec_training_data.head())
    X_train = word2vec_training_data.iloc[:, 4:]
    y_train = word2vec_training_data.iloc[:, 3]
    X_test = word2vec_testing_data.iloc[:, 4:]
    y_test = word2vec_testing_data.iloc[:, 3]

    regr = RandomForestRegressor(max_depth=2, random_state=0)
    regr.fit(X_train, y_train)
    # print(regr.feature_importances_)
    # print(regr.predict([[0, 0, 0, 0]]))
    predictions = regr.predict(X_test)
    print("predictions:\n*****************************", predictions,
          "\n****************************\n")
    print("Real values:\n*****************************", y_test,
          "\n****************************\n")
    print("score: ", regr.score(X_test, y_test))

    redditposts_sentiment = pd.DataFrame()
    # Create a column from the datetime variable
    redditposts_sentiment['datetime'] = word2vec_testing_data["timestamp_ms"]
    redditposts_sentiment['sentiment_score'] = predictions
    # Convert that column into a datetime datatype
    redditposts_sentiment['datetime'] = pd.to_datetime(redditposts_sentiment['datetime'])
    # Set the datetime column as the index
    redditposts_sentiment.index = redditposts_sentiment['datetime']

    reddit_posts = [
        Scatter(x=redditposts_sentiment.resample('5Min').mean().index,
                y=redditposts_sentiment.resample('5Min').mean()["sentiment_score"],
                mode="lines")
    ]
    plotly.offline.plot(
        {
            "data": reddit_posts,
            "layout": graph_objs.Layout(title="Reddit posts sentiment")
        },
        filename='plots/redditposts_predicted_sentiment.html')

    print("***************************************************\n"
          "FOR KERAS:\n"
          "***************************************************\n")
    X_train, X_test, y_train, y_test = train_test_split(
        word2vec_training_data.iloc[:, 2:], word2vec_training_data.iloc[:, 1],
        train_size=0.7, stratify=word2vec_training_data.iloc[:, 1], random_state=seed)
    # word2vec_training_data.drop(columns=['index'], inplace=True)
    # word2vec_testing_data.drop(columns=['index'], inplace=True)
    print("word2vec_training_data.columns: ", word2vec_training_data.columns)
    if use_test_data:
        X_train = word2vec_training_data.iloc[:, 3:]
        y_train = word2vec_training_data.iloc[:, 1]
        X_test = word2vec_testing_data.iloc[:, 3:]
        y_test = word2vec_testing_data.iloc[:, 1]

    # params
    use_gpu = True
    config = tf.ConfigProto(
        intra_op_parallelism_threads=multiprocessing.cpu_count(),
        inter_op_parallelism_threads=multiprocessing.cpu_count(),
        allow_soft_placement=True,
        device_count={'CPU': 1, 'GPU': 1 if use_gpu else 0})
    session = tf.Session(config=config)
    K.set_session(session)

    model_location = './data/model/'

    # Keras convolutional model
    batch_size = 32
    nb_epochs = 10
    vector_size = 200
    # Tweet max length (number of tokens)
    max_tweet_length = 15

    print("X_train shape:", X_train.shape)
    print("Y_train shape:", y_train.shape)
    print("x_test shape:", X_test.shape)
    print("y_test shape:", y_test.shape)

    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=204))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

    # Fit the model
    model.fit(X_train, y_train,
              batch_size=batch_size,
              shuffle=True,
              epochs=nb_epochs,
              validation_data=(X_test, y_test),
              callbacks=[EarlyStopping(min_delta=0.00025, patience=2)])

    score = model.evaluate(X_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    # Save the model
    # serialize model to JSON
    model_json = model.to_json()
    with open("model.json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights("model.h5")
    print("Saved model to disk")

    print("****************************\n")
    print("Building a Neural Network\n")
    print("****************************\n")

    with open('sequences', 'rb') as fp:
        sequences = pickle.load(fp)
    with open('sentiments', 'rb') as fp:
        sentiments = pickle.load(fp)

    EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=0, mode='auto')

    model = Sequential()
    model.add(Embedding(20000, 128, input_length=200))
    model.add(Dropout(0.2))
    model.add(Conv1D(64, 5, activation='relu'))
    model.add(MaxPooling1D(pool_size=4))
    model.add(LSTM(128))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(sequences, np.array(sentiments), validation_split=0.5, epochs=10)
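
# A minimal sketch of an entry point, assuming this module is intended to be run
# directly and that main() takes no arguments as defined above. If the project
# already provides its own launcher elsewhere, that one should be used instead.
if __name__ == "__main__":
    main()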
from data_calculator import DataCalculator
from data_outputer import DataOutputer
# assumed module name for DataInitializer, following the naming pattern of the
# other local modules (data_calculator, data_outputer)
from data_initializer import DataInitializer
import os

# creating file paths for the inputs and outputs
file_dir = os.path.dirname(os.path.realpath('__file__'))
input_path = os.path.join(file_dir, '../input/itcont.txt')
input_path = os.path.abspath(os.path.realpath(input_path))
percent_path = os.path.join(file_dir, '../input/percentile.txt')
percent_path = os.path.abspath(os.path.realpath(percent_path))
output_path = os.path.join(file_dir, '../output/repeat_donors.txt')
output_path = os.path.abspath(os.path.realpath(output_path))

# initialize data to get ready for calculations
raw_data = DataInitializer(input_path)
raw_gen = raw_data.get_data()
filtered_data = raw_data.filter_data(raw_gen)
repeat_gen = raw_data.get_data()
repeat_filter = raw_data.filter_data(repeat_gen)
raw_data.set_nonrepeat_donors(repeat_filter)
clean_gen = raw_data.get_repeat_donors(filtered_data)

# execute calculations for all the returned elements
data_calc = DataCalculator(clean_gen, percent_path)
proc_data = data_calc.process_data()

# format the results and output them into a text file
output = DataOutputer(proc_data, output_path)
output.write_to_txt()
def preprocess(data_path, is_testing, min_occurrences=5, cache_bow_output=None,
               cache_word2vec_output=None, duration=None, sentiment_method=None):
    if duration and cache_bow_output and cache_word2vec_output:
        data = DataInitializer()
        data.initialize(data_path, is_testing, duration=duration)
    elif cache_bow_output and cache_word2vec_output:
        data = DataInitializer()
        data.initialize(data_path, is_testing,
                        cache_bow_output=cache_bow_output,
                        cache_word2vec_output=cache_word2vec_output)
    else:
        data = DataInitializer()
        data.initialize(data_path, is_testing)

    if not os.path.isfile("data/Train_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()
    if not os.path.isfile("data/Test_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data, sentiment_method=sentiment_method)
    data.sentiment_analysis_by_text()
    print("First five rows with sentiment: ", data.processed_data.head())

    if is_testing:
        data.processed_data.to_csv("data/one_month_clean_test_data_with_prices.csv",
                                   sep=',', encoding='utf-8', index=False)
        # os.remove(data_path)
    else:
        data.processed_data.to_csv("data/one_month_clean_data_with_prices.csv",
                                   sep=',', encoding='utf-8', index=False)
        # os.remove(data_path)

    # reuse the cached bag-of-words and word2vec models when a cache file exists;
    # otherwise build them from scratch
    if cache_word2vec_output and os.path.isfile(cache_word2vec_output):
        print("cache_word2vec_output file name: ", cache_word2vec_output)
        word2vec_data_model = pd.read_csv(cache_word2vec_output)
        data.data_model = pd.read_csv(cache_bow_output)
        print("data model head: ", data.data_model.head(5))
    else:
        data = DataTokenize(data)
        data.tokenize()
        data.stem()

        data = WordList(data)
        data.build_wordlist(min_occurrences=min_occurrences)
        word2vec_data = data

        data = BagOfWords(data.processed_data, data.wordlist, is_testing)
        data.build_data_model()
        print("data model head: ", data.data_model.head(5))

        """ Word 2 vec """
        word2vec = Word2VecProvider()
        # REPLACE PATH TO THE FILE
        word2vec.load("data/glove.twitter.27B.200d-with2num.txt")

        word2vec_data = TwitterData(word2vec_data)
        word2vec_data.build_final_model(word2vec)
        word2vec_data_model = word2vec_data.data_model
        if "original_id" in word2vec_data_model.columns:
            word2vec_data_model.drop("original_id", axis=1, inplace=True)
        word2vec_data_model.dropna(axis=0, inplace=True)
        word2vec_data_model.reset_index(inplace=True, drop=True)
        word2vec_data_model.index = word2vec_data_model['timestamp']
        print("final word2vec data model: \n", word2vec_data_model.head(), "\n")

    # if not is_testing:
    #     data = Plotting(data)
    #     data.plot()

    if not is_testing:
        if not os.path.isfile("train_sequences"):
            print("\n##########################\n"
                  "Tokenizing the tweets\n"
                  "############################\n")
            texts = []
            sentiments = []
            tokenized_data = pd.DataFrame()
            for text in data.processed_data["text"]:
                texts.append(text)
            for sentiment in data.processed_data['sentiment']:
                sentiments.append(sentiment)
            print("texts: ", texts[0:5])

            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            padded_sequences = pad_sequences(sequences, maxlen=20, padding='post')
            padded_sequences = pd.DataFrame(data=padded_sequences)

            merged_train_data = pd.concat([
                padded_sequences,
                data.processed_data[[
                    "high", "low", "open", "quoteVolume", "volume", "weightedAverage"
                ]]
            ], axis=1)
            train_targets = data.processed_data[["close"]]
            print("shape of merged train data: ", merged_train_data.shape)

            with open('data/train_sequences', 'wb') as fp:
                pickle.dump(merged_train_data, fp)
            with open('data/train_prices', 'wb') as fp:
                pickle.dump(train_targets, fp)

            # load the whole embedding into memory
            embeddings_index = dict()
            with open("data/glove.twitter.27B.200d-with2num.txt", "r", encoding="utf-8") as my_file:
                for line in my_file:
                    values = line.split()
                    word = values[0]
                    coefs = numpy.asarray(values[1:], dtype='float32')
                    embeddings_index[word] = coefs
            print("*" * 80, "\n" * 10)
            print('Loaded %s train word vectors.' % len(embeddings_index))
            print('Total %s of word indexes.' % len(tokenizer.word_index))

            with open('data/embeddings_index', 'wb') as fp:
                pickle.dump(embeddings_index, fp)
            with open('data/train_word_indexes', 'wb') as fp:
                pickle.dump(tokenizer.word_index, fp)

            # encode class values as integers
            # encoder = LabelEncoder()
            # encoder.fit(sentiments)
            # encoded_sentiments = encoder.transform(sentiments)
            # convert integers to dummy variables (i.e. one hot encoded)
            # dummy_sentiments = np_utils.to_categorical(encoded_sentiments)

            # for text in data.processed_data.loc[data.processed_data['sentiment'] != 0, "text"]:
            #     texts.append(text)
            #
            # for sentiment in data.processed_data.loc[data.processed_data['sentiment'] != 0, "sentiment"]:
            #     sentiments.append(sentiment)
    else:
        if not os.path.isfile("test_sequences"):
            print("\n##########################\n"
                  "Tokenizing the tweets\n"
                  "############################\n")
            texts = []
            sentiments = []
            tokenized_data = pd.DataFrame()
            for text in data.processed_data["text"]:
                texts.append(text)
            for sentiment in data.processed_data['sentiment']:
                sentiments.append(sentiment)
            print("texts: ", texts[0:5])

            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            padded_sequences = pad_sequences(sequences, maxlen=20, padding='post')
            padded_sequences = pd.DataFrame(data=padded_sequences)

            merged_test_data = pd.concat([
                padded_sequences,
                data.processed_data[[
                    "high", "low", "open", "quoteVolume", "volume", "weightedAverage"
                ]]
            ], axis=1)
            test_targets = data.processed_data[["close"]]
            print("shape of merged test data: ", merged_test_data.shape)

            with open('data/test_sequences', 'wb') as fp:
                pickle.dump(merged_test_data, fp)
            with open('data/test_prices', 'wb') as fp:
                pickle.dump(test_targets, fp)
            with open('data/test_word_indexes', 'wb') as fp:
                pickle.dump(tokenizer.word_index, fp)
            # padded_sequences = pd.DataFrame(data=padded_sequences)

    print("\n\n##################################################\npadded sequence head: \n",
          padded_sequences[0:5])
    print("\n####################################################\n padded sequence length \n",
          len(padded_sequences))

    if not os.path.isfile(train_data_word2vec_file_name) or not os.path.isfile(
            test_data_word2vec_file_name):
        if cache_bow_output is not None:
            data.data_model.to_csv(cache_bow_output, index=False, float_format="%.6f")
            word2vec_data_model.to_csv(cache_word2vec_output, index=False, float_format="%.6f")

    return data.data_model, word2vec_data_model