def preprare_data_for_processing(min_occurrences, use_cache_for_train,
                                 use_cache_for_test, duration, sentiment_method):
    training_data = None
    testing_data = None
    print("Loading data...")

    if duration is not None:
        if os.path.isfile(test_data_word2vec_file_name) and os.path.isfile(test_data_bow_file_name):
            os.remove(test_data_word2vec_file_name)
            os.remove(test_data_bow_file_name)
        testing_data, word2vec_testing_data = preprocess(
            "data/one_month_clean_test_data_with_prices.csv", True, min_occurrences,
            test_data_bow_file_name, test_data_word2vec_file_name, duration)

    if not os.path.isfile("data/BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    if use_cache_for_train:
        print("Reading the processed files")
        train_data_initializer_obj = DataInitializer()
        train_data_initializer_obj.initialize(
            None,
            cache_bow_output=train_data_bow_file_name,
            cache_word2vec_output=train_data_word2vec_file_name)
        training_data = train_data_initializer_obj.data_model
        word2vec_training_data = train_data_initializer_obj.word2vec_data
    else:
        print("Preprocessing data...")
        training_data, word2vec_training_data = preprocess(
            "data/one_month_clean_data_with_prices.csv", False, min_occurrences,
            train_data_bow_file_name, train_data_word2vec_file_name,
            sentiment_method=sentiment_method)

    if use_cache_for_test:
        test_data_initializer_obj = DataInitializer()
        test_data_initializer_obj.initialize(
            None,
            cache_bow_output=test_data_bow_file_name,
            cache_word2vec_output=test_data_word2vec_file_name)
        word2vec_testing_data = test_data_initializer_obj.word2vec_data
        testing_data = test_data_initializer_obj.data_model
        print("Loaded from cached files...")
    else:
        testing_data, word2vec_testing_data = preprocess(
            "data/one_month_clean_test_data_with_prices.csv", True, min_occurrences,
            test_data_bow_file_name, test_data_word2vec_file_name,
            sentiment_method=sentiment_method)
        print("Data preprocessed & cached...")

    return training_data, word2vec_training_data, testing_data, word2vec_testing_data
def ingest():
    seed = 1000
    data = DataInitializer()
    data.initialize("data/train.csv")
    data = DataCleaning(data)
    data.cleanup(DataCleaner())
    data = Sentiments(data)
    data.sentiment_analysis_by_text()
    data = data.processed_data[['sentiment', 'text']]
    print('dataset loaded with shape', data.shape)
    print("Distribution of sentiments: ",
          pd.Series(data["sentiment"]).value_counts())
    # data["sentiment"] = data["sentiment"].map(codes)
    return data
def preprare_data(min_occurrences, use_cache, duration):
    training_data = None
    testing_data = None
    print("Loading data...")

    if duration is not None:
        if os.path.isfile(test_data_word2vec_file_name) and os.path.isfile(test_data_bow_file_name):
            os.remove(test_data_word2vec_file_name)
            os.remove(test_data_bow_file_name)
        testing_data, word2vec_testing_data = preprocess(
            "data/clean_test.csv", True, min_occurrences,
            test_data_bow_file_name, test_data_word2vec_file_name, duration)

    if not os.path.isfile("data/BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    if use_cache:
        train_data_initializer_obj = DataInitializer()
        train_data_initializer_obj.initialize(
            None,
            from_cached_bow=train_data_bow_file_name,
            from_cached_word2vec=train_data_word2vec_file_name)
        training_data = train_data_initializer_obj.data_model
        word2vec_training_data = train_data_initializer_obj.word2vec_data

        test_data_initializer_obj = DataInitializer()
        test_data_initializer_obj.initialize(
            None,
            from_cached_bow=test_data_bow_file_name,
            from_cached_word2vec=test_data_word2vec_file_name)
        word2vec_testing_data = test_data_initializer_obj.word2vec_data
        testing_data = test_data_initializer_obj.data_model
        print("Loaded from cached files...")
    else:
        print("Preprocessing data...")
        training_data, word2vec_training_data = preprocess(
            "data/clean_train.csv", False, min_occurrences,
            train_data_bow_file_name, train_data_word2vec_file_name)
        testing_data, word2vec_testing_data = preprocess(
            "data/clean_test.csv", True, min_occurrences,
            test_data_bow_file_name, test_data_word2vec_file_name)
        print("Data preprocessed & cached...")

    return training_data, word2vec_training_data, testing_data, word2vec_testing_data
def preprocess(data_path, is_testing, min_occurrences=5, cache_bow_output=None,
               cache_word2vec_output=None, duration=None):
    if duration:
        data = DataInitializer()
        data.initialize(data_path, is_testing, duration=duration)
    else:
        data = DataInitializer()
        data.initialize(data_path, is_testing)

    # download BTC price data only if it has not been cached yet
    if not os.path.isfile("data/BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data)
    data.sentiment_analysis_by_text()
    print("First five rows with sentiment: ", data.processed_data.head())

    if is_testing:
        data.processed_data.to_csv("data/clean_test_with_sentiments.csv",
                                   sep=',', encoding='utf-8', index=False)
        # os.remove(data_path)
    else:
        data.processed_data.to_csv("data/clean_train_with_sentiments.csv",
                                   sep=',', encoding='utf-8', index=False)
        # os.remove(data_path)

    data = DataTokenize(data)
    data.tokenize()
    data.stem()

    data = WordList(data)
    data.build_wordlist(min_occurrences=min_occurrences)
    word2vec_data = data

    data = BagOfWords(data.processed_data, data.wordlist, is_testing)
    data.build_data_model()
    print("data model head: ", data.data_model.head(5))

    """ Word 2 vec """
    word2vec = Word2VecProvider()
    # REPLACE PATH TO THE FILE
    word2vec.load("../twitter/data/glove.twitter.27B.200d.txt")

    word2vec_data = RedditData(word2vec_data)
    word2vec_data.build_final_model(word2vec)
    word2vec_data_model = word2vec_data.data_model
    if "index" in word2vec_data_model.columns:
        word2vec_data_model.drop("index", axis=1, inplace=True)
    word2vec_data_model.dropna(axis=0, inplace=True)
    word2vec_data_model.reset_index(inplace=True)
    word2vec_data_model.index = word2vec_data_model['timestamp_ms']
    print("final word2vec data model: \n", word2vec_data_model.head(), "\n")

    """ Tokenizing the data """
    texts = []
    sentiments = []
    tokenized_data = pd.DataFrame()
    for text in data.processed_data["summary"]:
        texts.append(text)
    for sentiment in data.processed_data["sentiment"]:
        sentiments.append(sentiment)
    print("texts: ", texts[0:5])

    tokenizer = Tokenizer(num_words=20000)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=200)
    print("\n\n##################################################\npadded sequence head: \n",
          padded_sequences[0:5])
    print("\n####################################################\n padded sequence length \n",
          len(padded_sequences))

    if not is_testing:
        data = Plotting(data)
        data.plot()

    if cache_bow_output is not None:
        data.data_model.to_csv(cache_bow_output, index=False, float_format="%.6f")
        word2vec_data_model.to_csv(cache_word2vec_output, index=False, float_format="%.6f")

    with open('sequences', 'wb') as fp:
        pickle.dump(padded_sequences, fp)
    with open('sentiments', 'wb') as fp:
        pickle.dump(sentiments, fp)

    return data.data_model, word2vec_data_model
def main():
    m = 5
    use_cache = os.path.isfile(train_data_bow_file_name) and os.path.isfile(
        test_data_bow_file_name) and os.path.isfile(
        train_data_word2vec_file_name) and os.path.isfile(
        test_data_word2vec_file_name)

    print("Preparing data with min_occurrences=" + str(m))
    training_data, word2vec_training_data, testing_data, word2vec_testing_data = preprare_data(
        m, use_cache, duration=None)
    log("********************************************************")
    log("Validating for {0} min_occurrences:".format(m))

    if use_cache:
        col_names = ["author", "title", "timestamp_ms", "summary", "sentiment", "sentiment_score"]
        data = DataInitializer()
        data.initialize("data/clean_train_with_sentiments.csv", col_names=col_names)
        print("printing head:\n*******************************\n")
        data.processed_data = data.processed_data.reset_index(drop=True)
        # data.processed_data.rename(columns={"author": "timestamp_ms", "timestamp_ms", "summary"})
        print(data.processed_data.head())
        original_data = data.processed_data
        data.data_model = pd.read_csv(train_data_bow_file_name)
        data.wordlist = pd.read_csv("data/wordlist.csv")
        data = Plotting(data)
        data.plot()

    """ Naive Bayes """
    print("***************************************************\n"
          "FOR NAIVE BAYES:\n"
          "***************************************************\n")
    print("testing_data shape: ", testing_data.shape)
    print("testing_data head: ", testing_data.head())
    X_train, X_test, y_train, y_test = train_test_split(
        training_data.iloc[:, 1:], training_data.iloc[:, 0],
        train_size=0.7, stratify=training_data.iloc[:, 0], random_state=seed)
    if use_test_data:
        X_train = training_data.iloc[:, 1:]
        y_train = training_data.iloc[:, 0]
        X_test = testing_data.iloc[:, 1:]
        y_test = testing_data.iloc[:, 0]
    precision, recall, accuracy, f1 = Classification.test_classifier(
        X_train, y_train, X_test, y_test, BernoulliNB())
    # nb_acc = Classification.cv(BernoulliNB(), training_data.iloc[:, 1:], training_data.iloc[:, 0])

    """ Random Forest """
    print("***************************************************\n"
          "FOR RANDOM FORESTS:\n"
          "***************************************************\n")
    X_train, X_test, y_train, y_test = train_test_split(
        training_data.iloc[:, 1:], training_data.iloc[:, 0],
        train_size=0.7, stratify=training_data.iloc[:, 0], random_state=seed)
    if use_test_data:
        X_train = training_data.iloc[:, 1:]
        y_train = training_data.iloc[:, 0]
        X_test = testing_data.iloc[:, 1:]
        y_test = testing_data.iloc[:, 0]
    precision, recall, accuracy, f1 = Classification.test_classifier(
        X_train, y_train, X_test, y_test,
        RandomForestClassifier(random_state=seed, n_estimators=403, n_jobs=-1))
    # rf_acc = Classification.cv(RandomForestClassifier(n_estimators=403, n_jobs=-1, random_state=seed),
    #                            training_data.iloc[:, 1:], training_data.iloc[:, 0])

    """ Word2Vec + Random Forest """
    print("***************************************************\n"
          "FOR WORD2VEC WITH RANDOM FORESTS:\n"
          "***************************************************\n")
    X_train, X_test, y_train, y_test = train_test_split(
        word2vec_training_data.iloc[:, 2:], word2vec_training_data.iloc[:, 1],
        train_size=0.7, stratify=word2vec_training_data.iloc[:, 1], random_state=seed)
    # word2vec_training_data.drop(columns=['index'], inplace=True)
    # word2vec_testing_data.drop(columns=['index'], inplace=True)
    print("word2vec_training_data.columns: ", word2vec_training_data.columns)
    if use_test_data:
        X_train = word2vec_training_data.iloc[:, 3:]
        y_train = word2vec_training_data.iloc[:, 1]
        X_test = word2vec_testing_data.iloc[:, 3:]
        y_test = word2vec_testing_data.iloc[:, 1]
    precision, recall, accuracy, f1 = Classification.test_classifier(
        X_train, y_train, X_test, y_test,
        RandomForestClassifier(n_estimators=403, n_jobs=-1, random_state=seed))

    print("***************************\n")
    print("For Regression\n")
    print("***************************\n")
    print("first five rows: ", word2vec_training_data.head())
    X_train = word2vec_training_data.iloc[:, 4:]
    y_train = word2vec_training_data.iloc[:, 3]
    X_test = word2vec_testing_data.iloc[:, 4:]
    y_test = word2vec_testing_data.iloc[:, 3]

    regr = RandomForestRegressor(max_depth=2, random_state=0)
    regr.fit(X_train, y_train)
    # print(regr.feature_importances_)
    # print(regr.predict([[0, 0, 0, 0]]))
    predictions = regr.predict(X_test)
    print("predictions:\n*****************************", predictions,
          "\n****************************\n")
    print("Real values:\n*****************************", y_test,
          "\n****************************\n")
    print("score: ", regr.score(X_test, y_test))

    redditposts_sentiment = pd.DataFrame()
    # Create a column from the datetime variable
    redditposts_sentiment['datetime'] = word2vec_testing_data["timestamp_ms"]
    redditposts_sentiment['sentiment_score'] = predictions
    # Convert that column into a datetime datatype
    redditposts_sentiment['datetime'] = pd.to_datetime(redditposts_sentiment['datetime'])
    # Set the datetime column as the index
    redditposts_sentiment.index = redditposts_sentiment['datetime']

    reddit_posts = [
        Scatter(x=redditposts_sentiment.resample('5Min').mean().index,
                y=redditposts_sentiment.resample('5Min').mean()["sentiment_score"],
                mode="lines")
    ]
    plotly.offline.plot(
        {
            "data": reddit_posts,
            "layout": graph_objs.Layout(title="Reddit posts sentiment")
        },
        filename='plots/redditposts_predicted_sentiment.html')

    print("***************************************************\n"
          "FOR KERAS:\n"
          "***************************************************\n")
    X_train, X_test, y_train, y_test = train_test_split(
        word2vec_training_data.iloc[:, 2:], word2vec_training_data.iloc[:, 1],
        train_size=0.7, stratify=word2vec_training_data.iloc[:, 1], random_state=seed)
    # word2vec_training_data.drop(columns=['index'], inplace=True)
    # word2vec_testing_data.drop(columns=['index'], inplace=True)
    print("word2vec_training_data.columns: ", word2vec_training_data.columns)
    if use_test_data:
        X_train = word2vec_training_data.iloc[:, 3:]
        y_train = word2vec_training_data.iloc[:, 1]
        X_test = word2vec_testing_data.iloc[:, 3:]
        y_test = word2vec_testing_data.iloc[:, 1]

    # params
    use_gpu = True
    config = tf.ConfigProto(
        intra_op_parallelism_threads=multiprocessing.cpu_count(),
        inter_op_parallelism_threads=multiprocessing.cpu_count(),
        allow_soft_placement=True,
        device_count={'CPU': 1, 'GPU': 1 if use_gpu else 0})
    session = tf.Session(config=config)
    K.set_session(session)

    model_location = './data/model/'

    # Keras convolutional model
    batch_size = 32
    nb_epochs = 10
    vector_size = 200
    # Tweet max length (number of tokens)
    max_tweet_length = 15

    print("X_train shape:", X_train.shape)
    print("Y_train shape:", y_train.shape)
    print("x_test shape:", X_test.shape)
    print("y_test shape:", y_test.shape)

    model = Sequential()
    model.add(Dense(32, activation='relu', input_dim=204))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

    # Fit the model
    model.fit(X_train, y_train,
              batch_size=batch_size,
              shuffle=True,
              epochs=nb_epochs,
              validation_data=(X_test, y_test),
              callbacks=[EarlyStopping(min_delta=0.00025, patience=2)])

    score = model.evaluate(X_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    # Save the model
    # serialize model to JSON
    model_json = model.to_json()
    with open("model.json", "w") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model.save_weights("model.h5")
    print("Saved model to disk")

    print("****************************\n")
    print("Building a Neural Network\n")
    print("****************************\n")

    with open('sequences', 'rb') as fp:
        sequences = pickle.load(fp)
    with open('sentiments', 'rb') as fp:
        sentiments = pickle.load(fp)

    EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=0, mode='auto')

    model = Sequential()
    model.add(Embedding(20000, 128, input_length=200))
    model.add(Dropout(0.2))
    model.add(Conv1D(64, 5, activation='relu'))
    model.add(MaxPooling1D(pool_size=4))
    model.add(LSTM(128))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(sequences, np.array(sentiments), validation_split=0.5, epochs=10)
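
# A minimal sketch of an entry point, assuming this module is intended to be run
# directly and that main() takes no arguments as defined above. If the project
# already provides its own launcher elsewhere, that one should be used instead.
if __name__ == "__main__":
    main()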
from data_calculator import DataCalculator
from data_outputer import DataOutputer
# assumed module name for DataInitializer, following the naming pattern of the
# other local modules (data_calculator, data_outputer)
from data_initializer import DataInitializer
import os

# creating file paths for the inputs and outputs
file_dir = os.path.dirname(os.path.realpath('__file__'))
input_path = os.path.join(file_dir, '../input/itcont.txt')
input_path = os.path.abspath(os.path.realpath(input_path))
percent_path = os.path.join(file_dir, '../input/percentile.txt')
percent_path = os.path.abspath(os.path.realpath(percent_path))
output_path = os.path.join(file_dir, '../output/repeat_donors.txt')
output_path = os.path.abspath(os.path.realpath(output_path))

# initialize data to get ready for calculations
raw_data = DataInitializer(input_path)
raw_gen = raw_data.get_data()
filtered_data = raw_data.filter_data(raw_gen)
repeat_gen = raw_data.get_data()
repeat_filter = raw_data.filter_data(repeat_gen)
raw_data.set_nonrepeat_donors(repeat_filter)
clean_gen = raw_data.get_repeat_donors(filtered_data)

# execute calculations for all the returned elements
data_calc = DataCalculator(clean_gen, percent_path)
proc_data = data_calc.process_data()

# format the results and output them into a text file
output = DataOutputer(proc_data, output_path)
output.write_to_txt()
def preprocess(data_path, is_testing, min_occurrences=5, cache_bow_output=None,
               cache_word2vec_output=None, duration=None, sentiment_method=None):
    if duration and cache_bow_output and cache_word2vec_output:
        data = DataInitializer()
        data.initialize(data_path, is_testing, duration=duration)
    elif cache_bow_output and cache_word2vec_output:
        data = DataInitializer()
        data.initialize(data_path, is_testing,
                        cache_bow_output=cache_bow_output,
                        cache_word2vec_output=cache_word2vec_output)
    else:
        data = DataInitializer()
        data.initialize(data_path, is_testing)

    if not os.path.isfile("data/Train_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()
    if not os.path.isfile("data/Test_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data, sentiment_method=sentiment_method)
    data.sentiment_analysis_by_text()
    print("First five rows with sentiment: ", data.processed_data.head())

    if is_testing:
        data.processed_data.to_csv("data/one_month_clean_test_data_with_prices.csv",
                                   sep=',', encoding='utf-8', index=False)
        # os.remove(data_path)
    else:
        data.processed_data.to_csv("data/one_month_clean_data_with_prices.csv",
                                   sep=',', encoding='utf-8', index=False)
        # os.remove(data_path)

    # reuse the cached bag-of-words and word2vec models when a cache file exists;
    # otherwise build them from scratch
    if cache_word2vec_output and os.path.isfile(cache_word2vec_output):
        print("cache_word2vec_output file name: ", cache_word2vec_output)
        word2vec_data_model = pd.read_csv(cache_word2vec_output)
        data.data_model = pd.read_csv(cache_bow_output)
        print("data model head: ", data.data_model.head(5))
    else:
        data = DataTokenize(data)
        data.tokenize()
        data.stem()

        data = WordList(data)
        data.build_wordlist(min_occurrences=min_occurrences)
        word2vec_data = data

        data = BagOfWords(data.processed_data, data.wordlist, is_testing)
        data.build_data_model()
        print("data model head: ", data.data_model.head(5))

        """ Word 2 vec """
        word2vec = Word2VecProvider()
        # REPLACE PATH TO THE FILE
        word2vec.load("data/glove.twitter.27B.200d-with2num.txt")

        word2vec_data = TwitterData(word2vec_data)
        word2vec_data.build_final_model(word2vec)
        word2vec_data_model = word2vec_data.data_model
        if "original_id" in word2vec_data_model.columns:
            word2vec_data_model.drop("original_id", axis=1, inplace=True)
        word2vec_data_model.dropna(axis=0, inplace=True)
        word2vec_data_model.reset_index(inplace=True, drop=True)
        word2vec_data_model.index = word2vec_data_model['timestamp']
        print("final word2vec data model: \n", word2vec_data_model.head(), "\n")

    # if not is_testing:
    #     data = Plotting(data)
    #     data.plot()

    if not is_testing:
        if not os.path.isfile("train_sequences"):
            print("\n##########################\n"
                  "Tokenizing the tweets\n"
                  "############################\n")
            texts = []
            sentiments = []
            tokenized_data = pd.DataFrame()
            for text in data.processed_data["text"]:
                texts.append(text)
            for sentiment in data.processed_data['sentiment']:
                sentiments.append(sentiment)
            print("texts: ", texts[0:5])

            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            padded_sequences = pad_sequences(sequences, maxlen=20, padding='post')
            padded_sequences = pd.DataFrame(data=padded_sequences)

            merged_train_data = pd.concat([
                padded_sequences,
                data.processed_data[[
                    "high", "low", "open", "quoteVolume", "volume", "weightedAverage"
                ]]
            ], axis=1)
            train_targets = data.processed_data[["close"]]
            print("shape of merged train data: ", merged_train_data.shape)

            with open('data/train_sequences', 'wb') as fp:
                pickle.dump(merged_train_data, fp)
            with open('data/train_prices', 'wb') as fp:
                pickle.dump(train_targets, fp)

            # load the whole embedding into memory
            embeddings_index = dict()
            with open("data/glove.twitter.27B.200d-with2num.txt", "r", encoding="utf-8") as my_file:
                for line in my_file:
                    values = line.split()
                    word = values[0]
                    coefs = numpy.asarray(values[1:], dtype='float32')
                    embeddings_index[word] = coefs
            print("*" * 80, "\n" * 10)
            print('Loaded %s train word vectors.' % len(embeddings_index))
            print('Total %s of word indexes.' % len(tokenizer.word_index))

            with open('data/embeddings_index', 'wb') as fp:
                pickle.dump(embeddings_index, fp)
            with open('data/train_word_indexes', 'wb') as fp:
                pickle.dump(tokenizer.word_index, fp)

            # encode class values as integers
            # encoder = LabelEncoder()
            # encoder.fit(sentiments)
            # encoded_sentiments = encoder.transform(sentiments)
            # convert integers to dummy variables (i.e. one hot encoded)
            # dummy_sentiments = np_utils.to_categorical(encoded_sentiments)

            # for text in data.processed_data.loc[data.processed_data['sentiment'] != 0, "text"]:
            #     texts.append(text)
            #
            # for sentiment in data.processed_data.loc[data.processed_data['sentiment'] != 0, "sentiment"]:
            #     sentiments.append(sentiment)
    else:
        if not os.path.isfile("test_sequences"):
            print("\n##########################\n"
                  "Tokenizing the tweets\n"
                  "############################\n")
            texts = []
            sentiments = []
            tokenized_data = pd.DataFrame()
            for text in data.processed_data["text"]:
                texts.append(text)
            for sentiment in data.processed_data['sentiment']:
                sentiments.append(sentiment)
            print("texts: ", texts[0:5])

            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            padded_sequences = pad_sequences(sequences, maxlen=20, padding='post')
            padded_sequences = pd.DataFrame(data=padded_sequences)

            merged_test_data = pd.concat([
                padded_sequences,
                data.processed_data[[
                    "high", "low", "open", "quoteVolume", "volume", "weightedAverage"
                ]]
            ], axis=1)
            test_targets = data.processed_data[["close"]]
            print("shape of merged test data: ", merged_test_data.shape)

            with open('data/test_sequences', 'wb') as fp:
                pickle.dump(merged_test_data, fp)
            with open('data/test_prices', 'wb') as fp:
                pickle.dump(test_targets, fp)
            with open('data/test_word_indexes', 'wb') as fp:
                pickle.dump(tokenizer.word_index, fp)
            # padded_sequences = pd.DataFrame(data=padded_sequences)

    print("\n\n##################################################\npadded sequence head: \n",
          padded_sequences[0:5])
    print("\n####################################################\n padded sequence length \n",
          len(padded_sequences))

    if not os.path.isfile(train_data_word2vec_file_name) or not os.path.isfile(
            test_data_word2vec_file_name):
        if cache_bow_output is not None:
            data.data_model.to_csv(cache_bow_output, index=False, float_format="%.6f")
            word2vec_data_model.to_csv(cache_word2vec_output, index=False, float_format="%.6f")

    return data.data_model, word2vec_data_model