Exemplo n.º 1
0
    def main():
        """Prepare data for each ``min_occurrences`` value and build a model.

        For every value in ``range(3, 4)`` (currently only ``3``) this calls
        ``preprare_data``, logs a header, and then runs a ``TwitterData``
        pipeline on ``data\\train.csv`` ending with ``build_final_model``.

        NOTE(review): ``word2vec`` is not defined in this function; it is
        presumably a module-level object -- confirm before running.
        """
        for min_occ in range(3, 4):
            print("Preparing data with min_occurrences={0}".format(min_occ))
            # The returned frames are not used further in this function.
            training_data, testing_data = preprare_data(min_occ)

            log("********************************************************")
            header = "Validating for {0} min_occurrences:".format(min_occ)
            log(header)

            # NOTE: a previous revision dropped the leading "idx" column and
            # the "original_id" column from both data sets at this point;
            # that step is currently disabled.

            twitter = TwitterData()
            twitter.initialize("data\\train.csv")
            twitter.build_features()
            cleaner = TwitterCleanuper()
            twitter.cleanup(cleaner)
            twitter.tokenize()
            twitter.stem()
            twitter.build_wordlist()
            twitter.build_final_model(word2vec)

            # The preview's result is discarded.
            twitter.data_model.head(5)

        print("Done!")
Exemplo n.º 2
0
def preprocess(results, data_path, is_testing, data_name,
               min_occurrences=5, cache_output=None):
    """Run the ``TwitterData`` pipeline and store its ``data_model``.

    The pipeline steps are: initialize, build features, cleanup, tokenize,
    stem, and build the word list.

    Args:
        results: Container supporting item assignment; the resulting
            ``data_model`` is stored under ``data_name``.
        data_path: Passed to ``TwitterData.initialize``.
        is_testing: Passed to ``TwitterData.initialize``.
        data_name: Key under which the model is stored in ``results``.
        min_occurrences: Forwarded to ``build_wordlist``.
        cache_output: When not ``None``, the model is also written there
            via ``to_csv``.

    Returns:
        None. The outcome is delivered through ``results``.
    """
    pipeline = TwitterData()
    pipeline.initialize(data_path, is_testing)
    pipeline.build_features()
    cleaner = TwitterCleanuper()
    pipeline.cleanup(cleaner)
    pipeline.tokenize()
    pipeline.stem()
    pipeline.build_wordlist(min_occurrences=min_occurrences)

    # Disabled alternatives from earlier revisions: build_data_model()
    # (optionally with_ngram=2), build_ngrams() + build_ngram_model(), and
    # build_word2vec_model() fed by a Word2VecProvider loaded from a GloVe
    # Twitter 200d text file.

    print(cache_output)

    if cache_output is not None:
        csv_options = {"index_label": "idx", "float_format": "%.6f"}
        pipeline.data_model.to_csv(cache_output, **csv_options)

    results[data_name] = pipeline.data_model
Exemplo n.º 3
0
def preprocess(results, data_path, is_testing, data_name,
               min_occurrences=5, cache_output=None):
    """Preprocess one data set and publish its ``data_model`` in ``results``.

    A ``TwitterData`` object is initialized from ``data_path`` and taken
    through feature building, cleanup, tokenizing, stemming and word-list
    construction.

    Args:
        results: Container supporting item assignment; receives the
            ``data_model`` under ``data_name``.
        data_path: Passed to ``TwitterData.initialize``.
        is_testing: Passed to ``TwitterData.initialize``.
        data_name: Key used when storing the model in ``results``.
        min_occurrences: Forwarded to ``build_wordlist``.
        cache_output: Optional target for ``to_csv``; skipped when ``None``.

    Returns:
        None. The outcome is delivered through ``results``.
    """
    dataset = TwitterData()
    dataset.initialize(data_path, is_testing)
    dataset.build_features()
    cleanuper = TwitterCleanuper()
    dataset.cleanup(cleanuper)
    dataset.tokenize()
    dataset.stem()
    dataset.build_wordlist(min_occurrences=min_occurrences)

    if cache_output is not None:
        # Write the model with the index column labelled "idx" and floats
        # formatted as "%.6f".
        writer_kwargs = {"index_label": "idx", "float_format": "%.6f"}
        dataset.data_model.to_csv(cache_output, **writer_kwargs)

    results[data_name] = dataset.data_model