def main():
    """Script entry point: prepare the data set(s) and build the final word2vec model.

    Iterates over candidate ``min_occurrences`` values (currently just 3),
    prepares train/test splits, then runs the full TwitterData pipeline on
    ``data\\train.csv`` and builds the final model.

    NOTE(review): ``preprare_data`` looks like a typo for ``prepare_data`` —
    confirm against the actual definition elsewhere in this file before renaming.
    NOTE(review): ``word2vec`` is read from an enclosing/module scope that is
    not visible here — confirm it is initialized before ``main()`` runs.
    """
    # range(3, 4) runs a single iteration; kept as a range so more
    # min_occurrences values can be swept by widening the bounds.
    for m in range(3, 4):
        print("Preparing data with min_occurrences=" + str(m))
        # training_data/testing_data are currently unused downstream
        # (the idx/original_id column-dropping code that consumed them
        # was dead, commented-out code and has been removed).
        training_data, testing_data = preprare_data(m)
        log("********************************************************")
        log("Validating for {0} min_occurrences:".format(m))

        # Full preprocessing pipeline on the raw training CSV.
        td = TwitterData()
        td.initialize("data\\train.csv")
        td.build_features()
        td.cleanup(TwitterCleanuper())
        td.tokenize()
        td.stem()
        td.build_wordlist()
        td.build_final_model(word2vec)
        # NOTE(review): head(5) result was discarded in the original; kept
        # behavior-identical — wrap in print(...) if a preview is wanted.
        td.data_model.head(5)

    print("Done!")
def preprocess(results, data_path, is_testing, data_name, min_occurrences=5, cache_output=None):
    """Run the TwitterData preprocessing pipeline and store the result in *results*.

    NOTE(review): an identical-signature ``preprocess`` is redefined later in
    this file and shadows this one at import time — consolidate to a single
    definition.

    Args:
        results: dict-like collector; the finished data model is stored under
            ``data_name``.
        data_path: path to the raw CSV passed to ``TwitterData.initialize``.
        is_testing: flag forwarded to ``TwitterData.initialize``.
        data_name: key under which the data model is stored in *results*.
        min_occurrences: minimum word frequency kept by ``build_wordlist``.
        cache_output: optional CSV path; when given, the data model is also
            written there with an ``idx`` index column.
    """
    twitter_data = TwitterData()
    twitter_data.initialize(data_path, is_testing)
    twitter_data.build_features()
    twitter_data.cleanup(TwitterCleanuper())
    twitter_data.tokenize()
    twitter_data.stem()
    twitter_data.build_wordlist(min_occurrences=min_occurrences)
    # Removed: commented-out ngram/word2vec experiments and a stray debug
    # print(cache_output) left over from development.
    if cache_output is not None:
        twitter_data.data_model.to_csv(cache_output, index_label="idx", float_format="%.6f")
    results[data_name] = twitter_data.data_model
def preprocess(results, data_path, is_testing, data_name, min_occurrences=5, cache_output=None):
    """Build a TwitterData data model from *data_path* and record it in *results*.

    The pipeline runs: initialize -> build_features -> cleanup -> tokenize ->
    stem -> build_wordlist. The resulting ``data_model`` is optionally cached
    to CSV (``cache_output``) and always stored in ``results[data_name]``.
    """
    # Assemble and run the preprocessing pipeline step by step.
    td = TwitterData()
    td.initialize(data_path, is_testing)
    for step in (
        td.build_features,
        lambda: td.cleanup(TwitterCleanuper()),
        td.tokenize,
        td.stem,
        lambda: td.build_wordlist(min_occurrences=min_occurrences),
    ):
        step()

    model = td.data_model
    # Persist a cached copy only when a target path was supplied.
    if cache_output is not None:
        model.to_csv(cache_output, index_label="idx", float_format="%.6f")
    results[data_name] = model