data_dir = "data" # Where to save preprocessed data clean_data_dir = "data/clean_data" # Name of input file. Should be inside of data_dir input_file = "reddit_cambridge.txt" # Should we load pretrained embeddings from file load_embeds = True # Read in data file df = pd.read_csv(data_dir + "/" + input_file, sep="\t") # Initialize a preprocessor P = Preprocessor(df, "texts", token_type="lower", max_features=10000, maxlen=10000, min_count=30, nlp="en_core_web_lg") # Run the preprocessing on your dataframe P.preprocess() P.get_bias_term_indexes( ['privacy', 'anonymity', 'confidentiality', 'disclosure']) # Load embeddings from file if we choose to do so if load_embeds: # Load embedding matrix from file path - change path to where you saved them embedding_matrix = P.load_glove("../glove.6B.300d.txt") else: embedding_matrix = None
data_dir = "data" # Where to save preprocessed data clean_data_dir = "data/clean_data" # Name of input file. Should be inside of data_dir input_file = "20_newsgroups.txt" # Should we load pretrained embeddings from file load_embeds = True # Read in data file df = pd.read_csv(data_dir + "/" + input_file, sep="\t") # Initialize a preprocessor P = Preprocessor(df, "texts", token_type="lower", max_features=10000, maxlen=10000, min_count=30, nlp="en_core_web_lg") # Run the preprocessing on your dataframe P.preprocess() # Load embeddings from file if we choose to do so if load_embeds: # Load embedding matrix from file path - change path to where you saved them embedding_matrix = P.load_glove("../glove.6B.300d.txt") else: embedding_matrix = None # Save data to data_dir
# NOTE: this excerpt begins mid-script. train_df, test_set, load_embeds,
# glove_path, and clean_data_dir are assumed to be defined earlier, and the
# `else:` below pairs with a truncated `if`; the guard shown here is a
# hypothetical reconstruction (a flag deciding whether to rebuild the split).
if rebuild_split:  # hypothetical stand-in for the truncated condition
    test_df = pd.DataFrame(test_set)
    train_df.to_csv("{}/train.csv".format(clean_data_dir), index=False)
    test_df.to_csv("{}/test.csv".format(clean_data_dir), index=False)
else:
    train_df = pd.read_csv("{}/train.csv".format(clean_data_dir))
    test_df = pd.read_csv("{}/test.csv".format(clean_data_dir))

print("Number of Articles in Training Set: {}".format(len(train_df.index)))
print("Number of Articles in Test Set: {}".format(len(test_df.index)))

# Initialize a preprocessor
P = Preprocessor(train_df, "texts", max_features=30000, maxlen=10000,
                 min_count=20)

# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove(glove_path)
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)
import pandas as pd

from lda2vec.nlppipe import Preprocessor

# Data directory
data_dir = "tests/twenty_newsgroups/data"
# Where to save preprocessed data
clean_data_dir = "tests/twenty_newsgroups/data/clean_data"
# Name of input file. Should be inside of data_dir
input_file = "20_newsgroups.txt"
# Should we load pretrained embeddings from file
load_embeds = True

# Read in data file
df = pd.read_csv(data_dir + "/" + input_file, sep="\t")

# Initialize a preprocessor
P = Preprocessor(df, "texts", max_features=30000, maxlen=10000, min_count=30)

# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove(
        "tests/twenty_newsgroups/glove_embeddings/glove.6B.300d.txt")
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)
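The `glove.6B.*.txt` files referenced throughout are the pre-trained GloVe vectors distributed by the Stanford NLP group (the `glove.6B.zip` archive at https://nlp.stanford.edu/projects/glove/); download and unzip them to whatever path the snippet passes to `load_glove`.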
import pandas as pd

from lda2vec.nlppipe import Preprocessor

# Data directory
data_dir = "data"
# Where to save preprocessed data
clean_data_dir = "data/clean_data"
# Name of input file. Should be inside of data_dir
input_file = "20_newsgroups.txt"
# Should we load pretrained embeddings from file
load_embeds = True

# Read in data file
df = pd.read_csv(data_dir + "/" + input_file, sep="\t")

# Initialize a preprocessor
P = Preprocessor(df, "texts", max_features=30000, maxlen=10000, min_count=30)

# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove("PATH/TO/GLOVE/glove.6B.300d.txt")
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)
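None of the examples show how the tab-separated `20_newsgroups.txt` input is produced. Below is a minimal sketch using scikit-learn's bundled copy of the corpus, inferring the layout from the `pd.read_csv(..., sep="\t")` call and the `"texts"` column above; the exact file the original authors used may differ.

import pandas as pd
from sklearn.datasets import fetch_20newsgroups

# Pull the full corpus; strip headers/footers/quotes so only body text remains.
newsgroups = fetch_20newsgroups(subset="all",
                                remove=("headers", "footers", "quotes"))

# Collapse whitespace so tabs/newlines inside posts can't break the TSV rows.
texts = [" ".join(doc.split()) for doc in newsgroups.data]

# One document per row, under the "texts" column the snippets expect.
pd.DataFrame({"texts": texts}).to_csv("data/20_newsgroups.txt",
                                      sep="\t", index=False)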
# NOTE: this snippet assumes DATA_FOLDER, EMBEDDING_DIR, and clean_data_dir
# are defined earlier in the script (they are not shown in the excerpt).
import re

import pandas as pd

from lda2vec.nlppipe import Preprocessor

# Name of input file. Should be inside of data_dir
INPUT_FILE = "export.csv"
# Should we load pretrained embeddings from file
load_embeds = True

# Read in data file
reddit_df = pd.read_csv(DATA_FOLDER + '/' + INPUT_FILE)

# Pull the post body out of the raw export blob; re.DOTALL lets the
# match span newlines
reddit_df['p_test'] = reddit_df.p.str.extract("selftext:(.*),over", re.DOTALL)

# Inspect the resulting columns
print(reddit_df.columns)

# Initialize a preprocessor
P = Preprocessor(reddit_df, "p_test", max_features=30000, maxlen=10000,
                 min_count=30)

# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove(EMBEDDING_DIR + "/" + "glove.6B.100d.txt")
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)
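The `str.extract` call above carves the post body out of a raw export blob. A toy demonstration on a hypothetical record (the `p` value here is invented purely for illustration):

import re
import pandas as pd

# A fabricated export row shaped the way the regex expects.
sample = pd.DataFrame({"p": [
    "id:abc123,selftext:I deleted my account\nlast week.,over_18:false",
]})

# re.DOTALL lets (.*) span newlines, and the greedy match runs up to the
# last ",over" in the string -- here, the start of the over_18 field.
extracted = sample.p.str.extract("selftext:(.*),over", re.DOTALL)
print(extracted[0].iloc[0])  # -> "I deleted my account\nlast week."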
# NOTE: in the original script these lines sit inside an `if` branch whose
# header is not shown; clean_titles is assumed to be an empty list created
# earlier (mirroring the clean_descriptions branch below).
import pandas as pd

from time import time

from lda2vec.nlppipe import Preprocessor

clean_titles = []
with open('datasets/parsed_full_titles.txt') as f:
    for line in f.readlines():
        clean_titles.append(line)

titles = pd.DataFrame(clean_titles, columns=['processed_title'])

# Where to save preprocessed data
clean_data_dir = "data/clean_data"
# Should we load pretrained embeddings from file
load_embeds = True

# Initialize a preprocessor
P = Preprocessor(titles, "processed_title", max_features=30000, maxlen=10000,
                 min_count=30)

# Run the preprocessing on your dataframe
t0 = time()
print('INFO: beginning preprocessing tokens from titles')
P.preprocess()
print('INFO: finished preprocessing tokens from titles in %0.3fs.'
      % (time() - t0))

# The excerpt breaks off here; the paired else-branch is cut off mid-statement:
# else:
#     clean_descriptions = []
#     with open('datasets/parsed_full_descriptions.txt') as f: