Example #1

import pandas as pd
from lda2vec.nlppipe import Preprocessor

# Data directory
data_dir = "data"
# Where to save preprocessed data
clean_data_dir = "data/clean_data"
# Name of input file. Should be inside of data_dir
input_file = "reddit_cambridge.txt"
# Should we load pretrained embeddings from file
load_embeds = True

# Read in data file
df = pd.read_csv(data_dir + "/" + input_file, sep="\t")

# Initialize a preprocessor
P = Preprocessor(df,
                 "texts",
                 token_type="lower",
                 max_features=10000,
                 maxlen=10000,
                 min_count=30,
                 nlp="en_core_web_lg")

# Run the preprocessing on your dataframe
P.preprocess()
P.get_bias_term_indexes(
    ['privacy', 'anonymity', 'confidentiality', 'disclosure'])

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove("../glove.6B.300d.txt")
else:
    embedding_matrix = None
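Each example reads a tab-separated file with a single "texts" column. If you need to produce such a file, a minimal sketch (the documents and the filename here are illustrative):

import pandas as pd

# Illustrative only: wrap raw documents into the tab-separated,
# single-column "texts" layout that the snippets read with sep="\t".
docs = ["first document text", "second document text"]
pd.DataFrame({"texts": docs}).to_csv("data/reddit_cambridge.txt",
                                     sep="\t", index=False)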
data_dir = "data"
# Where to save preprocessed data
clean_data_dir = "data/clean_data"
# Name of input file. Should be inside of data_dir
input_file = "20_newsgroups.txt"
# Should we load pretrained embeddings from file
load_embeds = True

# Read in data file
df = pd.read_csv(data_dir + "/" + input_file, sep="\t")

# Initialize a preprocessor
P = Preprocessor(df,
                 "texts",
                 token_type="lower",
                 max_features=10000,
                 maxlen=10000,
                 min_count=30,
                 nlp="en_core_web_lg")

# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove("../glove.6B.300d.txt")
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)

Example #3

import pandas as pd
from lda2vec.nlppipe import Preprocessor

# NOTE: the top of this snippet was truncated. The lines below are an assumed
# reconstruction so the remainder runs: rebuild_splits is a placeholder flag,
# and train_set/test_set come from an upstream split (see the sketch after
# this example).
clean_data_dir = "data/clean_data"
glove_path = "PATH/TO/GLOVE/glove.6B.300d.txt"  # assumed; point at your copy
load_embeds = True
rebuild_splits = True

if rebuild_splits:
    train_df = pd.DataFrame(train_set)
    test_df = pd.DataFrame(test_set)

    train_df.to_csv("{}/train.csv".format(clean_data_dir), index=False)
    test_df.to_csv("{}/test.csv".format(clean_data_dir), index=False)
else:
    train_df = pd.read_csv("{}/train.csv".format(clean_data_dir))
    test_df = pd.read_csv("{}/test.csv".format(clean_data_dir))

print("Number of Articles in Training Set: {}".format(str(len(
    train_df.index))))
print("Number of Articles in Test Set: {}".format(str(len(test_df.index))))

# Initialize a preprocessor
P = Preprocessor(train_df,
                 "texts",
                 max_features=30000,
                 maxlen=10000,
                 min_count=20)

# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove(glove_path)
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)
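The snippet above references train_set and test_set without defining them (the original was truncated). One way they might be produced upstream; scikit-learn here is an assumption, not from the source:

import pandas as pd
from sklearn.model_selection import train_test_split

# Assumed upstream step: read the raw file as in the other examples, then
# split the records 80/20 into the train_set/test_set lists used above.
df = pd.read_csv("data/20_newsgroups.txt", sep="\t")
train_set, test_set = train_test_split(df.to_dict("records"),
                                       test_size=0.2, random_state=42)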
Example #4

import pandas as pd
from lda2vec.nlppipe import Preprocessor

# Data directory
data_dir = "tests/twenty_newsgroups/data"
# Where to save preprocessed data
clean_data_dir = "tests/twenty_newsgroups/data/clean_data"
# Name of input file. Should be inside of data_dir
input_file = "20_newsgroups.txt"
# Should we load pretrained embeddings from file
load_embeds = True

# Read in data file
df = pd.read_csv(data_dir + "/" + input_file, sep="\t")

# Initialize a preprocessor
P = Preprocessor(df, "texts", max_features=30000, maxlen=10000, min_count=30)

# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove(
        "tests/twenty_newsgroups/glove_embeddings/glove.6B.300d.txt")
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)
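glove.6B.300d.txt is the 300-dimensional file from Stanford's GloVe "6B" download. The format is plain text: one token per line, followed by its vector components. A minimal manual loader for inspecting the file; this is a sketch, not the library's load_glove implementation:

import numpy as np

# GloVe text format: "token v1 v2 ... v300" on each line.
vectors = {}
with open("tests/twenty_newsgroups/glove_embeddings/glove.6B.300d.txt",
          encoding="utf-8") as f:
    for line in f:
        token, *values = line.rstrip().split(" ")
        vectors[token] = np.asarray(values, dtype=np.float32)
print("{} tokens loaded".format(len(vectors)))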
Example #5

import pandas as pd
from lda2vec.nlppipe import Preprocessor

# Data directory
data_dir ="data"
# Where to save preprocessed data
clean_data_dir = "data/clean_data"
# Name of input file. Should be inside of data_dir
input_file = "20_newsgroups.txt"
# Should we load pretrained embeddings from file
load_embeds = True

# Read in data file
df = pd.read_csv(data_dir + "/" + input_file, sep="\t")

# Initialize a preprocessor
P = Preprocessor(df, "texts", max_features=30000, maxlen=10000, min_count=30)

# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove("PATH/TO/GLOVE/glove.6B.300d.txt")
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)
Example #6

import re
import pandas as pd
from lda2vec.nlppipe import Preprocessor

# NOTE: the top of this snippet was truncated; the directory constants below
# are assumed stand-ins so the remainder runs.
DATA_FOLDER = "data"
EMBEDDING_DIR = "embeddings"
clean_data_dir = "data/clean_data"

# Name of input file. Should be inside of data_dir
INPUT_FILE = "export.csv"
# Should we load pretrained embeddings from file
load_embeds = True

# Read in data file
reddit_df = pd.read_csv(DATA_FOLDER + '/' + INPUT_FILE)

reddit_df['p_test'] = reddit_df.p.str.extract("selftext:(.*),over", re.DOTALL)

print(reddit_df.columns)  # inspect the available columns

# Initialize a preprocessor
P = Preprocessor(reddit_df,
                 "p_test",
                 max_features=30000,
                 maxlen=10000,
                 min_count=30)

# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove(EMBEDDING_DIR + "/" + "glove.6B.100d.txt")
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)
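The str.extract call above pulls the text between "selftext:" and ",over" out of each row, and re.DOTALL lets "." match across newlines. A toy demonstration with made-up data:

import re
import pandas as pd

# str.extract returns the first capture group for each row.
s = pd.Series(["selftext:line one\nline two,over18:false"])
print(s.str.extract("selftext:(.*),over", flags=re.DOTALL))
# -> a one-column DataFrame whose single cell is "line one\nline two"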
Example #7

import pandas as pd
from time import time
from lda2vec.nlppipe import Preprocessor

# NOTE: the top of this snippet was truncated. It sits inside an if/else that
# chooses between preprocessed titles and descriptions; use_titles is an
# assumed flag name.
use_titles = True
clean_titles = []

if use_titles:
    with open('datasets/parsed_full_titles.txt') as f:
        for line in f.readlines():
            clean_titles.append(line)

    titles = pd.DataFrame(clean_titles, columns=['processed_title'])

    # Where to save preprocessed data
    clean_data_dir = "data/clean_data"

    # Should we load pretrained embeddings from file
    load_embeds = True

    # Initialize a preprocessor
    P = Preprocessor(titles,
                     "processed_title",
                     max_features=30000,
                     maxlen=10000,
                     min_count=30)

    # Run the preprocessing on your dataframe
    t0 = time()

    print('INFO: beginning preprocessing tokens from titles')

    P.preprocess()

    print('INFO: finished preprocessing tokens from titles in %0.3fs.' %
          (time() - t0))
else:
    clean_descriptions = []
    with open('datasets/parsed_full_descriptions.txt') as f:
        for line in f.readlines():
            clean_descriptions.append(line)