data_dir = "data" # Where to save preprocessed data clean_data_dir = "data/clean_data" # Name of input file. Should be inside of data_dir input_file = "reddit_cambridge.txt" # Should we load pretrained embeddings from file load_embeds = True # Read in data file df = pd.read_csv(data_dir + "/" + input_file, sep="\t") # Initialize a preprocessor P = Preprocessor(df, "texts", token_type="lower", max_features=10000, maxlen=10000, min_count=30, nlp="en_core_web_lg") # Run the preprocessing on your dataframe P.preprocess() P.get_bias_term_indexes( ['privacy', 'anonymity', 'confidentiality', 'disclosure']) # Load embeddings from file if we choose to do so if load_embeds: # Load embedding matrix from file path - change path to where you saved them embedding_matrix = P.load_glove("../glove.6B.300d.txt") else: embedding_matrix = None
data_dir = "data" # Where to save preprocessed data clean_data_dir = "data/clean_data" # Name of input file. Should be inside of data_dir input_file = "20_newsgroups.txt" # Should we load pretrained embeddings from file load_embeds = True # Read in data file df = pd.read_csv(data_dir + "/" + input_file, sep="\t") # Initialize a preprocessor P = Preprocessor(df, "texts", token_type="lower", max_features=10000, maxlen=10000, min_count=30, nlp="en_core_web_lg") # Run the preprocessing on your dataframe P.preprocess() # Load embeddings from file if we choose to do so if load_embeds: # Load embedding matrix from file path - change path to where you saved them embedding_matrix = P.load_glove("../glove.6B.300d.txt") else: embedding_matrix = None # Save data to data_dir
# NOTE: this excerpt begins mid-script. train_df, test_set, load_embeds,
# glove_path, and clean_data_dir are assumed to be defined earlier, and the
# `else:` below pairs with a truncated `if`; the guard shown here is a
# hypothetical reconstruction (a flag deciding whether to rebuild the split).
if rebuild_split:  # hypothetical stand-in for the truncated condition
    test_df = pd.DataFrame(test_set)
    train_df.to_csv("{}/train.csv".format(clean_data_dir), index=False)
    test_df.to_csv("{}/test.csv".format(clean_data_dir), index=False)
else:
    train_df = pd.read_csv("{}/train.csv".format(clean_data_dir))
    test_df = pd.read_csv("{}/test.csv".format(clean_data_dir))

print("Number of Articles in Training Set: {}".format(len(train_df.index)))
print("Number of Articles in Test Set: {}".format(len(test_df.index)))

# Initialize a preprocessor
P = Preprocessor(train_df, "texts", max_features=30000, maxlen=10000,
                 min_count=20)

# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove(glove_path)
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)
import pandas as pd

from lda2vec.nlppipe import Preprocessor

# Data directory
data_dir = "tests/twenty_newsgroups/data"
# Where to save preprocessed data
clean_data_dir = "tests/twenty_newsgroups/data/clean_data"
# Name of input file. Should be inside of data_dir
input_file = "20_newsgroups.txt"
# Should we load pretrained embeddings from file
load_embeds = True

# Read in data file
df = pd.read_csv(data_dir + "/" + input_file, sep="\t")

# Initialize a preprocessor
P = Preprocessor(df, "texts", max_features=30000, maxlen=10000, min_count=30)

# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove(
        "tests/twenty_newsgroups/glove_embeddings/glove.6B.300d.txt")
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)
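The `glove.6B.*.txt` files referenced throughout are the pre-trained GloVe vectors distributed by the Stanford NLP group (the `glove.6B.zip` archive at https://nlp.stanford.edu/projects/glove/); download and unzip them to whatever path the snippet passes to `load_glove`.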
import pandas as pd

from lda2vec.nlppipe import Preprocessor

# Data directory
data_dir = "data"
# Where to save preprocessed data
clean_data_dir = "data/clean_data"
# Name of input file. Should be inside of data_dir
input_file = "20_newsgroups.txt"
# Should we load pretrained embeddings from file
load_embeds = True

# Read in data file
df = pd.read_csv(data_dir + "/" + input_file, sep="\t")

# Initialize a preprocessor
P = Preprocessor(df, "texts", max_features=30000, maxlen=10000, min_count=30)

# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove("PATH/TO/GLOVE/glove.6B.300d.txt")
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)
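None of the examples show how the tab-separated `20_newsgroups.txt` input is produced. Below is a minimal sketch using scikit-learn's bundled copy of the corpus, inferring the layout from the `pd.read_csv(..., sep="\t")` call and the `"texts"` column above; the exact file the original authors used may differ.

import pandas as pd
from sklearn.datasets import fetch_20newsgroups

# Pull the full corpus; strip headers/footers/quotes so only body text remains.
newsgroups = fetch_20newsgroups(subset="all",
                                remove=("headers", "footers", "quotes"))

# Collapse whitespace so tabs/newlines inside posts can't break the TSV rows.
texts = [" ".join(doc.split()) for doc in newsgroups.data]

# One document per row, under the "texts" column the snippets expect.
pd.DataFrame({"texts": texts}).to_csv("data/20_newsgroups.txt",
                                      sep="\t", index=False)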
# NOTE: this snippet assumes DATA_FOLDER, EMBEDDING_DIR, and clean_data_dir
# are defined earlier in the script (they are not shown in the excerpt).
import re

import pandas as pd

from lda2vec.nlppipe import Preprocessor

# Name of input file. Should be inside of data_dir
INPUT_FILE = "export.csv"
# Should we load pretrained embeddings from file
load_embeds = True

# Read in data file
reddit_df = pd.read_csv(DATA_FOLDER + '/' + INPUT_FILE)

# Pull the post body out of the raw export blob; re.DOTALL lets the
# match span newlines
reddit_df['p_test'] = reddit_df.p.str.extract("selftext:(.*),over", re.DOTALL)

# Inspect the resulting columns
print(reddit_df.columns)

# Initialize a preprocessor
P = Preprocessor(reddit_df, "p_test", max_features=30000, maxlen=10000,
                 min_count=30)

# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove(EMBEDDING_DIR + "/" + "glove.6B.100d.txt")
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)
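The `str.extract` call above carves the post body out of a raw export blob. A toy demonstration on a hypothetical record (the `p` value here is invented purely for illustration):

import re
import pandas as pd

# A fabricated export row shaped the way the regex expects.
sample = pd.DataFrame({"p": [
    "id:abc123,selftext:I deleted my account\nlast week.,over_18:false",
]})

# re.DOTALL lets (.*) span newlines, and the greedy match runs up to the
# last ",over" in the string -- here, the start of the over_18 field.
extracted = sample.p.str.extract("selftext:(.*),over", re.DOTALL)
print(extracted[0].iloc[0])  # -> "I deleted my account\nlast week."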
# NOTE: in the original script these lines sit inside an `if` branch whose
# header is not shown; clean_titles is assumed to be an empty list created
# earlier (mirroring the clean_descriptions branch below).
import pandas as pd

from time import time

from lda2vec.nlppipe import Preprocessor

clean_titles = []
with open('datasets/parsed_full_titles.txt') as f:
    for line in f.readlines():
        clean_titles.append(line)

titles = pd.DataFrame(clean_titles, columns=['processed_title'])

# Where to save preprocessed data
clean_data_dir = "data/clean_data"
# Should we load pretrained embeddings from file
load_embeds = True

# Initialize a preprocessor
P = Preprocessor(titles, "processed_title", max_features=30000, maxlen=10000,
                 min_count=30)

# Run the preprocessing on your dataframe
t0 = time()
print('INFO: beginning preprocessing tokens from titles')
P.preprocess()
print('INFO: finished preprocessing tokens from titles in %0.3fs.'
      % (time() - t0))

# The excerpt breaks off here; the paired else-branch is cut off mid-statement:
# else:
#     clean_descriptions = []
#     with open('datasets/parsed_full_descriptions.txt') as f: