Example #1
def word_predict(save_path, i):
    """
    Creates TSV with scores of all vocabulary.
    :param save_path:
    :param i:
    :return:
    """
    print("TRAINING SECTION:", (i), save_path)
    config = DefaultConfig()

    data = dp.Data_Processing(load_tokenizers=True)
    x_train, y_train, x_test, y_test, x_tokenizer, y_tokenizer = data.run(
        train_file="../../data/train_data/train_data.p",
        test_file="../../data/test_data/test_data.p",
        section=i,
        y_mode="vectorize",
        shuffle=False)

    get_counts = Tokenizer()
    get_counts.fit_on_texts(x_test)

    words = [word[0] for word in get_counts.word_index.items()]

    word_vectors = data.pad_sequences(data.vectorize_text(words), max_len=30)

    # Placeholder one-hot labels so generate_instances receives a label per word.
    labels = [[0, 1] for _ in range(len(words))]

    num_words = len(x_tokenizer.word_index) + 1
    num_classes = len(y_tokenizer.word_index)
    print("LEN X: ", len(word_vectors))

    w2v_layer = data.create_embedding_layer(
        '../../trained_w2v/twitter2vec.w2v')

    validation_batches = generate_instances(word_vectors,
                                            labels,
                                            num_words,
                                            num_classes,
                                            config.max_timesteps,
                                            batch_size=config.batch_size)

    # Score every vocabulary word with the trained model
    predict(config,
            validation_batches,
            w2v_layer,
            save_path,
            words,
            pred_type="words")
    print("FINISHED:", save_path, "WORD PREDICTION\n\n")
Example #2
def train(save_path, i, from_saved=False):
    """
    Trains RNN model with given save_path. Can resume training from saved model.
    :param save_path:
    :param i:
    :return:
    """
    print("TRAINING SECTION:", i, save_path)
    config = DefaultConfig()

    data = dp.Data_Processing(load_tokenizers=True)
    x_train, y_train, x_test, y_test, x_tokenizer, y_tokenizer = data.run(
        train_file="../../data/train_data/train_data.p",
        test_file="../../data/test_data/test_data.p",
        section=i,
        x_mode="index",
        y_mode="vectorize")

    num_words = len(x_tokenizer.word_index) + 1
    num_classes = len(y_tokenizer.word_index)

    # TODO CHECK INDEX LIST

    w2v_layer = data.create_embedding_layer(
        '../../trained_w2v/twitter2vec.w2v')

    # Generate batches
    train_batches = generate_instances(x_train,
                                       y_train,
                                       num_words,
                                       num_classes,
                                       config.max_timesteps,
                                       batch_size=config.batch_size)
    validation_batches = generate_instances(x_test,
                                            y_test,
                                            num_words,
                                            num_classes,
                                            config.max_timesteps,
                                            batch_size=config.batch_size)

    # Train the model
    train_model(config,
                train_batches,
                validation_batches,
                w2v_layer,
                save_path,
                from_saved=from_saved)
    print("FINISHED:", save_path, "TRAINING\n\n")
Example #3
def tweet_predict(save_path, i):
    """
    Creates TSV with scores of all tweets
    :param save_path:
    :param i:
    :return:
    """
    print("TRAINING SECTION:", i, save_path)
    config = DefaultConfig()

    data = dp.Data_Processing(load_tokenizers=True)
    x_train, y_train, x_test, y_test, x_tokenizer, y_tokenizer = data.run(
        train_file="../../data/train_data/train_data.p",
        test_file="../../data/test_data/test_data.p",
        section=i,
        x_mode="index",
        y_mode="vectorize",
        shuffle=False)

    tweets_train, _, tweets_test, _, _, _ = data.run(
        train_file="../../data/train_data/train_data.p",
        test_file="../../data/test_data/test_data.p",
        section=i,
        shuffle=False)

    num_words = len(x_tokenizer.word_index) + 1
    num_classes = len(y_tokenizer.word_index)
    print("NUM WORDS: ", num_words, " NUM CLASSES: ", num_classes)

    w2v_layer = data.create_embedding_layer(
        '../../trained_w2v/twitter2vec.w2v')

    # TODO STILL ONLY VALIDATION BATCHES?
    validation_batches = generate_instances(x_test,
                                            y_test,
                                            num_words,
                                            num_classes,
                                            config.max_timesteps,
                                            batch_size=config.batch_size)

    # Score every tweet with the trained model
    predict(config, validation_batches, w2v_layer, save_path, tweets_test)
    print("FINISHED:", save_path, "TWEET PREDICTION\n\n")
Example #4
def tfidf_analysis(sections, i):
    """
    Writes 50 highest scoring words from each political party per quarter
    :param sections:
    :param i:
    :return:
    """
    with open("tfidf_results/tfidf_scores.tsv", "w+") as results:
        results.write("time\t" + "d_word\t" + "d_score\t" + 'r_word\t' +
                      'r_score\t')
        results.close()

    print("WORKING ON SECTION:", i, save_path)

    data = dp.Data_Processing()
    x, y, x_test, y_test, x_tokenizer, y_tokenizer = data.run(
        train_file="../data/train_data/train_data.p",
        test_file="../data/test_data/test_data.p",
        section=i,
        shuffle=False)

    x = np.append(x, x_test)
    y = np.append(y, y_test)

    x = data.tweet_tokenizer(x)

    r_x = []
    d_x = []
    x_full = []

    for idx, tweet in enumerate(x):
        if y[idx] == "R":
            r_x.append(tweet)
        else:
            d_x.append(tweet)
        x_full.append(tweet)

    t = Tfidf()
    t.build_model(x_full)

    r_tfidf = dict()
    d_tfidf = dict()

    for tweet in r_x:
        counts, length = t.counts(tweet)
        for word in tweet:
            score = t.tf_idf(counts[word], length, t.num_docs,
                             t.token_appearance[word])
            r_tfidf[word] = r_tfidf.get(word, 0) + score

    for tweet in d_x:
        counts, length = t.counts(tweet)
        for word in tweet:
            score = t.tf_idf(counts[word], length, t.num_docs,
                             t.token_appearance[word])
            d_tfidf[word] = d_tfidf.get(word, 0) + score

    r_words = [item[0] for item in r_tfidf.items()]
    r_scores = [item[1] for item in r_tfidf.items()]
    r_top_words = [(r_words[idx], r_scores[idx])
                   for idx in list(np.argsort(r_scores))][::-1]

    d_words = [item[0] for item in d_tfidf.items()]
    d_scores = [item[1] for item in d_tfidf.items()]
    d_top_words = [(d_words[idx], d_scores[idx])
                   for idx in list(np.argsort(d_scores))][::-1]

    with open("tfidf_results/tfidf_scores.tsv", "a") as results:
        for idx in range(50):
            results.write("\n%s\t%s\t%.5f\t%s\t%.5f" %
                          (save_path, d_top_words[idx][0], d_top_words[idx][1],
                           r_top_words[idx][0], d_top_words[idx][1]))
        results.close()

    print(save_path, "FINISHED")
# Load some categories from the training set
# writes coef_d feat_d coef_r feat_r section model acc_d acc_r acc_all recall f1 support

with open("svm_top_features.tsv", "w", encoding="utf-8") as tsv:
    tsv.write("coef_d\t" + "feat_d\t" + "coef_r\t" + "feat_r\t" + "section\t" +
              "model\t" + "acc_d\t" + "acc_r\t" + "acc_all\t" + "recall\t" +
              "f1\t" + "support")
    tsv.close()

sections = [
    "2015_1q", "2015_2q", "2015_3q", "2015_4q", "2016_1q", "2016_2q",
    "2016_3q", "2016_4q", "2017_1q", "2017_2q", "2017_2q_REDUCED",
    "2017_2q_REDUCED"
]

data = dp.Data_Processing(load_tokenizers=True)
for i, section in enumerate(sections):
    print("\nWORKING ON SECTION: ", section, "\n")
    data_train, y_train, data_test, y_test, _, _ = data.run(
        train_file="../data/train_data/train_data.p",
        test_file="../data/test_data/test_data.p",
        section=i + 1)

    target_names = list(set(y_train))

    print(
        "Extracting features from the training data using a sparse vectorizer")

    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.5,
                                 stop_words='english')
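    # Hypothetical continuation sketch (not the original code): assuming data_train and
    # data_test are lists of raw tweet strings and that a binary D/R LinearSVC is trained
    # on the TF-IDF features above, this shows one way to recover the highest-weighted
    # feature per class, roughly what the coef_d/feat_d and coef_r/feat_r columns of
    # svm_top_features.tsv suggest.
    from sklearn.svm import LinearSVC

    X_train = vectorizer.fit_transform(data_train)
    X_test = vectorizer.transform(data_test)

    clf = LinearSVC()
    clf.fit(X_train, y_train)
    print("accuracy:", clf.score(X_test, y_test))

    # For a binary problem coef_ has shape (1, n_features): large negative weights pull
    # toward clf.classes_[0], large positive weights toward clf.classes_[1].
    feature_names = vectorizer.get_feature_names_out()
    coefs = clf.coef_.ravel()
    print("top", clf.classes_[0], "feature:", feature_names[coefs.argmin()])
    print("top", clf.classes_[1], "feature:", feature_names[coefs.argmax()])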