Example #1
import pickle

import numpy as np

# load_data_and_labels is assumed to come from the project's helpers module,
# as in the other examples below.
from helpers import load_data_and_labels


def prepare_data():
    path = '../outputs/'
    path_data = '../data/'

    with open(path + 'train/vocab.pkl', 'rb') as f:
        vocab_train_dict = pickle.load(f)

    with open(path + 'test/vocab.pkl', 'rb') as f:
        vocab_test_dict = pickle.load(f)

    W = np.load(path + 'train/embeddings.npy')
    W_test = np.load(path + 'test/embeddings.npy')

    train, labels, test = load_data_and_labels(
        path_data + 'train_pos_clean.txt', path_data + 'train_neg_clean.txt',
        path_data + 'test_data_clean.txt')

    print("Vectorization of the tweet sets")
    # To be improved (does not work when passed as a function ???)
    ls = []
    for sent in train:
        ls_temp = []
        for word in list(sent.split()):
            try:
                ls_temp.append(vocab_train_dict[word])
            except:
                ls_temp.append(0)
        ls.append(ls_temp)

    # Represent each tweet as the sum of its word embeddings
    ls_sum = []
    for ls_in in ls:
        sum_vect = np.zeros(W.shape[1])
        for index in ls_in:
            sum_vect += W[index]
        ls_sum.append(sum_vect)
    X_train = ls_sum

    ls = []
    for sent in test:
        ls_temp = []
        for word in sent.split():
            try:
                ls_temp.append(vocab_test_dict[word])
            except KeyError:
                # Unknown words fall back to index 0
                ls_temp.append(0)
        ls.append(ls_temp)

    ls_sum = []
    for ls_in in ls:
        sum_vect = np.zeros(W_test.shape[1])
        for index in ls_in:
            sum_vect += W_test[index]
        ls_sum.append(sum_vect)
    X_test = ls_sum
    print("Sets vectorized")

    return X_train, labels, X_test
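
# A sketch (an assumption, not part of the original example): the summed
# embedding features returned by prepare_data() can feed a scikit-learn
# classifier, much like the RandomForest example further below.
from sklearn.linear_model import LogisticRegression

X_train, labels, X_test = prepare_data()
y = np.argmax(labels, axis=1) if np.ndim(labels) > 1 else labels  # flatten one-hot labels if present
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y)
predictions = clf.predict(X_test)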
Example #2

import time
import datetime
import sys
import numpy as np
from cnn import cnn
# from utils import *
from sklearn.model_selection import train_test_split
from tensorflow.contrib import learn
import pandas as pd
import helpers
# import seaborn as sns

data = pd.read_csv("./data/spam.csv", encoding='latin-1')

data = data.drop(labels=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)

x_input, y_raw = helpers.load_data_and_labels(data)

max_length = max([len(x.split(" ")) for x in x_input])

vocabprocess = learn.preprocessing.VocabularyProcessor(max_length)

x = np.array(list(vocabprocess.fit_transform(x_input)))
y = np.array(y_raw)

## shuffle data

np.random.seed(10)
shuffleindice = np.random.permutation(np.arange(len(y)))
xshuf = x[shuffleindice]
yshuf = y[shuffleindice]
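
# A sketch (an assumption, not in the original snippet) of splitting the
# shuffled data with the train_test_split imported above; the 10% dev
# fraction is an assumed value.
x_train, x_dev, y_train, y_dev = train_test_split(
    xshuf, yshuf, test_size=0.1, random_state=10)
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))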
Example #3

from nltk.corpus import stopwords
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import load_model
# load_data_and_labels is assumed to come from the project's helpers module,
# as in the other examples.
from helpers import load_data_and_labels

# set parameters:
max_features = 5000
maxlen = 400
batch_size = 1000
embedding_dims = 100
nb_filter = 250
filter_length = 3
hidden_dims = 250
nb_epoch = 2

print('Loading data...')
x_train, labels, x_test = load_data_and_labels('data/train_pos.txt', 'data/train_neg.txt', 'data/test_data.txt')

# Tokenize the words, build sequences, and pad them to a fixed length
tokenizer = Tokenizer(nb_words=max_features)  # `num_words` in Keras 2; limits the vocabulary size
tokenizer.fit_on_texts(x_train)
sequences_test = tokenizer.texts_to_sequences(x_test)
X_test = sequence.pad_sequences(sequences_test, maxlen=maxlen)


print('Loading the model...')
model = load_model('model_CNN_test.h5')

# Predictions for the test data
probas = model.predict(X_test, batch_size=32)

# Replacing the predictions with -1 or +1
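# The example is cut off at this point; a minimal sketch of the announced
# conversion (an assumption, not the original code) maps each probability
# to a -1/+1 label:
import numpy as np

y_pred = np.where(probas > 0.5, 1, -1).ravel()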
Example #4
import gensim
import numpy as np
import tensorflow as tf

import helpers  # assumed to be the project's helper module, as elsewhere

NUM_QUESTIONS = 2000

tf.flags.DEFINE_string("training_data_file", "./datasets/training.full.tsv",
                       "Data source for the training data")

FLAGS = tf.flags.FLAGS

FLAGS._parse_flags()
print("\n Parameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
    print("")

# Load the data
print("Loading data")
q1, q2, y_truth, q1_len, q2_len = helpers.load_data_and_labels(
    FLAGS.training_data_file)
dataset = list(zip(q1, q2, y_truth))

#q1 = [word for word in q1[0].split() if word not in stopwords.words('english')]

model_file = "./GoogleNews-vectors-negative300.bin"

model = gensim.models.KeyedVectors.load_word2vec_format(model_file,
                                                        binary=True)


def avg_feature_vector(words, model, num_features, index2word_set):
    # Average all word vectors in a given paragraph
    featureVec = np.zeros((num_features, ), dtype="float32")
    nwords = 0
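    # The original example is truncated here. A typical completion (a sketch,
    # not the original author's code) averages the vectors of the words that
    # appear in the model's vocabulary:
    for word in words:
        if word in index2word_set:
            nwords += 1
            featureVec = np.add(featureVec, model[word])
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec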
Example #5
tf.flags.DEFINE_boolean("use_cached_embeddings", True,
                        "Cache embeddings locally on disk for repeated runs")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
q1, q2, y, x1_lengths, x2_lengths = helpers.load_data_and_labels(
    FLAGS.training_data_file)

#print q1[:11], y[:11], x1_lengths[:11]

# Build vocabulary
max_question_length = max(max([len(x.split(" ")) for x in q1]),
                          max([len(x.split(" ")) for x in q2]))
vocab_processor = learn.preprocessing.VocabularyProcessor(max_question_length)
print "max_question_length: ", max_question_length

#x_text = q1 + q2
x1 = np.array(list(vocab_processor.fit_transform(q1)))
x2 = np.array(list(vocab_processor.fit_transform(q2)))
#x = np.array(list(vocab_processor.fit_transform(x_text)))
#xx1 = x[:len(q1)]
#xx2 = x[len(q1):]
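
# A sketch (an assumption, not in the original snippet) of how the processed
# question pairs might be shuffled and split into train/dev sets, following
# the pattern used in the other examples; the dev size of 2000 is assumed.
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x1_shuffled = x1[shuffle_indices]
x2_shuffled = x2[shuffle_indices]
y_shuffled = np.asarray(y)[shuffle_indices]

dev_size = 2000
x1_train, x1_dev = x1_shuffled[:-dev_size], x1_shuffled[-dev_size:]
x2_train, x2_dev = x2_shuffled[:-dev_size], x2_shuffled[-dev_size:]
y_train, y_dev = y_shuffled[:-dev_size], y_shuffled[-dev_size:]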
Example #6
import os

import numpy as np
import tensorflow as tf
from tensorflow.contrib import learn

import helpers  # assumed to be the project's helper module, as elsewhere

# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True,
                        "Allow soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Load data
print("Loading data...")
q1, q2, y, q1_lengths, q2_lengths = helpers.load_data_and_labels(
    FLAGS.test_data_file)
x_raw = q1 + q2

# Build vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

x1_test = x_test[:len(q1)]
x2_test = x_test[len(q1):]
y_test = np.argmax(y, axis=1)

print("\nEvaluating...\n")

# Evaluation
# ==================================================
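
# A sketch (an assumption, not part of the original snippet) of the final
# accuracy computation once predictions have been collected from the restored
# checkpoint; `all_predictions` is a hypothetical 1-D array of predicted
# class indices.
def report_accuracy(all_predictions, y_test):
    correct_predictions = float(np.sum(all_predictions == y_test))
    print("Total number of test examples: {}".format(len(y_test)))
    print("Accuracy: {:g}".format(correct_predictions / float(len(y_test))))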
Example #7

import numpy as np
import pickle
from helpers import load_data_and_labels
from helpers import clean_str
from sklearn.ensemble import RandomForestClassifier

path = 'data/mini/'

with open(path + 'vocab.pkl', 'rb') as f:
    vocab_dict = pickle.load(f)

W = np.load(path + 'embeddings.npy')

x_text, y = load_data_and_labels(positive_data_file=path + 'pos_train.txt',
                                 negative_data_file=path + 'neg_train.txt')

ls = []
for sent in x_text:
    ls_temp = []
    for word in sent.split():
        try:
            ls_temp.append(vocab_dict[word])
        except KeyError:
            # Unknown words fall back to index 0
            ls_temp.append(0)
    ls.append(ls_temp)
# print(ls)

# Represent each sentence as the sum of its word embeddings
ls_sum = []
for ls_in in ls:
    sum_vect = np.zeros(W.shape[1])
    for index in ls_in:
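        # The original example is truncated here; the loop is completed below
        # as in Example #1 (a sketch, not the original code).
        sum_vect += W[index]
    ls_sum.append(sum_vect)

# A sketch (an assumption, not in the original snippet) of fitting the
# RandomForestClassifier imported above on the summed-embedding features.
X_train = np.array(ls_sum)
labels = np.argmax(y, axis=1) if np.ndim(y) > 1 else np.asarray(y)  # flatten one-hot labels if present
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, labels)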
Example #8
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_text, y = helpers.load_data_and_labels(FLAGS.pos_text, FLAGS.neg_text,
                                         FLAGS.max_document_length)

# Build vocabulary
d_vocab, x = helpers.vocab_processor(x_text)

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
x_train, x_dev = x_shuffled[:-2000], x_shuffled[-2000:]
y_train, y_dev = y_shuffled[:-2000], y_shuffled[-2000:]
print("Vocabulary Size: {:d}".format(len(d_vocab)))