Example #1
    def predict(self, url):
        # Restore the parameters saved at training time.
        params = data_helper.loadDict(self.training_params_file)
        num_labels = int(params['num_labels'])
        max_document_length = int(params['max_document_length'])
        # Pad the single URL to the training-time document length and embed it
        # with the trained word2vec model.
        x_raw = [url]
        sentences, max_document_length = data_helper.padding_sentences(x_raw, '<PADDING>', padding_sentence_length=max_document_length)
        x_test = np.array(word2vec_helpers.embedding_sentences(sentences, file_to_load=self.trained_word2vec_model_file))
        # Run inference with dropout disabled (keep probability 1.0). If the
        # session lives in its own graph, this call may need to be wrapped in
        # self.graph.as_default() / self.sess.as_default().
        result = self.sess.run(self.predictions, {self.input_x: x_test, self.dropout_keep_prob: 1.0})
        result = 'good' if result else 'bad'
        print("Request example: {}, inference result: {}".format(url, result))
        return result
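# Hypothetical usage of the predict() method above. The wrapper class name and
# its constructor are not shown in this snippet, so they are assumptions:
#
#   clf = UrlCnnPredictor()          # hypothetical class exposing predict()
#   label = clf.predict('http://example.com/index.php?id=1')
#   # label is the string 'good' or 'bad'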
#y = y[0:1000]

# Randomly select a subset of the original data (sampling with replacement)
new_x_text = []
new_y = []
for i in range(3000):
    rand_idx = random.randint(0, len(x_text) - 1)  # randint is inclusive on both ends
    new_x_text.append(x_text[rand_idx])
    new_y.append(y[rand_idx])
print("new_x_text length: %d" % len(new_x_text))
print("new_y length: %d" % len(new_y))

# embedding vector
print("Padding sentences...")
sentences, max_document_length = data_helper.padding_sentences(new_x_text, '<PADDING>')

print("embedding_sentences...")
all_vectors = word2vec_helpers.embedding_sentences(sentences, embedding_size = FLAGS.embedding_dim, file_to_save = os.path.join(out_dir, 'trained_word2vec.model'))
print "all_vectors length %d  *  %d  *  %d : " %  (len(all_vectors) , len(all_vectors[0]) , len(all_vectors[0][0]))
# x = np.array(all_vectors)  # this operation could lead to a memory error!

# TODO: transform the large embedding array into a sparse matrix
x = np.asarray(all_vectors)
y = np.asarray(new_y)
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
training_params_file = os.path.join(out_dir, 'training_params.pickle')
params = {'num_labels' : FLAGS.num_labels, 'max_document_length' : max_document_length}
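# The parameter dictionary is written with the matching helper (see Example #5
# below) and read back at inference time with data_helper.loadDict, as in the
# predict() method above:
#
#   data_helper.saveDict(params, training_params_file)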
Example #3
# Build the vocabulary
vocab_tokens = [
    line.strip()
    for line in codecs.open('./runs/vocab', 'r', 'utf-8').readlines()
]
vocsize = len(vocab_tokens)
vocab = {}
for (i, token) in enumerate(vocab_tokens):
    vocab[token] = i
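# Equivalent one-line construction of the token-to-id mapping built above:
#
#   vocab = {token: i for i, token in enumerate(vocab_tokens)}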

# Load the training data and labels
x_text, y = data_helper.load_data_and_labels('./data/train_data/',
                                             './runs/vocab')

# Pad the sentences: arguments are the corpus, the padding token, and the maximum padded length
sentences = data_helper.padding_sentences(x_text, FLAGS.padding_token,
                                          FLAGS.max_sentence_len)

print("len(x_text)", len(x_text))
print("len(y)", len(y))

# Convert the corpus to the corresponding token ids
x = np.array(
    data_helper.sentence2matrix(sentences, FLAGS.max_sentence_len, vocab))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))

x_train = x[shuffle_indices]
y_train = y[shuffle_indices]
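# A dev split is typically carved off after shuffling. A minimal sketch using the
# same FLAGS convention (FLAGS.dev_sample_percentage is an assumption, it is not
# defined in this snippet):
#
#   dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y_train)))
#   x_train, x_dev = x_train[:dev_sample_index], x_train[dev_sample_index:]
#   y_train, y_dev = y_train[:dev_sample_index], y_train[dev_sample_index:]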
Example #4
# Load params
params = data_helper.loadDict(training_params_file)
num_labels = int(params['num_labels'])
max_document_length = int(params['max_document_length'])
# Load data
if FLAGS.eval_train and FLAGS.single_url is None:
    x_raw, y_test = data_helper.load_data_and_labels(FLAGS.input_text_file)
elif FLAGS.single_url is not None:
    x_raw = [FLAGS.single_url]
    y_test = None
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Get the embedding vectors x_test
sentences, max_document_length = data_helper.padding_sentences(
    x_raw, '<PADDING>', padding_sentence_length=max_document_length)
x_test = np.array(
    word2vec_helpers.embedding_sentences(
        sentences, file_to_load=trained_word2vec_model_file))
print("x_test.shape = {}".format(x_test.shape))

# Evaluation
# ==================================================
print("\nEvaluating...\n")
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
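    # The usual TF1 continuation (not shown in this snippet) restores the
    # checkpoint and looks up tensors by name before running inference. The
    # tensor names below are assumptions based on common text-CNN layouts:
    #
    #   with sess.as_default():
    #       saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
    #       saver.restore(sess, checkpoint_file)
    #       input_x = graph.get_operation_by_name("input_x").outputs[0]
    #       dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
    #       predictions = graph.get_operation_by_name("output/predictions").outputs[0]
    #       all_predictions = sess.run(predictions, {input_x: x_test, dropout_keep_prob: 1.0})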
Example #5
def data_preprocess():
    # Data preprocess
    # =======================================================
    # Load data
    print("Loading data...")
    if not os.path.exists(os.path.join(out_dir, "data_x.npy")):
        x, y = data_helper.load_data_and_labels(FLAGS.data_file)
        # Get embedding vector
        x = x[:1000]
        y = y[:1000]
        sentences, max_document_length = data_helper.padding_sentences(
            x, '<PADDING>', padding_sentence_length=FLAGS.sequence_length)
        print(len(sentences[0]))
        if not os.path.exists(os.path.join(out_dir, "trained_word2vec.model")):
            x = np.array(
                word2vec_helpers.embedding_sentences(
                    sentences,
                    embedding_size=FLAGS.embedding_dim,
                    file_to_save=os.path.join(out_dir,
                                              'trained_word2vec.model')))
        else:
            print('w2v model found...')
            x = np.array(
                word2vec_helpers.embedding_sentences(
                    sentences,
                    embedding_size=FLAGS.embedding_dim,
                    file_to_save=os.path.join(out_dir,
                                              'trained_word2vec.model'),
                    file_to_load=os.path.join(out_dir,
                                              'trained_word2vec.model')))
        y = np.array(y)
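        # NOTE: the two np.save calls below are commented out; the cached-load
        # branch further down (np.load of data_x.npy / data_y.npy) only works if
        # they are re-enabled so the arrays are actually written to out_dir.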
        # np.save(os.path.join(out_dir,"data_x.npy"),x)
        # np.save(os.path.join(out_dir,"data_y.npy"),y)
        del sentences
    else:
        print('data found...')
        x = np.load(os.path.join(out_dir, "data_x.npy"))
        y = np.load(os.path.join(out_dir, "data_y.npy"))
    print("x.shape = {}".format(x.shape))
    print("y.shape = {}".format(y.shape))

    # Save params
    if not os.path.exists(os.path.join(out_dir, "training_params.pickle")):
        training_params_file = os.path.join(out_dir, 'training_params.pickle')
        params = {
            'num_labels': FLAGS.num_labels,
            'max_document_length': max_document_length
        }
        data_helper.saveDict(params, training_params_file)

    # Shuffle data randomly
    # np.random.seed(10)
    # shuffle_indices = np.random.permutation(np.arange(len(y)))
    # x_shuffled = x[shuffle_indices]
    # y_shuffled = y[shuffle_indices]
    # del x,y

    # x_train, x_test, y_train, y_test = train_test_split(x_shuffled, y_shuffled, test_size=0.2, random_state=42)  # split into training and testing set 80/20 ratio
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2,
        random_state=42)  # split into training and testing set 80/20 ratio
    del x, y
    return x_train, x_test, y_train, y_test
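# Typical call site for the helper above (a sketch):
#
#   x_train, x_test, y_train, y_test = data_preprocess()
#   print("train/test sizes: {} / {}".format(len(x_train), len(x_test)))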