import numpy as np
from tensorflow.contrib import learn

# load_data_and_labels is assumed to be defined or imported elsewhere in this module.


def preprocess(data_dirs, document_length_limit, is_line_as_word,
               dev_sample_percentage):
    x_text, y = load_data_and_labels(data_dirs, document_length_limit,
                                     is_line_as_word)

    # Build the vocabulary, capping document length at the configured limit.
    max_document_length = max([len(text.split(" ")) for text in x_text])
    print("max_document_length: {}".format(max_document_length))
    max_document_length = min(document_length_limit, max_document_length)
    print("max_document_length (capped): {}".format(max_document_length))
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    # Map each document to a fixed-length vector of word indices.
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    # Shuffle with a fixed seed so the split is reproducible.
    np.random.seed(100)
    shuffle_indices = np.random.permutation(np.arange(len(x_text)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/dev set: the last dev_sample_percentage of the data
    # becomes the dev set.
    dev_sample_index = -1 * int(dev_sample_percentage * len(y))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    del x, y, x_shuffled, y_shuffled

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, vocab_processor, x_dev, y_dev
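# Example call (a sketch; the directory list and hyperparameter values below
# are illustrative assumptions, not values taken from this repo):
#
#     x_train, y_train, vocab_processor, x_dev, y_dev = preprocess(
#         ["data/train/"], document_length_limit=1000,
#         is_line_as_word=False, dev_sample_percentage=0.1)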
def getVocabulary(data_dirs, document_length_limit, is_line_as_word,
                  dev_sample_percentage):
    # dev_sample_percentage is accepted for interface parity with
    # preprocess() but is unused here.
    x_text, y = load_data_and_labels(data_dirs, document_length_limit,
                                     is_line_as_word)

    # Build the vocabulary, capping document length at the configured limit.
    max_document_length = max([len(text.split(" ")) for text in x_text])
    print("max_document_length: {}".format(max_document_length))
    max_document_length = min(document_length_limit, max_document_length)
    print("max_document_length (capped): {}".format(max_document_length))
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    # Each document flattened into a 1-D vector of word indices.
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    return x, y, vocab_processor
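# Example call (a sketch with assumed arguments, mirroring preprocess()):
#
#     x, y, vocab_processor = getVocabulary(
#         ["data/train/"], document_length_limit=1000,
#         is_line_as_word=False, dev_sample_percentage=0.1)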
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from sklearn.metrics import f1_score

import data
import model_gru
from Arg import args

# Fix the random seeds for reproducibility.
torch.manual_seed(1)
random.seed(1)

# Load everything the project needs from the data module
# (see the data module for details).
(train_sentence, valid_sentence, test_sentence, word_dict, word2ix, ix2word,
 word_matrix) = data.load_data_and_labels()

args = args()


# Training procedure: one call runs over all training samples.
def trainIters(SiaNetwork, train_sentence, criterion1, batch_size=2000,
               learning_rate=0.005):
    SiaNetwork.train()
    # Optimize only parameters that require gradients, so frozen layers
    # (e.g. pretrained embeddings) are skipped.
    SiaNetwork_optimizer = optim.Adam(
        filter(lambda p: p.requires_grad, SiaNetwork.parameters()),
        lr=learning_rate)
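# Example call (a sketch; the model constructor and loss below are
# hypothetical placeholders based on the imports above, not code from
# this file):
#
#     net = model_gru.SiameseGRU(word_matrix)  # hypothetical constructor
#     criterion = nn.BCELoss()
#     trainIters(net, train_sentence, criterion, batch_size=2000,
#                learning_rate=0.005)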
import numpy as np
import tensorflow as tf
from tensorflow.contrib import learn

import data


def compare_similar_words(n, word, vocab, embeddings_arr):
    """Print the n most similar words to `word` under each embedding matrix.

    n: number of most similar words to compare.
    word: word to be compared.
    vocab: vocabulary dictionary used to look up the index of `word`.
    embeddings_arr: array of word-embedding matrices to compare.
    """
    idx = vocab.get(word)
    for i in range(len(embeddings_arr)):
        print("%d:" % (i))
        # `vocabulary` is the module-level word list built below.
        print_most_similar(n, idx, embeddings_arr[i], vocabulary)


# Load data
print("Loading data...")
x_text, y = data.load_data_and_labels("data/rt-polarity.pos",
                                      "data/rt-polarity.neg")

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Extract vocabulary from vocab_processor, sorted by word index.
vocab_dict = vocab_processor.vocabulary_._mapping
sorted_vocab = sorted(vocab_dict.items(), key=lambda x: x[1])
vocabulary = list(list(zip(*sorted_vocab))[0])  # list of words in vocabulary

# Restore model v1.1
sess = tf.Session()
saver = tf.train.import_meta_graph(
    "runs/v1/1507798871/checkpoints/model-7100.meta")
saver.restore(sess,
              tf.train.latest_checkpoint("runs/v1/1507798871/checkpoints"))
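# `print_most_similar` is referenced above but not defined in this snippet.
# A minimal sketch of such a helper, assuming `embeddings` is a
# (vocab_size, dim) NumPy array and cosine similarity is the intended metric:
def print_most_similar(n, idx, embeddings, vocabulary):
    query = embeddings[idx]
    # Cosine similarity of every word vector against the query vector.
    norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query)
    sims = embeddings.dot(query) / np.maximum(norms, 1e-8)
    # The query word itself ranks first; skip it and take the next n.
    for j in np.argsort(-sims)[1:n + 1]:
        print("  {}\t{:.4f}".format(vocabulary[j], sims[j]))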
import numpy as np

import data

# Parameters
# ==================================================

# Data params
test_sample_percentage = 0.1  # fraction of the data held out as the test set
positive_data_file = "data/rt-polarity.pos"
negative_data_file = "data/rt-polarity.neg"

# Data Preparation
# ==================================================

# Load data
x, y = data.load_data_and_labels(positive_data_file, negative_data_file)

# Randomly shuffle data with a fixed seed so the split is reproducible.
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set: the last test_sample_percentage of the data
# becomes the test set.
test_sample_index = -1 * int(test_sample_percentage * float(len(y)))
x_train, x_test = x_shuffled[:test_sample_index], x_shuffled[test_sample_index:]
y_train, y_test = y_shuffled[:test_sample_index], y_shuffled[test_sample_index:]

# Transform targets from arrays to labels
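# A minimal sketch of the step announced above, assuming `y` holds one-hot
# rows (e.g. [0, 1] / [1, 0]) as load_data_and_labels typically returns for
# this dataset: argmax converts each row to an integer class label.
y_train = np.argmax(y_train, axis=1)
y_test = np.argmax(y_test, axis=1)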