def __init__(self, data_path, vocabulary_size, embedding_size, learning_rate=1.0):
    self.corpus = read_own_data(data_path)
    self.data, self.word_count, self.word2index, self.index2word = build_dataset(
        self.corpus, vocabulary_size)
    self.vocabs = list(set(self.data))
    self.model: SkipGramNeg = SkipGramNeg(vocabulary_size, embedding_size).cuda()
    self.model_optim = SGD(self.model.parameters(), lr=learning_rate)
def dump_train_data(word_dict, train_data_path=train_data_path, new_train_data_path=new_train_data_path):
    print('building dataset...')
    train_data = build_dataset(train_data_path, word_dict)
    print('building dataset2...')
    train_data = map(lambda x: " ".join(map(lambda y: str(y), x)), train_data)
    print('dumping train...')
    rows = []
    for i, d in enumerate(train_data):
        # flush accumulated rows to disk every 10000 examples
        if i % 10000 == 0 and i != 0:
            with open(new_train_data_path, 'a') as f:
                f.write("".join(rows))
            rows = []
        rows.append("%s\n" % d)
    if len(rows) != 0:
        with open(new_train_data_path, 'a') as f:
            f.write("".join(rows))
def __init__(self, path, vocab_size=200, n_review=1000, embedding_size=50, learning_rate=0.1):
    self.corpus = read_own_data(path)
    self.corpus = self.corpus[:n_review]
    self.data, self.word_count, self.word2index, self.index2word = build_dataset(
        self.corpus, vocab_size)
    self.vocabs = list(set(self.data))
    # self.model = SkipGramNeg(vocab_size, embedding_size).cuda()
    self.model = SkipGramNeg(vocab_size, embedding_size)
    self.model_optim = optim.SGD(self.model.parameters(), lr=learning_rate)
    print("Number of tokens:", len(self.data))
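# Hedged sketch (not taken from the snippets above): one plausible shape for a
# SkipGramNeg module trained with the negative-sampling objective that the two
# __init__ fragments above instantiate. The real class may expose a different
# interface; this is only meant to show the loss the trainers optimize.
import torch
import torch.nn as nn
import torch.nn.functional as F


class SkipGramNegSketch(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, embedding_size)   # center-word vectors
        self.out_embed = nn.Embedding(vocab_size, embedding_size)  # context-word vectors

    def forward(self, center, context, negatives):
        # center: (B,), context: (B,), negatives: (B, K) word indices
        v = self.in_embed(center)                                  # (B, D)
        u_pos = self.out_embed(context)                            # (B, D)
        u_neg = self.out_embed(negatives)                          # (B, K, D)
        pos_score = torch.sum(v * u_pos, dim=1)                    # (B,)
        neg_score = torch.bmm(u_neg, v.unsqueeze(2)).squeeze(2)    # (B, K)
        # negative sampling: maximize log sigma(pos) + sum_k log sigma(-neg_k)
        loss = -(F.logsigmoid(pos_score) + F.logsigmoid(-neg_score).sum(dim=1)).mean()
        return loss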
parser.add_argument("--decay_rate", type=float, default=0.98, help="Decay learning rate every n steps") parser.add_argument("--decay_steps", type=int, default=10000, help="Decay learning rate every n steps") parser.add_argument("--batch_size", type=int, default=64, help="batch size.") parser.add_argument("--num_epochs", type=int, default=100, help="number of epochs.") parser.add_argument("--shuffle", type=bool, default=False, help="if shuffle the dataset?") parser.add_argument("--model_dir", type=str, default="save", help="model params dir") parser.add_argument("--check_steps", type=int, default=300, help="every n steps check one time") args = parser.parse_args() word_dict_file = "data/words_char.json" word_dict = get_word_dict(word_dict_file) # file not yet transformed to one hot data = map_get_words("test.txt") data = list(build_dataset(data,word_dict,True)) scores = inference(data, len(word_dict), args) # file already transformed to one hot #inference_file = "data/valid_char1.txt" #scores = inference(inference_file, len(word_dict), args) #data = get_dataset(inference_file) data = list(map(lambda x:x[1:(np.sum(np.sign(x))-1)],data)) mean_log_scores = [] for d,s in zip(data,scores): mean_log_score = np.mean([-np.log(s[i][n]) for i,n in enumerate(d)]) mean_log_scores.append(mean_log_score) print("mean_log_score: ",np.mean(mean_log_scores))
def dump_valid_data(word_dict, valid_data_path=valid_data_path, new_valid_data_path=new_valid_data_path):
    valid_data = build_dataset(valid_data_path, word_dict)
    valid_data = list(map(lambda x: " ".join(map(lambda y: str(y), x)), valid_data))
    print('dumping valid...')
    with open(new_valid_data_path, 'w') as f:
        f.write("\n".join(valid_data))
model_path = os.path.join(args.model_dir, "model_{}.model".format(args.model))
if args.model == "FastText":
    import fasttext
    from model_utils_fasttext import test_model

    logger.info("Loading model from {}".format(model_path))
    model = fasttext.load_model(model_path)
    logger.info("Testing model...")
    y_pred = test_model(model, X_test=test_df["text"].tolist())
else:
    from data_utils import build_dataset, build_iterator
    from model_utils import test_model

    test_df["text_words"] = test_df["text"].apply(lambda x: x.split())
    vocab2id = pkl.load(
        open(os.path.join(args.model_dir, "vocab_{}.vocab".format(args.model)), "rb"))
    data = build_dataset(test_df["text_words"], vocab2id, max_doc_len=config.max_doc_len)
    data_iter = build_iterator(data, config.batch_size, device)
    logger.info("Loading model from {}".format(model_path))
    model = torch.load(model_path)
    logger.info("Testing model...")
    y_pred = test_model(model, data_iter)

test_df["label"] = y_pred
test_df["label"].to_csv("predict_{}.csv".format(args.model), index=False, header=["label"])
logger.info("Finished predicting labels for testing data.")
help="Checkpoint dir for saved model.") parser.add_argument("--batch_size", type=int, default=24, help="Batch size.") parser = argparse.ArgumentParser() add_arguments(parser) args = parser.parse_args() print("Loading dictionary...") word_dict, reversed_dict, document_max_len = build_dict(args.test_tsv, is_train=False) print("Building test dataset...") test_x, test_y = build_dataset(args.test_tsv, word_dict, document_max_len) checkpoint_file = tf.train.latest_checkpoint(args.checkpoint_dir) # File for saving the predicted values nameHandle = open('y_pred.txt', 'w') graph = tf.Graph() with graph.as_default(): with tf.Session() as sess: saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file)) saver.restore(sess, checkpoint_file) x = graph.get_operation_by_name("x").outputs[0] y = graph.get_operation_by_name("y").outputs[0]
# split document to words
logger.info("Split words...")
train_df["text_words"] = train_df["text"].apply(lambda x: x.split())
X_train, y_train = train_df["text_words"], train_df["label"]
train_df.drop(columns=["text"], inplace=True)

logger.info("Building dataset...")
vocab2id = build_vocab(docs=X_train, min_count=config.min_count)
pkl.dump(
    vocab2id,
    open(os.path.join(args.model_dir, "vocab_{}.vocab".format(args.model)), "wb"))
train_data = build_dataset(X_train, vocab2id, max_doc_len=config.max_doc_len)
train_df.drop(columns=["text_words"], inplace=True)

logger.info("Loading embeddings...")
embeddings = load_embeddings(args.embedding_path, vocab2id)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

if args.nsplits > 1:
    SKF = StratifiedKFold(n_splits=args.nsplits, shuffle=True)
    for fold_idx, (train_idx, val_idx) in enumerate(SKF.split(X_train, y_train)):
        logger.info("*" * 20 + "Training fold {}...".format(fold_idx))
                    default=1e-3, help="learning rate.")
parser.add_argument("--batch_size", type=int, default=64, help="batch size.")
parser.add_argument("--num_epochs", type=int, default=10, help="number of epochs.")
parser.add_argument("--max_document_len", type=int, default=100, help="max document length.")
args = parser.parse_args()

if not os.path.exists("dbpedia_csv"):
    print("Downloading dbpedia dataset...")
    download_dbpedia()

print("\nBuilding dictionary..")
word_dict = build_word_dict()
print("Preprocessing dataset..")
train_x, train_lm_y, train_clf_y = build_dataset("train", word_dict, args.max_document_len)
test_x, test_lm_y, test_clf_y = build_dataset("test", word_dict, args.max_document_len)
train(train_x, train_lm_y, train_clf_y, test_x, test_lm_y, test_clf_y, word_dict, args)
import math
import os

import numpy as np
import tensorflow as tf

from data_utils import maybe_download, read_data, build_dataset, tsne_and_plot, generate_batch

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # suppress tensorflow warnings

# Step 1: Download the data (maybe).
filename = maybe_download('text8.zip', 31344016)

# Step 2: Read and build the dictionary and replace rare words with UNK token.
vocabulary = read_data(filename)
print('Data size', len(vocabulary))
vocabulary_size = 50000
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, vocabulary_size)
del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

# Step 3: get batch data generator
data_index = 0
sample_batch, sample_labels, data_index = generate_batch(data, data_index, batch_sz=8, n_skips=2, skip_wd=1)
for i in range(8):
    print(sample_batch[i], reverse_dictionary[sample_batch[i]], '->',
          sample_labels[i, 0], reverse_dictionary[sample_labels[i, 0]])

# Step 4: Build a skip-gram model.
batch_size = 128
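# Hedged sketch of how Step 4 might continue, following the standard TF1
# skip-gram recipe (embedding lookup + NCE loss). Names and hyperparameters
# below are illustrative assumptions, not taken from the original file.
embedding_size = 128
num_sampled = 64  # negative samples per batch

train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

nce_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                        stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# NCE loss draws `num_sampled` negative classes per example
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
                   labels=train_labels, inputs=embed,
                   num_sampled=num_sampled, num_classes=vocabulary_size))
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)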
            if word in dictionary:
                encoded_word = dictionary[word]
            else:
                encoded_word = dictionary['UNK']
            encoded_sentence += str(encoded_word) + " "
        print(sentence)
        encoded_sentence = encoded_sentence[:-1]  # Remove final space
        f.write(encoded_sentence + '\n')  # Write sentence to file
    f.close()


# Generate dictionary for dataset
tokenized_data = data_utils.read_data(num_movie_scripts)
print('-------- tokenized_data')
print(tokenized_data[:10])
data, count, dictionary, reverse_dictionary = data_utils.build_dataset(
    tokenized_data, vocabulary_size)
print('-------- data')
print(data)
print('-------- count')
print(count)
print('-------- dictionary')
data_utils.print_dic(dictionary, 5)
print(dictionary)
print('-------- reverse_dictionary')
data_utils.print_dic(reverse_dictionary, 5)
print(reverse_dictionary)
print('-------- generateEncodedFile')
tokenized_sentences = data_utils.read_sentences(num_movie_scripts)

# Generate file
generateEncodedFile('X_train_for_3_scripts', tokenized_sentences)
default="saved_model", help="Checkpoint directory.") parser = argparse.ArgumentParser() add_arguments(parser) args = parser.parse_args() num_class = 3 if not os.path.exists(args.checkpoint_dir): os.mkdir(args.checkpoint_dir) print("Building dictionary...") word_dict, reversed_dict, document_max_len = build_dict(args.train_tsv) print("Building dataset...") x, y = build_dataset(args.train_tsv, word_dict, document_max_len) # Split to train and validation data train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.10, random_state=42) #train_x, train_y = build_dataset(args.train_tsv, word_dict, document_max_len) #print("Building validation dictionary...") # #valid_tsv = 'data/lstm_single/africell_calls/dev_data.tsv' #word_dict_valid, reversed_dict, document_max_len_valid = build_dict(valid_tsv) #print("Building validation dataset...") #valid_x, valid_y = build_dataset(valid_tsv, word_dict_valid, document_max_len_valid)
parser.add_argument('--batch_size', type=int, default=8, help='batch_size')

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
args = parser.parse_args()

if __name__ == '__main__':
    dataset = 'text_emotion'  # dataset name
    model_name = args.model  # bert
    x = import_module('models.' + model_name)
    config = x.Config(dataset)

    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    torch.backends.cudnn.deterministic = True  # make results reproducible across runs
    if torch.cuda.device_count() > 0:
        torch.cuda.manual_seed_all(1)

    start_time = time.time()
    print("Loading data...")
    train_iter, dev_iter = build_dataset(config, args)
    # train_iter = build_iterator(train_data, config)
    # dev_iter = build_iterator(dev_data, config)
    # test_iter = build_iterator(test_data, config)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    model = x.Model(config).to(config.device)
    train(config, model, train_iter, dev_iter, None)
import pickle

import tensorflow as tf

from model import Model
from data_utils import build_dict, build_dataset, batch_iter
from train import hyper_params_path, word2index_path, seq2seq_model_dir

with open(hyper_params_path, "rb") as f:
    args = pickle.load(f)

print("Loading dictionary...")
word_dict, reversed_dict, article_list, _ = build_dict(word2index_path=word2index_path)
print("Loading validation dataset...")
valid_x = build_dataset(word_dict, article_list, args.article_max_len)

with tf.Session() as sess:
    print("Loading saved model...")
    model = Model(word_dict, args, train=False)
    saver = tf.train.Saver(tf.global_variables())
    ckpt = tf.train.get_checkpoint_state(seq2seq_model_dir)
    saver.restore(sess, ckpt.model_checkpoint_path)

    batches = batch_iter(valid_x, [0] * len(valid_x), args.batch_size, 1)

    print("Writing summaries to 'result.txt'...")
    for batch_x, _ in batches:
        batch_x_len = [len([y for y in x if y != 0]) for x in batch_x]
        valid_feed_dict = {
            model.batch_size: len(batch_x),
            model.X: batch_x,
    os.mkdir(seq2seq_model_dir)
else:
    if args.with_model:
        pre_model_checkpoint = open(seq2seq_model_dir + 'checkpoint', 'r')
        pre_model_checkpoint = "".join([
            seq2seq_model_dir,
            pre_model_checkpoint.read().splitlines()[0].split('"')[1]
        ])

print("Building dictionary...")
word_dict, reversed_dict, article_list, headline_list = build_dict(
    train=True, word2index_path=word2index_path)
print("Loading training dataset...")
train_x, train_y = build_dataset(word_dict,
                                 article_list,
                                 args.article_max_len,
                                 headline_list=headline_list,
                                 headline_max_len=args.headline_max_len,
                                 train=True)

with tf.Session() as sess:
    model = Model(word_dict, args)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables())
    if 'pre_model_checkpoint' in globals():
        print("Continuing training from pre-trained model:", pre_model_checkpoint, "......")
        saver.restore(sess, pre_model_checkpoint)

    batches = batch_iter(train_x, train_y, args.batch_size, args.num_epochs)
    num_batches_per_epoch = (len(train_x) - 1) // args.batch_size + 1