from tqdm import tqdm

# tokenize, is_exact_match, is_antonyms and is_synonyms are provided by the
# surrounding module.


def worker(dataset):
    """Compute token-level exact-match / antonym / synonym indicator
    features between the two sentences of every example."""
    shared_content = {}
    for example in tqdm(dataset):
        s1_tokenize = tokenize(example['sentence1_binary_parse'])
        s2_tokenize = tokenize(example['sentence2_binary_parse'])

        s1_token_exact_match = [0] * len(s1_tokenize)
        s2_token_exact_match = [0] * len(s2_tokenize)
        s1_token_antonym = [0] * len(s1_tokenize)
        s2_token_antonym = [0] * len(s2_tokenize)
        s1_token_synonym = [0] * len(s1_tokenize)
        s2_token_synonym = [0] * len(s2_tokenize)

        # Mark every token in s1 that matches / opposes / is synonymous with
        # some token in s2, and vice versa.
        for i, word in enumerate(s1_tokenize):
            for j, w2 in enumerate(s2_tokenize):
                if is_exact_match(word, w2):
                    s1_token_exact_match[i] = 1
                    s2_token_exact_match[j] = 1
                if is_antonyms(word, w2):
                    s1_token_antonym[i] = 1
                    s2_token_antonym[j] = 1
                if is_synonyms(word, w2):
                    s1_token_synonym[i] = 1
                    s2_token_synonym[j] = 1

        content = {
            'sentence1_token_exact_match_with_s2': s1_token_exact_match,
            'sentence2_token_exact_match_with_s1': s2_token_exact_match,
            'sentence1_token_antonym_with_s2': s1_token_antonym,
            'sentence2_token_antonym_with_s1': s2_token_antonym,
            'sentence1_token_synonym_with_s2': s1_token_synonym,
            'sentence2_token_synonym_with_s1': s2_token_synonym,
        }
        shared_content[example["pairID"]] = content
    return shared_content
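# --- Usage sketch (illustrative, not from the original repo) ---
# `worker` accepts any slice of the dataset, so the feature extraction can be
# parallelized by sharding the examples and merging the per-shard dicts.
# `num_workers`, the sharding scheme, and `build_shared_content` are
# assumptions for illustration only.
from multiprocessing import Pool


def build_shared_content(dataset, num_workers=4):
    shards = [dataset[i::num_workers] for i in range(num_workers)]
    with Pool(num_workers) as pool:
        results = pool.map(worker, shards)
    shared_content = {}
    for partial in results:
        shared_content.update(partial)  # pairIDs are unique across shards
    return shared_content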
def sample_text(self, session, vocabulary, prime, num_steps=100):
    """Let the model generate a sequence based on a preceding string.

    This method tokenizes the prime string and feeds the tokens to the
    model. Then it feeds the model its own output (disgusting, I know)
    token by token and thus lets it generate / complete the text. With
    num_steps=100 this yields 100 generated characters at the char
    level, or 100 generated tokens (words / punctuation / whitespace)
    at the word level.

    Args:
        session (tf.Session): The TF session to run the operations in.
        vocabulary (tf.dataset.Vocabulary): A vocabulary for tokenizing
            the prime string and translating the output ids back to
            tokens.
        prime (str): A string to prime the model with.
        num_steps (int): The number of tokens generated by the model.

    Returns:
        str: The generated text.
    """
    # Sample from the model: prime -> ids -> sampled ids -> tokens.
    prime_tokens = tokenize(prime, level=vocabulary.level)
    prime_ids = vocabulary.tokens_to_ids(prime_tokens)
    output_ids = self.sample_ids(session, prime_ids, num_steps)
    output_tokens = vocabulary.ids_to_tokens(output_ids)
    return ''.join(output_tokens)
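# --- Usage sketch (illustrative) ---
# Assumes a trained `model` instance of the class defining sample_text and a
# fitted `vocabulary`; both names are assumptions, as is initializing instead
# of restoring a checkpoint. Uses the TF 1.x session API the method expects.
import tensorflow as tf

with tf.Session() as session:
    session.run(tf.global_variables_initializer())  # or restore a checkpoint
    text = model.sample_text(session, vocabulary,
                             prime='The meaning of life is ', num_steps=200)
    print(text)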
import codecs
import os

import dataset


def read_corpus(doc_path, dirs):
    """Read the Livedoor corpus and build a word list per document."""
    documents = []
    categories = []
    for category, dir_name in enumerate(dirs):
        dir_name = os.path.join(doc_path, dir_name)
        for filename in os.listdir(dir_name):
            filename = os.path.join(dir_name, filename)
            # Read all lines of the article, dropping the first two
            # (URL and date), which are not part of the body.
            text = codecs.open(filename, "r", "utf-8").readlines()[2:]
            # Join the remaining lines into a single text.
            text = u"".join(text)
            # Split the text into words.
            words = dataset.tokenize(text)
            documents.append(words)
            categories.append(category)
    return documents, categories
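# --- Usage sketch (illustrative) ---
# The directory names below are two of the standard Livedoor news corpus
# categories; any subset works. The `doc_path` value is an assumption.
documents, categories = read_corpus(
    "data/livedoor", ["dokujo-tsushin", "it-life-hack"])
print(len(documents), "documents,", len(set(categories)), "categories")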
import math
import random

import mxnet as mx

from dataset import load_conversations, dataset_filter, make_vocab, tokenize, pad_sentence
from seq2seq_lstm import Seq2seqLSTM

context = mx.cpu()
num_embed = 128
num_hidden = 1024
num_layers = 2
sequence_length = 32
sample_size = 1024

print("Loading dataset...", flush=True)
dataset = dataset_filter(load_conversations("data/couplets.conv"), sequence_length)
vocab = make_vocab(dataset)
dataset = tokenize(dataset, vocab)

print("Loading model...", flush=True)
model = Seq2seqLSTM(vocab.size(), num_embed, num_hidden, num_layers)
model.load_parameters("model/seq2seq_lstm.params", ctx=context)

print("Evaluating...", flush=True)
ppl = mx.metric.Perplexity(ignore_label=None)
for source, target in random.sample(dataset, sample_size):
    # Pad the source to the nearest bucket length and reverse it, as is
    # common for seq2seq encoders.
    source = pad_sentence(
        source, vocab,
        [2 ** (i + 1) for i in range(int(math.log(sequence_length, 2)))])
    source = mx.nd.reverse(mx.nd.array(source, ctx=context), axis=0)
    label = mx.nd.array(target + [vocab.char2idx("<EOS>")], ctx=context)
    target = mx.nd.array([vocab.char2idx("<GO>")] + target, ctx=context)
    hidden = model.begin_state(func=mx.nd.zeros, batch_size=1, ctx=context)
    output, hidden = model(source.reshape((1, -1)).T,
                           target.reshape((1, -1)).T, hidden)
    probs = mx.nd.softmax(output, axis=1)
    # Accumulate perplexity over the sampled pairs (same pattern as the
    # training script below).
    ppl.update([label], [probs])

print("%s: %f" % ppl.get())
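# Worked example of the bucket lengths used above: for sequence_length = 32
# the comprehension yields the powers of two up to 32, so pad_sentence pads
# each source sentence up to the smallest bucket that fits it.
assert [2 ** (i + 1) for i in range(int(math.log(32, 2)))] == [2, 4, 8, 16, 32]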
import math
import os
import random
import time

import mxnet as mx

from dataset import load_conversations, dataset_filter, make_vocab, tokenize
from seq2seq_lstm import Seq2seqLSTM

# rnn_buckets and rnn_batches are the project's bucketing helpers, defined
# alongside the dataset utilities.


def main(num_embed, num_hidden, num_layers, batch_size, sequence_length,
         context, sgd=False):
    print("Loading dataset...", flush=True)
    dataset = dataset_filter(
        load_conversations("data/xiaohuangji50w_nofenci.conv"),
        sequence_length)
    vocab = make_vocab(dataset)
    vocab.save("data/vocabulary.json")
    dataset = tokenize(dataset, vocab)

    model = Seq2seqLSTM(vocab.size(), num_embed, num_hidden, num_layers)
    loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()

    # Resume from the last checkpoint line if one exists; each line is
    # "<epoch> <best_L> <learning_rate> <epochs_no_progress>".
    if os.path.isfile("model/seq2seq_lstm.ckpt"):
        with open("model/seq2seq_lstm.ckpt", "r") as f:
            ckpt_lines = f.readlines()
        ckpt_argv = ckpt_lines[-1].split()
        epoch = int(ckpt_argv[0])
        best_L = float(ckpt_argv[1])
        learning_rate = float(ckpt_argv[2])
        epochs_no_progress = int(ckpt_argv[3])
        model.load_parameters("model/seq2seq_lstm.params", ctx=context)
    else:
        epoch = 0
        best_L = float("Inf")
        epochs_no_progress = 0
        learning_rate = 0.001
        model.initialize(mx.init.Xavier(), ctx=context)

    print("Learning rate:", learning_rate)
    if sgd:
        print("Optimizer: SGD")
        trainer = mx.gluon.Trainer(model.collect_params(), "SGD", {
            "learning_rate": learning_rate,
            "momentum": 0.5,
            "clip_gradient": 5.0
        })
    else:
        print("Optimizer: Adam")
        trainer = mx.gluon.Trainer(model.collect_params(), "Adam", {
            "learning_rate": learning_rate,
            "clip_gradient": 5.0
        })

    print("Training...", flush=True)
    while learning_rate >= 1e-8:
        random.shuffle(dataset)
        ts = time.time()
        total_L = 0.0
        batch = 0
        ppl = mx.metric.Perplexity(ignore_label=None)
        for bucket, src_len, tgt_len in rnn_buckets(
                dataset,
                [2 ** (i + 1)
                 for i in range(int(math.log(sequence_length, 2)))]):
            for source, target, label in rnn_batches(
                    bucket, vocab, batch_size, src_len, tgt_len, context):
                batch += 1
                hidden = model.begin_state(
                    func=mx.nd.zeros, batch_size=source.shape[1], ctx=context)
                with mx.autograd.record():
                    output, hidden = model(source, target, hidden)
                    L = loss(output, label)
                L.backward()
                trainer.step(source.shape[1])
                batch_L = mx.nd.mean(L).asscalar()
                # NaN check: NaN is the only value not equal to itself.
                if batch_L != batch_L:
                    raise ValueError("loss diverged to NaN")
                total_L += batch_L
                probs = mx.nd.softmax(output, axis=1)
                ppl.update([label], [probs])
                print("[Epoch %d Bucket (%d, %d) Batch %d] batch_loss %.10f "
                      "average_loss %.10f elapsed %.2fs"
                      % (epoch, src_len, tgt_len, batch, batch_L,
                         total_L / batch, time.time() - ts), flush=True)
        epoch += 1
        avg_L = total_L / batch
        print("[Epoch %d] learning_rate %.10f loss %.10f %s %f "
              "epochs_no_progress %d duration %.2fs"
              % (epoch, learning_rate, avg_L, ppl.get()[0], ppl.get()[1],
                 epochs_no_progress, time.time() - ts), flush=True)
        # Save on improvement; after two epochs without progress, halve
        # the learning rate and reset the counter.
        if avg_L < best_L:
            best_L = avg_L
            epochs_no_progress = 0
            model.save_parameters("model/seq2seq_lstm.params")
            with open("model/seq2seq_lstm.ckpt", "a") as f:
                f.write("%d %.10f %.10f %d\n"
                        % (epoch, best_L, learning_rate, epochs_no_progress))
        elif epochs_no_progress < 2:
            epochs_no_progress += 1
        else:
            epochs_no_progress = 0
            learning_rate *= 0.5
            trainer.set_learning_rate(learning_rate)
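# --- Usage sketch (illustrative) ---
# The batch_size below is an assumption; the other hyperparameters match the
# evaluation script above.
if __name__ == '__main__':
    main(num_embed=128, num_hidden=1024, num_layers=2,
         batch_size=64, sequence_length=32, context=mx.cpu(), sgd=False)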
import json

import constants
import dataset


def _get_review_text_from(data):
    # Pull the review text out of one JSON line, then cleanse and tokenize it.
    text = json.loads(data).get(constants.TEXT, None)
    return dataset.tokenize(dataset.cleanse(text))
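# --- Usage sketch (illustrative) ---
# One JSON review line, assuming constants.TEXT == "text" (an assumption;
# check the project's constants module).
line = '{"text": "Great food, terrible service."}'
tokens = _get_review_text_from(line)
print(tokens)  # a list of cleansed tokens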
import dataset

input_seqs, target_seqs, sequences, input_characters, [
    PAD, EOS, StOS
] = dataset.load_sequences('Sequences_alarm_warning_171208.xlsx', 5)
faultclass, y_label, [
    YEncoder, YEncoder_OH
] = dataset.load_faultclasses('Fault_0822.xlsx')
input_seqs, target_seqs, faultclass = dataset.split_sequences_with_labels(
    input_seqs, target_seqs, sequences, 5, 3, PAD, StOS, EOS, y_label)
target_characters = input_characters
encoder_input_data, decoder_input_data, decoder_target_data, [
    num_encoder_tokens, num_decoder_tokens
], [max_encoder_seq_length, max_decoder_seq_length], [
    input_token_index, target_token_index
] = dataset.tokenize(input_seqs, target_seqs, input_characters,
                     target_characters)
resval1, resval2, ovres, [
    output_token_list0, output_token_list1, output_token_list2
], tree_seq, tree_conf, states, seqs = train_sequences_crossval(
    encoder_input_data, decoder_input_data, decoder_target_data,
    input_token_index, target_token_index, input_seqs, target_seqs,
    faultclass, EOS, PAD,
'''
Date: 2021-03-24 20:46:22
LastEditors: ELROY
LastEditTime: 2021-03-24 20:54:59
FilePath: \torch\main.py
'''
import os
import pickle

from tqdm import tqdm  # progress bars

from dataset import tokenize
from word_sequence import word2sequence

if __name__ == '__main__':
    ws = word2sequence()
    path = "IMDB/aclImdb/train"
    temp_data_path = [os.path.join(path, 'pos'), os.path.join(path, 'neg')]
    for data_path in temp_data_path:
        file_paths = [
            os.path.join(data_path, file_name)
            for file_name in os.listdir(data_path)
            if file_name.endswith('txt')
        ]
        for file_path in tqdm(file_paths):
            sentence = tokenize(open(file_path, errors='ignore').read())
            ws.fit(sentence)
    ws.build_vocab(min=10)
    # ws.build_vocab(min=10, max_features=10000)
    pickle.dump(ws, open('./model/ws.pkl', 'wb'))
    print(len(ws))
if __name__ == '__main__':
    """Quick sanity check of word2sequence:
    ws = word2sequence()
    ws.fit(['who', 'i', 'am'])
    ws.fit(['who', 'are', 'you'])
    ws.build_vocab(min=0)
    print(ws.dict)
    ret = ws.transform(['who', 'the', 'f**k', 'you', 'are'], max_len=10)
    print(ret)
    """
    from word_sequence import word2sequence
    import pickle
    import os
    from dataset import tokenize

    ws = word2sequence()
    data_path = "IMDB/aclImdb/train"
    temp_data_path = [
        os.path.join(data_path, 'pos'),
        os.path.join(data_path, 'neg')
    ]
    for data_path in temp_data_path:
        # os.listdir returns a list, so iterate over the file names instead
        # of joining the whole list into one path.
        for file_name in os.listdir(data_path):
            file_path = os.path.join(data_path, file_name)
            sentence = tokenize(open(file_path).read())
            ws.fit(sentence)
    ws.build_vocab(min=10)
    # Open the pickle file for writing ('wb'), not reading ('rb').
    pickle.dump(ws, open('./model/ws.pkl', 'wb'))
    print(len(ws))
import os

from tqdm import tqdm

# tokenize comes from the project's dataset module, as in the sibling scripts.
from dataset import tokenize

if __name__ == '__main__':
    from word_to_sequence import WordToSequence
    # Imported here to work around a pickle quirk...
    import pickle

    ws = WordToSequence()
    path = './dataset/aclImdb/train'
    temp_data_path = [os.path.join(path, 'pos'), os.path.join(path, 'neg')]
    for data_path in temp_data_path:
        file_paths = [
            os.path.join(data_path, file_name)
            for file_name in os.listdir(data_path)
            if file_name.endswith('.txt')
        ]
        for file_path in tqdm(file_paths):
            sentences = tokenize(open(file_path, encoding='utf-8').read())
            ws.count_word_frequence(sentences)
    ws.build_vocab(min_count=10, num_word=10000)
    if not os.path.exists('./model'):
        os.mkdir('./model')
    pickle.dump(ws, open("./model/ws.pkl", "wb"))
    print('word sequence length:', len(ws))
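# --- Usage sketch (illustrative) ---
# Loading the pickled vocabulary back and encoding one review. `transform`
# and `max_len` follow the commented-out demo in the sibling script above and
# may differ in the actual WordToSequence implementation.
import pickle

from dataset import tokenize

ws = pickle.load(open('./model/ws.pkl', 'rb'))
ids = ws.transform(tokenize("This movie was surprisingly good."), max_len=100)
print(ids)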
import argparse

from sklearn.model_selection import train_test_split

import dataset
import model
import pretrained_vectors
import save_embeddings
import train


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch-size', default=50, type=int)
    parser.add_argument('--dropout', default=0.5, type=float)
    parser.add_argument('--epoch', default=20, type=int)
    parser.add_argument('--learning-rate', default=0.1, type=float)
    parser.add_argument("--mode", default="non-static",
                        help="available models: rand, static, non-static")
    parser.add_argument('--num-feature-maps', default=100, type=int)
    parser.add_argument("--pretrained-word-vectors", default="fasttext",
                        help="available models: fasttext, Word2Vec")
    parser.add_argument("--save-word-vectors", action='store_true',
                        default=False, help='save trained word vectors')
    parser.add_argument("--predict", action='store_true', default=False,
                        help='classify your sentence')
    args = parser.parse_args()

    # Load data.
    print("Load data...\n")
    texts, labels = dataset.load_data()
    print("Tokenizing...\n")
    tokenized_texts, word2idx, max_len = dataset.tokenize(texts)
    input_ids = dataset.encode(tokenized_texts, word2idx, max_len)
    train_inputs, val_inputs, train_labels, val_labels = train_test_split(
        input_ids, labels, test_size=0.1, random_state=42)

    print("Creating Dataloader...\n")
    train_dataloader, val_dataloader = dataset.data_loader(
        train_inputs, val_inputs, train_labels, val_labels,
        batch_size=args.batch_size)

    if args.mode == 'rand':
        # CNN-rand: word vectors are randomly initialized.
        train.set_seed(42)
        cnn_model, optimizer = model.initilize_model(
            vocab_size=len(word2idx),
            embed_dim=300,
            learning_rate=args.learning_rate,
            dropout=args.dropout)
        train.train(cnn_model, optimizer, train_dataloader, val_dataloader,
                    epochs=args.epoch)
    elif args.mode == 'static':
        # CNN-static: fastText pretrained word vectors are used and frozen
        # during training.
        train.set_seed(42)
        embeddings = pretrained_vectors.get_embeddings(
            word2idx, args.pretrained_word_vectors)
        cnn_model, optimizer = model.initilize_model(
            pretrained_embedding=embeddings,
            freeze_embedding=True,
            learning_rate=args.learning_rate,
            dropout=args.dropout)
        train.train(cnn_model, optimizer, train_dataloader, val_dataloader,
                    epochs=args.epoch)
    else:
        # CNN-non-static: fastText pretrained word vectors are fine-tuned
        # during training.
        train.set_seed(42)
        embeddings = pretrained_vectors.get_embeddings(
            word2idx, args.pretrained_word_vectors)
        cnn_model, optimizer = model.initilize_model(
            pretrained_embedding=embeddings,
            freeze_embedding=False,
            learning_rate=args.learning_rate,
            dropout=args.dropout)
        train.train(cnn_model, optimizer, train_dataloader, val_dataloader,
                    epochs=args.epoch)

    if args.save_word_vectors:
        save_embeddings.write_embeddings(
            'trained_embeddings_{}.txt'.format(args.mode),
            cnn_model.embedding.weight.data, word2idx)

    if args.predict:
        x = input('Enter an English sentence! : ')
        train.predict(x, cnn_model, word2idx)
        while True:
            conti = input('Continue? (y/n) : ')
            if conti == 'y':
                x1 = input('Enter an English sentence! : ')
                train.predict(x1, cnn_model, word2idx)
            else:
                break
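# --- Usage sketch (illustrative) ---
# Typical invocations, assuming this file is saved as main.py (the flag names
# come from the argparse definitions above):
#
#   python main.py --mode rand --epoch 20 --batch-size 50
#   python main.py --mode static --pretrained-word-vectors fasttext
#   python main.py --mode non-static --save-word-vectors --predict
if __name__ == '__main__':
    main()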