def test_init(self):
    name = 'test'
    dict1 = {'hi': '1101',
             'goodbye': '1000029',
             'au revoir': 'shit'}
    v0 = vocab.Vocab(name, dict1)
    self.assertEqual(v0.get_name(), 'test')
    list0 = v0.get_vocab_list()
    self.assertIsInstance(v0.get_name(), str)
    self.assertIsInstance(list0, dict)
    self.assertIn('hi', list0)
    self.assertEqual(list0['hi'], '1101')
    self.assertIn('goodbye', list0)
    self.assertEqual(list0['goodbye'], '1000029')
    self.assertIn('au revoir', list0)
    self.assertEqual(list0['au revoir'], 'shit')

    v_blank = vocab.Vocab()
    self.assertEqual(v_blank.get_name(), 'default name')
    self.assertEqual(v_blank.get_vocab_list(), {})
    self.assertIsInstance(v_blank.get_name(), str)
    self.assertIsInstance(v_blank.get_vocab_list(), dict)
def __init__(self, opts, debug=False):
    """Pass in program options for now."""
    self.opts = opts
    self.source_vocab = vocab.Vocab()
    self.target_vocab = vocab.Vocab()
    self.me = maxent.MaxentModel()
    self.m1_probs = {}
    self.lm = None
    self.dictionary = {}
    self.feature_functions = []
    self.debug = debug
def load_model(model_fn):
    print('Building model and optimizer...')
    epoch = 0
    voc = vocab.Vocab('data/pretrained_embedding/pretrained_embedding_5M.vec')
    model = BiLSTMCrf(voc, const.CHARACTER_LIST, character_embedding_dim,
                      character_hidden_dim, context_hidden_dim, tag_list)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
    if model_fn is not None:
        checkpoint = torch.load(model_fn)
        epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
    model = model.to(const.DEVICE)
    # Move any optimizer state tensors onto the target device as well.
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(const.DEVICE)
    return model, optimizer, epoch + 1
def parseFromLocal(self, file):
    vocabId = 0
    prevNode = None
    firstNode = None
    with open(file, "r") as f:
        for i in f:
            line = i.split(",")
            regexCheck = re.search(r"(Text)|(Unit_\d)|(^\s*$)", line[0])
            curNode = None
            if regexCheck:
                unitCheck = re.search(r"Unit_\d", line[0])
                if unitCheck:
                    unitNum = unitCheck.string[5:]
                    curNode = UnitNode(int(unitNum))
            else:
                curNode = vocab.Vocab(line[0], line[1], line[2], line[3],
                                      line[4].rstrip())
                # give vocab an ID
                curNode.setId(vocabId)
                self.defPool.setdefault(vocabId, line[1])
                vocabId += 1
            if curNode:
                if not prevNode:
                    firstNode = curNode
                else:
                    curNode.setPrev(prevNode)
                    prevNode.setNext(curNode)
                prevNode = curNode
    return firstNode
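# Hedged usage sketch for parseFromLocal. The parsing above implies each CSV row
# is either a unit header like "Unit_3", a "Text" header or blank row, or a
# five-field vocab row: text,definition,root,errorCount,importancy. The file
# name and the getNext() accessor below are assumptions; only setNext()/setPrev()
# appear in the snippet.
#
#   first = parser.parseFromLocal("unit_words.csv")
#   node = first
#   while node:                  # walk the linked list the parser builds
#       node = node.getNext()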
def parseFromCloud(self):
    fetchedData = GoogleSheetConnect.fetchEverySheetData()
    vocabId = 0
    prevNode = None
    firstNode = None
    for i in fetchedData[1:]:
        curNode = None
        if len(i) != 0:
            unitCheck = re.search(r"Unit_\d", i[0])
            if unitCheck:
                unitNum = unitCheck.string[5:]
                curNode = vocab.UnitNode(int(unitNum))
            else:
                # pad short rows so all five fields are present
                while len(i) < 5:
                    i.append("")
                text = i[0]
                definition = i[1]
                root = i[2]
                errorCount = int(i[3])
                importancy = i[4]
                curNode = vocab.Vocab(text, definition, root, errorCount,
                                      importancy)
                # give vocab an ID
                curNode.setId(vocabId)
                self.defPool.setdefault(vocabId, i[1])
                vocabId += 1
        if curNode:
            if not prevNode:
                firstNode = curNode
            else:
                curNode.setPrev(prevNode)
                prevNode.setNext(curNode)
            prevNode = curNode
    return firstNode
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-input_file', required=True)
    parser.add_argument('-output_file', required=True)
    parser.add_argument('-max_len', '--max_seq_len', type=int, default=64)
    parser.add_argument('-vocab', type=str,
                        default="model/multilingual/vocab.txt")
    parser.add_argument('-checkoov', action='store_true')
    opt = parser.parse_args()

    multi_language_vocab = vocab.Vocab(opt.vocab)
    if opt.checkoov:
        check_examples(opt.input_file, opt.max_seq_len, multi_language_vocab)
    src_lists, tgt_lists = read_examples(opt.input_file, opt.max_seq_len,
                                         multi_language_vocab)

    data = {'settings': opt, 'data': {'src': src_lists, 'tgt': tgt_lists}}
    logger.info('Dumping the processed data to file {}'.format(opt.output_file))
    torch.save(data, opt.output_file)
    logger.info('Finish.')
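# A minimal invocation sketch for the preprocessor above. The script name and
# data paths are hypothetical; only the flags come from the argparse
# definitions:
#
#   python preprocess.py -input_file data/train.txt -output_file data/train.atok \
#       -max_len 64 -vocab model/multilingual/vocab.txt -checkoov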
def test_get_name(self):
    name = 'test'
    dict1 = {'hi': '1101',
             'goodbye': '1000029',
             'au revoir': 'shit'}
    v = vocab.Vocab(name, dict1)
    self.assertIsInstance(v.get_name(), str)
    self.assertEqual(v.get_name(), 'test')
def train():
    # create the vocabulary file first and then submit the job
    input_data = json.loads(request.data)
    files_to_exclude = ",".join(input_data.get("files_to_exclude"))
    v = vocab.Vocab("gs://text-summarization-webapp.appspot.com/data/data",
                    "data/vocab", files_to_exclude)
    v.create_vocab_file()
    os.system("sudo sh submit_training_job.sh {} {}".format(
        str(input_data.get("input")), files_to_exclude))
    return json.dumps({"responseText": "Submitted training job"})
def test_set_vocab_list(self):
    name = 'test'
    dict1 = {'hi': '1101',
             'goodbye': '1000029',
             'au revoir': 'shit'}
    v = vocab.Vocab(name, dict1)
    self.assertEqual(v.get_vocab_list(), dict1)
    new_list = {'hi': '1101',
                'goodbye': '1000029',
                'au revoir': 'shit'}
    v.set_vocab_list(new_list)
    self.assertEqual(v.get_vocab_list(), new_list)
    self.assertEqual(v.get_name(), 'test')
def test_get_vocab_list(self):
    name = 'test'
    dict1 = {'hi': '1101',
             'goodbye': '1000029',
             'au revoir': 'shit'}
    v = vocab.Vocab(name, dict1)
    self.assertIsInstance(v.get_vocab_list(), dict)
    self.assertEqual(v.get_vocab_list(), dict1)
def __init__(self, corpus, freq):
    """Load data from the corpus.

    Parameters:
        corpus: path to the whole dataset in txt format
        freq:   minimum frequency for a word to be included in the vocab
    """
    super(EmbedGlove, self).__init__()
    vc = Counter()
    with open(corpus, 'r') as f:
        for l in f:
            vc.update(l.split())
    self.vcb = vocab.Vocab(vc, wv_type="glove.840B", min_freq=freq,
                           specials=["EOS", "SOS"])
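# A minimal usage sketch, assuming a whitespace-tokenized corpus file (the path
# is hypothetical) and the torchtext-style Vocab used above:
#
#   emb = EmbedGlove('data/corpus.txt', freq=5)
#   print(len(emb.vcb.itos))   # vocabulary size including the EOS/SOS specials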
def __init__(self, opts, debug=False):
    """Pass in program options for now."""
    # This class will directly read certain program options:
    # TODO: list them
    self.opts = opts
    # Vocab is a mapping from string to integer used for both the source and
    # target.
    self.vocab = vocab.Vocab()
    # p(s|t) and p(t|s): lexical probabilities, stored as dicts from
    # (int, int) tuples to floats - the integers are vocab indices.
    self.pst = {}
    self.pts = {}
    self.feature_functions = []
    self.debug = debug
def write_wv_to_file(load_path, vocab_path, output_path, size):
    print('writing wv file to %s' % output_path)
    vc, ic = torch.load(load_path)
    voc = vocab.Vocab(vocab_path)
    with codecs.open(output_path, 'w', 'utf8') as fout:
        fout.write('%d %d\n' % (len(voc), size))
        for v, i in zip(vc, ic):
            v = v.cpu().numpy()
            i = i.cpu().item()
            fout.write(voc[i] + ' ')
            for vi in v:
                fout.write('%.4f ' % vi)
            fout.write('\n')
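# The function above emits a word2vec-style text file: a "count dim" header
# line, then one "token v1 ... vdim" line per vector. The paths in this call
# are hypothetical; only the signature comes from the definition above.
#
#   write_wv_to_file('torch_save.en', 'vocab.en', 'wv.en', 300)
#
# would produce something like:
#
#   50000 300
#   the 0.1250 -0.0310 0.4470 ...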
def test_check(self):
    name = 'test'
    dict1 = {'hi': '1101',
             'goodbye': '90210',
             'au revoir': 'goodbye',
             'weeaboo shit': 'domo arigatou mr roboto'}
    v = vocab.Vocab(name, dict1)
    self.assertTrue(v.check('hi', '1101'))
    self.assertFalse(v.check('hi', 1101))
    self.assertFalse(v.check('hi', []))
    self.assertFalse(v.check('hi', {}))
    self.assertTrue(v.check('goodbye', '90210'))
    self.assertTrue(v.check('au revoir', 'goodbye'))
    self.assertTrue(v.check('weeaboo shit', 'domo arigatou mr roboto'))
    self.assertFalse(v.check('kboo shit', 'domo arigatou mr roboto'))
def build_vocab():
    dataset_file = "train-v1.1.json"
    with open(dataset_file) as f:
        dataset_json = json.load(f)
    dataset = dataset_json['data']
    token_list = []
    for i in range(len(dataset)):
        for j in range(len(dataset[i]["paragraphs"])):
            passage = dataset[i]["paragraphs"][j]["context"]
            passage = passage.replace("''", '" ')
            passage = passage.replace("``", '" ')
            token_list.extend(word_tokenize(passage))
            for k in range(len(dataset[i]["paragraphs"][j]["qas"])):
                question = dataset[i]["paragraphs"][j]["qas"][k]["question"]
                token_list.extend(word_tokenize(question))
    c = Counter(token_list)
    v = vocab.Vocab(c, wv_type='glove.840B')
    del c
    del token_list
    return v
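# Hedged usage sketch, relying on the same torchtext-style Vocab attributes
# (itos/stoi/vectors) that the standalone snippet further down also uses:
#
#   v = build_vocab()
#   print(len(v.itos))                   # vocabulary size
#   print(v.vectors[v.stoi['the']][:5])  # first GloVe dimensions for 'the'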
def pretrain(train_token, train_entity, train_relation, train_name,
             test_token, test_entity, test_relation, test_name):
    word_alphabet, postag_alphabet, relation_alphabet, entity_type_alphabet, \
        entity_alphabet = dataset_stat(train_token, train_entity, train_relation)
    logging.info("training dataset stat completed")
    if opt.full_data:
        test_word_alphabet, test_postag_alphabet, test_relation_alphabet, \
            test_entity_type_alphabet, test_entity_alphabet = dataset_stat(
                test_token, test_entity, test_relation)
        word_alphabet = word_alphabet | test_word_alphabet
        postag_alphabet = postag_alphabet | test_postag_alphabet
        relation_alphabet = relation_alphabet | test_relation_alphabet
        entity_type_alphabet = entity_type_alphabet | test_entity_type_alphabet
        entity_alphabet = entity_alphabet | test_entity_alphabet
        del test_word_alphabet, test_postag_alphabet, test_relation_alphabet, \
            test_entity_type_alphabet, test_entity_alphabet
        logging.info("test dataset stat completed")

    position_alphabet = sortedcontainers.SortedSet()
    for i in range(opt.max_seq_len):
        position_alphabet.add(i)
        position_alphabet.add(-i)

    relation_vocab = vocab.Vocab(relation_alphabet, None, opt.relation_emb_size)
    word_vocab = vocab.Vocab(word_alphabet, opt.emb, opt.word_emb_size)
    postag_vocab = vocab.Vocab(postag_alphabet, None, opt.pos_emb_size)
    entity_type_vocab = vocab.Vocab(entity_type_alphabet, None,
                                    opt.entity_type_emb_size)
    entity_vocab = vocab.Vocab(entity_alphabet, None, opt.entity_emb_size)
    position_vocab1 = vocab.Vocab(position_alphabet, None, opt.position_emb_size)
    position_vocab2 = vocab.Vocab(position_alphabet, None, opt.position_emb_size)
    # we directly use position_alphabet to build them, since they are all numbers
    tok_num_betw_vocab = vocab.Vocab(position_alphabet, None,
                                     opt.entity_type_emb_size)
    et_num_vocab = vocab.Vocab(position_alphabet, None, opt.entity_type_emb_size)
    logging.info("vocab build completed")

    logging.info("saving ... vocab")
    pickle.dump(word_vocab, open(os.path.join(opt.pretrain, 'word_vocab.pkl'), "wb"), True)
    pickle.dump(postag_vocab, open(os.path.join(opt.pretrain, 'postag_vocab.pkl'), "wb"), True)
    pickle.dump(relation_vocab, open(os.path.join(opt.pretrain, 'relation_vocab.pkl'), "wb"), True)
    pickle.dump(entity_type_vocab, open(os.path.join(opt.pretrain, 'entity_type_vocab.pkl'), "wb"), True)
    pickle.dump(entity_vocab, open(os.path.join(opt.pretrain, 'entity_vocab.pkl'), "wb"), True)
    pickle.dump(position_vocab1, open(os.path.join(opt.pretrain, 'position_vocab1.pkl'), "wb"), True)
    pickle.dump(position_vocab2, open(os.path.join(opt.pretrain, 'position_vocab2.pkl'), "wb"), True)
    pickle.dump(tok_num_betw_vocab, open(os.path.join(opt.pretrain, 'tok_num_betw_vocab.pkl'), "wb"), True)
    pickle.dump(et_num_vocab, open(os.path.join(opt.pretrain, 'et_num_vocab.pkl'), "wb"), True)

    train_X, train_Y, _ = my_utils.getRelationInstance2(
        train_token, train_entity, train_relation, train_name, word_vocab,
        postag_vocab, relation_vocab, entity_type_vocab, entity_vocab,
        position_vocab1, position_vocab2, tok_num_betw_vocab, et_num_vocab)
    logging.info("training instance build completed, total {}".format(len(train_Y)))
    pickle.dump(train_X, open(os.path.join(opt.pretrain, 'train_X.pkl'), "wb"), True)
    pickle.dump(train_Y, open(os.path.join(opt.pretrain, 'train_Y.pkl'), "wb"), True)

    test_X, test_Y, test_other = my_utils.getRelationInstance2(
        test_token, test_entity, test_relation, test_name, word_vocab,
        postag_vocab, relation_vocab, entity_type_vocab, entity_vocab,
        position_vocab1, position_vocab2, tok_num_betw_vocab, et_num_vocab)
    logging.info("test instance build completed, total {}".format(len(test_Y)))
    pickle.dump(test_X, open(os.path.join(opt.pretrain, 'test_X.pkl'), "wb"), True)
    pickle.dump(test_Y, open(os.path.join(opt.pretrain, 'test_Y.pkl'), "wb"), True)
    pickle.dump(test_other, open(os.path.join(opt.pretrain, 'test_Other.pkl'), "wb"), True)
def summarize(input):
    # create the vocabulary file first and then submit the job
    v = vocab.Vocab("gs://sasidhar-project1-mlengine", "data/vocab")
    v.create_vocab_file()
    os.system("sudo sh train_textsum_dist.sh " + input)
    return "done"
if args.validation_text:
    for line in open(args.validation_text):
        words = line.split()
        words = [start] * (n - 1) + words + [stop]
        validation_data.append(words)
else:
    if args.validation_size > 0:
        validation_data = train_data[-args.validation_size:]
        train_data[-args.validation_size:] = []

c = collections.Counter()
for words in train_data:
    c.update(words[n - 1:])

v = vocab.Vocab()
v.insert_word(start)
v.insert_word(stop)
v.insert_word(null)
inserted = v.from_counts(c, args.n_vocab)
if inserted == len(c):
    sys.stderr.write(
        "warning: only %d word types in training data; "
        "set --n_vocab lower to learn unknown word\n" % len(c))

if args.words_file:
    with open(args.words_file, "w") as outfile:
        for w in v.words:
            outfile.write("%s\n" % (w,))

if args.train_file == '-':
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 1 16:37:07 2017

@author: sachin
"""
import torch
import vocab
from collections import Counter

c = Counter(['hello', 'world'])
v = vocab.Vocab(c, wv_type='glove.840B')
print(v.itos)
print(v.vectors[v.stoi["hello"]])
import pickle
import utils
import vocab

# glove_path = '/home/ouzj01/zhangyc/project/glove/glove.840B.300d.txt'
glove_path = 'data/glove/glove_ori.6B.50d.txt'
# save_path = 'data/glove/glove_multiwoz.840B.300d.txt'
# save_path = 'data/glove/glove_multiwoz.6B.50d.txt'
save_path = 'data/glove/glove_kvret.6B.50d.txt'

# use a distinct name so the `vocab` module is not shadowed
voc = vocab.Vocab(1100)
voc.load_vocab('data/kvret/vocab')
# voc.load_vocab('data/MultiWOZ/processed/vocab')

with open(glove_path, 'r', encoding='UTF-8') as ef, open(save_path, 'w') as f:
    # keep only the GloVe lines whose word appears in the vocab
    for line in ef:
        word = line.strip().split(' ')[0]
        if voc.has_word(word):
            f.write(line)
def pipeline(src_suf, tgt_suf, arg_string, size, old_params=None,
             data_dir='13-es-100K', center=False):
    import os
    # arg_string = '--exp_id k0gf0f007v --src_lang es --tgt_lang en '\
    #     '--n_refinement 5 --emb_dim 500 --normalize_embeddings center --full_vocab'
    lazy_wvs = dict()
    for suf in [src_suf, tgt_suf]:
        if suf is None:
            continue
        load_path = '../EMNLP-NMT/data/%s/torch_save.%s' % (data_dir, suf)
        vocab_path = '../EMNLP-NMT/data/%s/%s' % (data_dir, suf)
        output_path = 'wv.%s' % suf
        # if not os.path.isfile(output_path):
        lazy_wvs[suf] = LazyObject(
            lambda load_path=load_path, vocab_path=vocab_path,
            output_path=output_path: write_wv_to_file(
                load_path, vocab_path, output_path, size))
        if suf == src_suf:
            arg_string += ' --src_emb %s' % output_path
        else:
            arg_string += ' --tgt_emb %s' % output_path

    if old_params is not None and center:
        print('getting test')
        lazy_test = LazyObject(
            lambda: get_test_models(arg_string, old_params.compute()[0]))
        # params, (test_eval, test_trainer) = get_test_models(arg_string, old_params.compute()[0])
    else:
        lazy_test = LazyObject(lambda: get_saved_models(arg_string))
        # params, (test_eval, test_trainer) = get_saved_models(arg_string)

    import torch
    import vocab
    for i, suf in enumerate([src_suf, tgt_suf]):
        # for emb, suf in zip([src_emb, tgt_emb], [src_suf, tgt_suf]):
        if suf is None:
            continue
        save_path = '../EMNLP-NMT/data/%s/torch_save.MUSE.%s' % (data_dir, suf)
        if os.path.isfile(save_path):
            continue
        if center:
            save_path += '.center'
        vocab_path = '../EMNLP-NMT/data/%s/%s' % (data_dir, suf)
        for k in lazy_wvs:
            lazy_wvs[k].compute()
        voc = vocab.Vocab(vocab_path)
        test_eval = lazy_test.compute()[1][0]
        if i == 0:
            emb = test_eval.src_emb
            # mapped word embeddings
            emb = test_eval.mapping(emb.weight).data
        else:
            emb = lazy_test.compute()[1][0].tgt_emb
            emb = emb.weight.data
        vc = emb.cpu()
        eval_vocab = test_eval.tgt_dico if suf == tgt_suf else test_eval.src_dico
        ic = torch.from_numpy(np.asarray(
            [voc[w] for w in [eval_vocab.id2word[i] for i in range(len(vc))]]))
        torch.save([vc, ic], save_path)
    if 'test_eval' in locals():
        return test_eval
    else:
        return None
    optimizer.load_state_dict(optimizer_sd)
    model = model.to(const.DEVICE)
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(const.DEVICE)
    return (model, optimizer, epoch + 1, all_losses, eval_losses,
            test_scores, best_test_score)


if __name__ == '__main__':
    args = parse()
    print(args)
    print('Loading vocab...')
    voc = vocab.Vocab(args.pretrained_path, freeze=True)
    print('Loading data ...')
    train_sentences = [Sentence(sentence, voc)
                       for sentence in utils.read_data(args.trainpath)]
    dev_sentences = [Sentence(sentence, voc)
                     for sentence in utils.read_data(args.devpath)]
    test_sentences = [Sentence(sentence, voc)
                      for sentence in utils.read_data(args.testpath)]
    train_ds = dataset.Dataset(train_sentences,
                               word_padding_idx=voc.padding_index,
                               pos_padding_idx=const.POS_PADDING_IDX,
                               chunk_padding_idx=const.CHUNK_PADDING_IDX,
                               character_padding_idx=const.CHARACTER2INDEX['<PAD>'],
                               tag_padding_idx=const.CHUNK_PADDING_IDX)
    dev_ds = dataset.Dataset(dev_sentences,
                             word_padding_idx=voc.padding_index,
                             pos_padding_idx=const.POS_PADDING_IDX,
                             chunk_padding_idx=const.CHUNK_PADDING_IDX,
                             character_padding_idx=const.CHARACTER2INDEX['<PAD>'],
def test_holy_mother_of_symbols(self):
    v = vocab.Vocab('[[[[]gorp!@)(87845,./;[]|\\-_=+)]][][]', {})
    self.assertEqual(v.get_name(), '[[[]gorp!@)(87845,./;[]|\\-_=+)]][][')
run_path = os.path.join('runs', run_hash)
results_path = os.path.join(run_path, 'results.txt')

os.makedirs(run_path)

with open(results_path, 'w+') as f:
    f.write('train_loss\ttrain_mrr\tvalid_loss\tvalid_mrr\n')

with open(os.path.join(run_path, 'args.json'), 'w+') as f:
    json.dump(args_dict, f, indent=2)

random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

code_vocab = vocab.Vocab(args.code_vocab, args.vocab_max_size,
                         args.vocab_min_freq, UNK_TOKEN, PAD_TOKEN)
desc_vocab = vocab.Vocab(args.desc_vocab, args.vocab_max_size,
                         args.vocab_min_freq, UNK_TOKEN, PAD_TOKEN)

print(f'code vocab size: {len(code_vocab)}')
print(f'desc vocab size: {len(desc_vocab)}')

train_data = utils.load_retrieval_data(args.train_data, code_vocab, desc_vocab,
                                       args.code_max_length, args.desc_max_length)
valid_data = utils.load_retrieval_data(args.valid_data, code_vocab, desc_vocab,
                                       args.code_max_length, args.desc_max_length)
def main():
    parser = OptionParser()
    cwd = os.getcwd()
    parser.add_option(
        "-p", "--parallel_data", dest="training_file",
        default=cwd + "/data/euro_esen_10k",
        help="Parallel data, expecting \".source\" and \".target\"")
    parser.add_option(
        "-c", "--comparable_data", dest="comp_data",
        default=cwd + "/data/es_dev",
        help="Annotated comparable data, expecting \".source\", \".target\", "
             "and \".alignment\"")
    parser.add_option(
        "-r", "--raw_data", dest="raw_data",
        default=cwd + "/data/esen_docs_small",
        help="Raw comparable data, expecting \".source\" and \".target\"")
    parser.add_option(
        "-t", "--t_table", dest="m1_data",
        default=cwd + "/data/pruned.model",
        help="Word alignment parameters from some parallel data")
    parser.add_option(
        "-e", "--example_window", dest="example_window", type="int", default=3,
        help="Size of the example window for gathering training data")
    parser.add_option(
        "--length_ratio", type="float", dest="max_len_ratio", default=3.0,
        help="Maximum length ratio for sentences to be considered parallel")
    parser.add_option(
        "--test_max", type="int", dest="test_max", default=100,
        help="Number of sentences from the parallel data to use as test data")
    parser.add_option(
        "--prob_floor", type="float", dest="prob_floor", default=1e-4,
        help="Lowest probability value for LM and M1")
    parser.add_option(
        "--max_iterations", type="int", dest="max_iterations", default=20,
        help="Maximum number of L-BFGS iterations")
    parser.add_option(
        "--l2_norm", type="float", dest="l2_norm", default=2.0,
        help="L2 normalizing value for the Maxent model")
    parser.add_option(
        "--sent_out", dest="sent_out", default="",
        help="Extract sentences from the raw documents to this location")
    (opts, args) = parser.parse_args()

    # Read available data
    source_vocab = vocab.Vocab()
    target_vocab = vocab.Vocab()
    if opts.training_file:
        source_parallel = read_text(opts.training_file + '.source', source_vocab)
        target_parallel = read_text(opts.training_file + '.target', target_vocab)
        t_lm = create_lm(target_parallel, opts)
    if opts.comp_data:
        (source_docs, target_docs, alignments) = read_comp_data(
            opts.comp_data, source_vocab, target_vocab)
    if opts.raw_data:
        (raw_source, raw_target) = read_comp_data(opts.raw_data, source_vocab,
                                                  target_vocab)
    if opts.m1_data:
        m1 = read_m1_data(opts.m1_data, source_vocab, target_vocab)

    # (train_data, test_data) = create_train_test_data(
    #     me, source_parallel, target_parallel, m1, t_lm, opts)
    comp_data = create_comp_test_data(source_docs, target_docs, alignments,
                                      m1, t_lm, opts)
    print_feature_stats(comp_data)

    folds = range(5)
    for fold in folds:
        comp_test_data = []
        me = maxent.MaxentModel()
        print("\nFold " + str(fold + 1) + ":")
        me.begin_add_event()
        for i, event in enumerate(comp_data):
            if i % len(folds) == fold:
                comp_test_data.append(event)
            else:
                me.add_event(event[0], event[1])
        me.end_add_event()
        me.train(opts.max_iterations, "lbfgs", opts.l2_norm)
        parallel_eval(me, comp_test_data)

    if opts.sent_out:
        full_me = maxent.MaxentModel()
        full_me.begin_add_event()
        for event in comp_data:
            full_me.add_event(event[0], event[1])
        full_me.end_add_event()
        full_me.train(opts.max_iterations, "lbfgs", opts.l2_norm)
        # for threshold in drange(0.05, 0.96, 0.05):
        threshold = 0.5
        out_file = opts.sent_out + '.' + str(threshold)
        extract_sentences(full_me, raw_source, raw_target, out_file, threshold,
                          m1, t_lm, source_vocab, target_vocab, opts)
criteria = nn.NLLLoss(weight=weights, size_average=True)
optimizer = torch.optim.Adam(lstm_model.parameters())

torch.manual_seed(settings.seed)
random.seed(settings.seed)
if torch.cuda.is_available():
    lstm_model.cuda()
    criteria.cuda()
    torch.cuda.manual_seed(settings.seed)

for param in lstm_model.parameters():
    param.data.uniform_(-0.08, 0.08)

# Load vocab
src_vocab = vocab.Vocab(settings.src_vocab_size,
                        os.path.join(args.path, "src_vocab.txt"))
trg_vocab = vocab.Vocab(settings.trg_vocab_size,
                        os.path.join(args.path, "trg_vocab.txt"))
biVocab = vocab.BiVocab(src_vocab, trg_vocab)

# Load dataset
train_ner_data = dataset.NERDataset(
    os.path.join(args.path, "atis.train.txt"), biVocab)
train_ner_data_loader = DataLoader(train_ner_data,
                                   batch_size=settings.batch_size,
                                   shuffle=True,
                                   collate_fn=lambda x: x)
dev_ner_data = dataset.NERDataset(os.path.join(args.path, "atis.test.txt"),
                                  biVocab)
dev_ner_data_loader = DataLoader(dev_ner_data,
                                 batch_size=settings.batch_size,
        max_len = max(len(token), len(tag_)) + 4
        formated_sentence.append(fill(token, max_len))
        formated_tag.append(fill(tag_, max_len))
    no_lines = len(formated_sentence) // max_words_per_line + (
        0 if len(formated_sentence) % max_words_per_line == 0 else 1)
    for i in range(no_lines):
        print(' '.join(formated_sentence[max_words_per_line * i:
                                         max_words_per_line * (i + 1)]))
        print(' '.join(formated_tag[max_words_per_line * i:
                                    max_words_per_line * (i + 1)]))
        print('\n')


if __name__ == '__main__':
    print('loading vocab......')
    voc = vocab.Vocab(args['pretrained_path'])
    print('loading model.......')
    model = load_model(
        model_fn=args['checkpoint_fn'],
        voc=voc,
        character_embedding_dim=args['character_embedding_dim'],
        character_hidden_dim=args['character_hidden_dim'],
        context_hidden_dim=args['context_hidden_dim'],
        dropout=args['dropout'],
        crf_loss_reduction=args['crf_loss_reduction'],
        using_pos_chunk=args['using_pos_chunk'])
    model = model.to(device)
    model.eval()
    print('program is running.....')
def prepare(args):
    """
    Checks data, creates vocabs, and initializes embeddings
    (pretrained or random).
    """
    logger = logging.getLogger("alibaba")
    logger.info("Checking the data files... ")
    for data in args.data_files:
        assert os.path.exists(data), "{} file does not exist".format(data)

    logger.info("preprocess raw data...")
    jieba.load_userdict(args.dict_file)
    logger.info("segment raw data")
    preposs_file = open(args.preposs_file, "w")
    index = 1
    for data_file in args.data_files:
        with open(data_file, "r", encoding="utf8") as fin:
            for idx, line in enumerate(fin):
                line_re = re.sub(
                    u"[’!\"#$%&'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+",
                    "", line)
                line_list = str(line_re).strip("\n").split("\t")
                if len(line_list) != 4:
                    logger.warning("line {} of {} is malformed".format(
                        idx + 1, data_file))
                    continue
                document1 = line_list[1].strip().replace(" ", "")
                document2 = line_list[2].strip().replace(" ", "")
                segment_document1 = [_ for _ in jieba.cut(document1)]
                segment_document2 = [_ for _ in jieba.cut(document2)]
                preposs_file.write(str(index))
                preposs_file.write("|")
                preposs_file.write(" ".join(segment_document1))
                preposs_file.write("|")
                preposs_file.write(" ".join(segment_document2))
                preposs_file.write("|")
                preposs_file.write(line_list[3] + "\n")
                index += 1
    preposs_file.close()

    logger.info("Building vocabulary...")
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir,
                     args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    data = dataset.Dataset(args)
    word_vocab_ = vocab.Vocab()
    for token in data.word_iter(set_name="train"):
        word_vocab_.add(token)
    unfiltered_vocab_size = word_vocab_.size()
    word_vocab_.filter_word_by_count(min_count=2)
    filtered_num = unfiltered_vocab_size - word_vocab_.size()
    logger.info("After filtering {} tokens, the final word vocab size is {}".format(
        filtered_num, word_vocab_.size()))

    logger.info("Assigning word embeddings...")
    word_vocab_.random_init_embeddings(args.embedding_size)

    character_vocab_ = vocab.Vocab()
    for character in data.word_iter("train", character=True):
        character_vocab_.add(character)
    unfiltered_vocab_size = character_vocab_.size()
    character_vocab_.filter_word_by_count(min_count=2)
    filtered_num = unfiltered_vocab_size - character_vocab_.size()
    logger.info("After filtering {} characters, the final character vocab size is {}".format(
        filtered_num, character_vocab_.size()))

    logger.info("Assigning character embeddings...")
    character_vocab_.random_init_embeddings(args.character_embedding_size)

    logger.info("Saving vocab...")
    with open(os.path.join(args.vocab_dir, "vocab.data"), "wb") as fout:
        pickle.dump(word_vocab_, fout)
    logger.info("Saving character vocab...")
    with open(os.path.join(args.vocab_dir, "vocab_character.data"), "wb") as fout:
        pickle.dump(character_vocab_, fout)

    logger.info("Done with preparing!")
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    # parser.add_argument('-data', required=True)
    parser.add_argument('-train_atok', required=True)
    parser.add_argument('-valid_atok', required=True)
    parser.add_argument('-epoch', type=int, default=200)
    parser.add_argument('-batch_size', type=int, default=8)
    parser.add_argument('-d_word_vec', type=int, default=512)
    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-n_warmup_steps', type=int, default=4000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-log', default=None)
    parser.add_argument('-save_model', default=None)
    parser.add_argument('-save_mode', type=str, choices=['all', 'best'],
                        default='best')
    parser.add_argument('-no_cuda', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Loading Dataset =========#
    train_atok = torch.load(opt.train_atok)
    valid_atok = torch.load(opt.valid_atok)
    train_vocab = vocab.Vocab(train_atok['settings'].vocab)
    training_data = dataset.translation_dataloader(train_atok, opt.batch_size,
                                                   shuffle=True)
    validation_data = dataset.translation_dataloader(valid_atok, opt.batch_size,
                                                     shuffle=False)
    # data = torch.load(opt.data)
    opt.max_token_seq_len = train_atok['settings'].max_seq_len
    # training_data, validation_data = prepare_dataloaders(data, opt)
    opt.src_vocab_size = train_vocab.size()
    opt.tgt_vocab_size = train_vocab.size()

    #========= Preparing Model =========#
    # if opt.embs_share_weight:
    #     assert training_data.dataset.src_word2idx == training_data.dataset.tgt_word2idx, \
    #         'The src/tgt word2idx table are different but asked to share word embedding.'

    print(opt)

    device = torch.device('cuda' if opt.cuda else 'cpu')
    transformer = Transformer(
        opt.src_vocab_size,
        opt.tgt_vocab_size,
        opt.max_token_seq_len,
        tgt_emb_prj_weight_sharing=opt.proj_share_weight,
        emb_src_tgt_weight_sharing=opt.embs_share_weight,
        d_k=opt.d_k,
        d_v=opt.d_v,
        d_model=opt.d_model,
        d_word_vec=opt.d_word_vec,
        d_inner=opt.d_inner_hid,
        n_layers=opt.n_layers,
        n_head=opt.n_head,
        dropout=opt.dropout).to(device)

    if os.path.exists("trained.chkpt"):
        x = torch.load("trained.chkpt")
        transformer.load_state_dict(x["model"])

    optimizer = ScheduledOptim(
        optim.Adam(filter(lambda x: x.requires_grad, transformer.parameters()),
                   betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    train(transformer, training_data, validation_data, optimizer, device, opt)
sys.path.append("..") import vocab as vc import pickle import random import collections vocab_file = "vocab.txt" train_file = "train.p" infer_file = "infer.p" valid_file = "validation.p" score_file = "score.txt" max_seq_length = 40 pad_id = 1 cab = vc.Vocab(vocab_file, verbose=False) scores = {} data = collections.defaultdict(list) infer = collections.defaultdict(list) def load_scores(): with open("final-score.csv") as f: f_csv = csv.DictReader(f) for i, row in enumerate(f_csv): scores[row['skuid']] = int(row['score']) - 5 def load_data(): with open("jd-comment.csv", encoding='utf8') as f: f_csv = csv.DictReader(f)