def create_vocab(datasets, embed_file=None, bert_vocab_path=None, min_count=2):
    wd_vocab = Vocab(min_count, bos=None, eos=None)
    char_vocab = Vocab(bos=None, eos=None)
    tag_vocab = Vocab(bos=None, eos=None)
    ner_vocab = Vocab(bos=None, eos=None)
    for insts in datasets:
        for inst in insts:
            wd_vocab.add(inst.word)
            char_vocab.add(list(inst.word))
            tag_vocab.add(inst.pos_tag)
            if inst.ner_tag != 'O':
                # including PER, ORG, LOC, MISC and UNK
                ner_tag = inst.ner_tag.split('-')[1]
                ner_vocab.add(ner_tag)

    embed_count = wd_vocab.load_embeddings(embed_file)
    print("%d word pre-trained embeddings loaded..." % embed_count)

    bert_vocab = BERTVocab(bert_vocab_path) if bert_vocab_path is not None else None
    return MultiVocab(
        dict(word=wd_vocab,
             char=char_vocab,
             tag=tag_vocab,
             ner=ner_vocab,
             bert=bert_vocab))
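# Illustrative usage sketch (the file paths and the `train_insts`/`dev_insts`
# instance lists are hypothetical; assumes each instance exposes the `word`,
# `pos_tag` and `ner_tag` attributes iterated above):
#
#   multi_vocab = create_vocab([train_insts, dev_insts],
#                              embed_file='data/glove.6B.100d.txt',
#                              bert_vocab_path='bert-base-cased/vocab.txt')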
def pdtb_prepare(args):
    print('Loading dataset...')
    train_sections = [os.path.join(PathConfig.json_data_dir, '{:02}'.format(section_num))
                      for section_num in PathConfig.train_sections]
    dev_sections = [os.path.join(PathConfig.json_data_dir, '{:02}'.format(section_num))
                    for section_num in PathConfig.dev_sections]
    test_sections = [os.path.join(PathConfig.json_data_dir, '{:02}'.format(section_num))
                     for section_num in PathConfig.test_sections]
    dataset = PDTBDataSet(train_sections, dev_sections, test_sections,
                          level=2 if args.task.startswith('fine') else 1)
    print('Size of train: {}, dev: {}, test: {}'.format(
        len(dataset.train_set), len(dataset.dev_set), len(dataset.test_set)))

    print('Creating word vocab...')
    if not os.path.exists(PathConfig.experiment_data_dir):
        os.mkdir(PathConfig.experiment_data_dir)
    word_vocab = Vocab(mannual_add=[PAD_WORD, UNK_WORD, BOS_WORD, EOS_WORD, NUM_WORD])
    for word in dataset.get_all_words():
        word_vocab.add(word)
    word_vocab.load_pretrained_emb(PathConfig.embedding_path)
    print('Size of word vocab: {}'.format(word_vocab.size()))
    torch.save(word_vocab, os.path.join(PathConfig.experiment_data_dir, 'word_vocab.obj'))

    tag_vocab = Vocab()
    for tag in dataset.get_all_tags():
        tag_vocab.add(tag)
    print('Size of tag vocab: {}'.format(tag_vocab.size()))
    tag_vocab.init_embed(ModelConfig.tag_embed_dim)
    torch.save(tag_vocab, os.path.join(PathConfig.experiment_data_dir, 'tag_vocab.obj'))

    print('Formatting the dataset to torch variables...')
    dataset.format_instances_to_torch_var(word_vocab, tag_vocab)
    torch.save(dataset, os.path.join(PathConfig.experiment_data_dir, 'dataset.obj'))
def build_vocab(files, vocabulary=None, mtl=False, name="src", save_dir="/"):
    vocabs = []
    if vocabulary is not None:
        for v in vocabulary:
            print(f'Loading from {v}')
            vocab = Vocab()
            vocab.load_from_file(v)
            vocabs.append(vocab)
    else:
        if mtl is True:
            for index, f in enumerate(files):
                vocab = Vocab()
                vocab.build_vocab([f])
                vocab.save(save_dir + name + ".vocab." + str(index) + ".json")
                vocabs.append(vocab)
        else:
            vocab = Vocab()
            vocab.build_vocab(files)
            vocab.save(save_dir + name + ".vocab.json")
            vocabs.append(vocab)

    for index, vocab in enumerate(vocabs):
        print(f'vocabulary size {index + 1:d}: {vocab.len():d}')
    return vocabs
def configuration(cls, plm=None, method='lgesql', table_path='data/tables.json',
                  tables='data/tables.bin', db_dir='data/database'):
    cls.plm, cls.method = plm, method
    cls.grammar = ASDLGrammar.from_filepath(GRAMMAR_FILEPATH)
    cls.trans = TransitionSystem.get_class_by_lang('sql')(cls.grammar)
    cls.tables = pickle.load(open(tables, 'rb')) if isinstance(tables, str) else tables
    cls.evaluator = Evaluator(cls.trans, table_path, db_dir)
    if plm is None:
        cls.word2vec = Word2vecUtils()
        cls.tokenizer = lambda x: x
        # word vocab for glove.42B.300d
        cls.word_vocab = Vocab(padding=True, unk=True, boundary=True, default=UNK,
                               filepath='./pretrained_models/glove.42b.300d/vocab.txt',
                               specials=SCHEMA_TYPES)
    else:
        cls.tokenizer = AutoTokenizer.from_pretrained(os.path.join('./pretrained_models', plm))
        cls.word_vocab = cls.tokenizer.get_vocab()
    cls.relation_vocab = Vocab(padding=False, unk=False, boundary=False,
                               iterable=RELATIONS, default=None)
    cls.graph_factory = GraphFactory(cls.method, cls.relation_vocab)
def create_vocab(data_path):
    wd_vocab = Vocab(min_count=3, bos=None, eos=None)
    lbl_vocab = Vocab(pad=None, unk=None, bos=None, eos=None)
    assert os.path.exists(data_path)
    with open(data_path, 'r', encoding='utf-8') as fin:
        loader = map(lambda x: x.strip().split('|||'), fin)
        for lbl, data_item in loader:
            wds = data_item.strip().split(' ')
            wd_vocab.add(wds)
            lbl_vocab.add(lbl.strip())
    return MultiVocab({'word': wd_vocab, 'label': lbl_vocab})
def get_embed_vocab(embed_file):
    assert os.path.exists(embed_file)
    embed_vocab = Vocab(bos=None, eos=None)
    vec_dim = 0
    # first pass: collect the tokens and infer the vector dimension
    with open(embed_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            tokens = line.strip().split(' ')
            if len(tokens) < 10:
                # skip the header or any line that is clearly not an embedding row
                continue
            embed_vocab.add(tokens[0])
            if vec_dim == 0:
                vec_dim = len(tokens[1:])

    # second pass: fill the weight matrix with the pretrained vectors
    embed_weights = np.random.uniform(-0.5 / vec_dim, 0.5 / vec_dim,
                                      (len(embed_vocab), vec_dim))
    with open(embed_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            tokens = line.strip().split(' ')
            if len(tokens) < 10:
                continue
            idx = embed_vocab.inst2idx(tokens[0])
            embed_weights[idx] = np.asarray(tokens[1:], dtype=np.float32)

    # zero the padding vector and rescale by the overall standard deviation
    embed_weights[embed_vocab.pad_idx] = 0.
    embed_weights /= np.std(embed_weights)
    embed_vocab.embeddings = embed_weights
    return embed_vocab
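# A minimal sketch of how the returned vocab could seed a torch embedding layer
# (the file path is illustrative; `embeddings` and `pad_idx` are the attributes
# set above, and the torch call is standard API):
#
#   import torch
#   import torch.nn as nn
#
#   embed_vocab = get_embed_vocab('data/glove.6B.100d.txt')
#   word_emb = nn.Embedding.from_pretrained(
#       torch.from_numpy(embed_vocab.embeddings).float(),
#       freeze=False,
#       padding_idx=embed_vocab.pad_idx)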
def create_vocab(self):
    if self.is_training:
        if not os.path.exists(self.vocab_file_path):
            print("Creating vocab")
            self.vocab = Vocab(add_bos=False, add_eos=False, add_padding=False,
                               min_count=self.min_count)
            for example in self.dataset:
                self.vocab.add_tokenized_sentence(example['tokens'][:self.train_max_length])
            self.vocab.finish()
            with open(self.vocab_file_path, 'wb') as f:
                pickle.dump(self.vocab, f)
        else:
            with open(self.vocab_file_path, 'rb') as f:
                self.vocab = pickle.load(f)
    else:
        print("Loading vocab")
        with open(self.vocab_file_path, 'rb') as f:
            self.vocab = pickle.load(f)
def make_vocab_label(self, sents, vocab_label_init=None):
    if len(sents) == 0:
        return None
    if vocab_label_init:
        vocab_label = deepcopy(vocab_label_init)
    else:
        vocab_label = Vocab()

    if self.argv.data_type == 'conll05':
        core_labels = ["A0", "A1", "A2", "A3", "A4", "A5"]
    else:
        core_labels = ["ARG0", "ARG1", "ARG2", "ARG3", "ARG4", "ARG5"]
    for label in core_labels:
        vocab_label.add_word(label)

    bio_labels = []
    for sent in sents:
        for props in sent.prd_bio_labels:
            bio_labels += props
    cnt = Counter(bio_labels)
    bio_labels = [(w, c) for w, c in cnt.most_common()]
    for label, count in bio_labels:
        if not label.endswith('-V') and len(label) > 1:
            # strip the BIO prefix, e.g. "B-A0" -> "A0"
            vocab_label.add_word(label[2:])
    return vocab_label
def train(params):
    assert params["mode"].lower() == "train", "change training mode to 'train'"

    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count

    # params["trained_epoch"] = get_train_msg()
    params["learning_rate"] *= np.power(0.9, params["trained_epoch"])

    # build the model
    print("Building the model ...")
    model = Seq2Seq(params)

    # set up the checkpoint manager
    checkpoint = tf.train.Checkpoint(Seq2Seq=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, SEQ2SEQ_CKPT, max_to_keep=5)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")

    # train the model
    print("Start training the model ...")
    print("trained_epoch:", params["trained_epoch"])
    print("mode:", params["mode"])
    print("epochs:", params["epochs"])
    print("batch_size:", params["batch_size"])
    print("max_enc_len:", params["max_enc_len"])
    print("max_dec_len:", params["max_dec_len"])
    print("learning_rate:", params["learning_rate"])
    train_model(model, vocab, params, checkpoint_manager)
def make_vocab_label(self, sents, vocab_label_init=None):
    if len(sents) == 0:
        return None
    if vocab_label_init:
        vocab_label = deepcopy(vocab_label_init)
    else:
        vocab_label = Vocab()

    none_label = 'O'
    vocab_label.add_word(none_label)

    labels = []
    for sent in sents:
        if sent.has_prds:
            for prop in sent.prd_bio_labels:
                labels += prop
    cnt = Counter(labels)
    labels = [(w, c) for w, c in cnt.most_common()]
    for label, count in labels:
        vocab_label.add_word(label)
    return vocab_label
def build_vocab(df, vocab_path):
    print("building vocab ...")
    vocab_dict = {"<unk>": 1, "<eos>": 2, "<pad>": 3}
    vocab_set = []
    for row in tqdm(df.itertuples()):
        text = row.text.replace(" ", "")  # remove spaces
        phones = pyopenjtalk.g2p(text, join=False)
        # remove pause
        phones = [phone for phone in phones if phone != "pau"]
        for phone in phones:
            if phone not in vocab_set:
                vocab_set.append(phone)

    # alphabetical order
    vocab_set.sort()

    wlines = []
    for v in vocab_set:
        index = len(vocab_dict) + 1
        vocab_dict[v] = index
    for v, index in vocab_dict.items():
        wlines.append(f"{v} {index:d}\n")

    with open(vocab_path, "w", encoding="utf-8") as f:
        f.writelines(wlines)
    print(f"vocabulary saved to {vocab_path}")

    return Vocab(vocab_path)
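# Illustrative call (assumes a pandas DataFrame with a "text" column, as
# iterated via `row.text` above; the output path is hypothetical):
#
#   import pandas as pd
#   df = pd.DataFrame({"text": ["こんにちは", "ありがとうございます"]})
#   vocab = build_vocab(df, "exp/vocab.txt")
#
# The written file has one "<phone> <index>" pair per line, starting from the
# reserved "<unk> 1", "<eos> 2", "<pad> 3" entries defined above.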
def prepare(args):
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)

    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.gpus, args.batch_size,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(init_random=False, trainable_oov_cnt_threshold=2)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    # vocab.build_embedding_matrix(args.pretrained_word_path)
    vocab.randomly_init_embeddings(args.embed_size)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
def vocabs_init(train_data: List[str]) -> Vocab:
    print("Constructing vocabularies...", flush=True)
    vocab = Vocab(train_data)
    print('len(labels_vocab): %d' % len(vocab))
    return vocab
def __init__(self, args):
    super().__init__()
    self.args = args
    self.K = args.K
    self.rnn_hidden = args.rnn_hidden
    self.max_sent_len = args.max_sent_len

    print("loading pretrained emb......")
    self.emb_matrix = np.load(args.dset_dir + '/' + args.dataset + '/embedding.npy')
    print("loading dataset vocab......")
    self.vocab = Vocab(args.dset_dir + '/' + args.dataset + '/vocab.pkl')

    # create embedding layers
    self.emb = nn.Embedding(self.vocab.size, args.emb_dim, padding_idx=constant.PAD_ID)
    self.pos_emb = nn.Embedding(len(constant.POS_TO_ID), args.pos_dim) if args.pos_dim > 0 else None
    # initialize embedding with pretrained word embeddings
    self.init_embeddings()

    # dropout
    self.input_dropout = nn.Dropout(args.input_dropout)

    # GRU for P(Trc|S,Y')
    self.GRU_mean_rc = torch.nn.GRUCell(
        len(constant.BIO_TO_ID) + len(constant.LABEL_TO_ID), self.rnn_hidden * 2)
    self.GRU_std_rc = torch.nn.GRUCell(
        len(constant.BIO_TO_ID) + len(constant.LABEL_TO_ID), self.rnn_hidden * 2)
    # GRU for P(Tner|S,Y')
    self.GRU_mean_ner = torch.nn.GRUCell(
        len(constant.BIO_TO_ID) + len(constant.LABEL_TO_ID), self.rnn_hidden * 2)
    self.GRU_std_ner = torch.nn.GRUCell(
        len(constant.BIO_TO_ID) + len(constant.LABEL_TO_ID), self.rnn_hidden * 2)

    # define r
    self.r_mean_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K))
    self.r_std_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K))
    self.r_mean_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K))
    self.r_std_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K))

    # define encoder for the shared representations S
    self.BiLSTM = LSTMRelationModel(args)

    # classifiers
    self.Lr = nn.Linear(4 * self.rnn_hidden, 2 * self.rnn_hidden)
    self.Cr = nn.Linear(2 * self.rnn_hidden, len(constant.LABEL_TO_ID))
    self.Cg = nn.Linear(2 * self.rnn_hidden, len(constant.BIO_TO_ID))

    # Fn
    self.logsoft_fn1 = nn.LogSoftmax(dim=2)
    self.logsoft_fn2 = nn.LogSoftmax(dim=3)
def __init__(self, args):
    self.args = args
    self.epoch = args.epoch
    self.batch_size = args.batch_size
    self.lr = args.lr
    self.K = args.K
    self.num_avg = args.num_avg
    self.global_iter = 0
    self.global_epoch = 0
    self.log_file = args.log_file

    # Network & Optimizer
    self.toynet = ToyNet(args).cuda()
    self.optim = optim.Adam(self.toynet.parameters(), lr=self.lr)

    self.ckpt_dir = Path(args.ckpt_dir)
    if not self.ckpt_dir.exists():
        self.ckpt_dir.mkdir(parents=True, exist_ok=True)
    self.load_ckpt = args.load_ckpt
    if self.load_ckpt != '':
        self.load_checkpoint(self.load_ckpt)

    # loss functions
    self.ner_lossfn = nn.NLLLoss(reduction='sum')
    self.rc_lossfn = nn.BCELoss(reduction='sum')

    # History
    self.history = dict()
    # class loss
    self.history['ner_train_loss1'] = []
    self.history['rc_train_loss1'] = []
    self.history['ner_test_loss1'] = []
    self.history['rc_test_loss1'] = []
    self.history['ner_train_loss2'] = []
    self.history['rc_train_loss2'] = []
    self.history['ner_test_loss2'] = []
    self.history['rc_test_loss2'] = []
    self.history['precision_test'] = []
    self.history['recall_test'] = []
    self.history['F1_test'] = []
    # info loss
    self.history['info_train_loss'] = []
    self.history['info_test_loss'] = []

    # Dataset
    vocab = Vocab(args.dset_dir + '/' + args.dataset + '/vocab.pkl')
    self.data_loader = dict()
    self.data_loader['train'] = Dataloader(args.dset_dir + '/' + args.dataset + '/train.json',
                                           args.batch_size, vars(args), vocab)
    self.data_loader['test'] = Dataloader(args.dset_dir + '/' + args.dataset + '/test.json',
                                          args.batch_size, vars(args), vocab, evaluation=True)
def evaluate_model(evalparams):
    torch.manual_seed(evalparams.seed)
    random.seed(1234)
    if evalparams.cpu:
        evalparams.cuda = False
    elif evalparams.cuda:
        torch.cuda.manual_seed(evalparams.seed)

    # load opt
    print(evalparams.model_dir, evalparams.model)
    # model_file = evalparams.model_dir + "/" + evalparams.model
    model_file = 'best_model.pt'
    print("Loading model from {}".format(model_file))
    opt = torch_utils.load_config(model_file)
    model = RelationModel(opt)
    model.load(model_file)

    # load vocab
    vocab_file = evalparams.model_dir + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    assert opt['vocab_size'] == vocab.size, "Vocab size must match that in the saved model."

    # load data
    data_file = opt['data_dir'] + '/{}.json'.format(evalparams.dataset)
    print("Loading data from {} with batch size {}...".format(data_file, opt['batch_size']))
    batch = DataLoader(data_file, opt['batch_size'], opt, vocab, evaluation=True)

    helper.print_config(opt)
    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])

    predictions = []
    all_probs = []
    for i, b in enumerate(batch):
        preds, probs, _ = model.predict(b)
        predictions += preds
        all_probs += probs
    predictions = [id2label[p] for p in predictions]
    p, r, f1 = scorer.score(batch.gold(), predictions, verbose=True)

    # save probability scores
    if len(evalparams.out) > 0:
        helper.ensure_dir(os.path.dirname(evalparams.out))
        with open(evalparams.out, 'wb') as outfile:
            pickle.dump(all_probs, outfile)
        print("Prediction scores saved to {}.".format(evalparams.out))

    print("Evaluation ended.")
    return (batch.gold(), predictions, model)
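# Illustrative way to build `evalparams` (the attribute names mirror the ones
# read above; the defaults are hypothetical):
#
#   import argparse
#   parser = argparse.ArgumentParser()
#   parser.add_argument('--model_dir', default='saved_models/00')
#   parser.add_argument('--model', default='best_model.pt')
#   parser.add_argument('--dataset', default='test')
#   parser.add_argument('--out', default='')
#   parser.add_argument('--seed', type=int, default=1234)
#   parser.add_argument('--cpu', action='store_true')
#   parser.add_argument('--cuda', action='store_true')
#   evalparams = parser.parse_args()
#   gold, predictions, model = evaluate_model(evalparams)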
def load_word_vector(path):
    """
    Load a GloVe-style word vector file and cache the vocab and vectors to disk.

    :param path: path to the GloVe word vector file
    :return: (vocab, vector), where vocab is a Vocab object and vector is a
             numpy array of shape (words_num, word_dim)
    """
    base = os.path.splitext(os.path.basename(path))[0]
    glove_vocab_path = os.path.join('../data/glove/', base + '.vocab')
    glove_vector_path = os.path.join('../data/glove/', base + '.path')

    # the word vectors have already been cached
    if os.path.isfile(glove_vocab_path) and os.path.isfile(glove_vector_path):
        print('======> File found, loading memory <=====!')
        vocab = Vocab(glove_vocab_path)
        vector = torch.load(glove_vector_path)
        return vocab, vector

    print('=====>Loading glove word vector<=====')
    # first pass: infer the vector dimension and count the lines
    with open(path, 'r', encoding='utf8', errors='ignore') as f:
        contents = f.readline().rstrip('\n').split(' ')
        word_dim = len(contents[1:])
        count = 1
        for line in f:
            count += 1

    vocab = [None] * count
    vector = np.zeros((count, word_dim))
    # second pass: fill the token list and the vector matrix
    with open(path, 'r', encoding='utf8', errors='ignore') as f:
        idx = 0
        for line in f:
            contents = line.rstrip('\n').split(' ')
            vocab[idx] = contents[0]
            vector[idx] = np.array(list(map(float, contents[1:])), dtype=float)
            idx += 1
        assert count == idx

    # cache the vocab (one token per line) and the vector matrix
    with open(glove_vocab_path, 'w', encoding='utf8', errors='ignore') as f:
        for token in vocab:
            f.write(token + '\n')
    vocab = Vocab(glove_vocab_path)
    torch.save(vector, glove_vector_path)
    return vocab, vector
def prepare_data():
    # load the dataset
    train_sections = [
        os.path.join(paths.json_data_dir, '{:02}'.format(section_num))
        for section_num in paths.train_sections
    ]
    dev_sections = [
        os.path.join(paths.json_data_dir, '{:02}'.format(section_num))
        for section_num in paths.dev_sections
    ]
    test_sections = [
        os.path.join(paths.json_data_dir, '{:02}'.format(section_num))
        for section_num in paths.test_sections
    ]
    train_dataset = PDTBDataSet(train_sections, tree_type=args.tree_type,
                                level=args.level, multiple_labels=False)
    dev_dataset = PDTBDataSet(dev_sections, tree_type=args.tree_type,
                              level=args.level, multiple_labels=True)
    test_dataset = PDTBDataSet(test_sections, tree_type=args.tree_type,
                               level=args.level, multiple_labels=True)
    if not (train_dataset.consistent_with(dev_dataset)
            and dev_dataset.consistent_with(test_dataset)):
        print('Dataset labels are not consistent.')
        print('Train: {}'.format(sorted(train_dataset.label_map.keys())))
        print('Dev: {}'.format(sorted(dev_dataset.label_map.keys())))
        print('Test: {}'.format(sorted(test_dataset.label_map.keys())))
    print('Size of train set: {}, dev set: {}, test set: {}'.format(
        len(train_dataset), len(dev_dataset), len(test_dataset)))

    # save the dataset
    torch.save(train_dataset, os.path.join(paths.experiment_data_dir, 'train.data'))
    torch.save(dev_dataset, os.path.join(paths.experiment_data_dir, 'dev.data'))
    torch.save(test_dataset, os.path.join(paths.experiment_data_dir, 'test.data'))

    # build the vocab
    vocab = Vocab(mannual_add=[PAD_WORD, UNK_WORD, BOS_WORD, EOS_WORD, NUM_WORD])
    all_words = (train_dataset.get_all_words() + dev_dataset.get_all_words()
                 + test_dataset.get_all_words())
    # all_words = train_dataset.get_all_words()
    for word in all_words:
        vocab.add(word)

    # load and initialize the embeddings
    vocab.load_pretrained_emb(paths.embedding_path)
    print('Size of PDTB vocabulary: {}'.format(vocab.size()))

    # save the vocab
    torch.save(vocab, paths.vocab_path)
def __init__(self, args):
    super().__init__()
    self.args = args
    self.K = args.K
    self.rnn_hidden = args.rnn_hidden
    self.max_sent_len = args.max_sent_len

    print("loading pretrained emb......")
    self.emb_matrix = np.load(args.dset_dir + '/' + args.dataset + '/embedding.npy')
    print("loading dataset vocab......")
    self.vocab = Vocab(args.dset_dir + '/' + args.dataset + '/vocab.pkl')

    # create embedding layers
    self.emb = nn.Embedding(self.vocab.size, args.emb_dim, padding_idx=constant.PAD_ID)
    self.pos_emb = nn.Embedding(len(constant.POS_TO_ID), args.pos_dim) if args.pos_dim > 0 else None
    # initialize embedding with pretrained word embeddings
    self.init_embeddings()

    # dropout
    self.input_dropout = nn.Dropout(args.input_dropout)

    # define r rc distribution
    self.r_mean_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K))
    self.r_std_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K, self.K))
    self.r_diag_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K))
    # orthogonal initialization of r_std_rc
    for i in range(self.max_sent_len):
        nn.init.orthogonal_(self.r_std_rc[i], gain=1)

    # define r ner distribution
    self.r_mean_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K))
    self.r_std_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K, self.K))
    self.r_diag_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K))
    # orthogonal initialization of r_std_ner
    for i in range(self.max_sent_len):
        nn.init.orthogonal_(self.r_std_ner[i], gain=1)

    # define encoder
    self.BiLSTM = LSTMRelationModel(args)
    self.hidden2mean_rc = nn.Linear(self.rnn_hidden * 2, self.K)
    self.hidden2std_rc = nn.Linear(self.rnn_hidden * 2, self.K)
    # ner encoder
    self.hidden2mean_ner = nn.Linear(self.rnn_hidden * 2, self.K)
    self.hidden2std_ner = nn.Linear(self.rnn_hidden * 2, self.K)

    # decoder
    self.rc_lr = nn.Linear(args.K * 2, args.K)
    self.rc_cla = nn.Linear(args.K, len(constant.LABEL_TO_ID))
    self.ner_cla = nn.Linear(args.K, len(constant.BIO_TO_ID))
    self.logsoft_fn = nn.LogSoftmax(dim=3)

    # mse loss
    self.loss_fn = torch.nn.MSELoss(reduction='sum')
def __init__(self, source_name, target_name, max_length=300,
             source_vocab=None, target_vocab=None):
    self.data_source = self.read_file(source_name)
    self.data_target = self.read_file(target_name)
    self.max_length = max_length

    self.source_vocab = source_vocab
    if source_vocab is None:
        self.source_vocab = Vocab()
        self.source_vocab.build_vocab([source_name])

    self.target_vocab = target_vocab
    if target_vocab is None:
        self.target_vocab = Vocab()
        self.target_vocab.build_vocab([target_name])
def __init__(self, logger, config, data_name, data_path, embed_path=None,
             user_dict=None, vocab_path=None, stop_word=None, max_len=50,
             query_max_len=20, target_max_len=20, test_split=0.0, training=True):
    self.logger = logger
    self.reset = config.reset
    self._data_dir = Path('data') / data_name
    self.query_max_len = query_max_len
    self.target_max_len = target_max_len
    self.max_len = max_len

    if training:
        embedding_path = self._data_dir / embed_path
        print(embedding_path.absolute())
        self._embedding = Embedding(str(embedding_path), logger=logger)

    print("Begin to build the segmenter, feature engineering and n-grams")
    self._segment = Segment_jieba(user_dict=str(self._data_dir / user_dict))

    if training:
        print("Begin to build vocab")
        self._vocab = Vocab(str(self._data_dir / 'RAW' / vocab_path),
                            self._segment, self._embedding)
        self.word2idx, self.idx2word = self._vocab.word2idx, self._vocab.idx2word
        dump_to_pickle(str(self._data_dir / 'vocab.pkl'),
                       (self.word2idx, self.idx2word), self.reset)
    else:
        print("Load the vocab")
        (self.word2idx, self.idx2word) = load_from_pickle(str(self._data_dir / 'vocab.pkl'))
    self.vocab_size = len(self.word2idx)

    if training:
        filename = str(self._data_dir / 'RAW' / data_path)
        # train_test_split and exist
        self._get_train_and_test(filename, test_split)
def main(args):
    print("Load Tokenizer and Define Variables.")
    ## by arguments
    if args.lang == 'ko':
        tokenizer = ko.Tokenizer()
    else:
        raise ValueError("Wrong arguments for --lang. Please pass 'ko' for --lang arguments.")
    processed_path = args.path

    ## etc
    emo = emoji.get_emoji_regexp()
    now = datetime.now()

    ## Load data for synthesio
    cols = ['Mention Title', 'Mention Content']
    df = pd.read_parquet('data/Korean.parquet', columns=cols)
    df = df.fillna('')
    docs = [doc for doc in df['Mention Title'] + ' ' + df['Mention Content']]

    print("Tokenize the documents and build the vocab.")
    with Pool(processes=os.cpu_count()) as pool:
        tokenized_docs = pool.map(tokenizer.tokenize, docs)

    token_counts = Counter(list(zip(*chain(*tokenized_docs)))[0]).most_common()
    vocab = Vocab(list_of_tokens=[token for token, count in token_counts
                                  if count >= int(args.min_count)],
                  token_to_idx={'[PAD]': 0, '[UNK]': 1})
    vocab.lexeme['is_Emoji'] = [emo.fullmatch(term) is not None
                                for term in vocab.idx_to_token]
    vocab.lexeme['is_Digit'] = [re.fullmatch(r'[\d\,\.]+', term) is not None
                                for term in vocab.idx_to_token]
    vocab.lexeme['is_Punct'] = [re.fullmatch(rf'[{string.punctuation}]+', term) is not None
                                for term in vocab.idx_to_token]
    print(f"Build the new vocab vocab-size : {len(vocab)}")

    with open(f"{processed_path}/vocab-{now:%Y%m%d}.pkl", 'wb') as f:
        pickle.dump(vocab, f)
def create_vocab(data_path, min_count=3):
    root_rel = None
    wd_vocab = Vocab(min_count, eos=None)
    char_vocab = Vocab(min_count, eos=None)
    tag_vocab = Vocab(eos=None)
    rel_vocab = Vocab(bos=None, eos=None)
    with open(data_path, 'r', encoding='utf-8') as fr:
        for deps in read_deps(fr):
            for dep in deps:
                wd_vocab.add(dep.form)
                char_vocab.add(list(dep.form))
                tag_vocab.add(dep.pos_tag)
                if dep.head != 0:
                    rel_vocab.add(dep.dep_rel)
                elif root_rel is None:
                    root_rel = dep.dep_rel
                    rel_vocab.add(dep.dep_rel)
                elif root_rel != dep.dep_rel:
                    print('root = ' + root_rel + ', rel for root = ' + dep.dep_rel)
    return MultiVocab(
        dict(word=wd_vocab,
             char=char_vocab,
             tag=tag_vocab,
             rel=rel_vocab))
def __init__(self, d_model, attn_dropout=0.1, temper_value=0.5):
    super(ScaledDotProductAttention, self).__init__()
    # add temper as a hyperparameter
    self.temper = np.power(d_model, temper_value)  # 0.5 originally
    self.dropout = nn.Dropout(attn_dropout)
    self.softmax = nn.Softmax(dim=2)

    # this is only used in attention investigation
    # TODO: set it as a flag in runner.py
    vocab_file = 'dataset/vocab/vocab.pkl'
    self.vocab = Vocab(vocab_file, load=True)
    self.tanh = nn.Tanh()
    self.conv = nn.Conv2d(240, kernel_size=1, out_channels=1)
def __init__(self, test=False, data_dir="data", vocab_path='data/vocab'):
    super(Articles, self).__init__()
    '''Initialization'''
    self.vocab = Vocab(vocab_path, voc_size)
    self.tokenizer = data.get_tokenizer('basic_english')
    self.max_len_story = MAX_LEN_STORY
    self.max_len_highlight = MAX_LEN_HIGHLIGHT
    is_test = {
        False: os.path.join(data_dir, "train.pkl"),
        True: os.path.join(data_dir, "test.pkl")
    }
    self.data_path = is_test.get(test, "Wrong set name.")
    with open(self.data_path, 'rb') as f:
        self.data = load(f)
def main(args):
    with open(args.pkl_path, "rb") as f:
        labels = pickle.load(f)
    print("pickle loaded")

    tsv_path = get_eval_path(args.ref)
    dfref = pd.read_table(tsv_path)

    if args.vocab is not None:
        vocab = Vocab(args.vocab)
    else:
        vocab = None

    acc1, acck, cnt = accuracy(labels, dfref, vocab=vocab)
    print(f"{cnt:d} tokens")
    print(f"Accuracy top1: {acc1:.3f} topk: {acck:.3f}")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', required=True)
    parser.add_argument('--vocab', required=True)
    parser.add_argument('--vocab-size', required=True, type=int)
    parser.add_argument('--max-length', required=True, type=int)
    parser.add_argument('--out', required=True)
    args = parser.parse_args()

    word_vocab = Vocab.from_file(path=args.vocab, add_pad=True, add_unk=True,
                                 max_size=args.vocab_size)
    label_dict = {'neutral': 0, 'entailment': 1, 'contradiction': 2}
    label_vocab = Vocab(vocab_dict=label_dict, add_pad=False, add_unk=False)
    data_reader = SNLIDataset(data_path=args.data, word_vocab=word_vocab,
                              label_vocab=label_vocab, max_length=args.max_length)
    with open(args.out, 'wb') as f:
        pickle.dump(data_reader, f)
def build(self, corpus, min_freq=1, embed=None):
    sequences = getattr(corpus, self.name)
    counter = Counter(char for sequence in sequences
                      for token in sequence
                      for char in self.transform(token))
    self.vocab = Vocab(counter, min_freq, self.specials)

    if not embed:
        self.embed = None
    else:
        tokens = self.transform(embed.tokens)
        # if the `unk` token already exists in the pretrained vocab,
        # replace it with the self-defined one
        if embed.unk:
            tokens[embed.unk_index] = self.unk
        self.vocab.extend(tokens)
        self.embed = torch.zeros(len(self.vocab), embed.dim)
        self.embed[self.vocab.token2id(tokens)] = embed.vectors
def run(do_train, do_eval, do_predict, ckpt, get_rouge, max_epochs=100):
    train_set = Articles(test=False)
    test_set = Articles(test=True)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=False, num_workers=1)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=1)

    encoder = Encoder()
    attention_decoder = AttnDecoder()
    model = PointerGenerator(encoder, attention_decoder)
    model.to(device)

    optimizer = torch.optim.Adagrad(model.parameters(), lr=lr)
    loss_function = torch.nn.NLLLoss()

    if ckpt:
        model, optimizer, epoch = load_ckp(checkpoint_path=ckpt, model=model, optimizer=optimizer)
        if do_eval:
            eval(test_loader, model, loss_function)
        elif do_predict:
            vocab = Vocab('data/vocab', voc_size)
            batch = next(iter(train_loader))
            story, highlight = batch
            batcher = Batcher(story, highlight, vocab)
            stories, highlights, extra_zeros, story_extended, highlight_extended, vocab_extended = \
                batcher.get_batch(get_vocab_extended=True)
            stories = stories.to(device)
            highlights = highlights.to(device)
            story_extended = story_extended.to(device)
            extra_zeros = extra_zeros.to(device)
            # stories, highlights = get_random_sentences(test_set, batch_size)
            with torch.no_grad():
                output = model(stories, highlights, story_extended, extra_zeros)
            get_batch_prediction(stories, output, highlights)
        if get_rouge:
            get_rouge_files(model, test_loader)
            get_rouge_score()
    else:
        epoch = 0

    if do_train:
        train(train_loader, test_loader, loss_function, model, optimizer, epoch,
              num_epochs=max_epochs - epoch)
def __init__(self, args):
    super().__init__()
    self.args = args
    self.K = args.K
    self.L = args.L
    self.rnn_hidden = args.rnn_hidden
    self.max_sent_len = args.max_sent_len

    print("loading pretrained emb......")
    self.emb_matrix = np.load(args.dset_dir + '/' + args.dataset + '/embedding.npy')
    print("loading dataset vocab......")
    self.vocab = Vocab(args.dset_dir + '/' + args.dataset + '/vocab.pkl')

    # create embedding layers
    self.emb = nn.Embedding(self.vocab.size, args.emb_dim, padding_idx=constant.PAD_ID)
    self.pos_emb = nn.Embedding(len(constant.POS_TO_ID), args.pos_dim) if args.pos_dim > 0 else None
    # initialize embedding with pretrained word embeddings
    self.init_embeddings()

    # dropout
    self.input_dropout = nn.Dropout(args.input_dropout)

    # define r distribution
    self.r_var = self.K * self.max_sent_len
    self.r_mean = nn.Parameter(torch.randn(self.max_sent_len, self.K))
    self.r_std = nn.Parameter(torch.randn(self.max_sent_len * self.K, self.L))

    # define encoder
    self.BiLSTM = LSTMRelationModel(args)
    self.hidden2mean = nn.Linear(self.rnn_hidden * 2, self.K)
    self.hidden2std = nn.Linear(self.rnn_hidden * 2, self.K)

    # decoder
    self.layer_rc1 = nn.Linear(args.K * 2, args.K)
    self.rc_cla = nn.Linear(args.K, len(constant.LABEL_TO_ID))
    self.layer_ner1 = nn.Linear(args.K, args.K // 2)
    self.ner_cla = nn.Linear(args.K // 2, len(constant.BIO_TO_ID))
    self.logsoft_fn = nn.LogSoftmax(dim=3)