def configuration(cls, plm=None, method='lgesql', table_path='data/tables.json',
                  tables='data/tables.bin', db_dir='data/database'):
    cls.plm, cls.method = plm, method
    cls.grammar = ASDLGrammar.from_filepath(GRAMMAR_FILEPATH)
    cls.trans = TransitionSystem.get_class_by_lang('sql')(cls.grammar)
    cls.tables = pickle.load(open(tables, 'rb')) if type(tables) == str else tables
    cls.evaluator = Evaluator(cls.trans, table_path, db_dir)
    if plm is None:
        cls.word2vec = Word2vecUtils()
        cls.tokenizer = lambda x: x
        cls.word_vocab = Vocab(
            padding=True, unk=True, boundary=True, default=UNK,
            filepath='./pretrained_models/glove.42b.300d/vocab.txt',
            specials=SCHEMA_TYPES)  # word vocab for glove.42B.300d
    else:
        cls.tokenizer = AutoTokenizer.from_pretrained(
            os.path.join('./pretrained_models', plm))
        cls.word_vocab = cls.tokenizer.get_vocab()
    cls.relation_vocab = Vocab(padding=False, unk=False, boundary=False,
                               iterable=RELATIONS, default=None)
    cls.graph_factory = GraphFactory(cls.method, cls.relation_vocab)
def get_dataloader_for_train(args, tokenizer):
    data_path, raw_data_path = args.data_path, args.raw_data_path
    batch_size = args.batch_size
    if args.load_userdict:
        jieba.load_userdict(args.userdict)

    domain_map = Vocab.from_file(os.path.join(data_path, "domains.txt"))
    intent_map = Vocab.from_file(os.path.join(data_path, "intents.txt"))
    slots_map = Vocab.from_file(os.path.join(data_path, "slots.txt"))
    label_vocab = Vocab.from_file(os.path.join(data_path, "label_vocab.txt"))
    bin_label_vocab = Vocab.from_file(os.path.join(data_path, "bin_label_vocab.txt"))

    # train
    all_train_data = []
    train_dom_data = read_all_train_data(
        os.path.join(raw_data_path, "source.json"), tokenizer, domain_map,
        intent_map, slots_map, label_vocab, bin_label_vocab)
    for dom, dom_data in train_dom_data.items():
        all_train_data.extend(dom_data)
    dev_sup_dom_data = read_support_data(
        os.path.join(raw_data_path, "dev", "support"), tokenizer, domain_map,
        intent_map, slots_map, label_vocab, bin_label_vocab)
    for i_dom, dom_data in dev_sup_dom_data.items():
        all_train_data.extend(dom_data)

    dataloader = thdata.DataLoader(dataset=Dataset(all_train_data),
                                   batch_size=batch_size,
                                   shuffle=True,
                                   collate_fn=collate_fn)
    return dataloader
def get_dataloader_for_support(args, tokenizer, sep_dom=False):
    data_path, fin_data_path = args.data_path, args.fin_data_path
    batch_size = args.batch_size
    if args.load_userdict:
        jieba.load_userdict(args.userdict)

    domain_map = Vocab.from_file(os.path.join(data_path, "domains.txt"))
    intent_map = Vocab.from_file(os.path.join(data_path, "intents.txt"))
    slots_map = Vocab.from_file(os.path.join(data_path, "slots.txt"))
    label_vocab = Vocab.from_file(os.path.join(data_path, "label_vocab.txt"))
    bin_label_vocab = Vocab.from_file(os.path.join(data_path, "bin_label_vocab.txt"))

    sup_dom_data = read_support_data(
        os.path.join(fin_data_path, "support"), tokenizer, domain_map,
        intent_map, slots_map, label_vocab, bin_label_vocab)

    if not sep_dom:
        sup_data = []
        for dom_data in sup_dom_data.values():
            sup_data.extend(dom_data)
        suploader = thdata.DataLoader(dataset=Dataset(sup_data),
                                      batch_size=batch_size,
                                      shuffle=True,
                                      collate_fn=collate_fn)
        return suploader
    else:
        suploaders = {}
        for dom, dom_data in sup_dom_data.items():
            suploaders[dom] = thdata.DataLoader(dataset=Dataset(dom_data),
                                                batch_size=batch_size,
                                                shuffle=True,
                                                collate_fn=collate_fn)
        return suploaders
def create_vocab(self):
    if self.is_training:
        if not os.path.exists(self.vocab_file_path):
            print("Creating vocab")
            self.vocab = Vocab(add_bos=False,
                               add_eos=False,
                               add_padding=False,
                               min_count=self.min_count)
            for example in self.dataset:
                self.vocab.add_tokenized_sentence(
                    example['tokens'][:self.train_max_length])
            self.vocab.finish()
            with open(self.vocab_file_path, 'wb') as f:
                pickle.dump(self.vocab, f)
        else:
            with open(self.vocab_file_path, 'rb') as f:
                self.vocab = pickle.load(f)
    else:
        print("Loading vocab")
        with open(self.vocab_file_path, 'rb') as f:
            self.vocab = pickle.load(f)
def main(args): print("Load Tokenizer and Define Variables.") ## by arguments if args.lang == 'ko': tokenizer = ko.Tokenizer() else: raise ValueError( "Wrong arguments for --lang. Please pass 'ko' for --lang arguments." ) processed_path = args.path ## etc emo = emoji.get_emoji_regexp() now = datetime.now() ## Load data for synthesio cols = ['Mention Title', 'Mention Content'] df = pd.read_parquet('data/Korean.parquet', columns=cols) df = df.fillna('') docs = [doc for doc in df['Mention Title'] + ' ' + df['Mention Content']] print("Tokenize the documents and build the vocab.") with Pool(processes=os.cpu_count()) as pool: tokenized_docs = pool.map(tokenizer.tokenize, docs) token_counts = Counter(list(zip(*chain(*tokenized_docs)))[0]).most_common() vocab = Vocab(list_of_tokens=[ token for token, count in token_counts if count >= int(args.min_count) ], token_to_idx={ '[PAD]': 0, '[UNK]': 1 }) vocab.lexeme['is_Emoji'] = [ True if emo.fullmatch(term) != None else False for term in vocab.idx_to_token ] vocab.lexeme['is_Digit'] = [ True if re.fullmatch(r'[\d\,\.]+', term) != None else False for term in vocab.idx_to_token ] vocab.lexeme['is_Punct'] = [ True if re.fullmatch(rf'[{string.punctuation}]+', term) != None else False for term in vocab.idx_to_token ] print(f"Build the new vocab vocab-size : {len(vocab)}") with open(f"{processed_path}/vocab-{now:%Y%m%d}.pkl", 'wb') as f: pickle.dump(vocab, f)
def train(params):
    assert params["mode"].lower() == "train", "change training mode to 'train'"

    vocab = Vocab(params["vocab_path"], params["vocab_size"])
    params['vocab_size'] = vocab.count
    # params["trained_epoch"] = get_train_msg()
    params["learning_rate"] *= np.power(0.9, params["trained_epoch"])

    # Build the model
    print("Building the model ...")
    model = Seq2Seq(params)

    # Set up the checkpoint manager
    checkpoint = tf.train.Checkpoint(Seq2Seq=model)
    checkpoint_manager = tf.train.CheckpointManager(checkpoint, SEQ2SEQ_CKPT,
                                                    max_to_keep=5)
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    if checkpoint_manager.latest_checkpoint:
        print("Restored from {}".format(checkpoint_manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")

    # Train the model
    print("Start training the model ...")
    print("trained_epoch:", params["trained_epoch"])
    print("mode:", params["mode"])
    print("epochs:", params["epochs"])
    print("batch_size:", params["batch_size"])
    print("max_enc_len:", params["max_enc_len"])
    print("max_dec_len:", params["max_dec_len"])
    print("learning_rate:", params["learning_rate"])
    train_model(model, vocab, params, checkpoint_manager)
def build_vocab(df, vocab_path):
    print("building vocab ...")
    vocab_dict = {"<unk>": 1, "<eos>": 2, "<pad>": 3}

    vocab_set = []
    for row in tqdm(df.itertuples()):
        text = row.text.replace(" ", "")  # remove spaces
        phones = pyopenjtalk.g2p(text, join=False)
        # remove pause
        phones = [phone for phone in phones if phone != "pau"]
        for phone in phones:
            if phone not in vocab_set:
                vocab_set.append(phone)

    # alphabetical order
    vocab_set.sort()

    wlines = []
    for v in vocab_set:
        index = len(vocab_dict) + 1
        vocab_dict[v] = index
    for v, index in vocab_dict.items():
        wlines.append(f"{v} {index:d}\n")

    with open(vocab_path, "w", encoding="utf-8") as f:
        f.writelines(wlines)
    print(f"vocabulary saved to {vocab_path}")

    return Vocab(vocab_path)
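# Hedged sketch of the vocab-file layout written by build_vocab above: one
# "token id" line per entry after reserving <unk>/<eos>/<pad>, but with a plain
# token list instead of pyopenjtalk phonemes; write_token_vocab is a hypothetical helper.
def write_token_vocab(tokens, vocab_path):
    vocab_dict = {"<unk>": 1, "<eos>": 2, "<pad>": 3}
    for token in sorted(set(tokens)):
        vocab_dict[token] = len(vocab_dict) + 1  # ids continue after the specials
    with open(vocab_path, "w", encoding="utf-8") as f:
        f.writelines(f"{tok} {idx:d}\n" for tok, idx in vocab_dict.items())


# Example: write_token_vocab(["a", "k", "o"], "vocab.txt")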
def __init__(self, config):
    super(XfmrDecoder, self).__init__()

    self.vocab = Vocab.load(config["vocab_file"])
    with open(config["typelib_file"]) as type_f:
        self.typelib = TypeLibCodec.decode(type_f.read())
        self.typelib = self.typelib.fix()
    self.target_embedding = nn.Embedding(len(self.vocab.subtypes),
                                         config["target_embedding_size"])
    self.target_transform = nn.Linear(
        config["target_embedding_size"] + config["hidden_size"],
        config["hidden_size"],
    )
    # self.cached_decode_mask: Dict[int, torch.Tensor] = {}
    # self.size = torch.zeros(len(self.vocab.types), dtype=torch.long)
    # concat variable encoding and previous target token embedding as input
    decoder_layer = TransformerDecoderLayer(
        config["hidden_size"],
        1,
        config["hidden_size"],
        config["dropout"],
        activation="gelu",
    )
    decoder_norm = LayerNorm(config["hidden_size"])
    self.decoder = TransformerDecoder(decoder_layer, config["num_layers"],
                                      decoder_norm)
    self.output = nn.Linear(config["hidden_size"], len(self.vocab.subtypes))
    self.config: Dict = config
def __init__(self, url: str, config: Optional[Dict] = None, percent: float = 1.0):
    # support wildcards
    urls = sorted(glob.glob(url))
    urls = urls[:int(percent * len(urls))]
    super().__init__(urls)
    if config:
        # annotate example for training
        from utils.vocab import Vocab

        self.vocab = Vocab.load(config["vocab_file"])
        with open(config["typelib_file"]) as type_f:
            self.typelib = TypeLibCodec.decode(type_f.read())
        self.max_src_tokens_len = config["max_src_tokens_len"]
        self.max_num_var = config["max_num_var"]
        annotate = self._annotate
        self.rename = config.get("rename", False)
        # sort = Dataset._sort
        sort = identity
    else:
        # for creating the vocab
        annotate = identity
        sort = identity
    self = (
        self.pipe(Dataset._file_iter_to_line_iter)
        .map(Example.from_json)
        .map(annotate)
        .shuffle(Dataset.SHUFFLE_BUFFER)
        .pipe(sort)
    )
def __init__(self, config):
    super(XfmrDecoder, self).__init__()

    self.vocab = Vocab.load(config["vocab_file"])
    with open(config["typelib_file"]) as type_f:
        self.typelib = TypeLibCodec.decode(type_f.read())
    retype_vocab_size = len(self.vocab.types)
    rename_vocab_size = len(self.vocab.names)
    self.target_embedding = nn.Embedding(
        retype_vocab_size + rename_vocab_size, config["target_embedding_size"]
    )
    self.target_transform = nn.Linear(
        config["target_embedding_size"] + config["hidden_size"],
        config["hidden_size"],
    )
    # concat variable encoding and previous target token embedding as input
    decoder_layer = TransformerDecoderLayer(
        config["hidden_size"],
        1,
        config["hidden_size"],
        config["dropout"],
        activation="gelu",
    )
    decoder_norm = LayerNorm(config["hidden_size"])
    self.decoder = TransformerDecoder(
        decoder_layer, config["num_layers"], decoder_norm
    )
    self.output = nn.Linear(
        config["hidden_size"], retype_vocab_size + rename_vocab_size
    )
    self.mem_mask = config["mem_mask"]
    self.config: Dict = config
    self.retype_vocab_size = retype_vocab_size
def get_dataloader_for_fs_eval(data_path, raw_data_path, eval_domains: list,
                               batch_size, max_sup_ratio, max_sup_size, n_shots,
                               tokenizer, return_suploader=False):
    domain_map = Vocab.from_file(os.path.join(data_path, "domains.txt"))
    intent_map = Vocab.from_file(os.path.join(data_path, "intents.txt"))
    slots_map = Vocab.from_file(os.path.join(data_path, "slots.txt"))
    label_vocab = Vocab.from_file(os.path.join(data_path, "label_vocab.txt"))
    bin_label_vocab = Vocab.from_file(os.path.join(data_path, "bin_label_vocab.txt"))

    # train
    all_train_data = read_all_train_data(
        os.path.join(raw_data_path, "source.json"), tokenizer, domain_map,
        intent_map, slots_map, label_vocab, bin_label_vocab)
    data = {k: v for k, v in all_train_data.items() if k in eval_domains}

    # eval support & query
    fs_data = []
    fs_sup_data = []
    for dom, dom_data in data.items():
        sup_size = max(min(int(max_sup_ratio * len(dom_data)), max_sup_size), n_shots)
        sup_data, qry_data = separate_data_to_support_and_query(dom_data, sup_size)
        dom_data = collect_support_instances(sup_data, qry_data, int(n_shots))
        fs_data.extend(dom_data)
        if return_suploader:
            fs_sup_data.extend(sup_data)

    dataloader = thdata.DataLoader(dataset=Dataset(fs_data),
                                   batch_size=batch_size,
                                   shuffle=False,
                                   collate_fn=collate_fn)
    if return_suploader:
        suploader = thdata.DataLoader(dataset=Dataset(fs_sup_data),
                                      batch_size=batch_size,
                                      shuffle=True,
                                      collate_fn=collate_fn)
        return dataloader, suploader
    else:
        return dataloader
def vocabs_init(train_data: List[str]) -> Vocab:
    print("Constructing vocabularies...", flush=True)
    vocab = Vocab(train_data)
    print('len(vocab): %d' % len(vocab))
    return vocab
def __init__(self, args):
    super().__init__()
    self.args = args
    self.K = args.K
    self.rnn_hidden = args.rnn_hidden
    self.max_sent_len = args.max_sent_len

    print("loading pretrained emb......")
    self.emb_matrix = np.load(args.dset_dir + '/' + args.dataset + '/embedding.npy')
    print("loading dataset vocab......")
    self.vocab = Vocab(args.dset_dir + '/' + args.dataset + '/vocab.pkl')

    # create embedding layers
    self.emb = nn.Embedding(self.vocab.size, args.emb_dim,
                            padding_idx=constant.PAD_ID)
    self.pos_emb = nn.Embedding(len(constant.POS_TO_ID),
                                args.pos_dim) if args.pos_dim > 0 else None

    # initialize embedding with pretrained word embeddings
    self.init_embeddings()

    # dropout
    self.input_dropout = nn.Dropout(args.input_dropout)

    # GRU for P(Trc|S,Y')
    self.GRU_mean_rc = torch.nn.GRUCell(
        len(constant.BIO_TO_ID) + len(constant.LABEL_TO_ID), self.rnn_hidden * 2)
    self.GRU_std_rc = torch.nn.GRUCell(
        len(constant.BIO_TO_ID) + len(constant.LABEL_TO_ID), self.rnn_hidden * 2)

    # GRU for P(Tner|S,Y')
    self.GRU_mean_ner = torch.nn.GRUCell(
        len(constant.BIO_TO_ID) + len(constant.LABEL_TO_ID), self.rnn_hidden * 2)
    self.GRU_std_ner = torch.nn.GRUCell(
        len(constant.BIO_TO_ID) + len(constant.LABEL_TO_ID), self.rnn_hidden * 2)

    # define r
    self.r_mean_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K))
    self.r_std_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K))
    self.r_mean_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K))
    self.r_std_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K))

    # define encoder for the sharing representations S
    self.BiLSTM = LSTMRelationModel(args)

    # classifier
    self.Lr = nn.Linear(4 * self.rnn_hidden, 2 * self.rnn_hidden)
    self.Cr = nn.Linear(2 * self.rnn_hidden, len(constant.LABEL_TO_ID))
    self.Cg = nn.Linear(2 * self.rnn_hidden, len(constant.BIO_TO_ID))

    # Fn
    self.logsoft_fn1 = nn.LogSoftmax(dim=2)
    self.logsoft_fn2 = nn.LogSoftmax(dim=3)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', required=True)
    parser.add_argument('--vocab', required=True)
    parser.add_argument('--vocab-size', required=True, type=int)
    parser.add_argument('--max-length', required=True, type=int)
    parser.add_argument('--out', required=True)
    args = parser.parse_args()

    word_vocab = Vocab.from_file(path=args.vocab, add_pad=True, add_unk=True,
                                 max_size=args.vocab_size)
    label_dict = {'neutral': 0, 'entailment': 1, 'contradiction': 2}
    label_vocab = Vocab(vocab_dict=label_dict, add_pad=False, add_unk=False)

    data_reader = SNLIDataset(data_path=args.data,
                              word_vocab=word_vocab,
                              label_vocab=label_vocab,
                              max_length=args.max_length)
    with open(args.out, 'wb') as f:
        pickle.dump(data_reader, f)
def build(cls, config):
    params = util.update(cls.default_params(), config)
    vocab = Vocab.load(params['vocab_file'])
    model = cls(params['ast_node_encoding_size'], params['hidden_size'],
                params['dropout'], vocab)
    model.config = params

    return model
def __init__(self, config):
    super().__init__()

    self.vocab = vocab = Vocab.load(config['vocab_file'])
    self.src_word_embed = nn.Embedding(len(vocab.source_tokens),
                                       config['source_embedding_size'])
    self.config = config
    self.decoder_cell_init = nn.Linear(config['source_encoding_size'],
                                       config['decoder_hidden_size'])

    if self.config['transformer'] == 'none':
        dropout = config['dropout']
        self.lstm_encoder = nn.LSTM(input_size=self.src_word_embed.embedding_dim,
                                    hidden_size=config['source_encoding_size'] // 2,
                                    num_layers=config['num_layers'],
                                    batch_first=True,
                                    bidirectional=True,
                                    dropout=dropout)
        self.dropout = nn.Dropout(dropout)
    elif self.config['transformer'] == 'bert':
        self.vocab_size = len(self.vocab.source_tokens) + 1
        state_dict = torch.load('saved_checkpoints/bert_2604/bert_pretrained_epoch_23_batch_140000.pth')
        keys_to_delete = ["cls.predictions.bias",
                          "cls.predictions.transform.dense.weight",
                          "cls.predictions.transform.dense.bias",
                          "cls.predictions.transform.LayerNorm.weight",
                          "cls.predictions.transform.LayerNorm.bias",
                          "cls.predictions.decoder.weight",
                          "cls.predictions.decoder.bias",
                          "cls.seq_relationship.weight",
                          "cls.seq_relationship.bias"]
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict['model'].items():
            if k in keys_to_delete:
                continue
            name = k[5:]  # remove `bert.` prefix
            new_state_dict[name] = v
        bert_config = BertConfig(vocab_size=self.vocab_size,
                                 max_position_embeddings=512,
                                 num_hidden_layers=6,
                                 hidden_size=256,
                                 num_attention_heads=4)
        self.bert_model = BertModel(bert_config)
        self.bert_model.load_state_dict(new_state_dict)
    elif self.config['transformer'] == 'xlnet':
        self.vocab_size = len(self.vocab.source_tokens) + 1
        state_dict = torch.load('saved_checkpoints/xlnet_2704/xlnet1_pretrained_epoch_13_iter_500000.pth')
        keys_to_delete = ["lm_loss.weight", "lm_loss.bias"]
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict['model'].items():
            if k in keys_to_delete:
                continue
            if k[:12] == 'transformer.':
                name = k[12:]  # remove `transformer.` prefix
            else:
                name = k
            new_state_dict[name] = v
        xlnet_config = XLNetConfig(vocab_size=self.vocab_size, d_model=256, n_layer=12)
        self.xlnet_model = XLNetModel(xlnet_config)
        self.xlnet_model.load_state_dict(new_state_dict)
    else:
        print("Error! Unknown transformer type '{}'".format(self.config['transformer']))
def covar_analysis(args):
    model = GaussianBilinearModel.load_model(args.model)
    rel_vocab = Vocab.load(args.relation)
    rel_mats = model.relation_mats

    scores = [abs(np.linalg.det(mat)) for mat in rel_mats]
    sort_idxs = np.argsort(scores)[::-1]
    for idx in sort_idxs:
        print('{} : {}'.format(rel_vocab.get_word(idx), scores[idx]))
def build(self, corpus, min_freq=1, embed=None):
    sequences = getattr(corpus, self.name)
    counter = Counter(char for sequence in sequences
                      for token in sequence
                      for char in self.transform(token))
    self.vocab = Vocab(counter, min_freq, self.specials)

    if not embed:
        self.embed = None
    else:
        tokens = self.transform(embed.tokens)
        # if the `unk` token already exists in the pretrained embeddings,
        # replace it with the self-defined one
        if embed.unk:
            tokens[embed.unk_index] = self.unk
        self.vocab.extend(tokens)
        self.embed = torch.zeros(len(self.vocab), embed.dim)
        self.embed[self.vocab.token2id(tokens)] = embed.vectors
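# Hedged sketch of the pretrained-embedding alignment in build() above, using a
# plain dict where the project uses Vocab.extend/token2id (align_pretrained and
# its arguments are assumptions, not the project's API).
import torch


def align_pretrained(token2id, pretrained_tokens, pretrained_vectors, dim):
    # grow the index with pretrained tokens unseen in the corpus
    for tok in pretrained_tokens:
        token2id.setdefault(tok, len(token2id))
    embed = torch.zeros(len(token2id), dim)
    rows = torch.tensor([token2id[tok] for tok in pretrained_tokens])
    embed[rows] = pretrained_vectors  # copy pretrained rows; unseen rows stay zero
    return embed


# Example: align_pretrained({"the": 0}, ["the", "cat"], torch.randn(2, 4), dim=4)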
def create_vocab(data_path):
    wd_vocab = Vocab(min_count=3, bos=None, eos=None)
    lbl_vocab = Vocab(pad=None, unk=None, bos=None, eos=None)

    assert os.path.exists(data_path)
    with open(data_path, 'r', encoding='utf-8') as fin:
        loader = map(lambda x: x.strip().split('|||'), fin)
        for lbl, data_item in loader:
            wds = data_item.strip().split(' ')
            wd_vocab.add(wds)
            lbl_vocab.add(lbl.strip())

    return MultiVocab({'word': wd_vocab, 'label': lbl_vocab})
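# Hedged sketch of the "label ||| sentence" parsing done by create_vocab above,
# with plain Counters standing in for the project's Vocab/MultiVocab
# (count_words_and_labels is a hypothetical helper).
from collections import Counter


def count_words_and_labels(lines, min_count=3):
    word_counts, label_counts = Counter(), Counter()
    for line in lines:
        lbl, data_item = line.strip().split('|||')
        word_counts.update(data_item.strip().split(' '))
        label_counts[lbl.strip()] += 1
    words = [w for w, c in word_counts.items() if c >= min_count]
    return words, sorted(label_counts)


# Example: count_words_and_labels(["pos ||| a fine movie", "neg ||| dull"], min_count=1)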
def __init__(self, args):
    self.args = args
    self.epoch = args.epoch
    self.batch_size = args.batch_size
    self.lr = args.lr
    self.K = args.K
    self.num_avg = args.num_avg
    self.global_iter = 0
    self.global_epoch = 0
    self.log_file = args.log_file

    # Network & Optimizer
    self.toynet = ToyNet(args).cuda()
    self.optim = optim.Adam(self.toynet.parameters(), lr=self.lr)

    self.ckpt_dir = Path(args.ckpt_dir)
    if not self.ckpt_dir.exists():
        self.ckpt_dir.mkdir(parents=True, exist_ok=True)
    self.load_ckpt = args.load_ckpt
    if self.load_ckpt != '':
        self.load_checkpoint(self.load_ckpt)

    # loss function
    self.ner_lossfn = nn.NLLLoss(reduction='sum')
    self.rc_lossfn = nn.BCELoss(reduction='sum')

    # History
    self.history = dict()
    # class loss
    self.history['ner_train_loss1'] = []
    self.history['rc_train_loss1'] = []
    self.history['ner_test_loss1'] = []
    self.history['rc_test_loss1'] = []
    self.history['ner_train_loss2'] = []
    self.history['rc_train_loss2'] = []
    self.history['ner_test_loss2'] = []
    self.history['rc_test_loss2'] = []
    self.history['precision_test'] = []
    self.history['recall_test'] = []
    self.history['F1_test'] = []
    # info loss
    self.history['info_train_loss'] = []
    self.history['info_test_loss'] = []

    # Dataset
    vocab = Vocab(args.dset_dir + '/' + args.dataset + '/vocab.pkl')
    self.data_loader = dict()
    self.data_loader['train'] = Dataloader(
        args.dset_dir + '/' + args.dataset + '/train.json',
        args.batch_size, vars(args), vocab)
    self.data_loader['test'] = Dataloader(
        args.dset_dir + '/' + args.dataset + '/test.json',
        args.batch_size, vars(args), vocab, evaluation=True)
def evaluate_model(evalparams):
    torch.manual_seed(evalparams.seed)
    random.seed(1234)
    if evalparams.cpu:
        evalparams.cuda = False
    elif evalparams.cuda:
        torch.cuda.manual_seed(evalparams.seed)

    # load opt
    print(evalparams.model_dir, evalparams.model)
    # model_file = evalparams.model_dir + "/" + evalparams.model
    model_file = 'best_model.pt'
    print("Loading model from {}".format(model_file))
    opt = torch_utils.load_config(model_file)
    model = RelationModel(opt)
    model.load(model_file)

    # load vocab
    vocab_file = evalparams.model_dir + '/vocab.pkl'
    vocab = Vocab(vocab_file, load=True)
    assert opt['vocab_size'] == vocab.size, \
        "Vocab size must match that in the saved model."

    # load data
    data_file = opt['data_dir'] + '/{}.json'.format(evalparams.dataset)
    print("Loading data from {} with batch size {}...".format(
        data_file, opt['batch_size']))
    batch = DataLoader(data_file, opt['batch_size'], opt, vocab, evaluation=True)

    helper.print_config(opt)
    id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])

    predictions = []
    all_probs = []
    for i, b in enumerate(batch):
        preds, probs, _ = model.predict(b)
        predictions += preds
        all_probs += probs
    predictions = [id2label[p] for p in predictions]
    p, r, f1 = scorer.score(batch.gold(), predictions, verbose=True)

    # save probability scores
    if len(evalparams.out) > 0:
        helper.ensure_dir(os.path.dirname(evalparams.out))
        with open(evalparams.out, 'wb') as outfile:
            pickle.dump(all_probs, outfile)
        print("Prediction scores saved to {}.".format(evalparams.out))

    print("Evaluation ended.")
    return (batch.gold(), predictions, model)
def build(cls, config):
    params = util.update(cls.default_params(), config)
    vocab = Vocab.load(params['vocab_file'])
    model = cls(params['variable_encoding_size'], params['hidden_size'],
                params['dropout'], params['tie_embedding'],
                params['input_feed'], vocab)
    model.config = params

    return model
def load_word_vector(path):
    """
    Load word vectors (this project uses GloVe) and cache the vocabulary and
    vectors to separate files so later calls can read them back directly.
    :param path: GloVe word vector path
    :return: (vocab, vector) where vocab is a Vocab object and vector is a
             numpy array of shape (words_num, word_dim)
    """
    base = os.path.splitext(os.path.basename(path))[0]
    glove_vocab_path = os.path.join('../data/glove/', base + '.vocab')
    glove_vector_path = os.path.join('../data/glove/', base + '.npy')

    # word vectors have already been cached
    if os.path.isfile(glove_vocab_path) and os.path.isfile(glove_vector_path):
        print('======> File found, loading memory <=====!')
        vocab = Vocab(glove_vocab_path)
        vector = np.load(glove_vector_path)
        return vocab, vector

    print('=====>Loading glove word vector<=====')
    # first pass: get the word dimension and count the lines
    with open(path, 'r', encoding='utf8', errors='ignore') as f:
        contents = f.readline().rstrip('\n').split(' ')
        word_dim = len(contents[1:])
        count = 1
        for line in f:
            count += 1

    vocab = [None] * count
    vector = np.zeros((count, word_dim))
    # second pass: read words and vectors
    with open(path, 'r', encoding='utf8', errors='ignore') as f:
        idx = 0
        for line in f:
            contents = line.rstrip('\n').split(' ')
            vocab[idx] = contents[0]
            vector[idx] = np.array(list(map(float, contents[1:])), dtype=float)
            idx += 1
        assert count == idx

    with open(glove_vocab_path, 'w', encoding='utf8', errors='ignore') as f:
        for token in vocab:
            f.write(token + '\n')
    vocab = Vocab(glove_vocab_path)
    # cache with np.save so the np.load above can read it back
    np.save(glove_vector_path, vector)

    return vocab, vector
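# Hedged sketch of the GloVe text parsing in load_word_vector above, done in a
# single pass without the cache files or the project's Vocab class
# (read_glove_txt is a hypothetical helper).
import numpy as np


def read_glove_txt(path):
    words, rows = [], []
    with open(path, 'r', encoding='utf8', errors='ignore') as f:
        for line in f:
            parts = line.rstrip('\n').split(' ')
            words.append(parts[0])
            rows.append(np.asarray(parts[1:], dtype=float))
    return words, np.vstack(rows)


# Example: words, vectors = read_glove_txt('glove.6B.50d.txt')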
def train_model(db_file, entity_db_file, vocab_file, word2vec, **kwargs):
    db = AbstractDB(db_file, 'r')
    entity_db = EntityDB.load(entity_db_file)
    vocab = Vocab.load(vocab_file)

    if word2vec:
        w2vec = ModelReader(word2vec)
    else:
        w2vec = None

    train.train(db, entity_db, vocab, w2vec, **kwargs)
class Articles(torch.utils.data.Dataset):
    def __init__(self, test=False, data_dir="data", vocab_path='data/vocab'):
        super(Articles, self).__init__()
        '''Initialization'''
        self.vocab = Vocab(vocab_path, voc_size)
        self.tokenizer = data.get_tokenizer('basic_english')
        self.max_len_story = MAX_LEN_STORY
        self.max_len_highlight = MAX_LEN_HIGHLIGHT
        is_test = {
            False: os.path.join(data_dir, "train.pkl"),
            True: os.path.join(data_dir, "test.pkl")
        }
        self.data_path = is_test.get(test, "Wrong set name.")
        with open(self.data_path, 'rb') as f:
            self.data = load(f)

    def __len__(self):
        '''return the number of articles'''
        return len(self.data)

    def __getitem__(self, idx):
        '''generates one sample of data'''
        X, y = self.data[idx]['story'], self.data[idx]['highlights']
        X_tokenized, y_tokenized = list(map(lambda x: self.tokenize(x), [X, y]))
        X_padded = self.padding(X_tokenized)
        y_padded = self.padding(y_tokenized, sequence_type="highlight")
        return X_padded, y_padded

    def tokenize(self, sequence):
        '''tokenize a sequence'''
        tokenized_sequence = []
        tokenized_sequence.extend(
            [token for token in self.tokenizer(sequence)])
        tokenized_sequence.append(STOP_TOKEN)
        return tokenized_sequence

    def words_to_index(self, tokenized_sequence):
        '''return list of index of tokens in the sequence'''
        return self.vocab.sequence_2_id(tokenized_sequence)

    def padding(self, sequence, sequence_type="story"):
        '''pad the sequence with the corresponding length'''
        if sequence_type == "story":
            max_len = self.max_len_story
        else:
            max_len = self.max_len_highlight
        if len(sequence) > max_len:
            sequence = sequence[:max_len]
        else:
            sequence += [PAD_TOKEN] * (max_len - len(sequence))
        return sequence
def __init__(self, args):
    super().__init__()
    self.args = args
    self.K = args.K
    self.rnn_hidden = args.rnn_hidden
    self.max_sent_len = args.max_sent_len

    print("loading pretrained emb......")
    self.emb_matrix = np.load(args.dset_dir + '/' + args.dataset + '/embedding.npy')
    print("loading dataset vocab......")
    self.vocab = Vocab(args.dset_dir + '/' + args.dataset + '/vocab.pkl')

    # create embedding layers
    self.emb = nn.Embedding(self.vocab.size, args.emb_dim,
                            padding_idx=constant.PAD_ID)
    self.pos_emb = nn.Embedding(len(constant.POS_TO_ID),
                                args.pos_dim) if args.pos_dim > 0 else None

    # initialize embedding with pretrained word embeddings
    self.init_embeddings()

    # dropout
    self.input_dropout = nn.Dropout(args.input_dropout)

    # define r rc distribution
    self.r_mean_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K))
    self.r_std_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K, self.K))
    self.r_diag_rc = nn.Parameter(torch.randn(self.max_sent_len, self.K))
    # orthogonal initialization of r_std_rc
    for i in range(self.max_sent_len):
        nn.init.orthogonal_(self.r_std_rc[i], gain=1)

    # define r ner distribution
    self.r_mean_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K))
    self.r_std_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K, self.K))
    self.r_diag_ner = nn.Parameter(torch.randn(self.max_sent_len, self.K))
    # orthogonal initialization of r_std_ner
    for i in range(self.max_sent_len):
        nn.init.orthogonal_(self.r_std_ner[i], gain=1)

    # define encoder
    self.BiLSTM = LSTMRelationModel(args)
    self.hidden2mean_rc = nn.Linear(self.rnn_hidden * 2, self.K)
    self.hidden2std_rc = nn.Linear(self.rnn_hidden * 2, self.K)

    # ner encoder
    self.hidden2mean_ner = nn.Linear(self.rnn_hidden * 2, self.K)
    self.hidden2std_ner = nn.Linear(self.rnn_hidden * 2, self.K)

    # decoder
    self.rc_lr = nn.Linear(args.K * 2, args.K)
    self.rc_cla = nn.Linear(args.K, len(constant.LABEL_TO_ID))
    self.ner_cla = nn.Linear(args.K, len(constant.BIO_TO_ID))
    self.logsoft_fn = nn.LogSoftmax(dim=3)

    # mse loss
    self.loss_fn = torch.nn.MSELoss(reduction='sum')
def __init__(self, source_name, target_name, max_length=300,
             source_vocab=None, target_vocab=None):
    self.data_source = self.read_file(source_name)
    self.data_target = self.read_file(target_name)
    self.max_length = max_length

    self.source_vocab = source_vocab
    if source_vocab is None:
        self.source_vocab = Vocab()
        self.source_vocab.build_vocab([source_name])

    self.target_vocab = target_vocab
    if target_vocab is None:
        self.target_vocab = Vocab()
        self.target_vocab.build_vocab([target_name])
def make_vocab_label(self, sents, vocab_label_init=None):
    if len(sents) == 0:
        return None

    if vocab_label_init:
        vocab_label = deepcopy(vocab_label_init)
    else:
        vocab_label = Vocab()
        none_label = 'O'
        vocab_label.add_word(none_label)

    labels = []
    for sent in sents:
        if sent.has_prds:
            for prop in sent.prd_bio_labels:
                labels += prop
    cnt = Counter(labels)
    labels = [(w, c) for w, c in cnt.most_common()]
    for label, count in labels:
        vocab_label.add_word(label)

    return vocab_label
def make_vocab_label(self, sents, vocab_label_init=None):
    if len(sents) == 0:
        return None

    if vocab_label_init:
        vocab_label = deepcopy(vocab_label_init)
    else:
        vocab_label = Vocab()
        if self.argv.data_type == 'conll05':
            core_labels = ["A0", "A1", "A2", "A3", "A4", "A5"]
        else:
            core_labels = ["ARG0", "ARG1", "ARG2", "ARG3", "ARG4", "ARG5"]
        for label in core_labels:
            vocab_label.add_word(label)

    bio_labels = []
    for sent in sents:
        for props in sent.prd_bio_labels:
            bio_labels += props
    cnt = Counter(bio_labels)
    bio_labels = [(w, c) for w, c in cnt.most_common()]
    for label, count in bio_labels:
        if not label.endswith('-V') and len(label) > 1:
            vocab_label.add_word(label[2:])

    return vocab_label
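# Hedged sketch of the BIO-label collection in make_vocab_label above, using a
# plain list instead of the project's Vocab and assuming labels such as
# "B-ARG0"/"I-ARG0"/"B-V"/"O" (collect_core_labels is a hypothetical helper).
from collections import Counter


def collect_core_labels(bio_label_seqs):
    cnt = Counter(lbl for seq in bio_label_seqs for lbl in seq)
    labels = []
    for label, _ in cnt.most_common():
        # drop the verb tag and the "B-"/"I-" prefix, as in make_vocab_label above
        if not label.endswith('-V') and len(label) > 1:
            core = label[2:]
            if core not in labels:
                labels.append(core)
    return labels


# Example: collect_core_labels([["B-ARG0", "I-ARG0", "B-V", "O"]]) returns ["ARG0"]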
def build_vocab(db_file, entity_db_file, out_file, **kwargs):
    db = AbstractDB(db_file, 'r')
    entity_db = EntityDB.load(entity_db_file)

    vocab = Vocab.build(db, entity_db, **kwargs)
    vocab.save(out_file)