def create_vocab(data, cfg, dataset_dir):
    print('[*] Creating word vocab')
    words = Counter()
    for m, d in data.items():
        bar = tqdm(d, desc='[*] Collecting word tokens from {} data'.format(m),
                   dynamic_ncols=True)
        for dd in bar:
            words.update([w.lower() for w in dd['text']])
        bar.close()
    tokens = [w for w, _ in words.most_common(cfg.word.size)]
    word_vocab = Vocab(tokens, **cfg.word)
    word_vocab_path = (dataset_dir / 'word.pkl')
    with word_vocab_path.open(mode='wb') as f:
        pickle.dump(word_vocab, f)
    print('[-] Word vocab saved to {}\n'.format(word_vocab_path))

    print('[*] Creating char vocab')
    char_vocab = Vocab(list(string.printable), **cfg.char)
    char_vocab_path = (dataset_dir / 'char.pkl')
    with char_vocab_path.open(mode='wb') as f:
        pickle.dump(char_vocab, f)
    print('[-] Char vocab saved to {}\n'.format(char_vocab_path))

    return word_vocab, char_vocab
def load_word_vectors(path):
    if os.path.isfile(path + '.pth') and os.path.isfile(path + '.vocab'):
        print('==> File found, loading to memory')
        vectors = torch.load(path + '.pth')
        vocab = Vocab(filename=path + '.vocab')
        return vocab, vectors
    # saved file not found, read from txt file
    # and create tensors for word vectors
    print('==> File not found, preparing, be patient')
    count = sum(1 for line in open(path + '.txt', 'r', encoding='utf8', errors='ignore'))
    with open(path + '.txt', 'r', encoding='utf8', errors='ignore') as f:
        contents = f.readline().rstrip('\n').split(' ')
        dim = len(contents[1:])
    words = [None] * count
    vectors = torch.zeros(count, dim, dtype=torch.float, device='cpu')
    with open(path + '.txt', 'r', encoding='utf8', errors='ignore') as f:
        idx = 0
        for line in f:
            contents = line.rstrip('\n').split(' ')
            words[idx] = contents[0]
            values = list(map(float, contents[1:]))
            vectors[idx] = torch.tensor(values, dtype=torch.float, device='cpu')
            idx += 1
    with open(path + '.vocab', 'w', encoding='utf8', errors='ignore') as f:
        for word in words:
            f.write(word + '\n')
    vocab = Vocab(filename=path + '.vocab')
    torch.save(vectors, path + '.pth')
    return vocab, vectors
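# Usage sketch: the path prefix below is hypothetical, not from the original
# project. The loader reads '<prefix>.txt' once, then caches '<prefix>.pth' and
# '<prefix>.vocab' so later calls take the fast path at the top of the function.
glove_vocab, glove_emb = load_word_vectors('data/glove/glove.840B.300d')
print(glove_vocab.size(), glove_emb.shape)  # vocab size, (num_words, dim)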
def build_vocab(self):
    utils.build_vocab([self.path], QAConfig.vocab)
    return Vocab(filename=QAConfig.vocab,
                 data=[Constants.PAD_WORD, Constants.UNK_WORD,
                       Constants.BOS_WORD, Constants.EOS_WORD])
def create_vocab(data, cfg):
    print('[*] Creating word vocab')
    words = Counter()
    bar = tqdm(data, desc='[*] Collecting word tokens', dynamic_ncols=True)
    for dd in bar:
        words.update([w for w in dd])
    bar.close()
    tokens = [w for w, _ in words.most_common(cfg.word.size)]
    word_vocab = Vocab(tokens, **cfg.word)
    char_vocab = Vocab(list(string.printable), **cfg.char)
    return word_vocab, char_vocab
def __init__(self, trainset_path, testset_path, vocab_path, dataset_name='',
             remove_entity_mention=False, remove_stop_words=False):
    self.config = config[dataset_name]
    self.train_set, self.train_corpus = self.load_dataset(
        trainset_path, remove_entity_mention, remove_stop_words)
    self.test_set, self.test_corpus = self.load_dataset(
        testset_path, remove_entity_mention, remove_stop_words)
    self.corpus = self.train_corpus + self.test_corpus
    # if not os.path.isfile(vocab_path):
    #     self.__build_vocab(self.corpus, vocab_path)
    if os.path.isfile(vocab_path):
        self.vocab = Vocab(filename=vocab_path, data=['<ukn>', '<ent>', '<num>'])
        self.unknown = self.vocab.getIndex('<ukn>')
        self.word_vectorizer = Glove(self.vocab, config['glove_path'],
                                     self.config['emb'])
        for qa_row in self.train_set + self.test_set:
            for relation in qa_row.sparql.relations:
                relation.coded = self.decode(relation)
        # self.__update_relations_emb()
        self.coded_train_corpus = [
            [self.vocab.getIndex(word, self.unknown) for word in tokens]
            for tokens in self.train_corpus
        ]
        self.coded_test_corpus = [
            [self.vocab.getIndex(word, self.unknown) for word in tokens]
            for tokens in self.test_corpus
        ]
    self.vocab_path = vocab_path
    self.one_hop = None
    if os.path.isfile(self.config['entity_one_hop']):
        with open(self.config['entity_one_hop'], 'rb') as f:
            self.one_hop = pk.load(f)
def create_vocab(data, cfg, dataset_dir):
    print('[*] Creating word vocab')
    dict_words = Counter()
    for m, d in data.items():
        for words in tqdm(d, desc='[*] creating word vocab', dynamic_ncols=True):
            dict_words.update(words)
    # keep only words that occur more than 3 times
    dict_words = Counter({word: count for word, count in dict_words.items() if count > 3})
    tokens = [w for w, _ in dict_words.most_common(cfg.word.size)]
    word_vocab = Vocab(tokens, **cfg.word)
    print('[*] The word vocabulary size is {}'.format(len(word_vocab)))
    word_vocab_path = (dataset_dir / 'word.pkl')
    with word_vocab_path.open(mode='wb') as f:
        pickle.dump(word_vocab, f)
    print('[-] Word vocab saved to {}\n'.format(word_vocab_path))

    print('[*] Creating char vocab')
    dict_chars = Counter()
    for m, d in data.items():
        for words in tqdm(d, desc='[*] creating char vocab', dynamic_ncols=True):
            for word in words:
                if word == '<BOS>' or word == '<EOS>':
                    dict_chars.update([word])
                    continue
                dict_chars.update(word)
    # keep only chars that occur more than 1000 times
    dict_chars = Counter({char: count for char, count in dict_chars.items() if count > 1000})
    tokens = [c for c, _ in dict_chars.most_common(cfg.char.size)]
    char_vocab = Vocab(tokens, **cfg.char)
    print('[*] The char vocabulary size is {}'.format(len(char_vocab)))
    char_vocab_path = (dataset_dir / 'char.pkl')
    with char_vocab_path.open(mode='wb') as f:
        pickle.dump(char_vocab, f)
    print('[-] Char vocab saved to {}\n'.format(char_vocab_path))

    return word_vocab, char_vocab
def load_word_vectors(self, path):
    """
    Load GloVe word vectors.
    If a cached .pth file is found, load that; otherwise read the .txt file
    and save the resulting tensors for next time.
    :param path: path prefix of the GloVe files (without extension)
    :return: (vocab, vectors)
    """
    if os.path.isfile(path + '.pth') and os.path.isfile(path + '.vocab'):
        print('==> File found, loading to memory')
        vectors = torch.load(path + '.pth')
        vocab = Vocab(filename=path + '.vocab')
        return vocab, vectors
    # saved file not found, read from txt file
    # and create tensors for word vectors
    print('==> File not found, preparing, be patient')
    print(path + '.txt')
    count = sum(1 for line in open(path + '.txt', encoding='utf-8'))
    with open(path + '.txt', 'r', encoding='utf-8') as f:
        contents = f.readline().rstrip('\n').split(' ')
        dim = len(contents[1:])
    words = [None] * count
    vectors = torch.zeros(count, dim)
    with open(path + '.txt', 'r', encoding='utf-8') as f:
        idx = 0
        for line in f:
            contents = line.rstrip('\n').split(' ')
            words[idx] = contents[0]
            vectors[idx] = torch.Tensor(list(map(float, contents[1:])))
            idx += 1
    with open(path + '.vocab', 'w', encoding='utf-8') as f:
        for word in words:
            f.write(word + '\n')
    vocab = Vocab(filename=path + '.vocab')
    torch.save(vectors, path + '.pth')
    return vocab, vectors
class Base_Dataset:
    def __init__(self, trainset_path, testset_path, vocab_path, dataset_name='',
                 remove_entity_mention=False, remove_stop_words=False):
        self.config = config[dataset_name]
        self.train_set, self.train_corpus = self.load_dataset(
            trainset_path, remove_entity_mention, remove_stop_words)
        self.test_set, self.test_corpus = self.load_dataset(
            testset_path, remove_entity_mention, remove_stop_words)
        self.corpus = self.train_corpus + self.test_corpus
        # if not os.path.isfile(vocab_path):
        #     self.__build_vocab(self.corpus, vocab_path)
        if os.path.isfile(vocab_path):
            self.vocab = Vocab(filename=vocab_path, data=['<ukn>', '<ent>', '<num>'])
            self.unknown = self.vocab.getIndex('<ukn>')
            self.word_vectorizer = Glove(self.vocab, config['glove_path'],
                                         self.config['emb'])
            for qa_row in self.train_set + self.test_set:
                for relation in qa_row.sparql.relations:
                    relation.coded = self.decode(relation)
            # self.__update_relations_emb()
            self.coded_train_corpus = [
                [self.vocab.getIndex(word, self.unknown) for word in tokens]
                for tokens in self.train_corpus
            ]
            self.coded_test_corpus = [
                [self.vocab.getIndex(word, self.unknown) for word in tokens]
                for tokens in self.test_corpus
            ]
        self.vocab_path = vocab_path
        self.one_hop = None
        if os.path.isfile(self.config['entity_one_hop']):
            with open(self.config['entity_one_hop'], 'rb') as f:
                self.one_hop = pk.load(f)

    def decode(self, relation, max_length=3):
        idxs = self.vocab.convertToIdx(
            map(str.lower, relation.tokens[:max_length]), self.unknown)
        length = len(idxs)
        if len(idxs) < max_length:
            idxs = idxs + [0] * (max_length - len(idxs))
        return torch.LongTensor(idxs), length

    def load_dataset(self, dataset_path, remove_entity_mention, remove_stop_words):
        return [], []

    def __load_candidate_relations(self):
        vocab = set()
        # if not os.path.exists(self.config['rel2id']):
        #     for qa_row in self.train_set + self.test_set:
        #         for relation in qa_row.sparql.relations:
        #             vocab |= set(map(str.lower, relation.tokens))
        #     return vocab
        with open(self.config['rel2id'], 'rb') as f_h:
            rel2id = pk.load(f_h, encoding='latin1')
        for item_id, item in rel2id.items():
            words = [word.lower().replace('.', '') for word in item[2]]
            vocab |= set(words)
        if os.path.isfile(self.config['entity_one_hop']):
            with open(self.config['entity_one_hop'], 'rb') as f:
                one_hop = pk.load(f)
            print(len(vocab))
            for entity, uris in one_hop.items():
                for idx in range(len(uris)):
                    uri, label = uris[idx][:2]
                    label = re.sub(r"([A-Z])", r" \1",
                                   label).replace('_', ' ').replace('.', ' ')
                    words = list(map(str.lower, label.split(' ')))
                    vocab |= set(words)
            print(len(vocab))
        return vocab

    def __update_relations_emb(self):
        emb_shape = self.word_vectorizer.emb.shape
        emb = nn.Embedding(emb_shape[0], emb_shape[1], padding_idx=0, sparse=False)
        emb.weight.data.copy_(self.word_vectorizer.emb)
        if torch.cuda.is_available():
            emb.cuda()
        with open(self.config['rel2id'], 'rb') as f_h:
            rel2id = pk.load(f_h, encoding='latin1')
        ## Need to fix cases where there are non-alphabet chars in the label
        max_length = 3
        for item_id, item in rel2id.items():
            if len(item[2]) > max_length:
                idxs = []
            else:
                idxs = [
                    self.vocab.getIndex(
                        word.lower().replace('.', '')
                        if not word.replace('.', '').replace('(', '').isdigit()
                        else '<num>')
                    for word in item[2]
                ]
                idxs = [id for id in idxs if id is not None]
            length = len(idxs)
            if length == 0:
                length = 1
            if len(idxs) < max_length:
                idxs = idxs + [0] * (max_length - len(idxs))
            idxs = torch.LongTensor(idxs)
            item[5] = idxs
            if len(item) == 6:
                item.append(length)
            else:
                item[6] = length
        with open(self.config['rel2id'], 'wb') as f_h:
            pk.dump(rel2id, f_h)

    def __build_vocab(self, lines, vocab_path):
        vocab = set()
        for tokens in lines:
            vocab |= set(tokens)
        relations_vocab = self.__load_candidate_relations()
        vocab |= relations_vocab
        vocab = [
            w for w in vocab
            if not w.replace('.', '').replace('(', '').isdigit()
        ]
        if '<ent>' in vocab:
            vocab.remove('<ent>')
        with open(vocab_path, 'w', encoding='utf-8') as f:
            for token in sorted(vocab):
                f.write(token + '\n')

    def find_one_hop_relations(self, entities):
        extra_candidates = []
        if self.one_hop is not None:
            for entity in entities:
                if entity in self.one_hop:
                    extra_candidates.extend(self.one_hop[entity])
        return extra_candidates
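# Hypothetical subclass sketch (the class name and parsing details are
# illustrative, not part of the original code): concrete datasets are expected
# to override load_dataset(), which in Base_Dataset is just a stub returning
# ([], []); it should return the QA rows plus the tokenized corpus that
# __init__ then encodes against the vocab.
class ExampleQADataset(Base_Dataset):
    def load_dataset(self, dataset_path, remove_entity_mention, remove_stop_words):
        qa_rows, corpus = [], []
        # parse `dataset_path` here, filling qa_rows and the per-question token lists
        return qa_rows, corpus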
logger.error('Sparsity and weight decay are incompatible, pick one!')
exit()

# debugging args
logger.debug(args)
# set seed for reproducibility
torch.manual_seed(args.seed)
random.seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)
    torch.backends.cudnn.benchmark = True

# get vocab object from vocab file previously written
imdb_vocab_file = classificationConfig.vocab
vocab = Vocab(filename=imdb_vocab_file,
              data=[Constants.PAD_WORD, Constants.UNK_WORD,
                    Constants.BOS_WORD, Constants.EOS_WORD])
logger.debug('==> imdb vocabulary size : %d ' % vocab.size())

emb_file = classificationConfig.embed
emb = torch.load(emb_file)

## build TreeLSTM model
tree_model = TreeLSTM(vocab.size(), args.input_dim, args.mem_dim,
                      args.hidden_dim, args.num_classes, args.sparse,
                      args.freeze_embed, device)
criterion = nn.CrossEntropyLoss()
tree_model.to(device), criterion.to(device)
tree_model.emb.weight.data.copy_(emb)

with open('%s.pt' % os.path.join(args.save, args.expname), 'rb') as f:
    tree_model.load_state_dict(torch.load(f)['model'])
logger.error('Sparsity and weight decay are incompatible, pick one!')
exit()

# debugging args
logger.debug(args)
# set seed for reproducibility
torch.manual_seed(args.seed)
random.seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)
    torch.backends.cudnn.benchmark = True

# get vocab object from vocab file previously written
imdb_vocab_file = classificationConfig.vocab
vocab = Vocab(filename=imdb_vocab_file,
              data=[Constants.PAD_WORD, Constants.UNK_WORD,
                    Constants.BOS_WORD, Constants.EOS_WORD])
logger.debug('==> imdb vocabulary size : %d ' % vocab.size())

emb_file = classificationConfig.embed
emb = torch.load(emb_file)

# dev_dir = classificationConfig.token_file_labels[1]
# dev_file = os.path.join(Global.external_tools, 'imdb_end2end_dev.pth')
# if os.path.isfile(dev_file):
#     dev_data = torch.load(dev_file)
# else:
#     dev_data = CommonDataset(dev_dir, vocab, device)
#     torch.save(dev_data, dev_file)
# logger.debug('==> Size of dev data : %d ' % len(dev_data))
## build vocab
token_files = []
for k in ['pos', 'neg']:
    token_files.extend([
        os.path.join(token_file_label, k + ".json")
        for token_file_label in classificationConfig.token_file_labels
    ])
# imdb_vocab_file = os.path.join(args.data, 'imdb.vocab')
print('token_files', token_files)
imdb_vocab_file = classificationConfig.vocab
utils.build_vocab(token_files, imdb_vocab_file)

# get vocab object from vocab file previously written
vocab = Vocab(filename=imdb_vocab_file,
              data=[Constants.PAD_WORD, Constants.UNK_WORD,
                    Constants.BOS_WORD, Constants.EOS_WORD])
logger.debug('==> imdb vocabulary size : %d ' % vocab.size())

## build embedding of vocab
# for words common to dataset vocab and GLOVE, use GLOVE vectors
# for other words in dataset vocab, use random normal vectors
# emb_file = os.path.join(Global.external_tools, 'imdb_embed.pth')
emb_file = classificationConfig.embed
if os.path.isfile(emb_file):
    emb = torch.load(emb_file)
else:
    # load glove embeddings and vocab
    glove_vocab, glove_emb = utils.load_word_vectors(classificationConfig.glove)
with open(config['dbpedia']['relations'], 'r', encoding='utf-8') as file_handler:
    for line in tqdm(file_handler):
        json_object = json.loads(line)['_source']
        uri = json_object['uri']
        if 'http://dbpedia.org/' in uri:
            uri = URI(uri)
            vocab |= set(uri.tokens)
print(len(vocab))
vocab_list = [URI.normalize(word) for word in vocab]
vocab = set([word for words in vocab_list for word in words])
with open(config['vocab'], 'w', encoding='utf-8') as f:
    for token in sorted(vocab):
        f.write(token + '\n')

vocab = Vocab(config['vocab'], data=['<ukn>', '<ent>', '<num>'])
word_vectorizer = Glove(vocab, config['glove_path'], config['emb'])
coded_labels = {}
max_length = 3
with open(config['dbpedia']['relations'], 'r', encoding='utf-8') as file_handler:
    for line in tqdm(file_handler):
        json_object = json.loads(line)['_source']
        uri = json_object['uri']
        if 'http://dbpedia.org/' in uri:
            uri = URI(uri)
            if uri.raw_uri not in coded_labels:
                idxs = vocab.convertToIdx(uri.tokens, '')[:max_length]
                length = len(idxs)
                if len(idxs) < max_length:
    config['dbpedia']['relations'], index_name=args.index_name)

bulk_data = []
manual_list = [{
    'uri': 'http://dbpedia.org/ontology/TelevisionShow',
    'label': 'show'
}]
for item in manual_list:
    data_dict = {
        'key': item['uri'],
        'dtype': 'uri',
        'label': item['label']
    }
    op_dict = {
        "index": {
            "_index": args.index_name,
            "_type": 'resources'
        }
    }
    bulk_data.append(op_dict)
    bulk_data.append(data_dict)
e.bulk_indexing(args.index_name, delete_index=False,
                index_config=index_config, bulk_data=bulk_data)

vocab = Vocab(filename=config['lc_quad']['vocab'], data=['<ent>', '<num>'])
print(e.search_index(args.search, args.index_name, size=args.size))