from collections import Counter

def main(args):
  """Filter a pretrained embedding file down to the vocabulary of the
  dataset, printing kept vectors in descending word-frequency order.
  (read_embedding, read_jsonlines, word_tokenizer, read_text and flatten
  are project-local helpers; max_rows=0 is taken to mean "no row limit".)
  """
  word_embs = read_embedding(args.source_emb)
  data = read_jsonlines(args.dataset_path, max_rows=0)
  tokenizer = word_tokenizer(args.lowercase, args.normalize_digits)
  words = flatten([read_text(d.text, tokenizer) for d in data])
  # Sort (word, freq) pairs by descending frequency.
  word_freq = sorted(Counter(words).items(), key=lambda x: -x[1])
  for word, freq in word_freq:
    if word in word_embs:
      line = [word] + word_embs[word]
      print(' '.join(str(x) for x in line))
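# A minimal driver sketch showing the flags main() above assumes; the real
# entry point is not shown here, so argument names are inferred from the
# attribute accesses on `args`.
if __name__ == '__main__':
  import argparse
  parser = argparse.ArgumentParser()
  parser.add_argument('source_emb', help='path to the pretrained embedding file')
  parser.add_argument('dataset_path', help='path to the jsonlines dataset')
  parser.add_argument('--lowercase', action='store_true')
  parser.add_argument('--normalize_digits', action='store_true')
  main(parser.parse_args())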
from collections import Counter, OrderedDict

def main(args):
  """Build an embedding file restricted to words occurring in entity
  descriptions, writing one "word v1 v2 ... vd" line per word that has a
  pretrained vector. (word_tokenizer, read_jsonlines, flatten and
  read_pretrained_emb are project-local helpers.)
  """
  tokenizer = word_tokenizer(args.lowercase, args.normalize_digits,
                             separative_tokens=['-', '/'])
  data = read_jsonlines(args.descdata_path)
  counts = Counter(flatten([tokenizer(d.desc) for d in data]))
  word_freq = OrderedDict(sorted(counts.items(), key=lambda x: -x[1]))
  embedding_dict = read_pretrained_emb(word_freq, tokenizer)
  with open(args.emb_target_path, 'w') as f:
    for w, v in embedding_dict.items():
      if not v:  # skip words with no pretrained vector
        continue
      f.write("%s %s\n" % (w, ' '.join(str(x) for x in v)))
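# `flatten` (used in both scripts above) is assumed to be the usual one-level
# list flattener; a minimal sketch consistent with those call sites:
def flatten(list_of_lists):
  # [[a, b], [c]] -> [a, b, c]
  return [item for sublist in list_of_lists for item in sublist]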
def __init__(self, config, vocab, mask_link_in_test=True):
  self.vocab = vocab
  properties_path = os.path.join(config.source_dir, config.prop_data)
  # Keep properties in file order, keyed by qid.
  self.properties = OrderedDict(
    (d['qid'], d) for d in read_jsonlines(properties_path))
  self.vocab.rel = WikiP2DRelVocabulary(self.properties.values(),
                                        start_vocab=[_UNK])
  # Link masking in train follows the config; valid/test use
  # mask_link_in_test (True by default).
  self.train = self.dataset_class(config, config.filename.train, vocab,
                                  config.max_rows.train, self.properties,
                                  config.mask_link)
  self.valid = self.dataset_class(config, config.filename.valid, vocab,
                                  config.max_rows.valid, self.properties,
                                  mask_link_in_test)
  self.test = self.dataset_class(config, config.filename.test, vocab,
                                 config.max_rows.test, self.properties,
                                 mask_link_in_test)
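# __init__ above expects a config exposing nested attribute access
# (config.filename.train, config.max_rows.train, ...); a sketch of an
# equivalent object built from plain namespaces (all values illustrative):
from types import SimpleNamespace

config = SimpleNamespace(
  source_dir='dataset/source',       # hypothetical directory
  prop_data='properties.jsonlines',  # hypothetical filename
  mask_link=True,
  filename=SimpleNamespace(train='train.jsonlines',
                           valid='valid.jsonlines',
                           test='test.jsonlines'),
  max_rows=SimpleNamespace(train=0, valid=0, test=0),
)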
def load_data(self, source_path, max_rows):
  # Progress goes to stderr (as in the wikiP2D loader below) so it does not
  # mix with any data written to stdout.
  sys.stderr.write("Loading dataset from '%s'...\n" % source_path)
  data = read_jsonlines(source_path, max_rows=max_rows)
  data = [self.preprocess(d) for d in data]
  self.data = self.create_examples(data)
def load_data(self):
  sys.stderr.write("Loading wikiP2D dataset from '%s'...\n" % self.source_path)
  data = read_jsonlines(self.source_path, max_rows=self.max_rows)
  data = [self.preprocess(d) for d in data]
  # Each article can yield multiple entries; flatten them into one list.
  self.data = flatten([self.article2entries(d) for d in data])
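# load_data() above defers to hooks (preprocess, article2entries) that
# concrete dataset classes implement. A toy sketch of that contract; the
# class name and record fields here are hypothetical:
class ToyEntryDataset:
  def preprocess(self, article):
    # Normalize one raw record before entry extraction.
    article['sentences'] = [s.lower() for s in article['sentences']]
    return article

  def article2entries(self, article):
    # One article may yield several entries; load_data() flattens the
    # per-article lists into a single self.data list.
    return [(s, article['qid']) for s in article['sentences']]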