def main(args):
    # Keep only the pretrained embeddings for words that actually occur in the
    # dataset, printed in descending order of corpus frequency.
    word_embs = read_embedding(args.source_emb)
    data = read_jsonlines(args.dataset_path, max_rows=0)
    tokenizer = word_tokenizer(args.lowercase, args.normalize_digits)
    words = flatten([read_text(d.text, tokenizer) for d in data])
    word_freq = sorted(Counter(words).items(), key=lambda x: -x[1])
    for word, freq in word_freq:
        if word in word_embs:
            line = [word] + word_embs[word]
            line = ' '.join([str(x) for x in line])
            print(line)
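
For context, the `main` above reads only four attributes from `args` (`source_emb`, `dataset_path`, `lowercase`, `normalize_digits`). A minimal sketch of command-line wiring that could drive it; the flag names and types are inferred from those attribute accesses, not taken from the original script:

import argparse

if __name__ == '__main__':
    # Hypothetical CLI wiring; the original repository's argument parser may differ.
    parser = argparse.ArgumentParser(
        description='Print pretrained embeddings for the words that occur in a '
                    'dataset, ordered by corpus frequency.')
    parser.add_argument('source_emb', help='path to a pretrained embedding file')
    parser.add_argument('dataset_path', help='path to a JSON Lines dataset')
    parser.add_argument('--lowercase', action='store_true',
                        help='lowercase tokens before counting')
    parser.add_argument('--normalize_digits', action='store_true',
                        help='map digits to a placeholder before counting')
    main(parser.parse_args())
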
def main(args):
  tokenizer = word_tokenizer(args.lowercase, args.normalize_digits,
                             separative_tokens=['-', '/'])
  data = read_jsonlines(args.descdata_path)

  words = flatten([tokenizer(d.desc) for d in data])
  word_freq = OrderedDict(sorted(Counter(words).items(), key=lambda x: -x[1]))
  embedding_dict = read_pretrained_emb(word_freq, tokenizer)

  with open(args.emb_target_path, 'w') as f:
    for w, v in embedding_dict.items():
      if not v:
        continue
      line = "%s %s\n" % (w, ' '.join([str(x) for x in v]))
      f.write(line)
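
Both snippets above lean on small project-local helpers (`read_jsonlines`, `flatten`, and record objects that allow attribute access such as `d.text` / `d.desc`). A rough, self-contained sketch of what those helpers might look like, inferred from how they are called here rather than from the repository itself:

import json

class DotDict(dict):
    # The snippets access record fields with attribute syntax (d.text, d.desc),
    # so a small attribute-style dict wrapper is assumed here.
    __getattr__ = dict.get

def read_jsonlines(path, max_rows=0):
    # Read one JSON object per line; max_rows == 0 is treated as "no limit"
    # here (an assumption based on how the callers above use it).
    data = []
    with open(path) as f:
        for i, line in enumerate(f):
            if max_rows and i >= max_rows:
                break
            data.append(DotDict(json.loads(line)))
    return data

def flatten(list_of_lists):
    # Concatenate a list of lists into a single flat list.
    return [x for sublist in list_of_lists for x in sublist]
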
Example #3
    def __init__(self, config, vocab, mask_link_in_test=True):
        self.vocab = vocab
        # Load the property records (keyed by Wikidata QID) and build the
        # relation vocabulary from them.
        properties_path = os.path.join(config.source_dir, config.prop_data)
        self.properties = OrderedDict([
            (d['qid'], d) for d in read_jsonlines(properties_path)
        ])
        self.vocab.rel = WikiP2DRelVocabulary(self.properties.values(),
                                              start_vocab=[_UNK])
        # Build the train/valid/test splits; the train split masks links
        # according to config.mask_link, while valid/test follow
        # mask_link_in_test.
        self.train = self.dataset_class(config, config.filename.train, vocab,
                                        config.max_rows.train, self.properties,
                                        config.mask_link)

        self.valid = self.dataset_class(config, config.filename.valid, vocab,
                                        config.max_rows.valid, self.properties,
                                        mask_link_in_test)
        self.test = self.dataset_class(config, config.filename.test, vocab,
                                       config.max_rows.test, self.properties,
                                       mask_link_in_test)
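
The constructor above touches only a handful of config fields (`source_dir`, `prop_data`, `filename.{train,valid,test}`, `max_rows.{train,valid,test}`, `mask_link`). Purely as an illustration, a stub built from `SimpleNamespace` that would satisfy those accesses might look like the following; the paths and limits are placeholders, not values from the original project:

from types import SimpleNamespace

config = SimpleNamespace(
    source_dir='dataset/wikiP2D',              # placeholder directory
    prop_data='properties.jsonlines',          # placeholder filename
    filename=SimpleNamespace(train='train.jsonlines',
                             valid='valid.jsonlines',
                             test='test.jsonlines'),
    max_rows=SimpleNamespace(train=0, valid=0, test=0),  # assuming 0 means no cap
    mask_link=True,
)
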
Example #4
def load_data(self, source_path, max_rows):
    # Read the JSON Lines records (optionally capped by max_rows), preprocess
    # each one, and turn the result into training examples.
    sys.stdout.write("Loading dataset from '%s'... \n" % source_path)
    data = read_jsonlines(source_path, max_rows=max_rows)
    data = [self.preprocess(d) for d in data]
    self.data = self.create_examples(data)
Example #5
def load_data(self):
    # Same pattern as above, except that one article can yield several
    # entries, so the per-article results are flattened into a single list.
    sys.stderr.write("Loading wikiP2D dataset from '%s'... \n" %
                     self.source_path)
    data = read_jsonlines(self.source_path, max_rows=self.max_rows)
    data = [self.preprocess(d) for d in data]
    self.data = flatten([self.article2entries(d) for d in data])