Example #1
  def init_vocab(self, emb_configs, vocab_size):
    # Load several pretrained embeddings and concatenate them.
    pretrained = [self.load(c['path'], vocab_size, c['size'], c['skip_first']) for c in emb_configs]
    rev_vocab = common.flatten([list(e.keys()) for e in pretrained])
    rev_vocab = self.start_vocab + rev_vocab[:vocab_size]
    vocab = collections.OrderedDict()
    for i,t in enumerate(rev_vocab):
      vocab[t] = i

    # Merge pretrained embeddings.
    if self.normalize_embedding:
      # Normalize the pretrained embeddings for each of the embedding types.
      embeddings = [common.flatten([common.normalize_vector(emb[w]) for emb in pretrained]) for w in vocab]
    else:
      embeddings = [common.flatten([emb[w] for emb in pretrained]) for w in vocab]

    # Tokens in START_VOCAB could be randomly initialized instead of using
    # the pretrained vectors; that step is currently disabled.
    #rand_gen = random_embedding_generator(len(embeddings[0]))
    #for i in range(len(self.start_vocab)):
    #  embeddings[i] = rand_gen()

    embeddings = np.array(embeddings)
    sys.stderr.write("Done loading word embeddings.\n")

    return vocab, rev_vocab, embeddings
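
A standalone sketch of the concatenation step above, using toy data; the inner list comprehension stands in for common.flatten, which is assumed to concatenate a list of lists:

# Toy pretrained sets: a 2-d and a 3-d embedding for the same two words.
pretrained = [
    {'cat': [0.1, 0.2], 'dog': [0.3, 0.4]},
    {'cat': [1.0, 1.1, 1.2], 'dog': [2.0, 2.1, 2.2]},
]
vocab = ['cat', 'dog']
# Each word's vectors are joined end-to-end into one 5-d vector.
embeddings = [[x for emb in pretrained for x in emb[w]] for w in vocab]
assert embeddings[0] == [0.1, 0.2, 1.0, 1.1, 1.2]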
Example #2
def read_text(text, tokenizer):
    if isinstance(text, list):
        assert isinstance(text[0], str)
        words = flatten(tokenizer(sent) for sent in text)
    else:
        words = tokenizer(text)
    return words
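
A hedged usage sketch, assuming read_text and its flatten helper are importable; the plain str.split tokenizer is only a stand-in for the project's word_tokenizer:

tokenizer = str.split                         # stand-in whitespace tokenizer
read_text("a b c", tokenizer)                 # -> ['a', 'b', 'c']
read_text(["a b", "c d"], tokenizer)          # -> ['a', 'b', 'c', 'd']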
Example #3
    def merge(self,
              pretrained,
              vocab_size,
              vocab_merge_type='union',
              embedding_merge_type='first_found'):
        '''
    <Args>
    - pretrained: A list of dictionaries, {word: vector}.
    - vocab_merge_type: ['union', 'intersection']
    - embedding_merge_type: ['first_found', 'average', 'concat']
    '''
        if vocab_merge_type != 'union':
            raise NotImplementedError

        if embedding_merge_type != 'first_found':
            raise NotImplementedError

        # Union of all vocabularies, preserving first-seen order.
        rev_vocab = list(
            OrderedSet(common.flatten([list(v.keys()) for v in pretrained])))
        embedding_size = len(pretrained[0][rev_vocab[0]])
        embedding_dict = defaultdict(zero_embedding_generator(embedding_size))
        for w in rev_vocab:
            embedding_dict[w] = [vecs[w] for vecs in pretrained
                                 if w in vecs][0]
        return embedding_dict
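
A self-contained sketch of the 'union' + 'first_found' policy using plain dicts; the project's OrderedSet and zero-vector default factory are left out, and all names here are illustrative:

emb_a = {'cat': [0.1, 0.2], 'dog': [0.3, 0.4]}
emb_b = {'dog': [9.9, 9.9], 'fish': [0.5, 0.6]}
pretrained = [emb_a, emb_b]

merged = {}
for vecs in pretrained:          # union of vocabularies
    for w, v in vecs.items():
        merged.setdefault(w, v)  # first occurrence wins (first_found)

assert merged['dog'] == [0.3, 0.4]           # taken from emb_a
assert sorted(merged) == ['cat', 'dog', 'fish']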
Example #4
    def decorate_text(self_class, example, vocab, prediction=None):
        '''
    Args:
    - example: A recDotDefaultDict, one example of a flattened batch.
    Refer to WikiP2DRelExDataset.article2entries. 
    '''
        text = copy.deepcopy(example.text.flat)
        query = example.query

        # These position sets are loop-invariant, so build them once.
        query_positions = set(
            range(query.flat_position[0], query.flat_position[1] + 1))
        gold_mention_positions = set(
            flatten([list(range(begin, end + 1))
                     for begin, end in example.mentions.flat_position]))
        gold_mention_positions.discard(PAD_ID)

        for i, w in enumerate(text):
            if vocab.word.is_unk(w):
                text[i] = UNDERLINE + text[i]
            if i in query_positions:
                text[i] = MAGENTA + text[i]
            if i in gold_mention_positions:
                text[i] = BLUE + text[i]
            text[i] = text[i] + RESET
        return text  #'\n'.join([' '.join(sent) for sent in text])
Example #5
    def get_input_feed(self, batch, is_training):
        input_feed = {}

        ## Texts
        if self.encoder.cbase:
            input_feed[self.ph.text.char] = batch.text.char
        if self.encoder.wbase:
            input_feed[self.ph.text.word] = batch.text.word
        input_feed[self.ph.sentence_length] = batch.sentence_length

        ## Mention spans and their clusters
        gold_mentions = sorted(tuple(m) for m in flatten(batch.clusters))
        gold_mention_map = {m: i for i, m in enumerate(gold_mentions)}
        cluster_ids = np.zeros(len(gold_mentions))
        for cluster_id, cluster in enumerate(batch.clusters):
            for mention in cluster:
                cluster_ids[gold_mention_map[tuple(mention)]] = cluster_id
        gold_starts, gold_ends = self.tensorize_mentions(gold_mentions)

        input_feed[self.ph.gold_starts] = np.array(gold_starts)
        input_feed[self.ph.gold_ends] = np.array(gold_ends)
        input_feed[self.ph.cluster_ids] = np.array(cluster_ids)

        ## Metadata
        input_feed[self.is_training] = is_training
        input_feed[self.ph.speaker_ids] = batch.speakers
        input_feed[self.ph.genre] = batch.genre

        if (is_training and
                batch.text.word.shape[0] > self.max_training_sentences):
            return self.truncate_example(input_feed)
        else:
            return input_feed
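
A minimal, self-contained sketch of the cluster-id mapping built above, with a toy clusters list and a local flatten helper (both are illustrative assumptions):

import numpy as np

def flatten(seqs):
    # Concatenate a list of lists, as common.flatten is assumed to do.
    return [x for seq in seqs for x in seq]

clusters = [[(0, 1), (5, 5)], [(7, 9)]]      # two toy coreference clusters
gold_mentions = sorted(tuple(m) for m in flatten(clusters))
gold_mention_map = {m: i for i, m in enumerate(gold_mentions)}
cluster_ids = np.zeros(len(gold_mentions))
for cluster_id, cluster in enumerate(clusters):
    for mention in cluster:
        cluster_ids[gold_mention_map[tuple(mention)]] = cluster_id
# gold_mentions == [(0, 1), (5, 5), (7, 9)], cluster_ids == [0., 0., 1.]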
Example #6
 def preprocess(self, article):
     # Split each sentence into words; keep both nested and flattened views.
     raw_text = [s.split() for s in article.text]
     article.text = raw_text
     article.flat_text = flatten(raw_text)
     article.desc = article.desc.split()
     article.num_words = sum(len(s) for s in raw_text)
     return article
Example #7
def main(args):
    word_embs = read_embedding(args.source_emb)
    data = read_jsonlines(args.dataset_path, max_rows=0)
    tokenizer = word_tokenizer(args.lowercase, args.normalize_digits)
    words = flatten([read_text(d.text, tokenizer) for d in data])
    word_freq = sorted(Counter(words).items(), key=lambda x: -x[1])
    for word, freq in word_freq:
        if word in word_embs:
            line = [word] + word_embs[word]
            line = ' '.join([str(x) for x in line])
            print(line)
Example #8
 def _tokenizer(sent, flatten=False):
   if normalize_digits:
     sent = re.sub(_DIGIT_RE, "0", sent) 
   if lowercase:
     sent = sent.lower()
   def word2chars(word):
     return [c for c in word]
   words = sent.replace('\n', '').split()
   chars = [word2chars(w) for w in words]
   if flatten:
     chars = common.flatten(chars)
   return chars
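
A small illustration of the word-to-character split performed above, in plain Python without the project's helpers:

words = 'flatten this'.split()
chars = [[c for c in w] for w in words]
# chars == [['f', 'l', 'a', 't', 't', 'e', 'n'], ['t', 'h', 'i', 's']]
flat_chars = [c for w in chars for c in w]   # what common.flatten would return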
Example #9
def main(args):
  tokenizer = word_tokenizer(args.lowercase, args.normalize_digits,
                             separative_tokens=['-', '/'])
  data = read_jsonlines(args.descdata_path)

  word_freq = OrderedDict(
      sorted(Counter(flatten([tokenizer(d.desc) for d in data])).items(),
             key=lambda x: -x[1]))
  embedding_dict = read_pretrained_emb(word_freq, tokenizer)

  with open(args.emb_target_path, 'w') as f:
    for w, v in embedding_dict.items():
      if not v:
        continue
      line = "%s %s\n" % (w, ' '.join([str(x) for x in v]))
      f.write(line)
Example #10
def setup_tokenizer(tokenizer_type=None):
    assert tokenizer_type is None or tokenizer_type in ['corenlp', 'nltk']
    if tokenizer_type == 'corenlp':
        #from core.utils.tokenizer import connect_to_corenlp_server, run_corenlp
        corenlp = connect_to_corenlp_server(host='http://localhost', port=9000)
        tokenizer = lambda uttr: flatten(run_corenlp(uttr, corenlp))
    elif tokenizer_type == 'nltk':
        from nltk import word_tokenize
        tokenizer = word_tokenize
    else:
        tokenizer = lambda uttr: uttr.split()

    # tokenizer must return a list of words.
    return tokenizer
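
A hedged usage sketch of the default path above, assuming setup_tokenizer is importable; the CoreNLP and NLTK branches are skipped here since they need an external server or package:

tokenize = setup_tokenizer()     # falls back to whitespace splitting
assert tokenize("How to flatten a list ?") == ['How', 'to', 'flatten', 'a', 'list', '?']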
Example #11
    def preprocess(self, article):
        raw_text = [s.split() for s in article.text]
        num_words = [len(s) for s in raw_text]
        links = {}

        # Convert a list of sentences into a flattened sequence of words,
        # remapping each link span accordingly.
        for qid, link in article.link.items():
            (sent_id, (begin, end)) = link
            flatten_begin = begin + sum(num_words[:sent_id])
            flatten_end = end + sum(num_words[:sent_id])
            assert flatten_begin >= 0 and flatten_end >= 0
            links[qid] = (flatten_begin, flatten_end)
        article.link = links
        article.text = flatten(raw_text)
        article.desc = article.desc.split()
        return article
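
A self-contained sketch of the span remapping above: sentence-local (begin, end) indices are shifted by the number of words in all preceding sentences (the toy article text is illustrative):

text = [['Alice', 'was', 'born', 'in', 'London'], ['She', 'lives', 'there']]
num_words = [len(s) for s in text]           # [5, 3]

sent_id, (begin, end) = 1, (0, 0)            # 'She' in the second sentence
offset = sum(num_words[:sent_id])            # 5 words precede sentence 1
flat_begin, flat_end = begin + offset, end + offset

flat_text = [w for s in text for w in s]
assert flat_text[flat_begin:flat_end + 1] == ['She']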
Example #12
    def _tokenizer(sent, flatten=False):
        if split_quotation:
            sent = sent.replace("'", " ' ")
        if normalize_digits:
            sent = re.sub(_DIGIT_RE, "0", sent)
        if lowercase:
            sent = sent.lower()

        def word2chars(word):
            return [c for c in word]

        words = sent.replace('\n', '')
        words = word_tokenize(words) if use_nltk_tokenizer else words.split()
        chars = [word2chars(w) for w in words]
        if flatten:
            chars = common.flatten(chars)
        return chars
Example #13
File: main.py Project: jack-and-rozz/hsai
    def debug(self, model=None):
        if not model:
            model = self.create_model(self.config)
            self.output_variables_as_text(model)
            exit(1)
        batch = common.recDotDefaultDict()
        state = [common.flatten([[1, 0, 0, 0] for _ in range(160)])]
        batch.state = state
        batch.is_sente = [[1, 0] for _ in state]
        batch.current_num_cards = [[1, 1] for s in state]
        batch.is_training = False
        res = model.step(batch, 0)
        print(res)

        batches = self.dataset.get_batches(self.config.batch_size,
                                           0,
                                           is_training=True)
        for b in batches:
            b = common.flatten_recdict(b)
            for k in b:
                print(k, b[k])
            exit(1)
Example #14
 def test(self, batches, mode, logger, output_path):
     results = []
     used_batches = []
     for i, batch in enumerate(batches):
         input_feed = self.get_input_feed(batch, False)
         outputs = self.sess.run(self.predictions, input_feed)
         try:
             used_batches += flatten_batch(batch)
         except Exception as e:
             pprint(batch)
             print(e)
             exit(1)
         results.append(outputs[:, 0, :])
     results = flatten([r.tolist() for r in results])
     sys.stdout = open(output_path, 'w') if output_path else sys.stdout
     bleu = evaluate_and_print(used_batches, results, vocab=self.vocab)
     if output_path:
         sys.stderr.write(
             "Output the testing results to \'{}\' .\n".format(output_path))
     sys.stdout = sys.__stdout__
     summary_dict = {}
     summary_dict['desc/%s/BLEU' % mode] = bleu
     summary = make_summary(summary_dict)
     return bleu, summary
Example #15
 def load_data(self):
     sys.stderr.write("Loading wikiP2D dataset from \'%s\'... \n" %
                      self.source_path)
     data = read_jsonlines(self.source_path, max_rows=self.max_rows)
     data = [self.preprocess(d) for d in data]
     self.data = flatten([self.article2entries(d) for d in data])
Example #16
    def article2entries(self, article):
        def qid2entity(qid, article):
            assert qid in article.link
            s_id, (begin, end) = article.link[qid]

            # The offset is the number of words in previous sentences.
            offset = sum([len(sent) for sent in article.text[:s_id]])
            entity = recDotDefaultDict()
            # Replace entity's name with the actual representation in the article.
            entity.raw = ' '.join(article.text[s_id][begin:end + 1])
            entity.position = article.link[qid]
            entity.flat_position = (begin + offset, end + offset)
            return entity

        entry = recDotDefaultDict()
        entry.qid = article.qid

        entry.text.raw = article.text
        entry.text.flat = article.flat_text
        entry.text.word = [self.vocab.word.sent2ids(s) for s in article.text]
        entry.text.char = [self.vocab.char.sent2ids(s) for s in article.text]

        entry.query = qid2entity(article.qid, article)  # (begin, end)

        # Articles containing fewer than self.min_triples triples are
        # discarded, since they may be incorrect.
        if len(article.triples.subjective.ids) + len(
                article.triples.objective.ids) < self.min_triples:
            return []
        entry.mentions.raw = []
        entry.mentions.flat_position = []

        for t_type in ['subjective', 'objective']:
            entry.triples[t_type] = []
            entry.target[t_type] = [[
                self.vocab.rel.UNK_ID for j in range(self.max_mention_width)
            ] for i in range(article.num_words)]

            for triple_idx, triple in enumerate(
                    article.triples[t_type].ids):  # triple = [subj, rel, obj]
                is_subjective = triple[0] == article.qid
                query_qid, rel_pid, mention_qid = (
                    triple if is_subjective else list(reversed(triple)))
                # TODO: What if the same mention holds different relations with the query?
                mention = qid2entity(mention_qid, article)
                #entry.mentions[t_type].raw.append(mention.raw)
                #entry.mentions[t_type].flat_position.append(mention.flat_position)
                entry.mentions.raw.append(mention.raw)
                entry.mentions.flat_position.append(mention.flat_position)

                rel = dotDict({
                    'raw': rel_pid,
                    'name': self.vocab.rel.token2name(rel_pid)
                })

                begin, end = mention.flat_position
                if end - begin < self.max_mention_width:
                    entry.target[t_type][begin][
                        end - begin] = self.vocab.rel.token2id(rel_pid)

                triple = [entry.query, rel, mention
                          ] if is_subjective else [mention, rel, entry.query]
                entry.triples[t_type].append(triple)

        relation_freqs = Counter(flatten(entry.target.subjective))

        # TODO: For now this experiments focus only on subjective relations.
        entry.triples.objective = []
        #####################
        entry.loss_weights_by_label = [1.0 for _ in range(self.vocab.rel.size)]

        entry.num_mentions = len(entry.mentions.flat_position)
        return [entry]
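
A toy sketch of the target layout used above: for each token position, a row of max_mention_width slots holds the relation id of the mention that starts there with that width, and UNK_ID elsewhere; all values here are illustrative:

num_words, max_mention_width = 6, 3
UNK_ID, REL_ID = 0, 7
target = [[UNK_ID for _ in range(max_mention_width)] for _ in range(num_words)]

begin, end = 2, 3                            # a two-word mention
if end - begin < max_mention_width:
    target[begin][end - begin] = REL_ID      # indexed by (start, width - 1)
# target[2] == [0, 7, 0]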