Example #1
import jieba  # Chinese word segmentation

import config
from batcher import Batch, Example


def build_batch_by_article(article, vocab):
    # Segment with jieba; Example expects whitespace-tokenized text.
    words = jieba.cut(article)
    art_str = " ".join(words)
    # Repeat the example beam_size times: one slot per beam hypothesis.
    example = Example(art_str, '', vocab)
    ex_list = [example for _ in range(config.beam_size)]
    batch = Batch(ex_list, vocab, config.beam_size)
    return batch
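A hedged usage sketch: loading vocab follows the pointer-generator convention of data.Vocab(vocab_file, max_size); the path, size, and sample text below are placeholders.

from data import Vocab

vocab = Vocab("finished_files/vocab", 50000)    # placeholder path/size
article = "今天天气不错"  # raw, unsegmented Chinese text
batch = build_batch_by_article(article, vocab)  # ready for beam-search decoding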
Example #2
def test_batch():
    # Run as: python test_batcher.py --data_path=../data/squad-v1/dev_raw.json --pointer_gen
    # `hps` and `vocab` are module-level globals built from the flags.
    input_gen = text_generator(
        data.example_generator(hps.data_path, hps.single_pass))
    # Draw batch_size (paragraph, question, answer, answer-positions)
    # tuples and wrap each one in an Example.
    example_list = []
    for _ in range(hps.batch_size):
        p, q, a, ap = next(input_gen)
        example_list.append(Example(p, q, a, ap, vocab, hps))
    batch = Batch(example_list, hps, vocab)
    print('batch answer pos:', batch.ans_indices)
    print('enc batch:', batch.enc_batch)
    print('enc batch words:',
          id2sentence(batch.enc_batch, vocab, batch.para_oovs_batch))
    print('enc len:', batch.enc_lens)
    if hps.pointer_gen:
        print('max para oovs:', batch.max_para_oovs)
        print('para oovs:', batch.para_oovs_batch)
        print('enc batch extend vocab:', batch.enc_batch_extend_vocab)
        print(
            'enc batch extend vocab words:',
            id2sentence(batch.enc_batch_extend_vocab, vocab,
                        batch.para_oovs_batch))
    print('dec batch:', batch.dec_batch)
    print('dec batch words:',
          id2sentence(batch.dec_batch, vocab, batch.para_oovs_batch))
    print('target batch:', batch.target_batch)
    print('tgt batch words:',
          id2sentence(batch.target_batch, vocab, batch.para_oovs_batch))
    print('origin para:', batch.original_paragraphs)
    print('origin question:', batch.original_questions)
    print('origin answer:', batch.original_answers)
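test_batch() relies on module-level hps and vocab globals. A minimal sketch of stand-ins, assuming pointer-generator-style field names (the names and values here are assumptions, not the repo's exact definitions):

from collections import namedtuple

from data import Vocab

HParams = namedtuple(
    'HParams', ['data_path', 'single_pass', 'batch_size', 'pointer_gen'])
hps = HParams(data_path='../data/squad-v1/dev_raw.json',
              single_pass=True, batch_size=16, pointer_gen=True)
vocab = Vocab('../data/squad-v1/vocab', 50000)  # placeholder path/size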
Example #3
from stanfordcorenlp import StanfordCoreNLP

import config
from batcher import Batch, Example


def build_batch_by_article(article, vocab):
    # `path` is a module-level constant pointing at a local CoreNLP install.
    nlp = StanfordCoreNLP(path, lang='en')
    words = nlp.word_tokenize(article)
    nlp.close()  # shut down the CoreNLP client once tokenization is done
    art_str = " ".join(words)
    # Empty abstract list: only the article side is needed at decode time.
    example = Example(art_str, [""], vocab)
    ex_list = [example for _ in range(config.beam_size)]
    batch = Batch(ex_list, vocab, config.beam_size)
    return batch
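The stanfordcorenlp package wraps a local CoreNLP server; a hedged setup sketch (the install path is a placeholder):

from stanfordcorenlp import StanfordCoreNLP

path = '/opt/stanford-corenlp-full-2018-10-05'  # placeholder install dir
nlp = StanfordCoreNLP(path, lang='en')
print(nlp.word_tokenize('Beam search decodes one article at a time.'))
nlp.close()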
Example #4
def generate_summary(spacy_article, ideal_summary_length_tokens=60):
    """
    Generates a summary of the given article. Note that this is slow (~20 seconds on a single CPU).

    Args:
        spacy_article: spaCy-processed text. The model was trained on the output of
        doc.spacy_text(), so for best results the input here should also come from doc.spacy_text().

    Returns:
        Tuple of a unicode summary of the text and a scalar score of its quality. The score is
        approximately the average log-likelihood of the summary (so it is < 0) and typically falls
        in the range [-0.5, -0.2]. Summaries with scores below -0.4 are usually not very good.
    """
    assert isinstance(spacy_article, Doc)

    # These imports are slow - lazy import.
    from batcher import Batch, Example
    from beam_search import run_beam_search
    from io_processing import process_article, process_output

    if _model is None:
        _load_model()

    # Handle short inputs
    article_tokens, _, orig_article_tokens = process_article(spacy_article)
    if len(article_tokens) <= ideal_summary_length_tokens:
        return spacy_article.text, 0.

    # Integer division keeps the length bounds usable as token counts.
    min_summary_length = min(10 + len(article_tokens) // 10,
                             2 * ideal_summary_length_tokens // 3)
    max_summary_length = min(10 + len(article_tokens) // 5,
                             3 * ideal_summary_length_tokens // 2)

    # Make input data
    example = Example(' '.join(article_tokens),
                      abstract='',
                      vocab=_vocab,
                      hps=_hps)
    batch = Batch([example] * _beam_size, _hps, _vocab)

    # Generate output
    hyp, score = run_beam_search(
        _sess,
        _model,
        _vocab,
        batch,
        _beam_size,
        max_summary_length,
        min_summary_length,
        _settings.trace_path,
    )

    # Extract the output ids from the hypothesis and convert back to words
    return process_output(hyp.token_strings[1:], orig_article_tokens), score
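A hedged usage sketch based on the docstring: the input must be a spaCy Doc, produced the same way as the doc.spacy_text() output the model was trained on (the pipeline name below is a placeholder):

import spacy
from spacy.tokens import Doc

nlp = spacy.load('en_core_web_sm')  # placeholder pipeline
doc = nlp('Some long article text ...')
assert isinstance(doc, Doc)
summary, score = generate_summary(doc, ideal_summary_length_tokens=60)
if score < -0.4:  # per the docstring, scores below -0.4 flag weak summaries
    print('low-confidence summary')
print(summary, score)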
Example #5
def test_example():
    # `paragraph`, `question`, `answer`, `answer_indices`, `vocab`, and
    # `hps` are module-level fixtures.
    ex = Example(paragraph, question, answer, answer_indices, vocab, hps)
    print('enc len:', ex.enc_len)
    print('enc input:', ex.enc_input)
    print('enc input words:',
          ' '.join([vocab.id2word(i) for i in ex.enc_input]))
    print('dec len:', ex.dec_len)
    print('dec input:', ex.dec_input)
    print('dec input words:',
          ' '.join([vocab.id2word(i) for i in ex.dec_input]))
    print('dec target:', ex.target)
    print('dec target words:', ' '.join([vocab.id2word(i) for i in ex.target]))
    if hps.pointer_gen:
        print('enc input extend vocab:', ex.enc_input_extend_vocab)
        print('paragraph oov:', ex.paragraph_oovs)
    print('original paragraph:', ex.original_paragraph)
    print('original question:', ex.original_question)
    print('original answer:', ex.original_answer)
    print('ans start pos:', ex.answer_start_idx)
    print('ans end pos:', ex.answer_end_idx)
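test_example() depends on module-level fixtures; a minimal sketch of what they might look like (values are illustrative only; vocab and hps can reuse the stand-ins sketched for test_batch above):

paragraph = 'the eiffel tower is located in paris .'
question = 'where is the eiffel tower located ?'
answer = 'paris'
answer_indices = [6, 6]  # hypothetical start/end token positions of the answer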
Example #6
def output_to_batch(current_batch, results, batcher, dis_batcher):
    # Build a new generator batch and a discriminator batch from the
    # decoder results; `bd` is the discriminator-side batcher module.
    example_list = []
    db_example_list = []

    for i in range(FLAGS.batch_size):
        decoded_words_all = []
        encode_words = current_batch.original_review_inputs[i]

        for j in range(FLAGS.max_dec_sen_num):
            # ids of the j-th greedily decoded sentence for example i
            output_ids = [int(t) for t in results['Greedy_outputs'][i][j]]
            decoded_words = data.outputids2words(output_ids, batcher._vocab,
                                                 None)
            # Trim at the first [STOP] token, if present.
            try:
                fst_stop_idx = decoded_words.index(data.STOP_DECODING)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                pass
            if len(decoded_words) < 2:
                continue
            # Skip sentences that mostly repeat the previous one.
            if decoded_words_all:
                new_set1 = set(decoded_words_all[-1].split())
                new_set2 = set(decoded_words)
                if len(new_set1 & new_set2) > 0.5 * len(new_set2):
                    continue
            # End each sentence with terminal punctuation.
            if decoded_words[-1] not in ('.', '!', '?'):
                decoded_words.append('.')
            decoded_output = ' '.join(decoded_words).strip()
            decoded_words_all.append(decoded_output)

        decoded_words_all = ' '.join(decoded_words_all).strip()
        # Trim everything after the document-level stop token.
        try:
            fst_stop_idx = decoded_words_all.index(data.STOP_DECODING_DOCUMENT)
            decoded_words_all = decoded_words_all[:fst_stop_idx]
        except ValueError:
            pass
        # Strip [UNK] tokens and runs of repeated terminal punctuation.
        decoded_words_all = decoded_words_all.replace("[UNK] ", "")
        decoded_words_all = decoded_words_all.replace("[UNK]", "")
        decoded_words_all, _ = re.subn(r"(! ){2,}", "", decoded_words_all)
        decoded_words_all, _ = re.subn(r"(\. ){2,}", "", decoded_words_all)

        if decoded_words_all.strip() == "":
            # Empty generation: fall back to the gold output with a small
            # negative discriminator label.
            new_dis_example = bd.Example(
                current_batch.original_review_output[i], -0.0001,
                dis_batcher._vocab, dis_batcher._hps)
            new_example = Example(current_batch.original_review_output[i],
                                  batcher._vocab, batcher._hps, encode_words)
        else:
            new_dis_example = bd.Example(decoded_words_all, 1,
                                         dis_batcher._vocab, dis_batcher._hps)
            new_example = Example(decoded_words_all, batcher._vocab,
                                  batcher._hps, encode_words)
        example_list.append(new_example)
        db_example_list.append(new_dis_example)
    return (Batch(example_list, batcher._hps, batcher._vocab),
            bd.Batch(db_example_list, dis_batcher._hps, dis_batcher._vocab))
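A hedged driver sketch: results must carry a 'Greedy_outputs' entry indexed per example, as consumed above (model.run_greedy_outputs is an assumed helper name, not the repo's API):

results = model.run_greedy_outputs(sess, current_batch)  # hypothetical helper
gen_batch, dis_batch = output_to_batch(current_batch, results,
                                       batcher, dis_batcher)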
Example #7
def article_to_batch(self, article):
    # Decode-time batch: one article, empty abstract, repeated batch_size times.
    abstract_sentences = ''
    example = Example(article, abstract_sentences, self.vocab,
                      self.hps)  # Process into an Example.
    repeated_example = [example for _ in range(self.hps.batch_size)]
    return Batch(repeated_example, self.hps, self.vocab)