Example #1
    def fill_example_queue(self):
        """Reads data from file and processes into Examples which are then placed into the example queue."""

        input_gen = self.text_generator(
            data.example_generator(self._data_path, self._single_pass))

        while True:
            try:
                (article, abstract) = next(input_gen)  # read the next example from file. article and abstract are both strings.
            except StopIteration:  # if there are no more examples:
                tf.logging.info(
                    "The example generator for this example queue filling thread has exhausted data."
                )
                if self._single_pass:
                    tf.logging.info(
                        "single_pass mode is on, so we've finished reading dataset. This thread is stopping."
                    )
                    self._finished_reading = True
                    break
                else:
                    raise Exception(
                        "single_pass mode is off but the example generator is out of data; error."
                    )

            abstract_sentences = [
                sent.strip() for sent in data.abstract2sents(abstract)
            ]  # Use the <s> and </s> tags in abstract to get a list of sentences.
            example = Example(article, abstract_sentences, self._vocab,
                              self._hps)  # Process into an Example.
            self._example_queue.put(
                example)  # place the Example in the example queue.
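All of these examples rely on the pointer-generator data convention, where each abstract stores its sentences between <s> and </s> tags. For reference, a minimal sketch of what data.abstract2sents does, modeled on abisee's pointer-generator data.py (illustrative, not copied from the original):

SENTENCE_START = '<s>'
SENTENCE_END = '</s>'

def abstract2sents(abstract):
    """Split abstract text into a list of sentences using <s>/</s> tags."""
    cur = 0
    sents = []
    while True:
        try:
            start_p = abstract.index(SENTENCE_START, cur)
            end_p = abstract.index(SENTENCE_END, start_p + 1)
            cur = end_p + len(SENTENCE_END)
            sents.append(abstract[start_p + len(SENTENCE_START):end_p])
        except ValueError:  # no more <s>...</s> spans
            return sents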
Example #2
    def fill_example_queue(self):
        input_gen = self.text_generator(
            data.example_generator(self._data_path, self._single_pass))

        while True:
            try:
                (article, abstract) = next(input_gen)

            except StopIteration:
                tf.logging.info(
                    "The example generator for this example queue filling thread has exhausted data."
                )
                if self._single_pass:
                    tf.logging.info(
                        "single_pass mode is on, so we've finished reading dataset. This thread is stopping."
                    )
                    self._finished_reading = True
                    break
                else:
                    raise Exception(
                        "single_pass mode is off but the example generator is out of data; error."
                    )

            abstract_sentences = [
                sent.strip() for sent in data.abstract2sents(abstract)
            ]
            article = [sent.strip() for sent in data.article2sents(article)]
            example = Example(article, abstract_sentences, self._vocab,
                              self._concept_vocab)
            self._example_queue.put(example)
Example #3
def main(unused_argv):

    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    source_dir = os.path.join(data_dir, FLAGS.dataset)
    source_files = sorted(glob.glob(source_dir + '/*'))

    for i in range(4):
        ref_dir = os.path.join(log_dir, 'reference_' + str(i), 'reference')
        dec_dir = os.path.join(log_dir, 'reference_' + str(i), 'decoded')
        util.create_dirs(ref_dir)
        util.create_dirs(dec_dir)
        for source_idx, source_file in enumerate(source_files):
            human_summary_texts = get_human_summary_texts(source_file)
            summaries = []
            for summary_text in human_summary_texts:
                summary = data.abstract2sents(summary_text)
                summaries.append(summary)
            candidate = summaries[i]
            references = [
                summaries[idx] for idx in range(len(summaries)) if idx != i
            ]
            rouge_functions.write_for_rouge(references, candidate, source_idx,
                                            ref_dir, dec_dir)

        results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir)
        # print("Results_dict: ", results_dict)
        rouge_functions.rouge_log(results_dict,
                                  os.path.join(log_dir, 'reference_' + str(i)))
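Example #3 scores each of the four human summaries against the remaining three (leave-one-out ROUGE). The candidate/reference split for i = 1, for instance, works out as follows (values are illustrative):

summaries = ['summary_A', 'summary_B', 'summary_C', 'summary_D']
i = 1
candidate = summaries[i]  # 'summary_B'
references = [summaries[idx] for idx in range(len(summaries)) if idx != i]
assert references == ['summary_A', 'summary_C', 'summary_D']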
Example #4
    def fill_example_queue(self):
        # create a generator object
        input_gen = self.text_generator(
            data.example_generator(self._data_path, self._single_pass))

        while True:
            try:
                (article, abstract) = next(input_gen)  # read the next example from file. article and abstract are both strings.
                article, abstract = article.decode(), abstract.decode()
            except StopIteration:  # if there are no more examples:
                tf.logging.info(
                    "The example generator for this example queue filling thread has exhausted data."
                )
                if self._single_pass:
                    tf.logging.info(
                        "single_pass mode is on, so we've finished reading dataset. This thread is stopping."
                    )
                    self._finished_reading = True
                    break
                else:
                    raise Exception(
                        "single_pass mode is off but the example generator is out of data; error."
                    )

            abstract_sentences = [
                sent.strip() for sent in data.abstract2sents(abstract)
            ]  # split the abstract into sentences
            #print("abstract_sentences:", abstract_sentences)
            example = Example(article, abstract_sentences,
                              self._vocab)  # process into an Example.
            self._example_queue.put(example)  # place the Example object in the example queue.
Example #5
    def fill_example_queue(self):
        input_gen = self.text_generator(
            data.example_generator(self._data_path, self._single_pass))

        while True:
            try:
                # read the next example from file. article and abstract are both strings.
                (article, abstract) = next(input_gen)
            except RuntimeError:  # if there are no more examples:
                logger.info(
                    "The example generator for this example queue filling thread has exhausted data."
                )
                if self._single_pass:
                    logger.info(
                        "single_pass mode is on, so we've finished reading dataset. This thread is stopping."
                    )
                    self._finished_reading = True
                    break
                else:
                    raise Exception(
                        "single_pass mode is off but the example generator is out of data; error."
                    )

            # Use the <s> and </s> tags in abstract to get a list of sentences.
            abstract_sentences = [
                sent.strip() for sent in data.abstract2sents(abstract)
            ]
            example = Example(article, abstract_sentences,
                              self._vocab)  # Process into an Example.
            self._example_queue.put(
                example)  # place the Example in the example queue.
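Unlike the other examples, Example #5 catches RuntimeError rather than StopIteration. This is consistent with PEP 479 (Python 3.7+), under which a StopIteration that escapes inside a generator such as text_generator is converted into a RuntimeError. A small self-contained demonstration (names here are illustrative, not from the original code):

def wrapper(gen):
    while True:
        yield next(gen)  # when gen is exhausted, StopIteration escapes here

g = wrapper(iter([('article', '<s> abstract . </s>')]))
print(next(g))  # ('article', '<s> abstract . </s>')
try:
    next(g)
except RuntimeError as e:  # PEP 479: StopIteration inside a generator
    print('exhausted:', e)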
Example #6
def get_summary_from_example(e):
    summary_texts = []
    for abstract in e.features.feature['abstract'].bytes_list.value:
        summary_texts.append(abstract)  # the abstract texts were saved under the key 'abstract' in the data files
    all_abstract_sentences = [[sent.strip() for sent in data.abstract2sents(
        abstract)] for abstract in summary_texts]
    summary_text = '\n'.join(all_abstract_sentences[0])
    return summary_text
Example #7
    def fill_example_queue(self):
        """Reads data from file and processes into Examples which are then placed into the example queue."""
        input_gen = self.text_generator(data.example_generator(self._data_path, self._single_pass))
        self._example_queue = []
        for item in input_gen:
            article, abstract = str(item[0]), str(item[1])
            abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)]
            example = Example(article, abstract_sentences, self._vocab, self._hps)
            if example.flag == False:
                self._example_queue.append(example)
Example #8
def get_decode_data(hps, vocab, data_path, randomize=False):
    tf.logging.info('Fetching data..')
    filelist = glob.glob(data_path)
    inputs = []
    total_examples = 0
    total_batches = 0
    for f in filelist:
        reader = open(f, 'rb')
        while True:
            len_bytes = reader.read(8)
            if not len_bytes: break
            str_len = struct.unpack('q', len_bytes)[0]
            example_str = struct.unpack('%ds' % str_len,
                                        reader.read(str_len))[0]
            e = example_pb2.Example.FromString(example_str)
            try:
                article_text = e.features.feature['article'].bytes_list.value[
                    0].decode()
                if len(article_text) == 0:
                    #tf.logging.warning('Found an example with empty article text. Skipping it.')
                    pass
                else:
                    abstract_text = e.features.feature[
                        'abstract'].bytes_list.value[0].decode()
                    abstract_sentences = [
                        sent.strip()
                        for sent in data.abstract2sents(abstract_text)
                    ]
                    example = Example(article_text, abstract_sentences, vocab,
                                      hps)
                    inputs.append(example)
                    total_examples = total_examples + 1
            except ValueError:
                #tf.logging.error('Failed to get article or abstract from example')
                continue
    batches = []
    tf.logging.info('Creating batches..')
    if randomize:
        random.shuffle(inputs)
        example = inputs[0]
        b = [example for _ in range(hps.beam_size)]
        batches.append(Batch(b, hps, vocab))
        total_batches = 1
        total_examples = 1
    else:
        for i in range(0, len(inputs)):
            b = [inputs[i] for _ in range(hps.beam_size)]
            batches.append(Batch(b, hps, vocab))
            total_batches = total_batches + 1

    tf.logging.info('[TOTAL Batches]  : %i', total_batches)
    tf.logging.info('[TOTAL Examples] : %i', total_examples)
    tf.logging.info('Creating batches..COMPLETE')
    return batches
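Examples #8 and #10 read the chunked .bin format used by the CNN/DailyMail preprocessing scripts: each record is an 8-byte length prefix followed by a serialized tf.Example. A sketch of the matching writer side, mirroring the struct.unpack('q', ...) reader loop above (the helper name is illustrative):

import struct
from tensorflow.core.example import example_pb2

def write_bin_record(writer, article, abstract):
    # Build a tf.Example with 'article' and 'abstract' features, then write
    # its length as an 8-byte integer ('q') followed by the serialized bytes,
    # so the reader loops above can unpack it.
    tf_example = example_pb2.Example()
    tf_example.features.feature['article'].bytes_list.value.extend([article.encode()])
    tf_example.features.feature['abstract'].bytes_list.value.extend([abstract.encode()])
    tf_example_str = tf_example.SerializeToString()
    str_len = len(tf_example_str)
    writer.write(struct.pack('q', str_len))
    writer.write(struct.pack('%ds' % str_len, tf_example_str))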
Example #9
    def fill_example_queue(self):
        input_gen = self.text_generator(data.example_generator(self._data_path, self._single_pass))
        while True:
            try:
                (article, abstract) = next(input_gen) 
            except StopIteration: # if there are no more examples:
                tf.logging.info("The example generator for this example queue filling thread has exhausted data.")
                self._finished_reading = True
                break

            abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)] 
            example = Example(article, abstract_sentences, self._vocab, self._hps) 
            self._example_queue.put(example) 
Example #10
def get_specific_example(hps, vocab, example_number):

    file_id, number = divmod(example_number, 1000)
    path = '/home/ubuntu/W266/final_0/W266_Final/data/final_chunked/validation_%03d.bin' % file_id
    print(f'Fetching example {number} from: {path}')
    filelist = glob.glob(path)
    inputs = []
    total_examples = 0
    total_batches = 0
    for f in filelist:
        reader = open(f, 'rb')
        while True:
            len_bytes = reader.read(8)
            if not len_bytes: break
            str_len = struct.unpack('q', len_bytes)[0]
            example_str = struct.unpack('%ds' % str_len,
                                        reader.read(str_len))[0]
            e = example_pb2.Example.FromString(example_str)
            try:
                article_text = e.features.feature['article'].bytes_list.value[
                    0].decode()
                if len(article_text) == 0:
                    #tf.logging.warning('Found an example with empty article text. Skipping it.')
                    pass
                else:
                    abstract_text = e.features.feature[
                        'abstract'].bytes_list.value[0].decode()
                    abstract_sentences = [
                        sent.strip()
                        for sent in data.abstract2sents(abstract_text)
                    ]
                    example = Example(article_text, abstract_sentences, vocab,
                                      hps)
                    inputs.append(example)
                    total_examples = total_examples + 1
            except ValueError:
                #tf.logging.error('Failed to get article or abstract from example')
                continue
    batches = []
    tf.logging.info('Creating batches..')
    example = inputs[number]
    b = [example for _ in range(hps.beam_size)]
    batches.append(Batch(b, hps, vocab))
    total_batches = 1
    total_examples = 1

    tf.logging.info('[TOTAL Batches]  : %i', total_batches)
    tf.logging.info('[TOTAL Examples] : %i', total_examples)
    tf.logging.info('Creating batches..COMPLETE')
    return batches
Example #11
    def fill_example_queue(self):
        input_gen = self.text_generator()

        while True:
            try:
                (content, query, summary) = next(input_gen)  # read the next example from file. content, query, and summary are all strings.
            except StopIteration:  # if there are no more examples:
                tf.logging.info(
                    "The example generator for this example queue filling thread has exhausted data."
                )
                if self._single_pass:
                    tf.logging.info(
                        "single_pass mode is on, so we've finished reading dataset. This thread is stopping."
                    )
                    self._finished_reading = True
                    break
                else:
                    raise Exception(
                        "single_pass mode is off but the example generator is out of data; error."
                    )

            content_sentence = [
                sent.strip() for sent in data.abstract2sents(content)
            ]  # Use the <s> and </s> tags in abstract to get a list of sentences.
            query_sentence = [
                sent.strip() for sent in data.abstract2sents(query)
            ]
            summary_sentence = [
                sent.strip() for sent in data.abstract2sents(summary)
            ]

            example = Example(content_sentence, query_sentence,
                              summary_sentence,
                              self._vocab)  # Process into an Example.
            self._example_queue.put(
                example)  # place the Example in the example queue.
Example #12
    def example_generator(self):
        # generator over (article, abstract) pairs
        while True:
            try:
                (article, abstract) = next(self.text_gen)
            except StopIteration:
                print("example generator exhausted")
                break
            # split the abstract into sentences
            abstract_sentences = [
                sent.strip() for sent in data.abstract2sents(abstract)
            ]
            # process into an Example
            example = Example(article, abstract_sentences[0], self._vocab)
            # yield the Example object
            yield example
Example #13
  def fill_example_queue(self):
    input_gen = self.text_generator(data.example_generator(self._data_path, self._single_pass))

    while True:
      try:
        (article, abstract) = next(input_gen) # read the next example from file. article and abstract are both strings.
      except StopIteration: # if there are no more examples:
        tf.logging.info("The example generator for this example queue filling thread has exhausted data.")
        if self._single_pass:
          tf.logging.info("single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
          self._finished_reading = True
          break
        else:
          raise Exception("single_pass mode is off but the example generator is out of data; error.")

      abstract_sentences = [sent.strip() for sent in data.abstract2sents(abstract)] # Use the <s> and </s> tags in abstract to get a list of sentences.
      example = Example(article, abstract_sentences, self._vocab) # Process into an Example.
      self._example_queue.put(example) # place the Example in the example queue.
Example #14
    def fill_example_queue(self):
        """Reads data from file and processes into Examples which are then placed into the example queue."""

        input_gen = self.text_generator(
            data.example_generator(self._data_path, self._single_pass,
                                   self._cnn_500_dm_500))
        # counter = 0
        while True:
            try:
                (article, abstracts, doc_indices_str, raw_article_sents) = next(input_gen)  # read the next example from file
            except StopIteration:  # if there are no more examples:
                logging.info(
                    "The example generator for this example queue filling thread has exhausted data."
                )
                if self._single_pass:
                    logging.info(
                        "single_pass mode is on, so we've finished reading dataset. This thread is stopping."
                    )
                    self._finished_reading = True
                    break
                else:
                    raise Exception(
                        "single_pass mode is off but the example generator is out of data; error."
                    )

            all_abstract_sentences = [[
                sent.strip() for sent in data.abstract2sents(abstract)
            ] for abstract in abstracts]
            if len(all_abstract_sentences) != 0:
                abstract_sentences = all_abstract_sentences[0]
            else:
                abstract_sentences = []
            doc_indices = [int(idx) for idx in doc_indices_str.strip().split()]
            example = Example(article, abstract_sentences,
                              all_abstract_sentences, doc_indices,
                              raw_article_sents, self._vocab,
                              self._hps)  # Process into an Example.
            self._example_queue.put(
                example)  # place the Example in the example queue.
Example #15
    def fill_example_queue(self):
        input_gen = self.text_generator(
            data.example_generator(self._data_path, self._single_pass))
        f = open("inputs.txt", "a")
        while True:
            try:
                (source1, source2, target) = next(input_gen)  # read the next example from file. source1, source2, and target are all strings.
                f.write(source1 + "\t" + source2 + "\t" + target + "\n")
            except StopIteration:  # if there are no more examples:
                tf.logging.info(
                    "The example generator for this example queue filling thread has exhausted data."
                )
                if self._single_pass:
                    tf.logging.info(
                        "single_pass mode is on, so we've finished reading dataset. This thread is stopping."
                    )
                    self._finished_reading = True
                    break
                else:
                    raise Exception(
                        "single_pass mode is off but the example generator is out of data; error."
                    )

            abstract_sentences = [
                sent.strip() for sent in data.abstract2sents(target)
            ]  # Use the <s> and </s> tags in abstract to get a list of sentences.
            #example = Example(article, abstract_sentences, self._vocab) # Process into an Example.
            #example = Example2(article, ' '.join(abstract_sentences), abstract_sentences, self._vocab)
            #example = Example2(' '.join(abstract_sentences), article, abstract_sentences, self._vocab)
            #example = Example2(article, article, abstract_sentences, self._vocab)
            #example = Example2(' '.join(abstract_sentences), ' '.join(abstract_sentences), abstract_sentences, self._vocab)
            example = Example2(source1, source2, target.split(), self._vocab)
            self._example_queue.put(
                example)  # place the Example in the example queue.
        f.close()
Example #16
def main(unused_argv):

    print('Running statistics on %s' % FLAGS.dataset_name)

    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.summarizer == 'all':
        summary_methods = list(summarizers.keys())
    else:
        summary_methods = [FLAGS.summarizer]
    if FLAGS.dataset_name == 'all':
        dataset_names = datasets
    else:
        dataset_names = [FLAGS.dataset_name]

    sheets_strs = []
    for summary_method in summary_methods:
        summary_fn = summarizers[summary_method]
        for dataset_name in dataset_names:
            FLAGS.dataset_name = dataset_name

            if 'xsum' in dataset_name:
                original_dataset_name = 'xsum'
            elif 'cnn_dm' in dataset_name or 'duc_2004' in dataset_name:
                original_dataset_name = 'cnn_dm'
            else:
                original_dataset_name = ''
            vocab = Vocab('logs/vocab' + '_' + original_dataset_name,
                          50000)  # create a vocabulary

            source_dir = os.path.join(data_dir, dataset_name)
            source_files = sorted(
                glob.glob(source_dir + '/' + FLAGS.dataset_split + '*'))

            total = len(source_files) * 1000 if (
                'cnn' in dataset_name or 'newsroom' in dataset_name
                or 'xsum' in dataset_name) else len(source_files)
            example_generator = data.example_generator(
                source_dir + '/' + FLAGS.dataset_split + '*',
                True,
                False,
                should_check_valid=False)

            if dataset_name == 'duc_2004':
                abs_source_dir = os.path.join(
                    os.path.expanduser('~') + '/data/tf_data/with_coref',
                    dataset_name)
                abs_example_generator = data.example_generator(
                    abs_source_dir + '/' + FLAGS.dataset_split + '*',
                    True,
                    False,
                    should_check_valid=False)
                abs_names_to_types = [('abstract', 'string_list')]

            triplet_ssi_list = []
            for example_idx, example in enumerate(
                    tqdm(example_generator, total=total)):
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example(
                    example, names_to_types)
                if dataset_name == 'duc_2004':
                    abs_example = next(abs_example_generator)
                    groundtruth_summary_texts = util.unpack_tf_example(
                        abs_example, abs_names_to_types)
                    groundtruth_summary_texts = groundtruth_summary_texts[0]
                    groundtruth_summ_sents_list = [[
                        sent.strip() for sent in data.abstract2sents(abstract)
                    ] for abstract in groundtruth_summary_texts]

                else:
                    groundtruth_summary_texts = [groundtruth_summary_text]
                    groundtruth_summ_sents_list = []
                    for groundtruth_summary_text in groundtruth_summary_texts:
                        groundtruth_summ_sents = [
                            sent.strip() for sent in
                            groundtruth_summary_text.strip().split('\n')
                        ]
                        groundtruth_summ_sents_list.append(
                            groundtruth_summ_sents)
                article_sent_tokens = [
                    util.process_sent(sent) for sent in raw_article_sents
                ]
                if doc_indices is None:
                    doc_indices = [0] * len(
                        util.flatten_list_of_lists(article_sent_tokens))
                doc_indices = [int(doc_idx) for doc_idx in doc_indices]
                groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
                    groundtruth_similar_source_indices_list,
                    FLAGS.sentence_limit)

                log_dir = os.path.join(log_root,
                                       dataset_name + '_' + summary_method)
                dec_dir = os.path.join(log_dir, 'decoded')
                ref_dir = os.path.join(log_dir, 'reference')
                util.create_dirs(dec_dir)
                util.create_dirs(ref_dir)

                parser = PlaintextParser.from_string(
                    ' '.join(raw_article_sents), Tokenizer("english"))
                summarizer = summary_fn()

                summary = summarizer(
                    parser.document,
                    5)  #Summarize the document with 5 sentences
                summary = [str(sentence) for sentence in summary]

                summary_tokenized = []
                for sent in summary:
                    summary_tokenized.append(sent.lower())

                rouge_functions.write_for_rouge(groundtruth_summ_sents_list,
                                                summary_tokenized,
                                                example_idx,
                                                ref_dir,
                                                dec_dir,
                                                log=False)

                decoded_sent_tokens = [
                    sent.split() for sent in summary_tokenized
                ]
                sentence_limit = 2
                sys_ssi_list, _, _ = get_simple_source_indices_list(
                    decoded_sent_tokens, article_sent_tokens, vocab,
                    sentence_limit, min_matched_tokens)
                triplet_ssi_list.append(
                    (groundtruth_similar_source_indices_list, sys_ssi_list,
                     -1))

            print('Evaluating Lambdamart model F1 score...')
            suffix = util.all_sent_selection_eval(triplet_ssi_list)
            print(suffix)

            results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir)
            print(("Results_dict: ", results_dict))
            sheets_str = rouge_functions.rouge_log(results_dict,
                                                   log_dir,
                                                   suffix=suffix)
            sheets_strs.append(dataset_name + '_' + summary_method + '\n' +
                               sheets_str)

    for sheets_str in sheets_strs:
        print(sheets_str + '\n')
Example #17
    def fill_example_queue(self):
        """Reads data from file and processes into Examples which are then placed into the example queue."""
        def irrelevant_perturbation(sentences):
            summary = ' '.join(sentences) + '\n'
            irrelevant_summary = self.irrelevant_dict[summary]
            return irrelevant_summary.split()

        def syntax_perturbation(sentences):
            summary = ' '.join(sentences)
            summary = summary.split()
            original_summary = deepcopy(summary)
            sentence_len = len(summary)
            done = False
            pos1 = 0
            pos2 = -1
            while not done:
                pos1 += 1
                pos2 -= 1
                summary[pos1] = original_summary[pos2]
                summary[pos2] = original_summary[pos1]
                done = True
                if summary == original_summary:
                    done = False
            return summary

        def semantic_perturbation(sentences):
            summary = ' '.join(sentences)
            summary = summary.split()
            original_summary = deepcopy(summary)
            try:
                tokenized_text = word_tokenize(' '.join(summary))
            except:
                return sentences
            pos_tag = nltk.pos_tag(tokenized_text)
            change = 0
            for pi in range(len(pos_tag)):
                antonym = ''
                try:
                    for syn in wordnet.synsets(pos_tag[pi][0]):
                        for l in syn.lemmas():
                            if l.antonyms():
                                antonym = l.antonyms()[0].name(
                                )  # get the first antonym of the first lemma
                                break
                        if antonym != '':
                            if change < 2:
                                tokenized_text[pi] = antonym
                                change += 1
                                break
                except:
                    tokenized_text[pi] = '[UNK]'

            if summary == original_summary:
                change = 0
                for k in range(len(summary)):
                    try:
                        summary[k] = semantic_change_simple[summary[k]]
                        change += 1
                    except:
                        pass
                    if change >= 2:
                        break

                summary = tokenized_text

            return summary

        def grammar_perturbation(sentences):
            summary = ' '.join(sentences)
            summary = summary.split()
            original_summary = deepcopy(summary)
            change = 0
            for k in range(len(summary)):
                try:
                    summary[k] = grammar_tweek_negation[summary[k]]
                    change += 1
                except:
                    pass
                if change >= 2:
                    break

            if summary == original_summary:
                change = 0
                for k in range(len(summary)):
                    try:
                        summary[k] = grammar_tweek_custom[summary[k]]
                        change += 1
                    except:
                        pass
                    if change >= 2:
                        break

            if summary == original_summary:
                change = 0
                for word in original_summary:
                    new_word = singularize(word)
                    if change >= 2:
                        summary.append(word)
                    else:
                        summary.append(new_word)
                        if new_word != word:
                            change += 1

            return summary

        input_gen = self.text_generator(
            data.example_generator(self._data_path, self._single_pass))

        while True:
            try:
                (article, abstract) = next(input_gen)  # read the next example from file. article and abstract are both strings.
            except StopIteration:  # if there are no more examples:
                tf.logging.info(
                    "The example generator for this example queue filling thread has exhausted data."
                )
                if self._single_pass:
                    tf.logging.info(
                        "single_pass mode is on, so we've finished reading dataset. This thread is stopping."
                    )
                    self._finished_reading = True
                    break
                else:
                    raise Exception(
                        "single_pass mode is off but the example generator is out of data; error."
                    )

            abstract_sentences = [
                sent.strip() for sent in data.abstract2sents(abstract)
            ]  # Use the <s> and </s> tags in abstract to get a list of sentences.
            # perturbation
            abstract_sentences = grammar_perturbation(abstract_sentences)

            # if lead3
            #abstract_sentences = (article.split('.')[0] + '. ' +  article.split('.')[1] + '. ' +  article.split('.')[2]+ '.').split()

            example = Example(article, abstract_sentences, self._vocab,
                              self._hps)  # Process into an Example.
            self._example_queue.put(
                example)  # place the Example in the example queue.
Example #18
    def fill_example_queue(self):
        """Reads data from file and processes into Examples which are then placed into the example queue."""

        if self._example_generator is None:
            input_gen = self.text_generator(
                data.example_generator(self._data_path,
                                       self._single_pass,
                                       self._cnn_500_dm_500,
                                       is_original=('with_coref'
                                                    not in self._data_path)))
        else:
            input_gen = self.text_generator(self._example_generator)
        if self._hps.pg_mmr and self._hps.ssi_data_path != '':  # if use pg_mmr and bert
            print(util.bcolors.OKGREEN + "Loading SSI from BERT at %s" %
                  os.path.join(self._hps.ssi_data_path, 'ssi.pkl') +
                  util.bcolors.ENDC)
            with open(os.path.join(self._hps.ssi_data_path, 'ssi.pkl'), 'rb') as f:
                ssi_triple_list = pickle.load(f)
                # ssi_list = [ssi_triple[1] for ssi_triple in ssi_triple_list]
        else:
            ssi_triple_list = None
        counter = 0
        while True:
            try:
                (article, abstracts, doc_indices_str, raw_article_sents,
                 ssi, article_lcs_paths_list) = next(input_gen)  # read the next example from file
            except StopIteration:  # if there are no more examples:
                logging.info(
                    "The example generator for this example queue filling thread has exhausted data."
                )
                if self._single_pass:
                    logging.info(
                        "single_pass mode is on, so we've finished reading dataset. This thread is stopping."
                    )
                    self._finished_reading = True
                    if ssi_triple_list is not None and counter < len(
                            ssi_triple_list):
                        raise Exception(
                            'Len of ssi list (%d) is greater than number of examples (%d)'
                            % (len(ssi_triple_list), counter))
                    break
                else:
                    raise Exception(
                        "single_pass mode is off but the example generator is out of data; error."
                    )
            if ssi_triple_list is not None:
                if counter >= len(ssi_triple_list):
                    raise Exception(
                        'Len of ssi list (%d) is less than number of examples (>=%d)'
                        % (len(ssi_triple_list), counter))
                ssi_length_extractive = ssi_triple_list[counter][2]
                ssi = ssi_triple_list[counter][1]
                ssi = ssi[:ssi_length_extractive]

            all_abstract_sentences = [[
                sent.strip() for sent in data.abstract2sents(abstract)
            ] for abstract in abstracts]
            if len(all_abstract_sentences) != 0:
                abstract_sentences = all_abstract_sentences[0]
            else:
                abstract_sentences = []
            doc_indices = [int(idx) for idx in doc_indices_str.strip().split()]
            # join_separator = ' [SEP] ' if self._hps.sep else ' '
            if self._hps.by_instance:  # if we are running iteratively on only instances (a singleton/pair + a summary sentence), not the whole article
                for abs_idx, abstract_sentence in enumerate(
                        abstract_sentences):
                    inst_ssi = ssi[abs_idx]
                    if len(inst_ssi) == 0:
                        continue
                    inst_abstract_sentences = abstract_sentence
                    inst_raw_article_sents = util.reorder(
                        raw_article_sents, inst_ssi)
                    inst_article = ' '.join([
                        ' '.join(util.process_sent(sent, whitespace=True))
                        for sent in inst_raw_article_sents
                    ])
                    inst_doc_indices = [0] * len(inst_article.split())
                    inst_article_lcs_paths_list = article_lcs_paths_list[
                        abs_idx]

                    if len(inst_article) == 0:  # See https://github.com/abisee/pointer-generator/issues/1
                        logging.warning(
                            'Found an example with empty article text. Skipping it.\n*********************************************'
                        )
                    elif len(inst_article.strip().split()) < 3 and self._hps.skip_with_less_than_3:
                        print(
                            'Article has less than 3 tokens, so skipping\n*********************************************'
                        )
                    elif len(inst_abstract_sentences.strip().split()) < 3 and self._hps.skip_with_less_than_3:
                        print(
                            'Abstract has less than 3 tokens, so skipping\n*********************************************'
                        )
                    else:
                        inst_example = Example(None, [inst_abstract_sentences],
                                               all_abstract_sentences, None,
                                               inst_raw_article_sents, None,
                                               [inst_article_lcs_paths_list],
                                               self._vocab, self._hps)
                        self._example_queue.put(inst_example)
            else:
                example = Example(None, abstract_sentences,
                                  all_abstract_sentences, None,
                                  raw_article_sents, ssi,
                                  article_lcs_paths_list, self._vocab,
                                  self._hps)  # Process into an Example.
                self._example_queue.put(
                    example)  # place the Example in the example queue.

            # print "example num", counter
            counter += 1
Example #19
def bin2txt(data_path, finished_dir):
    import glob
    import json
    import struct
    import nltk
    import data
    from tensorflow.core.example import example_pb2
    from collections import OrderedDict

    def example_generator(file_path):
        with open(file_path, 'rb') as reader:
            while True:
                len_bytes = reader.read(8)
                if not len_bytes:
                    break  # finished reading this file
                str_len = struct.unpack('q', len_bytes)[0]
                example_str = struct.unpack('%ds' % str_len,
                                            reader.read(str_len))[0]
                yield example_pb2.Example.FromString(example_str)

    def text_generator(example_generator):
        while True:
            e = next(example_generator)  # e is a tf.Example
            try:
                article_text = e.features.feature['article'].bytes_list.value[
                    0]  # the article text was saved under the key 'article' in the data files
                abstract_text = e.features.feature['abstract'].bytes_list.value[
                    0]  # the abstract text was saved under the key 'abstract' in the data files
            except ValueError:
                tf.logging.error(
                    'Failed to get article or abstract from example')
                continue
            if len(article_text) == 0:  # See https://github.com/abisee/pointer-generator/issues/1
                tf.logging.warning(
                    'Found an example with empty article text. Skipping it.')
            else:
                yield (article_text, abstract_text)

    counter = 0
    filelist = glob.glob(data_path)  # get the list of datafiles
    assert filelist, 'Error: Empty filelist at %s' % data_path  # check filelist isn't empty
    filelist = sorted(filelist)
    for f in filelist:
        input_gen = text_generator(example_generator(f))
        with open(
                finished_dir + '/' + f.split('/')[-1].replace('.bin', '.txt'),
                'w') as writer:
            while True:
                try:
                    (article, abstract) = next(input_gen)  # read the next example from file. article and abstract are both strings.
                    abstract_sentences = [
                        sent.strip() for sent in data.abstract2sents(abstract)
                    ]  # Use the <s> and </s> tags in abstract to get a list of sentences.
                    abstract = ' '.join(abstract_sentences)
                    abstract_sentences = [
                        ' '.join(nltk.word_tokenize(sent))
                        for sent in nltk.sent_tokenize(abstract)
                    ]

                    json_format = json.dumps(
                        OrderedDict([('uuid', 'uuid-%i' % counter),
                                     ('article', article), ('summary', ''),
                                     ('reference', abstract)]))
                    counter += 1
                    writer.write(json_format)
                    writer.write('\n')
                except StopIteration:  # if there are no more examples:
                    tf.logging.info(
                        "The example generator for this example queue filling thread has exhausted data."
                    )
                    break
                except UnicodeDecodeError:
                    continue
        print "finished " + f
Example #20
    def fill_example_queue(self):
        """Reads data from file and processes into Examples which are then placed into the example queue."""

        if self._example_generator is None:
            input_gen = self.text_generator(
                data.example_generator(self._data_path, self._single_pass, self._cnn_500_dm_500, is_original=False))
        else:
            input_gen = self.text_generator(self._example_generator)
        counter = 0
        while True:
            try:
                (article,
                 abstracts, doc_indices_str, raw_article_sents, ssi) = next(input_gen)  # read the next example from file. article and abstract are both strings.
            except StopIteration:  # if there are no more examples:
                logging.info("The example generator for this example queue filling thread has exhausted data.")
                if self._single_pass:
                    logging.info(
                        "single_pass mode is on, so we've finished reading dataset. This thread is stopping.")
                    self._finished_reading = True
                    break
                else:
                    raise Exception("single_pass mode is off but the example generator is out of data; error.")

            all_abstract_sentences = [[sent.strip() for sent in data.abstract2sents(
                abstract)] for abstract in abstracts]
            if len(all_abstract_sentences) != 0:
                abstract_sentences = all_abstract_sentences[0]
            else:
                abstract_sentences = []
            doc_indices = [int(idx) for idx in doc_indices_str.strip().split()]
            if self._hps.by_instance:   # if we are running iteratively on only instances (a singleton/pair + a summary sentence), not the whole article
                for abs_idx, abstract_sentence in enumerate(abstract_sentences):
                    inst_ssi = ssi[abs_idx]
                    if len(inst_ssi) == 0:
                        continue
                    inst_abstract_sentences = abstract_sentence
                    inst_raw_article_sents = util.reorder(raw_article_sents, inst_ssi)
                    inst_article = ' '.join([' '.join(util.process_sent(sent, whitespace=True)) for sent in inst_raw_article_sents])
                    inst_doc_indices = [0] * len(inst_article.split())

                    if len(inst_article) == 0:  # See https://github.com/abisee/pointer-generator/issues/1
                        logging.warning(
                            'Found an example with empty article text. Skipping it.\n*********************************************')
                    elif len(inst_article.strip().split()) < 3 and self._hps.skip_with_less_than_3:
                        print(
                            'Article has less than 3 tokens, so skipping\n*********************************************')
                    elif len(inst_abstract_sentences.strip().split()) < 3 and self._hps.skip_with_less_than_3:
                        print(
                            'Abstract has less than 3 tokens, so skipping\n*********************************************')
                    else:
                        inst_example = Example(inst_article, [inst_abstract_sentences], all_abstract_sentences, inst_doc_indices, inst_raw_article_sents, None, self._vocab, self._hps)
                        self._example_queue.put(inst_example)
            else:
                example = Example(article, abstract_sentences, all_abstract_sentences, doc_indices, raw_article_sents, ssi, self._vocab, self._hps)  # Process into an Example.
                self._example_queue.put(example)  # place the Example in the example queue.

            # print "example num", counter
            counter += 1
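Finally, fill_example_queue is not usually called directly: in the pointer-generator Batcher it runs on background threads that feed the bounded example queue while a separate batching thread drains it. A minimal sketch of how such filler threads are typically launched (the helper below is illustrative, not from the original code):

from threading import Thread

def start_example_queue_threads(batcher, num_threads=1):
    # Run each fill_example_queue loop on a daemon thread; in single_pass
    # mode the thread exits when the generator is exhausted, otherwise it
    # keeps refilling the queue indefinitely.
    threads = []
    for _ in range(num_threads):
        t = Thread(target=batcher.fill_example_queue)
        t.daemon = True
        t.start()
        threads.append(t)
    return threads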