Example #1
    def test_cloze_lead(self):
        with tempfile.NamedTemporaryFile(suffix='.jsonl') as output_file:
            Args = namedtuple('Args', [
                'input_jsonl', 'output_jsonl', 'max_sentences', 'max_tokens',
                'max_bytes', 'field_name', 'keep_sentences'
            ])
            args = Args(f'{FIXTURES_ROOT}/data/cloze.jsonl', output_file.name,
                        1, None, None, 'cloze', True)
            lead.main(args)

            instances = JsonlReader(output_file.name).read()
            assert len(instances) == 25
            assert all('cloze' in instance for instance in instances)
            assert all(
                isinstance(instance['cloze'], list) for instance in instances)

            args = Args(f'{FIXTURES_ROOT}/data/cloze.jsonl', output_file.name,
                        1, None, None, 'cloze', False)
            lead.main(args)

            instances = JsonlReader(output_file.name).read()
            assert len(instances) == 25
            assert all('cloze' in instance for instance in instances)
            assert all(
                isinstance(instance['cloze'], str) for instance in instances)
Example #2
def main(args):
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.source_jsonl) as source:
            with JsonlReader(args.target_jsonl) as target:
                for source_instance, target_instance in zip(source, target):
                    for source_field, target_field in args.field_names:
                        target_instance[target_field] = source_instance[
                            source_field]
                    out.write(target_instance)
Example #3
def main(args):
    dfs, num_documents, avg_document_length = load_dfs(args.df_jsonl)

    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in tqdm(f):
                context = instance['context']
                context_tokens = set(token.lower() for sentence in context
                                     for token in sentence.split())
                document = instance['document']

                bm25_scores = []
                for sentence in document:
                    tokenized_sentence = [
                        token.lower() for token in sentence.split()
                    ]
                    bm25 = calculate_bm25(context_tokens, tokenized_sentence,
                                          dfs, num_documents,
                                          avg_document_length, args.k, args.b)

                    bm25_scores.append((bm25, sentence))

                cloze = get_cloze(bm25_scores, args.max_words,
                                  args.max_sentences, args.flatten)
                out.write({'cloze': cloze})
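
calculate_bm25 and get_cloze are defined elsewhere in the repository and are not shown on this page. As a rough illustration only, a standard Okapi BM25 scorer with the same signature might look like the sketch below; the IDF smoothing variant is an assumption, not necessarily the repository's choice:

import math

def calculate_bm25(query_tokens, sentence_tokens, dfs, num_documents,
                   avg_document_length, k, b):
    # Illustrative sketch: score one sentence against the query tokens with
    # Okapi BM25, treating the sentence as the "document" being ranked.
    score = 0.0
    length_norm = k * (1 - b + b * len(sentence_tokens) / avg_document_length)
    for token in query_tokens:
        tf = sentence_tokens.count(token)
        if tf == 0:
            continue
        df = dfs.get(token, 0)
        # Smoothed IDF (one common variant; the actual helper may differ)
        idf = math.log((num_documents - df + 0.5) / (df + 0.5) + 1)
        score += idf * tf * (k + 1) / (tf + length_norm)
    return score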
Example #4
    def _load_gold_summaries(self) -> List[List[str]]:
        # Loads just the first summary for testing purposes
        summaries = []
        with JsonlReader(_duc2004_file_path) as f:
            for instance in f:
                summaries.append(instance['summaries'][0])
        return summaries
Example #5
    def _read(self, file_path: str) -> Iterable[Instance]:
        file_path = cached_path(file_path)
        with JsonlReader(file_path) as f:
            for data in f:
                document = data['document']
                summary = data['summary']
                yield self.text_to_instance(document, summary=summary)
Example #6
def main(args):
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                labels = instance['labels']
                summary = [document[index] for index in labels]
                out.write({'summary': summary})
Example #7
    def _read(self, file_path: str) -> Iterable[Instance]:
        file_path = cached_path(file_path)
        with JsonlReader(file_path) as f:
            for instance in f:
                document = instance['document']
                topics = instance['topics']
                context = instance['context']
                cloze = instance['cloze']
                yield self.text_to_instance(document, topics, context, cloze=cloze)
Example #8
def _load_summaries(
    file_path: str,
    field_name: str = 'summary'
) -> Union[List[List[str]], List[List[List[str]]]]:
    summaries = []
    with JsonlReader(file_path) as f:
        for data in f:
            summaries.append(data[field_name])
    return summaries
Example #9
def main(args):
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                summary = get_lead_summary(document,
                                           max_sentences=args.max_sentences,
                                           max_tokens=args.max_tokens,
                                           max_bytes=args.max_bytes)
                out.write({'summary': summary})
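
get_lead_summary comes from the repository's utilities and is not shown here. A hedged sketch of the lead baseline it names, assuming each cutoff is optional and that selection stops before a limit would be exceeded, is:

def get_lead_summary(document, max_sentences=None, max_tokens=None,
                     max_bytes=None):
    # Illustrative sketch: take sentences from the front of the document
    # until any of the optional budgets would be exceeded. The real helper
    # may instead truncate mid-sentence at the token/byte limits.
    summary, num_tokens, num_bytes = [], 0, 0
    for sentence in document:
        if max_sentences is not None and len(summary) >= max_sentences:
            break
        if max_tokens is not None and num_tokens + len(sentence.split()) > max_tokens:
            break
        if max_bytes is not None and num_bytes + len(sentence) > max_bytes:
            break
        summary.append(sentence)
        num_tokens += len(sentence.split())
        num_bytes += len(sentence)
    return summary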
Example #10
def main(args):
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                labels = instance['labels']
                cloze = [document[index] for index in labels]
                if not args.keep_sentences:
                    cloze = ' '.join(cloze)
                out.write({args.field_name: cloze})
Example #11
    def test_chen2018(self):
        """
        Tests to ensure that Meteor returns the expected score on the
        Chen 2018 data subset. I ran Meteor on the full data (~11k examples),
        which takes too long for a unit test. After confirming that the
        numbers match what is reported in the paper, I reran the code on
        just the subset, and this test ensures those numbers are returned.
        """
        gold_file_path = f'{FIXTURES_ROOT}/data/chen2018/gold.jsonl'
        model_file_path = f'{FIXTURES_ROOT}/data/chen2018/model.jsonl'

        gold = JsonlReader(gold_file_path).read()
        model = JsonlReader(model_file_path).read()

        gold = [' '.join(summary['summary']) for summary in gold]
        model = [' '.join(summary['summary']) for summary in model]

        score = run_meteor(gold, model)
        assert abs(score - 18.28372) < 1e-5
Example #12
    def _read(self, file_path: str) -> Iterable[Instance]:
        file_path = cached_path(file_path)
        with JsonlReader(file_path) as f:
            for data in f:
                document = data['document']
                labels = data['labels']
                summary = data['summary'] if 'summary' in data else None
                yield self.text_to_instance(document,
                                            labels=labels,
                                            summary=summary)
Example #13
    def test_read(self):
        # Write the data to a file
        temp_file = tempfile.NamedTemporaryFile(suffix='.jsonl')
        with open(temp_file.name, 'w') as out:
            for item in self.data:
                serialized = json.dumps(item)
                out.write(serialized + '\n')

        # Load from file, ensure it is correct
        actual_data = JsonlReader(temp_file.name).read()
        self.assertEqual(self.data, actual_data)
Example #14
def load_dfs(file_path: str) -> Tuple[Dict[str, int], int, float]:
    dfs = defaultdict(int)
    with JsonlReader(file_path) as f:
        for i, data in enumerate(f):
            if i == 0:
                num_documents = data['num_documents']
                avg_document_length = data['average_document_length']
            else:
                token = data['token']
                df = data['df']
                dfs[token] = df
    return dfs, num_documents, avg_document_length
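
From the branch on i == 0, the df file evidently stores corpus-level statistics on its first line, followed by one document-frequency entry per token. Roughly (all values illustrative):

{"num_documents": 25, "average_document_length": 31.4}
{"token": "the", "df": 25}
{"token": "cloze", "df": 3}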
Example #15
def main(args):
    if args.backend == 'spacy':
        nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])
    elif args.backend == 'nltk':
        nlp = nltk.word_tokenize
    else:
        # Without this branch, nlp would be unbound for an unknown backend
        raise ValueError(f'Unknown tokenization backend: {args.backend}')

    with JsonlWriter(args.output_file) as out:
        with JsonlReader(args.input_file) as f:
            for instance in tqdm(f, desc=f'Tokenizing {args.input_file}'):
                for field in args.fields:
                    instance[field] = tokenize(nlp, instance[field])
                out.write(instance)
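
The tokenize helper is not shown on this page. One plausible sketch consistent with both backends above (a guess, not the repository's code):

def tokenize(nlp, text):
    # Hypothetical helper: apply whichever backend was selected and rejoin
    # the tokens with single spaces. spaCy pipelines expose a .pipe method,
    # which distinguishes them from the plain nltk.word_tokenize function.
    if isinstance(text, list):
        return [tokenize(nlp, item) for item in text]
    if hasattr(nlp, 'pipe'):
        return ' '.join(token.text for token in nlp(text))
    return ' '.join(nlp(text))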
Example #16
def main(args):
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                cloze = get_lead_summary(document,
                                         max_sentences=args.max_sentences,
                                         max_tokens=args.max_tokens,
                                         max_bytes=args.max_bytes)
                if not args.keep_sentences:
                    cloze = ' '.join(cloze)
                out.write({args.field_name: cloze})
Example #17
def _load_summaries(
    file_path: str,
    field_name: str = 'summary',
    add_wrapping_list: bool = False
) -> Union[List[List[str]], List[List[List[str]]]]:
    summaries = []
    with JsonlReader(cached_path(file_path)) as f:
        for data in f:
            summary = data[field_name]
            if add_wrapping_list:
                summary = [summary]
            summaries.append(summary)
    return summaries
Example #18
def main(args):
    python_rouge = PythonRouge()
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            with Parallel(n_jobs=args.num_cores) as parallel:
                batch = []
                for instance in tqdm(f):
                    batch.append(instance)
                    if len(batch) == _BATCH_SIZE:
                        _process_batch(parallel, batch, python_rouge, out)
                        batch.clear()

                if batch:
                    _process_batch(parallel, batch, python_rouge, out)
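
_process_batch and _BATCH_SIZE come from the surrounding module and are not shown. A hedged sketch of the batching pattern with joblib, where _score_instance is a hypothetical per-instance scoring helper, could be:

from joblib import delayed

_BATCH_SIZE = 100  # assumed; the actual value is not shown

def _process_batch(parallel, batch, python_rouge, out):
    # Illustrative sketch: fan the batch out across the workers, then write
    # the results back in order.
    results = parallel(
        delayed(_score_instance)(instance, python_rouge)
        for instance in batch)
    for result in results:
        out.write(result)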
Example #19
    def test_bz2_file(self):
        # Write the data to a file
        temp_file = tempfile.NamedTemporaryFile(suffix='.jsonl.bz2')
        with bz2.open(temp_file.name, 'wb') as out:
            for item in self.data:
                serialized = json.dumps(item).encode()
                out.write(serialized + b'\n')

        # Load from file, ensure it is correct
        actual_data = []
        with JsonlReader(temp_file.name) as f:
            for item in f:
                actual_data.append(item)
        self.assertEqual(self.data, actual_data)
Example #20
def main(args):
    # The number of times each document appears
    document_to_num_occurrences = Counter()
    # The histogram of the document set sizes
    document_set_sizes = Counter()
    # The mapping from the document to the page ids
    document_to_page_ids = defaultdict(set)

    with JsonlReader(args.input_jsonl) as f:
        for instance in tqdm(f):
            page_id = instance['page_id']
            documents = instance['documents']
            document_set_sizes[len(documents)] += 1

            for document in documents:
                url = document['canonical_url']
                document_to_num_occurrences[url] += 1
                document_to_page_ids[url].add(page_id)

    # The histogram for the number of times a document appears
    num_occurrences_to_num_documents = Counter()
    for count in document_to_num_occurrences.values():
        num_occurrences_to_num_documents[count] += 1

    # The histogram for the number of pages a document appears in
    num_pages_to_num_documents = Counter()
    for page_ids in document_to_page_ids.values():
        num_pages_to_num_documents[len(page_ids)] += 1

    num_instances = sum(document_set_sizes.values())
    num_multidoc = num_instances - document_set_sizes[1]

    num_unique_documents = len(document_to_num_occurrences)
    num_documents_multiple_times = (
        num_unique_documents - num_occurrences_to_num_documents[1])

    num_documents_multiple_pages = (
        num_unique_documents - num_pages_to_num_documents[1])

    print(f'Total unique documents: {num_unique_documents}')
    print(
        f'Total multi-document: {num_multidoc} ({num_multidoc / num_instances * 100:.2f}%)'
    )
    print(
        f'Total documents that appear more than once: {num_documents_multiple_times} ({num_documents_multiple_times / num_unique_documents * 100:.2f}%)'
    )
    print(
        f'Total documents that appear in more than one page: {num_documents_multiple_pages} ({num_documents_multiple_pages / num_unique_documents * 100:.2f}%)'
    )
Example #21
    def test_gzip_file_no_extension(self):
        """Tests a gzip file that does not have a ".gz" extension."""
        # Write the data to a file
        temp_file = tempfile.NamedTemporaryFile()
        with gzip.open(temp_file.name, 'wb') as out:
            for item in self.data:
                serialized = json.dumps(item).encode()
                out.write(serialized + b'\n')

        # Load from file, ensure it is correct
        actual_data = []
        with JsonlReader(temp_file.name) as f:
            for item in f:
                actual_data.append(item)
        self.assertEqual(self.data, actual_data)
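
Taken together, the tests in Examples #13, #19, and #21 pin down the reader's contract: it is a context manager, iterating it yields one parsed JSON object per line, read() returns everything as a list, and compression is detected from the file contents rather than the extension. A minimal sketch with that behavior (not the library's actual implementation) is:

import bz2
import gzip
import json

class JsonlReader(object):
    # Illustrative sketch: sniff the compression format from the file's
    # magic bytes, then yield one JSON object per line.
    def __init__(self, file_path):
        self.file_path = file_path

    def __enter__(self):
        with open(self.file_path, 'rb') as f:
            magic = f.read(3)
        if magic[:2] == b'\x1f\x8b':       # gzip magic bytes
            self._file = gzip.open(self.file_path, 'rb')
        elif magic == b'BZh':              # bz2 magic bytes
            self._file = bz2.open(self.file_path, 'rb')
        else:
            self._file = open(self.file_path, 'rb')
        return self

    def __exit__(self, *exc):
        self._file.close()

    def __iter__(self):
        for line in self._file:
            yield json.loads(line)

    def read(self):
        with self as f:
            return list(f)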
Example #22
    def test_sumfocus_runs(self):
        with tempfile.NamedTemporaryFile(suffix='.jsonl') as output_file:
            Args = namedtuple('Args', [
                'input_jsonl', 'output_jsonl', 'beta', 'topic_lambda',
                'context_lambda', 'max_words', 'max_sentences'
            ])
            args = Args(f'{FIXTURES_ROOT}/data/cloze.jsonl', output_file.name,
                        0.5, 0.2, 0.3, 200, None)
            sumfocus.main(args)

            instances = JsonlReader(output_file.name).read()
            assert len(instances) == 25
            for instance in instances:
                assert 'cloze' in instance
                assert isinstance(instance['cloze'], str)
Example #23
def main(args):
    dirname = os.path.dirname(args.output_jsonl)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in tqdm(f):
                document = instance['document']
                topics = instance['topics']
                context = instance['context']
                cloze = run_sumfocus(document, topics, context, args.beta,
                                     args.topic_lambda, args.context_lambda,
                                     args.max_words, args.max_sentences)
                cloze = ' '.join(cloze)
                out.write({'cloze': cloze})
Example #24
def compute_file_statistics(file_path: str) -> Dict[str, float]:
    stats = get_default_stats()
    with JsonlReader(file_path) as f:
        for instance in tqdm(f, desc=f'Processing {file_path}'):
            document = instance['document']
            summary = instance['summary']
            stats[NUM_DOC_TOKENS] += [
                sum(len(sentence.split()) for sentence in document)
            ]
            stats[NUM_DOC_SENTS] += [len(document)]
            stats[NUM_SUM_TOKENS] += [
                sum(len(sentence.split()) for sentence in summary)
            ]
            stats[NUM_SUM_SENTS] += [len(summary)]
            stats[NUM_INSTANCES] += 1
    return stats
Example #25
def main(args):
    model_dir = args.model_dir
    length = args.length
    temperature = args.temperature
    top_k = args.top_k
    seed = args.seed

    lm = OpenAILanguageModel(model_dir, length, temperature, top_k, seed=seed)
    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in tqdm(f):
                context = instance['context']
                context = ' '.join(context)

                first_sentence = lm.sample_next_sentence(context)
                output_data = {'cloze': first_sentence}
                out.write(output_data)
Example #26
def main(args):
    python_rouge = PythonRouge()

    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                summary = instance['summary']
                _, labels = get_greedy_oracle_summary(document, summary, args.metric,
                                                      max_sentences=args.max_sentences,
                                                      max_tokens=args.max_tokens,
                                                      max_bytes=args.max_bytes,
                                                      use_porter_stemmer=args.use_stemmer,
                                                      remove_stopwords=args.remove_stopwords,
                                                      python_rouge=python_rouge)
                instance['labels'] = labels
                out.write(instance)
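
get_greedy_oracle_summary is defined elsewhere in the repository. The greedy-oracle idea it names is standard in extractive summarization; as a simplified sketch only (ignoring the token/byte budgets and stemming options, with score_fn as a hypothetical stand-in for a ROUGE scorer), it looks like this:

def get_greedy_oracle(document, summary, score_fn, max_sentences=None):
    # Simplified sketch: repeatedly add the document sentence that most
    # improves the score of the selected set against the reference summary,
    # stopping once no sentence helps or the sentence budget is exhausted.
    selected, labels, best_score = [], [], 0.0
    while max_sentences is None or len(selected) < max_sentences:
        best_gain, best_index = 0.0, None
        for index, sentence in enumerate(document):
            if index in labels:
                continue
            gain = score_fn(selected + [sentence], summary) - best_score
            if gain > best_gain:
                best_gain, best_index = gain, index
        if best_index is None:
            break
        selected.append(document[best_index])
        labels.append(best_index)
        best_score += best_gain
    return selected, sorted(labels)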
Example #27
    def test_bm25_runs(self):
        with tempfile.NamedTemporaryFile(suffix='.jsonl') as df_file:
            with tempfile.NamedTemporaryFile(suffix='.jsonl') as bm25_file:
                Args = namedtuple('Args', ['input_jsonl', 'output_jsonl'])
                args = Args(f'{FIXTURES_ROOT}/data/cloze.jsonl', df_file.name)
                calculate_df.main(args)

                Args = namedtuple('Args', [
                    'input_jsonl', 'df_jsonl', 'output_jsonl', 'k', 'b',
                    'max_words', 'max_sentences', 'flatten'
                ])
                args = Args(f'{FIXTURES_ROOT}/data/cloze.jsonl', df_file.name,
                            bm25_file.name, 1.2, 0.75, None, 1, True)
                bm25.main(args)

                instances = JsonlReader(bm25_file.name).read()
                assert len(instances) == 25
                for instance in instances:
                    assert 'cloze' in instance
                    assert isinstance(instance['cloze'], str)
Example #28
def main(args):
    nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])

    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in tqdm(f):
                instance['headings'] = [
                    tokenize(nlp, heading) for heading in instance['headings']
                ]
                for document in instance['documents']:
                    if document['title']:
                        document['title'] = tokenize(nlp, document['title'])
                    document['paragraphs'] = tokenize(nlp,
                                                      document['paragraphs'])

                instance['left_context'] = tokenize(nlp,
                                                    instance['left_context'])
                instance['cloze'] = tokenize(nlp, instance['cloze'])
                instance['right_context'] = tokenize(nlp,
                                                     instance['right_context'])
                out.write(instance)
Example #29
def main(args):
    python_rouge = PythonRouge()

    with JsonlWriter(args.output_jsonl) as out:
        with JsonlReader(args.input_jsonl) as f:
            for instance in f:
                document = instance['document']
                cloze = instance['cloze']
                oracle, labels = get_greedy_oracle_summary(
                    document, [cloze],
                    args.metric,
                    max_sentences=args.max_sentences,
                    max_tokens=args.max_tokens,
                    max_bytes=args.max_bytes,
                    use_porter_stemmer=args.use_stemmer,
                    remove_stopwords=args.remove_stopwords,
                    python_rouge=python_rouge)
                if args.cloze_only:
                    oracle = ' '.join(oracle)
                    out.write({'cloze': oracle})
                else:
                    instance['labels'] = labels
                    out.write(instance)
Example #30
def assert_line_count(file_path: str, expected_count: int):
    """
    Checks that the input file has the expected number of lines. The file
    should be a jsonl file.

    Parameters
    ----------
    file_path: ``str``
        The file to check.
    expected_count: ``int``
        The expected number of lines.

    Raises
    ------
    Exception: If the actual number of lines is not equal to the expected number.
    """
    count = 0
    with JsonlReader(file_path) as f:
        for _ in f:
            count += 1
    if count != expected_count:
        raise Exception(f'Unexpected number of lines in {file_path}. Found {count}, expected {expected_count}')
    print(f'{file_path} has expected number of lines')
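
A typical use is a sanity check after a preprocessing step; the path and count below are illustrative:

assert_line_count('data/train.tokenized.jsonl', 1000)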