Пример #1
0
def test_provide_all_tasks():
    """
    Tasks.from_chadh() should emit a (novel_id, year) argument dict for
    each of the ten years starting at every novel's `year`.
    """
    # One novel per decade, keyed by the year it was created with.
    novels_by_start = {
        start: ChadhNovelFactory(year=start)
        for start in (1910, 1920, 1930)
    }

    # One article for every year from 1910 through 1940.
    for article_year in range(1910, 1941):
        BPOArticleFactory(year=article_year)

    session.commit()

    partitions = Tasks.from_chadh().partitions(3)

    # Flatten the per-partition argument lists into a single list.
    flat_args = [
        arg
        for chunk in partitions.make_args()
        for arg in chunk
    ]

    for start, novel in novels_by_start.items():
        for year in range(start, start + 10):
            assert dict(novel_id=novel.id, year=year) in flat_args
Пример #2
0
def test_multiple_matches():

    """
    A novel whose text overlaps two different articles should get exactly
    one alignment row per article, each with the expected offsets.
    """

    novel = ChadhNovelFactory(text='aaa bbb ccc ddd eee fff')

    head_article = BPOArticleFactory(text='aaa bbb ccc')
    tail_article = BPOArticleFactory(text='ddd eee fff')

    session.commit()

    call(['mpirun', 'bin/ext-alignments.py'])
    call(['bin/gather-alignments.py'])

    # Fetch both rows up front; `.one()` raises unless exactly one exists.
    head_row, tail_row = [
        Alignment.query
        .filter_by(a_id=novel.id, b_id=article.record_id)
        .one()
        for article in (head_article, tail_article)
    ]

    # First three tokens of the novel line up with the first article.
    assert (head_row.a_start, head_row.b_start, head_row.size) == (0, 0, 3)

    # Last three tokens (offset 3 in the novel) line up with the second.
    assert (tail_row.a_start, tail_row.b_start, tail_row.size) == (3, 0, 3)
Пример #3
0
def test_year_range():

    """
    Only articles dated within the 10 years after the novel's year should
    be aligned.
    """

    novel = ChadhNovelFactory(text='aaa bbb ccc', year=1900)

    inside_window = BPOArticleFactory(text='aaa bbb ccc', year=1905)
    outside_window = BPOArticleFactory(text='aaa bbb ccc', year=1915)

    session.commit()

    call(['mpirun', 'bin/ext-alignments.py'])
    call(['bin/gather-alignments.py'])

    # The 1905 article falls inside the window, so exactly one row exists.
    kept = (
        Alignment.query
        .filter_by(a_id=novel.id, b_id=inside_window.record_id)
        .one()
    )
    assert kept

    # The 1915 article is more than 10 years out, so nothing is recorded.
    skipped = (
        Alignment.query
        .filter_by(a_id=novel.id, b_id=outside_window.record_id)
        .first()
    )
    assert not skipped
Пример #4
0
def test_ext_alignments():

    """
    Each novel should end up with one alignment row for every article
    that shares its text.
    """

    novel_abc = ChadhNovelFactory(text='aaa bbb ccc')
    novel_def = ChadhNovelFactory(text='ddd eee fff')

    # Two articles duplicating each novel's text.
    articles_abc = [BPOArticleFactory(text='aaa bbb ccc') for _ in range(2)]
    articles_def = [BPOArticleFactory(text='ddd eee fff') for _ in range(2)]

    session.commit()

    call(['mpirun', 'bin/ext-alignments.py'])
    call(['bin/gather-alignments.py'])

    # TODO: Test snippets?

    expected_pairs = [
        (novel.id, article.record_id)
        for novel, articles in (
            (novel_abc, articles_abc),
            (novel_def, articles_def),
        )
        for article in articles
    ]

    for novel_id, record_id in expected_pairs:
        # `.one()` raises unless exactly one matching row exists.
        assert (
            Alignment.query
            .filter_by(a_id=novel_id, b_id=record_id)
            .one()
        )
Пример #5
0
    def ingest(cls, result_dir: str):
        """Ingest BPO articles.

        Every path returned by `scan_paths` for the `.json` pattern under
        `result_dir` is parsed with `ujson` and the parsed payload is handed
        to `session.bulk_insert_mappings` for `cls`. The session is committed
        once per file, and a progress line (ISO timestamp plus file index) is
        printed after each commit.

        Args:
            result_dir: Directory passed to `scan_paths` to locate the files.
                Presumably each file holds a list of row dicts for `cls` --
                confirm against whatever writes these files.
        """
        # Raw string: backslash-dot is not a valid escape in a plain literal
        # and triggers a warning on modern Python. The pattern is unchanged.
        paths = scan_paths(result_dir, r'\.json')

        # Walk paths.
        for i, path in enumerate(paths):

            # Parse inside the `with` so the file is closed before the
            # database work begins.
            with open(path) as fh:
                mappings = ujson.load(fh)

            # Bulk-insert articles.
            session.bulk_insert_mappings(cls, mappings)
            session.commit()

            print(dt.now().isoformat(), i)
Пример #6
0
    def ingest(cls, corpus_dir: str):
        """Ingest texts.

        For every path returned by `scan_paths` for the `.txt` pattern under
        `corpus_dir`, the slug is the file's base name without its extension
        and the year is the first run of four digits found in that slug. One
        `cls` row per file is added to the session, and a single commit is
        issued after all files have been read.

        Args:
            corpus_dir: Directory passed to `scan_paths` to locate the texts.

        Raises:
            AttributeError: If a slug contains no four-digit run, because
                `re.search` then returns None.
        """
        # Raw strings keep the regex backslash from being read as an
        # (invalid) string escape; the patterns themselves are unchanged.
        for path in scan_paths(corpus_dir, r'\.txt'):

            slug = os.path.splitext(os.path.basename(path))[0]

            year = int(re.search(r'[0-9]{4}', slug).group())

            # Read the whole text, then release the file handle before
            # building the row.
            with open(path) as fh:
                text = fh.read()

            session.add(cls(slug=slug, year=year, text=text))

        session.commit()
Пример #7
0
def test_flush_matches():

    """
    Every match should be persisted when far more are produced than fit in
    one flush (the flush threshold is said to be 1k; it is not visible
    from this test).
    """

    article_count = 3000

    ChadhNovelFactory(text='aaa bbb ccc')

    # Every article duplicates the novel's text, so each one should match.
    for _ in range(article_count):
        BPOArticleFactory(text='aaa bbb ccc')

    session.commit()

    call(['mpirun', 'bin/ext-alignments.py'])
    call(['bin/gather-alignments.py'])

    assert Alignment.query.count() == article_count
Пример #8
0
    def gather(cls, result_dir: str):

        """
        Bulk-insert alignments.

        Every path returned by `scan_paths` for the `.json` pattern under
        `result_dir` is opened in binary mode and parsed with `ujson`; the
        parsed mappings are handed to `session.bulk_insert_mappings` for
        `cls`. The session is committed once per file, and a progress line
        (ISO timestamp plus file index) is printed after each commit.

        Args:
            result_dir: Directory passed to `scan_paths` to locate the files.
                Presumably each file holds a list of row dicts for `cls` --
                confirm against whatever writes these files.
        """

        # Raw string: backslash-dot is not a valid escape in a plain literal
        # and triggers a warning on modern Python. The pattern is unchanged.
        paths = scan_paths(result_dir, r'\.json')

        # Walk paths.
        for i, path in enumerate(paths):

            # Parse inside the `with` so the file is closed before the
            # database work begins.
            with open(path, 'rb') as fh:
                mappings = ujson.load(fh)

            # Bulk-insert matches.
            session.bulk_insert_mappings(cls, mappings)
            session.commit()

            print(dt.now().isoformat(), i)