def test_provide_all_tasks():

    """
    Tasks.from_chadh() should provide all tasks from the Chadwyck novels.
    """

    n1 = ChadhNovelFactory(year=1910)
    n2 = ChadhNovelFactory(year=1920)
    n3 = ChadhNovelFactory(year=1930)

    for year in range(1910, 1941):
        BPOArticleFactory(year=year)

    session.commit()

    tasks = Tasks.from_chadh()

    partitions = tasks.partitions(3)

    # Flatten the partitioned task args.
    args = []
    for p in partitions.make_args():
        args += p

    # Each novel should be paired with the 10 years after publication.
    for year in range(1910, 1920):
        assert dict(novel_id=n1.id, year=year) in args

    for year in range(1920, 1930):
        assert dict(novel_id=n2.id, year=year) in args

    for year in range(1930, 1940):
        assert dict(novel_id=n3.id, year=year) in args

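# The test above assumes Tasks.partitions(n).make_args() yields one list of
# keyword-argument dicts per worker, which together cover the full
# novel/year product. A minimal sketch of that partitioning pattern, assuming
# a flat list of task dicts (the real Tasks class may differ):

def partition_args(tasks: list, n: int) -> list:
    """Split task kwargs into n roughly equal chunks, one per worker."""
    return [tasks[i::n] for i in range(n)]

# e.g. partition_args([dict(novel_id=1, year=y) for y in range(1910, 1920)], 3)
# -> 3 lists whose union is the original 10 task dicts.
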
def test_multiple_matches():

    """
    Record multiple matches for the same pair.
    """

    n1 = ChadhNovelFactory(text='aaa bbb ccc ddd eee fff')

    a1 = BPOArticleFactory(text='aaa bbb ccc')
    a2 = BPOArticleFactory(text='ddd eee fff')

    session.commit()

    call(['mpirun', 'bin/ext-alignments.py'])
    call(['bin/gather-alignments.py'])

    match1 = (
        Alignment.query
        .filter_by(a_id=n1.id, b_id=a1.record_id)
        .one()
    )

    match2 = (
        Alignment.query
        .filter_by(a_id=n1.id, b_id=a2.record_id)
        .one()
    )

    assert match1.a_start == 0
    assert match1.b_start == 0
    assert match1.size == 3

    assert match2.a_start == 3
    assert match2.b_start == 0
    assert match2.size == 3

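# The a_start / b_start / size fields asserted above mirror the match triples
# produced by Python's difflib. A sketch of how the second match could be
# recovered for the same novel/article pair, assuming token-level comparison
# (the actual extractor in bin/ext-alignments.py may work differently):

from difflib import SequenceMatcher

novel = 'aaa bbb ccc ddd eee fff'.split()
article = 'ddd eee fff'.split()

matcher = SequenceMatcher(a=novel, b=article, autojunk=False)

for m in matcher.get_matching_blocks():
    if m.size > 0:
        print(m.a, m.b, m.size)  # -> 3 0 3, matching match2 above
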
def test_year_range():

    """
    Just check the 10 years after publication.
    """

    n1 = ChadhNovelFactory(text='aaa bbb ccc', year=1900)

    a1 = BPOArticleFactory(text='aaa bbb ccc', year=1905)
    a2 = BPOArticleFactory(text='aaa bbb ccc', year=1915)

    session.commit()

    call(['mpirun', 'bin/ext-alignments.py'])
    call(['bin/gather-alignments.py'])

    # Match the article within 10 years of publication.
    assert (
        Alignment.query
        .filter_by(a_id=n1.id, b_id=a1.record_id)
        .one()
    )

    # Ignore the article >10 years after publication.
    assert not (
        Alignment.query
        .filter_by(a_id=n1.id, b_id=a2.record_id)
        .first()
    )

def test_ext_alignments():

    """
    ExtAlignments should record BPO -> Chadh alignments.
    """

    n1 = ChadhNovelFactory(text='aaa bbb ccc')
    n2 = ChadhNovelFactory(text='ddd eee fff')

    a1 = BPOArticleFactory(text='aaa bbb ccc')
    a2 = BPOArticleFactory(text='aaa bbb ccc')
    a3 = BPOArticleFactory(text='ddd eee fff')
    a4 = BPOArticleFactory(text='ddd eee fff')

    session.commit()

    call(['mpirun', 'bin/ext-alignments.py'])
    call(['bin/gather-alignments.py'])

    # TODO: Test snippets?

    for a_id, b_id in [
        (n1.id, a1.record_id),
        (n1.id, a2.record_id),
        (n2.id, a3.record_id),
        (n2.id, a4.record_id),
    ]:
        assert (
            Alignment.query
            .filter_by(a_id=a_id, b_id=b_id)
            .one()
        )

def ingest(cls, result_dir: str):

    """
    Ingest BPO articles.
    """

    paths = scan_paths(result_dir, r'\.json')

    # Walk paths.
    for i, path in enumerate(paths):

        with open(path) as fh:

            # Bulk-insert articles.
            session.bulk_insert_mappings(cls, ujson.load(fh))
            session.commit()

        print(dt.now().isoformat(), i)

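# scan_paths() is a project helper, not shown here. A minimal sketch of the
# behavior the ingest routines appear to rely on - walk a directory and yield
# paths whose file names match a regex (assumed signature; the real helper may
# differ):

import os
import re

def scan_paths(root: str, pattern: str):
    """Yield paths under `root` whose file names match `pattern`."""
    for dirpath, _, filenames in os.walk(root):
        for name in filenames:
            if re.search(pattern, name):
                yield os.path.join(dirpath, name)
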
def ingest(cls, corpus_dir: str):

    """
    Ingest texts.
    """

    for path in scan_paths(corpus_dir, r'\.txt'):

        # Parse the publication year out of the file slug.
        slug = os.path.splitext(os.path.basename(path))[0]
        year = int(re.search('[0-9]{4}', slug).group())

        with open(path) as fh:
            novel = cls(slug=slug, year=year, text=fh.read())
            session.add(novel)
            session.commit()

def test_flush_matches():

    """
    Matches should be flushed when the buffer goes over 1k.
    """

    ChadhNovelFactory(text='aaa bbb ccc')

    for i in range(3000):
        BPOArticleFactory(text='aaa bbb ccc')

    session.commit()

    call(['mpirun', 'bin/ext-alignments.py'])
    call(['bin/gather-alignments.py'])

    assert Alignment.query.count() == 3000

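# The flush behavior tested above suggests the extractor buffers matches and
# writes them out once the buffer passes a threshold (1k per the docstring).
# A minimal sketch of that pattern with hypothetical names - the class, sink
# callable, and default size are assumptions, not the project's actual code:

class MatchBuffer:

    """Collect match dicts and hand them to a sink once the buffer fills up."""

    def __init__(self, sink, flush_size: int = 1000):
        self.sink = sink
        self.flush_size = flush_size
        self.matches = []

    def add(self, match: dict):
        self.matches.append(match)
        if len(self.matches) > self.flush_size:
            self.flush()

    def flush(self):
        if self.matches:
            self.sink(self.matches)
            self.matches = []
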
def gather(cls, result_dir: str):

    """
    Bulk-insert alignments.
    """

    paths = scan_paths(result_dir, r'\.json')

    # Walk paths.
    for i, path in enumerate(paths):

        with open(path, 'rb') as fh:
            mappings = ujson.load(fh)

        # Bulk-insert matches.
        session.bulk_insert_mappings(cls, mappings)
        session.commit()

        print(dt.now().isoformat(), i)

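# SQLAlchemy's bulk_insert_mappings() expects plain dicts keyed by column
# names, so the JSON written by the extraction step presumably looks something
# like the following (field names inferred from the tests above, not read from
# the actual output files):

example_mappings = [
    dict(a_id=1, b_id='bpo-123', a_start=0, b_start=0, size=3),
    dict(a_id=1, b_id='bpo-124', a_start=3, b_start=0, size=3),
]

# session.bulk_insert_mappings(Alignment, example_mappings)
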