def test_ignore_non_xml_files(mock_jstor):
    """
    JSTOR_Corpus#paths() should yield only the article manifests and skip
    any non-XML files that live in the same directory.
    """
    # Three XML manifests; the values returned by add_article() are what
    # paths() is expected to yield.
    manifests = [mock_jstor.add_article() for _ in range(3)]

    # Three non-XML files, written next to the first manifest.
    manifest_dir = os.path.dirname(manifests[0])
    for extension in ('js', 'zip', 'txt'):
        decoy = os.path.join(manifest_dir, 'test.' + extension)
        with open(decoy, 'w') as handle:
            print('content', file=handle)

    found = list(JSTOR_Corpus(mock_jstor.path).paths())

    # Same members, and no path reported more than once.
    assert set(found) == set(manifests)
    assert len(found) == 3
def test_generate_paths(mock_jstor):
    """
    JSTOR_Corpus#paths() should yield the path of every article manifest
    that was added to the mock corpus.
    """
    # Add three articles and keep the values add_article() hands back.
    expected = [mock_jstor.add_article() for _ in range(3)]

    actual = list(JSTOR_Corpus(mock_jstor.path).paths())

    # Same members, and no path reported more than once.
    assert set(actual) == set(expected)
    assert len(actual) == 3
def ingest_jstor(cls):
    """
    Ingest JSTOR records.

    Builds a corpus with JSTOR_Corpus.from_env() and calls
    cls.create(**text) for every item produced by corpus.texts(). An
    exception raised while creating a record is printed and the loop
    moves on to the next item, so one bad record does not abort the run.
    """
    records = JSTOR_Corpus.from_env().texts()

    for position, fields in enumerate(records):
        try:
            cls.create(**fields)
        except Exception as error:
            # Best-effort ingest: report the failure and keep going.
            print(error)

        # Progress counter: the carriage return makes each index
        # overwrite the previous one on the same terminal line.
        sys.stdout.write('\r' + str(position))
        sys.stdout.flush()
def test_ignore_non_xml_files(mock_jstor):
    """
    JSTOR_Corpus#paths() should ignore files that are not XML manifests.
    """
    # 3 XML manifests.
    xml_paths = [mock_jstor.add_article() for _ in range(3)]

    # 3 non-XML files, placed in the directory of the first manifest.
    target_dir = os.path.dirname(xml_paths[0])
    other_files = [
        os.path.join(target_dir, 'test.' + suffix)
        for suffix in ('js', 'zip', 'txt')
    ]
    for other_file in other_files:
        with open(other_file, 'w') as out:
            print('content', file=out)

    corpus = JSTOR_Corpus(mock_jstor.path)
    yielded = list(corpus.paths())

    # Only the manifests come back, each exactly once.
    assert set(yielded) == set(xml_paths)
    assert len(yielded) == 3