def erratum_open_access_record():
    """Return a parsed erratum record from the IOP spider."""
    spider = iop_spider.IOPSpider()
    body = """
    <ArticleSet>
        <Article>
            <Journal>
                <PublisherName>Institute of Physics</PublisherName>
                <JournalTitle>J. Phys.: Conf. Ser.</JournalTitle>
                <Volume>143</Volume>
                <Issue>3</Issue>
            </Journal>
            <FirstPage LZero="save">336</FirstPage>
            <PublicationType>Published Erratum</PublicationType>
        </Article>
    </ArticleSet>
    """
    response = fake_response_from_string(body)
    node = get_node(spider, "Article", response)
    spider.pdf_files = get_test_suite_path(
        'responses',
        'iop',
        'pdf',
    )
    parsed_item = spider.parse_node(response, node)

    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
def get_parsed_from_file(filename):
    """A dictionary holding the parsed elements of the record."""
    path = get_test_suite_path('responses', 'crossref', filename)
    with open(path) as f:
        aps_dict = yaml.load(f)

    return aps_dict
def get_local_settings_for_broken():
    package_location = get_test_suite_path(
        'desy',
        'fixtures',
        'ftp_server',
        'DESY',
        'broken',
        test_suite='functional',
    )
    os.mkdir(package_location)
    tmp_file = os.path.join(package_location, 'broken_record.xml')
    with open(tmp_file, 'w') as f:
        f.write(
            "<?xml version='1.0' encoding='UTF-8'?>"
            "<collection>"
            "<record>"
            "<datafield tag='260' ind1=' ' ind2=' '>"
            "<subfield code='c'>BROKEN DATE</subfield>"
            "</datafield>"
            "</record>"
            "</collection>"
        )

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_folder': package_location,
        }
    }

    shutil.rmtree(package_location)
def get_parser_by_file(filename):
    """A CrossrefParser instantiated on a Crossref API response."""
    path = get_test_suite_path('responses', 'crossref', filename)
    with open(path) as f:
        aps_crossref = json.load(f)

    return CrossrefParser(aps_crossref)
def setup_s3():
    test_file_path = get_test_suite_path(
        "elsevier",
        "fixtures",
        "elsevier",
        test_suite="functional",
    )
    s3 = establish_s3_connection()
    packages_bucket = get_bucket(s3, CRAWLER_ARGS["packages_bucket_name"])
    articles_bucket = get_bucket(s3, CRAWLER_ARGS["files_bucket_name"])
    mock_elsevier_bucket = get_bucket(s3, "batch-feed")
    downloaded_files_bucket = get_bucket(s3, "downloaded")

    packages_bucket.create()
    articles_bucket.create()
    mock_elsevier_bucket.create()
    downloaded_files_bucket.create()

    mock_elsevier_bucket.upload_file(
        os.path.join(test_file_path, "test_zip_file.ZIP"),
        "test_zip_file.ZIP",
        ExtraArgs={'ACL': 'public-read'},
    )
    # Uploaded under the same key, so the replicated package overwrites the
    # first upload of "test_zip_file.ZIP".
    mock_elsevier_bucket.upload_file(
        os.path.join(test_file_path, "test_zip_file_replicated.ZIP"),
        "test_zip_file.ZIP",
        ExtraArgs={'ACL': 'public-read'},
    )
    mock_elsevier_bucket.upload_file(
        os.path.join(test_file_path, "wrong_articles.ZIP"),
        "wrong_articles.ZIP",
    )
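# Hedged sketch (not part of the original suite): one way a test could verify
# that `setup_s3` created the expected buckets, assuming
# `establish_s3_connection` returns a boto3 S3 service resource.
def example_assert_buckets_created():
    s3 = establish_s3_connection()
    existing = {bucket.name for bucket in s3.buckets.all()}
    # "batch-feed" and "downloaded" are created by `setup_s3` above; the two
    # CRAWLER_ARGS bucket names depend on the crawler configuration.
    assert "batch-feed" in existing
    assert "downloaded" in existing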
def get_parsed_from_file(filename):
    """A dictionary holding the parsed elements of the record."""
    path = get_test_suite_path('responses', 'elsevier', filename)
    with open(path) as f:
        elsevier_expected_dict = yaml.load(f)

    return elsevier_expected_dict
def get_parser_by_file(filename):
    """An ElsevierParser instantiated on an Elsevier article."""
    path = get_test_suite_path('responses', 'elsevier', filename)
    with open(path) as f:
        aps_elsevier = f.read()

    return ElsevierParser(aps_elsevier)
def get_parser_by_file(filename):
    """A JatsParser instantiated on an APS article."""
    path = get_test_suite_path('responses', 'aps', filename)
    with open(path) as f:
        aps_jats = f.read()

    return JatsParser(aps_jats)
def tarfile():
    """Return path to test tar.gz file."""
    return get_test_suite_path(
        'responses',
        'iop',
        'packages',
        'test.tar.gz',
    )
def load_file(file_name):
    path = get_test_suite_path(
        'responses',
        'tohep',
        file_name,
    )
    with open(path) as input_data:
        data = yaml.load(input_data.read())

    return data
def get_file_name_from_documents(documents_field):
    file_path = get_test_suite_path(
        'desy',
        'fixtures',
        'ftp_server',
        'DESY',
        'FFT',
        documents_field['key'],
        test_suite='functional',
    )
    return file_path
def get_file_name_from_fft(fft_field):
    file_path = get_test_suite_path(
        'desy',
        'fixtures',
        'ftp_server',
        'DESY',
        'FFT',
        fft_field['filename'] + fft_field['format'],
        test_suite='functional',
    )
    return file_path
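# Illustrative call with hypothetical values: `fft_field` carries the file
# name split across `filename` and `format`, which are concatenated to locate
# the fixture under the DESY FFT directory, e.g.
#
#     get_file_name_from_fft({'filename': 'some_file', 'format': '.pdf'})
#     # -> .../desy/fixtures/ftp_server/DESY/FFT/some_file.pdf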
def get_local_settings():
    package_location = get_test_suite_path(
        'wsp',
        'fixtures',
        'ftp_server',
        'WSP',
        test_suite='functional',
    )
    return {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'local_package_dir': package_location,
        }
    }
def get_local_settings():
    package_location = get_test_suite_path(
        'desy',
        'fixtures',
        'ftp_server',
        'DESY',
        test_suite='functional',
    )
    return {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_folder': package_location,
        }
    }
def get_ftp_settings():
    netrc_location = get_test_suite_path(
        'desy',
        'fixtures',
        'ftp_server',
        '.netrc',
        test_suite='functional',
    )
    return {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'ftp_host': 'ftp_server',
            'ftp_netrc': netrc_location,
        }
    }
def get_local_settings():
    package_location = get_test_suite_path(
        'wsp',
        'fixtures',
        'ftp_server',
        'WSP',
        test_suite='functional',
    )
    return {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'local_package_dir': package_location,
            'destination_folder': "/code/.tmp/WSP",
        }
    }
def set_up_local_environment():
    package_location = get_test_suite_path(
        'cds',
        'fixtures',
        'oai_harvested',
        'cds_smoke_records.xml',
        test_suite='functional',
    )

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_file': 'file://' + package_location,
        }
    }

    clean_dir()
def set_up_local_environment():
    package_location = get_test_suite_path(
        'wsp',
        'fixtures',
        'ftp_server',
        'WSP',
        test_suite='functional',
    )

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'package_path': package_location,
        }
    }

    remove_generated_files(package_location)
def set_up_local_environment():
    package_location = get_test_suite_path(
        'arxiv',
        'fixtures',
        'oai_harvested',
        'arxiv_smoke_record.xml',
        test_suite='functional',
    )

    # The test must wait until the docker environment is up
    # (takes about 5 seconds).
    sleep(5)

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_file': 'file://' + package_location,
        }
    }
def get_configuration():
    package_location = get_test_suite_path(
        'pos',
        'fixtures',
        'oai_harvested',
        'pos_record.xml',
        test_suite='functional',
    )
    return {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_file': 'file://' + package_location,
            'base_conference_paper_url':
                'https://http-server.local/contribution?id=',
            'base_proceedings_url':
                'https://http-server.local/cgi-bin/reader/conf.cgi?confid=',
        }
    }
def get_expected_parser_responses_for_new_articles_in_s3():
    test_file_path = get_test_suite_path(
        "elsevier",
        "fixtures",
        "elsevier",
        "parsed_records",
        test_suite="functional",
    )
    files = [
        "j.geomphys.2020.103898.yml",
        "j.geomphys.2020.103921.yml",
        "j.geomphys.2020.103925.yml",
        "j.geomphys.2020.103892.yml",
    ]
    responses = []
    for file in files:
        responses.append(
            get_parser_response_from_file(os.path.join(test_file_path, file))
        )
    return responses
def get_ftp_settings():
    netrc_location = get_test_suite_path(
        'wsp',
        'fixtures',
        'ftp_server',
        '.netrc',
        test_suite='functional',
    )

    # The test must wait until the docker environment is up
    # (takes about 10 seconds).
    sleep(10)

    return {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'ftp_host': 'ftp_server',
            'ftp_netrc': netrc_location,
        }
    }
def set_up_ftp_environment():
    netrc_location = get_test_suite_path(
        'wsp',
        'fixtures',
        'ftp_server',
        '.netrc',
        test_suite='functional',
    )

    # The test must wait until the docker environment is up
    # (takes about 10 seconds).
    sleep(10)

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'ftp_host': 'ftp_server',
            'ftp_netrc': netrc_location,
        }
    }

    clean_dir(path='/tmp/WSP/')
def setup_s3_files(
    s3_key,
    s3_secret,
    s3_server,
    buckets=[],
    files_to_upload=[],
    files_path=None,
    *args,
    **kwargs
):
    s3 = s3_connection(s3_key, s3_secret, s3_server)
    buckets_map = {}
    for bucket_name in buckets:
        bucket = s3.Bucket(bucket_name)
        bucket.create()
        buckets_map[bucket_name] = bucket

    test_files_path = get_test_suite_path(*files_path, test_suite='functional')
    transfer_config = TransferConfig(use_threads=False)
    for bucket_name, file_name in files_to_upload:
        buckets_map[bucket_name].upload_file(
            Filename=os.path.join(test_files_path, file_name),
            Key=file_name,
            Config=transfer_config,
        )
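# Hedged usage sketch: `files_to_upload` pairs each bucket name with a file
# name found under `files_path`, e.g.
#
#     setup_s3_files(
#         'key', 'secret', 'http://localstack:4566',
#         buckets=['downloaded'],
#         files_to_upload=[('downloaded', 'test_zip_file.ZIP')],
#         files_path=('elsevier', 'fixtures', 'elsevier'),
#     )
#
# The endpoint and credentials above are placeholders; the bucket and file
# names echo the Elsevier fixtures used elsewhere in this suite.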
def test_not_published_record():
    """Not-published paper should result in nothing."""
    spider = iop_spider.IOPSpider()
    body = """
    <ArticleSet>
        <Article>
            <Journal>
                <PubDate PubStatus="aheadofprint">
                    <Year>2015</Year>
                    <Month>03</Month>
                </PubDate>
            </Journal>
        </Article>
    </ArticleSet>
    """
    response = fake_response_from_string(body)
    node = get_node(spider, "Article", response)
    spider.pdf_files = get_test_suite_path(
        'responses',
        'iop',
        'pdf',
    )
    records = spider.parse_node(response, node)

    assert records is None
import six
import pytest

from hepcrawl.spiders import iop_spider
from hepcrawl.testlib.fixtures import (
    fake_response_from_file,
    fake_response_from_string,
    get_node,
    get_test_suite_path,
)

TEST_PDF_DIR = get_test_suite_path(
    'responses',
    'iop',
    'pdf',
)


@pytest.fixture
def record():
    """Return a parsed record from the IOP spider."""
    spider = iop_spider.IOPSpider()
    response = fake_response_from_file('iop/xml/test_standard.xml')
    node = get_node(spider, "Article", response)
    spider.pdf_files = TEST_PDF_DIR
    parsed_item = spider.parse_node(response, node)

    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
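# Hedged example (not from the original file): a test consuming the `record`
# fixture above. The 'titles' key is an assumption based on the HEP record
# layout used by hepcrawl parsers.
def test_record_has_titles(record):
    assert 'titles' in record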