def download_from_page(url, url_trace, meta):
    """Find a download link on the page at *url*, fetch it, and record it.

    The function prints a diagnostic and returns early (without downloading
    or storing anything) when:

    * ``url`` does not pass ``is_http``;
    * ``identify_download_links(url)`` yields no links;
    * the module-level ``run_in_test_mode`` flag is truthy;
    * the chosen download URL does not pass ``is_http``;
    * the HTTP response for the download URL is not successful.

    When several links are found, only the first one produced by iterating
    the collection is used.

    Args:
        url: Address of the page to scan for download links.
        url_trace: URLs visited so far. It is copied, never mutated; the
            copy with the download URL appended is stored comma-joined on
            the data model. Must support ``.copy()`` and ``.append()``.
        meta: Iterable of indexable items; elements 0 and 1 of each item
            are rendered as ``"a,b"`` and all items are comma-joined.

    Returns:
        None.

    Side effects:
        Writes the response body to ``"<data_model.id>.pdf"`` (a relative
        path, so it lands in the current working directory), persists the
        data model via ``DataAccess().store``, and then calls ``exit(1)``.
    """
    if not is_http(url):
        print('(download_from_page) Invalid URL scheme [{}]'.format(url))
        return

    download_links = identify_download_links(url)
    if not download_links:
        print('No download links found on {}'.format(url))
        return
    if len(download_links) > 1:
        print('Multiple links found')

    # Only the first link (in iteration order) is ever downloaded.
    download_url = next(iter(download_links))

    if run_in_test_mode:
        print("Identified download URL {}, stopping in test mode".format(download_url))
        return

    data_model = DataModel()
    data_model.download_page_url = url
    data_model.download_url = download_url

    # Work on a copy so the caller's trace is left untouched.
    new_trace = url_trace.copy()
    new_trace.append(download_url)
    data_model.url_trace_csv = ','.join(new_trace)
    data_model.meta_csv = ','.join("{},{}".format(x[0], x[1]) for x in meta)

    if not is_http(download_url):
        print('(download_from_page) Invalid download URL scheme [{}]'.format(download_url))
        return

    response = requests.get(download_url)
    if not response.ok:
        # Do not save an HTTP error body as a ".pdf" or store a record
        # for a download that did not actually succeed.
        print('(download_from_page) Download failed with HTTP status {} [{}]'.format(
            response.status_code, download_url))
        return

    # The context manager guarantees the file is flushed and closed even if
    # the write raises.
    with open("{}.pdf".format(data_model.id), 'wb') as pdf_file:
        pdf_file.write(response.content)

    access = DataAccess()
    access.store(data_model)

    # NOTE(review): this terminates the whole process with exit status 1
    # right after a successful download and store. It looks like leftover
    # debugging, but it is kept because callers may rely on it -- confirm
    # and remove if unintended.
    exit(1)