예제 #1
0
def download_from_page(url, url_trace, meta):
    """Download the file linked from ``url`` and store a record describing it.

    The page at ``url`` is scanned with ``identify_download_links``. The first
    link yielded by iterating the result is fetched, written to
    ``<data_model.id>.pdf`` (a path relative to the current working
    directory), and a populated ``DataModel`` is handed to
    ``DataAccess.store``.

    Problems that prevent a download (non-HTTP URL, no links found) are
    reported with ``print`` and the function returns ``None`` without
    downloading anything. When the module-level ``run_in_test_mode`` flag is
    truthy, the function stops after reporting the identified download URL.

    Args:
        url: Address of the page to scan; must satisfy ``is_http``.
        url_trace: List of URLs followed so far. It is copied, never mutated;
            the copy plus the download URL is stored comma-joined.
        meta: Iterable of items indexable at ``[0]`` and ``[1]``; each item is
            rendered as ``"<item[0]>,<item[1]>"`` and all are comma-joined.

    Raises:
        requests.HTTPError: If the download request returns an error status,
            so that an error page is never saved as a PDF.
        SystemExit: Always, with status 1, once the record has been stored.
    """
    if not is_http(url):
        print('(download_from_page) Invalid URL scheme [{}]'.format(url))
        return

    download_links = identify_download_links(url)

    if not download_links:
        print('No download links found on {}'.format(url))
        return

    if len(download_links) > 1:
        print('Multiple links found')

    # Only the first link (in iteration order) is used, even when several
    # were found.
    download_url = next(iter(download_links))
    if run_in_test_mode:
        print("Identified download URL {}, stopping in test mode".format(download_url))
        return

    data_model = DataModel()
    data_model.download_page_url = url
    data_model.download_url = download_url

    # Work on a copy so the caller's trace is left untouched.
    new_trace = url_trace.copy()
    new_trace.append(download_url)
    data_model.url_trace_csv = ','.join(new_trace)

    data_model.meta_csv = ','.join(["{},{}".format(x[0], x[1]) for x in meta])

    if not is_http(download_url):
        print('(download_from_page) Invalid download URL scheme [{}]'.format(download_url))
        return

    response = requests.get(download_url)
    # Fail loudly on 4xx/5xx instead of writing the error body to disk and
    # recording it as a successful download.
    response.raise_for_status()

    # The context manager guarantees the file is flushed and closed, even if
    # the write fails part-way.
    with open("{}.pdf".format(data_model.id), 'wb') as pdf_file:
        pdf_file.write(response.content)

    access = DataAccess()
    access.store(data_model)

    # NOTE(review): the process is terminated with a failure status right
    # after a successful store. This looks like a debugging leftover, but it
    # is preserved because callers may rely on it -- confirm and remove.
    # ``raise SystemExit`` replaces the ``exit()`` helper, which is injected
    # by the ``site`` module for interactive use and may be absent.
    raise SystemExit(1)