Пример #1
0
def main(args):
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []
    files_uploaded = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        conference = identifier.split(':')[2]
        conference = conference.split('/')[0]
        contribution = identifier.split(':')[2]
        contribution = contribution.split('/')[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % \
                (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query, ))
        results = perform_request_search(p=query, of="id")

        #harvest fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False

        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if url.endswith('.pdf'):
                found = True
                if results:
                    rec = create_record()
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec,
                                 '856',
                                 ind1='4',
                                 subfields=[('u', url), ('y', 'PoS server')])
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', filename), ('t', 'PoS'),
                                            ('d', 'Fulltext')])
                try:
                    print('Downloading ' + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        recid = results[0]
                        record_add_field(rec, '001', controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    print("Download of %s failed" % (url, ))
                break
        if not found:
            error_records.append(rec)

        #upload to FTP
        tempfile_path = '/tmp/%s.xml' % (contribution, )
        with open(tempfile_path, 'w') as tempfile:
            tempfile.write(record_xml_output(rec))
        try:
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            write_message("%s successfully uploaded to FTP server" %
                          tempfile_path)
        except:
            write_message("Failed to upload %s to FTP server" % tempfile_path)
        remove(tempfile_path)

    insert_filename = "%s.insert.xml" % (input_filename, )
    append_filename = "%s.append.xml" % (input_filename, )
    errors_filename = "%s.errors.xml" % (input_filename, )

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        copy(errors_filename, errors_filename)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = len(append_records) + len(insert_records) + len(
        error_records)
    subject = "PoS Harvest results: " + datetime.now().strftime(
        "%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
           (total_records,
            len(insert_records),
            len(append_records),
            len(error_records),
            "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl, )
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL, CFG_POSHARVEST_EMAIL, subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL, ))