Пример #1
0
 def setUp(self):
     """Setup test."""
     self.pos = PosPackage()
     sample_filepath = pkg_resources.resource_filename(
         'harvestingkit.tests',
         os.path.join('data', 'sample_pos_record.xml')
     )
     self.pos.document = parse(sample_filepath)
Пример #2
0
class POSPackageTests(unittest.TestCase):

    """Tests for the PosPackage."""

    def setUp(self):
        """Setup test."""
        self.pos = PosPackage()
        sample_filepath = pkg_resources.resource_filename(
            'harvestingkit.tests',
            os.path.join('data', 'sample_pos_record.xml')
        )
        self.pos.document = parse(sample_filepath)

    def test_authors(self):
        """Test the field authors."""
        self.assertEqual(self.pos._get_authors(),
                         [('El-Khadra, Aida', ['INFN and Università di Firenze'])])

    def test_language(self):
        """Test the field language."""
        self.assertEqual(self.pos._get_language(), 'en')

    def test_publisher(self):
        """Test the field publisher."""
        self.assertEqual(self.pos._get_publisher(), 'SISSA')

    def test_date(self):
        """Test the field date."""
        self.assertEqual(self.pos._get_date(), '2014-03-19')

    def test_title(self):
        """Test the field title."""
        self.assertEqual(self.pos._get_title(), 'Heavy Flavour Physics Review')

    def test_copyright(self):
        """Test the field copyright."""
        self.assertEqual(self.pos._get_copyright(), 'CC-BY-NC-SA')

    def test_subject(self):
        """Test the field subject."""
        self.assertEqual(self.pos._get_subject(), 'Lattice Field Theory')

    def test_identifier(self):
        """Test the field identifier."""
        self.assertEqual(self.pos.get_identifier(), 'oai:pos.sissa.it:LATTICE 2013/001')

    def test_record(self):
        """Test the field identifier."""
        record = self.pos.get_record(self.pos.document)
        self.assertTrue(record["100"])
        self.assertTrue(record["980"])
Пример #3
0
class POSPackageTests(unittest.TestCase):
    def setUp(self):
        self.pos = PosPackage()
        self.pos.document = parse(join(dirname(folder), pos_test_record))

    def test_authors(self):
        self.assertEqual(self.pos._get_authors(), ['El-Khadra, Aida', 'Johnson, A.T.'])

    def test_language(self):
        self.assertEqual(self.pos._get_language(), 'en')

    def test_publisher(self):
        self.assertEqual(self.pos._get_publisher(), 'SISSA')

    def test_date(self):
        self.assertEqual(self.pos._get_date(), '2014-03-19')

    def test_copyright(self):
        self.assertEqual(self.pos._get_copyright(), 'CC-BY-NC-SA')

    def test_subject(self):
        self.assertEqual(self.pos._get_subject(), 'Lattice Field Theory')

    def test_identifier(self):
        self.assertEqual(self.pos.get_identifier(), 'oai:pos.sissa.it:LATTICE 2013/001')
Пример #4
0
def main(args):
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []
    files_uploaded = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        conference = identifier.split(':')[2]
        conference = conference.split('/')[0]
        contribution = identifier.split(':')[2]
        contribution = contribution.split('/')[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % \
                (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query, ))
        results = perform_request_search(p=query, of="id")

        #harvest fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False

        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if url.endswith('.pdf'):
                found = True
                if results:
                    rec = create_record()
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec,
                                 '856',
                                 ind1='4',
                                 subfields=[('u', url), ('y', 'PoS server')])
                record_add_field(rec,
                                 'FFT',
                                 subfields=[('a', filename), ('t', 'PoS'),
                                            ('d', 'Fulltext')])
                try:
                    print('Downloading ' + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        recid = results[0]
                        record_add_field(rec, '001', controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    print("Download of %s failed" % (url, ))
                break
        if not found:
            error_records.append(rec)

        #upload to FTP
        tempfile_path = '/tmp/%s.xml' % (contribution, )
        with open(tempfile_path, 'w') as tempfile:
            tempfile.write(record_xml_output(rec))
        try:
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            write_message("%s successfully uploaded to FTP server" %
                          tempfile_path)
        except:
            write_message("Failed to upload %s to FTP server" % tempfile_path)
        remove(tempfile_path)

    insert_filename = "%s.insert.xml" % (input_filename, )
    append_filename = "%s.append.xml" % (input_filename, )
    errors_filename = "%s.errors.xml" % (input_filename, )

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        copy(errors_filename, errors_filename)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = len(append_records) + len(insert_records) + len(
        error_records)
    subject = "PoS Harvest results: " + datetime.now().strftime(
        "%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
           (total_records,
            len(insert_records),
            len(append_records),
            len(error_records),
            "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl, )
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL, CFG_POSHARVEST_EMAIL, subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL, ))
Пример #5
0
def main(args):
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []
    files_uploaded = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName('record'):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        conference = identifier.split(':')[2]
        conference = conference.split('/')[0]
        contribution = identifier.split(':')[2]
        contribution = contribution.split('/')[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % \
                (conference.replace(' ', ''), contribution)
        print("Querying with: %s" % (query,))
        results = perform_request_search(p=query, of="id")

        #harvest fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll('a')
        found = False

        for link in links:
            url = urllib.quote(link['href'], safe=":/")
            if url.endswith('.pdf'):
                found = True
                if results:
                    rec = create_record()
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec, '856', ind1='4', subfields=[
                    ('u', url),
                    ('y', 'PoS server')
                ])
                record_add_field(rec, 'FFT', subfields=[('a', filename),
                                                        ('t', 'PoS'),
                                                        ('d', 'Fulltext')])
                try:
                    print('Downloading ' + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        recid = results[0]
                        record_add_field(rec, '001', controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    print("Download of %s failed" % (url,))
                break
        if not found:
            error_records.append(rec)

        #upload to FTP
        tempfile_path = '/tmp/%s.xml' % (contribution,)
        with open(tempfile_path, 'w') as tempfile:
            tempfile.write(record_xml_output(rec))
        try:
            submit_records_via_ftp(tempfile_path, conference)
            files_uploaded.append('%s/%s.xml' % (conference, contribution))
            write_message("%s successfully uploaded to FTP server" % tempfile_path)
        except:
            write_message("Failed to upload %s to FTP server" % tempfile_path)
        remove(tempfile_path)

    insert_filename = "%s.insert.xml" % (input_filename,)
    append_filename = "%s.append.xml" % (input_filename,)
    errors_filename = "%s.errors.xml" % (input_filename,)

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        copy(errors_filename, errors_filename)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = len(append_records) + len(insert_records) + len(error_records)
    subject = "PoS Harvest results: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % \
           (total_records,
            len(insert_records),
            len(append_records),
            len(error_records),
            "\n".join(created_files))
    if files_uploaded:
        body += "\nFiles uploaded:"
        for fl in files_uploaded:
            body += "\n\t%s file uploaded on the FTP Server\n" % (fl,)
    write_message(subject)
    write_message(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL,
                      CFG_POSHARVEST_EMAIL,
                      subject,
                      body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL,))
def main(args):
    if len(args) != 1:
        print("usage: python bibfilter_oaipos2inspire.py input_filename")
        raise Exception("Wrong usage!!")
    input_filename = args[0]

    out_folder = create_work_folder(CFG_POS_OUT_DIRECTORY)

    insert_records = []
    append_records = []
    error_records = []

    pos = PosPackage()
    xml_doc = parse(input_filename)
    for record in xml_doc.getElementsByTagName("record"):
        rec = pos.get_record(record)
        identifier = pos.get_identifier()
        conference = identifier.split(":")[2]
        conference = conference.split("/")[0]
        contribution = identifier.split(":")[2]
        contribution = contribution.split("/")[1]
        identifier = "PoS(%s)%s" % (conference, contribution)
        query = "773__p:pos 773__v:%s 773__c:%s" % (conference.replace(" ", ""), contribution)
        print("Querying with: %s" % (query,))
        results = perform_request_search(p=query, of="id")

        # harvest fulltext
        url = base_url + identifier
        session = requests.session()
        r = session.get(url)
        parsed_html = BeautifulSoup(r.text)
        links = parsed_html.body.findAll("a")
        found = False

        for link in links:
            url = urllib.quote(link["href"], safe=":/")
            if url.endswith(".pdf"):
                found = True
                if results:
                    rec = {}
                filename = join(out_folder, identifier + ".pdf")
                record_add_field(rec, "856", ind1="4", subfields=[("u", url), ("y", "Fulltext")])
                record_add_field(rec, "FFT", subfields=[("a", filename), ("t", "PoS"), ("d", "Fulltext")])
                try:
                    print("Downloading " + url)
                    download_url(url, "pdf", filename, 5, 60.0)
                    if results:
                        recid = results[0]
                        record_add_field(rec, "001", controlfield_value=recid)
                        append_records.append(rec)
                    else:
                        insert_records.append(rec)
                except InvenioFileDownloadError:
                    print("Download of %s failed" % (url,))
                break
        if not found:
            error_records.append(rec)

    insert_filename = "%s.insert.xml" % (input_filename,)
    append_filename = "%s.append.xml" % (input_filename,)
    errors_filename = "%s.errors.xml" % (input_filename,)

    created_files = []

    if write_record_to_file(insert_filename, insert_records):
        copy(insert_filename, out_folder)
        created_files.append(join(out_folder, basename(insert_filename)))
    if write_record_to_file(append_filename, append_records):
        copy(append_filename, out_folder)
        created_files.append(join(out_folder, basename(append_filename)))
    if write_record_to_file(errors_filename, error_records):
        copy(errors_filename, errors_filename)
        created_files.append(join(out_folder, basename(errors_filename)))

    total_records = len(append_records) + len(insert_records) + len(error_records)
    subject = "PoS Harvest results: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    body = """
    Total of %d records processed:

    %d new records,
    %d records already existing in the system,
    %d records that failed to retrieve the fulltext

    Location of new records:
    %s
    """ % (
        total_records,
        len(insert_records),
        len(append_records),
        len(error_records),
        "\n".join(created_files),
    )
    print(subject)
    print(body)
    if not send_email(CFG_SITE_SUPPORT_EMAIL, CFG_POSHARVEST_EMAIL, subject, body):
        print("ERROR: Mail not sent")
    else:
        print("Mail sent to %s" % (CFG_POSHARVEST_EMAIL,))
Пример #7
0
 def setUp(self):
     self.pos = PosPackage()
     self.pos.document = parse(join(dirname(folder), pos_test_record))