Exemplo n.º 1
0
    def test_unzip(self):
        """
        Test uncompressing function.
        """
        test_zipped_file = "./test/test.zip"

        desired_dir = os.path.join(CFG_TMPSHAREDDIR, "apsharvest")
        unzipped_folder = unzip(test_zipped_file, desired_dir)
        self.assertTrue(
            unzipped_folder.startswith(desired_dir),
            "Unzipped folder is located in the wrong place %s!" % unzipped_folder,
        )

        unzipped_folder = unzip(test_zipped_file)
        self.assertTrue(os.path.exists(unzipped_folder), "Unzipped folder does not exist!")

        z = zipfile.ZipFile(test_zipped_file)
        list_of_zipfiles = z.namelist()
        found_list_of_files = get_files_and_folders(unzipped_folder)
        self.assertTrue(len(list_of_zipfiles) == len(found_list_of_files), "Looks like all files were not extracted!")
Exemplo n.º 2
0
            except StandardError, e:
                if 'urlopen' in str(e) or 'URL could not be opened' in str(e):
                    msg = "URL could not be opened: %s" % (url,)
                    write_message("Error: %s" % (msg,),
                                  stream=sys.stderr)
                    write_message("No fulltext found for %s" %
                                 (record.recid or record.doi,))
                    yield record, msg
                    continue
                raise
            finally:
                request_end = time.time()

            # Unzip the compressed file
            unzipped_folder = unzip(result_file, base_directory=self.out_folder)

            # Validate the checksum of the compressed fulltext file.
            try:
                checksum_validated_files = find_and_validate_md5_checksums(
                    in_folder=unzipped_folder,
                    md5key_filename=CFG_APSHARVEST_MD5_FILE)
            except APSFileChecksumError, e:
                info_msg = "Skipping %s in %s" % \
                           (record.recid or record.doi, unzipped_folder)
                msg = "Error while validating checksum: %s\n%s\n%s" % \
                      (info_msg, str(e), traceback.format_exc()[:-1])
                write_message(msg)
                yield record, msg
                continue
            if not checksum_validated_files:
Exemplo n.º 3
0
def apply_filter(rec):
    """ Filters the record to be compatible within Inspire
    Parameters:
     * rec - dictionary: BibRecord structure
    Returns: dictionary, BibRecord structure
    """
    # Move recid from 001 to 035 if not hidden
    cds_id = rec['001'][0][3]
    if not 'hidden' in [x.lower() for x in record_get_field_values(rec, "980",
                                                                   code="a")]:
        record_add_field(rec, '035', subfields=[('9', 'CDS'), ('a', cds_id)])
    # Clear control fields
    record_strip_controlfields(rec)

    # Clear other uninteresting fields
    interesting_fields = ["024", "041", "035", "037", "088", "100",
                          "110", "111", "242", "245", "246", "260",
                          "269", "300", "502", "650", "653", "693",
                          "700", "710", "773", "856", "520", "500",
                          "980"]
    for tag in rec.keys():
        if tag not in interesting_fields:
            record_delete_fields(rec, tag)

    # 980 Determine Collections
    collections = set([])
    for value in record_get_field_values(rec, '980', code='a'):
        if 'NOTE' in value.upper():
            collections.add('NOTE')
        if 'THESIS' in value.upper():
            collections.add('THESIS')
        if 'CONFERENCEPAPER' in value.upper():
            collections.add('ConferencePaper')


    if is_published(rec):
        collections.add("PUBLISHED")
        collections.add("CITEABLE")

    if not 'NOTE' in collections:
        # TODO: Move this to a KB
        kb = ['ATLAS-CONF-', 'CMS-PAS-', 'ATL-', 'CMS-DP-',
              'ALICE-INT-', 'LHCb-PUB-']
        values = record_get_field_values(rec, "088", code='a')
        for val, rep in product(values, kb):
            if val.startswith(rep):
                collections.add('NOTE')
                break

    # 980 Arxiv tag
    if record_get_field_values(rec, '035', filter_subfield_code="a",
                               filter_subfield_value="arXiv"):
        collections.add("arXiv")

    # 980 HEP && CORE
    collections.add('HEP')
    collections.add('CORE')

    # 980 Conference Note
    if not 'ConferencePaper' in collections:
        for value in record_get_field_values(rec, '962', code='n'):
            if value[-2:].isdigit():
                collections.add('ConferencePaper')
                break

    record_delete_fields(rec, "980")

    intnote = record_get_field_values(rec, '690', filter_subfield_code="a",
                                      filter_subfield_value='INTNOTE')
    if intnote:
        val_088 = record_get_field_values(rec, '088', filter_subfield_code="a")
        for val in val_088:
            if 'CMS' in val:
                url = ('http://weblib.cern.ch/abstract?CERN-CMS' +
                       val.split('CMS', 1)[-1])
                record_add_field(rec, '856', ind1='4', subfields=[('u', url)])

    # 041 Language
    languages = get_languages()
    language_fields = record_get_field_instances(rec, '041')
    record_delete_fields(rec, "041")
    for field in language_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            if "eng" in subs['a']:
                continue
            new_value = translate_config(subs['a'][0], languages)
            new_subs = [('a', new_value)]
            record_add_field(rec, "041", subfields=new_subs)

    # 035 Externals
    scn_035_fields = record_get_field_instances(rec, '035')
    forbidden_values = ["cercer",
                        "inspire",
                        "xx",
                        "cern annual report",
                        "cmscms",
                        "wai01"]
    for field in scn_035_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            if not 'a' in subs:
                continue
            for sub in subs['9']:
                if sub.lower() in forbidden_values:
                    break
            else:
                # No forbidden values (We did not "break")
                suffixes = [s.lower() for s in subs['9']]
                if 'spires' in suffixes:
                    new_subs = [('a', 'SPIRES-%s' % subs['a'][0])]
                    record_add_field(rec, '970', subfields=new_subs)
                    continue
        if 'a' in subs:
            for sub in subs['a']:
                if sub.lower() in forbidden_values:
                    record_delete_field(rec, tag="035",
                                        field_position_global=field[4])

    rep_088_fields = record_get_field_instances(rec, '088')
    for field in rep_088_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            for val in subs['9']:
                if val.startswith('P0') or val.startswith('CM-P0'):
                    sf = [('9', 'CERN'), ('b', val)]
                    record_add_field(rec, '595', subfields=sf)
        for key, val in field[0]:
            if key in ['a', '9'] and not val.startswith('SIS-'):
                record_add_field(rec, '037', subfields=[('a', val)])
    record_delete_fields(rec, "088")

    # 037 Externals also...
    rep_037_fields = record_get_field_instances(rec, '037')
    for field in rep_037_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            for value in subs['a']:
                if 'arXiv' in value:
                    new_subs = [('a', value), ('9', 'arXiv')]
                    for fld in record_get_field_instances(rec,  '695'):
                        for key, val in field_get_subfield_instances(fld):
                            if key == 'a':
                                new_subs.append(('c', val))
                                break
                    nf = create_field(subfields=new_subs)
                    record_replace_field(rec, '037', nf, field[4])
        for key, val in field[0]:
            if key in ['a', '9'] and val.startswith('SIS-'):
                record_delete_field(rec, '037', field_position_global=field[4])

    for field in record_get_field_instances(rec, '242'):
        record_add_field(rec, '246', subfields=field[0])
    record_delete_fields(rec, '242')

    # 269 Date normalization
    for field in record_get_field_instances(rec, '269'):
        for idx, (key, value) in enumerate(field[0]):
            if key == "c":
                field[0][idx] = ("c", convert_date_to_iso(value))
                record_delete_fields(rec, "260")

    if not 'THESIS' in collections:
        for field in record_get_field_instances(rec, '260'):
            record_add_field(rec, '269', subfields=field[0])
        record_delete_fields(rec, '260')

    # 300 page number
    for field in record_get_field_instances(rec, '300'):
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                if "mult." not in value and value != " p":
                    field[0][idx] = ('a', re.sub(r'[^\d-]+', '', value))
                else:
                    record_delete_field(rec, '300',
                                        field_position_global=field[4])
                    break

    # 100 & 700 punctuate author names
    author_names = record_get_field_instances(rec, '100')
    author_names.extend(record_get_field_instances(rec, '700'))
    for field in author_names:
        subs = field_get_subfields(field)
        if not 'i' in subs or 'XX' in subs['i']:
            if not 'j' in subs or 'YY' in subs['j']:
                for idx, (key, value) in enumerate(field[0]):
                    if key == 'a':
                        field[0][idx] = ('a', punctuate_authorname(value))

    # 700 -> 701 Thesis supervisors
    if 'THESIS' in collections:
        for field in record_get_field_instances(rec, '700'):
            record_add_field(rec, '701', subfields=field[0])
        record_delete_fields(rec, '700')

    # 501 move subfields
    fields_501 = record_get_field_instances(rec, '502')
    for idx, field in enumerate(fields_501):
        new_subs = []
        for key, value in field[0]:
            if key == 'a':
                new_subs.append(('b', value))
            elif key == 'b':
                new_subs.append(('c', value))
            elif key == 'c':
                new_subs.append(('d', value))
            else:
                new_subs.append((key, value))
        fields_501[idx] = field_swap_subfields(field, new_subs)

    # 650 Translate Categories
    categories = get_categories()
    category_fields = record_get_field_instances(rec, '650', ind1='1', ind2='7')
    record_delete_fields(rec, "650")
    for field in category_fields:
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                new_value = translate_config(value, categories)
                if new_value != value:
                    new_subs = [('2', 'INSPIRE'), ('a', new_value)]
                else:
                    new_subs = [('2', 'SzGeCERN'), ('a', value)]
                record_add_field(rec, "650", ind1="1", ind2="7",
                                 subfields=new_subs)
                break

    # 653 Free Keywords
    for field in record_get_field_instances(rec, '653', ind1='1'):
        subs = field_get_subfields(field)
        new_subs = []
        if 'a' in subs:
            for val in subs['a']:
                new_subs.extend([('9', 'author'), ('a', val)])
        new_field = create_field(subfields=new_subs, ind1='1')
        record_replace_field(rec, '653', new_field, field_position_global=field[4])

    experiments = get_experiments()
    # 693 Remove if 'not applicable'
    for field in record_get_field_instances(rec, '693'):
        subs = field_get_subfields(field)
        all_subs = subs.get('a', []) + subs.get('e', [])
        if 'not applicable' in [x.lower() for x in all_subs]:
            record_delete_field(rec, '693',
                                field_position_global=field[4])
        new_subs = []
        experiment_a = ""
        experiment_e = ""
        for (key, value) in subs.iteritems():
            if key == 'a':
                experiment_a = value[0]
                new_subs.append((key, value[0]))
            elif key == 'e':
                experiment_e = value[0]
        experiment = "%s---%s" % (experiment_a.replace(" ", "-"),
                                  experiment_e)
        translated_experiments = translate_config(experiment,
                                                  experiments)
        new_subs.append(("e", translated_experiments))
        record_delete_field(rec, tag="693",
                            field_position_global=field[4])
        record_add_field(rec, "693", subfields=new_subs)

    # 710 Collaboration
    for field in record_get_field_instances(rec, '710'):
        subs = field_get_subfield_instances(field)
        for idx, (key, value) in enumerate(subs[:]):
            if key == '5':
                subs.pop(idx)
            elif value.startswith('CERN. Geneva'):
                subs.pop(idx)
        if len(subs) == 0:
            record_delete_field(rec, '710', field_position_global=field[4])

    # 773 journal translations
    journals = get_journals()
    for field in record_get_field_instances(rec, '773'):
        subs = field_get_subfield_instances(field)
        new_subs = []
        for idx, (key, value) in enumerate(subs):
            if key == 'p':
                new_subs.append((key, translate_config(value, journals)))
            else:
                new_subs.append((key, value))
        record_delete_field(rec, tag="773",
                            field_position_global=field[4])
        record_add_field(rec, "773", subfields=new_subs)

    # FFT (856) Dealing with graphs
    figure_counter = 0
    for field in record_get_field_instances(rec, '856', ind1='4'):
        subs = field_get_subfields(field)

        newsubs = []
        remove = False

        if 'z' in subs:
            is_figure = [s for s in subs['z'] if "figure" in s.lower()]
            if is_figure and 'u' in subs:
                is_subformat = [s for s in subs['u'] if "subformat" in s.lower()]
                if not is_subformat:
                    url = subs['u'][0]
                    if url.endswith(".pdf"):
                        # We try to convert
                        fd, local_url = mkstemp(suffix=os.path.basename(url), dir=CFG_TMPSHAREDDIR)
                        os.close(fd)
                        _print("Downloading %s into %s" % (url, local_url), verbose=5)
                        plotfile = ""
                        try:
                            plotfile = download_url(url=url,
                                                    download_to_file=local_url,
                                                    timeout=30.0)
                        except InvenioFileDownloadError:
                            _print("Download failed while attempting to reach %s. Skipping.." % (url,))
                            remove = True
                        if plotfile:
                            converted = convert_images([plotfile])
                            if converted:
                                url = converted.pop()
                                _print("Successfully converted %s to %s" % (local_url, url), verbose=5)
                            else:
                                _print("Conversion failed on %s" % (local_url,))
                                url = None
                                remove = True
                    if url:
                        newsubs.append(('a', url))
                        newsubs.append(('t', 'Plot'))
                        figure_counter += 1
                        if 'y' in subs:
                            newsubs.append(('d', "%05d %s" % (figure_counter, subs['y'][0])))
                            newsubs.append(('n', subs['y'][0]))
                        else:
                            # Get basename without extension.
                            name = os.path.basename(os.path.splitext(subs['u'][0])[0])
                            newsubs.append(('d', "%05d %s" % (figure_counter, name)))
                            newsubs.append(('n', name))

        if not newsubs and 'u' in subs:
            is_fulltext = [s for s in subs['u'] if ".pdf" in s and not "subformat=pdfa" in s]
            if is_fulltext:
                newsubs = [('t', 'INSPIRE-PUBLIC'), ('a', subs['u'][0])]

        if not newsubs and 'u' in subs:
            remove = True
            is_zipfile = [s for s in subs['u'] if ".zip" in s]
            if is_zipfile:
                url = is_zipfile[0]
                local_url = os.path.join(CFG_TMPSHAREDDIR, os.path.basename(url))
                _print("Downloading %s into %s" % (url, local_url), verbose=5)
                zipped_archive = ""
                try:
                    zipped_archive = download_url(url=is_zipfile[0],
                                                  download_to_file=local_url,
                                                  timeout=30.0)
                except InvenioFileDownloadError:
                    _print("Download failed while attempting to reach %s. Skipping.."
                           % (is_zipfile[0],))
                    remove = True
                if zipped_archive:
                    unzipped_archive = unzip(zipped_archive)
                    list_of_pngs = locate("*.png", unzipped_archive)
                    for png in list_of_pngs:
                        if "_vti_" in png or "__MACOSX" in png:
                            continue
                        figure_counter += 1
                        plotsubs = []
                        plotsubs.append(('a', png))
                        caption = '%05d %s' % (figure_counter, os.path.basename(png))
                        plotsubs.append(('d', caption))
                        plotsubs.append(('t', 'Plot'))
                        record_add_field(rec, 'FFT', subfields=plotsubs)

        if not remove and not newsubs and 'u' in subs:
            urls = ('http://cdsweb.cern.ch', 'http://cms.cern.ch',
                    'http://cmsdoc.cern.ch', 'http://documents.cern.ch',
                    'http://preprints.cern.ch', 'http://cds.cern.ch')
            for val in subs['u']:
                if any(url in val for url in urls):
                    remove = True
                    break
                if val.endswith('ps.gz'):
                    remove = True

        if newsubs:
            record_add_field(rec, 'FFT', subfields=newsubs)
            remove = True

        if remove:
            record_delete_field(rec, '856', ind1='4',
                                field_position_global=field[4])

    # 500 - Preliminary results
    if "THESIS" not in collections:
        subs = [('a', "Preliminary results")]
        record_add_field(rec, "500", subfields=subs)

    for collection in collections:
        record_add_field(rec, '980', subfields=[('a', collection)])

    return rec
def apply_filter(rec):
    """ Filters the record to be compatible within Inspire
    Parameters:
     * rec - dictionary: BibRecord structure
    Returns: dictionary, BibRecord structure
    """
    # Move recid from 001 to 035 if not hidden
    cds_id = rec["001"][0][3]
    if not "hidden" in [x.lower() for x in record_get_field_values(rec, "980", code="a")]:
        record_add_field(rec, "035", subfields=[("9", "CDS"), ("a", cds_id)])
    # Clear control fields
    record_strip_controlfields(rec)

    # Clear other uninteresting fields
    interesting_fields = [
        "024",
        "041",
        "035",
        "037",
        "088",
        "100",
        "110",
        "111",
        "242",
        "245",
        "246",
        "260",
        "269",
        "300",
        "502",
        "650",
        "653",
        "693",
        "700",
        "710",
        "773",
        "856",
        "520",
        "500",
        "980",
    ]
    for tag in rec.keys():
        if tag not in interesting_fields:
            record_delete_fields(rec, tag)

    # 980 Determine Collections
    collections = set([])
    for value in record_get_field_values(rec, "980", code="a"):
        if "NOTE" in value.upper():
            collections.add("NOTE")
        if "THESIS" in value.upper():
            collections.add("THESIS")
        if "CONFERENCEPAPER" in value.upper():
            collections.add("ConferencePaper")

    if is_published(rec):
        collections.add("PUBLISHED")
        collections.add("CITEABLE")

    if not "NOTE" in collections:
        # TODO: Move this to a KB
        kb = ["ATLAS-CONF-", "CMS-PAS-", "ATL-", "CMS-DP-", "ALICE-INT-", "LHCb-PUB-"]
        values = record_get_field_values(rec, "088", code="a")
        for val, rep in product(values, kb):
            if val.startswith(rep):
                collections.add("NOTE")
                break

    # 980 Arxiv tag
    if record_get_field_values(rec, "035", filter_subfield_code="a", filter_subfield_value="arXiv"):
        collections.add("arXiv")

    # 980 HEP && CORE
    collections.add("HEP")
    collections.add("CORE")

    # 980 Conference Note
    if not "ConferencePaper" in collections:
        for value in record_get_field_values(rec, "962", code="n"):
            if value[-2:].isdigit():
                collections.add("ConferencePaper")
                break

    record_delete_fields(rec, "980")

    intnote = record_get_field_values(rec, "690", filter_subfield_code="a", filter_subfield_value="INTNOTE")
    if intnote:
        val_088 = record_get_field_values(rec, "088", filter_subfield_code="a")
        for val in val_088:
            if "CMS" in val:
                url = "http://weblib.cern.ch/abstract?CERN-CMS" + val.split("CMS", 1)[-1]
                record_add_field(rec, "856", ind1="4", subfields=[("u", url)])

    # 041 Language
    languages = get_languages()
    language_fields = record_get_field_instances(rec, "041")
    record_delete_fields(rec, "041")
    for field in language_fields:
        subs = field_get_subfields(field)
        if "a" in subs:
            if "eng" in subs["a"]:
                continue
            new_value = translate_config(subs["a"][0], languages)
            new_subs = [("a", new_value)]
            record_add_field(rec, "041", subfields=new_subs)

    # 035 Externals
    scn_035_fields = record_get_field_instances(rec, "035")
    forbidden_values = ["cercer", "inspire", "xx", "cern annual report", "cmscms", "wai01"]
    for field in scn_035_fields:
        subs = field_get_subfields(field)
        if "9" in subs:
            if not "a" in subs:
                continue
            for sub in subs["9"]:
                if sub.lower() in forbidden_values:
                    break
            else:
                # No forbidden values (We did not "break")
                suffixes = [s.lower() for s in subs["9"]]
                if "spires" in suffixes:
                    new_subs = [("a", "SPIRES-%s" % subs["a"][0])]
                    record_add_field(rec, "970", subfields=new_subs)
                    continue
        if "a" in subs:
            for sub in subs["a"]:
                if sub.lower() in forbidden_values:
                    record_delete_field(rec, tag="035", field_position_global=field[4])

    rep_088_fields = record_get_field_instances(rec, "088")
    for field in rep_088_fields:
        subs = field_get_subfields(field)
        if "9" in subs:
            for val in subs["9"]:
                if val.startswith("P0") or val.startswith("CM-P0"):
                    sf = [("9", "CERN"), ("b", val)]
                    record_add_field(rec, "595", subfields=sf)
        for key, val in field[0]:
            if key in ["a", "9"] and not val.startswith("SIS-"):
                record_add_field(rec, "037", subfields=[("a", val)])
    record_delete_fields(rec, "088")

    # 037 Externals also...
    rep_037_fields = record_get_field_instances(rec, "037")
    for field in rep_037_fields:
        subs = field_get_subfields(field)
        if "a" in subs:
            for value in subs["a"]:
                if "arXiv" in value:
                    new_subs = [("a", value), ("9", "arXiv")]
                    for fld in record_get_field_instances(rec, "695"):
                        for key, val in field_get_subfield_instances(fld):
                            if key == "a":
                                new_subs.append(("c", val))
                                break
                    nf = create_field(subfields=new_subs)
                    record_replace_field(rec, "037", nf, field[4])
        for key, val in field[0]:
            if key in ["a", "9"] and val.startswith("SIS-"):
                record_delete_field(rec, "037", field_position_global=field[4])

    for field in record_get_field_instances(rec, "242"):
        record_add_field(rec, "246", subfields=field[0])
    record_delete_fields(rec, "242")

    # 269 Date normalization
    for field in record_get_field_instances(rec, "269"):
        for idx, (key, value) in enumerate(field[0]):
            if key == "c":
                field[0][idx] = ("c", convert_date_to_iso(value))
                record_delete_fields(rec, "260")

    if not "THESIS" in collections:
        for field in record_get_field_instances(rec, "260"):
            record_add_field(rec, "269", subfields=field[0])
        record_delete_fields(rec, "260")

    # 300 page number
    for field in record_get_field_instances(rec, "300"):
        for idx, (key, value) in enumerate(field[0]):
            if key == "a":
                if "mult." not in value and value != " p":
                    field[0][idx] = ("a", re.sub(r"[^\d-]+", "", value))
                else:
                    record_delete_field(rec, "300", field_position_global=field[4])
                    break

    # 100 & 700 punctuate author names
    author_names = record_get_field_instances(rec, "100")
    author_names.extend(record_get_field_instances(rec, "700"))
    for field in author_names:
        subs = field_get_subfields(field)
        if not "i" in subs or "XX" in subs["i"]:
            if not "j" in subs or "YY" in subs["j"]:
                for idx, (key, value) in enumerate(field[0]):
                    if key == "a":
                        field[0][idx] = ("a", punctuate_authorname(value))

    # 700 -> 701 Thesis supervisors
    if "THESIS" in collections:
        for field in record_get_field_instances(rec, "700"):
            record_add_field(rec, "701", subfields=field[0])
        record_delete_fields(rec, "700")

    # 501 move subfields
    fields_501 = record_get_field_instances(rec, "502")
    for idx, field in enumerate(fields_501):
        new_subs = []
        for key, value in field[0]:
            if key == "a":
                new_subs.append(("b", value))
            elif key == "b":
                new_subs.append(("c", value))
            elif key == "c":
                new_subs.append(("d", value))
            else:
                new_subs.append((key, value))
        fields_501[idx] = field_swap_subfields(field, new_subs)

    # 650 Translate Categories
    categories = get_categories()
    category_fields = record_get_field_instances(rec, "650", ind1="1", ind2="7")
    record_delete_fields(rec, "650")
    for field in category_fields:
        for idx, (key, value) in enumerate(field[0]):
            if key == "a":
                new_value = translate_config(value, categories)
                if new_value != value:
                    new_subs = [("2", "INSPIRE"), ("a", new_value)]
                else:
                    new_subs = [("2", "SzGeCERN"), ("a", value)]
                record_add_field(rec, "650", ind1="1", ind2="7", subfields=new_subs)
                break

    # 653 Free Keywords
    for field in record_get_field_instances(rec, "653", ind1="1"):
        subs = field_get_subfields(field)
        new_subs = []
        if "a" in subs:
            for val in subs["a"]:
                new_subs.extend([("9", "author"), ("a", val)])
        new_field = create_field(subfields=new_subs, ind1="1")
        record_replace_field(rec, "653", new_field, field_position_global=field[4])

    experiments = get_experiments()
    # 693 Remove if 'not applicable'
    for field in record_get_field_instances(rec, "693"):
        subs = field_get_subfields(field)
        all_subs = subs.get("a", []) + subs.get("e", [])
        if "not applicable" in [x.lower() for x in all_subs]:
            record_delete_field(rec, "693", field_position_global=field[4])
        new_subs = []
        experiment_a = ""
        experiment_e = ""
        for (key, value) in subs.iteritems():
            if key == "a":
                experiment_a = value[0]
                new_subs.append((key, value[0]))
            elif key == "e":
                experiment_e = value[0]
        experiment = "%s---%s" % (experiment_a.replace(" ", "-"), experiment_e)
        translated_experiments = translate_config(experiment, experiments)
        new_subs.append(("e", translated_experiments))
        record_delete_field(rec, tag="693", field_position_global=field[4])
        record_add_field(rec, "693", subfields=new_subs)

    # 710 Collaboration
    for field in record_get_field_instances(rec, "710"):
        subs = field_get_subfield_instances(field)
        for idx, (key, value) in enumerate(subs[:]):
            if key == "5":
                subs.pop(idx)
            elif value.startswith("CERN. Geneva"):
                subs.pop(idx)
        if len(subs) == 0:
            record_delete_field(rec, "710", field_position_global=field[4])

    # 773 journal translations
    journals = get_journals()
    for field in record_get_field_instances(rec, "773"):
        subs = field_get_subfield_instances(field)
        new_subs = []
        for idx, (key, value) in enumerate(subs):
            if key == "p":
                new_subs.append((key, translate_config(value, journals)))
            else:
                new_subs.append((key, value))
        record_delete_field(rec, tag="773", field_position_global=field[4])
        record_add_field(rec, "773", subfields=new_subs)

    # FFT (856) Dealing with graphs
    figure_counter = 0
    for field in record_get_field_instances(rec, "856", ind1="4"):
        subs = field_get_subfields(field)

        newsubs = []
        remove = False

        if "z" in subs:
            is_figure = [s for s in subs["z"] if "figure" in s.lower()]
            if is_figure and "u" in subs:
                is_subformat = [s for s in subs["u"] if "subformat" in s.lower()]
                if not is_subformat:
                    url = subs["u"][0]
                    if url.endswith(".pdf"):
                        # We try to convert
                        fd, local_url = mkstemp(suffix=os.path.basename(url), dir=CFG_TMPSHAREDDIR)
                        os.close(fd)
                        _print("Downloading %s into %s" % (url, local_url), verbose=5)
                        plotfile = ""
                        try:
                            plotfile = download_url(url=url, download_to_file=local_url, timeout=30.0)
                        except InvenioFileDownloadError:
                            _print("Download failed while attempting to reach %s. Skipping.." % (url,))
                            remove = True
                        if plotfile:
                            converted = convert_images([plotfile])
                            if converted:
                                url = converted.pop()
                                _print("Successfully converted %s to %s" % (local_url, url), verbose=5)
                            else:
                                _print("Conversion failed on %s" % (local_url,))
                                url = None
                                remove = True
                    if url:
                        newsubs.append(("a", url))
                        newsubs.append(("t", "Plot"))
                        figure_counter += 1
                        if "y" in subs:
                            newsubs.append(("d", "%05d %s" % (figure_counter, subs["y"][0])))
                            newsubs.append(("n", subs["y"][0]))
                        else:
                            # Get basename without extension.
                            name = os.path.basename(os.path.splitext(subs["u"][0])[0])
                            newsubs.append(("d", "%05d %s" % (figure_counter, name)))
                            newsubs.append(("n", name))

        if not newsubs and "u" in subs:
            is_fulltext = [s for s in subs["u"] if ".pdf" in s]
            if is_fulltext:
                newsubs = [("t", "INSPIRE-PUBLIC"), ("a", subs["u"][0])]

        if not newsubs and "u" in subs:
            remove = True
            is_zipfile = [s for s in subs["u"] if ".zip" in s]
            if is_zipfile:
                url = is_zipfile[0]
                local_url = os.path.join(CFG_TMPSHAREDDIR, os.path.basename(url))
                _print("Downloading %s into %s" % (url, local_url), verbose=5)
                zipped_archive = ""
                try:
                    zipped_archive = download_url(url=is_zipfile[0], download_to_file=local_url, timeout=30.0)
                except InvenioFileDownloadError:
                    _print("Download failed while attempting to reach %s. Skipping.." % (is_zipfile[0],))
                    remove = True
                if zipped_archive:
                    unzipped_archive = unzip(zipped_archive)
                    list_of_pngs = locate("*.png", unzipped_archive)
                    for png in list_of_pngs:
                        if "_vti_" in png or "__MACOSX" in png:
                            continue
                        figure_counter += 1
                        plotsubs = []
                        plotsubs.append(("a", png))
                        caption = "%05d %s" % (figure_counter, os.path.basename(png))
                        plotsubs.append(("d", caption))
                        plotsubs.append(("t", "Plot"))
                        record_add_field(rec, "FFT", subfields=plotsubs)

        if not remove and not newsubs and "u" in subs:
            urls = (
                "http://cdsweb.cern.ch",
                "http://cms.cern.ch",
                "http://cmsdoc.cern.ch",
                "http://documents.cern.ch",
                "http://preprints.cern.ch",
                "http://cds.cern.ch",
            )
            for val in subs["u"]:
                if any(url in val for url in urls):
                    remove = True
                    break
                if val.endswith("ps.gz"):
                    remove = True

        if newsubs:
            record_add_field(rec, "FFT", subfields=newsubs)
            remove = True

        if remove:
            record_delete_field(rec, "856", ind1="4", field_position_global=field[4])

    # 500 - Preliminary results
    if "THESIS" not in collections:
        subs = [("a", "Preliminary results")]
        record_add_field(rec, "500", subfields=subs)

    for collection in collections:
        record_add_field(rec, "980", subfields=[("a", collection)])

    return rec
Exemplo n.º 5
0
            continue

        except APSHarvesterFileExits:
            write_message("File exists at %s" % (result_file,), verbose=2)

        except StandardError, e:
            if 'urlopen' in str(e) or 'URL could not be opened' in str(e):
                write_message("Error: URL could not be opened: %s" % (url,),
                              stream=sys.stderr)
                write_message("No fulltext found for %s" %
                             (record.recid or record.doi,))
                yield record, "%s cannot be opened." % (url,)
            raise

        # Unzip the compressed file
        unzipped_folder = unzip(result_file)

        # Validate the checksum of the compressed fulltext file.
        try:
            checksum_validated_files = find_and_validate_md5_checksums(
                in_folder=unzipped_folder,
                md5key_filename=CFG_APSHARVEST_MD5_FILE)
        except InvenioFileChecksumError, e:
            write_message("Error while validating checksum: %s" % (str(e),))
            write_message("Skipping %s in %s" %
                         (record.recid or record.doi, unzipped_folder))
            continue
        if not checksum_validated_files:
            write_message("Warning: No files found to perform checksum"
                          " validation on inside %s" % (unzipped_folder,))
        elif len(checksum_validated_files) != 1 or \
Exemplo n.º 6
0
            write_message("File exists at %s" % (result_file, ), verbose=2)

        except StandardError, e:
            if 'urlopen' in str(e) or 'URL could not be opened' in str(e):
                msg = "URL could not be opened: %s" % (url, )
                write_message("Error: %s" % (msg, ), stream=sys.stderr)
                write_message("No fulltext found for %s" %
                              (record.recid or record.doi, ))
                yield record, msg
                continue
            raise
        finally:
            request_end = time.time()

        # Unzip the compressed file
        unzipped_folder = unzip(result_file, base_directory=out_folder)

        # Validate the checksum of the compressed fulltext file.
        try:
            checksum_validated_files = find_and_validate_md5_checksums(
                in_folder=unzipped_folder,
                md5key_filename=CFG_APSHARVEST_MD5_FILE)
        except InvenioFileChecksumError, e:
            info_msg = "Skipping %s in %s" % \
                       (record.recid or record.doi, unzipped_folder)
            msg = "Error while validating checksum: %s\n%s\n%s" % \
                  (info_msg, str(e), traceback.format_exc()[:-1])
            write_message(msg)
            yield record, msg
            continue
        if not checksum_validated_files:
Exemplo n.º 7
0
            write_message("File exists at %s" % (result_file, ), verbose=2)

        except StandardError, e:
            if 'urlopen' in str(e) or 'URL could not be opened' in str(e):
                msg = "URL could not be opened: %s" % (url, )
                write_message("Error: %s" % (msg, ), stream=sys.stderr)
                write_message("No fulltext found for %s" %
                              (record.recid or record.doi, ))
                yield record, msg
                continue
            raise
        finally:
            request_end = time.time()

        # Unzip the compressed file
        unzipped_folder = unzip(result_file)

        # Validate the checksum of the compressed fulltext file.
        try:
            checksum_validated_files = find_and_validate_md5_checksums(
                in_folder=unzipped_folder,
                md5key_filename=CFG_APSHARVEST_MD5_FILE)
        except InvenioFileChecksumError, e:
            info_msg = "Skipping %s in %s" % \
                        (record.recid or record.doi, unzipped_folder)
            msg = "Error while validating checksum: %s\n%s\n%s" % \
                  (info_msg, str(e), traceback.format_exc()[:-1])
            write_message(msg)
            yield record, msg
            continue
        if not checksum_validated_files:
Exemplo n.º 8
0
def apply_filter(rec):
    """ Filters the record to be compatible within Inspire
    Parameters:
     * rec - dictionary: BibRecord structure
    Returns: dictionary, BibRecord structure
    """
    # Move recid from 001 to 035
    cds_id = rec['001'][0][3]
    record_add_field(rec, '035', subfields=[('9', 'CDS'), ('a', cds_id)])
    # Clear control fields
    record_strip_controlfields(rec)

    # Clear other uninteresting fields
    interesting_fields = [
        "024", "041", "035", "037", "088", "100", "110", "111", "242", "245",
        "246", "260", "269", "300", "502", "650", "653", "693", "700", "710",
        "773", "856", "520", "500", "980"
    ]
    for tag in rec.keys():
        if tag not in interesting_fields:
            record_delete_fields(rec, tag)

    # 980 Determine Collections
    collections = set([])
    for value in record_get_field_values(rec, '980', code='a'):
        if 'NOTE' in value.upper():
            collections.add('NOTE')
        if 'THESIS' in value.upper():
            collections.add('THESIS')

    if is_published(rec):
        collections.add("PUBLISHED")
        collections.add("CITEABLE")

    # 980 Arxiv tag
    if record_get_field_values(rec,
                               '035',
                               filter_subfield_code="a",
                               filter_subfield_value="arXiv"):
        collections.add("arXiv")

    # 980 HEP && CORE
    collections.add('HEP')
    collections.add('CORE')

    record_delete_fields(rec, "980")

    intnote = record_get_field_values(rec,
                                      '690',
                                      filter_subfield_code="a",
                                      filter_subfield_value='INTNOTE')
    if intnote:
        val_088 = record_get_field_values(rec, '088', filter_subfield_code="a")
        for val in val_088:
            if 'CMS' in val:
                url = ('http://weblib.cern.ch/abstract?CERN-CMS' +
                       val.split('CMS', 1)[-1])
                record_add_field(rec, '856', ind1='4', subfields=[('u', url)])

    # 041 Language
    languages = get_languages()
    language_fields = record_get_field_instances(rec, '041')
    record_delete_fields(rec, "041")
    for field in language_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            if "eng" in subs['a']:
                continue
            new_value = translate_config(subs['a'][0], languages)
            new_subs = [('a', new_value)]
            record_add_field(rec, "041", subfields=new_subs)

    # 035 Externals
    scn_035_fields = record_get_field_instances(rec, '035')
    forbidden_values = [
        "cercer", "inspire", "xx", "cern annual report", "cmscms", "wai01"
    ]
    for field in scn_035_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            if not 'a' in subs:
                continue
            for sub in subs['9']:
                if sub.lower() in forbidden_values:
                    break
            else:
                # No forbidden values (We did not "break")
                suffixes = [s.lower() for s in subs['9']]
                if 'spires' in suffixes:
                    new_sub = ('a', 'SPIRES-%s' % subs['a'])
                    record_add_field(rec, '970', subfields=new_sub)
                    continue
        if 'a' in subs:
            for sub in subs['a']:
                if sub.lower() in forbidden_values:
                    record_delete_field(rec,
                                        tag="035",
                                        field_position_global=field[4])

    rep_088_fields = record_get_field_instances(rec, '088')
    for field in rep_088_fields:
        subs = field_get_subfields(field)
        if '9' in subs:
            for val in subs['9']:
                if val.startswith('P0') or val.startswith('CM-P0'):
                    sf = [('9', 'CERN'), ('b', val)]
                    record_add_field(rec, '595', subfields=sf)
        for key, val in field[0]:
            if key in ['a', '9'] and not val.startswith('SIS-'):
                record_add_field(rec, '037', subfields=[('a', val)])
    record_delete_fields(rec, "088")

    rep_037_fields = record_get_field_instances(rec, '037')
    for field in rep_037_fields:
        subs = field_get_subfields(field)
        if 'a' in subs:
            for value in subs['a']:
                if 'arXiv' in value:
                    new_subs = [('a', value), ('9', 'arXiv')]
                    for fld in record_get_field_instances(rec, '695'):
                        for key, val in field_get_subfield_instances(fld):
                            if key == 'a':
                                new_subs.append(('c', val))
                                break
                    nf = create_field(subfields=new_subs)
                    record_replace_field(rec, '037', nf, field[4])
        for key, val in field[0]:
            if key in ['a', '9'] and val.startswith('SIS-'):
                record_delete_field(rec, '037', field_position_global=field[4])

    for field in record_get_field_instances(rec, '242'):
        record_add_field(rec, '246', subfields=field[0])
    record_delete_fields(rec, '242')

    # 269 Date normalization
    for field in record_get_field_instances(rec, '269'):
        for idx, (key, value) in enumerate(field[0]):
            if key == "c":
                field[0][idx] = ("c", convert_date_to_iso(value))
                record_delete_fields(rec, "260")

    if not 'THESIS' in collections:
        for field in record_get_field_instances(rec, '260'):
            record_add_field(rec, '269', subfields=field[0])
        record_delete_fields(rec, '260')

    # 300 page number
    for field in record_get_field_instances(rec, '300'):
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                if "mult." not in value and value != " p":
                    field[0][idx] = ('a', re.sub(r'[^\d-]+', '', value))
                else:
                    record_delete_field(rec,
                                        '300',
                                        field_position_global=field[4])
                    break

    # 100 & 700 punctuate author names
    author_names = record_get_field_instances(rec, '100')
    author_names.extend(record_get_field_instances(rec, '700'))
    for field in author_names:
        subs = field_get_subfields(field)
        if not 'i' in subs or 'XX' in subs['i']:
            if not 'j' in subs or 'YY' in subs['j']:
                for idx, (key, value) in enumerate(field[0]):
                    if key == 'a':
                        field[0][idx] = ('a', punctuate_authorname(value))

    # 700 -> 701 Thesis supervisors
    if 'THESIS' in collections:
        for field in record_get_field_instances(rec, '700'):
            record_add_field(rec, '701', subfields=field[0])
        record_delete_fields(rec, '700')

    # 501 move subfields
    fields_501 = record_get_field_instances(rec, '502')
    for idx, field in enumerate(fields_501):
        new_subs = []
        for key, value in field[0]:
            if key == 'a':
                new_subs.append(('b', value))
            elif key == 'b':
                new_subs.append(('c', value))
            elif key == 'c':
                new_subs.append(('d', value))
            else:
                new_subs.append((key, value))
        fields_501[idx] = field_swap_subfields(field, new_subs)

    # 650 Translate Categories
    categories = get_categories()
    category_fields = record_get_field_instances(rec,
                                                 '650',
                                                 ind1='1',
                                                 ind2='7')
    record_delete_fields(rec, "650")
    for field in category_fields:
        for idx, (key, value) in enumerate(field[0]):
            if key == 'a':
                new_value = translate_config(value, categories)
                if new_value != value:
                    new_subs = [('2', 'INSPIRE'), ('a', new_value)]
                else:
                    new_subs = [('2', 'SzGeCERN'), ('a', value)]
                record_add_field(rec,
                                 "650",
                                 ind1="1",
                                 ind2="7",
                                 subfields=new_subs)
                break

    # 653 Free Keywords
    for field in record_get_field_instances(rec, '653', ind1='1'):
        subs = field_get_subfields(field)
        new_subs = []
        if 'a' in subs:
            for val in subs['a']:
                new_subs.extend([('9', 'author'), ('a', val)])
        new_field = create_field(subfields=new_subs)
        record_replace_field(rec,
                             '653',
                             new_field,
                             field_position_global=field[4])

    experiments = get_experiments()
    # 693 Remove if 'not applicable'
    for field in record_get_field_instances(rec, '693'):
        subs = field_get_subfields(field)
        if 'not applicable' in [x.lower() for x in subs['a']]:
            if 'not applicable' in [x.lower() for x in subs['e']]:
                record_delete_field(rec, '693', field_position_global=field[4])
        new_subs = []
        experiment_a = ""
        experiment_e = ""
        for (key, value) in subs.iteritems():
            if key == 'a':
                experiment_a = value[0]
                new_subs.append((key, value[0]))
            elif key == 'e':
                experiment_e = value[0]
        experiment = "%s---%s" % (experiment_a.replace(" ", "-"), experiment_e)
        translated_experiments = translate_config(experiment, experiments)
        new_subs.append(("e", translated_experiments))
        record_delete_field(rec, tag="693", field_position_global=field[4])
        record_add_field(rec, "693", subfields=new_subs)

    # 710 Collaboration
    for field in record_get_field_instances(rec, '710'):
        subs = field_get_subfield_instances(field)
        for idx, (key, value) in enumerate(subs[:]):
            if key == '5':
                subs.pop(idx)
            elif value.startswith('CERN. Geneva'):
                subs.pop(idx)
        if len(subs) == 0:
            record_delete_field(rec, '710', field_position_global=field[4])

    # 773 journal translations
    journals = get_journals()
    for field in record_get_field_instances(rec, '773'):
        subs = field_get_subfield_instances(field)
        new_subs = []
        for idx, (key, value) in enumerate(subs):
            if key == 'p':
                new_subs.append((key, translate_config(value, journals)))
            else:
                new_subs.append((key, value))
        record_delete_field(rec, tag="773", field_position_global=field[4])
        record_add_field(rec, "773", subfields=new_subs)

    # FFT (856) Dealing with graphs
    figure_counter = 0
    for field in record_get_field_instances(rec, '856', ind1='4'):
        subs = field_get_subfields(field)

        newsubs = []
        remove = False

        if 'z' in subs:
            is_figure = [s for s in subs['z'] if "figure" in s.lower()]
            if is_figure and 'u' in subs:
                is_subformat = [
                    s for s in subs['u'] if "subformat" in s.lower()
                ]
                if not is_subformat:
                    url = subs['u'][0]
                    if url.endswith(".pdf"):
                        # We try to convert
                        fd, local_url = mkstemp(suffix=os.path.basename(url),
                                                dir=CFG_TMPSHAREDDIR)
                        os.close(fd)
                        _print("Downloading %s into %s" % (url, local_url),
                               verbose=5)
                        plotfile = ""
                        try:
                            plotfile = download_file(url_for_file=url,
                                                     downloaded_file=local_url,
                                                     timeout=30.0)
                        except InvenioDownloadError:
                            _print(
                                "Download failed while attempting to reach %s. Skipping.."
                                % (url, ))
                            remove = True
                        if plotfile:
                            converted = convert_images([plotfile])
                            if converted:
                                url = converted.pop()
                                _print("Successfully converted %s to %s" %
                                       (local_url, url),
                                       verbose=5)
                            else:
                                _print("Conversion failed on %s" %
                                       (local_url, ))
                                url = None
                                remove = True
                    if url:
                        newsubs.append(('a', url))
                        newsubs.append(('t', 'Plot'))
                        figure_counter += 1
                        if 'y' in subs:
                            newsubs.append(
                                ('d',
                                 "%05d %s" % (figure_counter, subs['y'][0])))
                        else:
                            newsubs.append(
                                ('d', "%05d %s" %
                                 (figure_counter, os.path.basename(url))))

        if not newsubs and 'u' in subs:
            is_fulltext = [s for s in subs['u'] if ".pdf" in s]
            if is_fulltext:
                newsubs = [('t', 'INSPIRE-PUBLIC'), ('a', subs['u'][0])]

        if not newsubs and 'u' in subs:
            remove = True
            is_zipfile = [s for s in subs['u'] if ".zip" in s]
            if is_zipfile:
                url = is_zipfile[0]
                local_url = os.path.join(CFG_TMPSHAREDDIR,
                                         os.path.basename(url))
                _print("Downloading %s into %s" % (url, local_url), verbose=5)
                zipped_archive = ""
                try:
                    zipped_archive = download_file(url_for_file=is_zipfile[0],
                                                   downloaded_file=local_url,
                                                   timeout=30.0)
                except InvenioDownloadError:
                    _print(
                        "Download failed while attempting to reach %s. Skipping.."
                        % (is_zipfile[0], ))
                    remove = True
                if zipped_archive:
                    unzipped_archive = unzip(zipped_archive)
                    list_of_pngs = locate("*.png", unzipped_archive)
                    for png in list_of_pngs:
                        if "_vti_" in png or "__MACOSX" in png:
                            continue
                        figure_counter += 1
                        plotsubs = []
                        plotsubs.append(('a', png))
                        caption = '%05d %s' % (figure_counter,
                                               os.path.basename(png))
                        plotsubs.append(('d', caption))
                        plotsubs.append(('t', 'Plot'))
                        record_add_field(rec, 'FFT', subfields=plotsubs)

        if not remove and not newsubs and 'u' in subs:
            urls = ('http://cdsweb.cern.ch', 'http://cms.cern.ch',
                    'http://cmsdoc.cern.ch', 'http://documents.cern.ch',
                    'http://preprints.cern.ch', 'http://cds.cern.ch')
            for val in subs['u']:
                if any(url in val for url in urls):
                    remove = True
                    break
                if val.endswith('ps.gz'):
                    remove = True

        if newsubs:
            record_add_field(rec, 'FFT', subfields=newsubs)
            remove = True

        if remove:
            record_delete_field(rec,
                                '856',
                                ind1='4',
                                field_position_global=field[4])

    # 500 - Preliminary results
    subs = [('a', "Preliminary results")]
    record_add_field(rec, "500", subfields=subs)

    for collection in collections:
        record_add_field(rec, '980', subfields=[('a', collection)])

    return rec