Example #1
def get_country_for_aff(x_aff):
    # The XML may contain other representations for certain organizations.
    ORGS = (
        'CERN',
        'JINR',
    )

    organizations = [
        c.childNodes[0].nodeValue
        for c in x_aff.getElementsByTagName('sa:organization')
    ]
    common = set(organizations).intersection(ORGS)
    if common:
        return common.pop()

    country = x_aff.getElementsByTagName('sa:country')
    if country:
        return country[0].childNodes[0].nodeValue

    info('No country in XML. Falling back to Google Maps.')
    country = get_country(
        x_aff.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue)
    if country:
        return country

    error('Google didn\'t help.')
    return 'HUMAN CHECK'
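
A quick way to exercise this helper is to parse a minimal affiliation fragment with xml.dom.minidom, which matches the prefixed tag names (sa:, ce:) literally, so no namespace setup is needed. A sketch, assuming get_country_for_aff from the example above is in scope; the fragment shape is hypothetical:

from xml.dom.minidom import parseString

doc = parseString(
    '<aff>'
    '<sa:organization>CERN</sa:organization>'
    '<sa:country>Switzerland</sa:country>'
    '<ce:textfn>CERN, Geneva, Switzerland</ce:textfn>'
    '</aff>')

# The ORGS whitelist takes precedence over the country element.
print(get_country_for_aff(doc.documentElement))  # -> 'CERN'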
Example #2
def springer():
    DIR = 'JHEP/'
    EXT = ('.xml.Meta', '.xml.scoap')
    BASE_DIR = '/eos/project/s/scoap3repo/BETA/harvesting/Springer/download/' + DIR
    zip_list = listdir(BASE_DIR)

    needed_dois = json.loads(open('/tmp/repo_diff_result2', 'r').read())['only_in_old']

    extracted_dois = {}
    for file in zip_list:
        full_path = BASE_DIR + file
        if isfile(full_path) and full_path.endswith('.zip'):
            try:
                with ZipFile(full_path) as zip_file:
                    for zip_element in zip_file.infolist():
                        fn = zip_element.filename
                        if fn.endswith(EXT):
                            xml = parseString(zip_file.read(zip_element))
                            doi = xml.getElementsByTagName('ArticleDOI')[0].firstChild.nodeValue
                            if doi in needed_dois:
                                if full_path not in extracted_dois:
                                    extracted_dois[full_path] = []
                                extracted_dois[full_path].append(doi)
            except BadZipfile as e:
                error('file %s: %s' % (file, e))

    info('%s' % json.dumps(extracted_dois, indent=2))
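
Since needed_dois is probed for every archive member, loading it as a set makes the membership test O(1) instead of a list scan. A minimal variant, assuming the same dump format:

with open('/tmp/repo_diff_result2', 'r') as f:
    needed_dois = set(json.load(f)['only_in_old'])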
Example #3
def repos_diff():
    OLD_REPO_FILE = '/tmp/old_repo_dump4'
    OLD_REPO_URL = 'https://repo.scoap3.org/search?p=&of=recjson&ot=recid,doi,creation_date&rg=100000000'
    COOKIES = {
        'INVENIOSESSION': 'd3c673cf6be468dc6c6fd25703ff90c3',
        'INVENIOSESSIONstub': 'HTTPS',
        '_pk_id.10.1cdf': 'ff8bdd9962372712.1536586766.49.1546956598.1546955767.'
    }
    RESULT_FILE = '/tmp/repo_diff_result9'

    if not isfile(OLD_REPO_FILE):
        info('No old repo file (%s), downloading...' % OLD_REPO_FILE)
        data = requests_retry_session().get(OLD_REPO_URL, cookies=COOKIES).json()
        info('download complete (%d records), mapping...' % len(data))

        if len(data) < 1000:
            error('Aborting, not all records were queried.')
            return

        mapped_data = {}
        for r in data:
            doi = r.pop('doi')
            if doi in mapped_data:
                error('Multiple records with the same DOI: %s' % r)
            mapped_data[doi] = r

        info('mapping complete, saving file...')
        with open(OLD_REPO_FILE, 'wt') as f:
            f.write(json.dumps(mapped_data))

        info('File saved.')

    info('reading old repo data from: %s' % OLD_REPO_FILE)
    with open(OLD_REPO_FILE, 'rt') as f:
        old_data = json.loads(f.read())

    result = dict(only_in_old=[],
                  only_in_new=[],
                  in_both=[])

    def proc(record):
        if not record.json:
            return

        doi = get_first_doi(record.json)
        if doi in old_data:
            result['in_both'].append(doi)
            old_data.pop(doi)
        else:
            result['only_in_new'].append(doi)

    process_all_records(proc)

    result['only_in_old'] = old_data.keys()
    with open(RESULT_FILE, 'wt') as f:
        f.write(json.dumps(result, indent=2))

    info('only_in_old: %s\nonly_in_new: %s\nin_both: %s\nALL DONE.' % (
        len(result['only_in_old']), len(result['only_in_new']), len(result['in_both'])))
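
requests_retry_session is not shown in these snippets. A common implementation (an assumption, not necessarily this project's helper) wraps requests.Session with urllib3's Retry so transient network errors are retried with backoff:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def requests_retry_session(retries=3, backoff_factor=0.5,
                           status_forcelist=(500, 502, 504)):
    # Retry connect/read failures and selected 5xx responses with backoff.
    session = requests.Session()
    retry = Retry(total=retries, read=retries, connect=retries,
                  backoff_factor=backoff_factor,
                  status_forcelist=status_forcelist)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session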
Example #4
    def get_aff_by_id(x_author_group, aff_id):
        for x_affiliation in x_author_group.getElementsByTagName('ce:affiliation'):
            aff_id_attr = x_affiliation.attributes.get('id')
            if aff_id_attr is not None and aff_id_attr.value == aff_id:
                return x_affiliation.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue

        error('No affiliation for id: %s' % aff_id)
        return None
Example #5
    def get_arxiv(data):
        if 'report_numbers' in data:
            for r in data['report_numbers']:
                if r['source'] == 'arXiv':
                    return r['value'].split(':')[1]
            error('no arxiv? %s' % data['control_number'])
        if 'arxiv_eprints' in data:
            return data['arxiv_eprints'][0]['value'].split(':')[1]

        return ''
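
The two branches cover records that store the identifier under report_numbers versus arxiv_eprints; both strip the 'arXiv:' prefix. Expected shapes, with hypothetical values:

get_arxiv({'control_number': 1,
           'report_numbers': [{'source': 'arXiv', 'value': 'arXiv:1811.06024'}]})
# -> '1811.06024'

get_arxiv({'arxiv_eprints': [{'value': 'arXiv:1811.06024'}]})
# -> '1811.06024'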
Example #6
def attach_file_object(api_record, filename, file_type, file_data):
    try:
        api_record.files[filename] = file_data
        api_record.files[filename]['filetype'] = file_type
        return True
    except FSError as e:
        error(
            'Error occurred while attaching file. Is storage accessible for the current user? '
            'Details: %s' % e.message)
        return False
Example #7
    def proc(ai):
        try:
            PersistentIdentifier.get('recid', ai.control_number)
        except PIDDoesNotExistError:
            api_response = requests_retry_session().get(crossref_url % ai.doi)
            if api_response.status_code != 200:
                error('Failed to query crossref for doi: %s. Error code: %s' % (ai.doi, api_response.status_code))
                result['not200'].append(ai.control_number)
                return None

            title = api_response.json()['message']['title'][0].lower()

            if 'addendum' in title or 'corrigendum' in title or 'erratum' in title:
                result['hit'].append((ai.control_number, title))
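
crossref_url is defined outside this snippet; presumably a format string for the public Crossref works endpoint, whose JSON response nests the title list under 'message' (an assumption based on the public Crossref API):

crossref_url = 'https://api.crossref.org/works/%s'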
Example #8
def init_default_location():
    """
    Add default Location, if not already present.
    Used by Travis as well.
    """

    if not Location.query.filter(Location.name == 'default').count():
        loc = Location()
        loc.name = 'default'
        loc.default = True
        loc.uri = '/virtualenv/files/'
        db.session.add(loc)
        db.session.commit()
    else:
        error("Default location already exists.")
Example #9
def utf8rec(data):
    if isinstance(data, basestring):
        try:
            # The string holds UTF-8 bytes read as individual code points
            # (mojibake); rebuild the byte string and decode it properly.
            return ''.join(chr(ord(c)) for c in data).decode('utf8')
        except:  # noqa todo: implement proper exception handling (E722 do not use bare except)
            return data

    if isinstance(data, tuple) or isinstance(data, list):
        return [utf8rec(element) for element in data]

    if isinstance(data, dict):
        return {k: utf8rec(v) for k, v in data.items()}

    if isinstance(data, numbers.Number) or data is None:
        return data

    error('Couldn\'t determine the data type of %s. Returning the same.' % data)
    return data
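
The chr(ord(c)) join targets Python 2 mojibake: a unicode string whose code points are really UTF-8 bytes is rebuilt as a byte string and decoded properly, while anything that fails to decode falls through unchanged. The effect on a hypothetical input (Python 2):

print(utf8rec(u'caf\xc3\xa9'))  # -> u'caf\xe9', i.e. u'café'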
Example #10
def update_countries(dry_run, ids):
    """
    Updates countries for articles, that are marked as given parameter. Countries are determined with the google maps api.
    """

    counts = {'changed': 0, 'all': 0}

    if ids:
        ids = ids.split(',')

    def proc(record):
        try:
            if 'authors' not in record.json:
                error('no authors for record %s' % record.json['control_number'])
                return

            for author_index, author_data in enumerate(record.json['authors']):
                if 'affiliations' not in author_data:
                    error('no affiliations for record %s' % record.json['control_number'])
                    continue

                for aff_index, aff_data in enumerate(author_data['affiliations']):
                    counts['all'] += 1

                    new_country = find_country(aff_data['value'])
                    if aff_data['country'] != new_country:
                        counts['changed'] += 1

                        info('Changed country for record with id %s from %s to %s' % (record.json['control_number'],
                                                                                      aff_data['country'], new_country))
                        record.json['authors'][author_index]['affiliations'][aff_index]['country'] = new_country

            if not dry_run:
                flag_modified(record, 'json')
        except Exception as e:
            error(str(e))

    process_all_records(proc, control_ids=ids)

    if dry_run:
        error('NO CHANGES were committed to the database, because --dry-run flag was present.')

    info("%s\nDONE." % counts)
Example #11
def fix_record_mapping(ids, dry_run):
    """
    Maps the given records if needed to comply with the new schema.

    If dry-run option is set, no changes will be committed to the database.
    """

    if ids:
        ids = ids.split(',')

    failed_records = []

    process_all_records(map_old_record_outer, 50, ids, dry_run, failed_records)

    if failed_records:
        failed_control_numbers = [str(r.json.get('control_number', r.id)) for r in failed_records if r.json]
        error('Mapping process failed for the following records: %s' % ', '.join(failed_control_numbers))

    if dry_run:
        error('NO CHANGES were committed to the database, because --dry-run flag was present.')
Example #12
def get_country_for_aff(x_aff):
    # The XML may contain other representations for certain organizations.
    ORGS = ('CERN', 'JINR',)

    organizations = [c.childNodes[0].nodeValue for c in x_aff.getElementsByTagName('sa:organization')]
    common = set(organizations).intersection(ORGS)
    if common:
        return common.pop()

    country = x_aff.getElementsByTagName('sa:country')
    if country:
        return country[0].childNodes[0].nodeValue

    info('No country in XML. Falling back to Google Maps.')
    country = get_country(x_aff.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue)
    if country:
        return country

    error('Google didn\'t help.')
    return 'HUMAN CHECK'
Example #13
def delete_file(control_number, key):
    """
    Deletes a file attached to a record.
    """

    # get existing record
    try:
        api_record = APIRecord.get_record(
            PersistentIdentifier.get('recid', control_number).object_uuid)
    except (PIDDoesNotExistError, NoResultFound):
        error('No record found for given control number!')
        return

    if key not in api_record.files:
        error('Defined key is not present.')
        return

    del api_record.files[key]
    api_record.commit()
    db.session.commit()
    info('File successfully deleted.')
Example #14
    def proc(record):
        try:
            if 'authors' not in record.json:
                error('no authors for record %s' % record.json['control_number'])
                return

            for author_index, author_data in enumerate(record.json['authors']):
                if 'affiliations' not in author_data:
                    error('no affiliations for record %s' % record.json['control_number'])
                    continue

                for aff_index, aff_data in enumerate(author_data['affiliations']):
                    counts['all'] += 1

                    new_country = find_country(aff_data['value'])
                    if aff_data['country'] != new_country:
                        counts['changed'] += 1

                        info('Changed country for record with id %s from %s to %s' % (record.json['control_number'],
                                                                                      aff_data['country'], new_country))
                        record.json['authors'][author_index]['affiliations'][aff_index]['country'] = new_country

            if not dry_run:
                flag_modified(record, 'json')
        except Exception as e:
            error(str(e))
Example #15
def fix_doi_dates(json_file, dry_run):
    """
    Fixes the imprint/publication/copyright dates on a list of DOIs.
    """
    with open(json_file) as _file:
        dois_with_dates = json.load(_file)

    for doi in dois_with_dates.keys():
        search_result = current_search_client.search(
            'scoap3-records-record',
            q='dois.value:"{}"'.format(doi))['hits']['hits']

        if search_result:
            uuid = search_result[0]['_id']
            rec = Record.get_record(uuid)

            date = dois_with_dates[doi]
            year = int(date.split('-')[0])
            old_date = rec['imprints'][0]['date']

            rec['imprints'][0]['date'] = date
            rec['publication_info'][0]['year'] = year
            rec['copyright'][0]['year'] = year

            info('DOI {} with UUID {}: changed {} -> {}'.format(
                doi, uuid, old_date, date))

            if not dry_run:
                rec.commit()
                db.session.commit()
                info('{} successfully updated.'.format(doi))

        else:
            error('DOI {} not found in ES.'.format(doi))

    if dry_run:
        error(
            'NO CHANGES were committed to the database, because --dry-run flag was present.'
        )
Example #16
def elsevier():
    EXT = 'main.xml'
    BASE_DIR = '/eos/project/s/scoap3repo/BETA/harvesting/Elsevier/download/'
    RESULT_FILE = '/tmp/elsevier'
    tar_list = listdir(BASE_DIR)
    needed_dois = json.loads(open('/tmp/repo_diff_result5', 'r').read())['only_in_old']

    from_date = datetime.now() - timedelta(days=365)
    to_date = datetime.now() - timedelta(days=60)
    info('found %d files in base dir.' % len(tar_list))

    extracted_dois = {}
    for file in tar_list:
        full_path = BASE_DIR + file
        creation_date = datetime.utcfromtimestamp(getctime(full_path))
        if isfile(full_path) and full_path.endswith('.tar') and from_date <= creation_date <= to_date:
            try:
                with tarfile.open(full_path, 'r') as tar:
                    for element in tar.getmembers():
                        if element.name.endswith(EXT):
                            xml = parseString(tar.extractfile(element).read())
                            doi = xml.getElementsByTagName('item-info')[0].getElementsByTagName('ce:doi')[0].firstChild.nodeValue
                            if doi in needed_dois:
                                if full_path not in extracted_dois:
                                    extracted_dois[full_path] = []
                                extracted_dois[full_path].append(doi)
                                info('found %s in %s' % (doi, file))
            except (tarfile.TarError, ExpatError) as e:
                error('file %s: %s' % (file, e))

    info('%s' % json.dumps(extracted_dois, indent=2))

    with open(RESULT_FILE, 'wt') as f:
        f.write(json.dumps(extracted_dois, indent=2))
Example #17
    def proc(article_impact):
        try:
            if 'arxiv_primary_category' in article_impact.details:
                return

            pid = PersistentIdentifier.get('recid', article_impact.control_number)
            record = Record.get_record(pid.object_uuid)

            if not record:
                return

            if 'arxiv_eprints' in record:
                info('%d: eprints found' % article_impact.control_number)
                arxiv = (record['arxiv_eprints'][0]['value'].split(':')[1]).split('v')[0]
                cat = get_arxiv_categories(arxiv)[0]
                info('category: %s' % cat)
                if cat:
                    article_impact.details['arxiv_primary_category'] = cat
                    flag_modified(article_impact, 'details')

            elif 'report_numbers' in record:
                info('%d: report_numbers found' % article_impact.control_number)
                cat = get_arxiv_primary_category(record)
                info('category: %s' % cat)
                if cat:
                    article_impact.details['arxiv_primary_category'] = cat
                    flag_modified(article_impact, 'details')

            else:
                error('%d: no arxiv' % article_impact.control_number)

        except PIDDoesNotExistError:
            # records imported from Inspire won't be found
            pass
        except AttributeError as e:
            error('%d: %s' % (article_impact.control_number, e))
Example #18
def attach_file(control_number, file_path, file_type, filename):
    """
    Attach a file to an already existing record.

    The file path can point to a local file; http and https URLs are also supported. For these protocols,
    sending specific headers is not supported, so make sure the website doesn't require any.

    In case the record already has a file with the given filename, it will be overwritten.
    """

    # get existing record
    try:
        api_record = APIRecord.get_record(
            PersistentIdentifier.get('recid', control_number).object_uuid)
    except (PIDDoesNotExistError, NoResultFound):
        error('No record found for given control number!')
        return

    # read and attach file
    if file_path.startswith('http://') or file_path.startswith('https://'):
        data = requests_retry_session().get(file_path)
        if data.status_code != 200:
            error('Could not download file. Status code: %d' %
                  data.status_code)
            return

        file_data = StringIO(data.content)
        if not attach_file_object(api_record, filename, file_type, file_data):
            return
    else:
        try:
            with open(file_path) as f:
                if not attach_file_object(api_record, filename, file_type, f):
                    return
        except IOError:
            error('local file was not found or not readable: %s' % file_path)
            return

    api_record.commit()
    db.session.commit()
    info('File successfully attached.')
Example #19
def update_countries(dry_run, ids, country="HUMAN CHECK"):
    """
    Updates countries for articles, that are marked as given parameter. Countries are determined with the google maps api.
    """

    country_cache = {}
    cache_fails = 0
    total_hits = 0

    # Use parameter ids or, if not given, search for all records with the specified country.
    if ids:
        ids = ids.split(',')
    else:
        search_result = current_search_client.search(
            'records-record', 'record-v1.0.0', {
                'size': 10000,
                'query': {
                    'term': {
                        'country': country
                    }
                }
            })
        ids = [
            hit['_source']['control_number']
            for hit in search_result['hits']['hits']
        ]
        info('Found %d records having %s as a country of one of the authors.' %
             (len(ids), country))

    uuids = [
        PersistentIdentifier.get('recid', recid).object_uuid for recid in ids
    ]
    records = Record.get_records(uuids)

    try:
        for record in records:
            for author_index, author_data in enumerate(record['authors']):
                for aff_index, aff_data in enumerate(
                        author_data['affiliations']):
                    if aff_data['country'] == country:
                        total_hits += 1

                        # cache countries based on old affiliation value to decrease api requests
                        old_value = aff_data['value']
                        if old_value not in country_cache:
                            country_cache[old_value] = get_country(old_value)
                            cache_fails += 1

                        new_country = country_cache[old_value]

                        if new_country:
                            record['authors'][author_index]['affiliations'][
                                aff_index]['country'] = new_country
                            info(
                                'Changed country for record with id %s to %s' %
                                (record['control_number'], new_country))
                        else:
                            error(
                                'Could not find country for record with id %s (affiliation value: %s)'
                                % (record['control_number'], old_value))
            if not dry_run:
                record.commit()
                db.session.commit()
    except Exception as e:
        error(str(e))

    info(
        'In total %d countries needed to be updated and %d queries were made to determine the countries.'
        % (total_hits, cache_fails))

    if dry_run:
        error(
            'NO CHANGES were committed to the database, because --dry-run flag was present.'
        )
Example #20
def japanise():
    size = 100

    def get_query(start_index, size):
        return {
            '_source': ['authors', 'control_number', 'dois', 'publication_info', 'report_numbers', 'arxiv_eprints'],
            'from': start_index,
            'size': size,
            'query': {
                'term': {
                    'country': 'Japan'
                }
            }
        }

    def get_arxiv(data):
        if 'report_numbers' in data:
            for r in data['report_numbers']:
                if r['source'] == 'arXiv':
                    return r['value'].split(':')[1]
            error('no arxiv? %s' % data['control_number'])
        if 'arxiv_eprints' in data:
            return data['arxiv_eprints'][0]['value'].split(':')[1]

        return ''

    index = 0
    total = None

    header = ['year', 'journal', 'doi', 'arxiv number', 'primary arxiv category', 'affiliation',
              'authors with affiliation', 'total number of authors']
    si = StringIO()
    cw = csv.writer(si, delimiter=";")
    cw.writerow(header)

    while total is None or index < total:
        search_results = es.search(index='records-record',
                                   doc_type='record-v1.0.0',
                                   body=get_query(index, size))
        total = search_results['hits']['total']
        info("%s/%s" % (index, total))
        index += size

        for hit in search_results['hits']['hits']:
            data = hit['_source']

            year = data['publication_info'][0]['year']
            journal = data['publication_info'][0]['journal_title']
            doi = data['dois'][0]['value']
            arxiv = get_arxiv(data)
            arxiv_category = get_arxiv_categories(arxiv)[0] if arxiv else ''

            total_authors = len(data['authors'])

            extracted_affiliations = {}
            for author in data['authors']:
                if 'affiliations' not in author:
                    error('no affiliations for author. %s' % doi)
                    continue

                for aff in author['affiliations']:
                    if aff['country'] == 'Japan':
                        value = aff['value']
                        if value not in extracted_affiliations:
                            extracted_affiliations[value] = 0
                        extracted_affiliations[value] += 1

            if not extracted_affiliations:
                error('no extracted affs')

            for aff, count in extracted_affiliations.items():
                cw.writerow([year, journal, doi, arxiv, arxiv_category, aff.encode('utf8'), count, total_authors])

    with open('/tmp/japanise.csv', 'wt') as f:
        f.write(si.getvalue())