def get_country_for_aff(x_aff):
    # In XML could have other representations for certain organizations?
    ORGS = (
        'CERN',
        'JINR',
    )

    organizations = [c.childNodes[0].nodeValue for c in x_aff.getElementsByTagName('sa:organization')]
    common = set(organizations).intersection(ORGS)
    if common:
        return common.pop()

    country = x_aff.getElementsByTagName('sa:country')
    if country:
        return country[0].childNodes[0].nodeValue

    info('No country in XML. Falling back to google maps.')
    country = get_country(x_aff.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue)
    if country:
        return country

    error('Google didn\'t help.')
    return 'HUMAN CHECK'
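# Illustrative only: a minimal sketch of calling get_country_for_aff() on a hand-written,
# Elsevier-style affiliation element. The XML snippet and the namespace URIs below are
# assumptions made for the example, not taken from a real harvested file.
from xml.dom.minidom import parseString as parse_xml_string

example_aff = parse_xml_string(
    '<ce:affiliation xmlns:ce="http://www.elsevier.com/xml/common/dtd" '
    'xmlns:sa="http://www.elsevier.com/xml/common/struct-aff/dtd">'
    '<ce:textfn>Department of Physics, University of Tokyo, Tokyo, Japan</ce:textfn>'
    '<sa:country>Japan</sa:country>'
    '</ce:affiliation>').documentElement
assert get_country_for_aff(example_aff) == 'Japan'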
def springer():
    DIR = 'JHEP/'
    EXT = ('.xml.Meta', '.xml.scoap')
    BASE_DIR = '/eos/project/s/scoap3repo/BETA/harvesting/Springer/download/' + DIR

    zip_list = listdir(BASE_DIR)
    needed_dois = json.loads(open('/tmp/repo_diff_result2', 'r').read())['only_in_old']

    extracted_dois = {}

    for file in zip_list:
        full_path = BASE_DIR + file
        if isfile(full_path) and full_path.endswith('.zip'):
            try:
                zip = ZipFile(full_path)
                for zip_element in zip.infolist():
                    fn = zip_element.filename
                    if fn.endswith(EXT):
                        xml = parseString(zip.read(zip_element))
                        doi = xml.getElementsByTagName('ArticleDOI')[0].firstChild.nodeValue
                        if doi in needed_dois:
                            if full_path not in extracted_dois:
                                extracted_dois[full_path] = []
                            extracted_dois[full_path].append(doi)
            except BadZipfile as e:
                error('file %s: %s' % (file, e))

    info('%s' % json.dumps(extracted_dois, indent=2))
def repos_diff():
    OLD_REPO_FILE = '/tmp/old_repo_dump4'
    OLD_REPO_URL = 'https://repo.scoap3.org/search?p=&of=recjson&ot=recid,doi,creation_date&rg=100000000'
    COOKIES = {
        'INVENIOSESSION': 'd3c673cf6be468dc6c6fd25703ff90c3',
        'INVENIOSESSIONstub': 'HTTPS',
        '_pk_id.10.1cdf': 'ff8bdd9962372712.1536586766.49.1546956598.1546955767.'
    }
    RESULT_FILE = '/tmp/repo_diff_result9'

    if not isfile(OLD_REPO_FILE):
        info('No old repo file (%s), downloading...' % OLD_REPO_FILE)
        data = requests_retry_session().get(OLD_REPO_URL, cookies=COOKIES).json()
        info('download complete (%d records), mapping...' % len(data))

        if len(data) < 1000:
            error('Aborting, not all records were queried.')
            return

        mapped_data = {}
        for r in data:
            doi = r.pop('doi')
            if doi in mapped_data:
                error('Multiple records with doi. %s' % r)
            mapped_data[doi] = r
        info('mapping complete, saving file...')

        with open(OLD_REPO_FILE, 'wt') as f:
            f.write(json.dumps(mapped_data))
        info('File saved.')

    info('reading old repo data from: %s' % OLD_REPO_FILE)
    with open(OLD_REPO_FILE, 'rt') as f:
        old_data = json.loads(f.read())

    result = dict(only_in_old=[], only_in_new=[], in_both=[])

    def proc(record):
        if not record.json:
            return
        doi = get_first_doi(record.json)
        if doi in old_data:
            result['in_both'].append(doi)
            old_data.pop(doi)
        else:
            result['only_in_new'].append(doi)

    process_all_records(proc)

    # whatever is left in old_data was not found in the new repo
    result['only_in_old'] = list(old_data.keys())

    with open(RESULT_FILE, 'wt') as f:
        f.write(json.dumps(result, indent=2))

    info('only_in_old: %s\nonly_in_new: %s\nin_both: %s\nALL DONE.' % (
        len(result['only_in_old']), len(result['only_in_new']), len(result['in_both'])))
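# Illustrative only: a minimal consumer of the diff file written by repos_diff(). The path
# matches RESULT_FILE above; the summary printout itself is just a sketch.
import json

with open('/tmp/repo_diff_result9') as f:
    repo_diff = json.load(f)

for bucket in ('only_in_old', 'only_in_new', 'in_both'):
    print('%s: %d DOIs' % (bucket, len(repo_diff.get(bucket, []))))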
def get_aff_by_id(x_author_group, aff_id):
    for x_affiliation in x_author_group.getElementsByTagName('ce:affiliation'):
        id = x_affiliation.attributes.get('id').value
        if id == aff_id:
            return x_affiliation.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue

    error('No affiliation for id: %s' % aff_id)
    return None
def get_arxiv(data):
    if 'report_numbers' in data:
        for r in data['report_numbers']:
            if r['source'] == 'arXiv':
                return r['value'].split(':')[1]
        error('no arxiv? %s' % data['control_number'])

    if 'arxiv_eprints' in data:
        return data['arxiv_eprints'][0]['value'].split(':')[1]

    return ''
def attach_file_object(api_record, filename, file_type, file_data):
    try:
        api_record.files[filename] = file_data
        api_record.files[filename]['filetype'] = file_type
        return True
    except FSError as e:
        error('Error occurred while attaching file. Is storage accessible for the current user? '
              'Details: %s' % e.message)
        return False
def proc(ai):
    # inner helper: `result` and `crossref_url` are expected to be defined in the enclosing scope
    try:
        PersistentIdentifier.get('recid', ai.control_number)
    except PIDDoesNotExistError:
        api_response = requests_retry_session().get(crossref_url % ai.doi)
        if api_response.status_code != 200:
            error('Failed to query crossref for doi: %s. Error code: %s' % (ai.doi, api_response.status_code))
            result['not200'].append(ai.control_number)
            return None

        title = api_response.json()['message']['title'][0].lower()
        if 'addendum' in title or 'corrigendum' in title or 'erratum' in title:
            result['hit'].append((ai.control_number, title))
def init_default_location():
    """
    Add default Location, if not already present. Used by Travis as well.
    """
    if not Location.query.filter(Location.name == 'default').count():
        loc = Location()
        loc.name = 'default'
        loc.default = True
        loc.uri = '/virtualenv/files/'
        db.session.add(loc)
        db.session.commit()
    else:
        error("Default location already exists.")
def utf8rec(data):
    if isinstance(data, basestring):
        try:
            return ''.join(chr(ord(c)) for c in data).decode('utf8')
        except:  # noqa todo: implement proper exception handling (E722 do not use bare except)
            return data

    if isinstance(data, tuple) or isinstance(data, list):
        return [utf8rec(element) for element in data]

    if isinstance(data, dict):
        return {k: utf8rec(v) for k, v in data.items()}

    if isinstance(data, numbers.Number) or data is None:
        return data

    error('Couldn\'t determine the data type of %s. Returning the same.' % data)
    return data
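# Illustrative only (Python 2): utf8rec() repairs values whose UTF-8 bytes were decoded with a
# one-byte-per-character codec such as latin-1, and recurses into lists and dicts. The sample
# value below is made up.
broken = u'M\xc3\xbcnchen'  # UTF-8 bytes of u'M\xfcnchen' mis-decoded as latin-1
assert utf8rec(broken) == u'M\xfcnchen'
assert utf8rec({'city': [broken]}) == {'city': [u'M\xfcnchen']}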
def update_countries(dry_run, ids):
    """
    Update affiliation countries for the given articles (all records if no ids are given).
    Countries are determined with the Google Maps API.
    """
    counts = {'changed': 0, 'all': 0}

    if ids:
        ids = ids.split(',')

    def proc(record):
        try:
            if 'authors' not in record.json:
                error('no authors for record %s' % record.json['control_number'])
                return
            for author_index, author_data in enumerate(record.json['authors']):
                if 'affiliations' not in author_data:
                    error('no affiliations for record %s' % record.json['control_number'])
                    continue
                for aff_index, aff_data in enumerate(author_data['affiliations']):
                    counts['all'] += 1

                    new_country = find_country(aff_data['value'])
                    if aff_data['country'] != new_country:
                        counts['changed'] += 1
                        info('Changed country for record with id %s from %s to %s' % (
                            record.json['control_number'], aff_data['country'], new_country))
                        record.json['authors'][author_index]['affiliations'][aff_index]['country'] = new_country
                        if not dry_run:
                            flag_modified(record, 'json')
        except Exception as e:
            error(str(e))

    process_all_records(proc, control_ids=ids)

    if dry_run:
        error('NO CHANGES were committed to the database, because --dry-run flag was present.')

    info("%s\nDONE." % counts)
def fix_record_mapping(ids, dry_run):
    """
    Maps the given records if needed to comply with the new schema.
    If the dry-run option is set, no changes will be committed to the database.
    """
    if ids:
        ids = ids.split(',')

    failed_records = []
    process_all_records(map_old_record_outer, 50, ids, dry_run, failed_records)

    if failed_records:
        failed_control_numbers = [str(r.json.get('control_number', r.id)) for r in failed_records if r.json]
        error('Mapping process failed for the following records: %s' % ', '.join(failed_control_numbers))

    if dry_run:
        error('NO CHANGES were committed to the database, because --dry-run flag was present.')
def delete_file(control_number, key):
    """
    Deletes a file attached to a record.
    """
    # get existing record
    try:
        api_record = APIRecord.get_record(PersistentIdentifier.get('recid', control_number).object_uuid)
    except (PIDDoesNotExistError, NoResultFound):
        error('No record found for given control number!')
        return

    if key not in api_record.files:
        error('Given key is not present on the record.')
        return

    del api_record.files[key]
    api_record.commit()
    db.session.commit()
    info('File successfully deleted.')
def fix_doi_dates(json_file, dry_run):
    """
    Fixes the imprint/publication/copyright dates on a list of DOIs.
    """
    with open(json_file) as _file:
        dois_with_dates = json.load(_file)

    for doi in dois_with_dates.keys():
        search_result = current_search_client.search(
            'scoap3-records-record',
            q='dois.value:"{}"'.format(doi))['hits']['hits']
        if search_result:
            uuid = search_result[0]['_id']
            rec = Record.get_record(uuid)
            date = dois_with_dates[doi]
            year = int(date.split('-')[0])
            old_date = rec['imprints'][0]['date']
            rec['imprints'][0]['date'] = date
            rec['publication_info'][0]['year'] = year
            rec['copyright'][0]['year'] = year
            info('DOI {} with UUID {}: changed {} -> {}'.format(doi, uuid, old_date, date))
            if not dry_run:
                rec.commit()
                db.session.commit()
                info('{} successfully updated.'.format(doi))
        else:
            error('DOI {} not found in ES.'.format(doi))

    if dry_run:
        error('NO CHANGES were committed to the database, because --dry-run flag was present.')
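# Illustrative only: the json_file passed to fix_doi_dates() is expected to map each DOI to the
# corrected date in 'YYYY-MM-DD' form (the year is taken from the first component). The DOI and
# the path below are made-up placeholders.
import json

with open('/tmp/dois_with_dates.json', 'wt') as f:
    json.dump({'10.1234/example.doi': '2018-05-17'}, f)

# fix_doi_dates('/tmp/dois_with_dates.json', dry_run=True)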
def elsevier():
    EXT = 'main.xml'
    BASE_DIR = '/eos/project/s/scoap3repo/BETA/harvesting/Elsevier/download/'
    RESULT_FILE = '/tmp/elsevier'

    tar_list = listdir(BASE_DIR)
    needed_dois = json.loads(open('/tmp/repo_diff_result5', 'r').read())['only_in_old']
    from_date = datetime.now() - timedelta(days=365)
    to_date = datetime.now() - timedelta(days=60)

    info('found %d files in base dir.' % len(tar_list))

    extracted_dois = {}

    for file in tar_list:
        full_path = BASE_DIR + file
        creation_date = datetime.utcfromtimestamp(getctime(full_path))
        if isfile(full_path) and full_path.endswith('.tar') and from_date <= creation_date <= to_date:
            try:
                tar = tarfile.open(full_path, 'r')
                for element in tar.getmembers():
                    if element.name.endswith(EXT):
                        xml = parseString(tar.extractfile(element).read())
                        doi = xml.getElementsByTagName('item-info')[0].getElementsByTagName('ce:doi')[0].firstChild.nodeValue
                        if doi in needed_dois:
                            if full_path not in extracted_dois:
                                extracted_dois[full_path] = []
                            extracted_dois[full_path].append(doi)
                            info('found %s in %s' % (doi, file))
                    else:
                        pass  # info('ignoring file: %s' % fn)
            except (tarfile.TarError, ExpatError) as e:
                error('file %s: %s' % (file, e))

    info('%s' % json.dumps(extracted_dois, indent=2))
    with open(RESULT_FILE, 'wt') as f:
        f.write(json.dumps(extracted_dois, indent=2))
def proc(article_impact):
    try:
        if 'arxiv_primary_category' in article_impact.details:
            return

        pid = PersistentIdentifier.get('recid', article_impact.control_number)
        record = Record.get_record(pid.object_uuid)
        if not record:
            return

        if 'arxiv_eprints' in record:
            info('%d: eprints found' % article_impact.control_number)
            arxiv = (record['arxiv_eprints'][0]['value'].split(':')[1]).split('v')[0]
            cat = get_arxiv_categories(arxiv)[0]
            info('category: %s' % cat)
            if cat:
                article_impact.details['arxiv_primary_category'] = cat
                flag_modified(article_impact, 'details')
        elif 'report_numbers' in record:
            info('%d: report_numbers found' % article_impact.control_number)
            cat = get_arxiv_primary_category(record)
            info('category: %s' % cat)
            if cat:
                article_impact.details['arxiv_primary_category'] = cat
                flag_modified(article_impact, 'details')
        else:
            error('%d: no arxiv' % article_impact.control_number)
    except PIDDoesNotExistError:
        # records imported from Inspire won't be found
        pass
    except AttributeError as e:
        error('%d: %s' % (article_impact.control_number, e))
def attach_file(control_number, file_path, file_type, filename):
    """
    Attach a file to an already existing record.

    The file path can point to a local file, but http and https protocols are also supported.
    Sending specific headers with these protocols is not supported, so make sure the website
    doesn't require any.

    In case the record already has a file with the given filename, it will be overwritten.
    """
    # get existing record
    try:
        api_record = APIRecord.get_record(PersistentIdentifier.get('recid', control_number).object_uuid)
    except (PIDDoesNotExistError, NoResultFound):
        error('No record found for given control number!')
        return

    # read and attach file
    if file_path.startswith('http://') or file_path.startswith('https://'):
        data = requests_retry_session().get(file_path)

        if data.status_code != 200:
            error('Could not download file. Status code: %d' % data.status_code)
            return

        file_data = StringIO(data.content)
        if not attach_file_object(api_record, filename, file_type, file_data):
            return
    else:
        try:
            with open(file_path) as f:
                if not attach_file_object(api_record, filename, file_type, f):
                    return
        except IOError:
            error('local file was not found or not readable: %s' % file_path)
            return

    api_record.commit()
    db.session.commit()
    info('File successfully attached.')
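# Illustrative only: a hypothetical wrapper showing how attach_file() might be driven from a
# list of (control_number, path) pairs. The sample control number, paths and file type are
# placeholders, and the call needs an application context with database and storage access.
def attach_pdfs(records_to_fix):
    for control_number, pdf_path in records_to_fix:
        attach_file(control_number, pdf_path, 'pdf', 'article.pdf')

# attach_pdfs([(12345, '/tmp/article.pdf')])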
def update_countries(dry_run, ids, country="HUMAN CHECK"):
    """
    Update affiliation countries for articles that are currently marked with the given country
    (default: "HUMAN CHECK"). Countries are determined with the Google Maps API.
    """
    country_cache = {}
    cache_fails = 0
    total_hits = 0

    # Use parameter ids or, if not given, search for all records with the specified country.
    if ids:
        ids = ids.split(',')
    else:
        search_result = current_search_client.search('records-record', 'record-v1.0.0', {
            'size': 10000,
            'query': {
                'term': {
                    'country': country
                }
            }
        })
        ids = [hit['_source']['control_number'] for hit in search_result['hits']['hits']]
        info('Found %d records having %s as a country of one of the authors.' % (len(ids), country))

    uuids = [PersistentIdentifier.get('recid', recid).object_uuid for recid in ids]
    records = Record.get_records(uuids)

    try:
        for record in records:
            for author_index, author_data in enumerate(record['authors']):
                for aff_index, aff_data in enumerate(author_data['affiliations']):
                    if aff_data['country'] == country:
                        total_hits += 1

                        # cache countries based on old affiliation value to decrease api requests
                        old_value = aff_data['value']
                        if old_value not in country_cache:
                            country_cache[old_value] = get_country(old_value)
                            cache_fails += 1

                        new_country = country_cache[old_value]
                        if new_country:
                            record['authors'][author_index]['affiliations'][aff_index]['country'] = new_country
                            info('Changed country for record with id %s to %s' % (
                                record['control_number'], new_country))
                        else:
                            error('Could not find country for record with id %s (affiliation value: %s)' % (
                                record['control_number'], old_value))
            if not dry_run:
                record.commit()
                db.session.commit()
    except Exception as e:
        print(e)

    info('In total %d countries needed to be updated and %d queries were made to determine the countries.' % (
        total_hits, cache_fails))

    if dry_run:
        error('NO CHANGES were committed to the database, because --dry-run flag was present.')
def japanise():
    size = 100

    def get_query(start_index, size):
        return {
            '_source': ['authors', 'control_number', 'dois', 'publication_info', 'report_numbers', 'arxiv_eprints'],
            'from': start_index,
            'size': size,
            'query': {
                'term': {
                    'country': 'Japan'
                }
            }
        }

    def get_arxiv(data):
        if 'report_numbers' in data:
            for r in data['report_numbers']:
                if r['source'] == 'arXiv':
                    return r['value'].split(':')[1]
            error('no arxiv? %s' % data['control_number'])

        if 'arxiv_eprints' in data:
            return data['arxiv_eprints'][0]['value'].split(':')[1]

        return ''

    index = 0
    total = None

    header = ['year', 'journal', 'doi', 'arxiv number', 'primary arxiv category', 'affiliation',
              'authors with affiliation', 'total number of authors']
    si = StringIO()
    cw = csv.writer(si, delimiter=";")
    cw.writerow(header)

    while total is None or index < total:
        search_results = es.search(index='records-record', doc_type='record-v1.0.0', body=get_query(index, size))
        total = search_results['hits']['total']
        info("%s/%s" % (index, total))
        index += size

        for hit in search_results['hits']['hits']:
            data = hit['_source']

            year = data['publication_info'][0]['year']
            journal = data['publication_info'][0]['journal_title']
            doi = data['dois'][0]['value']
            arxiv = get_arxiv(data)
            arxiv_category = get_arxiv_categories(arxiv)[0] if arxiv else ''
            total_authors = len(data['authors'])

            extracted_affiliations = {}
            for author in data['authors']:
                if 'affiliations' not in author:
                    error('no affiliations for author. %s' % doi)
                    continue

                for aff in author['affiliations']:
                    if aff['country'] == 'Japan':
                        value = aff['value']
                        if value not in extracted_affiliations:
                            extracted_affiliations[value] = 0
                        extracted_affiliations[value] += 1

            if not extracted_affiliations:
                error('no extracted affs')

            for aff, count in extracted_affiliations.items():
                cw.writerow([year, journal, doi, arxiv, arxiv_category, aff.encode('utf8'), count, total_authors])

    with open('/tmp/japanise.csv', 'wt') as f:
        f.write(si.getvalue())