def springer():
    """Scan harvested Springer zip packages for DOIs that are only present in the old repo."""
    DIR = 'JHEP/'
    EXT = ('.xml.Meta', '.xml.scoap')
    BASE_DIR = '/eos/project/s/scoap3repo/BETA/harvesting/Springer/download/' + DIR

    zip_list = listdir(BASE_DIR)
    needed_dois = json.loads(open('/tmp/repo_diff_result2', 'r').read())['only_in_old']

    extracted_dois = {}
    for file in zip_list:
        full_path = BASE_DIR + file
        if isfile(full_path) and full_path.endswith('.zip'):
            try:
                zip_file = ZipFile(full_path)
                for zip_element in zip_file.infolist():
                    fn = zip_element.filename
                    if fn.endswith(EXT):
                        xml = parseString(zip_file.read(zip_element))
                        doi = xml.getElementsByTagName('ArticleDOI')[0].firstChild.nodeValue
                        if doi in needed_dois:
                            if full_path not in extracted_dois:
                                extracted_dois[full_path] = []
                            extracted_dois[full_path].append(doi)
            except BadZipfile as e:
                error('file %s: %s' % (file, e))

    info('%s' % json.dumps(extracted_dois, indent=2))
def hotfix_country_mapping_in_article_impacts():
    def proc(r):
        for k, v in dict(r.results).iteritems():
            new_k = COUNTRIES_DEFAULT_MAPPING.get(k, k)
            if k != new_k:
                info('%d: %s => %s' % (r.control_number, k, new_k))
                r.results[new_k] = v
                r.results.pop(k)
                flag_modified(r, 'results')

    process_all_articles_impact(proc)
    info('ALL DONE')
def process_all_records(function, chunk_size=50, control_ids=(), *args):
    """
    Calls 'function' for all records. If 'control_ids' is set to a non-empty list,
    only those records will be processed.

    :param function: Function to be called for every record. The first parameter
                     will be a RecordMetadata object.
    :param chunk_size: How many records should be queried at once from the db.
    :param control_ids: Control ids of records. If set to a non-empty list, it will
                        be used to filter the records.
    :param args: Args to be passed to 'function'.
    """
    info('gathering records...')

    # query ids from all records
    record_ids = RecordMetadata.query.with_entities(RecordMetadata.id)

    # filter records
    if control_ids:
        info('applying filter for records...')
        uuids = [PersistentIdentifier.get('recid', recid).object_uuid for recid in control_ids]
        record_ids = record_ids.filter(RecordMetadata.id.in_(uuids))

    # get record ids
    record_ids = [r[0] for r in record_ids.all()]
    records_count = len(record_ids)

    processed = 0
    info('start processing %d records...' % records_count)

    # process record chunks
    for i in range((records_count / chunk_size) + 1):
        # calculate chunk start and end position
        ixn = i * chunk_size
        current_ids = record_ids[ixn:ixn + chunk_size]

        # process current chunk
        for record in RecordMetadata.query.filter(RecordMetadata.id.in_(current_ids)):
            try:
                function(record, *args)
            except Exception:
                raise  # TODO Should we handle anything here, or just stop the whole process?
            processed += 1

        # committing processed records
        info('partial commit...')
        db.session.commit()

    info('%s records processed.' % processed)

    # have we processed everything?
    assert processed == records_count
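# Illustrative sketch only (not one of the original hotfixes): the expected shape of a
# 'proc' callback for process_all_records. It mutates record.json and marks it dirty with
# flag_modified so the partial commit above persists the change. The 'titles' placeholder
# and the control number 12345 are made up for the example; rerror/flag_modified are the
# helpers already used throughout this module.
def _example_proc(record):
    """Illustrative callback: ensure every record has a 'titles' list."""
    if record.json is None:
        rerror('record.json is None', record)
        return
    if 'titles' not in record.json:
        record.json['titles'] = []
        flag_modified(record, 'json')
# It would be driven like:
#   process_all_records(_example_proc, 100, (12345,))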
def utf8(ids):
    """Unescape records and store data as unicode."""
    def proc(record):
        if record.json is None:
            rerror('record.json is None', record)
            return
        record.json = utf8rec(record.json)
        flag_modified(record, 'json')

    if ids:
        ids = ids.split(',')

    process_all_records(proc, control_ids=ids)
    info('all done!')
def check_authors():
    RESULT_FILE = '/tmp/check_authors'

    result = {
        'null': set(),
        'noauth': set(),
        'noaff': set(),
        'nocountry': set(),
        'empty_aff': set()
    }

    def proc(record):
        key = ''
        if not record.json:
            key = 'null'
        elif 'authors' not in record.json:
            key = 'noauth'
        else:
            for a in record.json['authors']:
                if 'affiliations' not in a:
                    key = 'noaff'
                    break
                elif not a['affiliations']:
                    key = 'empty_aff'
                    break
                else:
                    for aff in a['affiliations']:
                        if 'country' not in aff:
                            key = 'nocountry'
                            break

        if key:
            result[key].add(record.id)

    process_all_records(proc)

    for k, v in result.items():
        pids = PersistentIdentifier.query\
            .filter(PersistentIdentifier.pid_type == 'recid')\
            .filter(PersistentIdentifier.object_uuid.in_(v)).all()
        result[k + '_c'] = map(lambda x: x.pid_value, pids)
        result[k] = map(six.text_type, v)

    result_str = json.dumps(result, indent=2)
    with open(RESULT_FILE, 'wt') as f:
        f.write(result_str)

    info(result_str)
    info('DONE')
def extract_year_from_record_creation():
    def proc(record):
        if not record.json:
            rerror('no json.', record)
            return

        if 'record_creation_year' not in record.json:
            date = parse_date(record.json['record_creation_date'])
            if not date:
                rerror("Date couldn't be parsed: %s" % record.json['record_creation_date'], record)
                return
            record.json['record_creation_year'] = date.year
            flag_modified(record, 'json')

    process_all_records(proc)
    info('ALL DONE')
def update_countries(dry_run, ids):
    """
    Updates the affiliation countries for articles.
    Countries are determined with the Google Maps API.
    """
    counts = {'changed': 0, 'all': 0}

    if ids:
        ids = ids.split(',')

    def proc(record):
        try:
            if 'authors' not in record.json:
                error('no authors for record %s' % record.json['control_number'])
                return

            for author_index, author_data in enumerate(record.json['authors']):
                if 'affiliations' not in author_data:
                    error('no affiliations for record %s' % record.json['control_number'])
                    continue

                for aff_index, aff_data in enumerate(author_data['affiliations']):
                    counts['all'] += 1

                    new_country = find_country(aff_data['value'])
                    if aff_data['country'] != new_country:
                        counts['changed'] += 1
                        info('Changed country for record with id %s from %s to %s' % (
                            record.json['control_number'], aff_data['country'], new_country))
                        record.json['authors'][author_index]['affiliations'][aff_index]['country'] = new_country
                        if not dry_run:
                            flag_modified(record, 'json')
        except Exception as e:
            error(str(e))

    process_all_records(proc, control_ids=ids)

    if dry_run:
        error('NO CHANGES were committed to the database, because --dry-run flag was present.')

    info("%s\nDONE." % counts)
def unescaperecords(ids):
    """HTML unescape abstract and title for all records."""
    parser = HTMLParser()

    def proc(record, parser):
        if record.json is None:
            rerror('record.json is None', record)
            return

        unescape_abstract(record, parser)
        unescape_titles(record, parser)

    if ids:
        ids = ids.split(',')

    process_all_records(proc, 50, ids, parser)
    info('all done!')
def hotfix_country_mapping():
    ids = (29476, 44219, 44220)

    def proc(record):
        """Fix country mappings..."""
        if record.json and 'authors' in record.json:
            for i, a in enumerate(record.json['authors']):
                for i2, aff in enumerate(a.get('affiliations', ())):
                    c = aff.get('country')
                    new_c = find_country(aff['value'])
                    if c != new_c:
                        rinfo('%s -> %s (%s)' % (c, new_c, aff['value']), record)
                        record.json['authors'][i]['affiliations'][i2]['country'] = new_c
                        flag_modified(record, 'json')

    process_all_records(proc, control_ids=ids)
    info('ALL DONE')
def get_country_for_aff(x_aff):
    # The XML could have other representations for certain organizations?
    ORGS = ('CERN', 'JINR')

    organizations = [c.childNodes[0].nodeValue
                     for c in x_aff.getElementsByTagName('sa:organization')]
    common = set(organizations).intersection(ORGS)
    if common:
        return common.pop()

    country = x_aff.getElementsByTagName('sa:country')
    if country:
        return country[0].childNodes[0].nodeValue

    info('No country in XML. Falling back to google maps.')
    country = get_country(x_aff.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue)
    if country:
        return country

    error("Google didn't help.")
    return 'HUMAN CHECK'
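# Illustrative sketch only (the XML below is a hand-made stand-in, not a harvested file,
# and the namespace URIs are placeholders): the shape of the <ce:affiliation> element
# get_country_for_aff expects. minidom matches the prefixed tag names literally, so
# 'sa:organization', 'sa:country' and 'ce:textfn' must appear exactly like this. It
# assumes the module-level parseString import used by the harvesters above.
def _example_get_country_for_aff():
    demo = parseString(
        '<ce:affiliation xmlns:ce="http://example.org/ce" xmlns:sa="http://example.org/sa">'
        '<ce:textfn>CERN, Geneva, Switzerland</ce:textfn>'
        '<sa:affiliation>'
        '<sa:organization>CERN</sa:organization>'
        '<sa:country>Switzerland</sa:country>'
        '</sa:affiliation>'
        '</ce:affiliation>')
    # the organization whitelist match wins over <sa:country>
    return get_country_for_aff(demo.documentElement)  # -> 'CERN'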
def add_primary_arxiv_categories():
    def proc(article_impact):
        try:
            if 'arxiv_primary_category' in article_impact.details:
                return

            pid = PersistentIdentifier.get('recid', article_impact.control_number)
            record = Record.get_record(pid.object_uuid)
            if not record:
                return

            if 'arxiv_eprints' in record:
                info('%d: eprints found' % article_impact.control_number)
                arxiv = (record['arxiv_eprints'][0]['value'].split(':')[1]).split('v')[0]
                cat = get_arxiv_categories(arxiv)[0]
                info('category: %s' % cat)
                if cat:
                    article_impact.details['arxiv_primary_category'] = cat
                    flag_modified(article_impact, 'details')
            elif 'report_numbers' in record:
                info('%d: report_numbers found' % article_impact.control_number)
                cat = get_arxiv_primary_category(record)
                info('category: %s' % cat)
                if cat:
                    article_impact.details['arxiv_primary_category'] = cat
                    flag_modified(article_impact, 'details')
            else:
                error('%d: no arxiv' % article_impact.control_number)

        except PIDDoesNotExistError:
            # records imported from Inspire won't be found
            pass
        except AttributeError as e:
            error('%d: %s' % (article_impact.control_number, e))

    process_all_articles_impact(proc)
    info('DONE.')
def attach_file(control_number, file_path, file_type, filename):
    """
    Attach a file to an already existing record.

    file_path can point to a local file; http and https URLs are also supported.
    Sending specific headers for these protocols is not supported, so make sure the
    website doesn't require any.

    If the record already has a file with the given filename, it will be overwritten.
    """
    # get existing record
    try:
        api_record = APIRecord.get_record(
            PersistentIdentifier.get('recid', control_number).object_uuid)
    except (PIDDoesNotExistError, NoResultFound):
        error('No record found for given control number!')
        return

    # read and attach file
    if file_path.startswith('http://') or file_path.startswith('https://'):
        data = requests_retry_session().get(file_path)

        if data.status_code != 200:
            error('Could not download file. Status code: %d' % data.status_code)
            return

        file_data = StringIO(data.content)
        if not attach_file_object(api_record, filename, file_type, file_data):
            return
    else:
        try:
            with open(file_path) as f:
                if not attach_file_object(api_record, filename, file_type, f):
                    return
        except IOError:
            error('local file was not found or not readable: %s' % file_path)
            return

    api_record.commit()
    db.session.commit()
    info('File successfully attached.')
def delete_file(control_number, key):
    """Deletes a file attached to a record."""
    # get existing record
    try:
        api_record = APIRecord.get_record(
            PersistentIdentifier.get('recid', control_number).object_uuid)
    except (PIDDoesNotExistError, NoResultFound):
        error('No record found for given control number!')
        return

    if key not in api_record.files:
        error('Defined key is not present.')
        return

    del api_record.files[key]

    api_record.commit()
    db.session.commit()
    info('File successfully deleted.')
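# Hypothetical usage sketch for the two commands above (the control number, URL and
# filename are made up): attach a remotely hosted PDF to record 12345, overwriting any
# existing 'article.pdf', and remove it again by its key.
#
#   attach_file(12345, 'https://example.org/files/article.pdf', 'pdf', 'article.pdf')
#   delete_file(12345, 'article.pdf')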
def hotfix_country_mapping():
    ids = ()

    def proc(record):
        """Fix country mappings..."""
        if record.json and 'authors' in record.json:
            for i, a in enumerate(record.json['authors']):
                for i2, aff in enumerate(a.get('affiliations', ())):
                    c = aff['country']
                    new_c = find_country(aff['value'])
                    if c != new_c:
                        rinfo('%s -> %s (%s)' % (c, new_c, aff['value']), record)
                        record.json['authors'][i]['affiliations'][i2]['country'] = new_c
                        flag_modified(record, 'json')

    process_all_records(proc, control_ids=ids)
    info('ALL DONE')
def check_country_share():
    RESULT_FILE = '/tmp/cs_test'
    data = {'countries': {}, 'not_one': set()}

    def proc(article_impact):
        for country, val in article_impact.results.items():
            if country not in data['countries']:
                data['countries'][country] = 0
            data['countries'][country] += val

        try:
            record = Record.get_record(
                PersistentIdentifier.get('recid', article_impact.control_number).object_uuid)
            author_count = len(record['authors'])
        except PIDDoesNotExistError:
            author_count = len(article_impact.details['authors'])

        sum_values = sum(article_impact.results.values())
        if sum_values != author_count:
            data['not_one'].add((article_impact.control_number, sum_values, author_count))

    process_all_articles_impact(proc)

    data['not_one'] = list(data['not_one'])

    data['missing_gdp'] = []
    all_country = [g.name for g in Gdp.query.all()]
    for c in data['countries'].keys():
        if c not in all_country:
            data['missing_gdp'].append(c)

    data['countries'] = sorted(data['countries'].items(), key=lambda x: x[0])

    result_str = json.dumps(data, indent=2)
    with open(RESULT_FILE, 'wt') as f:
        f.write(result_str)

    info('DONE')
def fix_doi_dates(json_file, dry_run):
    """Fixes the imprint/publication/copyright dates on a list of DOIs."""
    with open(json_file) as _file:
        dois_with_dates = json.load(_file)

    for doi in dois_with_dates.keys():
        search_result = current_search_client.search(
            'scoap3-records-record',
            q='dois.value:"{}"'.format(doi))['hits']['hits']
        if search_result:
            uuid = search_result[0]['_id']
            rec = Record.get_record(uuid)

            date = dois_with_dates[doi]
            year = int(date.split('-')[0])

            old_date = rec['imprints'][0]['date']
            rec['imprints'][0]['date'] = date
            rec['publication_info'][0]['year'] = year
            rec['copyright'][0]['year'] = year
            info('DOI {} with UUID {}: changed {} -> {}'.format(doi, uuid, old_date, date))

            if not dry_run:
                rec.commit()
                db.session.commit()
                info('{} successfully updated.'.format(doi))
        else:
            error('DOI {} not found in ES.'.format(doi))

    if dry_run:
        error('NO CHANGES were committed to the database, because --dry-run flag was present.')
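# The JSON file passed to fix_doi_dates is expected to map each DOI to the corrected date
# string; the year is taken from the first dash-separated component. A made-up example:
#
#   {
#     "10.1016/j.physletb.2018.01.001": "2018-01-31",
#     "10.1007/JHEP01(2019)001": "2019-01-07"
#   }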
def empty_author():
    missing_authors = []

    def proc_find(record):
        if record.json and 'authors' in record.json:
            for a in record.json['authors']:
                s = sum(map(bool, a.values()))
                if s == 0:
                    rerror('error', record)
                    missing_authors.append(record.id)
                    return

    # process_all_records(proc_find)
    # missing_authors2 = list(map(lambda recid: PersistentIdentifier.query\
    #     .filter(PersistentIdentifier.pid_type == 'recid')\
    #     .filter(PersistentIdentifier.object_uuid == recid).one().pid_value, missing_authors))
    # info(json.dumps(missing_authors2, indent=2))

    def proc_delete(record):
        to_delete = []
        for i, a in enumerate(record.json['authors']):
            s = sum(map(bool, a.values()))
            if s == 0:
                to_delete.append(i)

        if to_delete:
            # delete from the end so earlier deletions don't shift the remaining indexes
            for d in sorted(to_delete, reverse=True):
                del record.json['authors'][d]
            flag_modified(record, 'json')
            info('DELETE %d authors' % len(to_delete))

    control_ids = [22647, 21193, 14535, 10195, 16281, 16197, 9110, 4336, 21274, 22399, 1156,
                   14391, 22126, 22633, 22433, 22217, 10402, 22208, 20511, 3059, 2926, 4780,
                   1232, 2513, 22388, 10523, 22606, 12874, 22853, 22789, 4021, 13026, 3073,
                   1899, 20297, 4185, 1311, 23074]
    process_all_records(proc_delete, control_ids=control_ids)
    info('done')
def elsevier():
    """Scan harvested Elsevier tar packages for DOIs that are only present in the old repo."""
    EXT = 'main.xml'
    BASE_DIR = '/eos/project/s/scoap3repo/BETA/harvesting/Elsevier/download/'
    RESULT_FILE = '/tmp/elsevier'

    tar_list = listdir(BASE_DIR)
    needed_dois = json.loads(open('/tmp/repo_diff_result5', 'r').read())['only_in_old']
    from_date = datetime.now() - timedelta(days=365)
    to_date = datetime.now() - timedelta(days=60)

    info('found %d files in base dir.' % len(tar_list))

    extracted_dois = {}
    for file in tar_list:
        full_path = BASE_DIR + file
        creation_date = datetime.utcfromtimestamp(getctime(full_path))
        if isfile(full_path) and full_path.endswith('.tar') and from_date <= creation_date <= to_date:
            try:
                tar = tarfile.open(full_path, 'r')
                for element in tar.getmembers():
                    if element.name.endswith(EXT):
                        xml = parseString(tar.extractfile(element).read())
                        doi = xml.getElementsByTagName('item-info')[0]\
                            .getElementsByTagName('ce:doi')[0].firstChild.nodeValue
                        if doi in needed_dois:
                            if full_path not in extracted_dois:
                                extracted_dois[full_path] = []
                            extracted_dois[full_path].append(doi)
                            info('found %s in %s' % (doi, file))
                    else:
                        pass
                        # info('ignoring file: %s' % element.name)
            except (tarfile.TarError, ExpatError) as e:
                error('file %s: %s' % (file, e))

    info('%s' % json.dumps(extracted_dois, indent=2))
    with open(RESULT_FILE, 'wt') as f:
        f.write(json.dumps(extracted_dois, indent=2))
def hotfix_els_countries():
    """Hotfix for updating countries from xml"""
    ids = (
        18758, 19841, 21407, 21896, 22903, 24301, 40311, 23504, 23866, 23613, 23661, 23861, 23725, 24005, 23867,
        15590, 16071, 15938, 15943, 15867, 15931, 16014, 15940, 15942, 16196, 15851, 15817, 15789, 15790, 15745,
        25282, 25288, 24955, 25442, 25376, 25346, 25277, 40576, 40629, 40677, 40680, 40813, 23974, 24958, 24932,
        40833, 25272, 25265, 24434, 25301, 25303, 25299, 25261, 24811, 24810, 24809, 24860, 24848, 24815, 24825,
        24571, 40834, 40766, 40838, 40900, 40906, 23424, 23411, 23237, 23040, 23195, 23060, 23221, 23414, 23081,
        23419, 23130, 23134, 23211, 23017, 23451, 23235, 40240, 40279, 40288, 40487, 40435, 25292, 25426, 25400,
        25399, 25522, 40392, 40583, 40575, 40665, 40245, 40242, 25309, 40633, 25467, 25468, 25471, 40678, 40291,
        40285, 40343, 25328, 25445, 40910, 40911, 40679, 40540, 40812, 40839, 40438, 40728, 40681, 40884, 40885,
        40858, 40932, 40901, 40904, 40928, 40962, 40963, 41570, 41572, 41573, 41585, 41588, 41594, 41595, 41598,
        41599, 41601, 41602, 41605, 41612, 41613, 41617, 41618, 41627, 41628, 41631, 41637, 41640, 41641, 41678,
        41692, 41702, 41740, 41810, 41837, 41857, 41944, 41977, 41979, 42005, 42049, 42050, 42099, 42116, 42155,
        42156, 42174, 42215, 42221, 42225, 42259, 42286, 42300, 42307, 42308, 42341, 42344, 42351, 42385, 42422,
        42424, 42456, 42458, 42485, 42505, 43068, 43070, 43071, 43072, 43080, 43082, 43084, 43089, 43092, 43093,
        43096, 43098, 43109, 43110, 43113, 43114, 43116, 43118, 43120, 43121, 43127, 43129, 43150, 43154, 43170,
        43171, 43173, 43174, 43176, 43200, 43213, 43224, 43226, 43227, 43230, 43237, 43269, 43288, 43290, 43303,
        43305, 43314,
    )

    def proc(record):
        rinfo('start...', record)

        if '_files' not in record.json:
            rerror('Skipping. No _files', record)
            return

        xml = filter(lambda x: x['filetype'] == 'xml', record.json['_files'])
        if not xml:
            rerror('Skipping. No xml in _files', record)
            return

        object = ObjectVersion.get(xml[0]['bucket'], xml[0]['key'])
        uri = object.file.uri

        xml = parse(open(uri, 'rt'))
        x_author_groups = xml.getElementsByTagName('ce:author-group')

        if not x_author_groups:
            rerror('Skipping. No author groups.', record)
            return
        if len(x_author_groups) > 1:
            rerror('Skipping. MORE THAN ONE author group. Not supported.', record)
            return

        for x_author_group in x_author_groups:
            x_collaborations = x_author_group.getElementsByTagName('ce:collaboration')
            x_affiliations = x_author_group.getElementsByTagName('ce:affiliation')

            # needed for supporting multiple author groups with author matching,
            # but author matching is not really possible.
            # authors_in_group = [
            #     (c.getElementsByTagName('ce:given-name')[0].childNodes[0].nodeValue.replace('-', '').title(),
            #      c.getElementsByTagName('ce:surname')[0].childNodes[0].nodeValue.replace('-', '').title())
            #     for c in x_author_group.getElementsByTagName('ce:author')
            # ]

            if 'authors' not in record.json:
                # Type 1 and 3: has no authors at all. Fix: add collaborations if there are affiliations in xml.
                rerror('No authors... SKIPPING', record)
                return

                # extract collaborations, find countries later
                # FIXME we should always extract collaborations, but that would cause a lot more problems now.
                authors = [{'full_name': c.getElementsByTagName('ce:text')[0].childNodes[0].nodeValue}
                           for c in x_collaborations]
                if authors:
                    rinfo('Collaborations found: %s' % authors, record)
                    record.json['authors'] = authors
                else:
                    rerror('No collaborations. Not fixable.', record)

            # possibly we added authors in the previous step.
            if 'authors' in record.json:
                # Type 2 and 4: has authors, but no affiliations.
                authors = record.json['authors']
                aff_count = sum(map(lambda x: 'affiliations' in x, authors))

                if aff_count == 0:
                    # Type 4: No affiliations in data.
                    new_affs = [
                        {u'country': get_country_for_aff(a),
                         u'value': a.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue}
                        for a in x_affiliations]
                    if new_affs:
                        rinfo('New affiliations: %s' % new_affs, record)

                        # FIXME modify this, if multiple author groups should be supported
                        # FIXME (not all authors should be updated)!!!
                        # update_authors(record, authors_in_group, new_affs)

                        for i, a in enumerate(record.json.get('authors')):
                            record.json['authors'][i]['affiliations'] = new_affs
                        flag_modified(record, 'json')
                    else:
                        rerror('No affiliations at all. Not fixable.', record)

                elif aff_count == len(authors):
                    empty_aff_count = sum(map(lambda x: len(x['affiliations']) == 0, authors))
                    if empty_aff_count == len(authors):
                        # Type 2: Only empty affiliations.
                        rinfo('Type 2. Not fixable.', record)
                    else:
                        rerror('Only SOME authors have EMPTY affiliations. What now?', record)
                else:
                    rerror('Only SOME authors have affiliations. What now?', record)

        rinfo('OK', record)

    process_all_records(proc, control_ids=ids)
    info('ALL DONE')
def update_countries(dry_run, ids, country="HUMAN CHECK"):
    """
    Updates countries for articles whose affiliation country is marked with the given
    value (by default "HUMAN CHECK"). Countries are determined with the Google Maps API.
    """
    country_cache = {}
    cache_fails = 0
    total_hits = 0

    # Use parameter ids or, if not given, search for all records with the specified country.
    if ids:
        ids = ids.split(',')
    else:
        search_result = current_search_client.search(
            'records-record', 'record-v1.0.0',
            {'size': 10000, 'query': {'term': {'country': country}}})
        ids = [hit['_source']['control_number'] for hit in search_result['hits']['hits']]
        info('Found %d records having %s as a country of one of the authors.' % (len(ids), country))

    uuids = [PersistentIdentifier.get('recid', recid).object_uuid for recid in ids]
    records = Record.get_records(uuids)

    try:
        for record in records:
            for author_index, author_data in enumerate(record['authors']):
                for aff_index, aff_data in enumerate(author_data['affiliations']):
                    if aff_data['country'] == country:
                        total_hits += 1

                        # cache countries based on old affiliation value to decrease api requests
                        old_value = aff_data['value']
                        if old_value not in country_cache:
                            country_cache[old_value] = get_country(old_value)
                            cache_fails += 1

                        new_country = country_cache[old_value]
                        if new_country:
                            record['authors'][author_index]['affiliations'][aff_index]['country'] = new_country
                            info('Changed country for record with id %s to %s' % (
                                record['control_number'], new_country))
                        else:
                            error('Could not find country for record with id %s (affiliation value: %s)' % (
                                record['control_number'], old_value))

            if not dry_run:
                record.commit()
                db.session.commit()
    except Exception as e:
        print(e)

    info('In total %d countries needed to be updated and %d queries were made to determine the countries.' % (
        total_hits, cache_fails))
    if dry_run:
        error('NO CHANGES were committed to the database, because --dry-run flag was present.')
def repos_diff():
    """Compare the DOIs in the old repo dump with the records in the current one and dump the diff."""
    OLD_REPO_FILE = '/tmp/old_repo_dump4'
    OLD_REPO_URL = 'https://repo.scoap3.org/search?p=&of=recjson&ot=recid,doi,creation_date&rg=100000000'
    COOKIES = {
        'INVENIOSESSION': 'd3c673cf6be468dc6c6fd25703ff90c3',
        'INVENIOSESSIONstub': 'HTTPS',
        '_pk_id.10.1cdf': 'ff8bdd9962372712.1536586766.49.1546956598.1546955767.'
    }
    RESULT_FILE = '/tmp/repo_diff_result9'

    if not isfile(OLD_REPO_FILE):
        info('No old repo file (%s), downloading...' % OLD_REPO_FILE)
        data = requests_retry_session().get(OLD_REPO_URL, cookies=COOKIES).json()
        info('download complete (%d records), mapping...' % len(data))

        if len(data) < 1000:
            error('Aborting, not all records were queried.')
            return

        mapped_data = {}
        for r in data:
            doi = r.pop('doi')
            if doi in mapped_data:
                error('Multiple records with doi. %s' % r)
            mapped_data[doi] = r

        info('mapping complete, saving file...')
        with open(OLD_REPO_FILE, 'wt') as f:
            f.write(json.dumps(mapped_data))
        info('File saved.')

    info('reading old repo data from: %s' % OLD_REPO_FILE)
    with open(OLD_REPO_FILE, 'rt') as f:
        old_data = json.loads(f.read())

    result = dict(only_in_old=[], only_in_new=[], in_both=[])

    def proc(record):
        if not record.json:
            return
        doi = get_first_doi(record.json)
        if doi in old_data:
            result['in_both'].append(doi)
            old_data.pop(doi)
        else:
            result['only_in_new'].append(doi)

    process_all_records(proc)
    result['only_in_old'] = map(lambda x: x[0], old_data.iteritems())

    with open(RESULT_FILE, 'wt') as f:
        f.write(json.dumps(result, indent=2))

    info('only_in_old: %s\nonly_in_new: %s\nin_both: %s\nALL DONE.' % (
        len(result['only_in_old']), len(result['only_in_new']), len(result['in_both'])))
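# The diff file written by repos_diff groups DOIs into three lists; the DOIs below are
# made up, only the shape is real:
#
#   {
#     "only_in_old": ["10.1016/j.example.2017.001"],
#     "only_in_new": ["10.1007/JHEP00(2018)000"],
#     "in_both": ["10.1016/j.example.2018.002"]
#   }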
def hotfix_els_countries():
    """Hotfix for updating countries from xml"""
    ids = (44264, 24944, 24850, 16040, 23414, 15632, 15820, 24786, 15937, 25306,
           15819, 40393, 15681, 23089, 23019)

    def get_aff_by_id(x_author_group, aff_id):
        for x_affiliation in x_author_group.getElementsByTagName('ce:affiliation'):
            id = x_affiliation.attributes.get('id').value
            if id == aff_id:
                return x_affiliation.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue

        error('No affiliation for id: %s' % aff_id)
        return None

    def proc(record):
        rinfo('start...', record)

        if '_files' not in record.json:
            rerror('Skipping. No _files', record)
            return

        xml = filter(lambda x: x['filetype'] == 'xml', record.json['_files'])
        if not xml:
            rerror('Skipping. No xml in _files', record)
            return

        object = ObjectVersion.get(xml[0]['bucket'], xml[0]['key'])
        uri = object.file.uri

        xml = parse(open(uri, 'rt'))
        x_author_groups = xml.getElementsByTagName('ce:author-group')

        if not x_author_groups:
            rerror('Skipping. No author groups.', record)
            return

        if len(x_author_groups) > 1:
            rinfo('Reparse all authors.', record)
            authors = []

            for x_author_group in x_author_groups:
                # skip if not deepest author-group
                if x_author_group.getElementsByTagName('ce:author-group'):
                    continue

                # extract affiliations
                x_affiliations = x_author_group.getElementsByTagName('ce:affiliation')
                affs = []
                for a in x_affiliations:
                    value = a.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue
                    affs.append({
                        u'country': find_country(value),
                        u'value': value
                    })

                # extract authors, add affiliations
                x_authors = x_author_group.getElementsByTagName('ce:author')
                for x_author in x_authors:
                    given_name = x_author.getElementsByTagName('ce:given-name')[0].childNodes[0].nodeValue
                    surname = x_author.getElementsByTagName('ce:surname')[0].childNodes[0].nodeValue
                    full_name = '%s, %s' % (surname, given_name)

                    author_affs = []
                    for ref in x_author.getElementsByTagName('ce:cross-ref'):
                        affid = ref.attributes.get('refid').value
                        if 'aff' in affid:
                            aff_value = get_aff_by_id(x_author_group, affid)
                            aff_country = find_country(aff_value)
                            author_affs.append({
                                u'country': aff_country,
                                u'value': aff_value
                            })

                    if not (author_affs or affs):
                        rerror('no affs for author: %s. Skip this record.' % surname, record)
                        return

                    authors.append({
                        'full_name': full_name,
                        'given_name': given_name,
                        'surname': surname,
                        'affiliations': author_affs or affs
                    })

            if authors:
                record.json['authors'] = authors
                flag_modified(record, 'json')
                rinfo('updated', record)
            else:
                rerror('No authors found', record)
        else:
            for x_author_group in x_author_groups:
                x_collaborations = x_author_group.getElementsByTagName('ce:collaboration')
                x_affiliations = x_author_group.getElementsByTagName('ce:affiliation')

                # needed for supporting multiple author groups with author matching,
                # but author matching is not really possible.
                # authors_in_group = [
                #     (c.getElementsByTagName('ce:given-name')[0].childNodes[0].nodeValue.replace('-', '').title(),
                #      c.getElementsByTagName('ce:surname')[0].childNodes[0].nodeValue.replace('-', '').title())
                #     for c in x_author_group.getElementsByTagName('ce:author')
                # ]

                if 'authors' not in record.json:
                    # Type 1 and 3: has no authors at all. Fix: add collaborations if there are affiliations in xml.
                    rerror('No authors... SKIPPING', record)
                    return

                    # extract collaborations, find countries later
                    # FIXME we should always extract collaborations, but that would cause a lot more problems now.
                    authors = [{'full_name': c.getElementsByTagName('ce:text')[0].childNodes[0].nodeValue}
                               for c in x_collaborations]
                    if authors:
                        rinfo('Collaborations found: %s' % authors, record)
                        record.json['authors'] = authors
                    else:
                        rerror('No collaborations. Not fixable.', record)

                # possibly we added authors in the previous step.
                if 'authors' in record.json:
                    # Type 2 and 4: has authors, but no affiliations.
                    authors = record.json['authors']
                    aff_count = sum(map(lambda x: 'affiliations' in x, authors))

                    if aff_count == 0:
                        # Type 4: No affiliations in data.
                        new_affs = [
                            {u'country': find_country(a.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue),
                             u'value': a.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue}
                            for a in x_affiliations]
                        if new_affs:
                            rinfo('New affiliations: %s' % new_affs, record)

                            # FIXME modify this, if multiple author groups should be supported
                            # FIXME (not all authors should be updated)!!!
                            # update_authors(record, authors_in_group, new_affs)

                            for i, a in enumerate(record.json.get('authors')):
                                record.json['authors'][i]['affiliations'] = new_affs
                            flag_modified(record, 'json')
                        else:
                            rerror('No affiliations at all. Not fixable.', record)

                    elif aff_count == len(authors):
                        empty_aff_count = sum(map(lambda x: len(x['affiliations']) == 0, authors))
                        if empty_aff_count == len(authors):
                            # Type 2: Only empty affiliations.
                            rinfo('Type 2. Not fixable.', record)
                        else:
                            rerror('Only SOME authors have EMPTY affiliations. What now?', record)
                    else:
                        rerror('Only SOME authors have affiliations. What now?', record)

        rinfo('OK', record)

    process_all_records(proc, control_ids=ids)
    info('ALL DONE')
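# Illustrative sketch only (hand-made XML, not a harvested file): the ce:author-group
# structure the reparse branch above walks. Each ce:author points at its affiliation via
# a <ce:cross-ref refid="..."> whose refid matches the id attribute of a <ce:affiliation>,
# which is what get_aff_by_id resolves before find_country is applied to the textfn value.
#
#   <ce:author-group>
#     <ce:author>
#       <ce:given-name>Jane</ce:given-name>
#       <ce:surname>Doe</ce:surname>
#       <ce:cross-ref refid="aff0010"/>
#     </ce:author>
#     <ce:affiliation id="aff0010">
#       <ce:textfn>CERN, Geneva, Switzerland</ce:textfn>
#     </ce:affiliation>
#   </ce:author-group>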
def japanise():
    """Export a CSV of Japanese affiliations (year, journal, doi, arxiv data, share of authors)."""
    size = 100

    def get_query(start_index, size):
        return {
            '_source': ['authors', 'control_number', 'dois', 'publication_info',
                        'report_numbers', 'arxiv_eprints'],
            'from': start_index,
            'size': size,
            'query': {
                'term': {
                    'country': 'Japan'
                }
            }
        }

    def get_arxiv(data):
        if 'report_numbers' in data:
            for r in data['report_numbers']:
                if r['source'] == 'arXiv':
                    return r['value'].split(':')[1]
            error('no arxiv? %s' % data['control_number'])

        if 'arxiv_eprints' in data:
            return data['arxiv_eprints'][0]['value'].split(':')[1]

        return ''

    index = 0
    total = None

    header = ['year', 'journal', 'doi', 'arxiv number', 'primary arxiv category',
              'affiliation', 'authors with affiliation', 'total number of authors']
    si = StringIO()
    cw = csv.writer(si, delimiter=";")
    cw.writerow(header)

    while total is None or index < total:
        search_results = es.search(index='records-record', doc_type='record-v1.0.0',
                                   body=get_query(index, size))
        total = search_results['hits']['total']
        info("%s/%s" % (index, total))
        index += size

        for hit in search_results['hits']['hits']:
            data = hit['_source']

            year = data['publication_info'][0]['year']
            journal = data['publication_info'][0]['journal_title']
            doi = data['dois'][0]['value']
            arxiv = get_arxiv(data)
            arxiv_category = get_arxiv_categories(arxiv)[0] if arxiv else ''
            total_authors = len(data['authors'])

            extracted_affiliations = {}
            for author in data['authors']:
                if 'affiliations' not in author:
                    error('no affiliations for author. %s' % doi)
                    continue

                for aff in author['affiliations']:
                    if aff['country'] == 'Japan':
                        value = aff['value']
                        if value not in extracted_affiliations:
                            extracted_affiliations[value] = 0
                        extracted_affiliations[value] += 1

            if not extracted_affiliations:
                error('no extracted affs')

            for aff, count in extracted_affiliations.items():
                cw.writerow([year, journal, doi, arxiv, arxiv_category,
                             aff.encode('utf8'), count, total_authors])

    with open('/tmp/japanise.csv', 'wt') as f:
        f.write(si.getvalue())