def test_categories():
    """Categories in the stored arXiv API answer for 1811.00370 are extracted.

    The arXiv endpoint is replaced by ``requests_mock`` and serves the
    ``arxiv/1811.00370.xml`` fixture, so no network access takes place.
    """
    expected_categories = ['hep-th', 'gr-qc', 'math-ph', 'math.MP']
    canned_response = read_response('arxiv', '1811.00370.xml')

    with requests_mock.Mocker() as mocked_api:
        mocked_api.get(
            'http://export.arxiv.org/api/query?search_query=id:1811.00370',
            text=canned_response,
        )
        found = get_arxiv_categories('1811.00370')

    assert found == expected_categories
def test_empty_response():
    """An arXiv API answer served from ``arxiv/empty.xml`` gives no categories.

    The mocked endpoint is queried for the id ``not_found`` and the result
    is expected to be an empty list.
    """
    canned_response = read_response('arxiv', 'empty.xml')

    with requests_mock.Mocker() as mocked_api:
        mocked_api.get(
            'http://export.arxiv.org/api/query?search_query=id:not_found',
            text=canned_response,
        )
        found = get_arxiv_categories('not_found')

    assert found == []
def test_ambiguous_title():
    """Searching by the partial title 'hep' is expected to give no categories.

    The mocked arXiv endpoint answers the title query with the
    ``arxiv/search_title_hep.xml`` fixture.
    """
    title = 'hep'
    canned_response = read_response('arxiv', 'search_title_hep.xml')
    query_url = 'http://export.arxiv.org/api/query?search_query=ti:"%s"' % title

    with requests_mock.Mocker() as mocked_api:
        mocked_api.get(query_url, text=canned_response)
        found = get_arxiv_categories(title=title)

    assert found == []
def test_categories():
    """Categories in the stored arXiv API answer for 1811.00370 are extracted.

    The fixture ``arxiv/1811.00370.xml`` is read in binary mode from the
    response directory and served by a mocked arXiv endpoint.
    """
    fixture_path = path.join(get_response_dir(), 'arxiv', '1811.00370.xml')
    with open(fixture_path, 'rb') as fixture:
        canned_response = fixture.read()

    expected_categories = ['hep-th', 'gr-qc', 'math-ph', 'math.MP']

    with requests_mock.Mocker() as mocked_api:
        mocked_api.get(
            'http://export.arxiv.org/api/query?search_query=id:1811.00370',
            text=canned_response,
        )
        found = get_arxiv_categories('1811.00370')

    assert found == expected_categories
def test_empty_response():
    """An arXiv API answer served from ``arxiv/empty.xml`` gives no categories.

    The fixture is read in binary mode from the response directory and served
    by a mocked arXiv endpoint for the id ``not_found``.
    """
    fixture_path = path.join(get_response_dir(), 'arxiv', 'empty.xml')
    with open(fixture_path, 'rb') as fixture:
        canned_response = fixture.read()

    with requests_mock.Mocker() as mocked_api:
        mocked_api.get(
            'http://export.arxiv.org/api/query?search_query=id:not_found',
            text=canned_response,
        )
        found = get_arxiv_categories('not_found')

    assert found == []
def add_arxiv_category(obj, eng):
    """Attach arXiv categories to the eprints that do not carry them yet.

    Every entry of ``obj.data['arxiv_eprints']`` is inspected:

    * an entry without ``value`` is reported through ``logger.warning``
      (together with the article's first DOI) and skipped;
    * an entry that already has ``categories`` is left untouched;
    * any other entry gets the result of ``get_arxiv_categories`` for its
      ``value`` stored under ``categories``.

    ``eng`` is accepted but not used here.
    """
    if "arxiv_eprints" not in obj.data:
        return

    for eprint in obj.data.get("arxiv_eprints", ()):
        if 'value' in eprint:
            arxiv_id = eprint['value']
            if 'categories' not in eprint:
                eprint['categories'] = get_arxiv_categories(arxiv_id)
        else:
            logger.warning('arxiv_eprints value missing for article with doi: %s' % get_first_doi(obj))
def _get_arxiv_category_from_arxiv(item):
    """
    Try querying arXiv for the category.

    The arXiv id is not available at this point, so arXiv is searched by the
    item's ``DOI`` first and by ``title[0]`` second. The first category of the
    first non-empty answer is returned; ``None`` if neither search gives one.
    """
    lookups = (('doi', 'DOI'), ('title', 'title[0]'))

    for query_param, item_path in lookups:
        search_value = get_value(item, item_path)
        found = get_arxiv_categories(**{query_param: search_value})
        if found:
            return found[0]

    return None
def add_arxiv_category(obj, eng):
    """Complete the arXiv eprints of a workflow object with their categories.

    Entries of ``obj.data['arxiv_eprints']`` that lack a ``value`` are
    reported with ``logger.warning`` (including the first DOI of the article)
    and skipped. For the remaining entries ``get_arxiv_categories`` is called
    only when no ``categories`` key is present, and its result is stored
    there.

    ``eng`` is accepted but not used here.
    """
    if "arxiv_eprints" not in obj.data:
        return

    for eprint in obj.data.get("arxiv_eprints", ()):
        if 'value' not in eprint:
            first_doi = __get_first_doi(obj)
            logger.warning(
                'arxiv_eprints value missing for article with doi: %s'
                % first_doi)
            continue

        arxiv_id = eprint['value']
        if 'categories' in eprint:
            # already categorised, nothing to fetch
            continue

        eprint['categories'] = get_arxiv_categories(arxiv_id)
def add_arxiv_category(obj, eng):
    """Add arXiv categories fetched from arXiv.org

    For every entry of ``obj.data['report_numbers']`` the arXiv id is derived
    from the entry's ``value`` (an optional ``arXiv:`` prefix and a trailing
    version such as ``v2`` are removed). Missing ``categories`` and
    ``primary_category`` keys are then filled with the results of
    ``get_arxiv_categories`` and ``get_arxiv_primary_category``.

    Entries that already carry both keys are left untouched. ``eng`` is
    accepted but not used here.

    :raises KeyError: if a report number entry has no ``value``.
    """
    import re

    if "report_numbers" not in obj.data:
        return

    for element in obj.data["report_numbers"]:
        arxiv_id = element['value']
        if arxiv_id.lower().startswith("arxiv:"):
            arxiv_id = arxiv_id[len("arxiv:"):]

        # Remove only a trailing version marker. Splitting on the first 'v'
        # would cut old-style identifiers whose archive name contains that
        # letter (e.g. 'solv-int/9801001' would become 'sol').
        arxiv_id = re.sub(r'v\d+$', '', arxiv_id)

        if 'categories' not in element:
            element['categories'] = get_arxiv_categories(arxiv_id)

        if 'primary_category' not in element:
            element['primary_category'] = get_arxiv_primary_category(arxiv_id)
def add_arxiv_category(obj, eng):
    """Attach arXiv categories to the eprints, notifying when none are found.

    Entries of ``obj.data['arxiv_eprints']`` without a ``value`` are reported
    with ``logger.warning`` and skipped. For entries without ``categories``
    the categories are requested via ``get_arxiv_categories``; when that
    result is empty ``__halt_and_notify`` is called with ``eng``. The result
    is then stored under ``categories``.
    """
    if "arxiv_eprints" not in obj.data:
        return

    for eprint in obj.data.get("arxiv_eprints", ()):
        if 'value' not in eprint:
            first_doi = get_first_doi(obj)
            logger.warning(
                'arxiv_eprints value missing for article with doi: %s'
                % first_doi)
            continue

        arxiv_id = eprint['value']
        if 'categories' in eprint:
            continue

        fetched = get_arxiv_categories(arxiv_id)
        if not fetched:
            __halt_and_notify(
                'Could not determine arXiv category based on id.', eng)
        eprint['categories'] = fetched
def proc(article_impact):
    """Store the primary arXiv category in ``article_impact.details``.

    Nothing is done when ``arxiv_primary_category`` is already present in the
    details, when the record cannot be loaded, or when no category can be
    determined. Otherwise the category is written to
    ``details['arxiv_primary_category']`` and the column is flagged as
    modified.

    The category comes from:

    * the first entry of ``arxiv_eprints`` (first result of
      ``get_arxiv_categories`` for its id), or
    * ``get_arxiv_primary_category(record)`` when only ``report_numbers`` is
      present.

    ``PIDDoesNotExistError`` is ignored on purpose; ``AttributeError`` is
    logged through ``error``.
    """
    import re

    try:
        if 'arxiv_primary_category' in article_impact.details:
            return

        pid = PersistentIdentifier.get('recid', article_impact.control_number)
        record = Record.get_record(pid.object_uuid)

        if not record:
            return

        if 'arxiv_eprints' in record:
            info('%d: eprints found' % article_impact.control_number)
            raw_id = record['arxiv_eprints'][0]['value']
            # [-1] instead of [1]: the id may be stored without the 'arXiv:'
            # prefix, in which case there is nothing after a colon.
            arxiv = raw_id.split(':')[-1]
            # Strip only a trailing version; splitting on the first 'v' would
            # cut old-style ids such as 'solv-int/9801001'.
            arxiv = re.sub(r'v\d+$', '', arxiv)
            # arXiv may know no category for the id; avoid an IndexError.
            categories = get_arxiv_categories(arxiv)
            cat = categories[0] if categories else None
        elif 'report_numbers' in record:
            info('%d: report_numbers found' % article_impact.control_number)
            cat = get_arxiv_primary_category(record)
        else:
            error('%d: no arxiv' % article_impact.control_number)
            return

        info('category: %s' % cat)
        if cat:
            article_impact.details['arxiv_primary_category'] = cat
            flag_modified(article_impact, 'details')

    except PIDDoesNotExistError:
        # records imported from Inspire won't be found
        pass
    except AttributeError as e:
        error('%d: %s' % (article_impact.control_number, e))
def map_old_record(record, dry_run):
    """
    Maps the given record if needed to comply with the new schema.

    Following fields will be mapped:
      - page_nr will be a list of integers instead of list of strings
      - arxiv id will be put to the arxiv_eprints field
      - arxiv categories will be added if not yet present
      - "arxiv:" prefix will be removed from arxiv id
      - record_creation_date will be converted to iso format

    Following fields will be deleted at the end of the process:
      - _collections
      - report_numbers
      - files
      - local_files
      - free_keywords
      - additional_files
      - file_urls
      - earliest_date

    The result won't be saved and None will be returned in the following cases:
      - the record doesn't contain a json
      - a record fails the validation after mapping
      - both report_numbers and arxiv_eprints fields are present
        (shouldn't happen in the existing records)
      - there is more than one value in report_numbers field
        (shouldn't happen in the existing records)
      - report_numbers field is present, but there is no source subfield
      - no record_creation_date is present

    :param record: object whose ``json`` attribute holds the record data;
        it is modified in place.
    :param dry_run: when true, the json column is not flagged as modified.
    :return: the record on success, otherwise None.
    """

    # if there is no json, the record is considered deleted
    if not record.json:
        rerror('no json', record)
        return

    # page_nr to list of integers
    if 'page_nr' in record.json:
        record.json['page_nr'] = [int(x) for x in record.json['page_nr']]

    # extract arxiv from report_numbers if present
    if "report_numbers" in record.json and "arxiv_eprints" in record.json:
        rerror('both report_numbers and arxiv_eprints are present. Skip record.', record)
        return

    if "report_numbers" in record.json:
        if len(record.json["report_numbers"]) > 1:
            rerror('report_numbers has more then one element. Skip record.', record)
            return

        arxiv_id = None
        for element in record.json.get("report_numbers", ()):
            source = element.get('source')
            if not source:
                rerror('report_numbers present, but no source. Skip record.', record)
                return

            if source.lower() == 'arxiv':
                arxiv_id = element.get('value')
                break

        if arxiv_id:
            arxiv_id = arxiv_id.lower().replace('arxiv:', '')
            record.json['arxiv_eprints'] = [{'value': arxiv_id}]
            rinfo('report_numbers -> arxiv_eprints', record)
        else:
            rerror('report_numbers present, but no arxiv id? Skip record.', record)
            return

    # add arxiv category if not yet present
    if "arxiv_eprints" in record.json:
        for element in record.json.get("arxiv_eprints", ()):
            if 'value' not in element:
                rerror('arxiv_eprints value missing', record)
                continue

            arxiv_id = element['value']

            # remove arxiv prefix if present
            if arxiv_id.lower().startswith('arxiv:'):
                rinfo('removing "arxiv:" prefix', record)
                arxiv_id = arxiv_id[len('arxiv:'):]
                # Write the cleaned id back: stripping only the local
                # variable would leave the prefix in the stored record.
                element['value'] = arxiv_id

            if 'categories' not in element:
                categories = get_arxiv_categories(arxiv_id)
                element['categories'] = categories

    # record_creation_date to isoformat
    record_creation_date = record.json.get('record_creation_date')
    if record_creation_date is None:
        rerror('no record creation date. Skip record.', record)
        return

    new_date = parse_date(record_creation_date).isoformat()
    if new_date != record_creation_date:
        rinfo('update record_creation_date: %s -> %s' % (record_creation_date, new_date), record)
        record.json['record_creation_date'] = new_date

    # delete unwanted fields
    unwanted_fields = (
        '_collections',
        'report_numbers',
        'files',
        'local_files',
        'free_keywords',
        'additional_files',
        'file_urls',
        'earliest_date',
    )
    for key in unwanted_fields:
        if record.json.pop(key, None) is not None:
            rinfo('deleted %s field' % key, record)

    # validate record against the schema referenced by the record itself
    valid = False
    schema = record.json.get('$schema')
    if schema is not None:
        schema_data = requests_retry_session().get(schema).content
        schema_data = json.loads(schema_data)

        try:
            validate(record.json, schema_data)
            valid = True
        except ValidationError as err:
            rerror('Invalid record: %s' % err, record)
        except SchemaError as err:
            rerror('SchemaError during record validation! %s' % err, record)
    else:
        rerror('No schema found!', record)

    if not valid:
        return

    # mark changes if not dry_run
    if not dry_run:
        flag_modified(record, 'json')

    return record
def japanise():
    """Export the Japanese affiliations of records from Japan to a CSV file.

    The ``records-record`` index is searched for records whose ``country``
    term is ``Japan``, 100 hits per request. For every hit one row is written
    per distinct affiliation value whose country is ``Japan``, holding year,
    journal, DOI, arXiv number, primary arXiv category, the affiliation, the
    number of times the affiliation occurs among the record's authors and the
    total number of authors.

    The result is written to ``/tmp/japanise.csv`` using ``;`` as delimiter.
    """
    size = 100

    def get_query(start_index, size):
        """Build the search body for one page of results."""
        return {
            '_source': ['authors', 'control_number', 'dois', 'publication_info', 'report_numbers', 'arxiv_eprints'],
            'from': start_index,
            'size': size,
            'query': {
                'term': {
                    'country': 'Japan'
                }
            }
        }

    def get_arxiv(data):
        """Return the arXiv id of a hit without prefix, or '' if none."""
        if 'report_numbers' in data:
            for r in data['report_numbers']:
                if r['source'] == 'arXiv':
                    # [-1]: also works when the value has no 'arXiv:' prefix
                    return r['value'].split(':')[-1]
            error('no arxiv? %s' % data['control_number'])
        if 'arxiv_eprints' in data:
            return data['arxiv_eprints'][0]['value'].split(':')[-1]
        return ''

    index = 0
    total = None

    header = ['year', 'journal', 'doi', 'arxiv number', 'primary arxiv category', 'affiliaton',
              'authors with affiliation', 'total number of authors']
    si = StringIO()
    cw = csv.writer(si, delimiter=";")
    cw.writerow(header)

    while total is None or index < total:
        search_results = es.search(index='records-record',
                                   doc_type='record-v1.0.0',
                                   body=get_query(index, size))
        total = search_results['hits']['total']
        info("%s/%s" % (index, total))
        index += size

        for hit in search_results['hits']['hits']:
            data = hit['_source']
            year = data['publication_info'][0]['year']
            journal = data['publication_info'][0]['journal_title']
            doi = data['dois'][0]['value']
            arxiv = get_arxiv(data)

            # arXiv may return no categories for an id; one such record must
            # not abort the whole export with an IndexError.
            categories = get_arxiv_categories(arxiv) if arxiv else []
            arxiv_category = categories[0] if categories else ''

            total_authors = len(data['authors'])

            extracted_affiliations = {}
            for author in data['authors']:
                if 'affiliations' not in author:
                    error('no affiliations for author. %s' % doi)
                    continue

                for aff in author['affiliations']:
                    if aff['country'] == 'Japan':
                        value = aff['value']
                        extracted_affiliations[value] = extracted_affiliations.get(value, 0) + 1

            if not extracted_affiliations:
                error('no extracted affs')

            for aff, count in extracted_affiliations.items():
                cw.writerow([year, journal, doi, arxiv, arxiv_category, aff.encode('utf8'), count, total_authors])

    with open('/tmp/japanise.csv', 'wt') as f:
        f.write(si.getvalue())