def attach_files(obj, eng):
    if 'files' in obj.extra_data:
        recid = obj.data['control_number']
        pid = PersistentIdentifier.get('recid', recid)
        existing_record = Record.get_record(pid.object_uuid)

        # make sure the record has a bucket to store files in
        if '_files' not in existing_record or not existing_record['_files']:
            bucket = Bucket.create()
            RecordsBuckets.create(record=existing_record.model, bucket=bucket)

        for file_ in obj.extra_data['files']:
            if file_['url'].startswith('http'):
                headers = file_.get('headers', {})
                data = requests_retry_session().get(file_['url'], headers=headers)

                if data.status_code != 200:
                    # NOTE: assumes __halt_and_notify stops the workflow,
                    # so execution does not continue past this point
                    __halt_and_notify(
                        'Error during acquiring files.\nHTTP status: %d\nUrl: %s\nHeaders: %s' % (
                            data.status_code, file_['url'], headers), eng)

                f = StringIO(data.content)
            else:
                f = open(file_['url'])

            existing_record.files[file_['name']] = f
            existing_record.files[file_['name']]['filetype'] = file_['filetype']

        obj.save()
        existing_record.commit()
        db.session.commit()
    else:
        __halt_and_notify('No files found.', eng)
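# NOTE: requests_retry_session() is used by all tasks in this section but
# is defined elsewhere in the repository. A minimal sketch of such a
# helper, assuming the common requests + urllib3 Retry recipe (the retry
# counts and status list below are illustrative, not the actual values):
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


def requests_retry_session(retries=3, backoff_factor=0.5,
                           status_forcelist=(500, 502, 504), session=None):
    # retry transient connection/server errors with exponential backoff
    session = session or requests.Session()
    retry = Retry(total=retries, read=retries, connect=retries,
                  backoff_factor=backoff_factor,
                  status_forcelist=status_forcelist)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session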
def repos_diff():
    OLD_REPO_FILE = '/tmp/old_repo_dump4'
    OLD_REPO_URL = ('https://repo.scoap3.org/search?p=&of=recjson'
                    '&ot=recid,doi,creation_date&rg=100000000')
    COOKIES = {
        'INVENIOSESSION': 'd3c673cf6be468dc6c6fd25703ff90c3',
        'INVENIOSESSIONstub': 'HTTPS',
        '_pk_id.10.1cdf': 'ff8bdd9962372712.1536586766.49.1546956598.1546955767.'
    }
    RESULT_FILE = '/tmp/repo_diff_result9'

    if not isfile(OLD_REPO_FILE):
        info('No old repo file (%s), downloading...' % OLD_REPO_FILE)
        data = requests_retry_session().get(OLD_REPO_URL, cookies=COOKIES).json()
        info('download complete (%d records), mapping...' % len(data))

        if len(data) < 1000:
            error('Aborting, not all records were queried.')
            return

        mapped_data = {}
        for r in data:
            doi = r.pop('doi')
            if doi in mapped_data:
                error('Multiple records with the same DOI: %s' % r)
            mapped_data[doi] = r

        info('mapping complete, saving file...')
        with open(OLD_REPO_FILE, 'wt') as f:
            f.write(json.dumps(mapped_data))
        info('File saved.')

    info('reading old repo data from: %s' % OLD_REPO_FILE)
    with open(OLD_REPO_FILE, 'rt') as f:
        old_data = json.loads(f.read())

    result = dict(only_in_old=[], only_in_new=[], in_both=[])

    def proc(record):
        if not record.json:
            return
        doi = get_first_doi(record.json)
        if doi in old_data:
            result['in_both'].append(doi)
            old_data.pop(doi)
        else:
            result['only_in_new'].append(doi)

    process_all_records(proc)

    # DOIs still left in old_data were not matched by any new record
    result['only_in_old'] = list(old_data.keys())

    with open(RESULT_FILE, 'wt') as f:
        f.write(json.dumps(result, indent=2))

    info('only_in_old: %s\nonly_in_new: %s\nin_both: %s\nALL DONE.' % (
        len(result['only_in_old']),
        len(result['only_in_new']),
        len(result['in_both'])))
def proc(ai):
    # nested task: `result` and `crossref_url` come from the enclosing scope
    try:
        PersistentIdentifier.get('recid', ai.control_number)
    except PIDDoesNotExistError:
        api_response = requests_retry_session().get(crossref_url % ai.doi)
        if api_response.status_code != 200:
            error('Failed to query crossref for doi: %s. Error code: %s' % (
                ai.doi, api_response.status_code))
            result['not200'].append(ai.control_number)
            return None

        title = api_response.json()['message']['title'][0].lower()
        if 'addendum' in title or 'corrigendum' in title or 'erratum' in title:
            result['hit'].append((ai.control_number, title))
def get_record_date(doi):
    crossref_url = current_app.config.get('CROSSREF_API_URL')

    api_response = requests_retry_session().get(crossref_url % doi)
    if api_response.status_code != 200:
        current_app.logger.error('Failed to query crossref for doi: %s. Error code: %s' % (
            doi, api_response.status_code))
        return None

    message = api_response.json()['message']
    if 'published-online' in message:
        parts = message['published-online']['date-parts'][0]
        # if we don't have month or day, substitute it with 1
        if len(parts) < 3:
            parts.extend([1] * (3 - len(parts)))
        return datetime(*parts)

    # Crossref 'created' timestamps are in milliseconds
    return datetime.fromtimestamp(message['created']['timestamp'] // 1000)
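# Illustrative usage of get_record_date (not from the original source).
# The '%' formatting above assumes CROSSREF_API_URL contains a '%s'
# placeholder, e.g. 'https://api.crossref.org/works/%s':
#
#     publication_date = get_record_date('10.1234/example.doi')
#     if publication_date:
#         info('published: %s' % publication_date.isoformat())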
def __get_country(search_text):
    """Return the country of the search text based on Google Maps."""
    GOOGLE_MAPS_API_URL = 'https://maps.googleapis.com/maps/api/geocode/json'

    params = {
        'address': search_text,
        'language': 'en',
        'key': current_app.config.get('GOOGLE_API_KEY', '')
    }

    req = requests_retry_session().get(GOOGLE_MAPS_API_URL, params=params, timeout=1).json()

    if 'status' in req and req['status'].lower() == 'ok':
        country = __get_country_from_results(req)
        return COUNTRIES_DEFAULT_MAPPING.get(country, country)

    return None
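# __get_country_from_results is defined elsewhere. A minimal sketch,
# assuming the standard Google Geocoding response layout
# (results -> address_components -> types / long_name):
def __get_country_from_results(data):
    for result in data.get('results', []):
        for component in result.get('address_components', []):
            if 'country' in component.get('types', []):
                return component.get('long_name')
    return None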
def validate_record(obj, eng):
    """Validate record based on its schema.

    If there is no schema or the record is invalid, the workflow will be halted.
    """
    if '$schema' not in obj.data:
        __halt_and_notify('No schema found!', eng)
        return

    schema_data = requests_retry_session().get(obj.data['$schema']).content
    schema_data = json.loads(schema_data)

    try:
        validate(obj.data, schema_data)
    except ValidationError as err:
        __halt_and_notify('Invalid record: %s' % err, eng)
    except SchemaError as err:
        __halt_and_notify('SchemaError during record validation! %s' % err, eng)
def attach_file(control_number, file_path, file_type, filename):
    """Attach a file to an already existing record.

    The file path can point to a local file, but http and https protocols
    are also supported. For these protocols sending specific headers is not
    supported, so make sure the website doesn't require any.

    In case the record already has a file with the given filename, it will
    be overwritten.
    """
    # get existing record
    try:
        api_record = APIRecord.get_record(
            PersistentIdentifier.get('recid', control_number).object_uuid)
    except (PIDDoesNotExistError, NoResultFound):
        error('No record found for given control number!')
        return

    # read and attach file
    if file_path.startswith('http://') or file_path.startswith('https://'):
        data = requests_retry_session().get(file_path)
        if data.status_code != 200:
            error('Could not download file. Status code: %d' % data.status_code)
            return

        file_data = StringIO(data.content)
        if not attach_file_object(api_record, filename, file_type, file_data):
            return
    else:
        try:
            with open(file_path) as f:
                if not attach_file_object(api_record, filename, file_type, f):
                    return
        except IOError:
            error('local file was not found or not readable: %s' % file_path)
            return

    api_record.commit()
    db.session.commit()
    info('File successfully attached.')
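# attach_file_object is defined elsewhere. Judging from attach_files
# above, a plausible sketch (an assumption, not the actual helper) sets
# the file and its filetype on the record and reports success:
def attach_file_object(api_record, filename, file_type, file_data):
    try:
        api_record.files[filename] = file_data
        api_record.files[filename]['filetype'] = file_type
        return True
    except Exception as err:
        error('Failed to attach file %s: %s' % (filename, err))
        return False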
def get_arxiv_categories(arxiv_id=None, title=None, doi=None):
    """
    Return a list of arXiv categories based on the specified arXiv
    identifier and/or title and/or DOI. All given parameters are forwarded
    to the arXiv API.

    The first element of the returned list is the primary category. In case
    categories cannot be found, an empty list is returned.
    """
    if arxiv_id is None and title is None and doi is None:
        raise ValueError(
            'One of the arxiv_id, title and doi parameters has to be different from None.')

    # make sure we have a clean arxiv number
    arxiv_id = clean_arxiv(arxiv_id)

    query = []
    if arxiv_id:
        query.append('id:%s' % arxiv_id)
    if title:
        # hyphens trip up the arXiv search; '?' acts as a
        # single-character wildcard
        title = title.replace('-', '?').encode('ascii', 'replace')
        query.append('ti:"%s"' % title)
    if doi:
        query.append('doi:"%s"' % doi)

    request_url = url.format(' '.join(query))
    data = requests_retry_session().get(request_url)

    categories = []
    if data.status_code == 200:
        xml = etree.fromstring(data.content)
        categories = get_arxiv_categories_from_response_xml(xml)
        if not categories:
            logger.warning('Could not get arxiv categories for id="%s" title="%s" doi="%s"' % (
                arxiv_id, title, doi))
    else:
        logger.error('Got status_code %s from arXiv when looking for categories for id="%s" title="%s" doi="%s"' % (
            data.status_code, arxiv_id, title, doi))

    return categories
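# clean_arxiv is used by both get_arxiv_categories variants but is not
# shown in this section. A minimal sketch, assuming it only normalizes
# the identifier (it has to tolerate None, since get_arxiv_categories
# above calls it even when only title or doi is given):
def clean_arxiv(arxiv_id):
    if arxiv_id is None:
        return None
    arxiv_id = arxiv_id.strip()
    if arxiv_id.lower().startswith('arxiv:'):
        arxiv_id = arxiv_id[len('arxiv:'):]
    return arxiv_id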
def get_crossref_items(filter_param=None):
    crossref_url = current_app.config.get('CROSSREF_API_URL')
    params = {'filter': filter_param, 'cursor': '*'}

    while True:
        api_response = requests_retry_session().get(crossref_url, params=params)
        if api_response.status_code != 200:
            logger.error('Failed to query crossref. params: %s' % params)
            break

        message = api_response.json()['message']
        items = message.get('items')
        if not items:
            break

        for item in items:
            yield item

        # deep paging via Crossref cursors
        params['cursor'] = message.get('next-cursor')
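# Illustrative usage of the cursor-based pagination above (not from the
# original source); the filter value follows the Crossref REST API
# syntax and is only an example:
#
#     for item in get_crossref_items('from-created-date:2018-01-01'):
#         info(item.get('DOI'))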
def get_arxiv_categories(arxiv_id):
    """
    Return a list of arXiv categories for the specified arXiv identifier.

    The first element of the list is the primary category. In case
    categories cannot be found, an empty list is returned.
    """
    # make sure we have a clean arxiv number
    arxiv_id = clean_arxiv(arxiv_id)

    data = requests_retry_session().get(url.format(arxiv_id))

    categories = []
    if data.status_code == 200:
        xml = etree.fromstring(data.content)
        primary_category = xml.xpath('//arxiv:primary_category/@term',
                                     namespaces=xml_namespaces)

        if not primary_category:
            logger.error('Arxiv did not return primary category for id: %s' % arxiv_id)
            return categories

        if len(primary_category) > 1:
            logger.error('Arxiv returned %d primary categories for id: %s' % (
                len(primary_category), arxiv_id))

        secondary_categories = xml.xpath('//w3:category/@term',
                                         namespaces=xml_namespaces)

        # remove primary category from secondary category list, if it exists
        try:
            secondary_categories.remove(primary_category[0])
        except ValueError:
            logger.warning('Primary arxiv category not present in secondary categories for arxiv: %s' % arxiv_id)

        categories = primary_category + secondary_categories
    else:
        logger.error('Got status_code %s from arXiv when looking for categories for %s' % (
            data.status_code, arxiv_id))

    return categories
def get_inspire_records(query):
    url = current_app.config.get('INSPIRE_LITERATURE_API_URL')
    data = requests_retry_session().get(url, params={'q': query})
    return data.json()['hits']['hits']
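# Illustrative usage (not from the original source); the query string
# follows the INSPIRE literature search syntax and the DOI value is a
# placeholder:
#
#     hits = get_inspire_records('doi 10.1234/example.doi')
#     for hit in hits:
#         info(hit['metadata'].get('titles', [{}])[0].get('title'))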
def map_old_record(record, dry_run):
    """
    Maps the given record if needed to comply with the new schema.

    The following fields will be mapped:
     - page_nr will be a list of integers instead of a list of strings
     - arxiv id will be put into the arxiv_eprints field
     - arxiv categories will be added if not yet present
     - "arxiv:" prefix will be removed from the arxiv id
     - record_creation_date will be converted to ISO format

    The following fields will be deleted at the end of the process:
     - _collections
     - report_numbers
     - files
     - local_files
     - free_keywords
     - additional_files
     - file_urls
     - earliest_date

    The result won't be saved and None will be returned in the following cases:
     - the record doesn't contain a json
     - a record fails the validation after mapping
     - both report_numbers and arxiv_eprints fields are present (shouldn't
       happen in the existing records)
     - there is more than one value in the report_numbers field (shouldn't
       happen in the existing records)
     - the report_numbers field is present, but there is no source subfield
     - no record_creation_date is present
    """
    # if there is no json, the record is considered deleted
    if not record.json:
        rerror('no json', record)
        return

    # page_nr to list of integers
    if 'page_nr' in record.json:
        record.json['page_nr'] = [int(x) for x in record.json['page_nr']]

    # extract arxiv from report_numbers if present
    if 'report_numbers' in record.json and 'arxiv_eprints' in record.json:
        rerror('both report_numbers and arxiv_eprints are present. Skip record.', record)
        return

    if 'report_numbers' in record.json:
        if len(record.json['report_numbers']) > 1:
            rerror('report_numbers has more than one element. Skip record.', record)
            return

        arxiv_id = None
        for element in record.json.get('report_numbers', ()):
            source = element.get('source')
            if not source:
                rerror('report_numbers present, but no source. Skip record.', record)
                return

            if source.lower() == 'arxiv':
                arxiv_id = element.get('value')
                break

        if arxiv_id:
            arxiv_id = arxiv_id.lower().replace('arxiv:', '')
            record.json['arxiv_eprints'] = [{'value': arxiv_id}]
            rinfo('report_numbers -> arxiv_eprints', record)
        else:
            rerror('report_numbers present, but no arxiv id? Skip record.', record)
            return

    # add arxiv category if not yet present
    if 'arxiv_eprints' in record.json:
        for element in record.json.get('arxiv_eprints', ()):
            if 'value' not in element:
                rerror('arxiv_eprints value missing', record)
                continue

            arxiv_id = element['value']

            # remove arxiv prefix if present
            if arxiv_id.lower().startswith('arxiv:'):
                rinfo('removing "arxiv:" prefix', record)
                arxiv_id = arxiv_id[len('arxiv:'):]

            if 'categories' not in element:
                categories = get_arxiv_categories(arxiv_id)
                element['categories'] = categories

    # record_creation_date to isoformat
    record_creation_date = record.json.get('record_creation_date')
    if record_creation_date is None:
        rerror('no record creation date. Skip record.', record)
        return

    new_date = parse_date(record_creation_date).isoformat()
    if new_date != record_creation_date:
        rinfo('update record_creation_date: %s -> %s' % (record_creation_date, new_date), record)
        record.json['record_creation_date'] = new_date

    # delete unwanted fields
    unwanted_fields = (
        '_collections',
        'report_numbers',
        'files',
        'local_files',
        'free_keywords',
        'additional_files',
        'file_urls',
        'earliest_date',
    )
    for key in unwanted_fields:
        if record.json.pop(key, None) is not None:
            rinfo('deleted %s field' % key, record)

    # validate record
    valid = False
    schema = record.json.get('$schema')
    if schema is not None:
        schema_data = requests_retry_session().get(schema).content
        schema_data = json.loads(schema_data)

        try:
            validate(record.json, schema_data)
            valid = True
        except ValidationError as err:
            rerror('Invalid record: %s' % err, record)
        except SchemaError as err:
            rerror('SchemaError during record validation! %s' % err, record)
    else:
        rerror('No schema found!', record)

    if not valid:
        return

    # mark changes if not dry_run
    if not dry_run:
        flag_modified(record, 'json')

    return record
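# Illustrative driver for map_old_record (not from the original source),
# mirroring how repos_diff uses process_all_records; assumes
# process_all_records calls the given function with each record object:
#
#     def run_mapping(dry_run=True):
#         def proc(record):
#             if map_old_record(record, dry_run) and not dry_run:
#                 db.session.commit()
#         process_all_records(proc)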