import datetime
from time import time
from urllib.parse import quote  # on Python 2 this was: from urllib import quote

import requests

# logger, elapsed, clean_doi, get_api_for_one_doi, build_new_pub, and
# add_new_pubs are assumed to be defined elsewhere in this module.


def add_pubs_from_dois(dois):
    new_pubs = []
    for doi in dois:
        crossref_api = get_api_for_one_doi(doi)
        new_pub = build_new_pub(doi, crossref_api)

        # hack so it gets updated soon
        new_pub.updated = datetime.datetime(1042, 1, 1)

        new_pubs.append(new_pub)

    added_pubs = add_new_pubs(new_pubs)
    return added_pubs
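# Minimal usage sketch for add_pubs_from_dois (the DOI values are
# placeholders, and a configured database session behind add_new_pubs
# is assumed):
#
#     added = add_pubs_from_dois(["10.xxxx/placeholder-1", "10.xxxx/placeholder-2"])
#     logger.info(u"added {} pubs from explicit dois".format(len(added)))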
def get_new_dois_and_data_from_crossref(query_doi=None, first=None, last=None,
                                        today=False, week=False, offset_days=0,
                                        chunk_size=1000):
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers = {"Accept": "application/json",
               "User-Agent": "mailto:[email protected]"}

    root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first},until-created-date:{last}&rows={chunk}&cursor={next_cursor}"
    root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first}&rows={chunk}&cursor={next_cursor}"
    root_url_doi = "https://api.crossref.org/works?filter=doi:{doi}"

    # but if want all changes, use "indexed" not "created" as per
    # https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md#notes-on-incremental-metadata-updates
    # root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first},until-indexed-date:{last}&rows={chunk}&cursor={next_cursor}"
    # root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first}&rows={chunk}&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    num_pubs_added_so_far = 0
    pubs_this_chunk = []

    # resolve the date window: the week/today shortcuts win, then an
    # explicit first date, then the default start date
    if week:
        last = datetime.date.today() + datetime.timedelta(days=1)
        first = datetime.date.today() - datetime.timedelta(days=7)
    elif today:
        last = datetime.date.today() + datetime.timedelta(days=1)
        first = datetime.date.today() - datetime.timedelta(days=2)

    if not first:
        first = datetime.date(2016, 4, 1)

    last = last and last - datetime.timedelta(days=offset_days)
    first = first and first - datetime.timedelta(days=offset_days)

    start_time = time()

    while has_more_responses:
        if query_doi:
            url = root_url_doi.format(doi=query_doi)
        else:
            if last:
                url = root_url_with_last.format(first=first.isoformat(),
                                                last=last.isoformat(),
                                                next_cursor=next_cursor,
                                                chunk=chunk_size)
            else:
                # query is much faster if don't have a last specified,
                # even if it is far in the future
                url = root_url_no_last.format(first=first.isoformat(),
                                              next_cursor=next_cursor,
                                              chunk=chunk_size)

        logger.info(u"calling url: {}".format(url))
        crossref_time = time()
        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds".format(elapsed(crossref_time, 2)))
        if resp.status_code != 200:
            # drop a bad response; the same cursor is retried on the next pass
            logger.info(u"error in crossref call, status_code = {}".format(resp.status_code))
            resp = None

        if resp:
            resp_data = resp.json()["message"]
            next_cursor = resp_data.get("next-cursor", None)
            if next_cursor:
                next_cursor = quote(next_cursor)
            if not resp_data["items"] or not next_cursor:
                has_more_responses = False

            for api_raw in resp_data["items"]:
                loop_time = time()
                doi = clean_doi(api_raw["DOI"])
                my_pub = build_new_pub(doi, api_raw)

                # hack so it gets updated soon
                my_pub.updated = datetime.datetime(1042, 1, 1)

                pubs_this_chunk.append(my_pub)

                # flush to the database every 100 pubs
                if len(pubs_this_chunk) >= 100:
                    added_pubs = add_new_pubs(pubs_this_chunk)
                    logger.info(u"added {} pubs, loop done in {} seconds".format(len(added_pubs), elapsed(loop_time, 2)))
                    num_pubs_added_so_far += len(added_pubs)

                    # if new_pubs:
                    #     id_links = ["http://api.oadoi.org/v2/{}".format(my_pub.id) for my_pub in new_pubs[0:5]]
                    #     logger.info(u"last few ids were {}".format(id_links))

                    pubs_this_chunk = []

        logger.info(u"at bottom of loop")

    # make sure to get the last ones
    logger.info(u"saving last ones")
    added_pubs = add_new_pubs(pubs_this_chunk)
    num_pubs_added_so_far += len(added_pubs)

    logger.info(u"Added >>{}<< new crossref dois on {}, took {} seconds".format(
        num_pubs_added_so_far,
        datetime.datetime.now().isoformat()[0:10],
        elapsed(start_time, 2)))