def reported_noncompliant_url_fragments(dirty_doi):
    """Return the lowercased noncompliant URL fragments reported for a DOI.

    Normalizes both the keys of the module-level ``lookup_raw`` mapping and
    the incoming DOI so the lookup is insensitive to DOI formatting.
    Returns an empty list for a falsy DOI or when nothing was reported.
    """
    if not dirty_doi:
        return []

    # rebuild the lookup with normalized DOI keys and lowercased fragments
    normalized_lookup = {
        normalize_doi(raw_doi): [fragment.lower() for fragment in fragments]
        for (raw_doi, fragments) in lookup_raw.iteritems()
    }
    return normalized_lookup.get(normalize_doi(dirty_doi), [])
def is_bronze(self):
    """Whether this record counts as Bronze OA.

    Bronze: readable at ``best_url`` but without an open license and not
    already classified Gold or Green. A second path accepts a ``best_url``
    that is just this record's own DOI URL (possibly percent-encoded),
    provided the record isn't Gold, Hybrid, or Green.
    """
    # free to read somewhere, unlicensed, and not otherwise classified
    if self.best_url and not (self.is_gold or self.is_green) and not self.has_open_license:
        return True

    if not is_doi_url(self.best_url):
        return False

    # best_url may be a (possibly percent-encoded) DOI URL for this record
    url_doi = normalize_doi(self.best_url, return_none_if_error=True)
    unquoted_doi = normalize_doi(unquote(self.best_url), return_none_if_error=True)
    if self.doi not in (url_doi, unquoted_doi):
        return False
    return not (self.is_gold or self.is_hybrid or self.is_green)
def scroll_through_all_dois(query_doi=None, first=None, last=None, today=False, week=False, chunk_size=1000):
    """Walk the Crossref works API over a created-date window, saving new DOIs.

    Pages with deep-paging cursors until a short page comes back; each page's
    DOIs are normalized and handed to add_new_pubs_from_dois. Returns the
    number of pubs added (returns the count so far on an HTTP error).
    """
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers = {"Accept": "application/json", "User-Agent": "mailto:[email protected]"}

    if first:
        url_template = "https://api.crossref.org/works?filter=from-created-date:{first},until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"
    else:
        url_template = "https://api.crossref.org/works?filter=until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"

    cursor = "*"
    total_added = 0
    more_pages = True

    while more_pages:
        more_pages = False
        page_start = time()

        url = url_template.format(first=first, last=last, rows=chunk_size, next_cursor=cursor)
        logger.info(u"calling url: {}".format(url))
        response = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds. url: {}".format(elapsed(page_start, 2), url))
        if response.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(response.status_code))
            return total_added

        message = response.json()["message"]

        cursor = message.get("next-cursor", None)
        if cursor:
            cursor = quote(cursor)

        # a completely full page implies there may be another one to fetch
        if message["items"] and len(message["items"]) == chunk_size:
            more_pages = True

        page_dois = [normalize_doi(api_raw["DOI"]) for api_raw in message["items"]]
        new_pubs = add_new_pubs_from_dois(page_dois)
        if page_dois:
            logger.info(u"got {} dois from api".format(len(page_dois)))
        if new_pubs:
            logger.info(u"{}: saved {} new pubs, including {}".format(first, len(new_pubs), new_pubs[-2:]))
            total_added += len(new_pubs)

        logger.info(u"loop done in {} seconds".format(elapsed(page_start, 2)))

    return total_added
def run_update(parsed_args):
    """Look up and run the update registered under ``parsed_args.fn``.

    Convenience: if ``parsed_args.doi`` is set, resolve it to a Pub row and
    run the update against that publication's id.

    Raises:
        ValueError: when the DOI is not in the database (previously this
            crashed with an AttributeError on ``None.id``).
    """
    update = update_registry.get(parsed_args.fn)

    start = time()

    # convenience path for handling a doi
    if parsed_args.doi:
        from pub import Pub
        from util import normalize_doi

        my_pub = db.session.query(Pub).filter(
            Pub.id == normalize_doi(parsed_args.doi)).first()
        if my_pub is None:
            # fail with a clear message instead of AttributeError on None
            raise ValueError(u"doi {} not found in database".format(parsed_args.doi))
        parsed_args.id = my_pub.id
        logger.info(u"Got database hit for this doi: {}".format(my_pub.id))

    update.run(**vars(parsed_args))

    db.session.remove()
    # fixed log-message typo: "secconds" -> "seconds"
    logger.info(u"finished update in {} seconds".format(elapsed(start)))
def run(parsed_args, job_type):
    """Run the registered update for ``job_type``; return the pub response, if any.

    "normal"/"hybrid" jobs run a Pub update (normalizing a doi argument into
    the id the update expects) and return that pub's response_jsonb; other
    job types run the DateRange unpaywall-events update and return None.
    """
    start = time()
    is_pub_job = job_type in ("normal", "hybrid")

    if is_pub_job:
        update = update_registry.get("Pub." + process_name(job_type))
        # a doi argument is normalized into the id the update expects
        if parsed_args.doi:
            parsed_args.id = normalize_doi(parsed_args.doi)
            parsed_args.doi = None
    else:
        update = update_registry.get("DateRange.get_unpaywall_events")
        # update = update_registry.get("DateRange.get_pmh_events")

    update.run(**vars(parsed_args))
    logger.info(u"finished update in {} seconds".format(elapsed(start)))

    result = None
    if is_pub_job:
        my_pub = Pub.query.get(parsed_args.id)
        result = my_pub.response_jsonb
        pprint(result)
    return result
def crawl_crossref(page_delay=None, page_length=None):
    """Crawl the whole Crossref works API, recording every DOI seen.

    Resumes an unfinished CrossrefCrawl row when it looks dead (no request
    for 2+ hours) and has fewer than 5 failed cursor attempts; otherwise
    marks it done=False and starts a fresh crawl. Each page of DOIs is
    bulk-inserted as CrossrefCrawlDoi rows tagged with the crawl start time.

    :param page_delay: optional seconds to sleep between pages
    :param page_length: optional Crossref `rows` page size
    """
    # see if there's an unfinished crawl
    active_crawl = None
    unfinished_crawl = CrossrefCrawl.query.filter(
        CrossrefCrawl.done.is_(None)).scalar()

    if unfinished_crawl:
        logger.info(u'found an unfinished crawl starting at {}'.format(
            unfinished_crawl.started))

        # see if it's still running: a request within the last 2 hours
        # (or a crawl that never made one) is assumed to be live
        last_request = unfinished_crawl.last_request
        if last_request is None or last_request > datetime.datetime.utcnow() - datetime.timedelta(hours=2):
            logger.info(
                u'aborting, unfinished crawl still looks active. started: {}, last request {}'
                .format(unfinished_crawl.started, unfinished_crawl.last_request))
            return

        # see if we should resume it
        if unfinished_crawl.cursor_tries < 5:
            # resume it
            active_crawl = unfinished_crawl
        else:
            # kill it: done=False marks a crawl abandoned after too many cursor failures
            unfinished_crawl.done = False
            db.session.commit()

    if not active_crawl:
        logger.info(u'beginning a new crawl')
        active_crawl = CrossrefCrawl(started=datetime.datetime.utcnow(),
                                     cursor='*',
                                     cursor_tries=0)
        db.session.add(active_crawl)
        db.session.commit()

    root_url = u'https://api.crossref.org/works?cursor={next_cursor}'
    if page_length:
        root_url = root_url + u'&rows={}'.format(page_length)

    has_more_responses = True

    while has_more_responses:
        url = root_url.format(next_cursor=active_crawl.cursor)
        logger.info(u"calling url: {}".format(url))

        # record the request time so a concurrent run can tell we're alive
        active_crawl.last_request = datetime.datetime.utcnow()
        db.session.commit()

        crossref_time = time()
        resp = get_response_page(url)
        logger.info(u"getting crossref response took {} seconds".format(
            elapsed(crossref_time, 2)))

        if not resp or resp.status_code != 200:
            # abort; bump cursor_tries and clear last_request so a later
            # invocation will resume (or give up after 5 tries)
            logger.info(u"error in crossref call, status_code = {}".format(
                resp and resp.status_code))
            active_crawl.cursor_tries += 1
            active_crawl.last_request = None
            db.session.commit()
            return
        else:
            # save DOIs from this page
            resp_data = resp.json()["message"]
            page_dois = []
            for api_raw in resp_data["items"]:
                doi = normalize_doi(api_raw["DOI"])
                if doi:
                    page_dois.append({
                        'crawl_time': active_crawl.started,
                        'doi': doi
                    })

            # update cursor
            next_cursor = resp_data.get("next-cursor", None)
            if next_cursor:
                next_cursor = quote(next_cursor)

            # an empty page or a missing cursor means we've reached the end
            if not resp_data["items"] or not next_cursor:
                has_more_responses = False
                active_crawl.done = True
            else:
                active_crawl.cursor = next_cursor
                active_crawl.cursor_tries = 0

            if page_dois:
                db.session.bulk_insert_mappings(CrossrefCrawlDoi, page_dois)

            db.session.commit()
            logger.info(u'added {} dois'.format(len(page_dois)))

            if page_delay:
                logger.info('sleeping {} seconds'.format(page_delay))
                sleep(page_delay)
def get_dois_and_data_from_crossref(query_doi=None, first=None, last=None, today=False, week=False, offset_days=0, chunk_size=1000, get_updates=False):
    """Fetch works from Crossref and insert/update pubs in batches of 100.

    Date window: ``week``/``today`` override first/last; a missing ``first``
    defaults to 2016-04-01; ``offset_days`` shifts both ends back. With
    ``get_updates`` the query sorts/filters by index date and rows are routed
    through add_pubs_or_update_crossref instead of add_new_pubs.

    Fix: get_response_page can return None on failure (crawl_crossref in this
    file already guards for that), so check ``resp`` before ``status_code``
    instead of crashing with AttributeError.
    """
    root_url_doi = "https://api.crossref.org/works?filter=doi:{doi}"
    if get_updates:
        root_url_with_last = "https://api.crossref.org/works?order=desc&sort=indexed&filter=from-index-date:{first},until-index-date:{last}&rows={chunk}&cursor={next_cursor}"
        root_url_no_last = "https://api.crossref.org/works?order=desc&sort=indexed&filter=from-index-date:{first}&rows={chunk}&cursor={next_cursor}"
    else:
        root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first},until-created-date:{last}&rows={chunk}&cursor={next_cursor}"
        root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first}&rows={chunk}&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    num_pubs_added_so_far = 0
    pubs_this_chunk = []

    if week:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=7))
    elif today:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=2))

    if not first:
        first = datetime.date(2016, 4, 1)

    last = last and last - datetime.timedelta(days=offset_days)
    first = first and first - datetime.timedelta(days=offset_days)

    start_time = time()

    insert_pub_fn = add_pubs_or_update_crossref if get_updates else add_new_pubs

    while has_more_responses:
        if query_doi:
            url = root_url_doi.format(doi=query_doi)
        else:
            if last:
                url = root_url_with_last.format(first=first.isoformat(),
                                                last=last.isoformat(),
                                                next_cursor=next_cursor,
                                                chunk=chunk_size)
            else:
                # query is much faster if don't have a last specified, even if it is far in the future
                url = root_url_no_last.format(first=first.isoformat(),
                                              next_cursor=next_cursor,
                                              chunk=chunk_size)

        logger.info("calling url: {}".format(url))
        crossref_time = time()
        resp = get_response_page(url)
        logger.info("getting crossref response took {} seconds".format(
            elapsed(crossref_time, 2)))
        # guard the None case before touching status_code
        if not resp or resp.status_code != 200:
            logger.info("error in crossref call, status_code = {}".format(
                resp and resp.status_code))
            resp = None

        if resp:
            resp_data = resp.json()["message"]
            next_cursor = resp_data.get("next-cursor", None)
            if next_cursor:
                next_cursor = quote(next_cursor)
            # an empty page or a missing cursor ends the scroll
            if not resp_data["items"] or not next_cursor:
                has_more_responses = False

            for api_raw in resp_data["items"]:
                loop_time = time()
                doi = normalize_doi(api_raw["DOI"])
                my_pub = build_new_pub(doi, api_raw)

                # hack so it gets updated soon
                my_pub.updated = datetime.datetime(1042, 1, 1)

                pubs_this_chunk.append(my_pub)

                # flush to the db in batches of 100
                if len(pubs_this_chunk) >= 100:
                    added_pubs = insert_pub_fn(pubs_this_chunk)
                    logger.info("added {} pubs, loop done in {} seconds".format(
                        len(added_pubs), elapsed(loop_time, 2)))
                    num_pubs_added_so_far += len(added_pubs)
                    pubs_this_chunk = []

        logger.info("at bottom of loop")

    # make sure to get the last ones
    logger.info("saving last ones")
    added_pubs = insert_pub_fn(pubs_this_chunk)
    num_pubs_added_so_far += len(added_pubs)
    logger.info("Added >>{}<< new crossref dois on {}, took {} seconds".format(
        num_pubs_added_so_far,
        datetime.datetime.now().isoformat()[0:10],
        elapsed(start_time, 2)))
def worker_run(self, **kwargs):
    """Main worker loop: claim Pub ids from a queue (or take a single id),
    run the requested update method on them, then mark them finished.

    kwargs:
        id: a single object id (DOI) to process, bypassing the queue
        chunk: rows to claim per iteration (default 100)
        limit: passed through to print_update; defaults to 1000 if falsy
        method: "refresh" selects pub_refresh_queue, anything else pub_queue
    """
    single_obj_id = kwargs.get("id", None)
    chunk = kwargs.get("chunk", 100)
    limit = kwargs.get("limit", 10)
    run_class = Pub
    run_method = kwargs.get("method")

    if single_obj_id:
        limit = 1
        queue_table = None
    elif run_method == "refresh":
        queue_table = "pub_refresh_queue"
        if not limit:
            limit = 1000
        # claim rows atomically: FOR UPDATE SKIP LOCKED lets many workers
        # pull from the same queue without blocking each other
        text_query_pattern = """
            with refresh_queue as (
                select id
                from {queue_table}
                where started is null
                order by priority desc, finished nulls first, started, rand
                limit {chunk}
                for update skip locked
            )
            update {queue_table} queue_rows_to_update
            set started = now()
            from refresh_queue
            where refresh_queue.id = queue_rows_to_update.id
            returning refresh_queue.id;"""
        text_query = text_query_pattern.format(chunk=chunk,
                                               queue_table=queue_table)
        logger.info("the queue query is:\n{}".format(text_query))
    else:
        queue_table = "pub_queue"
        if not limit:
            limit = 1000
        text_query_pattern = """WITH update_pub_queue AS (
            SELECT id
            FROM {queue_table}
            WHERE started is null
            order by finished asc nulls first
            LIMIT {chunk}
            FOR UPDATE SKIP LOCKED
        )
        UPDATE {queue_table} queue_rows_to_update
        SET started=now()
        FROM update_pub_queue
        WHERE update_pub_queue.id = queue_rows_to_update.id
        RETURNING update_pub_queue.id;"""
        text_query = text_query_pattern.format(limit=limit,
                                               chunk=chunk,
                                               queue_table=queue_table)
        logger.info("the queue query is:\n{}".format(text_query))

    index = 0
    start_time = time()

    while True:
        new_loop_start_time = time()

        if single_obj_id:
            single_obj_id = normalize_doi(single_obj_id)
            objects = [
                run_class.query.filter(
                    run_class.id == single_obj_id).first()
            ]
        else:
            logger.info("looking for new jobs")

            job_time = time()
            row_list = db.engine.execute(
                text(text_query).execution_options(
                    autocommit=True)).fetchall()
            object_ids = [row[0] for row in row_list]
            logger.info("got ids, took {} seconds".format(
                elapsed(job_time)))

            # load the claimed pubs with all deferred columns undeferred
            job_time = time()
            q = db.session.query(Pub).options(orm.undefer('*')).filter(
                Pub.id.in_(object_ids))
            objects = q.all()
            logger.info("got pub objects in {} seconds".format(
                elapsed(job_time)))

            # shuffle them or they sort by doi order
            random.shuffle(objects)

            # objects = Pub.query.from_statement(text(text_query)).execution_options(autocommit=True).all()
            # objects = run_class.query.from_statement(text(text_query)).execution_options(autocommit=True).all()
            # id_rows = db.engine.execute(text(text_query)).fetchall()
            # ids = [row[0] for row in id_rows]
            #
            # job_time = time()
            # objects = run_class.query.filter(run_class.id.in_(ids)).all()
            # logger.info(u"finished get-new-objects query in {} seconds".format(elapsed(job_time)))

        # nothing claimed: idle briefly, then poll again
        if not objects:
            # logger.info(u"sleeping for 5 seconds, then going again")
            sleep(5)
            continue

        object_ids = [obj.id for obj in objects]
        self.update_fn(run_class, run_method, objects, index=index)

        # logger.info(u"finished update_fn")
        if queue_table:
            # quote ids for SQL; DOIs can contain single quotes and percent signs
            object_ids_str = ",".join([
                "'{}'".format(id.replace("'", "''")) for id in object_ids
            ])
            object_ids_str = object_ids_str.replace("%", "%%")  #sql escaping
            sql_command = "update {queue_table} set finished=now(), started=null where id in ({ids})".format(
                queue_table=queue_table, ids=object_ids_str)
            # logger.info(u"sql command to update finished is: {}".format(sql_command))
            run_sql(db, sql_command)
            # logger.info(u"finished run_sql")

        # finished is set in update_fn
        index += 1
        if single_obj_id:
            return
        else:
            self.print_update(new_loop_start_time, chunk, limit,
                              start_time, index)
def get_chorus_data(starting_offset=0, agency_id=None):
    """Page through the CHORUS API per agency and store new items as Chorus rows.

    :param starting_offset: resume paging from this offset within each agency
    :param agency_id: only process this one agency id, skip the rest
    """
    requests_session = requests.Session()
    # retry transient server errors with backoff
    retries = Retry(total=10,
                    backoff_factor=0.5,
                    status_forcelist=[500, 502, 503, 504])
    requests_session.mount('http://', DelayedAdapter(max_retries=retries))
    requests_session.mount('https://', DelayedAdapter(max_retries=retries))

    agencies = get_chorus_agencies()
    for agency in agencies:
        if agency_id:
            if int(agency["Agency_Id"]) != int(agency_id):
                print "skipping {}, you are not the agency id we are looking for".format(agency["Agency_Id"])
                continue

        if starting_offset:
            offset = starting_offset
        else:
            offset = 0

        logger.info(u"*** on agency {}:{}".format(agency["Agency_Name"], agency["Agency_Id"]))
        url_template = "https://api.chorusaccess.org/v1.1/agencies/{agency_id}/histories/current?category=publicly_accessible&limit={limit}&offset={offset}"
        limit = 50
        total_results = None

        # total_results is None until the first successful response
        while total_results==None or offset < total_results:
            loop_start = time()
            url = url_template.format(agency_id=agency["Agency_Id"],
                                      offset=offset,
                                      limit=limit)
            print url
            try:
                r = requests_session.get(url, timeout=360)  # wait up to 360 seconds (6 minutes)
            except Exception, e:
                logger.exception(u"Exception: {}, skipping".format(unicode(e.message).encode("utf-8")))
                r = None

            print u"api call elapsed: {} seconds".format(elapsed(loop_start, 1))
            # NOTE(review): offset advances even when the request failed,
            # so a failed page is skipped rather than retried
            offset += limit

            if r:
                data = r.json()
                total_results = data["total_results"]
                logger.info(u"Has {} total results, {} remaining".format(
                    total_results, total_results - offset))

                items = data["items"]
                new_objects = []
                for item in items:
                    if item["DOI"]:
                        doi = normalize_doi(item["DOI"])
                        new_objects.append(Chorus(id=doi, raw=item))

                # only insert DOIs that aren't already stored
                ids_already_in_db = [id_tuple[0] for id_tuple in db.session.query(Chorus.id).filter(Chorus.id.in_([obj.id for obj in new_objects])).all()]
                objects_to_add_to_db = [obj for obj in new_objects if obj.id not in ids_already_in_db]
                if objects_to_add_to_db:
                    logger.info(u"adding {} items".format(len(objects_to_add_to_db)))
                    db.session.add_all(objects_to_add_to_db)
                    safe_commit(db)
                else:
                    logger.info(u"all of these items already in db")

                # be polite to the API between pages
                logger.info(u"sleeping for 2 seconds")
                sleep(2)
def __init__(self, **kwargs):
    """Stamp ``updated`` with the current UTC time and normalize a supplied
    DOI before delegating construction to the base model class."""
    self.updated = datetime.datetime.utcnow()
    # a sentinel distinguishes "doi absent" from "doi present but falsy",
    # matching the original `"doi" in kwargs` check exactly
    _absent = object()
    dirty_doi = kwargs.pop("doi", _absent)
    if dirty_doi is not _absent:
        kwargs["doi"] = normalize_doi(dirty_doi)
    super(Chorus, self).__init__(**kwargs)
def simple_query_tool():
    """Flask endpoint: look up Unpaywall responses for a posted list of DOIs
    and email the results as jsonl/csv/xlsx attachments.

    DOIs are matched in three passes: normalized form, cleaned form, then
    placeholder responses for anything still missing.

    Fix: ``csv_dicts[0]`` raised IndexError when no DOI produced a csv row
    (e.g. an empty or all-blank "dois" list); fieldnames now fall back to
    just ["doi"] in that case.
    """
    body = request.json
    dirty_dois_list = {d for d in body["dois"] if d}

    # look up normalized dois
    normalized_dois = [
        c for c in
        [normalize_doi(d, return_none_if_error=True) for d in dirty_dois_list]
        if c
    ]

    q = db.session.query(pub.Pub.response_jsonb).filter(
        pub.Pub.id.in_(normalized_dois))
    rows = q.all()

    normalized_doi_responses = [row[0] for row in rows if row[0]]
    found_normalized_dois = [r['doi'] for r in normalized_doi_responses]

    missing_dois = [
        d for d in dirty_dois_list
        if normalize_doi(d, return_none_if_error=True) not in found_normalized_dois
    ]

    # look up cleaned dois where normalization wasn't enough
    clean_dois = [
        c for c in
        [clean_doi(d, return_none_if_error=True) for d in missing_dois]
        if c and c not in found_normalized_dois
    ]

    q = db.session.query(pub.Pub.response_jsonb).filter(
        pub.Pub.id.in_(clean_dois))
    rows = q.all()

    clean_doi_responses = [row[0] for row in rows if row[0]]
    found_clean_dois = [r['doi'] for r in clean_doi_responses]

    missing_dois = [
        d for d in missing_dois
        if clean_doi(d, return_none_if_error=True) not in found_normalized_dois + found_clean_dois
    ]

    # anything still missing gets an empty placeholder response
    placeholder_responses = [
        pub.build_new_pub(d, None).to_dict_v2() for d in missing_dois
    ]

    responses = normalized_doi_responses + clean_doi_responses + placeholder_responses

    formats = body.get("formats", []) or ["jsonl", "csv"]
    files = []

    if "jsonl" in formats:
        # save jsonl
        with open("output.jsonl", 'wb') as f:
            for response_jsonb in responses:
                f.write(json.dumps(response_jsonb, sort_keys=True))
                f.write("\n")
        files.append("output.jsonl")

    csv_dicts = [
        pub.csv_dict_from_response_dict(my_dict) for my_dict in responses
    ]
    csv_dicts = [my_dict for my_dict in csv_dicts if my_dict]

    # "doi" comes first; guard against an empty result set (was an IndexError)
    if csv_dicts:
        fieldnames = sorted(csv_dicts[0].keys())
        fieldnames = ["doi"] + [name for name in fieldnames if name != "doi"]
    else:
        fieldnames = ["doi"]

    if "csv" in formats:
        # save csv
        with open("output.csv", 'wb') as f:
            writer = unicodecsv.DictWriter(f, fieldnames=fieldnames, dialect='excel')
            writer.writeheader()
            for my_dict in csv_dicts:
                writer.writerow(my_dict)
        files.append("output.csv")

    if "xlsx" in formats:
        book = Workbook()
        sheet = book.worksheets[0]
        sheet.title = "results"

        # header row, then one row per response
        for col_idx, field_name in enumerate(fieldnames):
            sheet.cell(column=col_idx + 1, row=1, value=field_name)

        for row_idx, row in enumerate(csv_dicts):
            for col_idx, field_name in enumerate(fieldnames):
                sheet.cell(column=col_idx + 1,
                           row=row_idx + 2,
                           value=row[field_name])

        book.save(filename="output.xlsx")
        files.append("output.xlsx")

    # prep email
    email_address = body["email"]
    email = create_email(email_address, "Your Unpaywall results",
                         "simple_query_tool", {"profile": {}}, files)
    send(email, for_real=True)

    return jsonify({
        "got it": email_address,
        "dois": found_normalized_dois + found_clean_dois + missing_dois
    })
def get_overrides_dict(): override_dict = defaultdict(dict) # cindy wu example override_dict["10.1038/nature21360"] = { "pdf_url": "https://arxiv.org/pdf/1703.01424.pdf", "version": "submittedVersion", "host_type_set": "repository", "evidence": "oa repository (manual)" } # example from twitter override_dict["10.1021/acs.jproteome.5b00852"] = { "pdf_url": "http://pubs.acs.org/doi/pdfplus/10.1021/acs.jproteome.5b00852", "host_type_set": "publisher", "version": "publishedVersion" } # have the unpaywall example go straight to the PDF, not the metadata page override_dict["10.1098/rspa.1998.0160"] = { "pdf_url": "https://arxiv.org/pdf/quant-ph/9706064.pdf", "version": "submittedVersion" } # missed, not in BASE, from Maha Bali in email override_dict["10.1080/13562517.2014.867620"] = { "pdf_url": "http://dar.aucegypt.edu/bitstream/handle/10526/4363/Final%20Maha%20Bali%20TiHE-PoD-Empowering_Sept30-13.pdf", "version": "submittedVersion" } # otherwise links to figshare match that only has data, not the article override_dict["110.1126/science.aaf3777"] = {} #otherwise links to a metadata page that doesn't have the PDF because have to request a copy: https://openresearch-repository.anu.edu.au/handle/1885/103608 override_dict["10.1126/science.aad2622"] = { "pdf_url": "https://lra.le.ac.uk/bitstream/2381/38048/6/Waters%20et%20al%20draft_post%20review_v2_clean%20copy.pdf", "version": "submittedVersion" } # otherwise led to http://www.researchonline.mq.edu.au/vital/access/services/Download/mq:39727/DS01 and authorization error override_dict["10.1126/science.aad2622"] = {} # else goes here: http://www.it-c.dk/people/schmidt/papers/complexity.pdf override_dict["10.1007/978-1-84800-068-1_9"] = {} # otherwise led to https://dea.lib.unideb.hu/dea/bitstream/handle/2437/200488/file_up_KMBT36220140226131332.pdf;jsessionid=FDA9F1A60ACA567330A8B945208E3CA4?sequence=1 override_dict["10.1007/978-3-211-77280-5"] = {} # otherwise led to publisher page but isn't open 
override_dict["10.1016/j.renene.2015.04.017"] = {} # override old-style webpage override_dict["10.1210/jc.2016-2141"] = { "pdf_url": "https://academic.oup.com/jcem/article-lookup/doi/10.1210/jc.2016-2141", "host_type_set": "publisher", "version": "publishedVersion", } # not indexing this location yet, from @rickypo override_dict["10.1207/s15327957pspr0203_4"] = { "pdf_url": "http://www2.psych.ubc.ca/~schaller/528Readings/Kerr1998.pdf", "version": "submittedVersion" } # mentioned in world bank as good unpaywall example override_dict["10.3386/w23298"] = { "pdf_url": "https://economics.mit.edu/files/12774", "version": "submittedVersion" } # from email, has bad citesserx cached version override_dict["10.1007/bf02693740"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.536.6939&rep=rep1&type=pdf", "version": "publishedVersion" } # from email, has bad citesserx cached version override_dict["10.1126/science.1150952"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.168.3796&rep=rep1&type=pdf", "version": "submittedVersion", "host_type_set": "repository" } # from email, has bad citesserx cached version override_dict["10.1515/eqc.2007.295"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.543.7752&rep=rep1&type=pdf", "version": "publishedVersion", "host_type_set": "repository" } # from email override_dict["10.1038/nature21377"] = { "pdf_url": "http://eprints.whiterose.ac.uk/112179/1/ppnature21377_Dodd_for%20Symplectic.pdf", "version": "submittedVersion" } # from email override_dict["10.1016/j.gtc.2016.09.007"] = { "pdf_url": "https://cora.ucc.ie/bitstream/handle/10468/3544/Quigley_Chapter.pdf?sequence=1&isAllowed=y", "version": "acceptedVersion" } # stephen hawking's thesis override_dict["10.17863/cam.11283"] = { "pdf_url": "https://www.repository.cam.ac.uk/bitstream/handle/1810/251038/PR-PHD-05437_CUDL2017-reduced.pdf?sequence=15&isAllowed=y", "version": "publishedVersion" } # from email 
override_dict["10.1152/advan.00040.2005"] = { "pdf_url": "https://www.physiology.org/doi/pdf/10.1152/advan.00040.2005", "version": "publishedVersion" } # from email override_dict["10.1016/j.chemosphere.2014.07.047"] = { "pdf_url": "https://manuscript.elsevier.com/S0045653514009102/pdf/S0045653514009102.pdf", "version": "submittedVersion" } # from email override_dict["10.4324/9780203900956"] = {} # from email override_dict["10.3810/psm.2010.04.1767"] = { "pdf_url": "http://cupola.gettysburg.edu/cgi/viewcontent.cgi?article=1014&context=healthfac", "version": "publishedVersion" } # from email override_dict["10.1016/S0140-6736(17)33308-1"] = { "pdf_url": "https://www.rug.nl/research/portal/files/64097453/Author_s_version_Gonadotrophins_versus_clomiphene_citrate_with_or_without_intrauterine_insemination_in_women.pdf", "version": "acceptedVersion", "host_type_set": "repository" } # from email override_dict["10.1093/joclec/nhy009"] = { "pdf_url": "https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3126848", "host_type_set": "repository" } # from email override_dict["10.1038/s41477-017-0019-3"] = { "pdf_url": "https://www.repository.cam.ac.uk/bitstream/handle/1810/270235/3383_1_merged_1502805167.pdf?sequence=1&isAllowed=y", "version": "acceptedVersion", "host_type_set": "repository" } # from email override_dict["10.1029/wr015i006p01633"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.475.497&rep=rep1&type=pdf", "version": "publishedVersion" } # from email, zenodo override_dict["10.1080/01650521.2018.1460931"] = { "metadata_url": "https://zenodo.org/record/1236622", "host_type_set": "repository", "version": "acceptedVersion" } # from email override_dict["10.3928/01477447-20150804-53"] = {} # from twitter override_dict["10.1103/physreva.97.013421"] = { "pdf_url": "https://arxiv.org/pdf/1711.10074.pdf", "version": "submittedVersion" } # from email override_dict["10.1016/j.amjmed.2005.09.031"] = { "pdf_url": 
"https://www.amjmed.com/article/S0002-9343(05)00885-5/pdf", "version": "publishedVersion" } # from email override_dict["10.1080/15348458.2017.1327816"] = {} # from chorus override_dict["10.1103/physrevd.94.052011"] = { "pdf_url": "https://link.aps.org/accepted/10.1103/PhysRevD.94.052011", "version": "acceptedVersion", } override_dict["10.1063/1.4962501"] = { "pdf_url": "https://aip.scitation.org/doi/am-pdf/10.1063/1.4962501", "version": "acceptedVersion", "host_type_set": "repository" } # from email, broken citeseer link override_dict["10.2202/1949-6605.1908"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.535.9289&rep=rep1&type=pdf", "version": "publishedVersion" } # from email override_dict["10.1561/1500000012"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.174.8814&rep=rep1&type=pdf", "version": "publishedVersion" } # from email override_dict["10.1137/s0036142902418680"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.144.7627&rep=rep1&type=pdf", "version": "publishedVersion" } # from email override_dict["10.1088/1741-2552/aab4e4"] = { "pdf_url": "http://iopscience.iop.org/article/10.1088/1741-2552/aab4e4/pdf", "version": "publishedVersion" } # from email override_dict["10.1145/1031607.1031615"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.540.8125&rep=rep1&type=pdf", "version": "publishedVersion" } # from email override_dict["10.1007/s11227-016-1779-7"] = { "pdf_url": "https://hcl.ucd.ie/system/files/TJS-Hasanov-2016.pdf", "version": "publishedVersion", "host_type_set": "repository" } # from email override_dict["10.1016/s0020-0190(03)00351-x"] = { "pdf_url": "https://kam.mff.cuni.cz/~kolman/papers/noteb.ps", "version": "submittedVersion", "host_type_set": "repository" } # from email override_dict["10.1002/14651858.cd001704.pub4"] = { "pdf_url": "https://core.ac.uk/download/pdf/9440822.pdf", "version": "submittedVersion", "host_type_set": "repository" } # 
from email override_dict["10.1016/j.tetlet.2015.04.131"] = { "pdf_url": "https://www.sciencedirect.com/sdfe/pdf/download/read/aam/noindex/pii/S0040403915007881", "version": "acceptedVersion", "host_type_set": "publisher" } # from email override_dict["10.1016/j.nima.2016.04.104"] = { "pdf_url": "http://cds.cern.ch/record/2239750/files/1-s2.0-S0168900216303400-main.pdf", "version": "publishedVersion", "host_type_set": "repository" } # from email override_dict["10.1016/s1470-2045(15)00444-1"] = { "pdf_url": "https://www.statsarecool.com/data/uploads/journal-articles/who_declares_reds_meat_carcinogeniclancet_oct_2015.pdf", "version": "publishedVersion", "host_type_set": "repository" } # from email override_dict["10.1056/NEJM199406233302502"] = { "pdf_url": "https://www.nejm.org/doi/full/10.1056/NEJM199406233302502", "version": "publishedVersion", "host_type_set": "publisher" } # from email override_dict["10.1056/NEJMra1201534"] = { "pdf_url": "https://www.nejm.org/doi/pdf/10.1056/NEJMra1201534", "version": "publishedVersion", "host_type_set": "publisher" } # from email override_dict["10.1016/j.cmet.2018.03.012"] = { "pdf_url": "https://www.biorxiv.org/content/biorxiv/early/2018/01/15/245332.full.pdf", "version": "submittedVersion", "host_type_set": "repository" } # from email override_dict["10.1093/sf/65.1.1"] = { "pdf_url": "https://faculty.washington.edu/charles/new%20PUBS/A52.pdf", "version": "publishedVersion", "host_type_set": "repository" } # from email override_dict["10.1088/1751-8121/aabd9c"] = {} # from email override_dict["10.1017/CBO9781139173728.002"] = {} # from email override_dict["10.2174/97816810846711170101"] = {} # from email override_dict["10.1177/1354066196002003001"] = {} # from email override_dict["10.1093/bioinformatics/bty721"] = {} # from email override_dict["10.1088/1361-6528/aac7a4"] = {} # from email override_dict["10.1088/1361-6528/aac645"] = {} # from email override_dict["10.1111/1748-8583.12159"] = {} # from email 
override_dict["10.1042/BJ20080963"] = {} # from email override_dict["10.1136/bmj.j5007"] = {} # from email override_dict["10.1016/j.phrs.2017.12.007"] = {} # from email override_dict["10.4324/9781315770185"] = {} # from email override_dict["10.1108/PIJPSM-02-2016-0019"] = {} # from email override_dict["10.1016/j.ejca.2017.07.015"] = {} # from email override_dict["10.1080/14655187.2017.1469322"] = {} # from email override_dict["10.1080/02684527.2017.1407549"] = {} # from email override_dict["10.1093/jat/bky025"] = {} # from email override_dict["10.1016/j.midw.2009.07.004"] = {} # from email override_dict["10.1177/247553031521a00105"] = {} # from email override_dict["10.1002/0471445428"] = {} # from email override_dict["10.1007/978-3-642-31232-8"] = {} # ticket 267 override_dict["10.1016/j.anucene.2014.08.021"] = {} # ticket 199 # pdf has embedded password protection override_dict["10.22381/rcp1720184"] = {} # ticket 574 # pdf has embedded password protection override_dict["10.22381/EMFM14220195"] = {} # ticket 256 # journal in doaj but article not available override_dict["10.1016/j.mattod.2018.03.001"] = {} # ticket 277 # pmh record with spurious title: oai:works.swarthmore.edu:fac-psychology-1039 override_dict["10.1016/j.actpsy.2010.01.009"] = {} # ticket 280 # green scrape gets overexcited about a .doc link override_dict["10.1108/09596111211217932"] = {} # ticket 279 # match to wrong pdf, currently suppressed incorrectly by bad pdf check override_dict["10.1238/physica.topical.102a00059"] = {} # ticket 275 override_dict["10.1039/c7nj03253f"] = {} # email override_dict['10.1007/978-3-642-30350-0'] = {} # ticket 135 # bad title / last author match override_dict["10.1016/s0140-6736(17)31287-4"] = {} # ticket 98 # two similar articles with this title override_dict["10.1002/14651858.CD012414.pub2"] = {} # ticket 322 # pmh match on a cover sheet override_dict["10.1116/1.5046531"] = {} # ticket 631 # withdrawn article override_dict["10.5812/jjm.3664"] = {} # ticket 832 
override_dict["10.5935/scd1984-8773.20168409"] = {} # ticket 1047 # book chapter has a bronze tag override_dict["10.1002/9781119473992"] = {} # from email override_dict["10.1016/S0022-1996(00)00093-3"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.475.3874&rep=rep1&type=pdf", "version": "submittedVersion", "host_type_set": "repository" } # from email override_dict["10.1177/088840649401700203"] = { "pdf_url": "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.1014.8577&rep=rep1&type=pdf", "version": "publishedVersion", "host_type_set": "repository" } # from email override_dict["10.7326/L18-0139"] = { "pdf_url": "http://annals.org/data/journals/aim/936928/aime201804170-l180139.pdf", "version": "publishedVersion", "host_type_set": "publisher" } # from email override_dict["10.1007/978-3-319-48881-3_55"] = { "pdf_url": "http://liu.diva-portal.org/smash/get/diva2:1063949/FULLTEXT01.pdf", "version": "acceptedVersion", "host_type_set": "repository" } # from email override_dict["10.1109/ICCVW.2015.86"] = { "pdf_url": "http://liu.diva-portal.org/smash/get/diva2:917646/FULLTEXT01", "version": "acceptedVersion", "host_type_set": "repository" } # from email override_dict["10.1126/science.aap9559"] = { "pdf_url": "http://vermontcomplexsystems.org/share/papershredder/vosoughi2018a.pdf", "version": "publishedVersion", "host_type_set": "repository" } # from email override_dict["10.1109/tpds.2012.97"] = { "pdf_url": "https://www.cnsr.ictas.vt.edu/publication/06171175.pdf", "version": "publishedVersion", "host_type_set": "repository" } # ticket 261 # crossref metadata points to wrong article override_dict["10.4149/BLL_2013_058"] = { "pdf_url": "http://www.elis.sk/download_file.php?product_id=3759&session_id=lnkeo437s8hv5t0r28g6ku93b0", "version": "publishedVersion", "host_type_set": "publisher" } # ticket 317 # broken link on citeseer override_dict["10.1016/b978-1-55860-307-3.50012-5"] = { "pdf_url": 
"http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.57.3196&rep=rep1&type=pdf", "version": "submittedVersion", "host_type_set": "repository" } # ticket 195 # wrong registered landing page override_dict["10.21285/2227-2925-2018-8-2-9-18"] = { "metadata_url": "http://journals.istu.edu/izvestia_biochemi/journals/2018/02/articles/01", "version": "publishedVersion", "host_type_set": "publisher", "evidence": oa_evidence.oa_journal_doaj } # ticket 213 # journal issue is open override_dict["10.14195/2182-7982_32"] = { "metadata_url": "https://doi.org/10.14195/2182-7982_32", "version": "publishedVersion", "host_type_set": "publisher" } override_dict["10.1016/S2213-8587(16)30320-5"] = { "pdf_url": "http://www.spdm.org.pt/media/1373/pku-guidelines_2017.pdf", "version": "publishedVersion", "host_type_set": "repository" } # ticket 433 override_dict["10.1144/GSL.JGS.1846.002.01-02.54"] = { "metadata_url": "https://www.biodiversitylibrary.org/item/109652#page/473/mode/1up", "version": "publishedVersion", "host_type_set": "repository" } # ticket 223 # pme record has wrong page url override_dict["10.1002/abc.207"] = { "pdf_url": "https://repository.library.northeastern.edu/files/neu:344561/fulltext.pdf", "metadata_url": "https://repository.library.northeastern.edu/files/neu:344561", "version": "submittedVersion", "host_type_set": "repository" } # ticket 304 # inline citation pdf links override_dict["10.7766/alluvium.v3.1.05"] = { "metadata_url": "https://doi.org/10.7766/alluvium.v3.1.05", "version": "publishedVersion", "host_type_set": "publisher" } # ticket 376 override_dict["10.1080/01639374.2017.1358232"] = { "pdf_url": "https://groups.niso.org/apps/group_public/download.php/17446/Understanding%20Metadata.pdf", "version": "publishedVersion", "host_type_set": "repository" } # ticket 539 # malformed url in pmh record override_dict["10.1642/0004-8038(2007)124[1121:EOWNVT]2.0.CO;2"] = { "pdf_url": 
"https://repository.si.edu/bitstream/handle/10088/35181/NZP_Marra_2007-ECOLOGY_OF_WEST_NILE_VIRUS_TRANSMISSION_AND_ITS_IMPACT_ON_BIRDS_IN_THE_WESTERN_HEMISPHERE.pdf", "version": "publishedVersion", "host_type_set": "repository" } # https://github.com/Impactstory/unpaywall/issues/41 # link to preprint with different DOI override_dict["10.1038/s41592-018-0235-4"] = { "metadata_url": "https://www.biorxiv.org/content/10.1101/306951v3", "pdf_url": "https://www.biorxiv.org/content/biorxiv/early/2018/07/24/306951.full.pdf", "version": "submittedVersion", "host_type_set": "repository" } # issue 530 # unrelated pmh record has wrong DOI override_dict["10.1056/nejmoa063842"] = { "metadata_url": "https://www.nejm.org/doi/10.1056/NEJMoa063842", "version": "publishedVersion", "host_type_set": "publisher" } # issue 571 # scrape finds supplementary file override_dict["10.21203/rs.2.11958/v1"] = { "metadata_url": "https://doi.org/10.21203/rs.2.11958/v1", "version": "submittedVersion", "host_type_set": "repository", "license": "cc-by" } # twitter override_dict['10.1002/jclp.22680'] = { 'pdf_url': 'https://dl.uswr.ac.ir/bitstream/Hannan/62873/1/2018%20JCpsychology%20Volume%2074%20Issue%2011%20November%20%2811%29.pdf', 'version': 'publishedVersion', 'host_type_set': 'repository', } # ticket 680 override_dict['10.17059/2015-4-27'] = { 'metadata_url': 'http://economyofregion.com/archive/2015/57/2731/', 'pdf_url': 'http://economyofregion.com/archive/2015/57/2731/pdf/', 'version': 'publishedVersion', 'host_type_set': 'publisher', } # ticket 681 override_dict['10.17059/2016-1-19'] = { 'metadata_url': 'http://economyofregion.com/archive/2016/58/2778/', 'pdf_url': 'http://economyofregion.com/archive/2016/58/2778/pdf/', 'version': 'publishedVersion', 'host_type_set': 'publisher', } # ticket 743 override_dict['10.1016/S0140-6736(07)61162-3'] = { 'metadata_url': 
'https://www.semanticscholar.org/paper/Cannabis-use-and-risk-of-psychotic-or-aff-ective-a-Moore-Zammit/6e5bc8bf7814c62db319632ca939ad68a6770d1b', 'pdf_url': 'https://pdfs.semanticscholar.org/641e/6aba769421d4308d1ad107684eeca7f687d1.pdf', 'version': 'publishedVersion', 'host_type_set': 'repository', } # ticket 835 override_dict['10.23912/9781911396512-3454'] = { 'metadata_url': 'https://doi.org/10.23912/9781911396512-3454', 'pdf_url': 'https://www.goodfellowpublishers.com/academic-publishing.php?promoCode=&partnerID=&housekeeping=getfile&productID=3657', 'version': 'publishedVersion', 'host_type_set': 'publisher', } # ticket 899, missing from IR override_dict['10.1080/0361526x.2019.1551004'] = { 'metadata_url': 'https://inspire.redlands.edu/oh_articles/249/', 'pdf_url': 'https://inspire.redlands.edu/cgi/viewcontent.cgi?article=1190&context=oh_articles', 'version': 'publishedVersion', 'host_type_set': 'repository', 'license': 'cc-by-nc', } # ticket 1029, can't detect PDF override_dict['10.1891/2156-5287.8.4.252'] = { 'metadata_url': 'https://doi.org/10.1891/2156-5287.8.4.252', 'pdf_url': 'https://connect.springerpub.com/content/sgrijc/8/4/252.full.pdf', 'version': 'publishedVersion', 'host_type_set': 'publisher', } # ticket 1057, full issue pdf found first but has errors override_dict['10.5152/turkjnephrol.2020.3579'] = { 'metadata_url': 'https://doi.org/10.5152/turkjnephrol.2020.3579', 'pdf_url': 'https://turkjnephrol.org/Content/files/sayilar/420/84-88(2).pdf', 'version': 'publishedVersion', 'host_type_set': 'publisher', } # ticket 1064, doi.org/10.1016/j.jcmg.2012.07.005 redirects to 10.1016/j.jcmg.2012.08.001 override_dict['10.1016/j.jcmg.2012.07.005'] = { 'metadata_url': 'https://www.sciencedirect.com/science/article/pii/S1936878X12005748', 'version': 'publishedVersion', 'host_type_set': 'publisher', } # ticket 1084 faculty page override_dict['10.1016/j.jebo.2012.09.021'] = { 'pdf_url': 
'https://cpb-us-w2.wpmucdn.com/sites.wustl.edu/dist/c/2014/files/2019/06/tennis.pdf', 'version': 'submittedVersion', 'host_type_set': 'repository', } # ticket 1118, can't read landing page override_dict['10.3917/zil.006.0009'] = { 'metadata_url': 'https://doi.org/10.3917/zil.006.0009', 'version': 'publishedVersion', 'host_type_set': 'publisher', } # ticket 1151, doi.org url 404 override_dict['10.1001/jamafacial.2013.406'] = { 'metadata_url': 'https://www.liebertpub.com/doi/10.1001/archfaci.2013.406', 'version': 'publishedVersion', 'host_type_set': 'publisher', } #ticket 1152, doi.org url leads to wrong article override_dict['10.1016/j.aott.2018.06.004'] = { 'metadata_url': 'https://www.aott.org.tr/en/comparison-of-ultrasound-and-extracorporeal-shock-wave-therapy-in-lateral-epicondylosis-133459', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', 'license': 'cc-by-nc-nd', } #ticket 1162, can't download PDF override_dict['10.3406/ahess.1976.293748'] = { 'metadata_url': 'https://www.persee.fr/doc/ahess_0395-2649_1976_num_31_4_293748', 'version': 'publishedVersion', 'host_type_set': 'repository', 'license': 'cc-by-nc-sa', } #ticket 1184, missing from philarchive override_dict['10.1007/s10670-020-00241-4'] = { 'metadata_url': 'https://philarchive.org/rec/LOGIAI', 'pdf_url': 'https://philarchive.org/archive/LOGIAI', 'version': 'acceptedVersion', 'host_type_set': 'repository', } override_dict['10.1007/s11098-019-01378-x'] = { 'metadata_url': 'https://philarchive.org/rec/LOGTST', 'pdf_url': 'https://philarchive.org/archive/LOGTST', 'version': 'acceptedVersion', 'host_type_set': 'repository', } override_dict['10.1002/tht3.395'] = { 'metadata_url': 'https://philarchive.org/rec/LOGSUR', 'pdf_url': 'https://philarchive.org/archive/LOGSUR', 'version': 'publishedVersion', 'host_type_set': 'repository', } override_dict['10.3917/lig.764.0006'] = { 'metadata_url': 'https://doi.org/10.3917/lig.764.0006', 'version': 'publishedVersion', 
'host_type_set': 'publisher', } # ticket 1260, wrong doi url override_dict['10.5603/ait.a2017.0053'] = { 'metadata_url': 'https://www.termedia.pl/Pharmacokinetic-drug-drug-interactions-in-the-intensive-care-unit-single-centre-experience-and-literature-review,118,38092,1,1.html', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', } # ticket 22157, wrong doi url override_dict['10.5603/ait.a2015.0073'] = { 'metadata_url': 'https://www.termedia.pl/Hemodynamic-monitoring-To-calibrate-or-not-to-calibrate-r-nPart-1-Calibrated-techniques,118,38312,0,1.html', 'pdf_url': 'https://www.termedia.pl/Journal/-118/pdf-38312-10?filename=pages_487-500_article_43713.pdf', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', } # ticket 22157, wrong doi url override_dict['10.5603/ait.a2015.0076'] = { 'metadata_url': 'https://www.termedia.pl/Hemodynamic-monitoring-To-calibrate-or-not-to-calibrate-r-nPart-2-Non-calibrated-techniques,118,38313,0,1.html', 'pdf_url': 'https://www.termedia.pl/Journal/-118/pdf-38313-10?filename=pages_501-516_article_43754.pdf', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', } # ticket 3417, wrong doi url override_dict['10.15414/jmbfs.2016.5.special1.64-68'] = { 'metadata_url': 'https://www.jmbfs.org/issue/february-2016-vol-5-special-1/jmbfs-2016_020-ivanuska/?issue_id=4120&article_id=18', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', } # ticket 4625, wrong doi url override_dict['10.4103/1011-4564.204985'] = { 'metadata_url': 'http://www.jmedscindmc.com/article.asp?issn=1011-4564;year=2017;volume=37;issue=2;spage=37;epage=43;aulast=Huang;type=0', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', } override_dict['10.4103/jmedsci.jmedsci_99_16'] = { 'metadata_url': 
'http://www.jmedscindmc.com/article.asp?issn=1011-4564;year=2017;volume=37;issue=2;spage=44;epage=49;aulast=Doka;type=0', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', } override_dict['10.4103/jmedsci.jmedsci_95_16'] = { 'metadata_url': 'http://www.jmedscindmc.com/article.asp?issn=1011-4564;year=2017;volume=37;issue=2;spage=50;epage=55;aulast=Hsu;type=0', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', } override_dict['10.4103/jmedsci.jmedsci_104_16'] = { 'metadata_url': 'http://www.jmedscindmc.com/article.asp?issn=1011-4564;year=2017;volume=37;issue=2;spage=56;epage=60;aulast=Lin;type=0', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', } override_dict['10.4103/jmedsci.jmedsci_100_16'] = { 'metadata_url': 'http://www.jmedscindmc.com/article.asp?issn=1011-4564;year=2017;volume=37;issue=2;spage=61;epage=68;aulast=Shen;type=0', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', } override_dict['10.4103/jmedsci.jmedsci_12_16'] = { 'metadata_url': 'http://www.jmedscindmc.com/article.asp?issn=1011-4564;year=2017;volume=37;issue=2;spage=69;epage=71;aulast=Chaitra;type=0', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', } override_dict['10.4103/jmedsci.jmedsci_92_15'] = { 'metadata_url': 'http://www.jmedscindmc.com/article.asp?issn=1011-4564;year=2017;volume=37;issue=2;spage=72;epage=75;aulast=Huang;type=0', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', } override_dict['10.4103/jmedsci.jmedsci_27_16'] = { 'metadata_url': 'http://www.jmedscindmc.com/article.asp?issn=1011-4564;year=2017;volume=37;issue=2;spage=76;epage=79;aulast=Saha;type=0', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', } # end ticket 4625 # ticket 22137, doi 
url redirects to http, then https redirect fails override_dict['10.18845/te.v1i2.868'] = { 'metadata_url': 'https://revistas.tec.ac.cr/index.php/tec_empresarial/article/view/868', 'pdf_url': 'https://revistas.tec.ac.cr/index.php/tec_empresarial/article/view/868', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', } # ticket 22208. 404 at doi url override_dict['10.26442/terarkh201890417-20'] = { "host_type_set": "publisher", "version": "publishedVersion", "evidence": "oa journal (via doaj)", "metadata_url": "https://ter-arkhiv.ru/0040-3660/article/view/32440", "license": "cc-by", } # ticket 22274. gold journal but DOI doesn't resolve override_dict['10.25251/skin.3.6.4'] = { 'metadata_url': 'https://jofskin.org/index.php/skin/article/view/625', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via observed oa rate)', } # ticket 22287. journal in DOAJ, but article missing from https://sophia.ups.edu.ec/index.php/sophia/issue/view/151 override_dict['10.17163/soph.n25.2018.03'] = { 'metadata_url': 'https://www.redalyc.org/jatsRepo/4418/441855948003/html/index.html', 'license': 'cc-by-nc-sa', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', } # ticket 22562. journal in DOAJ, doi.org url 404s override_dict['10.1162/itid.2003.1.1.75'] = { 'metadata_url': 'https://itidjournal.org/index.php/itid/article/view/136.html', 'pdf_url': 'https://itidjournal.org/index.php/itid/article/download/136/136-472-1-PB.pdf', 'license': 'cc-by-nc-sa', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', } # ticket 22936. 
journal in DOAJ, doi.org url 404s, same article as above override_dict['10.1162/154475203771799720'] = { 'metadata_url': 'https://itidjournal.org/index.php/itid/article/view/136.html', 'pdf_url': 'https://itidjournal.org/index.php/itid/article/download/136/136-472-1-PB.pdf', 'license': 'cc-by-nc-sa', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', } # ticket 22674. doesn't want to link to PDF for some reason. override_dict['10.1101/2020.02.24.962878'] = { 'metadata_url': 'https://doi.org/10.1101/2020.02.24.962878', 'version': 'submittedVersion', 'host_type_set': 'repository', 'evidence': 'oa repository (via free pdf)', } # ticket 22562. journal in DOAJ, broken doi.org url override_dict['10.5505/tbdhd.2018.50251'] = { 'metadata_url': 'http://dergi.bdhd.org.tr/eng/jvi.aspx?un=TBDHD-50251&volume=24&issue=2', 'pdf_url': 'https://jag.journalagent.com/tbdhd/pdfs/TBDHD-50251-CASE_REPORT-YEKTAS.pdf', 'license': 'cc-by-nc-nd', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'oa journal (via doaj)', } # ticket 22877. journal was detected as all-OA but DOIs don't work now override_dict['10.14800/scti.232'] = { 'metadata_url': 'https://www.smartscitech.com/index.php/SCTI/article/view/828', 'version': 'publishedVersion', 'host_type_set': 'publisher', } # ticket 22945. first citeseerx link broken override_dict['10.1111/1467-8306.9302004'] = { 'metadata_url': 'https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.109.1825', 'pdf_url': 'https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.109.1825&rep=rep1&type=pdf', 'version': 'submittedVersion', 'host_type_set': 'repository', 'evidence': 'oa repository (via free pdf)', } # ticket 22967. 
first citeseerx link is slide deck override_dict['10.1109/msp.2010.936019'] = { 'metadata_url': 'http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.470.8283', 'pdf_url': 'http://www1.se.cuhk.edu.hk/~manchoso/papers/sdrapp-SPM.pdf', 'version': 'acceptedVersion', 'host_type_set': 'repository', 'evidence': 'oa repository (via free pdf)', } # ticket 23017. can't scrape cairn override_dict['10.3917/sr.035.0007'] = { 'metadata_url': 'https://doi.org/10.3917/sr.035.0007', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'license': 'cc-by-nc-nd', } # ticket 23017. can't scrape cairn override_dict['10.3917/sr.039.0119'] = { 'metadata_url': 'https://doi.org/10.3917/sr.039.0119', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'license': 'cc-by-nc-nd', } # ticket 23020 override_dict['10.3847/2041-8213/abe4de'] = { 'metadata_url': 'https://doi.org/10.3847/2041-8213/abe4de', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'license': 'cc-by', } # ticket 23020 override_dict['10.3847/2041-8213/abe71d'] = { 'metadata_url': 'https://doi.org/10.3847/2041-8213/abe71d', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'license': 'cc-by', } # ticket 23020 override_dict['10.3847/2041-8213/abed53'] = { 'metadata_url': 'https://doi.org/10.3847/2041-8213/abed53', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'license': 'cc-by', } # ticket 23020 override_dict['10.3847/2041-8213/abee6a'] = { 'metadata_url': 'https://doi.org/10.3847/2041-8213/abee6a', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'license': 'cc-by', } # ticket 1025 # WOS user says full article isn't available override_dict['10.1016/j.fuel.2019.116234'] = {} # ticket 215 # doi.org links point to wrong article override_dict["10.1515/res-2016-0002"] = {} # ticket 584 # repo match to dissertation with same title and author override_dict["10.3726/978-3-0343-2544-8"] = {} # book front matter override_dict["10.1007/978-3-319-78349-9"] = {} # ticket 
594 override_dict["10.1016/j.chemgeo.2016.02.020"] = {} # ticket 240 part 2. mislabeled in repository. override_dict["10.1111/eip.12323"] = {} # ticket 928. CC license in references. override_dict['10.1007/s11012-016-0472-5'] = {} # ticket 968. CC license for dataset. override_dict['10.1007/s12275-020-9536-2'] = {} # ticket 966. PDF link only works once. override_dict['10.1093/ee/nvz159'] = {} # ticket 1371. someone doesn't like green OA override_dict['10.1007/s10798-019-09554-0'] = {} # ticket 6937. bad license info on page override_dict['10.1016/j.breast.2015.07.036'] = {} # ticket 22163. doing a favor. override_dict['10.1016/j.energy.2015.06.127'] = {} # ticket 22794. page and metadata have license override_dict['10.1515/pac-2020-0702'] = {} # ticket 22791 override_dict['10.1038/s41574-020-00451-4'] = {} # ticket 22636 override_dict['10.1007/978-981-15-4814-7'] = { 'metadata_url': 'https://doi.org/10.1007/978-981-15-4814-7', 'version': 'publishedVersion', 'host_type_set': 'publisher', 'evidence': 'open (via free pdf)', } override_dict['10.1080/1097198x.2020.1752084'] = {} # ticket 22892, doi resolution is wrong # is https://www.journalofhospitalmedicine.com/jhospmed/article/189543/hospital-medicine/you-cant-have-it-all-experience-academic-hospitalists # should be https://www.journalofhospitalmedicine.com/jhospmed/article/189545/hospital-medicine/barriers-earlier-hospital-discharge-what-matters-most override_dict['10.12788/jhm.3094'] = {} # ticket 535 # book & chapters listed at https://www.brepolsonline.net/doi/book/10.1484/M.RELMIN-EB.6.09070802050003050502050201 for doi in ['10.1484/M.RELMIN-EB.6.09070802050003050502050201'] + [ '10.1484/M.RELMIN-EB.5.1038' + str(n) for n in range(59, 76) ]: override_dict[doi] = { "pdf_url": "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=23027", "version": "publishedVersion", "host_type_set": "repository" } # book & chapters listed at https://www.brepolsonline.net/doi/book/10.1484/M.RELMIN-EB.5.109256 for doi 
in ['10.1484/M.RELMIN-EB.5.109256'] + list( ['10.1484/M.RELMIN-EB.5.1091' + str(n) for n in range(58, 70)]): override_dict[doi] = { "pdf_url": "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=26957", "version": "publishedVersion", "host_type_set": "repository" } # book & chapters listed at https://www.brepolsonline.net/doi/book/10.1484/M.RELMIN-EB.5.108025 for doi in ['10.1484/M.RELMIN-EB.5.108025'] + list( ['10.1484/M.RELMIN-EB.5.1084' + str(n) for n in range(35, 51)]): override_dict[doi] = { "pdf_url": "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=26953", "version": "publishedVersion", "host_type_set": "repository" } # book & chapters listed at https://www.brepolsonline.net/doi/book/10.1484/M.RELMIN-EB.6.09070802050003050500050207 for doi in ['10.1484/M.RELMIN-EB.6.09070802050003050500050207'] + list( ['10.1484/M.RELMIN-EB.1.1018' + str(n) for n in range(74, 92)]): override_dict[doi] = { "pdf_url": "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=23029", "version": "publishedVersion", "host_type_set": "repository" } # book & chapters listed at https://www.brepolsonline.net/doi/book/10.1484/M.RELMIN-EB.5.108940 for doi in ['10.1484/M.RELMIN-EB.5.108940'] + list( ['10.1484/M.RELMIN-EB.5.1093' + str(n) for n in range(46, 60)]): override_dict[doi] = { "pdf_url": "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=26960", "version": "publishedVersion", "host_type_set": "repository" } # book & chapters listed at https://www.brepolsonline.net/doi/book/10.1484/M.RELMIN-EB.6.09070802050003050408050408 for doi in ['10.1484/M.RELMIN-EB.6.09070802050003050408050408'] + list( ['10.1484/M.RELMIN-EB.1.1018' + str(n) for n in range(10, 27)]): override_dict[doi] = { "pdf_url": "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=25736", "version": "publishedVersion", "host_type_set": "repository" } # book & chapters listed at https://www.brepolsonline.net/action/showBook?doi=10.1484%2FM.RELMIN-EB.5.106169 for doi 
in ['10.1484/M.RELMIN-EB.5.106169'] + list( ['10.1484/M.RELMIN-EB.4.000' + str(n).zfill(2) for n in range(2, 15)]): override_dict[doi] = { "pdf_url": "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=23028", "version": "publishedVersion", "host_type_set": "repository" } # book & chapters listed at https://www.brepolsonline.net/doi/book/10.1484/M.RELMIN-EB.5.109274 for doi in ['10.1484/M.RELMIN-EB.5.109274'] + list( ['10.1484/M.RELMIN-EB.5.111' + str(n) for n in range(590, 615)]): override_dict[doi] = { "pdf_url": "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=26954", "version": "publishedVersion", "host_type_set": "repository" } # book & chapters listed at https://www.brepolsonline.net/action/showBook?doi=10.1484/M.RELMIN-EB.5.112302 for doi in ['10.1484/M.RELMIN-EB.5.112302'] + list( ['10.1484/M.RELMIN-EB.5.1115' + str(n) for n in range(13, 29)]): override_dict[doi] = { "pdf_url": "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=26961", "version": "publishedVersion", "host_type_set": "repository" } override_dict["10.1016/s1474-4422(19)30285-6"] = { "metadata_url": "http://hdl.handle.net/2066/207798", "version": "publishedVersion", "host_type_set": "repository", "evidence": "oa repository (manual)" } # the use of this is counting on the doi keys being lowercase/cannonical response = {} for k, v in override_dict.items(): response[normalize_doi(k)] = v return response
def run_through_dois(filename=None, reverse=None, loggly=False):
    """Read DOIs from a file (one per line) and store external API lookups for each.

    For every normalized DOI, fetches availability data from the Open Access
    Button API and metadata from the dissemin API, saving both JSON responses
    on an Oab row (created if missing) and committing after each DOI.

    filename -- path to a text file with one DOI per line
    reverse  -- if truthy, process the lines in reverse order; the progress
                counter then starts at -len(lines) and climbs toward 0
    loggly   -- accepted for backward compatibility; currently unused
    """
    total_start = time()
    i = 0

    # read everything up front so the handle is closed before the long
    # network loop below (the original leaked the handle on any exception)
    with open(filename, "r") as fh:
        lines = fh.readlines()

    if reverse:
        logger.info(u"reverse!")
        lines.reverse()
        i = -1 * len(lines)

    # deduplicate while preserving order; set membership keeps this O(n)
    # instead of the quadratic list-membership scan
    dois = []
    seen = set()
    for line in lines:
        doi = line.strip()
        if doi not in seen:
            seen.add(doi)
            dois.append(doi)

    logger.info(u"length of deduped doi list: {}".format(len(dois)))

    for doi in dois:
        try:
            my_doi = normalize_doi(doi)
        except NoDoiException:
            logger.info(u"bad doi: {}".format(doi))
            continue
        if not my_doi:
            logger.info(u"bad doi: {}".format(doi))
            continue

        # get-or-create the Oab row keyed by the normalized DOI
        my_pub = Oab.query.get(my_doi)
        if not my_pub:
            my_pub = Oab()
            db.session.add(my_pub)
            my_pub.id = my_doi

        my_doi_url = "http://doi.org/{}".format(my_doi)
        my_doi_url_encoded = urllib.quote_plus(my_doi_url)
        api_url = "https://api.openaccessbutton.org/availability?url={}".format(
            my_doi_url_encoded)
        headers = {"content-type": "application/json"}

        r = requests.get(api_url, headers=headers)
        if r.status_code == 200:
            logger.info(u"success with oab! with {}".format(my_doi))
            my_pub.api = r.json()
            # mark the JSON column dirty so SQLAlchemy persists the change
            flag_modified(my_pub, "api")
        else:
            logger.info(u"problem with oab, status_code {}".format(
                r.status_code))

        dissemin_url = "http://dissem.in/api/{}".format(my_doi)
        r = requests.get(dissemin_url, headers=headers)
        if r.status_code == 200:
            logger.info(u"success! with dissemin! with {}".format(my_doi))
            my_pub.dissemin = r.json()
            flag_modified(my_pub, "dissemin")
        else:
            logger.info(u"problem with dissemin, status_code {}".format(
                r.status_code))

        # commit per DOI so partial progress survives a crash
        safe_commit(db)
        i += 1
        logger.info(u"finished {} in {} seconds".format(i, elapsed(total_start, 2)))