def reported_noncompliant_url_fragments(dirty_doi):
    if not dirty_doi:
        return []

    # build a lookup keyed by normalized DOI, with each URL fragment lowercased
    lookup_normalized = {}
    for (doi_key, fragment_list) in lookup_raw.iteritems():
        lookup_normalized[normalize_doi(doi_key)] = [
            noncompliant_url_fragment.lower()
            for noncompliant_url_fragment in fragment_list
        ]

    return lookup_normalized.get(normalize_doi(dirty_doi), [])
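For illustration, here is a minimal sketch of calling this function. The lookup_raw table and the stand-in normalize_doi below are assumptions for the sketch, not the project's real data or helper:

# hypothetical lookup table; the real lookup_raw lives elsewhere in the module
lookup_raw = {"10.1234/Example.DOI": ["Publisher.com/Fulltext", "Mirror.org/PDF"]}

def normalize_doi(doi, return_none_if_error=False):
    # stand-in only; the real helper does more validation
    return doi.strip().lower() if doi else None

reported_noncompliant_url_fragments("10.1234/EXAMPLE.DOI")
# -> ['publisher.com/fulltext', 'mirror.org/pdf']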
Example #2
    def is_bronze(self):
        if self.best_url and not (self.is_gold or
                                  self.is_green) and not self.has_open_license:
            return True

        if is_doi_url(self.best_url):
            url_doi = normalize_doi(self.best_url, return_none_if_error=True)
            unquoted_doi = normalize_doi(unquote(self.best_url),
                                         return_none_if_error=True)

            return (self.doi in (url_doi, unquoted_doi)
                    and not (self.is_gold or self.is_hybrid or self.is_green))

        return False
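The doi.org-URL comparison above can be exercised on its own. A sketch, assuming (as the code implies) that normalize_doi can pull a bare DOI out of a doi.org URL:

from urllib import unquote  # Python 2; urllib.parse.unquote on Python 3

best_url = "https://doi.org/10.1234%2Fexample.doi"
url_doi = normalize_doi(best_url, return_none_if_error=True)                # encoded form
unquoted_doi = normalize_doi(unquote(best_url), return_none_if_error=True)  # decoded form
# the location only counts as bronze when self.doi matches one of these readings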
Example #3
def scroll_through_all_dois(query_doi=None, first=None, last=None, today=False, week=False, chunk_size=1000):
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers = {"Accept": "application/json", "User-Agent": "mailto:[email protected]"}

    if first:
        base_url = "https://api.crossref.org/works?filter=from-created-date:{first},until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"
    else:
        base_url = "https://api.crossref.org/works?filter=until-created-date:{last}&rows={rows}&select=DOI&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    number_added = 0

    while has_more_responses:
        has_more_responses = False

        start_time = time()
        url = base_url.format(
            first=first,
            last=last,
            rows=chunk_size,
            next_cursor=next_cursor)
        logger.info(u"calling url: {}".format(url))

        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds.  url: {}".format(elapsed(start_time, 2), url))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(resp.status_code))
            return number_added

        resp_data = resp.json()["message"]
        next_cursor = resp_data.get("next-cursor", None)
        if next_cursor:
            next_cursor = quote(next_cursor)
            if resp_data["items"] and len(resp_data["items"]) == chunk_size:
                has_more_responses = True

        dois_from_api = [normalize_doi(api_raw["DOI"]) for api_raw in resp_data["items"]]
        added_pubs = add_new_pubs_from_dois(dois_from_api)
        if dois_from_api:
            logger.info(u"got {} dois from api".format(len(dois_from_api)))
        if added_pubs:
            logger.info(u"{}: saved {} new pubs, including {}".format(
                first, len(added_pubs), added_pubs[-2:]))

        number_added += len(added_pubs)

        logger.info(u"loop done in {} seconds".format(elapsed(start_time, 2)))

    return number_added
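A usage sketch under the signature above; the date window and page size are illustrative:

# harvest DOIs created in a one-week window, 500 per Crossref page
total = scroll_through_all_dois(first="2020-01-01", last="2020-01-08", chunk_size=500)
logger.info(u"added {} pubs in total".format(total))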
Example #4
def run_update(parsed_args):
    update = update_registry.get(parsed_args.fn)

    start = time()

    # convenience method for handling a doi
    if parsed_args.doi:
        from pub import Pub
        from util import normalize_doi

        my_pub = db.session.query(Pub).filter(
            Pub.id == normalize_doi(parsed_args.doi)).first()
        parsed_args.id = my_pub.id
        logger.info(u"Got database hit for this doi: {}".format(my_pub.id))

    update.run(**vars(parsed_args))

    db.session.remove()
    logger.info(u"finished update in {} secconds".format(elapsed(start)))
Example #5
def run(parsed_args, job_type):
    start = time()
    if job_type in ("normal", "hybrid"):
        update = update_registry.get("Pub."+process_name(job_type))
        if parsed_args.doi:
            parsed_args.id = normalize_doi(parsed_args.doi)
            parsed_args.doi = None
    else:
        update = update_registry.get("DateRange.get_unpaywall_events")
        # update = update_registry.get("DateRange.get_pmh_events")

    update.run(**vars(parsed_args))

    logger.info(u"finished update in {} seconds".format(elapsed(start)))

    resp = None
    if job_type in ("normal", "hybrid"):
        my_pub = Pub.query.get(parsed_args.id)
        resp = my_pub.response_jsonb
        pprint(resp)

    return resp
Example #6
def crawl_crossref(page_delay=None, page_length=None):
    # see if there's an unfinished crawl
    active_crawl = None
    unfinished_crawl = CrossrefCrawl.query.filter(
        CrossrefCrawl.done.is_(None)).scalar()

    if unfinished_crawl:
        logger.info(u'found an unfinished crawl starting at {}'.format(
            unfinished_crawl.started))

        # see if it's still running
        last_request = unfinished_crawl.last_request
        if last_request is None or last_request > (
                datetime.datetime.utcnow() - datetime.timedelta(hours=2)):
            logger.info(
                u'aborting, unfinished crawl still looks active. started: {}, last request {}'
                .format(unfinished_crawl.started,
                        unfinished_crawl.last_request))
            return

        # see if we should resume it
        if unfinished_crawl.cursor_tries < 5:
            # resume it
            active_crawl = unfinished_crawl
        else:
            # kill it
            unfinished_crawl.done = False
            db.session.commit()

    if not active_crawl:
        logger.info(u'beginning a new crawl')
        active_crawl = CrossrefCrawl(started=datetime.datetime.utcnow(),
                                     cursor='*',
                                     cursor_tries=0)
        db.session.add(active_crawl)
        db.session.commit()

    root_url = u'https://api.crossref.org/works?cursor={next_cursor}'
    if page_length:
        root_url = root_url + u'&rows={}'.format(page_length)

    has_more_responses = True

    while has_more_responses:
        url = root_url.format(next_cursor=active_crawl.cursor)
        logger.info(u"calling url: {}".format(url))

        active_crawl.last_request = datetime.datetime.utcnow()
        db.session.commit()

        crossref_time = time()
        resp = get_response_page(url)
        logger.info(u"getting crossref response took {} seconds".format(
            elapsed(crossref_time, 2)))

        if not resp or resp.status_code != 200:
            # abort, try again later
            logger.info(u"error in crossref call, status_code = {}".format(
                resp and resp.status_code))
            active_crawl.cursor_tries += 1
            active_crawl.last_request = None
            db.session.commit()
            return
        else:
            # save DOIs
            resp_data = resp.json()["message"]

            page_dois = []
            for api_raw in resp_data["items"]:
                doi = normalize_doi(api_raw["DOI"])
                if doi:
                    page_dois.append({
                        'crawl_time': active_crawl.started,
                        'doi': doi
                    })

            # update cursor
            next_cursor = resp_data.get("next-cursor", None)
            if next_cursor:
                next_cursor = quote(next_cursor)

            if not resp_data["items"] or not next_cursor:
                has_more_responses = False
                active_crawl.done = True
            else:
                active_crawl.cursor = next_cursor
                active_crawl.cursor_tries = 0

            if page_dois:
                db.session.bulk_insert_mappings(CrossrefCrawlDoi, page_dois)

            db.session.commit()

            logger.info(u'added {} dois'.format(len(page_dois)))

            if page_delay:
                logger.info('sleeping {} seconds'.format(page_delay))
                sleep(page_delay)
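Invocation is simple; both keyword arguments are optional throttles (values here are illustrative):

# fetch 500 rows per page and pause one second between Crossref calls
crawl_crossref(page_delay=1, page_length=500)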
Example #7
def get_dois_and_data_from_crossref(query_doi=None,
                                    first=None,
                                    last=None,
                                    today=False,
                                    week=False,
                                    offset_days=0,
                                    chunk_size=1000,
                                    get_updates=False):

    root_url_doi = "https://api.crossref.org/works?filter=doi:{doi}"

    if get_updates:
        root_url_with_last = "https://api.crossref.org/works?order=desc&sort=indexed&filter=from-index-date:{first},until-index-date:{last}&rows={chunk}&cursor={next_cursor}"
        root_url_no_last = "https://api.crossref.org/works?order=desc&sort=indexed&filter=from-index-date:{first}&rows={chunk}&cursor={next_cursor}"
    else:
        root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first},until-created-date:{last}&rows={chunk}&cursor={next_cursor}"
        root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first}&rows={chunk}&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    num_pubs_added_so_far = 0
    pubs_this_chunk = []

    if week:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=7))
    elif today:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=2))

    if not first:
        first = datetime.date(2016, 4, 1)

    last = last and last - datetime.timedelta(days=offset_days)
    first = first and first - datetime.timedelta(days=offset_days)

    start_time = time()

    insert_pub_fn = add_pubs_or_update_crossref if get_updates else add_new_pubs

    while has_more_responses:
        if query_doi:
            url = root_url_doi.format(doi=query_doi)
        else:
            if last:
                url = root_url_with_last.format(first=first.isoformat(),
                                                last=last.isoformat(),
                                                next_cursor=next_cursor,
                                                chunk=chunk_size)
            else:
                # the query is much faster without a last date, even one far in the future
                url = root_url_no_last.format(first=first.isoformat(),
                                              next_cursor=next_cursor,
                                              chunk=chunk_size)

        logger.info("calling url: {}".format(url))
        crossref_time = time()

        resp = get_response_page(url)
        logger.info("getting crossref response took {} seconds".format(
            elapsed(crossref_time, 2)))
        if resp.status_code != 200:
            logger.info("error in crossref call, status_code = {}".format(
                resp.status_code))
            resp = None

        if resp:
            resp_data = resp.json()["message"]
            next_cursor = resp_data.get("next-cursor", None)
            if next_cursor:
                next_cursor = quote(next_cursor)

            if not resp_data["items"] or not next_cursor:
                has_more_responses = False

            for api_raw in resp_data["items"]:
                loop_time = time()

                doi = normalize_doi(api_raw["DOI"])
                my_pub = build_new_pub(doi, api_raw)

                # hack: backdate `updated` so this pub is picked up for refresh soon
                my_pub.updated = datetime.datetime(1042, 1, 1)

                pubs_this_chunk.append(my_pub)

                if len(pubs_this_chunk) >= 100:
                    added_pubs = insert_pub_fn(pubs_this_chunk)
                    logger.info(
                        "added {} pubs, loop done in {} seconds".format(
                            len(added_pubs), elapsed(loop_time, 2)))
                    num_pubs_added_so_far += len(added_pubs)

                    pubs_this_chunk = []

        logger.info("at bottom of loop")

    # make sure to get the last ones
    logger.info("saving last ones")
    added_pubs = insert_pub_fn(pubs_this_chunk)
    num_pubs_added_so_far += len(added_pubs)
    logger.info("Added >>{}<< new crossref dois on {}, took {} seconds".format(
        num_pubs_added_so_far,
        datetime.datetime.now().isoformat()[0:10], elapsed(start_time, 2)))
Example #8
    def worker_run(self, **kwargs):
        single_obj_id = kwargs.get("id", None)
        chunk = kwargs.get("chunk", 100)
        limit = kwargs.get("limit", 10)
        run_class = Pub
        run_method = kwargs.get("method")

        if single_obj_id:
            limit = 1
            queue_table = None
        elif run_method == "refresh":
            queue_table = "pub_refresh_queue"
            if not limit:
                limit = 1000
            text_query_pattern = """
                with refresh_queue as (
                    select id
                    from {queue_table}
                    where started is null
                    order by
                        priority desc,
                        finished nulls first,
                        started,
                        rand
                    limit {chunk}
                    for update skip locked
                )
                update {queue_table} queue_rows_to_update
                set started = now()
                from refresh_queue
                where refresh_queue.id = queue_rows_to_update.id
                returning refresh_queue.id;"""
            text_query = text_query_pattern.format(chunk=chunk,
                                                   queue_table=queue_table)
            logger.info("the queue query is:\n{}".format(text_query))
        else:
            queue_table = "pub_queue"
            if not limit:
                limit = 1000
            text_query_pattern = """WITH update_pub_queue AS (
                       SELECT id
                       FROM   {queue_table}
                       WHERE  started is null
                       order by finished asc nulls first
                   LIMIT  {chunk}
                   FOR UPDATE SKIP LOCKED
                   )
                UPDATE {queue_table} queue_rows_to_update
                SET    started=now()
                FROM   update_pub_queue
                WHERE update_pub_queue.id = queue_rows_to_update.id
                RETURNING update_pub_queue.id;"""
            text_query = text_query_pattern.format(limit=limit,
                                                   chunk=chunk,
                                                   queue_table=queue_table)
            logger.info("the queue query is:\n{}".format(text_query))
        index = 0
        start_time = time()
        while True:
            new_loop_start_time = time()
            if single_obj_id:
                single_obj_id = normalize_doi(single_obj_id)
                objects = [
                    run_class.query.filter(
                        run_class.id == single_obj_id).first()
                ]
            else:
                logger.info("looking for new jobs")

                job_time = time()
                row_list = db.engine.execute(
                    text(text_query).execution_options(
                        autocommit=True)).fetchall()
                object_ids = [row[0] for row in row_list]
                logger.info("got ids, took {} seconds".format(
                    elapsed(job_time)))

                job_time = time()
                q = db.session.query(Pub).options(orm.undefer('*')).filter(
                    Pub.id.in_(object_ids))
                objects = q.all()
                logger.info("got pub objects in {} seconds".format(
                    elapsed(job_time)))

                # shuffle them, otherwise they come back sorted in DOI order
                random.shuffle(objects)

                # objects = Pub.query.from_statement(text(text_query)).execution_options(autocommit=True).all()

                # objects = run_class.query.from_statement(text(text_query)).execution_options(autocommit=True).all()
                # id_rows =  db.engine.execute(text(text_query)).fetchall()
                # ids = [row[0] for row in id_rows]
                #
                # job_time = time()
                # objects = run_class.query.filter(run_class.id.in_(ids)).all()

                # logger.info(u"finished get-new-objects query in {} seconds".format(elapsed(job_time)))

            if not objects:
                # logger.info(u"sleeping for 5 seconds, then going again")
                sleep(5)
                continue

            object_ids = [obj.id for obj in objects]
            self.update_fn(run_class, run_method, objects, index=index)

            # logger.info(u"finished update_fn")
            if queue_table:
                object_ids_str = ",".join([
                    "'{}'".format(id.replace("'", "''")) for id in object_ids
                ])
                object_ids_str = object_ids_str.replace("%", "%%")  # sql escaping
                sql_command = "update {queue_table} set finished=now(), started=null where id in ({ids})".format(
                    queue_table=queue_table, ids=object_ids_str)
                # logger.info(u"sql command to update finished is: {}".format(sql_command))
                run_sql(db, sql_command)
                # logger.info(u"finished run_sql")

            # finished is set in update_fn
            index += 1
            if single_obj_id:
                return
            else:
                self.print_update(new_loop_start_time, chunk, limit,
                                  start_time, index)
Example #9
def get_chorus_data(starting_offset=0, agency_id=None):
    requests_session = requests.Session()
    retries = Retry(total=10,
                    backoff_factor=0.5,
                    status_forcelist=[500, 502, 503, 504])
    requests_session.mount('http://', DelayedAdapter(max_retries=retries))
    requests_session.mount('https://', DelayedAdapter(max_retries=retries))

    agencies = get_chorus_agencies()
    for agency in agencies:
        if agency_id:
            if int(agency["Agency_Id"]) != int(agency_id):
                print "skipping {}, you are not the agency id we are looking for".format(agency["Agency_Id"])
                continue
        if starting_offset:
            offset = starting_offset
        else:
            offset = 0

        logger.info(u"*** on agency {}:{}".format(agency["Agency_Name"], agency["Agency_Id"]))
        url_template = "https://api.chorusaccess.org/v1.1/agencies/{agency_id}/histories/current?category=publicly_accessible&limit={limit}&offset={offset}"
        limit = 50
        total_results = None
        while total_results is None or offset < total_results:
            loop_start = time()
            url = url_template.format(agency_id=agency["Agency_Id"], offset=offset, limit=limit)
            print url
            try:
                r = requests_session.get(url, timeout=360)  # wait for 3 minutes
            except Exception, e:
                logger.exception(u"Exception: {}, skipping".format(unicode(e.message).encode("utf-8")))
                r = None

            print u"api call elapsed: {} seconds".format(elapsed(loop_start, 1))
            offset += limit

            if r:
                data = r.json()
                total_results = data["total_results"]
                logger.info(u"Has {} total results, {} remaining".format(
                    total_results, total_results - offset))


                items = data["items"]
                new_objects = []
                for item in items:
                    if item["DOI"]:
                        doi = normalize_doi(item["DOI"])
                        new_objects.append(Chorus(id=doi, raw=item))

                ids_already_in_db = [id_tuple[0] for id_tuple in db.session.query(Chorus.id).filter(Chorus.id.in_([obj.id for obj in new_objects])).all()]
                objects_to_add_to_db = [obj for obj in new_objects if obj.id not in ids_already_in_db]
                if objects_to_add_to_db:
                    logger.info(u"adding {} items".format(len(objects_to_add_to_db)))
                    db.session.add_all(objects_to_add_to_db)
                    safe_commit(db)
                else:
                    logger.info(u"all of these items already in db")

            logger.info(u"sleeping for 2 seconds")
            sleep(2)
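DelayedAdapter is project-specific; the same retry policy can be reproduced with the stock HTTPAdapter that ships with requests. A self-contained sketch of just the session setup:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# retry up to 10 times on transient server errors, with exponential backoff
retries = Retry(total=10, backoff_factor=0.5,
                status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))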
Example #10
    def __init__(self, **kwargs):
        self.updated = datetime.datetime.utcnow()
        if "doi" in kwargs:
            kwargs["doi"] = normalize_doi(kwargs["doi"])
        super(Chorus, self).__init__(**kwargs)
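A usage sketch of this normalize-on-construction pattern; the column values are placeholders, and normalize_doi is assumed to lowercase its input:

record = Chorus(doi="10.1234/Example.DOI", raw={"DOI": "10.1234/Example.DOI"})
# record.doi now holds the normalized form (e.g. "10.1234/example.doi"),
# and record.updated was stamped with utcnow() by __init__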
Example #11
File: views.py  Project: meonBot/oadoi
def simple_query_tool():
    body = request.json
    dirty_dois_list = {d for d in body["dois"] if d}

    # look up normalized dois
    normalized_dois = [
        c for c in
        [normalize_doi(d, return_none_if_error=True) for d in dirty_dois_list]
        if c
    ]

    q = db.session.query(pub.Pub.response_jsonb).filter(
        pub.Pub.id.in_(normalized_dois))
    rows = q.all()

    normalized_doi_responses = [row[0] for row in rows if row[0]]
    found_normalized_dois = [r['doi'] for r in normalized_doi_responses]
    missing_dois = [
        d for d in dirty_dois_list if normalize_doi(
            d, return_none_if_error=True) not in found_normalized_dois
    ]

    # look up cleaned dois where normalization wasn't enough
    clean_dois = [
        c for c in
        [clean_doi(d, return_none_if_error=True) for d in missing_dois]
        if c and c not in found_normalized_dois
    ]

    q = db.session.query(pub.Pub.response_jsonb).filter(
        pub.Pub.id.in_(clean_dois))
    rows = q.all()

    clean_doi_responses = [row[0] for row in rows if row[0]]
    found_clean_dois = [r['doi'] for r in clean_doi_responses]
    missing_dois = [
        d for d in missing_dois if clean_doi(d, return_none_if_error=True)
        not in found_normalized_dois + found_clean_dois
    ]

    placeholder_responses = [
        pub.build_new_pub(d, None).to_dict_v2() for d in missing_dois
    ]

    responses = normalized_doi_responses + clean_doi_responses + placeholder_responses

    formats = body.get("formats", []) or ["jsonl", "csv"]
    files = []

    if "jsonl" in formats:
        # save jsonl
        with open("output.jsonl", 'wb') as f:
            for response_jsonb in responses:
                f.write(json.dumps(response_jsonb, sort_keys=True))
                f.write("\n")
        files.append("output.jsonl")

    csv_dicts = [
        pub.csv_dict_from_response_dict(my_dict) for my_dict in responses
    ]
    csv_dicts = [my_dict for my_dict in csv_dicts if my_dict]
    fieldnames = sorted(csv_dicts[0].keys())
    fieldnames = ["doi"] + [name for name in fieldnames if name != "doi"]

    if "csv" in formats:
        # save csv
        with open("output.csv", 'wb') as f:
            writer = unicodecsv.DictWriter(f,
                                           fieldnames=fieldnames,
                                           dialect='excel')
            writer.writeheader()
            for my_dict in csv_dicts:
                writer.writerow(my_dict)
        files.append("output.csv")

    if "xlsx" in formats:
        book = Workbook()
        sheet = book.worksheets[0]
        sheet.title = "results"

        for col_idx, field_name in enumerate(fieldnames):
            sheet.cell(column=col_idx + 1, row=1, value=field_name)

        for row_idx, row in enumerate(csv_dicts):
            for col_idx, field_name in enumerate(fieldnames):
                sheet.cell(column=col_idx + 1,
                           row=row_idx + 2,
                           value=row[field_name])

        book.save(filename="output.xlsx")
        files.append("output.xlsx")

    # prep email
    email_address = body["email"]
    email = create_email(email_address, "Your Unpaywall results",
                         "simple_query_tool", {"profile": {}}, files)
    send(email, for_real=True)

    return jsonify({
        "got it": email_address,
        "dois": found_normalized_dois + found_clean_dois + missing_dois
    })
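The handler reads three keys from the posted JSON. An illustrative request body (all values are placeholders):

body = {
    "dois": ["10.1038/nature21360", "10.1234/Example.DOI"],  # deduped and normalized server-side
    "email": "[email protected]",                              # the results files are emailed here
    "formats": ["csv", "xlsx"],                              # optional; defaults to ["jsonl", "csv"]
}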
Example #12
def get_overrides_dict():
    override_dict = defaultdict(dict)

    # cindy wu example
    override_dict["10.1038/nature21360"] = {
        "pdf_url": "https://arxiv.org/pdf/1703.01424.pdf",
        "version": "submittedVersion",
        "host_type_set": "repository",
        "evidence": "oa repository (manual)"
    }

    # example from twitter
    override_dict["10.1021/acs.jproteome.5b00852"] = {
        "pdf_url":
        "http://pubs.acs.org/doi/pdfplus/10.1021/acs.jproteome.5b00852",
        "host_type_set": "publisher",
        "version": "publishedVersion"
    }

    # have the unpaywall example go straight to the PDF, not the metadata page
    override_dict["10.1098/rspa.1998.0160"] = {
        "pdf_url": "https://arxiv.org/pdf/quant-ph/9706064.pdf",
        "version": "submittedVersion"
    }

    # missed, not in BASE, from Maha Bali in email
    override_dict["10.1080/13562517.2014.867620"] = {
        "pdf_url":
        "http://dar.aucegypt.edu/bitstream/handle/10526/4363/Final%20Maha%20Bali%20TiHE-PoD-Empowering_Sept30-13.pdf",
        "version": "submittedVersion"
    }

    # otherwise links to a figshare match that only has data, not the article
    override_dict["10.1126/science.aaf3777"] = {}

    # otherwise links to a metadata page that doesn't have the PDF, because you have to request a copy: https://openresearch-repository.anu.edu.au/handle/1885/103608
    override_dict["10.1126/science.aad2622"] = {
        "pdf_url":
        "https://lra.le.ac.uk/bitstream/2381/38048/6/Waters%20et%20al%20draft_post%20review_v2_clean%20copy.pdf",
        "version": "submittedVersion"
    }

    # otherwise led to http://www.researchonline.mq.edu.au/vital/access/services/Download/mq:39727/DS01 and an authorization error
    # note: this empty entry overrides the pdf_url set for the same DOI just above
    override_dict["10.1126/science.aad2622"] = {}

    # else goes here: http://www.it-c.dk/people/schmidt/papers/complexity.pdf
    override_dict["10.1007/978-1-84800-068-1_9"] = {}

    # otherwise led to https://dea.lib.unideb.hu/dea/bitstream/handle/2437/200488/file_up_KMBT36220140226131332.pdf;jsessionid=FDA9F1A60ACA567330A8B945208E3CA4?sequence=1
    override_dict["10.1007/978-3-211-77280-5"] = {}

    # otherwise led to publisher page but isn't open
    override_dict["10.1016/j.renene.2015.04.017"] = {}

    # override old-style webpage
    override_dict["10.1210/jc.2016-2141"] = {
        "pdf_url":
        "https://academic.oup.com/jcem/article-lookup/doi/10.1210/jc.2016-2141",
        "host_type_set": "publisher",
        "version": "publishedVersion",
    }

    # not indexing this location yet, from @rickypo
    override_dict["10.1207/s15327957pspr0203_4"] = {
        "pdf_url":
        "http://www2.psych.ubc.ca/~schaller/528Readings/Kerr1998.pdf",
        "version": "submittedVersion"
    }

    # mentioned in world bank as good unpaywall example
    override_dict["10.3386/w23298"] = {
        "pdf_url": "https://economics.mit.edu/files/12774",
        "version": "submittedVersion"
    }

    # from email, has bad citeseerx cached version
    override_dict["10.1007/bf02693740"] = {
        "pdf_url":
        "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.536.6939&rep=rep1&type=pdf",
        "version": "publishedVersion"
    }

    # from email, has bad citeseerx cached version
    override_dict["10.1126/science.1150952"] = {
        "pdf_url":
        "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.168.3796&rep=rep1&type=pdf",
        "version": "submittedVersion",
        "host_type_set": "repository"
    }

    # from email, has bad citeseerx cached version
    override_dict["10.1515/eqc.2007.295"] = {
        "pdf_url":
        "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.543.7752&rep=rep1&type=pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1038/nature21377"] = {
        "pdf_url":
        "http://eprints.whiterose.ac.uk/112179/1/ppnature21377_Dodd_for%20Symplectic.pdf",
        "version": "submittedVersion"
    }

    # from email
    override_dict["10.1016/j.gtc.2016.09.007"] = {
        "pdf_url":
        "https://cora.ucc.ie/bitstream/handle/10468/3544/Quigley_Chapter.pdf?sequence=1&isAllowed=y",
        "version": "acceptedVersion"
    }

    # stephen hawking's thesis
    override_dict["10.17863/cam.11283"] = {
        "pdf_url":
        "https://www.repository.cam.ac.uk/bitstream/handle/1810/251038/PR-PHD-05437_CUDL2017-reduced.pdf?sequence=15&isAllowed=y",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1152/advan.00040.2005"] = {
        "pdf_url":
        "https://www.physiology.org/doi/pdf/10.1152/advan.00040.2005",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1016/j.chemosphere.2014.07.047"] = {
        "pdf_url":
        "https://manuscript.elsevier.com/S0045653514009102/pdf/S0045653514009102.pdf",
        "version": "submittedVersion"
    }

    # from email
    override_dict["10.4324/9780203900956"] = {}

    # from email
    override_dict["10.3810/psm.2010.04.1767"] = {
        "pdf_url":
        "http://cupola.gettysburg.edu/cgi/viewcontent.cgi?article=1014&context=healthfac",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1016/S0140-6736(17)33308-1"] = {
        "pdf_url":
        "https://www.rug.nl/research/portal/files/64097453/Author_s_version_Gonadotrophins_versus_clomiphene_citrate_with_or_without_intrauterine_insemination_in_women.pdf",
        "version": "acceptedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1093/joclec/nhy009"] = {
        "pdf_url":
        "https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3126848",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1038/s41477-017-0019-3"] = {
        "pdf_url":
        "https://www.repository.cam.ac.uk/bitstream/handle/1810/270235/3383_1_merged_1502805167.pdf?sequence=1&isAllowed=y",
        "version": "acceptedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1029/wr015i006p01633"] = {
        "pdf_url":
        "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.475.497&rep=rep1&type=pdf",
        "version": "publishedVersion"
    }

    # from email, zenodo
    override_dict["10.1080/01650521.2018.1460931"] = {
        "metadata_url": "https://zenodo.org/record/1236622",
        "host_type_set": "repository",
        "version": "acceptedVersion"
    }

    # from email
    override_dict["10.3928/01477447-20150804-53"] = {}

    # from twitter
    override_dict["10.1103/physreva.97.013421"] = {
        "pdf_url": "https://arxiv.org/pdf/1711.10074.pdf",
        "version": "submittedVersion"
    }

    # from email
    override_dict["10.1016/j.amjmed.2005.09.031"] = {
        "pdf_url": "https://www.amjmed.com/article/S0002-9343(05)00885-5/pdf",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1080/15348458.2017.1327816"] = {}

    # from chorus
    override_dict["10.1103/physrevd.94.052011"] = {
        "pdf_url": "https://link.aps.org/accepted/10.1103/PhysRevD.94.052011",
        "version": "acceptedVersion",
    }
    override_dict["10.1063/1.4962501"] = {
        "pdf_url": "https://aip.scitation.org/doi/am-pdf/10.1063/1.4962501",
        "version": "acceptedVersion",
        "host_type_set": "repository"
    }

    # from email, broken citeseer link
    override_dict["10.2202/1949-6605.1908"] = {
        "pdf_url":
        "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.535.9289&rep=rep1&type=pdf",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1561/1500000012"] = {
        "pdf_url":
        "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.174.8814&rep=rep1&type=pdf",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1137/s0036142902418680"] = {
        "pdf_url":
        "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.144.7627&rep=rep1&type=pdf",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1088/1741-2552/aab4e4"] = {
        "pdf_url":
        "http://iopscience.iop.org/article/10.1088/1741-2552/aab4e4/pdf",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1145/1031607.1031615"] = {
        "pdf_url":
        "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.540.8125&rep=rep1&type=pdf",
        "version": "publishedVersion"
    }

    # from email
    override_dict["10.1007/s11227-016-1779-7"] = {
        "pdf_url": "https://hcl.ucd.ie/system/files/TJS-Hasanov-2016.pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1016/s0020-0190(03)00351-x"] = {
        "pdf_url": "https://kam.mff.cuni.cz/~kolman/papers/noteb.ps",
        "version": "submittedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1002/14651858.cd001704.pub4"] = {
        "pdf_url": "https://core.ac.uk/download/pdf/9440822.pdf",
        "version": "submittedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1016/j.tetlet.2015.04.131"] = {
        "pdf_url":
        "https://www.sciencedirect.com/sdfe/pdf/download/read/aam/noindex/pii/S0040403915007881",
        "version": "acceptedVersion",
        "host_type_set": "publisher"
    }

    # from email
    override_dict["10.1016/j.nima.2016.04.104"] = {
        "pdf_url":
        "http://cds.cern.ch/record/2239750/files/1-s2.0-S0168900216303400-main.pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1016/s1470-2045(15)00444-1"] = {
        "pdf_url":
        "https://www.statsarecool.com/data/uploads/journal-articles/who_declares_reds_meat_carcinogeniclancet_oct_2015.pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1056/NEJM199406233302502"] = {
        "pdf_url": "https://www.nejm.org/doi/full/10.1056/NEJM199406233302502",
        "version": "publishedVersion",
        "host_type_set": "publisher"
    }

    # from email
    override_dict["10.1056/NEJMra1201534"] = {
        "pdf_url": "https://www.nejm.org/doi/pdf/10.1056/NEJMra1201534",
        "version": "publishedVersion",
        "host_type_set": "publisher"
    }

    # from email
    override_dict["10.1016/j.cmet.2018.03.012"] = {
        "pdf_url":
        "https://www.biorxiv.org/content/biorxiv/early/2018/01/15/245332.full.pdf",
        "version": "submittedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1093/sf/65.1.1"] = {
        "pdf_url": "https://faculty.washington.edu/charles/new%20PUBS/A52.pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1088/1751-8121/aabd9c"] = {}

    # from email
    override_dict["10.1017/CBO9781139173728.002"] = {}

    # from email
    override_dict["10.2174/97816810846711170101"] = {}

    # from email
    override_dict["10.1177/1354066196002003001"] = {}

    # from email
    override_dict["10.1093/bioinformatics/bty721"] = {}

    # from email
    override_dict["10.1088/1361-6528/aac7a4"] = {}

    # from email
    override_dict["10.1088/1361-6528/aac645"] = {}

    # from email
    override_dict["10.1111/1748-8583.12159"] = {}

    # from email
    override_dict["10.1042/BJ20080963"] = {}

    # from email
    override_dict["10.1136/bmj.j5007"] = {}

    # from email
    override_dict["10.1016/j.phrs.2017.12.007"] = {}

    # from email
    override_dict["10.4324/9781315770185"] = {}

    # from email
    override_dict["10.1108/PIJPSM-02-2016-0019"] = {}

    # from email
    override_dict["10.1016/j.ejca.2017.07.015"] = {}

    # from email
    override_dict["10.1080/14655187.2017.1469322"] = {}

    # from email
    override_dict["10.1080/02684527.2017.1407549"] = {}

    # from email
    override_dict["10.1093/jat/bky025"] = {}

    # from email
    override_dict["10.1016/j.midw.2009.07.004"] = {}

    # from email
    override_dict["10.1177/247553031521a00105"] = {}

    # from email
    override_dict["10.1002/0471445428"] = {}

    # from email
    override_dict["10.1007/978-3-642-31232-8"] = {}

    # ticket 267
    override_dict["10.1016/j.anucene.2014.08.021"] = {}

    # ticket 199
    # pdf has embedded password protection
    override_dict["10.22381/rcp1720184"] = {}

    # ticket 574
    # pdf has embedded password protection
    override_dict["10.22381/EMFM14220195"] = {}

    # ticket 256
    # journal in doaj but article not available
    override_dict["10.1016/j.mattod.2018.03.001"] = {}

    # ticket 277
    # pmh record with spurious title: oai:works.swarthmore.edu:fac-psychology-1039
    override_dict["10.1016/j.actpsy.2010.01.009"] = {}

    # ticket 280
    # green scrape gets overexcited about a .doc link
    override_dict["10.1108/09596111211217932"] = {}

    # ticket 279
    # match to wrong pdf, currently suppressed incorrectly by bad pdf check
    override_dict["10.1238/physica.topical.102a00059"] = {}

    # ticket 275
    override_dict["10.1039/c7nj03253f"] = {}

    # email
    override_dict['10.1007/978-3-642-30350-0'] = {}

    # ticket 135
    # bad title / last author match
    override_dict["10.1016/s0140-6736(17)31287-4"] = {}

    # ticket 98
    # two similar articles with this title
    override_dict["10.1002/14651858.CD012414.pub2"] = {}

    # ticket 322
    # pmh match on a cover sheet
    override_dict["10.1116/1.5046531"] = {}

    # ticket 631
    # withdrawn article
    override_dict["10.5812/jjm.3664"] = {}

    # ticket 832
    override_dict["10.5935/scd1984-8773.20168409"] = {}

    # ticket 1047
    # book chapter has a bronze tag
    override_dict["10.1002/9781119473992"] = {}

    # from email
    override_dict["10.1016/S0022-1996(00)00093-3"] = {
        "pdf_url":
        "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.475.3874&rep=rep1&type=pdf",
        "version": "submittedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1177/088840649401700203"] = {
        "pdf_url":
        "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.1014.8577&rep=rep1&type=pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.7326/L18-0139"] = {
        "pdf_url":
        "http://annals.org/data/journals/aim/936928/aime201804170-l180139.pdf",
        "version": "publishedVersion",
        "host_type_set": "publisher"
    }

    # from email
    override_dict["10.1007/978-3-319-48881-3_55"] = {
        "pdf_url":
        "http://liu.diva-portal.org/smash/get/diva2:1063949/FULLTEXT01.pdf",
        "version": "acceptedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1109/ICCVW.2015.86"] = {
        "pdf_url":
        "http://liu.diva-portal.org/smash/get/diva2:917646/FULLTEXT01",
        "version": "acceptedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1126/science.aap9559"] = {
        "pdf_url":
        "http://vermontcomplexsystems.org/share/papershredder/vosoughi2018a.pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # from email
    override_dict["10.1109/tpds.2012.97"] = {
        "pdf_url": "https://www.cnsr.ictas.vt.edu/publication/06171175.pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # ticket 261
    # crossref metadata points to wrong article
    override_dict["10.4149/BLL_2013_058"] = {
        "pdf_url":
        "http://www.elis.sk/download_file.php?product_id=3759&session_id=lnkeo437s8hv5t0r28g6ku93b0",
        "version": "publishedVersion",
        "host_type_set": "publisher"
    }

    # ticket 317
    # broken link on citeseer
    override_dict["10.1016/b978-1-55860-307-3.50012-5"] = {
        "pdf_url":
        "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.57.3196&rep=rep1&type=pdf",
        "version": "submittedVersion",
        "host_type_set": "repository"
    }

    # ticket 195
    # wrong registered landing page
    override_dict["10.21285/2227-2925-2018-8-2-9-18"] = {
        "metadata_url":
        "http://journals.istu.edu/izvestia_biochemi/journals/2018/02/articles/01",
        "version": "publishedVersion",
        "host_type_set": "publisher",
        "evidence": oa_evidence.oa_journal_doaj
    }

    # ticket 213
    # journal issue is open
    override_dict["10.14195/2182-7982_32"] = {
        "metadata_url": "https://doi.org/10.14195/2182-7982_32",
        "version": "publishedVersion",
        "host_type_set": "publisher"
    }

    override_dict["10.1016/S2213-8587(16)30320-5"] = {
        "pdf_url": "http://www.spdm.org.pt/media/1373/pku-guidelines_2017.pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # ticket 433
    override_dict["10.1144/GSL.JGS.1846.002.01-02.54"] = {
        "metadata_url":
        "https://www.biodiversitylibrary.org/item/109652#page/473/mode/1up",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # ticket 223
    # pmh record has wrong page url
    override_dict["10.1002/abc.207"] = {
        "pdf_url":
        "https://repository.library.northeastern.edu/files/neu:344561/fulltext.pdf",
        "metadata_url":
        "https://repository.library.northeastern.edu/files/neu:344561",
        "version": "submittedVersion",
        "host_type_set": "repository"
    }

    # ticket 304
    # inline citation pdf links
    override_dict["10.7766/alluvium.v3.1.05"] = {
        "metadata_url": "https://doi.org/10.7766/alluvium.v3.1.05",
        "version": "publishedVersion",
        "host_type_set": "publisher"
    }

    # ticket 376
    override_dict["10.1080/01639374.2017.1358232"] = {
        "pdf_url":
        "https://groups.niso.org/apps/group_public/download.php/17446/Understanding%20Metadata.pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # ticket 539
    # malformed url in pmh record
    override_dict["10.1642/0004-8038(2007)124[1121:EOWNVT]2.0.CO;2"] = {
        "pdf_url":
        "https://repository.si.edu/bitstream/handle/10088/35181/NZP_Marra_2007-ECOLOGY_OF_WEST_NILE_VIRUS_TRANSMISSION_AND_ITS_IMPACT_ON_BIRDS_IN_THE_WESTERN_HEMISPHERE.pdf",
        "version": "publishedVersion",
        "host_type_set": "repository"
    }

    # https://github.com/Impactstory/unpaywall/issues/41
    # link to preprint with different DOI
    override_dict["10.1038/s41592-018-0235-4"] = {
        "metadata_url": "https://www.biorxiv.org/content/10.1101/306951v3",
        "pdf_url":
        "https://www.biorxiv.org/content/biorxiv/early/2018/07/24/306951.full.pdf",
        "version": "submittedVersion",
        "host_type_set": "repository"
    }

    # issue 530
    # unrelated pmh record has wrong DOI
    override_dict["10.1056/nejmoa063842"] = {
        "metadata_url": "https://www.nejm.org/doi/10.1056/NEJMoa063842",
        "version": "publishedVersion",
        "host_type_set": "publisher"
    }

    # issue 571
    # scrape finds supplementary file
    override_dict["10.21203/rs.2.11958/v1"] = {
        "metadata_url": "https://doi.org/10.21203/rs.2.11958/v1",
        "version": "submittedVersion",
        "host_type_set": "repository",
        "license": "cc-by"
    }

    # twitter
    override_dict['10.1002/jclp.22680'] = {
        'pdf_url':
        'https://dl.uswr.ac.ir/bitstream/Hannan/62873/1/2018%20JCpsychology%20Volume%2074%20Issue%2011%20November%20%2811%29.pdf',
        'version': 'publishedVersion',
        'host_type_set': 'repository',
    }

    # ticket 680
    override_dict['10.17059/2015-4-27'] = {
        'metadata_url': 'http://economyofregion.com/archive/2015/57/2731/',
        'pdf_url': 'http://economyofregion.com/archive/2015/57/2731/pdf/',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
    }

    # ticket 681
    override_dict['10.17059/2016-1-19'] = {
        'metadata_url': 'http://economyofregion.com/archive/2016/58/2778/',
        'pdf_url': 'http://economyofregion.com/archive/2016/58/2778/pdf/',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
    }

    # ticket 743
    override_dict['10.1016/S0140-6736(07)61162-3'] = {
        'metadata_url':
        'https://www.semanticscholar.org/paper/Cannabis-use-and-risk-of-psychotic-or-aff-ective-a-Moore-Zammit/6e5bc8bf7814c62db319632ca939ad68a6770d1b',
        'pdf_url':
        'https://pdfs.semanticscholar.org/641e/6aba769421d4308d1ad107684eeca7f687d1.pdf',
        'version': 'publishedVersion',
        'host_type_set': 'repository',
    }

    # ticket 835
    override_dict['10.23912/9781911396512-3454'] = {
        'metadata_url': 'https://doi.org/10.23912/9781911396512-3454',
        'pdf_url':
        'https://www.goodfellowpublishers.com/academic-publishing.php?promoCode=&partnerID=&housekeeping=getfile&productID=3657',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
    }

    # ticket 899, missing from IR
    override_dict['10.1080/0361526x.2019.1551004'] = {
        'metadata_url': 'https://inspire.redlands.edu/oh_articles/249/',
        'pdf_url':
        'https://inspire.redlands.edu/cgi/viewcontent.cgi?article=1190&context=oh_articles',
        'version': 'publishedVersion',
        'host_type_set': 'repository',
        'license': 'cc-by-nc',
    }

    # ticket 1029, can't detect PDF
    override_dict['10.1891/2156-5287.8.4.252'] = {
        'metadata_url': 'https://doi.org/10.1891/2156-5287.8.4.252',
        'pdf_url':
        'https://connect.springerpub.com/content/sgrijc/8/4/252.full.pdf',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
    }

    # ticket 1057, full issue pdf found first but has errors
    override_dict['10.5152/turkjnephrol.2020.3579'] = {
        'metadata_url': 'https://doi.org/10.5152/turkjnephrol.2020.3579',
        'pdf_url':
        'https://turkjnephrol.org/Content/files/sayilar/420/84-88(2).pdf',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
    }

    # ticket 1064, doi.org/10.1016/j.jcmg.2012.07.005 redirects to 10.1016/j.jcmg.2012.08.001
    override_dict['10.1016/j.jcmg.2012.07.005'] = {
        'metadata_url':
        'https://www.sciencedirect.com/science/article/pii/S1936878X12005748',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
    }

    # ticket 1084 faculty page
    override_dict['10.1016/j.jebo.2012.09.021'] = {
        'pdf_url':
        'https://cpb-us-w2.wpmucdn.com/sites.wustl.edu/dist/c/2014/files/2019/06/tennis.pdf',
        'version': 'submittedVersion',
        'host_type_set': 'repository',
    }

    # ticket 1118, can't read landing page
    override_dict['10.3917/zil.006.0009'] = {
        'metadata_url': 'https://doi.org/10.3917/zil.006.0009',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
    }

    # ticket 1151, doi.org url 404
    override_dict['10.1001/jamafacial.2013.406'] = {
        'metadata_url':
        'https://www.liebertpub.com/doi/10.1001/archfaci.2013.406',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
    }

    # ticket 1152, doi.org url leads to wrong article
    override_dict['10.1016/j.aott.2018.06.004'] = {
        'metadata_url':
        'https://www.aott.org.tr/en/comparison-of-ultrasound-and-extracorporeal-shock-wave-therapy-in-lateral-epicondylosis-133459',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
        'license': 'cc-by-nc-nd',
    }

    # ticket 1162, can't download PDF
    override_dict['10.3406/ahess.1976.293748'] = {
        'metadata_url':
        'https://www.persee.fr/doc/ahess_0395-2649_1976_num_31_4_293748',
        'version': 'publishedVersion',
        'host_type_set': 'repository',
        'license': 'cc-by-nc-sa',
    }

    # ticket 1184, missing from philarchive
    override_dict['10.1007/s10670-020-00241-4'] = {
        'metadata_url': 'https://philarchive.org/rec/LOGIAI',
        'pdf_url': 'https://philarchive.org/archive/LOGIAI',
        'version': 'acceptedVersion',
        'host_type_set': 'repository',
    }

    override_dict['10.1007/s11098-019-01378-x'] = {
        'metadata_url': 'https://philarchive.org/rec/LOGTST',
        'pdf_url': 'https://philarchive.org/archive/LOGTST',
        'version': 'acceptedVersion',
        'host_type_set': 'repository',
    }

    override_dict['10.1002/tht3.395'] = {
        'metadata_url': 'https://philarchive.org/rec/LOGSUR',
        'pdf_url': 'https://philarchive.org/archive/LOGSUR',
        'version': 'publishedVersion',
        'host_type_set': 'repository',
    }

    override_dict['10.3917/lig.764.0006'] = {
        'metadata_url': 'https://doi.org/10.3917/lig.764.0006',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
    }

    # ticket 1260, wrong doi url
    override_dict['10.5603/ait.a2017.0053'] = {
        'metadata_url':
        'https://www.termedia.pl/Pharmacokinetic-drug-drug-interactions-in-the-intensive-care-unit-single-centre-experience-and-literature-review,118,38092,1,1.html',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
    }

    # ticket 22157, wrong doi url
    override_dict['10.5603/ait.a2015.0073'] = {
        'metadata_url':
        'https://www.termedia.pl/Hemodynamic-monitoring-To-calibrate-or-not-to-calibrate-r-nPart-1-Calibrated-techniques,118,38312,0,1.html',
        'pdf_url':
        'https://www.termedia.pl/Journal/-118/pdf-38312-10?filename=pages_487-500_article_43713.pdf',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
    }

    # ticket 22157, wrong doi url
    override_dict['10.5603/ait.a2015.0076'] = {
        'metadata_url':
        'https://www.termedia.pl/Hemodynamic-monitoring-To-calibrate-or-not-to-calibrate-r-nPart-2-Non-calibrated-techniques,118,38313,0,1.html',
        'pdf_url':
        'https://www.termedia.pl/Journal/-118/pdf-38313-10?filename=pages_501-516_article_43754.pdf',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
    }

    # ticket 3417, wrong doi url
    override_dict['10.15414/jmbfs.2016.5.special1.64-68'] = {
        'metadata_url':
        'https://www.jmbfs.org/issue/february-2016-vol-5-special-1/jmbfs-2016_020-ivanuska/?issue_id=4120&article_id=18',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
    }

    # ticket 4625, wrong doi url
    override_dict['10.4103/1011-4564.204985'] = {
        'metadata_url':
        'http://www.jmedscindmc.com/article.asp?issn=1011-4564;year=2017;volume=37;issue=2;spage=37;epage=43;aulast=Huang;type=0',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
    }

    override_dict['10.4103/jmedsci.jmedsci_99_16'] = {
        'metadata_url':
        'http://www.jmedscindmc.com/article.asp?issn=1011-4564;year=2017;volume=37;issue=2;spage=44;epage=49;aulast=Doka;type=0',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
    }

    override_dict['10.4103/jmedsci.jmedsci_95_16'] = {
        'metadata_url':
        'http://www.jmedscindmc.com/article.asp?issn=1011-4564;year=2017;volume=37;issue=2;spage=50;epage=55;aulast=Hsu;type=0',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
    }

    override_dict['10.4103/jmedsci.jmedsci_104_16'] = {
        'metadata_url':
        'http://www.jmedscindmc.com/article.asp?issn=1011-4564;year=2017;volume=37;issue=2;spage=56;epage=60;aulast=Lin;type=0',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
    }

    override_dict['10.4103/jmedsci.jmedsci_100_16'] = {
        'metadata_url':
        'http://www.jmedscindmc.com/article.asp?issn=1011-4564;year=2017;volume=37;issue=2;spage=61;epage=68;aulast=Shen;type=0',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
    }

    override_dict['10.4103/jmedsci.jmedsci_12_16'] = {
        'metadata_url':
        'http://www.jmedscindmc.com/article.asp?issn=1011-4564;year=2017;volume=37;issue=2;spage=69;epage=71;aulast=Chaitra;type=0',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
    }

    override_dict['10.4103/jmedsci.jmedsci_92_15'] = {
        'metadata_url':
        'http://www.jmedscindmc.com/article.asp?issn=1011-4564;year=2017;volume=37;issue=2;spage=72;epage=75;aulast=Huang;type=0',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
    }

    override_dict['10.4103/jmedsci.jmedsci_27_16'] = {
        'metadata_url':
        'http://www.jmedscindmc.com/article.asp?issn=1011-4564;year=2017;volume=37;issue=2;spage=76;epage=79;aulast=Saha;type=0',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
    }

    # end ticket 4625

    # ticket 22137, doi url redirects to http, then https redirect fails
    override_dict['10.18845/te.v1i2.868'] = {
        'metadata_url':
        'https://revistas.tec.ac.cr/index.php/tec_empresarial/article/view/868',
        'pdf_url':
        'https://revistas.tec.ac.cr/index.php/tec_empresarial/article/view/868',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
    }

    # ticket 22208. 404 at doi url
    override_dict['10.26442/terarkh201890417-20'] = {
        "host_type_set": "publisher",
        "version": "publishedVersion",
        "evidence": "oa journal (via doaj)",
        "metadata_url": "https://ter-arkhiv.ru/0040-3660/article/view/32440",
        "license": "cc-by",
    }

    # ticket 22274. gold journal but DOI doesn't resolve
    override_dict['10.25251/skin.3.6.4'] = {
        'metadata_url': 'https://jofskin.org/index.php/skin/article/view/625',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via observed oa rate)',
    }

    # ticket 22287. journal in DOAJ, but article missing from https://sophia.ups.edu.ec/index.php/sophia/issue/view/151
    override_dict['10.17163/soph.n25.2018.03'] = {
        'metadata_url':
        'https://www.redalyc.org/jatsRepo/4418/441855948003/html/index.html',
        'license': 'cc-by-nc-sa',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
    }

    # ticket 22562. journal in DOAJ, doi.org url 404s
    override_dict['10.1162/itid.2003.1.1.75'] = {
        'metadata_url':
        'https://itidjournal.org/index.php/itid/article/view/136.html',
        'pdf_url':
        'https://itidjournal.org/index.php/itid/article/download/136/136-472-1-PB.pdf',
        'license': 'cc-by-nc-sa',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
    }

    # ticket 22936. journal in DOAJ, doi.org url 404s, same article as above
    override_dict['10.1162/154475203771799720'] = {
        'metadata_url':
        'https://itidjournal.org/index.php/itid/article/view/136.html',
        'pdf_url':
        'https://itidjournal.org/index.php/itid/article/download/136/136-472-1-PB.pdf',
        'license': 'cc-by-nc-sa',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
    }

    # ticket 22674. doesn't want to link to PDF for some reason.
    override_dict['10.1101/2020.02.24.962878'] = {
        'metadata_url': 'https://doi.org/10.1101/2020.02.24.962878',
        'version': 'submittedVersion',
        'host_type_set': 'repository',
        'evidence': 'oa repository (via free pdf)',
    }

    # ticket 22562. journal in DOAJ, broken doi.org url
    override_dict['10.5505/tbdhd.2018.50251'] = {
        'metadata_url':
        'http://dergi.bdhd.org.tr/eng/jvi.aspx?un=TBDHD-50251&volume=24&issue=2',
        'pdf_url':
        'https://jag.journalagent.com/tbdhd/pdfs/TBDHD-50251-CASE_REPORT-YEKTAS.pdf',
        'license': 'cc-by-nc-nd',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'oa journal (via doaj)',
    }

    # ticket 22877. journal was detected as all-OA but DOIs don't work now
    override_dict['10.14800/scti.232'] = {
        'metadata_url':
        'https://www.smartscitech.com/index.php/SCTI/article/view/828',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
    }

    # ticket 22945. first citeseerx link broken
    override_dict['10.1111/1467-8306.9302004'] = {
        'metadata_url':
        'https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.109.1825',
        'pdf_url':
        'https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.109.1825&rep=rep1&type=pdf',
        'version': 'submittedVersion',
        'host_type_set': 'repository',
        'evidence': 'oa repository (via free pdf)',
    }

    # ticket 22967. first citeseerx link is slide deck
    override_dict['10.1109/msp.2010.936019'] = {
        'metadata_url':
        'http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.470.8283',
        'pdf_url':
        'http://www1.se.cuhk.edu.hk/~manchoso/papers/sdrapp-SPM.pdf',
        'version': 'acceptedVersion',
        'host_type_set': 'repository',
        'evidence': 'oa repository (via free pdf)',
    }

    # ticket 23017. can't scrape cairn
    override_dict['10.3917/sr.035.0007'] = {
        'metadata_url': 'https://doi.org/10.3917/sr.035.0007',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'license': 'cc-by-nc-nd',
    }

    # ticket 23017. can't scrape cairn
    override_dict['10.3917/sr.039.0119'] = {
        'metadata_url': 'https://doi.org/10.3917/sr.039.0119',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'license': 'cc-by-nc-nd',
    }

    # ticket 23020
    override_dict['10.3847/2041-8213/abe4de'] = {
        'metadata_url': 'https://doi.org/10.3847/2041-8213/abe4de',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'license': 'cc-by',
    }

    # ticket 23020
    override_dict['10.3847/2041-8213/abe71d'] = {
        'metadata_url': 'https://doi.org/10.3847/2041-8213/abe71d',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'license': 'cc-by',
    }

    # ticket 23020
    override_dict['10.3847/2041-8213/abed53'] = {
        'metadata_url': 'https://doi.org/10.3847/2041-8213/abed53',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'license': 'cc-by',
    }

    # ticket 23020
    override_dict['10.3847/2041-8213/abee6a'] = {
        'metadata_url': 'https://doi.org/10.3847/2041-8213/abee6a',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'license': 'cc-by',
    }

    # ticket 1025
    # WOS user says full article isn't available.
    # an empty override like this appears to mean: report no OA location for this DOI.
    override_dict['10.1016/j.fuel.2019.116234'] = {}

    # ticket 215
    # doi.org links point to wrong article
    override_dict["10.1515/res-2016-0002"] = {}

    # ticket 584
    # repo match to dissertation with same title and author
    override_dict["10.3726/978-3-0343-2544-8"] = {}
    # book front matter
    override_dict["10.1007/978-3-319-78349-9"] = {}

    # ticket 594
    override_dict["10.1016/j.chemgeo.2016.02.020"] = {}

    # ticket 240 part 2. mislabeled in repository.
    override_dict["10.1111/eip.12323"] = {}

    # ticket 928. CC license in references.
    override_dict['10.1007/s11012-016-0472-5'] = {}

    # ticket 968. CC license for dataset.
    override_dict['10.1007/s12275-020-9536-2'] = {}

    # ticket 966. PDF link only works once.
    override_dict['10.1093/ee/nvz159'] = {}

    # ticket 1371. someone doesn't like green OA
    override_dict['10.1007/s10798-019-09554-0'] = {}

    # ticket 6937. bad license info on page
    override_dict['10.1016/j.breast.2015.07.036'] = {}

    # ticket 22163. doing a favor.
    override_dict['10.1016/j.energy.2015.06.127'] = {}

    # ticket 22794. page and metadata have license
    override_dict['10.1515/pac-2020-0702'] = {}

    # ticket 22791
    override_dict['10.1038/s41574-020-00451-4'] = {}

    # ticket 22636
    override_dict['10.1007/978-981-15-4814-7'] = {
        'metadata_url': 'https://doi.org/10.1007/978-981-15-4814-7',
        'version': 'publishedVersion',
        'host_type_set': 'publisher',
        'evidence': 'open (via free pdf)',
    }

    override_dict['10.1080/1097198x.2020.1752084'] = {}

    # ticket 22892, doi resolution is wrong
    # is https://www.journalofhospitalmedicine.com/jhospmed/article/189543/hospital-medicine/you-cant-have-it-all-experience-academic-hospitalists
    # should be https://www.journalofhospitalmedicine.com/jhospmed/article/189545/hospital-medicine/barriers-earlier-hospital-discharge-what-matters-most
    override_dict['10.12788/jhm.3094'] = {}

    # ticket 535
    # book & chapters listed at https://www.brepolsonline.net/doi/book/10.1484/M.RELMIN-EB.6.09070802050003050502050201
    for doi in ['10.1484/M.RELMIN-EB.6.09070802050003050502050201'] + [
            '10.1484/M.RELMIN-EB.5.1038' + str(n) for n in range(59, 76)
    ]:
        override_dict[doi] = {
            "pdf_url":
            "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=23027",
            "version": "publishedVersion",
            "host_type_set": "repository"
        }

    # book & chapters listed at https://www.brepolsonline.net/doi/book/10.1484/M.RELMIN-EB.5.109256
    for doi in ['10.1484/M.RELMIN-EB.5.109256'] + [
            '10.1484/M.RELMIN-EB.5.1091' + str(n) for n in range(58, 70)
    ]:
        override_dict[doi] = {
            "pdf_url":
            "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=26957",
            "version": "publishedVersion",
            "host_type_set": "repository"
        }

    # book & chapters listed at https://www.brepolsonline.net/doi/book/10.1484/M.RELMIN-EB.5.108025
    for doi in ['10.1484/M.RELMIN-EB.5.108025'] + [
            '10.1484/M.RELMIN-EB.5.1084' + str(n) for n in range(35, 51)
    ]:
        override_dict[doi] = {
            "pdf_url":
            "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=26953",
            "version": "publishedVersion",
            "host_type_set": "repository"
        }

    # book & chapters listed at https://www.brepolsonline.net/doi/book/10.1484/M.RELMIN-EB.6.09070802050003050500050207
    for doi in ['10.1484/M.RELMIN-EB.6.09070802050003050500050207'] + [
            '10.1484/M.RELMIN-EB.1.1018' + str(n) for n in range(74, 92)
    ]:
        override_dict[doi] = {
            "pdf_url":
            "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=23029",
            "version": "publishedVersion",
            "host_type_set": "repository"
        }

    # book & chapters listed at https://www.brepolsonline.net/doi/book/10.1484/M.RELMIN-EB.5.108940
    for doi in ['10.1484/M.RELMIN-EB.5.108940'] + [
            '10.1484/M.RELMIN-EB.5.1093' + str(n) for n in range(46, 60)
    ]:
        override_dict[doi] = {
            "pdf_url":
            "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=26960",
            "version": "publishedVersion",
            "host_type_set": "repository"
        }

    # book & chapters listed at https://www.brepolsonline.net/doi/book/10.1484/M.RELMIN-EB.6.09070802050003050408050408
    for doi in ['10.1484/M.RELMIN-EB.6.09070802050003050408050408'] + [
            '10.1484/M.RELMIN-EB.1.1018' + str(n) for n in range(10, 27)
    ]:
        override_dict[doi] = {
            "pdf_url":
            "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=25736",
            "version": "publishedVersion",
            "host_type_set": "repository"
        }

    # book & chapters listed at https://www.brepolsonline.net/action/showBook?doi=10.1484%2FM.RELMIN-EB.5.106169
    for doi in ['10.1484/M.RELMIN-EB.5.106169'] + [
            '10.1484/M.RELMIN-EB.4.000' + str(n).zfill(2) for n in range(2, 15)
    ]:
        override_dict[doi] = {
            "pdf_url":
            "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=23028",
            "version": "publishedVersion",
            "host_type_set": "repository"
        }

    # book & chapters listed at https://www.brepolsonline.net/doi/book/10.1484/M.RELMIN-EB.5.109274
    for doi in ['10.1484/M.RELMIN-EB.5.109274'] + [
            '10.1484/M.RELMIN-EB.5.111' + str(n) for n in range(590, 615)
    ]:
        override_dict[doi] = {
            "pdf_url":
            "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=26954",
            "version": "publishedVersion",
            "host_type_set": "repository"
        }

    # book & chapters listed at https://www.brepolsonline.net/action/showBook?doi=10.1484/M.RELMIN-EB.5.112302
    for doi in ['10.1484/M.RELMIN-EB.5.112302'] + [
            '10.1484/M.RELMIN-EB.5.1115' + str(n) for n in range(13, 29)
    ]:
        override_dict[doi] = {
            "pdf_url":
            "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid=26961",
            "version": "publishedVersion",
            "host_type_set": "repository"
        }
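
    # note: the eight book loops above repeat the same override body and
    # differ only in the parent book DOI, the chapter-DOI stem and number
    # range, and the DOAB record id. a small helper could capture the pattern;
    # this is only an illustrative sketch (the helper name and signature are
    # not from the original source), kept as a comment next to the loops it
    # describes. the one zero-padded book would pass str(n).zfill(2) strings
    # directly instead of a plain range:
    #
    #   def add_doab_book_overrides(override_dict, book_doi, chapter_stem,
    #                               numbers, doab_rid):
    #       for doi in [book_doi] + [chapter_stem + str(n) for n in numbers]:
    #           override_dict[doi] = {
    #               "pdf_url": "https://www.doabooks.org/doab?func=fulltext&uiLanguage=en&rid={}".format(doab_rid),
    #               "version": "publishedVersion",
    #               "host_type_set": "repository",
    #           }
    #
    #   # e.g. the 10.1484/M.RELMIN-EB.5.109256 loop would become:
    #   add_doab_book_overrides(override_dict, '10.1484/M.RELMIN-EB.5.109256',
    #                           '10.1484/M.RELMIN-EB.5.1091', range(58, 70), 26957)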

    override_dict["10.1016/s1474-4422(19)30285-6"] = {
        "metadata_url": "http://hdl.handle.net/2066/207798",
        "version": "publishedVersion",
        "host_type_set": "repository",
        "evidence": "oa repository (manual)"
    }

    # callers of this depend on the doi keys being lowercase/canonical
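    # e.g. a key entered above as '10.1484/M.RELMIN-EB.5.109256' should come
    # back from normalize_doi() lowercased, as '10.1484/m.relmin-eb.5.109256'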
    response = {}
    for k, v in override_dict.items():
        response[normalize_doi(k)] = v

    return response
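
# a minimal sketch of how these overrides might be consumed, assuming the
# function above is exposed as get_overrides_dict() (a hypothetical name) and
# that normalize_doi() canonicalizes DOIs as described in the comment above:

overrides = get_overrides_dict()

def manual_override_for(dirty_doi):
    # None means no manual override exists; an empty dict appears to mean
    # "deliberately report no OA location for this DOI"
    return overrides.get(normalize_doi(dirty_doi))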
Example #13
0
def run_through_dois(filename=None, reverse=None, loggly=False):
    total_start = time()
    i = 0
    output_dicts = []
    with open(filename, "r") as fh:
        lines = fh.readlines()

    if reverse:
        logger.info(u"reverse!")
        lines.reverse()
        i = -1 * len(lines)

    dois = []
    for line in lines:
        dois.append(line.strip())

        # line = line.replace('"', '')
        # if u"," in line:
        #     split_line = line.split(",")
        #     if loggly:
        #         dois.append(split_line[1])
        #     else:
        #         dois.append(split_line[0])
        # else:
        #     dois.append(line.strip())

    # deduplicate, preserving order (a set makes the membership test O(1))
    seen = set()
    deduped_dois = []
    for doi in dois:
        if doi not in seen:
            seen.add(doi)
            deduped_dois.append(doi)
    dois = deduped_dois

    logger.info(u"length of deduped doi list: {}".format(len(dois)))

    for doi in dois:

        try:
            my_doi = normalize_doi(doi)
        except NoDoiException:
            my_doi = None

        if not my_doi:
            logger.info(u"bad doi: {}".format(doi))
            continue

        # upsert: reuse the existing row for this doi, or create a new one
        my_pub = Oab.query.get(my_doi)
        if not my_pub:
            my_pub = Oab()
            db.session.add(my_pub)
        my_pub.id = my_doi
        my_doi_url = "http://doi.org/{}".format(my_doi)
        my_doi_url_encoded = urllib.quote_plus(my_doi_url)
        api_url = "https://api.openaccessbutton.org/availability?url={}".format(
            my_doi_url_encoded)
        headers = {"content-type": "application/json"}
        r = requests.get(api_url, headers=headers)
        if r.status_code == 200:
            logger.info(u"success with oab! with {}".format(my_doi))
            # logger.info(r.json())
            my_pub.api = r.json()
            # tell SQLAlchemy the JSON column changed in place so it gets written
            flag_modified(my_pub, "api")
        else:
            logger.info(u"problem with oab, status_code {}".format(
                r.status_code))

        dissemin_url = "http://dissem.in/api/{}".format(my_doi)
        r = requests.get(dissemin_url, headers=headers)
        if r.status_code == 200:
            logger.info(u"success! with dissemin! with {}".format(my_doi))
            # logger.info(r.json())
            my_pub.dissemin = r.json()
            flag_modified(my_pub, "dissemin")
        else:
            logger.info(u"problem with dissemin, status_code {}".format(
                r.status_code))

        safe_commit(db)
        i += 1

    logger.info(u"finished {} in {} seconds".format(i, elapsed(total_start,
                                                               2)))

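# for reference, the per-DOI work in the loop above reduces to two GET
# requests. a minimal standalone sketch without the database bookkeeping,
# using the same endpoints as the loop (check_availability is a hypothetical
# name, not from the original source):

import urllib

import requests


def check_availability(doi):
    # query the same two services used above, for a single DOI
    doi_url_encoded = urllib.quote_plus("http://doi.org/{}".format(doi))
    headers = {"content-type": "application/json"}

    oab = requests.get(
        "https://api.openaccessbutton.org/availability?url={}".format(
            doi_url_encoded),
        headers=headers)
    dissemin = requests.get("http://dissem.in/api/{}".format(doi),
                            headers=headers)

    return (oab.json() if oab.status_code == 200 else None,
            dissemin.json() if dissemin.status_code == 200 else None)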