Example #1
def simple_query_tool():
    body = request.json
    dirty_dois_list = {d for d in body["dois"] if d}

    clean_dois = [
        c for c in
        [clean_doi(d, return_none_if_error=True) for d in dirty_dois_list] if c
    ]

    q = db.session.query(pub.Pub.response_jsonb).filter(
        pub.Pub.id.in_(clean_dois))
    rows = q.all()

    pub_responses = [row[0] for row in rows if row[0]]

    pub_dois = [r['doi'] for r in pub_responses]
    missing_dois = [
        d for d in dirty_dois_list
        if clean_doi(d, return_none_if_error=True) not in pub_dois
    ]
    placeholder_responses = [
        pub.build_new_pub(d, None).to_dict_v2() for d in missing_dois
    ]

    responses = pub_responses + placeholder_responses

    # save jsonl
    with open("output.jsonl", 'wb') as f:
        for response_jsonb in responses:
            f.write(json.dumps(response_jsonb, sort_keys=True))
            f.write("\n")

    # save csv
    csv_dicts = [
        pub.csv_dict_from_response_dict(my_dict) for my_dict in responses
    ]
    csv_dicts = [my_dict for my_dict in csv_dicts if my_dict]
    fieldnames = sorted(csv_dicts[0].keys())
    fieldnames = ["doi"] + [name for name in fieldnames if name != "doi"]
    with open("output.csv", 'wb') as f:
        writer = unicodecsv.DictWriter(f,
                                       fieldnames=fieldnames,
                                       dialect='excel')
        writer.writeheader()
        for my_dict in csv_dicts:
            writer.writerow(my_dict)

    # prep email
    email_address = body["email"]
    email = create_email(email_address, "Your Unpaywall results",
                         "simple_query_tool", {"profile": {}},
                         ["output.csv", "output.jsonl"])
    send(email, for_real=True)

    return jsonify({"got it": email_address, "dois": pub_dois + missing_dois})
Example #2
def add_pubs_from_dois(dois):
    new_pubs = []
    for doi in dois:
        crossref_api = get_api_for_one_doi(doi)
        new_pub = build_new_pub(doi, crossref_api)

        # hack so it gets updated soon
        new_pub.updated = datetime.datetime(1042, 1, 1)

        new_pubs.append(new_pub)

    added_pubs = add_new_pubs(new_pubs)
    return added_pubs
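A hedged usage sketch for the helper above; the DOIs are placeholders, and get_api_for_one_doi, build_new_pub, and add_new_pubs are assumed to come from the surrounding module.

added = add_pubs_from_dois(["10.1234/example.one", "10.1234/example.two"])
print("added {} pubs".format(len(added)))  # add_new_pubs returns the pubs it reports as added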
Example #4
def simple_query_tool():
    body = request.json
    dirty_dois_list = {d for d in body["dois"] if d}

    clean_dois = [c for c in [clean_doi(d, return_none_if_error=True) for d in dirty_dois_list] if c]

    q = db.session.query(pub.Pub.response_jsonb).filter(pub.Pub.id.in_(clean_dois))
    rows = q.all()

    pub_responses = [row[0] for row in rows if row[0]]

    pub_dois = [r['doi'] for r in pub_responses]
    missing_dois = [d for d in dirty_dois_list if clean_doi(d, return_none_if_error=True) not in pub_dois]
    placeholder_responses = [pub.build_new_pub(d, None).to_dict_v2() for d in missing_dois]

    responses = pub_responses + placeholder_responses

    # save jsonl
    with open("output.jsonl", 'wb') as f:
        for response_jsonb in responses:
            f.write(json.dumps(response_jsonb, sort_keys=True))
            f.write("\n")


    # save csv
    csv_dicts = [pub.csv_dict_from_response_dict(my_dict) for my_dict in responses]
    csv_dicts = [my_dict for my_dict in csv_dicts if my_dict]
    fieldnames = sorted(csv_dicts[0].keys())
    fieldnames = ["doi"] + [name for name in fieldnames if name != "doi"]
    with open("output.csv", 'wb') as f:
        writer = unicodecsv.DictWriter(f, fieldnames=fieldnames, dialect='excel')
        writer.writeheader()
        for my_dict in csv_dicts:
            writer.writerow(my_dict)

    # prep email
    email_address = body["email"]
    email = create_email(email_address,
                         "Your Unpaywall results",
                         "simple_query_tool",
                         {"profile": {}},
                         ["output.csv", "output.jsonl"])
    send(email, for_real=True)

    return jsonify({"got it": email_address, "dois": pub_dois + missing_dois})
Example #5
def simple_query_tool():
    body = request.json
    dirty_dois_list = {d for d in body["dois"] if d}

    clean_dois = [c for c in [clean_doi(d, return_none_if_error=True) for d in dirty_dois_list] if c]

    q = db.session.query(pub.Pub.response_jsonb).filter(pub.Pub.id.in_(clean_dois))
    rows = q.all()

    pub_responses = [row[0] for row in rows if row[0]]

    pub_dois = [r['doi'] for r in pub_responses]
    missing_dois = [d for d in dirty_dois_list if clean_doi(d, return_none_if_error=True) not in pub_dois]
    placeholder_responses = [pub.build_new_pub(d, None).to_dict_v2() for d in missing_dois]

    responses = pub_responses + placeholder_responses

    formats = body.get("formats", []) or ["jsonl", "csv"]
    files = []

    if "jsonl" in formats:
        # save jsonl
        with open("output.jsonl", 'wb') as f:
            for response_jsonb in responses:
                f.write(json.dumps(response_jsonb, sort_keys=True))
                f.write("\n")
        files.append("output.jsonl")

    csv_dicts = [pub.csv_dict_from_response_dict(my_dict) for my_dict in responses]
    csv_dicts = [my_dict for my_dict in csv_dicts if my_dict]
    fieldnames = sorted(csv_dicts[0].keys())
    fieldnames = ["doi"] + [name for name in fieldnames if name != "doi"]

    if "csv" in formats:
        # save csv
        with open("output.csv", 'wb') as f:
            writer = unicodecsv.DictWriter(f, fieldnames=fieldnames, dialect='excel')
            writer.writeheader()
            for my_dict in csv_dicts:
                writer.writerow(my_dict)
        files.append("output.csv")

    if "xlsx" in formats:
        book = Workbook()
        sheet = book.worksheets[0]
        sheet.title = "results"

        for col_idx, field_name in enumerate(fieldnames):
            sheet.cell(column=col_idx+1, row=1, value=field_name)

        for row_idx, row in enumerate(csv_dicts):
            for col_idx, field_name in enumerate(fieldnames):
                sheet.cell(column=col_idx+1, row=row_idx+2, value=row[field_name])

        book.save(filename="output.xlsx")
        files.append("output.xlsx")

    # prep email
    email_address = body["email"]
    email = create_email(email_address,
                         "Your Unpaywall results",
                         "simple_query_tool",
                         {"profile": {}},
                         files)
    send(email, for_real=True)

    return jsonify({"got it": email_address, "dois": pub_dois + missing_dois})
Example #6
def get_dois_and_data_from_crossref(query_doi=None, first=None, last=None, today=False, week=False, offset_days=0, chunk_size=1000, get_updates=False):
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers = {"Accept": "application/json", "User-Agent": "mailto:[email protected]"}

    root_url_doi = "https://api.crossref.org/works?filter=doi:{doi}"

    if get_updates:
        root_url_with_last = "https://api.crossref.org/works?order=desc&sort=indexed&filter=from-index-date:{first},until-index-date:{last}&rows={chunk}&cursor={next_cursor}"
        root_url_no_last = "https://api.crossref.org/works?order=desc&sort=indexed&filter=from-index-date:{first}&rows={chunk}&cursor={next_cursor}"
    else:
        root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first},until-created-date:{last}&rows={chunk}&cursor={next_cursor}"
        root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first}&rows={chunk}&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    num_pubs_added_so_far = 0
    pubs_this_chunk = []

    if week:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=7))
    elif today:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=2))

    if not first:
        first = datetime.date(2016, 4, 1)

    last = last and last - datetime.timedelta(days=offset_days)
    first = first and first - datetime.timedelta(days=offset_days)

    start_time = time()

    insert_pub_fn = add_pubs_or_update_crossref if get_updates else add_new_pubs

    while has_more_responses:
        if query_doi:
            url = root_url_doi.format(doi=query_doi)
        else:
            if last:
                url = root_url_with_last.format(first=first.isoformat(),
                                                last=last.isoformat(),
                                                next_cursor=next_cursor,
                                                chunk=chunk_size)
            else:
                # the query is much faster when no last date is specified, even one far in the future
                url = root_url_no_last.format(first=first.isoformat(),
                                              next_cursor=next_cursor,
                                              chunk=chunk_size)

        logger.info(u"calling url: {}".format(url))
        crossref_time = time()

        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds".format(elapsed(crossref_time, 2)))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(resp.status_code))
            resp = None

        if resp:
            resp_data = resp.json()["message"]
            next_cursor = resp_data.get("next-cursor", None)
            if next_cursor:
                next_cursor = quote(next_cursor)

            if not resp_data["items"] or not next_cursor:
                has_more_responses = False

            for api_raw in resp_data["items"]:
                loop_time = time()

                doi = clean_doi(api_raw["DOI"])
                my_pub = build_new_pub(doi, api_raw)

                # hack so it gets updated soon
                my_pub.updated = datetime.datetime(1042, 1, 1)

                pubs_this_chunk.append(my_pub)

                if len(pubs_this_chunk) >= 100:
                    added_pubs = insert_pub_fn(pubs_this_chunk)
                    logger.info(u"added {} pubs, loop done in {} seconds".format(len(added_pubs), elapsed(loop_time, 2)))
                    num_pubs_added_so_far += len(added_pubs)

                    pubs_this_chunk = []

        logger.info(u"at bottom of loop")

    # make sure to get the last ones
    logger.info(u"saving last ones")
    added_pubs = insert_pub_fn(pubs_this_chunk)
    num_pubs_added_so_far += len(added_pubs)
    logger.info(u"Added >>{}<< new crossref dois on {}, took {} seconds".format(
        num_pubs_added_so_far, datetime.datetime.now().isoformat()[0:10], elapsed(start_time, 2)))
def get_new_dois_and_data_from_crossref(query_doi=None, first=None, last=None, today=False, week=False, offset_days=0, chunk_size=1000):
    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers={"Accept": "application/json", "User-Agent": "mailto:[email protected]"}

    root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first},until-created-date:{last}&rows={chunk}&cursor={next_cursor}"
    root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first}&rows={chunk}&cursor={next_cursor}"
    root_url_doi = "https://api.crossref.org/works?filter=doi:{doi}"

    # but to get all changes, use "indexed" rather than "created", as per https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md#notes-on-incremental-metadata-updates
    # root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first},until-indexed-date:{last}&rows={chunk}&cursor={next_cursor}"
    # root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first}&rows={chunk}&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    num_pubs_added_so_far = 0
    pubs_this_chunk = []

    if week:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=7))
    elif today:
        last = (datetime.date.today() + datetime.timedelta(days=1))
        first = (datetime.date.today() - datetime.timedelta(days=2))

    if not first:
        first = datetime.date(2016, 4, 1)

    last = last and last - datetime.timedelta(days=offset_days)
    first = first and first - datetime.timedelta(days=offset_days)

    start_time = time()

    while has_more_responses:

        if query_doi:
            url = root_url_doi.format(doi=query_doi)
        else:
            if last:
                url = root_url_with_last.format(first=first.isoformat(),
                                                last=last.isoformat(),
                                                next_cursor=next_cursor,
                                                chunk=chunk_size)
            else:
                # the query is much faster when no last date is specified, even one far in the future
                url = root_url_no_last.format(first=first.isoformat(),
                                              next_cursor=next_cursor,
                                              chunk=chunk_size)

        logger.info(u"calling url: {}".format(url))
        crossref_time = time()

        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds".format(elapsed(crossref_time, 2)))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(resp.status_code))
            resp = None

        if resp:
            resp_data = resp.json()["message"]
            next_cursor = resp_data.get("next-cursor", None)
            if next_cursor:
                next_cursor = quote(next_cursor)

            if not resp_data["items"] or not next_cursor:
                has_more_responses = False

            for api_raw in resp_data["items"]:
                loop_time = time()

                doi = clean_doi(api_raw["DOI"])
                my_pub = build_new_pub(doi, api_raw)

                # hack so it gets updated soon
                my_pub.updated = datetime.datetime(1042, 1, 1)

                pubs_this_chunk.append(my_pub)

                if len(pubs_this_chunk) >= 100:
                    added_pubs = add_new_pubs(pubs_this_chunk)
                    logger.info(u"added {} pubs, loop done in {} seconds".format(len(added_pubs), elapsed(loop_time, 2)))
                    num_pubs_added_so_far += len(added_pubs)

                    # if new_pubs:
                    #     id_links = ["http://api.oadoi.org/v2/{}".format(my_pub.id) for my_pub in new_pubs[0:5]]
                    #     logger.info(u"last few ids were {}".format(id_links))

                    pubs_this_chunk = []

        logger.info(u"at bottom of loop")

    # make sure to get the last ones
    logger.info(u"saving last ones")
    added_pubs = add_new_pubs(pubs_this_chunk)
    num_pubs_added_so_far += len(added_pubs)
    logger.info(u"Added >>{}<< new crossref dois on {}, took {} seconds".format(
        num_pubs_added_so_far, datetime.datetime.now().isoformat()[0:10], elapsed(start_time, 2)))
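Hedged invocation sketches for the two harvest variants above; the date, chunk size, and flag values are illustrative, not taken from any caller in the source.

import datetime

# pull works created in the last week and insert them as new pubs
get_new_dois_and_data_from_crossref(week=True, chunk_size=500)

# re-harvest by indexed date and route rows through add_pubs_or_update_crossref
get_dois_and_data_from_crossref(first=datetime.date(2018, 1, 1), get_updates=True)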
Example #8
def get_new_dois_and_data_from_crossref(query_doi=None,
                                        first=None,
                                        last=None,
                                        today=False,
                                        week=False,
                                        chunk_size=1000):
    i = 0
    records_to_save = []

    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    headers = {
        "Accept": "application/json",
        "User-Agent": "mailto:[email protected]"
    }

    root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first},until-created-date:{last}&rows={chunk}&cursor={next_cursor}"
    root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-created-date:{first}&rows={chunk}&cursor={next_cursor}"
    root_url_doi = "https://api.crossref.org/works?filter=doi:{doi}"

    # but to get all changes, use "indexed" rather than "created", as per https://github.com/CrossRef/rest-api-doc/blob/master/rest_api.md#notes-on-incremental-metadata-updates
    # root_url_with_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first},until-indexed-date:{last}&rows={chunk}&cursor={next_cursor}"
    # root_url_no_last = "https://api.crossref.org/works?order=desc&sort=updated&filter=from-indexed-date:{first}&rows={chunk}&cursor={next_cursor}"

    next_cursor = "*"
    has_more_responses = True
    num_pubs_added_so_far = 0
    pubs_this_chunk = []

    if week:
        last = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()
        first = (datetime.date.today() -
                 datetime.timedelta(days=7)).isoformat()
    elif today:
        last = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()
        first = (datetime.date.today() -
                 datetime.timedelta(days=2)).isoformat()

    if not first:
        first = "2016-04-01"

    start_time = time()

    while has_more_responses:

        if query_doi:
            url = root_url_doi.format(doi=query_doi)
        else:
            if last:
                url = root_url_with_last.format(first=first,
                                                last=last,
                                                next_cursor=next_cursor,
                                                chunk=chunk_size)
            else:
                # the query is much faster when no last date is specified, even one far in the future
                url = root_url_no_last.format(first=first,
                                              next_cursor=next_cursor,
                                              chunk=chunk_size)

        logger.info(u"calling url: {}".format(url))
        crossref_time = time()

        resp = requests.get(url, headers=headers)
        logger.info(u"getting crossref response took {} seconds".format(
            elapsed(crossref_time, 2)))
        if resp.status_code != 200:
            logger.info(u"error in crossref call, status_code = {}".format(
                resp.status_code))
            resp = None

        if resp:
            resp_data = resp.json()["message"]
            next_cursor = resp_data.get("next-cursor", None)
            if next_cursor:
                next_cursor = quote(next_cursor)

            if not resp_data["items"] or not next_cursor:
                has_more_responses = False

            for api_raw in resp_data["items"]:
                loop_time = time()

                doi = clean_doi(api_raw["DOI"])
                my_pub = build_new_pub(doi, api_raw)

                # hack so it gets updated soon
                my_pub.updated = datetime.datetime(1042, 1, 1)

                pubs_this_chunk.append(my_pub)

                if len(pubs_this_chunk) >= 100:
                    added_pubs = add_new_pubs(pubs_this_chunk)
                    logger.info(
                        u"added {} pubs, loop done in {} seconds".format(
                            len(added_pubs), elapsed(loop_time, 2)))
                    num_pubs_added_so_far += len(added_pubs)

                    # if new_pubs:
                    #     id_links = ["http://api.oadoi.org/v2/{}".format(my_pub.id) for my_pub in new_pubs[0:5]]
                    #     logger.info(u"last few ids were {}".format(id_links))

                    pubs_this_chunk = []
                    loop_time = time()

        logger.info(u"at bottom of loop")

    # make sure to get the last ones
    logger.info(u"saving last ones")
    added_pubs = add_new_pubs(pubs_this_chunk)
    num_pubs_added_so_far += len(added_pubs)
    logger.info(
        u"Added >>{}<< new crossref dois on {}, took {} seconds".format(
            num_pubs_added_so_far,
            datetime.datetime.now().isoformat()[0:10], elapsed(start_time, 2)))
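All of the versions above share the same Crossref deep-paging pattern; a stripped-down Python 3 sketch of just that loop (the mailto address is a placeholder) might look like this:

import requests
from urllib.parse import quote

def iter_crossref_works(filter_expr, rows=100, mailto="team@example.org"):
    # start with cursor=* and keep passing back the URL-encoded "next-cursor"
    # value from each response; stop when a page comes back empty
    headers = {"Accept": "application/json", "User-Agent": "mailto:{}".format(mailto)}
    cursor = "*"
    while True:
        url = "https://api.crossref.org/works?filter={}&rows={}&cursor={}".format(
            filter_expr, rows, cursor)
        message = requests.get(url, headers=headers).json()["message"]
        items = message.get("items", [])
        for item in items:
            yield item
        cursor = message.get("next-cursor")
        if not items or not cursor:
            break
        cursor = quote(cursor)

# e.g. iterate everything created since a given date:
# for work in iter_crossref_works("from-created-date:2024-01-01", rows=500):
#     print(work["DOI"])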