Exemplo n.º 1
0
def call_requests_get(url,
                      headers=None,
                      read_timeout=60,
                      connect_timeout=60,
                      stream=False,
                      publisher=None,
                      session_id=None,
                      ask_slowly=False):
    """GET *url* and keep following business-logic redirects.

    HTTP-level redirects are handled by requests itself
    (``allow_redirects=True``).  On top of that, after every 200 response
    ``keep_redirecting(r, publisher)`` is asked whether the page points at
    another URL; if it returns one, that URL is fetched next.  At most five
    GETs are issued per call.

    :param url: URL to fetch.  citeseerx.ist.psu.edu URLs are upgraded to
        https and sent through the proxy named by the ``STATIC_IP_PROXY``
        environment variable.
    :param headers: optional dict of request headers; defaults to no extra
        headers.
    :param read_timeout: read timeout in seconds.
    :param connect_timeout: connect timeout in seconds.
    :param stream: passed through to ``Session.get``.
    :param publisher: passed through to ``keep_redirecting``.
    :param session_id: accepted for interface compatibility; not used here.
    :param ask_slowly: when true, allow one retry on 500/502/503/504
        instead of none.
    :return: the ``requests`` response of the last hop.
    """
    # A ``{}`` default would be one dict shared by every call; use a fresh
    # one per call instead.
    if headers is None:
        headers = {}

    # The retry policy does not depend on the hop, so build it once.
    retries = Retry(total=1 if ask_slowly else 0,
                    backoff_factor=0.1,
                    status_forcelist=[500, 502, 503, 504])

    following_redirects = True
    num_redirects = 0
    while following_redirects:
        # A new session per hop, so no session state (e.g. cookies) is
        # carried from one hop to the next.
        requests_session = requests.Session()
        requests_session.mount('http://', DelayedAdapter(max_retries=retries))
        requests_session.mount('https://', DelayedAdapter(max_retries=retries))

        if u"citeseerx.ist.psu.edu/" in url:
            url = url.replace("http://", "https://")
            proxy_url = os.getenv("STATIC_IP_PROXY")
            proxies = {"https": proxy_url, "http": proxy_url}
        else:
            proxies = {}

        # logger.info(u"getting url {}".format(url))
        # NOTE(review): verify=False turns off TLS certificate verification
        # for every request made here -- confirm this is still intended.
        r = requests_session.get(url,
                                 headers=headers,
                                 timeout=(connect_timeout, read_timeout),
                                 stream=stream,
                                 proxies=proxies,
                                 allow_redirects=True,
                                 verify=False)

        # Graft the methods of RequestWithFileDownload onto the requests
        # Response class (``im_func`` is the Python 2 unbound-method API).
        # from http://jakeaustwick.me/extending-the-requests-response-class/
        for method_name, method in inspect.getmembers(RequestWithFileDownload,
                                                      inspect.ismethod):
            setattr(requests.models.Response, method_name, method.im_func)

        # A requests Response is truthy only when ``r.ok``, so the utf-8
        # fallback is applied to successful responses only.
        if r and not r.encoding:
            r.encoding = "utf-8"

        # check to see if we actually want to keep redirecting, using
        # business-logic redirect paths
        following_redirects = False
        num_redirects += 1
        if (r.status_code == 200) and (num_redirects < 5):
            redirect_url = keep_redirecting(r, publisher)
            if redirect_url:
                following_redirects = True
                url = redirect_url

    return r
Exemplo n.º 2
0
def get_response_page(url):
    """GET *url* asking for JSON through a retrying session; return the response.

    Up to 10 retries (backoff factor 1) are configured for the statuses
    413, 429, 500, 502, 503 and 504; connect and read timeouts are 180
    seconds each.
    """
    retry_policy = Retry(total=10, backoff_factor=1, status_forcelist=[413, 429, 500, 502, 503, 504])

    session = requests.Session()
    for scheme in ('http://', 'https://'):
        session.mount(scheme, DelayedAdapter(max_retries=retry_policy))

    # needs a mailto, see https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
    request_headers = {
        "Accept": "application/json",
        "User-Agent": "mailto:[email protected]",
    }

    return session.get(url, headers=request_headers, timeout=(180, 180))
Exemplo n.º 3
0
def get_chorus_data(starting_offset=0, agency_id=None, max_consecutive_failures=10):
    """Page through the CHORUS API per agency and store unseen items.

    For every agency returned by ``get_chorus_agencies()`` (or only the one
    matching *agency_id*), the "publicly_accessible" history is fetched 50
    items at a time.  Items that carry a DOI become ``Chorus`` rows; rows
    whose id is not yet in the database are added and committed.

    A page that fails (exception, or a response that is falsy, i.e. not
    ``ok``) is logged and skipped; paging continues with the next offset.

    :param starting_offset: offset of the first page requested for each
        agency (falsy values mean 0).
    :param agency_id: if given, only the agency with this id is processed.
    :param max_consecutive_failures: give up on an agency when this many
        pages in a row fail before its ``total_results`` is known.  Without
        this bound the paging loop could never terminate for an agency
        whose requests always fail.
    """
    requests_session = requests.Session()
    retries = Retry(total=10,
                    backoff_factor=0.5,
                    status_forcelist=[500, 502, 503, 504])
    requests_session.mount('http://', DelayedAdapter(max_retries=retries))
    requests_session.mount('https://', DelayedAdapter(max_retries=retries))

    url_template = "https://api.chorusaccess.org/v1.1/agencies/{agency_id}/histories/current?category=publicly_accessible&limit={limit}&offset={offset}"
    limit = 50

    agencies = get_chorus_agencies()
    for agency in agencies:
        if agency_id and int(agency["Agency_Id"]) != int(agency_id):
            print("skipping {}, you are not the agency id we are looking for".format(agency["Agency_Id"]))
            continue

        offset = starting_offset or 0

        logger.info(u"*** on agency {}:{}".format(agency["Agency_Name"], agency["Agency_Id"]))
        total_results = None
        consecutive_failures = 0
        while total_results is None or offset < total_results:
            loop_start = time()
            url = url_template.format(agency_id=agency["Agency_Id"], offset=offset, limit=limit)
            print(url)
            try:
                # timeout is in seconds: 360 s = 6 minutes
                r = requests_session.get(url, timeout=360)
            except Exception as e:
                # Keep the message as unicode: formatting UTF-8 encoded
                # bytes into a unicode template raises UnicodeDecodeError
                # on non-ASCII text under Python 2.
                logger.exception(u"Exception: {}, skipping".format(unicode(e.message)))
                r = None

            print(u"api call elapsed: {} seconds".format(elapsed(loop_start, 1)))
            # Advance even after a failure: a failed page is skipped, not retried.
            offset += limit

            if r:
                consecutive_failures = 0
                data = r.json()
                total_results = data["total_results"]
                logger.info(u"Has {} total results, {} remaining".format(
                    total_results, total_results - offset))

                new_objects = []
                for item in data["items"]:
                    if item["DOI"]:
                        doi = clean_doi(item["DOI"])
                        new_objects.append(Chorus(id=doi, raw=item))

                # Skip the lookup when there is nothing to look up (avoids
                # an empty IN () query); a set gives O(1) membership tests.
                ids_already_in_db = set()
                if new_objects:
                    candidate_ids = [obj.id for obj in new_objects]
                    rows = db.session.query(Chorus.id).filter(Chorus.id.in_(candidate_ids)).all()
                    ids_already_in_db = set(id_tuple[0] for id_tuple in rows)

                objects_to_add_to_db = [obj for obj in new_objects if obj.id not in ids_already_in_db]
                if objects_to_add_to_db:
                    logger.info(u"adding {} items".format(len(objects_to_add_to_db)))
                    db.session.add_all(objects_to_add_to_db)
                    safe_commit(db)
                else:
                    logger.info(u"all of these items already in db")
            else:
                consecutive_failures += 1
                # With total_results still unknown the loop condition can
                # never become false on its own, so bail out explicitly.
                if total_results is None and consecutive_failures >= max_consecutive_failures:
                    logger.info(u"giving up on agency {} after {} consecutive failed pages".format(
                        agency["Agency_Id"], consecutive_failures))
                    break

            logger.info(u"sleeping for 2 seconds")
            sleep(2)