Example #1
    def get_records(self, start_date, end_date):
        ''' Helper to page through the DataONE Solr API
        in batches of 1000 rows.
        Yields etree <doc> elements from the results '''

        query = 'dateModified:[{}T00:00:00Z TO {}T00:00:00Z]'.format(start_date.isoformat(), end_date.isoformat())
        doc = requests.get(DATAONE_SOLR_ENDPOINT, params={
            'q': query,
            'start': 0,
            'rows': 1
        })
        doc = etree.XML(doc.content)
        rows = int(doc.xpath("//result/@numFound")[0])

        n = 0
        while n < rows:
            data = requests.get(DATAONE_SOLR_ENDPOINT, params={
                'q': query,
                'start': n,
                'rows': 1000
            })
            docs = etree.XML(data.content).xpath('//doc')
            for doc in docs:
                yield doc
            n += 1000
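A minimal usage sketch for the generator above (the `harvester` instance and the dates are assumptions for illustration); each yielded `doc` is an lxml element parsed from the Solr XML response:

    from datetime import date

    # Hypothetical call site: consume the paginated Solr results lazily.
    for doc in harvester.get_records(date(2015, 3, 14), date(2015, 3, 15)):
        # Solr wraps each hit as <doc><str name="id">...</str>...</doc>
        identifiers = doc.xpath("str[@name='id']/node()")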
Example #2
    def fetch_rows(self, days_back):
        query = self.build_query(days_back)

        resp = requests.get(self.BASE_URL, params={
            'q': query,
            'rows': '0',
            'api_key': PLOS_API_KEY,
        })

        total_rows = etree.XML(resp.content).xpath('//result/@numFound')
        total_rows = int(total_rows[0]) if total_rows else 0

        current_row = 0
        while current_row < total_rows:
            response = requests.get(self.BASE_URL, throttle=5, params={
                'q': query,
                'start': current_row,
                'api_key': PLOS_API_KEY,
                'rows': self.MAX_ROWS_PER_REQUEST,
            })

            for doc in etree.XML(response.content).xpath('//doc'):
                yield doc

            current_row += self.MAX_ROWS_PER_REQUEST
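`build_query` is not shown in this example, but Example #9 below builds the equivalent query string directly from two dates, so a plausible sketch of the method (an assumption, not the project's actual code) is:

    from datetime import date, timedelta

    def build_query(self, days_back):
        # Inferred from Example #9: restrict publication_date to the last `days_back` days.
        end_date = date.today()
        start_date = end_date - timedelta(days_back)
        return 'publication_date:[{}T00:00:00Z TO {}T00:00:00Z]'.format(
            start_date.isoformat(), end_date.isoformat())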
Example #3
    def harvest(self, start_date=None, end_date=None):
        ''' First get a list of all recently updated study URLs,
        then fetch the XML for each one and collect the results
        in a list of RawDocuments along with other information '''

        start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()
        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

        # grab each of those urls for full content
        xml_list = []
        xml_base_url = self.canonical_base_url + '&view=xml'
        for dataset_id in self.query_by_date(start_date, end_date):
            try:
                item_url = str(xml_base_url).format(dataset_id)
                content = requests.get(item_url, throttle=2)
            except exceptions.ConnectionError as e:
                logger.info('Connection error: {}, wait a bit...'.format(e))
                time.sleep(30)
                content = requests.get(item_url)
            doc = etree.XML(content.content)

            record = etree.tostring(doc, encoding=self.DEFAULT_ENCODING)
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(dataset_id),
                'filetype': 'xml',
            }))

        return xml_list
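The try/except above retries exactly once after a ConnectionError. A hedged sketch of the same single-retry pattern pulled into a helper (the name `get_with_retry` is hypothetical and reuses the surrounding module's `requests`, `exceptions`, `logger`, and `time` names):

    def get_with_retry(url, throttle=2, wait=30):
        # Mirrors the inline pattern in harvest(): one retry after a ConnectionError.
        try:
            return requests.get(url, throttle=throttle)
        except exceptions.ConnectionError as e:
            logger.info('Connection error: {}, wait a bit...'.format(e))
            time.sleep(wait)
            return requests.get(url)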
Example #4
    def test_get_calls_get(self, monkeypatch):
        mock_request = mock.Mock()
        monkeypatch.setattr(requests, "record_or_load_response", mock_request)

        requests.get("test", tota="dyle")

        mock_request.assert_called_once_with("get", "test", tota="dyle")
Example #5
    def get_records(self, days_back):
        ''' Helper to page through the DataONE Solr API
        in batches of 1000 rows.
        Yields etree <doc> elements from the results '''
        to_date = datetime.utcnow()
        from_date = (datetime.utcnow() - timedelta(days=days_back))

        to_date = to_date.replace(hour=0, minute=0, second=0, microsecond=0)
        from_date = from_date.replace(hour=0, minute=0, second=0, microsecond=0)

        query = 'dateModified:[{}Z TO {}Z]'.format(from_date.isoformat(), to_date.isoformat())
        doc = requests.get(DATAONE_SOLR_ENDPOINT, params={
            'q': query,
            'start': 0,
            'rows': 1
        })
        doc = etree.XML(doc.content)
        rows = int(doc.xpath("//result/@numFound")[0])

        n = 0
        while n < rows:
            data = requests.get(DATAONE_SOLR_ENDPOINT, params={
                'q': query,
                'start': n,
                'rows': 1000
            })
            docs = etree.XML(data.content).xpath('//doc')
            for doc in docs:
                yield doc
            n += 1000
Example #6
    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        base_url = 'http://www.osti.gov/pages/pagesxml?nrows={0}&EntryDateFrom={1}&EntryDateTo={2}'
        url = base_url.format('1', format_date_with_slashes(start_date), format_date_with_slashes(end_date))
        initial_data = requests.get(url)
        record_encoding = initial_data.encoding
        initial_doc = etree.XML(initial_data.content)

        num_results = int(initial_doc.xpath('//records/@count', namespaces=self.namespaces)[0])

        url = base_url.format(num_results, format_date_with_slashes(start_date),
                              format_date_with_slashes(end_date))

        data = requests.get(url)
        doc = etree.XML(data.content)

        records = doc.xpath('records/record')

        xml_list = []
        for record in records:
            doc_id = record.xpath('dc:ostiId/node()', namespaces=self.namespaces)[0]
            record = etree.tostring(record, encoding=record_encoding)
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

        return xml_list
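`format_date_with_slashes` is not defined here, but the OSTI URL expects MM/DD/YYYY dates and Example #15 below calls `strftime('%m/%d/%Y')` directly, so a plausible sketch (an assumption) is:

    def format_date_with_slashes(date_obj):
        # Assumed behaviour, inferred from Example #15: render a date as MM/DD/YYYY.
        return date_obj.strftime('%m/%d/%Y')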
Example #7
    def test_get_calls_get(self, monkeypatch):
        mock_request = mock.Mock()
        monkeypatch.setattr(requests, 'record_or_load_response', mock_request)

        requests.get('test', tota='dyle')

        mock_request.assert_called_once_with('get', 'test', tota='dyle')
Example #8
    def get_records(self, start_date, end_date):
        ''' Helper to page through the DataONE Solr API
        in batches of 1000 rows.
        Yields etree <doc> elements from the results '''

        query = 'dateModified:[{}T00:00:00Z TO {}T00:00:00Z]'.format(
            start_date.isoformat(), end_date.isoformat())
        doc = requests.get(DATAONE_SOLR_ENDPOINT,
                           params={
                               'q': query,
                               'start': 0,
                               'rows': 1
                           })
        doc = etree.XML(doc.content)
        rows = int(doc.xpath("//result/@numFound")[0])

        n = 0
        while n < rows:
            data = requests.get(DATAONE_SOLR_ENDPOINT,
                                params={
                                    'q': query,
                                    'start': n,
                                    'rows': 1000
                                })
            docs = etree.XML(data.content).xpath('//doc')
            for doc in docs:
                yield doc
            n += 1000
Example #9
    def fetch_rows(self, start_date, end_date):
        query = 'publication_date:[{}T00:00:00Z TO {}T00:00:00Z]'.format(start_date, end_date)

        resp = requests.get(self.BASE_URL, params={
            'q': query,
            'rows': '0',
            'api_key': PLOS_API_KEY,
        })

        total_rows = etree.XML(resp.content).xpath('//result/@numFound')
        total_rows = int(total_rows[0]) if total_rows else 0

        current_row = 0
        while current_row < total_rows:
            response = requests.get(self.BASE_URL, throttle=5, params={
                'q': query,
                'start': current_row,
                'api_key': PLOS_API_KEY,
                'rows': self.MAX_ROWS_PER_REQUEST,
            })

            for doc in etree.XML(response.content).xpath('//doc'):
                yield doc

            current_row += self.MAX_ROWS_PER_REQUEST
Example #10
    def harvest(self, start_date=None, end_date=None):
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        base_url = 'http://api.crossref.org/v1/works?filter=from-pub-date:{},until-pub-date:{}&rows={{}}&offset={{}}'.format(
            start_date.isoformat(), end_date.isoformat())
        total = requests.get(base_url.format(
            '0', '0')).json()['message']['total-results']
        logger.info('{} documents to be harvested'.format(total))

        doc_list = []
        for i in xrange(0, total, 1000):
            records = requests.get(base_url.format(
                1000, i)).json()['message']['items']
            logger.info('Harvested {} documents'.format(i + len(records)))

            for record in records:
                doc_id = record['DOI']
                doc_list.append(
                    RawDocument({
                        'doc': json.dumps(record),
                        'source': self.short_name,
                        'docID': doc_id,
                        'filetype': 'json'
                    }))

        return doc_list
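The double braces in `base_url` survive the first `.format()` call, so the dates are baked in up front and `rows`/`offset` are filled in per request. A short illustration with assumed dates:

    # Illustration of the two-stage formatting above (dates are placeholders).
    base_url = 'http://api.crossref.org/v1/works?filter=from-pub-date:{},until-pub-date:{}&rows={{}}&offset={{}}'.format(
        '2015-03-14', '2015-03-15')
    # base_url now ends with '...&rows={}&offset={}'
    count_url = base_url.format('0', '0')      # count-only request
    first_page = base_url.format(1000, 0)      # '...&rows=1000&offset=0'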
Example #11
    def harvest(self, start_date=None, end_date=None):
        ''' First get a list of all recently updated study URLs,
        then fetch the XML for each one and collect the results
        in a list of RawDocuments along with other information '''

        start_date = (start_date or date.today() -
                      timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()
        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

        # grab each of those urls for full content
        xml_list = []
        xml_base_url = self.canonical_base_url + '&view=xml'
        for dataset_id in self.query_by_date(start_date, end_date):
            try:
                item_url = str(xml_base_url).format(dataset_id)
                content = requests.get(item_url, throttle=2)
            except exceptions.ConnectionError as e:
                logger.info('Connection error: {}, wait a bit...'.format(e))
                time.sleep(30)
                content = requests.get(item_url)
            doc = etree.XML(content.content)

            record = etree.tostring(doc, encoding=self.DEFAULT_ENCODING)
            xml_list.append(
                RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(dataset_id),
                    'filetype': 'xml',
                }))

        return xml_list
Example #12
    def test_get_calls_get(self, monkeypatch):
        mock_request = mock.Mock()
        monkeypatch.setattr(requests, 'record_or_load_response', mock_request)

        requests.get('test', tota='dyle')

        mock_request.assert_called_once_with('get', 'test', tota='dyle')
Example #13
    def test_force_makes_request(self, mock_requests, monkeypatch):
        mock_requests.request.return_value = mock.Mock(
            ok=True, encoding="utf-8", content="rawr", status_code=200, headers={"tota": "dyle"}
        )

        requests.get("dinosaurs.sexy", force=True)
        assert mock_requests.request.called is True
Example #14
    def fetch_rows(self, start_date, end_date):
        query = 'publication_date:[{}T00:00:00Z TO {}T00:00:00Z]'.format(
            start_date, end_date)

        resp = requests.get(self.BASE_URL,
                            params={
                                'q': query,
                                'rows': '0',
                                'api_key': PLOS_API_KEY,
                            })

        total_rows = etree.XML(resp.content).xpath('//result/@numFound')
        total_rows = int(total_rows[0]) if total_rows else 0

        current_row = 0
        while current_row < total_rows:
            response = requests.get(self.BASE_URL,
                                    throttle=5,
                                    params={
                                        'q': query,
                                        'start': current_row,
                                        'api_key': PLOS_API_KEY,
                                        'rows': self.MAX_ROWS_PER_REQUEST,
                                    })

            for doc in etree.XML(response.content).xpath('//doc'):
                yield doc

            current_row += self.MAX_ROWS_PER_REQUEST
Example #15
    def harvest(self, days_back=1):
        today = date.today()
        start_date = today - timedelta(days_back)
        base_url = 'http://www.osti.gov/pages/pagesxml?nrows={0}&EntryDateFrom={1}&EntryDateTo={2}'
        url = base_url.format('1', start_date.strftime('%m/%d/%Y'), today.strftime('%m/%d/%Y'))
        initial_data = requests.get(url)
        record_encoding = initial_data.encoding
        initial_doc = etree.XML(initial_data.content)

        num_results = int(initial_doc.xpath('//records/@count', namespaces=self.namespaces)[0])

        url = base_url.format(num_results, start_date.strftime('%m/%d/%Y'), today.strftime('%m/%d/%Y'))
        data = requests.get(url)
        doc = etree.XML(data.content)

        records = doc.xpath('records/record')

        xml_list = []
        for record in records:
            doc_id = record.xpath('dc:ostiId/node()', namespaces=self.namespaces)[0]
            record = etree.tostring(record, encoding=record_encoding)
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

        return xml_list
Example #16
    def test_record_or_load_response_respects_record_true(self, mock_requests, monkeypatch):
        mock_rec_or_load = mock.Mock()
        monkeypatch.setattr(requests, "record_or_load_response", mock_rec_or_load)

        requests.get("foo")

        assert mock_rec_or_load.called_once_with("get", "foo")
        assert mock_requests.request.called_once_with("get", "foo")
Example #17
    def test_record_or_load_response_respects_record_true(self, mock_requests, monkeypatch):
        mock_rec_or_load = mock.Mock()
        monkeypatch.setattr(requests, 'record_or_load_response', mock_rec_or_load)

        requests.get('foo')

        assert mock_rec_or_load.called_once_with('get', 'foo')
        assert mock_requests.request.called_once_with('get', 'foo')
Example #18
    def test_record_or_load_response_respects_record_false(self, mock_requests, monkeypatch):
        mock_rec_or_load = mock.Mock()
        monkeypatch.setattr(requests.settings, "RECORD_HTTP_TRANSACTIONS", False)
        monkeypatch.setattr(requests, "record_or_load_response", mock_rec_or_load)

        requests.get("foo")

        assert not mock_rec_or_load.called
        assert mock_requests.request.called_once_with("get", "foo")
Example #19
    def test_record_or_load_response_respects_record_false(self, mock_requests, monkeypatch):
        mock_rec_or_load = mock.Mock()
        monkeypatch.setattr(requests.settings, 'RECORD_HTTP_TRANSACTIONS', False)
        monkeypatch.setattr(requests, 'record_or_load_response', mock_rec_or_load)

        requests.get('foo')

        assert not mock_rec_or_load.called
        assert mock_requests.request.called_once_with('get', 'foo')
Example #20
    def get_records(self, search_url):
        all_lessons = []
        resp = requests.get(self.URL + '?page=last').json()
        last_lesson_id = resp['lessons'][-1]['id']
        for pk in range(last_lesson_id + 1):
            lesson = requests.get(search_url + "/" + str(pk), expected=[200, 403, 404])
            if lesson.status_code == 200:
                lesson_list = lesson.json()['lessons'][0]
                all_lessons.append(lesson_list)
        return all_lessons
Example #21
    def test_force_makes_request(self, mock_requests, monkeypatch):
        mock_requests.request.return_value = mock.Mock(
            ok=True,
            encoding='utf-8',
            content='rawr',
            status_code=200,
            headers={'tota': 'dyle'})

        requests.get('dinosaurs.sexy', force=True)
        assert mock_requests.request.called is True
Example #22
    def get_records(self, search_url):
        all_lessons = []
        resp = requests.get(self.URL + '?page=last').json()
        last_lesson_id = resp['lessons'][-1]['id']
        for pk in range(last_lesson_id + 1):
            lesson = requests.get(search_url + "/" + str(pk),
                                  expected=[200, 403, 404])
            if lesson.status_code == 200:
                lesson_list = lesson.json()['lessons'][0]
                all_lessons.append(lesson_list)
        return all_lessons
Example #23
    def test_force_makes_new_request(self, mock_requests, monkeypatch):
        requests.HarvesterResponse(ok=True, method='get', url='dinosaurs.sexy', content='citychicken').save()
        mock_requests.request.return_value = mock.Mock(encoding='utf-8', content='Snapcity', status_code=200, headers={'tota': 'dyle'})

        resp = requests.get('dinosaurs.sexy')

        assert resp.content == 'citychicken'
        assert mock_requests.request.called is False

        resp = requests.get('dinosaurs.sexy', force=True)

        assert resp.content == 'Snapcity'
        assert mock_requests.request.called is True
Example #24
    def get_records(self, search_url):
        records = requests.get(search_url).json()['response'].get('award')
        offset = 1

        all_records = []
        while len(records) == 25:
            for record in records:
                all_records.append(record)

            offset += 25
            records = requests.get(search_url + '&offset={}'.format(str(offset)), throttle=3).json()['response'].get('award')
        all_records.extend(records)

        return all_records
Example #25
    def test_force_makes_new_request(self, mock_requests, monkeypatch):
        requests.HarvesterResponse(ok=True, method="get", url="dinosaurs.sexy", content=b"citychicken").save()
        mock_requests.request.return_value = mock.Mock(
            encoding="utf-8", content=b"Snapcity", status_code=200, headers={"tota": "dyle"}
        )

        resp = requests.get("dinosaurs.sexy")

        assert resp.content == b"citychicken"
        assert mock_requests.request.called is False

        resp = requests.get("dinosaurs.sexy", force=True)

        assert resp.content == b"Snapcity"
        assert mock_requests.request.called is True
Example #26
    def get_records(self, search_url):
        records = requests.get(search_url)
        total_records = records.json()['data']['total_count']
        start = 0
        all_records = []

        while len(all_records) < total_records:
            records = requests.get(search_url + '&start={}'.format(str(start)))
            record_list = records.json()['data']['items']

            for record in record_list:
                all_records.append(record)

            start += self.MAX_ITEMS_PER_REQUEST

        return all_records
Example #27
    def harvest(self, start_date=None, end_date=None):

        start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()

        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

        base_url = 'https://www.earthsystemgrid.org/oai/repository?verb=ListRecords&metadataPrefix=dif&from={}&until={}'

        url = base_url.format(start_date, end_date)

        data = requests.get(url)
        doc = etree.XML(data.content)

        records = doc.xpath('//OAI-PMH:record', namespaces=self.namespaces)

        xml_list = []
        for record in records:
            doc_id = record.xpath('//OAI-PMH:header/OAI-PMH:identifier/node()', namespaces=self.namespaces)[0]
            record = etree.tostring(record)
            xml_list.append(RawDocument({
                'doc': record,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

        return xml_list
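The XPath calls above depend on `self.namespaces` mapping the `OAI-PMH` prefix; a sketch of the mapping this kind of harvester presumably uses (the standard OAI-PMH 2.0 namespace URI):

    # Assumed namespace mapping for the '//OAI-PMH:record' XPath above.
    namespaces = {
        'OAI-PMH': 'http://www.openarchives.org/OAI/2.0/',
    }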
Example #28
File: springer.py Project: kms6bn/scrapi
    def get_records(self, search_urls):
        all_records_from_all_days = []
        for search_url in search_urls:
            records = requests.get(search_url).json()
            index = 1
            total_records = int(records['result'][0]['total'])

            all_records = []
            while len(all_records) < total_records:
                record_list = records['records']
                all_records += record_list
                index += 100
                records = requests.get(search_url + '&s={}'.format(str(index)), throttle=10).json()
            all_records_from_all_days = all_records_from_all_days + all_records

        return all_records_from_all_days
Example #29
    def _fetch_records(self, start_date, end_date):
        page = 0
        morepages = True

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        while morepages:
            resp = requests.get(self.base_url, params={
                'page': page,
                'EntryDateTo': format_date_with_slashes(end_date),
                'EntryDateFrom': format_date_with_slashes(start_date),
            })

            xml = etree.XML(resp.content)

            for record in xml.xpath('records/record'):
                yield record

            page += 1
            morepages = xml.xpath('//records/@morepages')[0] == 'true'
Example #30
    def query_by_date(self, start_date, end_date):
        '''Use OAI-PMH interface to get a list of dataset ids for the given date range'''
        search_url_end = '&metadataPrefix=oai_dc&from={}&until={}'.format(
            start_date, end_date)
        search_url = self.search_base_url + search_url_end

        while True:
            record_list = requests.get(search_url)
            record_list_xml = etree.XML(record_list.content)
            if record_list_xml.xpath('./oai_dc:error', namespaces=self.oai_ns):
                break

            for dataset in record_list_xml.xpath(
                    './oai_dc:ListRecords/oai_dc:record',
                    namespaces=self.oai_ns):
                yield dataset.xpath('./oai_dc:header/oai_dc:identifier/node()',
                                    namespaces=self.oai_ns)[0]

            token = record_list_xml.xpath(
                './oai_dc:ListRecords/oai_dc:resumptionToken/node()',
                namespaces=self.oai_ns)
            if not token:
                break

            search_url = self.search_base_url + '&resumptionToken=' + token[0]
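A usage sketch for the generator above, assuming an instance whose `search_base_url` already carries `verb=ListRecords` and whose `oai_ns` mapping is populated (the dates and the `harvester` name are illustrative):

    # Hypothetical driver for query_by_date(): collect dataset ids for a one-day window.
    dataset_ids = list(harvester.query_by_date('2015-03-14T00:00:00Z',
                                               '2015-03-15T00:00:00Z'))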
Example #31
File: nih.py Project: AndrewSallans/scrapi
    def harvest(self, start_date=None, end_date=None):
        """
        Return a list of RawDocuments
        """
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        base_url = 'http://exporter.nih.gov/'
        table_url = 'http://exporter.nih.gov/ExPORTER_Catalog.aspx/'

        # get ExPORTER page html and rows storing records
        html = requests.get(table_url).content
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find('table', id="ContentPlaceHolder1_ProjectData_dgProjectData")
        rows = table.find_all('tr', class_="row_bg")
        urls = [i for i in construct_urls(base_url, start_date, end_date, rows)]

        return [
            RawDocument({
                'doc': etree.tostring(record, encoding=self.DEFAULT_ENCODING),
                'source': self.short_name,
                'docID': copy_to_unicode(record.xpath('.//APPLICATION_ID/node()', namespaces=self.namespaces)[0]),
                'filetype': 'xml'
            }) for record in xml_records(get_xml_files(urls))
        ]
Example #32
File: helpers.py Project: zamattiac/scrapi
def oai_get_records_and_token(url, throttle, force, namespaces, verify):
    """ Helper function to get the records and any resumptionToken
    from an OAI request.

    Takes a url and any request parameters and returns the records
    along with the resumptionToken if there is one.
    """
    data = requests.get(url, throttle=throttle, force=force, verify=verify)

    encoding = data.encoding or 'utf-8'
    if encoding.lower() == 'none':
        encoding = 'utf-8'

    parser = etree.XMLParser(recover=True, encoding=encoding)
    doc = etree.XML(data.content, parser=parser)

    records = doc.xpath(
        '//ns0:record',
        namespaces=namespaces
    )

    token = doc.xpath(
        '//ns0:resumptionToken/node()',
        namespaces=namespaces
    )

    return records, token
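A hedged sketch of how a harvester might drive this helper, mirroring the resumptionToken loop in Example #30; the endpoint URL and the `ns0` namespace value are assumptions:

    # Hypothetical resumption-token loop built on oai_get_records_and_token().
    endpoint = 'http://example.org/oai'
    namespaces = {'ns0': 'http://www.openarchives.org/OAI/2.0/'}
    url = endpoint + '?verb=ListRecords&metadataPrefix=oai_dc&from=2015-03-14&until=2015-03-15'
    all_records = []
    while True:
        records, token = oai_get_records_and_token(
            url, throttle=0.5, force=False, namespaces=namespaces, verify=True)
        all_records.extend(records)
        if not token:
            break
        url = endpoint + '?verb=ListRecords&resumptionToken=' + token[0]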
Example #33
    def test_record_or_load_params(self, mock_requests, monkeypatch):
        mock_requests.request.return_value = mock.Mock(encoding='utf-8', content='Snapcity', status_code=200, headers={'tota': 'dyle'})

        resp = requests.get('dinosaurs.sexy', params={'test': 'foo'})

        assert resp.status_code == 200
        assert resp.url == 'dinosaurs.sexy?test=foo'
Example #34
    def get_records(self, search_url):
        records = requests.get(search_url)
        total_records = records.json()['data']['total_count']
        start = 0
        all_records = []

        while len(all_records) < total_records:
            records = requests.get(search_url + '&start={}'.format(str(start)))
            record_list = records.json()['data']['items']

            for record in record_list:
                all_records.append(record)

            start += self.MAX_ITEMS_PER_REQUEST

        return all_records
Example #35
    def test_record_or_load_remakes(self, mock_requests, monkeypatch):
        mock_requests.request.return_value = mock.Mock(
            encoding='utf-8',
            content='rawr',
            status_code=200,
            headers={'tota': 'dyle'})
        requests.HarvesterResponse(ok=False,
                                   method='get',
                                   url='dinosaurs.sexy').save()

        model = requests.HarvesterResponse.get(method='get',
                                               url='dinosaurs.sexy')

        assert not model.ok
        assert model.method == 'get'
        assert model.url == 'dinosaurs.sexy'

        resp = requests.get('dinosaurs.sexy')

        model = requests.HarvesterResponse.get(method='get',
                                               url='dinosaurs.sexy')

        assert model.method == 'get'
        assert model.content == b'rawr'
        assert model.encoding == 'utf-8'
        assert model.status_code == 200
        assert model.url == 'dinosaurs.sexy'
        assert model.headers == {'tota': 'dyle'}
        assert model.headers_str == '{"tota": "dyle"}'
        assert isinstance(resp, requests.HarvesterResponse)
Example #36
    def get_records(self, search_url):
        records = requests.get(search_url)
        total_records = records.json()['recordCount']
        logger.info('Harvesting {} records'.format(total_records))
        page_number = 1
        count = 0

        while records.json()['records']:
            record_list = records.json()['records']
            for record in record_list:
                count += 1
                yield record

            page_number += 1
            records = requests.get(search_url + '&page_number={}'.format(page_number), throttle=3)
            logger.info('{} documents harvested'.format(count))
Example #37
def fetch_commits(base_url, start_date, end_date):

    jsonstr = ""
    i = 1
    while True:
        resp = requests.get(base_url, params={
            'since': start_date,
            'until': end_date,
            'page': i,
            'per_page': 100,
        })

        jsonchunk = resp.content.decode('utf-8')
        if len(jsonchunk) <= 2:
            break
        i += 1

        jsonchunk = jsonchunk.replace('},{', '}\\n{')
        jsonchunk = jsonchunk[1:-1]
        jsonstr = jsonstr + "\\n" + jsonchunk

    jsonarr = jsonstr.split('\\n')[1:]

    shas = []
    for jsonstring in jsonarr:
        jsonobj = json.loads(jsonstring)
        shas.append(jsonobj['sha'])
    return shas
Example #38
    def get_records(self, search_url):
        records = requests.get(search_url).json()['response'].get('award')
        offset = 1

        all_records = []
        while len(records) == 25:
            for record in records:
                all_records.append(record)

            offset += 25
            records = requests.get(search_url +
                                   '&offset={}'.format(str(offset)),
                                   throttle=3).json()['response'].get('award')
        all_records.extend(records)

        return all_records
Example #39
File: nih.py Project: zamattiac/scrapi
    def harvest(self, start_date=None, end_date=None):
        """
        Return a list of RawDocuments
        """
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        base_url = 'http://exporter.nih.gov/'
        table_url = 'http://exporter.nih.gov/ExPORTER_Catalog.aspx/'

        # get ExPORTER page html and rows storing records
        html = requests.get(table_url).content
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find('table',
                          id="ContentPlaceHolder1_ProjectData_dgProjectData")
        rows = table.find_all('tr', class_="row_bg")
        urls = [
            i for i in construct_urls(base_url, start_date, end_date, rows)
        ]

        return [
            RawDocument({
                'doc': etree.tostring(record, encoding=self.DEFAULT_ENCODING),
                'source': self.short_name,
                'docID': copy_to_unicode(
                    record.xpath('.//APPLICATION_ID/node()',
                                 namespaces=self.namespaces)[0]),
                'filetype': 'xml'
            }) for record in xml_records(get_xml_files(urls))
        ]
Example #40
File: ncar.py Project: zamattiac/scrapi
    def harvest(self, start_date=None, end_date=None):

        start_date = (start_date or date.today() -
                      timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()

        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

        base_url = 'https://www.earthsystemgrid.org/oai/repository?verb=ListRecords&metadataPrefix=dif&from={}&until={}'

        url = base_url.format(start_date, end_date)

        data = requests.get(url)
        doc = etree.XML(data.content)

        records = doc.xpath('//OAI-PMH:record', namespaces=self.namespaces)

        xml_list = []
        for record in records:
            doc_id = record.xpath('//OAI-PMH:header/OAI-PMH:identifier/node()',
                                  namespaces=self.namespaces)[0]
            record = etree.tostring(record)
            xml_list.append(
                RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))

        return xml_list
Example #41
def fetch_commits(base_url, start_date, end_date):

    jsonstr = ""
    i = 1
    while True:
        resp = requests.get(base_url, params={
            'since': start_date,
            'until': end_date,
            'page': i,
            'per_page': 100,
        })

        jsonchunk = resp.content.decode('utf-8')
        if len(jsonchunk) <= 2:
            break
        i += 1

        jsonchunk = jsonchunk.replace('},{', '}\\n{')
        jsonchunk = jsonchunk[1:-1]
        jsonstr = jsonstr + "\\n" + jsonchunk

    jsonarr = jsonstr.split('\\n')[1:]

    shas = []
    for jsonstring in jsonarr:
        jsonobj = json.loads(jsonstring)
        shas.append(jsonobj['sha'])
    return shas
Example #42
    def get_records(self, search_url):
        records = requests.get(search_url)
        total_records = records.json()['items_found']
        page = 1

        all_records = []
        while len(all_records) < total_records:
            record_list = records.json()['items']

            for record in record_list:
                if len(all_records) < total_records:
                    all_records.append(record)

            page += 1
            records = requests.get(search_url + '&page={}'.format(str(page)), throttle=3)

        return all_records
Example #43
def fetch_file_names(commit_url, sha):
    resp = requests.get(commit_url.format(sha))

    jsonstr = resp.content.decode('utf-8')
    jsonobj = json.loads(jsonstr)

    files = [d['filename'] for d in jsonobj['files']]
    return files
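Combined with `fetch_commits` from Example #37, a sketch of listing every file touched in a date range; the GitHub-style URL templates and repository name are placeholders:

    # Hypothetical composition of fetch_commits() and fetch_file_names().
    commits_url = 'https://api.github.com/repos/OWNER/REPO/commits'
    commit_url = 'https://api.github.com/repos/OWNER/REPO/commits/{}'

    touched = []
    for sha in fetch_commits(commits_url, '2015-03-14T00:00:00Z', '2015-03-15T00:00:00Z'):
        touched.extend(fetch_file_names(commit_url, sha))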
Example #44
File: osf.py Project: zamattiac/scrapi
    def get_records(self, search_url):
        records = requests.get(search_url)

        total = int(records.json()['counts']['registration'])

        from_arg = 0
        all_records = []
        while len(all_records) < total:
            record_list = records.json()['results']

            for record in record_list:
                all_records.append(record)

            from_arg += 1000
            records = requests.get(search_url + '&from={}'.format(str(from_arg)), throttle=10)

        return all_records
Example #45
    def get_records(self, search_url):
        records = requests.get(search_url)
        total_records = records.json()['items_found']
        page = 1

        all_records = []
        while len(all_records) < total_records:
            record_list = records.json()['items']

            for record in record_list:
                if len(all_records) < total_records:
                    all_records.append(record)

            page += 1
            records = requests.get(search_url + '&page={}'.format(str(page)), throttle=3)

        return all_records
Example #46
def fetch_file_names(commit_url, sha):
    resp = requests.get(commit_url.format(sha))

    jsonstr = resp.content.decode('utf-8')
    jsonobj = json.loads(jsonstr)

    files = [d['filename'] for d in jsonobj['files']]
    return files
Example #47
    def get_records(self, search_url):
        records = requests.get(search_url + "#{}".format(datetime.date.today()))
        page = 1

        all_records = []
        current_records = len(records.json()['entries'])
        while current_records > 0:
            record_list = records.json()['entries']

            for record in record_list:
                all_records.append(record)

            page += 1
            records = requests.get(search_url + '&page={}#{}'.format(str(page), datetime.date.today()), throttle=10)
            current_records = len(records.json()['entries'])

        return all_records
Example #48
    def test_record_or_load_loads(self, mock_requests, monkeypatch):
        requests.HarvesterResponse(ok=True, method='get', url='dinosaurs.sexy', content='rawr', headers_str="{}").save()

        resp = requests.get('dinosaurs.sexy')

        assert resp.headers == {}
        assert resp.content == 'rawr'
        assert not mock_requests.request.called
        assert isinstance(resp, requests.HarvesterResponse)
Example #49
    def get_records(self, search_urls):
        all_records_from_all_days = []
        for search_url in search_urls:
            records = requests.get(search_url).json()
            index = 1
            total_records = int(records['result'][0]['total'])

            all_records = []
            while len(all_records) < total_records:
                record_list = records['records']
                all_records += record_list
                index += 100
                records = requests.get(
                    search_url + '&s={}'.format(str(index)),
                    throttle=10).json()
            all_records_from_all_days = all_records_from_all_days + all_records

        return all_records_from_all_days
Example #50
    def get_records(self, search_url):
        records = requests.get(search_url)
        total_records = records.json()['recordCount']
        logger.info('Harvesting {} records'.format(total_records))
        page_number = 1
        count = 0

        while records.json()['records']:
            record_list = records.json()['records']
            for record in record_list:
                count += 1
                yield record

            page_number += 1
            records = requests.get(search_url +
                                   '&page_number={}'.format(page_number),
                                   throttle=3)
            logger.info('{} documents harvested'.format(count))
Example #51
    def test_request_doesnt_throttle_on_load(self, mock_requests, monkeypatch):
        mock_sleep = mock.Mock()
        monkeypatch.setattr(requests.time, 'sleep', mock_sleep)
        requests.HarvesterResponse(ok=True, method='get', url='dinosaurs.sexy', content='citychicken').save()

        resp = requests.get('dinosaurs.sexy', throttle=2)

        assert mock_sleep.called is False
        assert mock_requests.request.called is False
        assert isinstance(resp, requests.HarvesterResponse)
Example #52
    def test_record_or_load_throttle_throttles(self, mock_requests, monkeypatch):
        mock_sleep = mock.Mock()
        monkeypatch.setattr(requests.time, 'sleep', mock_sleep)
        mock_requests.request.return_value = mock.Mock(encoding='utf-8', content='Snapcity', status_code=200, headers={'tota': 'dyle'})

        resp = requests.get('dinosaurs.sexy', throttle=2)

        mock_sleep.assert_called_once_with(2)
        assert mock_requests.request.called is True
        assert isinstance(resp, requests.HarvesterResponse)
Example #53
    def get_records(self, search_url):
        records = requests.get(search_url + "#{}".format(date.today()))
        page = 1

        all_records = []
        current_records = len(records.json()['entries'])
        while current_records > 0:
            record_list = records.json()['entries']

            for record in record_list:
                all_records.append(record)

            page += 1
            records = requests.get(
                search_url + '&page={}#{}'.format(str(page), date.today()),
                throttle=10)
            current_records = len(records.json()['entries'])

        return all_records