def get_records(self, start_date, end_date):
    '''Helper function to page through the DataONE Solr API for the given
    date range. Yields each result as an etree <doc> element.
    '''
    query = 'dateModified:[{}T00:00:00Z TO {}T00:00:00Z]'.format(
        start_date.isoformat(), end_date.isoformat())

    # Ask for a single row first, just to learn the total number of results
    doc = requests.get(DATAONE_SOLR_ENDPOINT, params={
        'q': query,
        'start': 0,
        'rows': 1
    })
    doc = etree.XML(doc.content)
    rows = int(doc.xpath("//result/@numFound")[0])

    # Then page through the full result set 1000 rows at a time
    n = 0
    while n < rows:
        data = requests.get(DATAONE_SOLR_ENDPOINT, params={
            'q': query,
            'start': n,
            'rows': 1000
        })
        docs = etree.XML(data.content).xpath('//doc')
        for doc in docs:
            yield doc
        n += 1000

def fetch_rows(self, days_back):
    query = self.build_query(days_back)

    # Request zero rows first, just to read numFound from the response
    resp = requests.get(self.BASE_URL, params={
        'q': query,
        'rows': '0',
        'api_key': PLOS_API_KEY,
    })
    total_rows = etree.XML(resp.content).xpath('//result/@numFound')
    total_rows = int(total_rows[0]) if total_rows else 0

    # Page through the results, yielding each Solr <doc> element
    current_row = 0
    while current_row < total_rows:
        response = requests.get(self.BASE_URL, throttle=5, params={
            'q': query,
            'start': current_row,
            'api_key': PLOS_API_KEY,
            'rows': self.MAX_ROWS_PER_REQUEST,
        })
        for doc in etree.XML(response.content).xpath('//doc'):
            yield doc
        current_row += self.MAX_ROWS_PER_REQUEST

def harvest(self, start_date=None, end_date=None):
    '''First get the ids of all recently updated datasets, then fetch the
    XML for each one and wrap it, with its metadata, in a RawDocument.
    '''
    start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
    end_date = (end_date or date.today()).isoformat()
    start_date += 'T00:00:00Z'
    end_date += 'T00:00:00Z'

    # grab each of those urls for full content
    xml_list = []
    xml_base_url = self.canonical_base_url + '&view=xml'
    for dataset_id in self.query_by_date(start_date, end_date):
        item_url = str(xml_base_url).format(dataset_id)
        try:
            content = requests.get(item_url, throttle=2)
        except exceptions.ConnectionError as e:
            # Back off briefly, then retry once before giving up
            logger.info('Connection error: {}, wait a bit...'.format(e))
            time.sleep(30)
            content = requests.get(item_url)
        doc = etree.XML(content.content)
        record = etree.tostring(doc, encoding=self.DEFAULT_ENCODING)
        xml_list.append(RawDocument({
            'doc': record,
            'source': self.short_name,
            'docID': copy_to_unicode(dataset_id),
            'filetype': 'xml',
        }))
    return xml_list

def test_get_calls_get(self, monkeypatch):
    mock_request = mock.Mock()
    monkeypatch.setattr(requests, "record_or_load_response", mock_request)
    requests.get("test", tota="dyle")
    mock_request.assert_called_once_with("get", "test", tota="dyle")

def get_records(self, days_back):
    '''Helper function to page through the DataONE Solr API for the last
    days_back days. Yields each result as an etree <doc> element.
    '''
    # Truncate both endpoints of the range to midnight UTC
    to_date = datetime.utcnow()
    from_date = datetime.utcnow() - timedelta(days=days_back)
    to_date = to_date.replace(hour=0, minute=0, second=0, microsecond=0)
    from_date = from_date.replace(hour=0, minute=0, second=0, microsecond=0)

    query = 'dateModified:[{}Z TO {}Z]'.format(from_date.isoformat(), to_date.isoformat())

    # Ask for a single row first, just to learn the total number of results
    doc = requests.get(DATAONE_SOLR_ENDPOINT, params={
        'q': query,
        'start': 0,
        'rows': 1
    })
    doc = etree.XML(doc.content)
    rows = int(doc.xpath("//result/@numFound")[0])

    # Then page through the full result set 1000 rows at a time
    n = 0
    while n < rows:
        data = requests.get(DATAONE_SOLR_ENDPOINT, params={
            'q': query,
            'start': n,
            'rows': 1000
        })
        docs = etree.XML(data.content).xpath('//doc')
        for doc in docs:
            yield doc
        n += 1000

def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()
    base_url = 'http://www.osti.gov/pages/pagesxml?nrows={0}&EntryDateFrom={1}&EntryDateTo={2}'

    # Fetch a single record first, just to read the total record count
    url = base_url.format('1', format_date_with_slashes(start_date),
                          format_date_with_slashes(end_date))
    initial_data = requests.get(url)
    record_encoding = initial_data.encoding
    initial_doc = etree.XML(initial_data.content)
    num_results = int(initial_doc.xpath('//records/@count', namespaces=self.namespaces)[0])

    # Then fetch all records in one request; the dates must be slash-formatted here too
    url = base_url.format(num_results, format_date_with_slashes(start_date),
                          format_date_with_slashes(end_date))
    data = requests.get(url)
    doc = etree.XML(data.content)
    records = doc.xpath('records/record')

    xml_list = []
    for record in records:
        doc_id = record.xpath('dc:ostiId/node()', namespaces=self.namespaces)[0]
        record = etree.tostring(record, encoding=record_encoding)
        xml_list.append(RawDocument({
            'doc': record,
            'source': self.short_name,
            'docID': copy_to_unicode(doc_id),
            'filetype': 'xml'
        }))
    return xml_list

def fetch_rows(self, start_date, end_date):
    query = 'publication_date:[{}T00:00:00Z TO {}T00:00:00Z]'.format(start_date, end_date)
    resp = requests.get(self.BASE_URL, params={
        'q': query,
        'rows': '0',
        'api_key': PLOS_API_KEY,
    })
    total_rows = etree.XML(resp.content).xpath('//result/@numFound')
    total_rows = int(total_rows[0]) if total_rows else 0

    current_row = 0
    while current_row < total_rows:
        response = requests.get(self.BASE_URL, throttle=5, params={
            'q': query,
            'start': current_row,
            'api_key': PLOS_API_KEY,
            'rows': self.MAX_ROWS_PER_REQUEST,
        })
        for doc in etree.XML(response.content).xpath('//doc'):
            yield doc
        current_row += self.MAX_ROWS_PER_REQUEST

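# A minimal usage sketch (not from the original module): drain the
# fetch_rows generator above into RawDocuments. The harvester instance,
# the Solr <doc> layout (<str name="id">), and the field names are
# assumptions mirroring the harvest patterns elsewhere in this file.
def collect_plos_docs(harvester, start_date, end_date):
    raw_docs = []
    for doc in harvester.fetch_rows(start_date.isoformat(), end_date.isoformat()):
        raw_docs.append(RawDocument({
            'doc': etree.tostring(doc),
            'source': harvester.short_name,
            'docID': copy_to_unicode(doc.xpath("str[@name='id']/node()")[0]),
            'filetype': 'xml',
        }))
    return raw_docs
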
def harvest(self, start_date=None, end_date=None):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()
    base_url = 'http://api.crossref.org/v1/works?filter=from-pub-date:{},until-pub-date:{}&rows={{}}&offset={{}}'.format(
        start_date.isoformat(), end_date.isoformat())

    # A zero-row request reports how many works match the date filter
    total = requests.get(base_url.format('0', '0')).json()['message']['total-results']
    logger.info('{} documents to be harvested'.format(total))

    # Page through the works 1000 at a time
    doc_list = []
    for i in xrange(0, total, 1000):
        records = requests.get(base_url.format(1000, i)).json()['message']['items']
        logger.info('Harvested {} documents'.format(i + len(records)))
        for record in records:
            doc_id = record['DOI']
            doc_list.append(RawDocument({
                'doc': json.dumps(record),
                'source': self.short_name,
                'docID': doc_id,
                'filetype': 'json'
            }))
    return doc_list

def test_force_makes_request(self, mock_requests, monkeypatch):
    mock_requests.request.return_value = mock.Mock(
        ok=True,
        encoding="utf-8",
        content="rawr",
        status_code=200,
        headers={"tota": "dyle"}
    )
    requests.get("dinosaurs.sexy", force=True)
    assert mock_requests.request.called is True

def harvest(self, days_back=1):
    today = date.today()
    start_date = today - timedelta(days_back)
    base_url = 'http://www.osti.gov/pages/pagesxml?nrows={0}&EntryDateFrom={1}&EntryDateTo={2}'

    # Fetch a single record first, just to read the total record count
    url = base_url.format('1', start_date.strftime('%m/%d/%Y'), today.strftime('%m/%d/%Y'))
    initial_data = requests.get(url)
    record_encoding = initial_data.encoding
    initial_doc = etree.XML(initial_data.content)
    num_results = int(initial_doc.xpath('//records/@count', namespaces=self.namespaces)[0])

    # Then fetch all records in one request
    url = base_url.format(num_results, start_date.strftime('%m/%d/%Y'), today.strftime('%m/%d/%Y'))
    data = requests.get(url)
    doc = etree.XML(data.content)
    records = doc.xpath('records/record')

    xml_list = []
    for record in records:
        doc_id = record.xpath('dc:ostiId/node()', namespaces=self.namespaces)[0]
        record = etree.tostring(record, encoding=record_encoding)
        xml_list.append(RawDocument({
            'doc': record,
            'source': self.short_name,
            'docID': copy_to_unicode(doc_id),
            'filetype': 'xml'
        }))
    return xml_list

def test_record_or_load_response_respects_record_true(self, mock_requests, monkeypatch):
    mock_rec_or_load = mock.Mock()
    monkeypatch.setattr(requests, "record_or_load_response", mock_rec_or_load)
    requests.get("foo")
    mock_rec_or_load.assert_called_once_with("get", "foo")
    # the raw request is routed through record_or_load_response, mocked out here
    assert not mock_requests.request.called

def test_record_or_load_response_respects_record_false(self, mock_requests, monkeypatch):
    mock_rec_or_load = mock.Mock()
    monkeypatch.setattr(requests.settings, "RECORD_HTTP_TRANSACTIONS", False)
    monkeypatch.setattr(requests, "record_or_load_response", mock_rec_or_load)
    requests.get("foo")
    assert not mock_rec_or_load.called
    assert mock_requests.request.called

def get_records(self, search_url):
    all_lessons = []

    # The last page of results reveals the highest lesson id so far
    resp = requests.get(self.URL + '?page=last').json()
    last_lesson_id = resp['lessons'][-1]['id']

    # Probe every id up to that maximum; 403/404 responses are expected
    # for private or deleted lessons and are simply skipped
    for pk in range(last_lesson_id + 1):
        lesson = requests.get(search_url + "/" + str(pk), expected=[200, 403, 404])
        if lesson.status_code == 200:
            lesson_list = lesson.json()['lessons'][0]
            all_lessons.append(lesson_list)
    return all_lessons

def test_force_makes_new_request(self, mock_requests, monkeypatch):
    requests.HarvesterResponse(ok=True, method='get', url='dinosaurs.sexy',
                               content='citychicken').save()
    mock_requests.request.return_value = mock.Mock(
        encoding='utf-8',
        content='Snapcity',
        status_code=200,
        headers={'tota': 'dyle'})

    # Without force the recorded response is returned
    resp = requests.get('dinosaurs.sexy')
    assert resp.content == 'citychicken'
    assert mock_requests.request.called is False

    # With force=True a fresh request is made
    resp = requests.get('dinosaurs.sexy', force=True)
    assert resp.content == 'Snapcity'
    assert mock_requests.request.called is True

def get_records(self, search_url):
    # The awards endpoint returns at most 25 awards per request;
    # default to an empty list when no awards come back
    records = requests.get(search_url).json()['response'].get('award', [])
    offset = 1
    all_records = []
    while len(records) == 25:
        # A full page suggests there may be more; fetch the next offset
        for record in records:
            all_records.append(record)
        offset += 25
        records = requests.get(
            search_url + '&offset={}'.format(str(offset)),
            throttle=3).json()['response'].get('award', [])
    all_records.extend(records)
    return all_records

def get_records(self, search_url):
    records = requests.get(search_url)
    total_records = records.json()['data']['total_count']

    # Page through the result set MAX_ITEMS_PER_REQUEST items at a time
    start = 0
    all_records = []
    while len(all_records) < total_records:
        records = requests.get(search_url + '&start={}'.format(str(start)))
        record_list = records.json()['data']['items']
        for record in record_list:
            all_records.append(record)
        start += self.MAX_ITEMS_PER_REQUEST
    return all_records

def harvest(self, start_date=None, end_date=None):
    start_date = (start_date or date.today() - timedelta(settings.DAYS_BACK)).isoformat()
    end_date = (end_date or date.today()).isoformat()
    start_date += 'T00:00:00Z'
    end_date += 'T00:00:00Z'

    base_url = 'https://www.earthsystemgrid.org/oai/repository?verb=ListRecords&metadataPrefix=dif&from={}&until={}'
    url = base_url.format(start_date, end_date)
    data = requests.get(url)
    doc = etree.XML(data.content)
    records = doc.xpath('//OAI-PMH:record', namespaces=self.namespaces)

    xml_list = []
    for record in records:
        # Use a relative xpath so each record yields its own identifier,
        # not the first identifier in the document
        doc_id = record.xpath('OAI-PMH:header/OAI-PMH:identifier/node()',
                              namespaces=self.namespaces)[0]
        record = etree.tostring(record)
        xml_list.append(RawDocument({
            'doc': record,
            'source': self.short_name,
            'docID': copy_to_unicode(doc_id),
            'filetype': 'xml'
        }))
    return xml_list

def get_records(self, search_urls):
    all_records_from_all_days = []
    for search_url in search_urls:
        records = requests.get(search_url).json()
        index = 1
        total_records = int(records['result'][0]['total'])
        all_records = []
        while len(all_records) < total_records:
            record_list = records['records']
            all_records += record_list
            index += 100
            # throttle between page requests
            records = requests.get(search_url + '&s={}'.format(index), throttle=10).json()
        all_records_from_all_days = all_records_from_all_days + all_records
    return all_records_from_all_days

def _fetch_records(self, start_date, end_date):
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    # Page through the results until the API reports no more pages
    page = 0
    morepages = True
    while morepages:
        resp = requests.get(self.base_url, params={
            'page': page,
            'EntryDateTo': format_date_with_slashes(end_date),
            'EntryDateFrom': format_date_with_slashes(start_date),
        })
        xml = etree.XML(resp.content)
        for record in xml.xpath('records/record'):
            yield record
        page += 1
        morepages = xml.xpath('//records/@morepages')[0] == 'true'

def query_by_date(self, start_date, end_date):
    '''Use OAI-PMH interface to get a list of dataset ids for the given date range'''
    search_url_end = '&metadataPrefix=oai_dc&from={}&until={}'.format(start_date, end_date)
    search_url = self.search_base_url + search_url_end
    while True:
        record_list = requests.get(search_url)
        record_list_xml = etree.XML(record_list.content)
        if record_list_xml.xpath('./oai_dc:error', namespaces=self.oai_ns):
            break
        for dataset in record_list_xml.xpath('./oai_dc:ListRecords/oai_dc:record',
                                             namespaces=self.oai_ns):
            yield dataset.xpath('./oai_dc:header/oai_dc:identifier/node()',
                                namespaces=self.oai_ns)[0]
        token = record_list_xml.xpath('./oai_dc:ListRecords/oai_dc:resumptionToken/node()',
                                      namespaces=self.oai_ns)
        if not token:
            break
        search_url = self.search_base_url + '&resumptionToken=' + token[0]

def harvest(self, start_date=None, end_date=None):
    """ Return a list of RawDocuments """
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    base_url = 'http://exporter.nih.gov/'
    table_url = 'http://exporter.nih.gov/ExPORTER_Catalog.aspx/'

    # get ExPORTER page html and rows storing records
    html = requests.get(table_url).content
    soup = BeautifulSoup(html, 'lxml')
    table = soup.find('table', id="ContentPlaceHolder1_ProjectData_dgProjectData")
    rows = table.find_all('tr', class_="row_bg")
    urls = list(construct_urls(base_url, start_date, end_date, rows))

    return [
        RawDocument({
            'doc': etree.tostring(record, encoding=self.DEFAULT_ENCODING),
            'source': self.short_name,
            'docID': copy_to_unicode(record.xpath('.//APPLICATION_ID/node()',
                                                  namespaces=self.namespaces)[0]),
            'filetype': 'xml'
        })
        for record in xml_records(get_xml_files(urls))
    ]

def oai_get_records_and_token(url, throttle, force, namespaces, verify):
    """ Helper function to get the records and any resumptionToken
    from an OAI request. Takes a url and any request parameters and
    returns the records along with the resumptionToken if there is one.
    """
    data = requests.get(url, throttle=throttle, force=force, verify=verify)

    encoding = data.encoding or 'utf-8'
    if encoding.lower() == 'none':
        encoding = 'utf-8'

    parser = etree.XMLParser(recover=True, encoding=encoding)
    doc = etree.XML(data.content, parser=parser)

    records = doc.xpath('//ns0:record', namespaces=namespaces)
    token = doc.xpath('//ns0:resumptionToken/node()', namespaces=namespaces)
    return records, token

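# A minimal driver sketch (not from the original module) showing how a
# harvester might chase resumptionTokens with the helper above. The
# endpoint construction and the throttle/force/verify values here are
# illustrative assumptions, not part of oai_get_records_and_token.
def oai_list_all_records(base_url, namespaces):
    # Request the first ListRecords page, then keep following
    # resumptionTokens until the repository stops returning one
    url = base_url + '?verb=ListRecords&metadataPrefix=oai_dc'
    all_records, token = oai_get_records_and_token(
        url, throttle=1, force=False, namespaces=namespaces, verify=True)
    while token:
        url = base_url + '?verb=ListRecords&resumptionToken=' + token[0]
        records, token = oai_get_records_and_token(
            url, throttle=1, force=False, namespaces=namespaces, verify=True)
        all_records.extend(records)
    return all_records
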
def test_record_or_load_params(self, mock_requests, monkeypatch):
    mock_requests.request.return_value = mock.Mock(
        encoding='utf-8',
        content='Snapcity',
        status_code=200,
        headers={'tota': 'dyle'})
    resp = requests.get('dinosaurs.sexy', params={'test': 'foo'})
    assert resp.status_code == 200
    assert resp.url == 'dinosaurs.sexy?test=foo'

def test_record_or_load_remakes(self, mock_requests, monkeypatch):
    mock_requests.request.return_value = mock.Mock(
        encoding='utf-8',
        content='rawr',
        status_code=200,
        headers={'tota': 'dyle'})
    requests.HarvesterResponse(ok=False, method='get', url='dinosaurs.sexy').save()

    model = requests.HarvesterResponse.get(method='get', url='dinosaurs.sexy')
    assert not model.ok
    assert model.method == 'get'
    assert model.url == 'dinosaurs.sexy'

    resp = requests.get('dinosaurs.sexy')

    model = requests.HarvesterResponse.get(method='get', url='dinosaurs.sexy')
    assert model.method == 'get'
    assert model.content == b'rawr'
    assert model.encoding == 'utf-8'
    assert model.status_code == 200
    assert model.url == 'dinosaurs.sexy'
    assert model.headers == {'tota': 'dyle'}
    assert model.headers_str == '{"tota": "dyle"}'
    assert isinstance(resp, requests.HarvesterResponse)

def get_records(self, search_url):
    records = requests.get(search_url)
    total_records = records.json()['recordCount']
    logger.info('Harvesting {} records'.format(total_records))

    page_number = 1
    count = 0
    while records.json()['records']:
        record_list = records.json()['records']
        for record in record_list:
            count += 1
            yield record
        page_number += 1
        records = requests.get(search_url + '&page_number={}'.format(page_number), throttle=3)

    logger.info('{} documents harvested'.format(count))

def fetch_commits(base_url, start_date, end_date):
    # Page through the commits endpoint, concatenating the raw JSON pages
    jsonstr = ""
    i = 1
    while True:
        resp = requests.get(base_url, params={
            'since': start_date,
            'until': end_date,
            'page': i,
            'per_page': 100,
        })
        jsonchunk = resp.content.decode('utf-8')
        if len(jsonchunk) <= 2:
            # '[]' means an empty page: no more commits
            break
        i += 1

        # Split the JSON array into one object per line, dropping the brackets
        jsonchunk = jsonchunk.replace('},{', '}\n{')
        jsonchunk = jsonchunk[1:-1]
        jsonstr = jsonstr + "\n" + jsonchunk

    jsonarr = jsonstr.split('\n')[1:]
    shas = []
    for jsonstring in jsonarr:
        jsonobj = json.loads(jsonstring)
        shas.append(jsonobj['sha'])
    return shas

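# The string splicing above reassembles JSON by hand. A sketch (not from
# the original module) of the same pagination that instead parses each
# page with json.loads, assuming each page body is a JSON array of
# commit objects, as GitHub's commits endpoint returns.
def fetch_commit_shas(base_url, start_date, end_date):
    shas = []
    page = 1
    while True:
        resp = requests.get(base_url, params={
            'since': start_date,
            'until': end_date,
            'page': page,
            'per_page': 100,
        })
        commits = json.loads(resp.content.decode('utf-8'))
        if not commits:
            # An empty array means we have paged past the last commit
            break
        shas.extend(commit['sha'] for commit in commits)
        page += 1
    return shas
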
def get_records(self, search_url):
    records = requests.get(search_url)
    total_records = records.json()['items_found']

    # Page through the results, stopping once total_records have been collected
    page = 1
    all_records = []
    while len(all_records) < total_records:
        record_list = records.json()['items']
        for record in record_list:
            if len(all_records) < total_records:
                all_records.append(record)
        page += 1
        records = requests.get(search_url + '&page={}'.format(str(page)), throttle=3)
    return all_records

def fetch_file_names(commit_url, sha):
    resp = requests.get(commit_url.format(sha))
    jsonstr = resp.content.decode('utf-8')
    jsonobj = json.loads(jsonstr)
    files = [d['filename'] for d in jsonobj['files']]
    return files

def get_records(self, search_url):
    records = requests.get(search_url)
    total = int(records.json()['counts']['registration'])

    # Page through the registrations 1000 at a time
    from_arg = 0
    all_records = []
    while len(all_records) < total:
        record_list = records.json()['results']
        for record in record_list:
            all_records.append(record)
        from_arg += 1000
        records = requests.get(search_url + '&from={}'.format(str(from_arg)), throttle=10)
    return all_records

def get_records(self, search_url):
    # Appending today's date as a fragment makes the request URL unique per day
    records = requests.get(search_url + "#{}".format(datetime.date.today()))

    # Keep fetching pages until one comes back empty
    page = 1
    all_records = []
    current_records = len(records.json()['entries'])
    while current_records > 0:
        record_list = records.json()['entries']
        for record in record_list:
            all_records.append(record)
        page += 1
        records = requests.get(
            search_url + '&page={}#{}'.format(str(page), datetime.date.today()),
            throttle=10)
        current_records = len(records.json()['entries'])
    return all_records

def test_record_or_load_loads(self, mock_requests, monkeypatch):
    requests.HarvesterResponse(ok=True, method='get', url='dinosaurs.sexy',
                               content='rawr', headers_str="{}").save()
    resp = requests.get('dinosaurs.sexy')
    assert resp.headers == {}
    assert resp.content == 'rawr'
    assert not mock_requests.request.called
    assert isinstance(resp, requests.HarvesterResponse)

def test_request_doesnt_throttle_on_load(self, mock_requests, monkeypatch):
    mock_sleep = mock.Mock()
    monkeypatch.setattr(requests.time, 'sleep', mock_sleep)
    requests.HarvesterResponse(ok=True, method='get', url='dinosaurs.sexy',
                               content='citychicken').save()
    resp = requests.get('dinosaurs.sexy', throttle=2)
    assert mock_sleep.called is False
    assert mock_requests.request.called is False
    assert isinstance(resp, requests.HarvesterResponse)

def test_record_or_load_throttle_throttles(self, mock_requests, monkeypatch):
    mock_sleep = mock.Mock()
    monkeypatch.setattr(requests.time, 'sleep', mock_sleep)
    mock_requests.request.return_value = mock.Mock(
        encoding='utf-8',
        content='Snapcity',
        status_code=200,
        headers={'tota': 'dyle'})
    resp = requests.get('dinosaurs.sexy', throttle=2)
    mock_sleep.assert_called_once_with(2)
    assert mock_requests.request.called is True
    assert isinstance(resp, requests.HarvesterResponse)