def _fetch_records(self, start_date, end_date):
    """Yield every ``records/record`` element found between two entry dates.

    Pages are requested from ``self.base_url`` one at a time, starting at
    page 0, until the response's ``records`` element no longer carries
    ``morepages="true"``.

    :param start_date: lower bound sent as ``EntryDateFrom``; a falsy value
        falls back to today minus ``settings.DAYS_BACK`` days.
    :param end_date: upper bound sent as ``EntryDateTo``; a falsy value
        falls back to today.
    :raises IndexError: if a response has no ``morepages`` attribute.
    """
    if not start_date:
        start_date = date.today() - timedelta(settings.DAYS_BACK)
    if not end_date:
        end_date = date.today()

    page_number = 0
    while True:
        response = requests.get(self.base_url, params={
            'page': page_number,
            'EntryDateTo': format_date_with_slashes(end_date),
            'EntryDateFrom': format_date_with_slashes(start_date),
        })
        tree = etree.XML(response.content)

        for entry in tree.xpath('records/record'):
            yield entry

        page_number += 1
        # The server signals the last page by setting morepages to
        # anything other than the literal string 'true'.
        if tree.xpath('//records/@morepages')[0] != 'true':
            break
def harvest(self, start_date=None, end_date=None):
    """Fetch all records entered in a date window and wrap them as raw documents.

    Two requests are made: a one-row probe to read the total ``count``
    attribute, then a second request asking for that many rows at once.

    :param start_date: first entry date to include; defaults to today minus
        ``settings.DAYS_BACK`` days when falsy.
    :param end_date: last entry date to include; defaults to today when falsy.
    :returns: list of ``RawDocument`` objects, one per ``records/record``
        element, each holding the serialized record XML and its ``dc:ostiId``.
    :raises IndexError: if the probe response has no ``count`` attribute or
        a record has no ``dc:ostiId`` content.
    """
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    base_url = 'http://www.osti.gov/pages/pagesxml?nrows={0}&EntryDateFrom={1}&EntryDateTo={2}'

    # Format the window once and reuse it for both requests. Previously the
    # second request interpolated the raw date values, so the full fetch was
    # issued with a different date format than the probe that sized it.
    date_from = format_date_with_slashes(start_date)
    date_to = format_date_with_slashes(end_date)

    # Probe with a single row just to learn how many records match.
    initial_data = requests.get(base_url.format('1', date_from, date_to))
    record_encoding = initial_data.encoding
    initial_doc = etree.XML(initial_data.content)
    num_results = int(initial_doc.xpath('//records/@count', namespaces=self.namespaces)[0])

    # Ask for every matching record in one response.
    data = requests.get(base_url.format(num_results, date_from, date_to))
    doc = etree.XML(data.content)

    xml_list = []
    for record in doc.xpath('records/record'):
        doc_id = record.xpath('dc:ostiId/node()', namespaces=self.namespaces)[0]
        # Serialize using the encoding reported for the probe response.
        serialized = etree.tostring(record, encoding=record_encoding)
        xml_list.append(RawDocument({
            'doc': serialized,
            'source': self.short_name,
            'docID': copy_to_unicode(doc_id),
            'filetype': 'xml'
        }))

    return xml_list
def _fetch_records(self, start_date, end_date):
    """Generate the record elements entered within a date window, page by page.

    Requests ``self.base_url`` repeatedly with an increasing ``page``
    parameter and yields each ``records/record`` element, stopping once the
    ``morepages`` attribute of the response is not ``'true'``.

    :param start_date: beginning of the window (``EntryDateFrom``); when
        falsy, today minus ``settings.DAYS_BACK`` days is used.
    :param end_date: end of the window (``EntryDateTo``); when falsy, today
        is used.
    :raises IndexError: if a response lacks the ``morepages`` attribute.
    """
    page = 0
    morepages = True
    start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
    end_date = end_date or date.today()

    while morepages:
        resp = requests.get(self.base_url, params={
            'page': page,
            # "To" is the end of the window and "From" is the start; these
            # were previously swapped, which inverted the requested range.
            'EntryDateTo': format_date_with_slashes(end_date),
            'EntryDateFrom': format_date_with_slashes(start_date),
        })
        xml = etree.XML(resp.content)

        for record in xml.xpath('records/record'):
            yield record

        page += 1
        morepages = xml.xpath('//records/@morepages')[0] == 'true'