예제 #1
0
    def _fetch_records(self, start_date, end_date):
        """Yield every ``records/record`` node from the paged XML endpoint.

        Pages are requested from ``self.base_url`` starting at page 0. After
        each page is yielded, the first ``//records/@morepages`` value is
        read; paging continues only while it equals the string ``'true'``.

        :param start_date: lower bound sent as ``EntryDateFrom``; when falsy,
            defaults to today minus ``settings.DAYS_BACK`` days.
        :param end_date: upper bound sent as ``EntryDateTo``; when falsy,
            defaults to today.
        :raises IndexError: if a response carries no ``morepages`` attribute
            on a ``records`` element.

        Because this is a generator, nothing (including the date defaults)
        is evaluated until iteration begins.
        """
        if not start_date:
            start_date = date.today() - timedelta(settings.DAYS_BACK)
        if not end_date:
            end_date = date.today()

        page_number = 0
        while True:
            query = {
                'page': page_number,
                'EntryDateTo': format_date_with_slashes(end_date),
                'EntryDateFrom': format_date_with_slashes(start_date),
            }
            response = requests.get(self.base_url, params=query)
            tree = etree.XML(response.content)

            for node in tree.xpath('records/record'):
                yield node

            page_number += 1
            # The service signals the last page through the morepages flag.
            if tree.xpath('//records/@morepages')[0] != 'true':
                break
예제 #2
0
    def harvest(self, start_date=None, end_date=None):
        """Fetch all records entered in a date window and wrap them as RawDocuments.

        Two requests are made against the same URL template: the first asks
        for a single row only to read the ``count`` attribute of ``records``;
        the second asks for ``count`` rows to retrieve everything at once.

        :param start_date: lower bound of the window (``EntryDateFrom``);
            when falsy, defaults to today minus ``settings.DAYS_BACK`` days.
        :param end_date: upper bound of the window (``EntryDateTo``); when
            falsy, defaults to today.
        :returns: a list of ``RawDocument`` objects, one per ``records/record``
            node, each holding the serialized XML, ``self.short_name`` as the
            source, the ``dc:ostiId`` value as ``docID`` and filetype ``'xml'``.
        :raises IndexError: if the ``count`` attribute is missing from the
            first response, or a record has no ``dc:ostiId`` content.
        """
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        # Format the window once so that both requests send identical,
        # slash-formatted dates (the second request previously interpolated
        # the raw date objects).
        date_from = format_date_with_slashes(start_date)
        date_to = format_date_with_slashes(end_date)

        base_url = 'http://www.osti.gov/pages/pagesxml?nrows={0}&EntryDateFrom={1}&EntryDateTo={2}'

        # Probe request: one row is enough to learn the total result count.
        initial_data = requests.get(base_url.format('1', date_from, date_to))
        record_encoding = initial_data.encoding
        initial_doc = etree.XML(initial_data.content)

        num_results = int(initial_doc.xpath('//records/@count', namespaces=self.namespaces)[0])

        # Full request: ask for every matching row in a single response.
        data = requests.get(base_url.format(num_results, date_from, date_to))
        doc = etree.XML(data.content)

        xml_list = []
        for record in doc.xpath('records/record'):
            doc_id = record.xpath('dc:ostiId/node()', namespaces=self.namespaces)[0]
            # Serialize with the encoding reported for the probe response.
            serialized = etree.tostring(record, encoding=record_encoding)
            xml_list.append(RawDocument({
                'doc': serialized,
                'source': self.short_name,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

        return xml_list
예제 #3
0
    def _fetch_records(self, start_date, end_date):
        """Yield every ``records/record`` node from the paged XML endpoint.

        Requests ``self.base_url`` page by page, starting at page 0, and
        keeps going while the first ``//records/@morepages`` value of the
        latest response equals the string ``'true'``.

        :param start_date: lower bound of the window, sent as
            ``EntryDateFrom``; when falsy, defaults to today minus
            ``settings.DAYS_BACK`` days.
        :param end_date: upper bound of the window, sent as ``EntryDateTo``;
            when falsy, defaults to today.
        :raises IndexError: if a response carries no ``morepages`` attribute
            on a ``records`` element.
        """
        page = 0
        morepages = True

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        while morepages:
            # "From" is the start of the window and "To" is its end; these
            # were previously swapped, which inverted the requested range.
            resp = requests.get(self.base_url, params={
                'page': page,
                'EntryDateFrom': format_date_with_slashes(start_date),
                'EntryDateTo': format_date_with_slashes(end_date),
            })

            xml = etree.XML(resp.content)

            for record in xml.xpath('records/record'):
                yield record

            page += 1
            # The service flags whether another page follows this one.
            morepages = xml.xpath('//records/@morepages')[0] == 'true'