Example #1
    def harvest(self, start_date=None, end_date=None):
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        base_url = 'http://api.crossref.org/v1/works?filter=from-pub-date:{},until-pub-date:{}&rows={{}}&offset={{}}'.format(
            start_date.isoformat(), end_date.isoformat())
        total = requests.get(base_url.format(
            '0', '0')).json()['message']['total-results']
        logger.info('{} documents to be harvested'.format(total))

        doc_list = []
        for i in range(0, total, 1000):
            records = requests.get(base_url.format(
                1000, i)).json()['message']['items']
            logger.info('Harvested {} documents'.format(i + len(records)))

            for record in records:
                doc_id = record['DOI']
                doc_list.append(
                    RawDocument({
                        'doc': json.dumps(record),
                        'source': self.short_name,
                        'docID': doc_id,
                        'filetype': 'json'
                    }))

        return doc_list
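Every example on this page wraps harvested records in the same four-key RawDocument. The real class lives in the harvester codebase; the stand-in below is only an assumed sketch of its shape, enough to run the snippets in isolation:

# Hypothetical stand-in for the real RawDocument class -- an assumption about
# its shape, not the actual implementation. It only checks that the four keys
# every example supplies are present; extra keys (see Example #20) pass through.
REQUIRED_KEYS = {'doc', 'docID', 'source', 'filetype'}


class RawDocument(dict):
    def __init__(self, attributes):
        missing = REQUIRED_KEYS - set(attributes)
        if missing:
            raise ValueError('RawDocument missing keys: {}'.format(sorted(missing)))
        super(RawDocument, self).__init__(attributes)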
Example #2
    def harvest(self, start_date=None, end_date=None):
        """
        Return a list of RawDocuments
        """
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        base_url = 'http://exporter.nih.gov/'
        table_url = 'http://exporter.nih.gov/ExPORTER_Catalog.aspx/'

        # get ExPORTER page html and rows storing records
        html = requests.get(table_url).content
        soup = BeautifulSoup(html, 'lxml')
        table = soup.find('table',
                          id="ContentPlaceHolder1_ProjectData_dgProjectData")
        rows = table.find_all('tr', class_="row_bg")
        urls = list(construct_urls(base_url, start_date, end_date, rows))

        return [
            RawDocument({
                'doc': etree.tostring(record, encoding=self.DEFAULT_ENCODING),
                'source': self.short_name,
                'docID': copy_to_unicode(
                    record.xpath('.//APPLICATION_ID/node()',
                                 namespaces=self.namespaces)[0]),
                'filetype': 'xml'
            }) for record in xml_records(get_xml_files(urls))
        ]
Example #3
    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        delta = end_date - start_date

        dates = [start_date + timedelta(days=i) for i in range(delta.days + 1)]

        search_urls = []
        for day in dates:
            self.URL.args['q'] = 'date:{}'.format(day)
            search_urls.append(self.URL.url)

        records = self.get_records(search_urls)
        records_list = []
        for record in records:
            publisher = record['publisher']
            if publisher.lower() == 'biomed central':
                continue  # BioMed Central records are skipped by this harvester
            if publisher.lower() != 'springer':
                logger.info(
                    'Found non-springer source in springer api: {}'.format(
                        publisher))
            records_list.append(
                RawDocument({
                    'doc': json.dumps(record),
                    'source': self.short_name,
                    'docID': record['identifier'],
                    'filetype': 'json'
                }))
        return records_list
Example #4
    def harvest(self, start_date=None, end_date=None):
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        # End at least one day in the past
        end_date = (end_date or date.today()) - timedelta(1)

        search_url = '{0}{1}&dateEnd={2}'.format(
            self.URL, start_date.strftime('%m/%d/%Y'),
            end_date.strftime('%m/%d/%Y'))

        records = self.get_records(search_url)

        record_list = []
        for record in records:
            doc_id = record['id']

            record_list.append(
                RawDocument({
                    'doc': json.dumps(record),
                    'source': self.short_name,
                    'docID': six.text_type(doc_id),
                    'filetype': 'json'
                }))

        return record_list
Example #5
    def harvest(self, start_date=None, end_date=None):
        start_date = (start_date or date.today() -
                      timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()

        query = furl.furl(self.URL)
        query.args['type'] = self.TYPE
        query.args['per_page'] = self.MAX_ITEMS_PER_REQUEST
        query.args['key'] = HARVARD_DATAVERSE_API_KEY
        query.args['sort'] = 'date'
        query.args['order'] = 'asc'
        query.args['fq'] = 'dateSort:[{}T00:00:00Z TO {}T00:00:00Z]'.format(
            start_date, end_date)

        records = self.get_records(query.url)
        record_list = []
        for record in records:
            doc_id = record['global_id']

            record_list.append(
                RawDocument({
                    'doc': json.dumps(record),
                    'source': self.short_name,
                    'docID': doc_id,
                    'filetype': 'json'
                }))

        return record_list
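For reference, furl assembles these arguments into one percent-encoded query string. A minimal standalone sketch of the same pattern, with an illustrative endpoint and made-up values rather than the harvester's real settings:

import furl

# Illustrative endpoint and values only
query = furl.furl('https://demo.dataverse.org/api/search')
query.args['per_page'] = 10
query.args['sort'] = 'date'
query.args['order'] = 'asc'
query.args['fq'] = 'dateSort:[2015-01-01T00:00:00Z TO 2015-02-01T00:00:00Z]'

# Prints the assembled URL; the brackets, colons and spaces in the
# fq value come out percent-encoded
print(query.url)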
Example #6
def consume(days_back=5):
    today = date.today()
    start_date = today - timedelta(days_back)
    # This repository uses YYYY-MM-DDThh:mm:ssZ timestamp granularity
    url = (OAI_DC_BASE_URL + '&metadataPrefix=oai_dc&from=' +
           str(start_date) + 'T00:00:00Z')

    print(url)
    record_encoding = requests.get(url).encoding
    records = get_records(url)

    xml_list = []
    for record in records:
        set_spec = record.xpath('ns0:header/ns0:setSpec/node()',
                                namespaces=NAMESPACES)[0]
        doc_id = record.xpath('ns0:header/ns0:identifier/node()',
                              namespaces=NAMESPACES)[0]
        record_string = etree.tostring(record, encoding=record_encoding)

        xml_list.append(
            RawDocument({
                'doc': record_string,
                'source': NAME,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

    return xml_list
Example #7
    def harvest(self, start_date=None, end_date=None):
        # Always harvest a 2 day period starting 2 days back to honor time given
        # to contributors to cancel a public registration
        start_date = start_date or date.today() - timedelta(4)
        end_date = end_date or date.today() - timedelta(2)

        search_url = self.URL.format(start_date.isoformat(), end_date.isoformat())
        records = self.get_records(search_url)

        record_list = []
        for record in records:
            doc_id = record['url'].replace('/', '')

            record_list.append(
                RawDocument(
                    {
                        'doc': json.dumps(record),
                        'source': self.short_name,
                        'docID': doc_id,
                        'filetype': 'json'
                    }
                )
            )

        return record_list
Example #8
    def harvest(self, start_date=None, end_date=None):
        """ Figshare should always have a 24 hour delay because they
        manually go through and check for test projects. Most of them
        are removed within 24 hours.

        So, we will shift everything back a day with harvesting to ensure
        nothing is harvested on the day of.
        """
        start_date = (start_date or date.today() -
                      timedelta(settings.DAYS_BACK)) - timedelta(1)
        end_date = (end_date or date.today()) - timedelta(1)

        search_url = '{0}{1}&to_date={2}'.format(
            self.URL,
            start_date.isoformat(),
            end_date.isoformat()
        )

        records = self.get_records(search_url)

        record_list = []
        for record in records:
            doc_id = record['article_id']

            record_list.append(
                RawDocument(
                    {
                        'doc': json.dumps(record),
                        'source': self.short_name,
                        'docID': six.text_type(doc_id),
                        'filetype': 'json'
                    }
                )
            )

        return record_list
Example #9
    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        records = self.get_records(start_date, end_date)

        xml_list = []
        for record in records:
            # This ID is unique per data package, but won't unify multiple packages for the same project
            doc_id = record.xpath("str[@name='id']")[0].text
            format_type = record.xpath("str[@name='formatType']")[0].text
            record = ElementTree.tostring(record,
                                          encoding=self.record_encoding)
            if format_type.lower() != 'metadata':
                logger.info(
                    'Not normalizing record with ID {}, type {}'.format(
                        doc_id, format_type))
            else:
                xml_list.append(
                    RawDocument({
                        'doc': record,
                        'source': self.short_name,
                        'docID': copy_to_unicode(doc_id),
                        'filetype': 'xml'
                    }))

        return xml_list
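Several harvesters on this page (Examples #9, #23, #25 and #27) pull IDs out of Solr-style XML, where each <doc> element carries named <str> children. A minimal sketch of that extraction against a made-up response fragment, assuming lxml:

from lxml import etree

# Made-up fragment shaped like a Solr search result, for illustration only
SOLR_DOC = b"""
<doc>
  <str name="id">example:record/123</str>
  <str name="formatType">METADATA</str>
</doc>
"""

record = etree.XML(SOLR_DOC)
doc_id = record.xpath("str[@name='id']")[0].text
format_type = record.xpath("str[@name='formatType']")[0].text
print(doc_id, format_type)  # example:record/123 METADATA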
Example #10
    def harvest(self, start_date=None, end_date=None):
        start_date = start_date or datetime.date.today() - datetime.timedelta(settings.DAYS_BACK)
        end_date = end_date or datetime.date.today()

        shas = fetch_commits(self.BASE_URL, start_date.isoformat(), end_date.isoformat())

        files = list(set(chain.from_iterable([
            fetch_file_names(self.BASE_COMMIT_URL, sha)
            for sha in shas])))

        files = [filename for filename in files if filename.endswith('.xml')]

        xml_records = [
            fetch_xml(self.BASE_DATA_URL, filename)
            for filename in files
        ]

        return [
            RawDocument({
                'filetype': 'xml',
                'source': self.short_name,
                'doc': etree.tostring(record),
                'docID': record.xpath('//article-id[@*]')[0].text,
            }) for record in xml_records
        ]
Example #11
    def harvest(self, start_date=None, end_date=None):
        """First, get a list of all recently updated study urls, then fetch
        the xml one by one and save it into a list of docs along with other
        information."""

        start_date = (start_date or date.today() -
                      timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()
        start_date += 'T00:00:00Z'
        end_date += 'T00:00:00Z'

        # grab each of those urls for full content
        xml_list = []
        xml_base_url = self.canonical_base_url + '&view=xml'
        for dataset_id in self.query_by_date(start_date, end_date):
            item_url = str(xml_base_url).format(dataset_id)
            try:
                # 'throttle' is presumably a keyword of the project's own
                # requests wrapper, not of the standard requests library
                content = requests.get(item_url, throttle=2)
            except exceptions.ConnectionError as e:
                logger.info('Connection error: {}, wait a bit...'.format(e))
                time.sleep(30)
                content = requests.get(item_url)
            doc = etree.XML(content.content)

            record = etree.tostring(doc, encoding=self.DEFAULT_ENCODING)
            xml_list.append(
                RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(dataset_id),
                    'filetype': 'xml',
                }))

        return xml_list
Example #12
def consume(days_back=1):
    changes_url = 'http://resync.library.cornell.edu/arxiv-all/changelist.xml'

    changelist = requests.get(changes_url)
    record_encoding = changelist.encoding
    changeXML = etree.XML(changelist.content)

    urls_for_info = changeXML.xpath('//urlset:loc/node()', namespaces=NAMESPACES)
    export_base = 'http://export.arxiv.org/api/query?search_query='

    xml_list = []
    print(len(urls_for_info))
    for url in urls_for_info:
        try:
            # new-style arXiv IDs: four digits, an optional dot, four more
            # digits, immediately after a slash
            arxiv_id = re.search(r'(?<=/)\d{4}\.?\d{4}', url).group(0)
        except AttributeError:
            print('Warning: malformed arxiv ID, skipping entry for {}'.format(url))
            continue

        export_url = export_base + arxiv_id

        record_request = requests.get(export_url)
        record_encoding = record_request.encoding
        record = etree.XML(record_request.content)

        xml_list.append(RawDocument({
                    'doc': etree.tostring(record),
                    'source': NAME,
                    'docID': copy_to_unicode(arxiv_id),
                    'filetype': 'xml'
                }))
        time.sleep(2)

    return xml_list
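The ID pattern above only matches new-style arXiv identifiers; note that identifiers issued from 2015 onward carry a five-digit suffix that this pattern truncates. A quick check of its behavior:

import re

# The pattern used in the consumer above
ARXIV_ID = re.compile(r'(?<=/)\d{4}\.?\d{4}')

print(ARXIV_ID.search('http://arxiv.org/abs/1401.2345').group(0))   # 1401.2345
print(ARXIV_ID.search('http://arxiv.org/abs/math/0211159'))         # None: old-style IDs don't match
print(ARXIV_ID.search('http://arxiv.org/abs/1501.01234').group(0))  # 1501.0123 -- truncated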
Example #13
    def harvest(self, start_date=None, end_date=None):

        start_date = (start_date or date.today() -
                      timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()

        if self.timezone_granularity:
            start_date += 'T00:00:00Z'
            end_date += 'T00:00:00Z'

        records_url = self.base_url + self.RECORDS_URL
        request_url = records_url + self.META_PREFIX_DATE.format(
            start_date, end_date)

        records = self.get_records(request_url, start_date, end_date)

        rawdoc_list = []
        for record in records:
            doc_id = record.xpath('ns0:header/ns0:identifier',
                                  namespaces=self.namespaces)[0].text
            record = etree.tostring(record, encoding=self.record_encoding)
            rawdoc_list.append(
                RawDocument({
                    'doc': record,
                    'source': util.copy_to_unicode(self.short_name),
                    'docID': util.copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))

        return rawdoc_list
Example #14
def consume(days_back=1, end_date=None, **kwargs):
    """Query the SciTech Connect database for raw XML.

    The XML is chunked into smaller pieces, each representing data about an
    article/report. If there are multiple pages of results, this function
    iterates through all the pages."""
    
    TODAY = datetime.date.today()
    start_date = (TODAY - datetime.timedelta(days_back)).strftime('%m/%d/%Y')
    base_url = 'http://www.osti.gov/scitech/scitechxml'
    parameters = kwargs
    parameters['EntryDateFrom'] = start_date
    parameters['EntryDateTo'] = end_date
    parameters['page'] = 0
    morepages = 'true'
    xml_list = []
    elements_url = 'http://purl.org/dc/elements/1.1/'

    while morepages == 'true':
        response = requests.get(base_url, params=parameters)
        record_encoding = response.encoding
        xml_root = etree.XML(response.text.encode('utf-8'))
        for record in xml_root.find('records'):
            doc_id = record.find(str(etree.QName(elements_url, 'ostiId'))).text
            xml_list.append(RawDocument({
                'doc': etree.tostring(record, encoding=record_encoding),
                'docID' : copy_to_unicode(doc_id),
                'source': NAME,
                'filetype': 'xml'
            }))
        parameters['page'] += 1
        morepages = xml_root.find('records').attrib['morepages']
    return xml_list
Example #15
    def harvest(self, start_date=None, end_date=None):

        start_date = (start_date or date.today() -
                      timedelta(settings.DAYS_BACK)).isoformat()
        end_date = (end_date or date.today()).isoformat()

        if self.timezone_granularity:
            start_date += 'T00:00:00Z'
            end_date += 'T00:00:00Z'

        url = furl(self.base_url)
        url.args['verb'] = 'ListRecords'
        url.args['metadataPrefix'] = 'oai_dc'
        url.args['from'] = start_date
        url.args['until'] = end_date

        records = self.get_records(url.url, start_date, end_date)

        rawdoc_list = []
        for record in records:
            doc_id = record.xpath('ns0:header/ns0:identifier',
                                  namespaces=self.namespaces)[0].text
            record = etree.tostring(record, encoding=self.record_encoding)
            rawdoc_list.append(
                RawDocument({
                    'doc': record,
                    'source': util.copy_to_unicode(self.short_name),
                    'docID': util.copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))

        return rawdoc_list
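The furl call above yields a standard OAI-PMH ListRecords request. A standalone sketch with an illustrative repository host in place of self.base_url:

from furl import furl

# Illustrative OAI-PMH endpoint, not a real harvester setting
url = furl('http://example.org/oai/request')
url.args['verb'] = 'ListRecords'
url.args['metadataPrefix'] = 'oai_dc'
url.args['from'] = '2015-03-01T00:00:00Z'
url.args['until'] = '2015-03-14T00:00:00Z'

# Prints the assembled URL; the colons in the timestamps are percent-encoded
print(url.url)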
Example #16
def consume(days_back=0):
    start_date = TODAY - timedelta(days_back)
    oai_dc_request = OAI_DC_BASE_URL + \
        '&metadataPrefix=oai_dc&from={}'.format(str(start_date))
    record_encoding = requests.get(oai_dc_request).encoding

    print('oai_dc request: ' + oai_dc_request)

    records = get_records(oai_dc_request)
    print('{} records collected...'.format(len(records)))

    xml_list = []
    for record in records:
        # skip records that list no contributors
        contributors = record.xpath(
            '//dc:creator/node()', namespaces=NAMESPACES)
        if not contributors:
            continue
        doc_id = record.xpath(
            'ns0:header/ns0:identifier/node()', namespaces=NAMESPACES)[0]
        record = etree.tostring(record, encoding=record_encoding)
        xml_list.append(RawDocument({
            'doc': record,
            'source': NAME,
            'docID': copy_to_unicode(doc_id),
            'filetype': 'xml'
        }))

    return xml_list
Example #17
    def harvest(self, start_date=None, end_date=None):
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        total = self.get_total(start_date, end_date)
        logger.info('{} documents to be harvested'.format(total))

        doc_list = []
        for i in range(0, total, 1000):
            uris = self.get_uris(start_date, end_date, 1000, i)
            records = self.get_records(uris, mapping.DOCUMENT_MAPPING)
            logger.info('Harvested {} documents'.format(i + len(records)))

            for record in records:
                doc_id = record['doi'] if 'doi' in record else record['uri']
                doc_list.append(
                    RawDocument({
                        'doc': json.dumps(record),
                        'source': self.short_name,
                        'docID': doc_id,
                        'filetype': 'json'
                    }))

        return doc_list
Example #18
    def harvest(self, days_back=1):
        return [
            RawDocument({
                'doc': str(TEST_XML_DOC),
                'source': 'test',
                'filetype': 'XML',
                'docID': '1'
            }) for _ in range(days_back)
        ]
Example #19
    def harvest(self, start_date=None, end_date=None):
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        return [
            RawDocument({
                'doc': json.dumps(record),
                'source': record['source'],
                'docID': record['docID'],
                'filetype': 'json'
            }) for record in self.get_records(start_date, end_date)
        ]
Example #20
    def harvest(self, days_back=1):
        return [
            RawDocument({
                'doc': TEST_XML_DOC,
                'source': 'test',
                'filetype': 'XML',
                'docID': '1',
                'timestamps': {
                    'harvestFinished': '2015-03-14T17:05:48+00:00',
                    'harvestStarted': '2015-03-14T17:05:48+00:00',
                    'harvestTaskCreated': '2015-03-16T17:05:48+00:00'
                }
            }) for _ in range(days_back)
        ]
Example #21
def consume(days_back=1):
    start_date = date.today() - timedelta(days_back)
    base_url = OAI_DC_BASE + '?verb=ListRecords&metadataPrefix=oai_dc&from='
    url = base_url + str(start_date) + 'T00:00:00Z'

    num_approved_records = 0
    num_rejected_records = 0
    approved_sets = []
    rejected_sets = []

    record_encoding = requests.get(url).encoding
    records = get_records(url)

    xml_list = []
    for record in records:
        set_spec = record.xpath('ns0:header/ns0:setSpec/node()',
                                namespaces=NAMESPACES)[0]
        doc_id = record.xpath('ns0:header/ns0:identifier/node()',
                              namespaces=NAMESPACES)[0]

        record_string = etree.tostring(record, encoding=record_encoding)

        if set_spec.replace('publication:', '') in series_name_list:
            approved_sets.append(set_spec)
            num_approved_records += 1
        else:
            rejected_sets.append(set_spec)
            num_rejected_records += 1

        xml_list.append(
            RawDocument({
                'doc': record_string,
                'source': NAME,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

    print("There were {} records from approved sets".format(num_approved_records))
    print("The records were from these approved sets: {}".format(
        set(approved_sets)))
    print("There were {} records from rejected sets".format(num_rejected_records))
    print("The records were from these rejected sets: {}".format(
        set(rejected_sets)))

    return xml_list
Example #22
    def harvest(self, start_date=None, end_date=None):
        # TODO: Stepic has no means of querying by date; add handling for the
        # start and end dates once it does.

        search_url = self.URL
        records = self.get_records(search_url)

        record_list = []
        for record in records:
            doc_id = record['id']
            record_list.append(
                RawDocument({
                    'doc': json.dumps(record),
                    'source': self.short_name,
                    'docID': ('stepic_doc' + str(doc_id)),
                    'filetype': 'json'
                }))
        return record_list
Example #23
def consume(days_back=1):
    doc = get_response(1, days_back)
    rows = doc.xpath("//result/@numFound")[0]
    doc = get_response(rows, days_back)
    records = doc.xpath('//doc')
    xml_list = []
    for record in records:
        doc_id = record.xpath("str[@name='id']")[0].text
        # record_encoding is assumed to be set at module scope (e.g. by
        # get_response); it is not defined in this snippet
        record = ElementTree.tostring(record, encoding=record_encoding)
        xml_list.append(
            RawDocument({
                'doc': record,
                'source': NAME,
                'docID': copy_to_unicode(doc_id),
                'filetype': 'xml'
            }))

    return xml_list
Example #24
    def harvest(self, start_date=None, end_date=None):
        """Return a list of RawDocuments (metadata).

        Searching by time is not supported by the LWBIN CKAN API; all
        datasets have to be scanned each time.
        """

        base_url = 'http://130.179.67.140/api/3/action/current_package_list_with_resources'

        records = requests.get(base_url).json()['result']
        total = len(records)  # Total number of documents
        logger.info('{} documents to be harvested'.format(total))

        return [
            RawDocument({
                'doc': json.dumps(record),
                'source': self.short_name,
                'docID': record['id'],
                'filetype': 'json'
            }) for record in records
        ]
Example #25
    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        if not PLOS_API_KEY:
            return []

        return [
            RawDocument({
                'filetype': 'xml',
                'source': self.short_name,
                'doc': etree.tostring(row),
                'docID': row.xpath("str[@name='id']")[0].text,
            }) for row in self.fetch_rows(start_date.isoformat(),
                                          end_date.isoformat())
            if row.xpath("arr[@name='abstract']")
            or row.xpath("str[@name='author_display']")
        ]
Example #26
    def harvest(self, start_date=None, end_date=None):

        api_url = self.url + 'api/collections/?format=json'

        record_list = []

        while api_url:
            records = requests.get(api_url).json()
            for record in records['results']:
                record_list.append(
                    RawDocument({
                        'doc': json.dumps(record),
                        'source': self.short_name,
                        'docID': str(record['id']),
                        'filetype': 'json'
                    }))

            api_url = records['next']

        return record_list
Example #27
    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        records = self.get_records(start_date, end_date)

        xml_list = []
        for record in records:
            doc_id = record.xpath("str[@name='id']")[0].text
            record = ElementTree.tostring(record,
                                          encoding=self.record_encoding)
            xml_list.append(
                RawDocument({
                    'doc': record,
                    'source': self.short_name,
                    'docID': copy_to_unicode(doc_id),
                    'filetype': 'xml'
                }))

        return xml_list
Example #28
    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)
        end_date = end_date or date.today()

        search_url = self.URL.format(start_date.isoformat(),
                                     end_date.isoformat())
        records = self.get_records(search_url)

        record_list = []
        for record in records:
            doc_id = record['url'].replace('/', '')

            record_list.append(
                RawDocument({
                    'doc': json.dumps(record),
                    'source': self.short_name,
                    'docID': six.text_type(doc_id),
                    'filetype': 'json'
                }))

        return record_list
Example #29
    def harvest(self, start_date=None, end_date=None):

        # This API does not support date ranges
        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)

        # days_back = the number of days between start_date and now,
        # defaulting to settings.DAYS_BACK
        days_back = (date.today() - start_date).days
        search_url = '{0}mod_x_days={1}'.format(self.URL, days_back)

        record_list = []
        for record in self.get_records(search_url):
            doc_id = record['id']

            record_list.append(
                RawDocument({
                    'doc': json.dumps(record),
                    'source': self.short_name,
                    'docID': six.text_type(doc_id),
                    'filetype': 'json'
                }))

        return record_list
Example #30
    def harvest(self, start_date=None, end_date=None):

        start_date = start_date or date.today() - timedelta(settings.DAYS_BACK)

        # BioMed Central can only filter on a start date, so query by the
        # number of days back instead
        end_date = date.today()
        delta = end_date - start_date

        search_url = self.URL.format(delta.days)
        records = self.get_records(search_url)

        record_list = []
        for record in records:
            doc_id = record['arxId']

            record_list.append(
                RawDocument({
                    'doc': json.dumps(record),
                    'source': self.short_name,
                    'docID': doc_id,
                    'filetype': 'json'
                }))

        return record_list