def _gather_entry(self, entry, auth=None):
    """Create and save a HarvestObject for a single catalogue entry.

    Looks up an existing Package by the entry's cleaned identifier and
    creates a harvest object whose status extra is 'new', 'change' or
    'unchanged' accordingly.

    :param entry: dict with 'guid', 'identifier', 'restart_date' and
        'content' keys.
    :param auth: unused; kept for interface compatibility.
    :returns: the id of the saved HarvestObject.
    """
    entry_guid = entry['guid']
    log.debug('gathering %s', entry_guid)
    # The package name drops the version prefix and the file extension.
    entry_name = entry['identifier'].replace('v101_', '').replace('.hdf5', '')  # noqa: E501
    entry_restart_date = entry['restart_date']

    package = Session.query(Package).filter(
        Package.name == entry_name).first()

    if package:
        # Previously harvested: retire the current harvest object so the
        # one created below becomes the current one.
        previous_obj = Session.query(HarvestObject) \
            .filter(HarvestObject.guid == entry_guid) \
            .filter(HarvestObject.current == True) \
            .first()  # noqa: E712
        if previous_obj:
            previous_obj.current = False
            previous_obj.save()

        if self.update_all:
            log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
            status = 'change'
        else:
            log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
            status = 'unchanged'
    else:
        # It's a product we haven't harvested before.
        log.debug(
            '{} has not been harvested before. Creating a new harvest object.'.  # noqa: E501
            format(entry_name))  # noqa: E501
        status = 'new'

    obj = HarvestObject(guid=entry_guid,
                        job=self.job,
                        extras=[
                            HOExtra(key='status', value=status),
                            HOExtra(key='restart_date',
                                    value=entry_restart_date)
                        ])
    obj.content = entry['content']
    # package is None when the product has not been harvested before.
    obj.package = package
    obj.save()
    return obj.id
# --- Example 2 ---
    def _gather_object(self, job, url, size, start_date, forecast_date):
        """Create and save a HarvestObject for a single FTP product.

        :param job: the current HarvestJob.
        :param url: FTP link of the product; also used as the object guid.
        :param size: reported file size, stored verbatim in the content.
        :param start_date: product start date; must be truthy.
        :param forecast_date: forecast date stored alongside the product.
        :returns: the id of the saved HarvestObject.
        """
        filename = parse_filename(url)
        # Strip version/subversion suffixes so products with different
        # processing versions map onto the same dataset identifier.
        # Order matters: the combined '-v02.0-fv02.0' suffix must be
        # removed before its components.
        filename_id = filename
        for suffix in ('-v02.0-fv02.0', '-fv02.0', '-sv01.00', '-sv05.00',
                       '-v02', '-sv10.00', '-sv09.00', '-sv07.00'):
            filename_id = filename_id.replace(suffix, '')

        status, package = self._was_harvested(filename_id, self.update_all)

        extras = [HOExtra(key='status', value=status)]
        assert start_date
        content = json.dumps(
            {
                'identifier': filename_id,
                'ftp_link': url,
                'size': size,
                'start_date': start_date,
                'forecast_date': forecast_date,
                'restart_date': start_date
            },
            default=str)
        obj = HarvestObject(job=job, guid=url, extras=extras, content=content)
        obj.package = package
        obj.save()
        return obj.id
# --- Example 3 ---
    def _gather_object(self, job, url, start_date):
        """Create and save a HarvestObject for a single HTTP product.

        :param job: the current HarvestJob.
        :param url: HTTP link of the product; also used as the object guid.
        :param start_date: product start date; must be truthy.
        :returns: the id of the saved HarvestObject.
        """
        filename_id = parse_filename(url)

        status, package = self._was_harvested(filename_id, self.update_all)

        assert start_date
        content = json.dumps(
            {
                'identifier': filename_id,
                'http_link': url,
                'start_date': start_date,
                'restart_date': start_date
            },
            default=str)

        obj = HarvestObject(job=job, guid=url,
                            extras=[HOExtra(key='status', value=status)],
                            content=content)
        obj.package = package
        obj.save()
        return obj.id
# --- Example 4 ---
    def _gather_object(self, job, product, resources, manifest_content,
                       last_harvest_date):
        """Create and save a HarvestObject for a manifest-based product.

        :param job: the current HarvestJob.
        :param product: product filename; lowercased to form the name.
        :param resources: resource list stored in the object content.
        :param manifest_content: manifest payload stored in the content.
        :param last_harvest_date: date used as the 'restart_date'.
        :returns: the id of the saved HarvestObject.
        """
        name = parse_filename(product).lower()

        status, package = self._was_harvested(name, self.update_all)

        payload = {
            'name': name,
            'restart_date': last_harvest_date.strftime('%Y-%m-%d'),
            'manifest_content': manifest_content,
            'resources': resources
        }

        obj = HarvestObject(job=job,
                            guid=unicode(uuid.uuid4()),
                            extras=[HOExtra(key='status', value=status)],
                            content=json.dumps(payload, default=str))
        obj.package = package
        obj.save()
        return obj.id
    def _crawl_results(self,
                       harvest_url,
                       timeout=5,
                       limit=100,
                       provider=None):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.

        Follows pagination (at most one request per second) until `limit`
        harvest objects exist or there is no next page.

        :param harvest_url: first page of results to fetch.
        :param timeout: per-request timeout in seconds.
        :param limit: stop once this many harvest objects were created.
        :param provider: unused; kept for interface compatibility.
        :returns: list of HarvestObject ids.
        """
        ids = []
        new_counter = 0
        first_query = True
        while len(ids) < limit and harvest_url:
            # We'll limit ourselves to one request per second
            start_request = time.time()

            # Make a request to the website
            timestamp = str(datetime.utcnow())
            log_message = '{:<12} | {} | {} | {}s'
            try:
                r = requests.get(harvest_url, verify=False, timeout=timeout)
            except Timeout as e:
                self._save_gather_error('Request timed out: {}'.format(e),
                                        self.job)  # noqa: E501
                status_code = 408
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(
                        log_message.format(self.provider, timestamp,
                                           status_code, timeout))  # noqa: E128
                return ids
            if r.status_code != 200:
                self._save_gather_error('{} error: {}'.format(
                    r.status_code, r.text), self.job)  # noqa: E501
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(
                        log_message.format(self.provider, timestamp,
                                           r.status_code,
                                           elapsed))  # noqa: E128
                return ids

            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(
                        self.provider, timestamp, r.status_code,
                        r.elapsed.total_seconds()))  # noqa: E128, E501

            soup = Soup(r.content, 'lxml')
            json_content = json.loads(soup.text)

            # Get the URL for the next loop, or None to break the loop
            log.debug(harvest_url)
            harvest_url = self._get_next_url(harvest_url, json_content)

            # Get the entries from the results
            entry_list = self._get_entries_from_results(json_content)

            # NOTE(review): after the first page the first entry is
            # skipped -- presumably it repeats the last entry of the
            # previous page; confirm against the API's paging behaviour.
            entries = entry_list if first_query else entry_list[1:]
            first_query = False

            # Create a harvest object for each entry
            for entry in entries:
                entry_guid = entry['guid']
                entry_name = entry['identifier']
                entry_restart_date = entry['restart_date']

                package = Session.query(Package) \
                    .filter(Package.name == entry_name).first()

                if package:
                    # Previously harvested: retire the current harvest
                    # object and decide whether to reharvest.
                    previous_obj = Session.query(HarvestObject) \
                        .filter(HarvestObject.guid == entry_guid) \
                        .filter(HarvestObject.current == True) \
                        .first()  # noqa: E712
                    if previous_obj:
                        previous_obj.current = False
                        previous_obj.save()

                    if self.update_all:
                        log.debug(
                            '{} already exists and will be updated.'.format(
                                entry_name))  # noqa: E501
                        status = 'change'
                    else:
                        log.debug('{} will not be updated.'.format(
                            entry_name))  # noqa: E501
                        status = 'unchanged'
                else:
                    # It's a product we haven't harvested before.
                    log.debug(
                        '{} has not been harvested before. Creating a new harvest object.'
                        .format(entry_name))  # noqa: E501
                    status = 'new'
                    new_counter += 1

                obj = HarvestObject(guid=entry_guid,
                                    job=self.job,
                                    extras=[
                                        HOExtra(key='status', value=status),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                obj.content = json.dumps(entry['content'])
                # package is None when the product is new.
                obj.package = package
                obj.save()
                ids.append(obj.id)

            # Throttle: at most one request per second.
            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(
                harvester_msg.format(self.provider, timestamp, self.job.id,
                                     new_counter, 0))  # noqa: E128, E501

        return ids
    def _parse_products(self, products):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.

        :param products: iterable of product dicts with 'imgtif', 'type',
            'intid' and 'master' keys.
        :returns: list of HarvestObject ids.
        """
        ids = []
        new_counter = 0

        # Create a harvest object for each entry
        for entry in products:
            # guid and package name share the same derived identifier:
            # <tif folder>_<type>_<interferogram id>.
            entry_id = '{}_{}_{}'.format(entry['imgtif'].split('/')[1].lower(),
                                         entry['type'], entry['intid'])
            entry_guid = entry_id
            entry_name = entry_id
            entry_restart_date = entry['master']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Previously harvested: retire the current harvest object
                # and decide whether to reharvest.
                previous_obj = Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'
                else:
                    log.debug('{} will not be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'unchanged'
            else:
                # It's a product we haven't harvested before.
                log.debug(
                    '{} has not been harvested before. Creating a new harvest object.'
                    .format(entry_name))  # noqa: E501
                status = 'new'
                new_counter += 1

            obj = HarvestObject(guid=entry_guid,
                                job=self.job,
                                extras=[
                                    HOExtra(key='status', value=status),
                                    HOExtra(key='restart_date',
                                            value=entry_restart_date)
                                ])
            obj.content = json.dumps(entry)
            # package is None when the product is new.
            obj.package = package
            obj.save()
            ids.append(obj.id)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(
                harvester_msg.format(self.provider, timestamp, self.job.id,
                                     new_counter, 0))  # noqa: E128, E501

        return ids
# --- Example 7 ---
    def gather_stage(self, harvest_job):
        """Gather OSCAR station/deployment records over OAI-PMH.

        Resumes from state stored on the harvest source (resumption
        tokens, next station, restart date), pages through
        ListIdentifiers results, and creates one HarvestObject per valid
        station deployment.  Pages until at least one object has been
        created or the server stops returning a resumption token.

        :param harvest_job: the HarvestJob being run.
        :returns: list of created HarvestObject ids.
        """
        # Cache HTTP responses for this run; clear stale entries first.
        requests_cache.install_cache()
        requests_cache.clear()

        session = requests_cache.CachedSession()

        self.log = logging.getLogger(__file__)
        self.log.debug('OSCAR Harvester gather_stage for job: %r', harvest_job)

        self.job = harvest_job
        self.source_config = self._get_config(harvest_job)
        base_url = self.source_config.get('oai_pmh_url')
        metadata_prefix = self.source_config.get('metadata_prefix')
        start_date = self.source_config.get('start_date', None)
        self.update_all = self.source_config.get('update_all', False)

        # Harvesting state persisted by previous runs.
        last_token = self._get_last_harvesting_index(self.job.source_id,
                                                     'last_token')
        next_token = self._get_last_harvesting_index(self.job.source_id,
                                                     'next_token')
        next_station = self._get_last_harvesting_index(self.job.source_id,
                                                       'next_station')
        restart_date = self._get_last_harvesting_index(self.job.source_id,
                                                       'restart_date')
        # Only honour the stored restart date when a previous run left a
        # resumption token behind.
        restart_date = restart_date if last_token else None

        ids = []
        first_query = True
        # Keep paging while nothing has been gathered yet and the server
        # still offers a resumption token.
        while (ids == [] and next_token) or first_query:
            first_query = False

            # Re-query the previous page when we stopped mid-page (a
            # next_station was stored); otherwise follow the next token.
            current_token = last_token if next_station else next_token
            if current_token:
                query_url = "{}?verb=ListIdentifiers&resumptionToken={}".format(
                    base_url, current_token)
            elif restart_date:
                query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                    base_url, metadata_prefix, restart_date)
            elif start_date:
                query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                    base_url, metadata_prefix, start_date)
            else:
                query_url = "{}?verb=ListIdentifiers&metadataPrefix={}".format(
                    base_url, metadata_prefix)

            self.log.debug('Querying: {}.'.format(query_url))
            raw_list_ids = self.get_list_identifiers(session, query_url)

            list_stations, largest_datastamp = self.get_station_ids(
                raw_list_ids)

            next_token = self.get_resumption_token(raw_list_ids)
            last_token = current_token
            # Track the newest datestamp seen so far (string comparison;
            # assumes ISO-style sortable dates -- TODO confirm).
            restart_date = restart_date if restart_date else ''
            restart_date = largest_datastamp if largest_datastamp > restart_date else restart_date

            if list_stations == []:
                next_station = None
            else:
                # Walk stations until one yields a valid deployment.
                valid_deployment = None
                station_index = 0
                while not valid_deployment and station_index <= len(
                        list_stations) - 1:
                    station = list_stations[station_index]
                    # Clear next_station once reached, so stations before
                    # the resume point are skipped.
                    next_station = None if (next_station
                                            == station) else next_station
                    if not next_station:
                        station_query = '{}?verb=GetRecord&metadataPrefix={}&identifier={}'.format(
                            base_url, metadata_prefix, station)
                        print('Querying station: {}.'.format(station))
                        record = self.get_record(session, station_query)
                        if record:
                            station_info = StationInfo(record)
                            if station_info.isValid():
                                station_info.id = station
                                observation_list = station_info.get_observations(
                                )
                                station_dict = station_info.get_dict()
                                # Drop references early to free memory.
                                station_info = None
                                for observation in observation_list:
                                    observation_info = ObservationInfo(
                                        session, observation)
                                    deployments_list = observation_info.get_deployments(
                                    )
                                    observation_dict = observation_info.get_dict(
                                    )
                                    observation_info = None
                                    for deployment in deployments_list:
                                        deployment_info = DeploymentInfo(
                                            session, deployment)
                                        if deployment_info.isValid():
                                            deployment_dict = deployment_info.get_dict(
                                            )
                                            deployment_info = None
                                            valid_deployment = True
                                            # Remember where to resume on
                                            # the next run.
                                            if station_index + 1 <= len(
                                                    list_stations) - 1:
                                                next_station = list_stations[
                                                    station_index + 1]
                                            else:
                                                next_station = None
                                            # NOTE(review): guid is a fresh
                                            # uuid4 every run, so the
                                            # previous_obj lookup below can
                                            # never match an object from an
                                            # earlier run -- confirm intent.
                                            entry_guid = unicode(uuid.uuid4())
                                            entry_id = '{}_{}'.format(
                                                station_dict['id'],
                                                deployment_dict['id'])
                                            entry_name = clean_snakecase(
                                                entry_id)
                                            self.log.debug(
                                                'Gathering %s', entry_name)

                                            content = {}
                                            content['station'] = station_dict
                                            content[
                                                'observation'] = observation_dict
                                            content[
                                                'deployment'] = deployment_dict

                                            package_query = Session.query(
                                                Package)
                                            query_filtered = package_query.filter(
                                                Package.name == entry_name)
                                            package = query_filtered.first()

                                            if package:
                                                # Meaning we've previously harvested this,
                                                # but we may want to reharvest it now.
                                                previous_obj = Session.query(HarvestObject) \
                                                    .filter(HarvestObject.guid == entry_guid) \
                                                    .filter(HarvestObject.current == True) \
                                                    .first()  # noqa: E712
                                                if previous_obj:
                                                    previous_obj.current = False
                                                    previous_obj.save()

                                                if self.update_all:
                                                    self.log.debug(
                                                        '{} already exists and will be updated.'
                                                        .format(entry_name)
                                                    )  # noqa: E501
                                                    status = 'change'

                                                else:
                                                    self.log.debug(
                                                        '{} will not be updated.'
                                                        .format(entry_name)
                                                    )  # noqa: E501
                                                    status = 'unchanged'

                                            elif not package:
                                                # It's a product we haven't harvested before.
                                                self.log.debug(
                                                    '{} has not been harvested before. Creating a new harvest object.'
                                                    .  # noqa: E501
                                                    format(entry_name
                                                           ))  # noqa: E501
                                                status = 'new'
                                            # Persist paging state in the
                                            # extras so the next run can
                                            # resume from here.
                                            obj = HarvestObject(
                                                guid=entry_guid,
                                                job=self.job,
                                                extras=[
                                                    HOExtra(key='status',
                                                            value=status),
                                                    HOExtra(key='last_token',
                                                            value=last_token),
                                                    HOExtra(key='next_token',
                                                            value=next_token),
                                                    HOExtra(
                                                        key='next_station',
                                                        value=next_station),
                                                    HOExtra(key='restart_date',
                                                            value=restart_date)
                                                ])
                                            obj.content = json.dumps(content)
                                            obj.package = None if status == 'new' else package
                                            obj.save()
                                            ids.append(obj.id)

                                if not valid_deployment:
                                    self.log.debug(
                                        'Station {} does not have valid deployments.'
                                        .format(station))
                            else:
                                self.log.debug(
                                    'Station {} is not valid.'.format(station))
                    station_index += 1
        return ids
    def gather_stage(self, harvest_job):
        """Gather a page of SCENT products from a WFS endpoint.

        Resumes from the last harvested product index stored on the
        harvest source, fetches up to ``max_dataset`` features sorted by
        'When', and creates one HarvestObject per feature.

        :param harvest_job: the HarvestJob being run.
        :returns: list of created HarvestObject ids.
        """
        self.log = logging.getLogger(__file__)
        self.log.debug('SCENT Harvester gather_stage for job: %r', harvest_job)

        self.job = harvest_job
        self.source_config = self._get_config(harvest_job)
        max_dataset = self.source_config.get('max_dataset', 100)
        wfs_url = self.source_config.get('wfs_url')
        wfs_version = self.source_config.get('wfs_version')
        collection = self.source_config.get('collection')
        typename = COLLECTION[collection].get('collection_typename')
        tag_typename = COLLECTION[collection].get('tag_typename', None)
        self.update_all = self.source_config.get('update_all', False)

        last_product_index = (
            self._get_last_harvesting_index(harvest_job.source_id)
        )

        # Resume one past the last harvested product, or from the start.
        if last_product_index:
            last_product_index = last_product_index + 1
        else:
            last_product_index = 0

        wfs = WFS(url=wfs_url, version=wfs_version)

        wfs.set_collection(typename)
        sortby = ['When']

        result = wfs.make_request(max_dataset, sortby, last_product_index)
        entries = result['features']
        name = '{}_{}'.format(collection.lower(), '{}')
        ids = []
        for entry in entries:
            # NOTE(review): guid is a fresh uuid4 every run, so the
            # previous_obj lookup below can never match an object from an
            # earlier run -- confirm whether the guid should instead be
            # derived from the entry.
            entry_guid = unicode(uuid.uuid4())
            entry_name = name.format(convert_to_clean_snakecase(entry['id']))
            log.debug('gathering %s', entry_name)

            content = {}
            content['collection_content'] = entry
            if tag_typename:
                # Store the URL of the tag layer filtered to this image.
                # (A redundant wfs.make_request() call whose result was
                # immediately discarded has been removed here.)
                wfs.set_collection(tag_typename)
                filterxml = wfs.set_filter_equal_to('image_id', entry['id'])
                content['tag_url'] = wfs.get_request(constraint=filterxml)

            package_query = Session.query(Package)
            query_filtered = package_query.filter(Package.name == entry_name)
            package = query_filtered.first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'
                else:
                    log.debug(
                        '{} will not be updated.'.format(entry_name))  # noqa: E501
                    status = 'unchanged'
            else:
                # It's a product we haven't harvested before.
                log.debug(
                    '{} has not been harvested before. Creating a new harvest object.'.  # noqa: E501
                    format(entry_name))  # noqa: E501
                status = 'new'

            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value=status),
                    # Stored so a later run can resume from this index.
                    HOExtra(key='index', value=last_product_index)
                ])
            obj.content = json.dumps(content)
            obj.package = None if status == 'new' else package
            obj.save()
            last_product_index += 1
            ids.append(obj.id)
        return ids
# --- Example 9 ---
    def _crawl_results(self, harvest_url, limit=100, timeout=5, username=None, password=None, provider=None):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.
        """
        ids = []
        new_counter = 0
        update_counter = 0
        while len(ids) < limit and harvest_url:
            # We'll limit ourselves to one request per second
            start_request = time.time()

            # Make a request to the website
            timestamp = str(datetime.utcnow())
            log_message = '{:<12} | {} | {} | {}s'
            try:
                r = requests.get(harvest_url,
                                 auth=HTTPBasicAuth(username, password),
                                 verify=False, timeout=timeout)
            except Timeout as e:
                self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
                status_code = 408
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(log_message.format(self.provider,
                        timestamp, status_code, timeout))  # noqa: E128
                return ids
            if r.status_code != 200:
                self._save_gather_error('{} error: {}'.format(r.status_code, r.text), self.job)  # noqa: E501
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(log_message.format(self.provider,
                        timestamp, r.status_code, elapsed))  # noqa: E128
                return ids

            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(log_message.format(self.provider,
                    timestamp, r.status_code, r.elapsed.total_seconds()))  # noqa: E128, E501

            soup = Soup(r.content, 'lxml')

            # Get the URL for the next loop, or None to break the loop
            harvest_url = self._get_next_url(soup)

            # Get the entries from the results
            entries = self._get_entries_from_results(soup)

            # Create a harvest object for each entry
            for entry in entries:
                entry_guid = entry['guid']
                entry_name = entry['identifier']
                entry_restart_date = entry['restart_date']

                package = Session.query(Package) \
                    .filter(Package.name == entry_name).first()

                if package:
                    # Meaning we've previously harvested this,
                    # but we may want to reharvest it now.
                    # We need package_show to ensure that all the conversions
                    # are carried out.
                    context = {"user": "******", "ignore_auth": True,
                               "model": model, "session": Session}
                    pkg_dict = logic.get_action('package_show')(context, {"id": package.name})  # noqa: E501
                    previous_obj = model.Session.query(HarvestObject) \
                        .filter(HarvestObject.guid == entry_guid) \
                        .filter(HarvestObject.current == True) \
                        .first()  # noqa: E712
                    if previous_obj:
                        previous_obj.current = False
                        previous_obj.save()

                    if self.update_all:
                        log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                        status = 'change'
                        update_counter += 1
                    # E.g., a Sentinel dataset exists,
                    # but doesn't have a NOA resource yet.
                    elif self.flagged_extra and not get_pkg_dict_extra(pkg_dict, self.flagged_extra):  # noqa: E501
                        log.debug('{} already exists and will be extended.'.format(entry_name))  # noqa: E501
                        status = 'change'
                        update_counter += 1
                    else:
                        log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                        status = 'unchanged'

                    obj = HarvestObject(guid=entry_guid, job=self.job,
                                        extras=[HOExtra(key='status',
                                                value=status),
                                                HOExtra(key='restart_date',
                                                value=entry_restart_date)])
                    obj.content = entry['content']
                    obj.package = package
                    obj.save()
                    ids.append(obj.id)
                elif not package:
                    # It's a product we haven't harvested before.
                    log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
                    obj = HarvestObject(guid=entry_guid, job=self.job,
                                        extras=[HOExtra(key='status',
                                                value='new'),
                                                HOExtra(key='restart_date',
                                                value=entry_restart_date)])
                    new_counter += 1
                    obj.content = entry['content']
                    obj.package = None
                    obj.save()
                    ids.append(obj.id)

            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(harvester_msg.format(self.provider,
                                       timestamp, self.job.id, new_counter, update_counter))  # noqa: E128, E501
        return ids
# --- Example #10 ---
    def gather_stage(self, harvest_job):
        """Gather stage: query the remote interface and create one
        HarvestObject per returned entry.

        :param harvest_job: the HarvestJob being run.
        :returns: list of HarvestObject ids (empty on error).
        """
        self.log = logging.getLogger(__file__)
        self.log.debug('SatcenBetter Harvester gather_stage for job: %r',
                       harvest_job)

        self.job = harvest_job
        self.source_config = self._get_config(harvest_job)
        self.update_all = self.source_config.get('update_all', False)
        interface = INTERFACE(self.source_config, COLLECTION)

        # Resume pagination from where the previous job stopped.
        last_product_index = self._get_last_harvesting_index(
            harvest_job.source_id, interface)
        interface.update_index(last_product_index)
        interface.build_url()

        log.debug('URL: {}'.format(interface.current_url))  # noqa: E501

        ids = []
        try:
            results = interface.get_results()
        except Timeout as e:
            self._save_gather_error('Request timed out: {}'.format(e),
                                    self.job)  # noqa: E501
            return ids
        if not isinstance(results, list):
            # The interface reports errors as a dict instead of a list.
            self._save_gather_error('{} error: {}'.format(
                results['status_code'], results['message']),
                                    self.job)  # noqa: E501
            return ids

        for entry in results:
            name_path = interface.get_name_path()

            name_url = get_field(entry,
                                 name_path['relative_location'].split(","),
                                 name_path['fixed_attributes'])
            entry_name = parse_name(name_url).lower()
            entry_guid = unicode(uuid.uuid4())
            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Previously harvested; decide whether to reharvest.
                # NOTE(review): entry_guid is a fresh uuid4 so this lookup
                # can never match an earlier object — confirm intent.
                previous_obj = Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'
                else:
                    log.debug('{} will not be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'unchanged'
            else:
                # It's a product we haven't harvested before.
                log.debug(
                    '{} has not been harvested before. Creating a new harvest object.'
                    .format(entry_name))  # noqa: E501
                status = 'new'

            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value=status),
                    HOExtra(key=interface.get_pagination_mechanism(),
                            value=interface.get_index())
                ])
            obj.content = json.dumps(entry)
            obj.package = None if status == 'new' else package
            obj.save()
            interface.increment_index()
            ids.append(obj.id)
        return ids
    def _gather_entry(self, entry, path, row, update_all=False):
        """Create one HarvestObject for a single entry.

        :param entry: product identifier string; also stored as content.
        :param path: WRS path, stored as a harvest-object extra.
        :param row: WRS row, stored as a harvest-object extra.
        :param update_all: when True, re-harvest entries that already exist.
        :returns: the id of the saved HarvestObject.
        """
        entry_guid = unicode(uuid.uuid4())
        entry_name = entry.lower()
        log.debug('gathering %s', entry)

        package = Session.query(Package) \
            .filter(Package.name == entry_name).first()

        if package:
            # Previously harvested; decide whether to reharvest.
            # NOTE(review): entry_guid is a fresh uuid4 so this lookup
            # can never match an earlier object — confirm intent.
            previous_obj = Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()

            if update_all:
                log.debug('{} already exists and will be updated.'.format(
                    entry_name))  # noqa: E501
                status = 'change'
            else:
                log.debug(
                    '{} will not be updated.'.format(entry_name))  # noqa: E501
                status = 'unchanged'
        else:
            # It's a product we haven't harvested before.
            log.debug(
                '{} has not been harvested before. Creating a new harvest object.'.  # noqa: E501
                format(entry_name))
            status = 'new'

        # The object construction is identical for all three statuses;
        # only the status extra and the linked package differ.
        obj = HarvestObject(
            guid=entry_guid,
            job=self.job,
            extras=[
                HOExtra(key='status', value=status),
                HOExtra(key='path', value=path),
                HOExtra(key='row', value=row)
            ])
        obj.content = entry
        obj.package = package  # None when the product is new
        obj.save()
        return obj.id
# --- Example #12 ---
    def _gather(self, job, config):
        """Gather FTP-hosted Deimos products and create harvest objects.

        :param job: the HarvestJob being run.
        :param config: dict with 'username', 'password' and
            'harvester_type' keys.
        :returns: list of HarvestObject ids.
        """
        ftp_user = config['username']
        ftp_passwd = config['password']
        source_type = config['harvester_type']
        ftp_source = create_ftp_source(source_type)

        # Ensure the summary logger exists before the loop runs.
        if not hasattr(self, 'harvester_logger'):
            self.harvester_logger = self.make_harvester_logger()

        self.provider = 'deimos_imaging'

        existing_files = ftp_source._get_ftp_urls(ftp_user, ftp_passwd)

        # Cache metadata per raw (L0R) product id; several FTP urls map
        # onto the same raw product.
        metadata_dict = {}
        ids = []
        new_counter = 0
        for ftp_url in existing_files:
            filename = self.parse_filename(ftp_url)
            product_type = self.parse_filedirectory(ftp_url)
            identifier = filename

            content = {'identifier': identifier,
                       'product_type': product_type,
                       'ftp_link': ftp_url}

            raw_id = identifier.replace(product_type, 'L0R')

            if raw_id not in metadata_dict:
                metadata_dict[raw_id] = self._get_metadata(raw_id)
            metadata = metadata_dict[raw_id]

            content.update(metadata)

            content = json.dumps(content, default=str)

            package = Session.query(Package) \
                .filter(Package.name == identifier.lower()).first()

            if package:
                # This harvester never updates existing datasets.
                log.debug('{} will not be updated.'.format(identifier))  # noqa: E501
                status = 'unchanged'
            else:
                log.debug('{} has not been harvested before. Creating a new harvest object.'.format(identifier))  # noqa: E501
                status = 'new'
                new_counter += 1

            obj = HarvestObject(job=job,
                                guid=ftp_url,
                                extras=[HOExtra(key='status', value=status)])
            obj.content = content
            obj.package = package  # None when the product is new
            obj.save()
            ids.append(obj.id)

        # harvester_logger was ensured above, so log unconditionally.
        harvester_msg = '{:<12} | {} | Job ID:{} | {} | {}'
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(harvester_msg.format(self.provider,
            timestamp, job.id, new_counter, '0'))  # noqa: E128, E501

        return ids
    def _parse_products(self, products, mosquito_type):  # noqa: E501
        """
        Iterate through the results, create harvest objects,
        and return the ids.

        :param products: iterable of product dicts from the provider.
        :param mosquito_type: mosquito species key (e.g. 'aedes'); stored
            on each entry and used in the generated filename.
        :returns: list of HarvestObject ids.
        """
        ids = []
        new_counter = 0

        # Create a harvest object for each entry
        for entry in products:
            # Add mosquito type on object
            entry['mosquito_type'] = mosquito_type

            # Dates starting '00' are treated as mis-entered '20xx'
            # years — TODO confirm with the data provider.
            if entry['dt_placement'].startswith('00'):
                entry['dt_corrected'] = '20' + entry['dt_placement'][2:]
                date_part = entry['dt_corrected']
            else:
                date_part = entry['dt_placement']

            filename = "{}_{}_{}".format(mosquito_type,
                                         entry['station_id'],
                                         date_part)

            # Sanitize filename
            filename = self._sanitize_filename(filename)

            # Add coast_mean on aedes for uniqueness
            if mosquito_type == 'aedes':
                filename = filename + '_' + str(
                    int(entry['coast_mean_dist_1000']))

            entry_guid = filename
            entry_name = filename
            entry['filename'] = filename

            entry_restart_date = entry['dt_placement']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = model.Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'change'
                else:
                    log.debug('{} will not be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'unchanged'
            else:
                # It's a product we haven't harvested before.
                log.debug(
                    '{} has not been harvested before. Creating a new harvest object.'
                    .format(entry_name))  # noqa: E501
                status = 'new'
                new_counter += 1

            # Object construction is identical for every status; only the
            # status extra and linked package differ.
            obj = HarvestObject(guid=entry_guid,
                                job=self.job,
                                extras=[
                                    HOExtra(key='status', value=status),
                                    HOExtra(key='restart_date',
                                            value=entry_restart_date)
                                ])
            obj.content = json.dumps(entry)
            obj.package = package  # None when the product is new
            obj.save()
            ids.append(obj.id)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(
                harvester_msg.format(self.provider, timestamp, self.job.id,
                                     new_counter, 0))  # noqa: E128, E501

        return ids
# --- Example #14 ---
    def gather_stage(self, harvest_job):
        """Gather stage: fetch entries from the VITO interface and create
        one HarvestObject per entry.

        :param harvest_job: the HarvestJob being run.
        :returns: list of HarvestObject ids (empty on error).
        """
        self.log = logging.getLogger(__file__)
        self.log.debug('VITO Harvester gather_stage for job: %r', harvest_job)

        self.job = harvest_job
        self.source_config = self._get_config(harvest_job)
        self.update_all = self.source_config.get('update_all', False)
        interface = INTERFACE(self.source_config, COLLECTION)

        # Resume pagination from where the previous job stopped.
        last_product_index = self._get_last_harvesting_index(
            harvest_job.source_id, interface)
        interface.update_index(last_product_index)
        interface.build_url_date()

        path_to_entries = interface.get_entries_path()

        ids = []
        try:
            results = interface.get_results()
            if not results:
                return ids
            entries = self.get_field(results, path_to_entries[:])
        except Timeout as e:
            self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
            return ids
        except Exception as e:
            # Previously a silent swallow; at least record why we
            # gathered nothing before giving up.
            log.debug('Failed to extract entries: {}'.format(e))
            return ids
        if entries is None:
            return ids
        if not isinstance(entries, list):
            # A single entry comes back bare; normalise to a list.
            entries = [entries]

        identifier_path = interface.get_identifier_path()

        for entry in entries:
            entry_id = self.clean_snakecase(
                self.get_field(entry, identifier_path[:])[0])
            entry_guid = unicode(uuid.uuid4())
            package = Session.query(Package) \
                .filter(Package.name == entry_id).first()

            if package:
                # Previously harvested; decide whether to reharvest.
                # NOTE(review): entry_guid is a fresh uuid4 so this lookup
                # can never match an earlier object — confirm intent.
                previous_obj = Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_id))  # noqa: E501
                    status = 'change'
                else:
                    log.debug(
                        '{} will not be updated.'.format(entry_id))  # noqa: E501
                    status = 'unchanged'
            else:
                # It's a product we haven't harvested before.
                log.debug(
                    '{} has not been harvested before. Creating a new harvest object.'.  # noqa: E501
                    format(entry_id))
                status = 'new'

            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value=status),
                    HOExtra(key=interface.get_pagination_mechanism(),
                            value=interface.get_index())
                ])
            obj.content = json.dumps(entry)
            obj.package = None if status == 'new' else package
            obj.save()
            interface.increment_index()
            ids.append(obj.id)
        return ids
    def _crawl_results(self, harvest_url, limit=100, timeout=5):  # noqa: E501
        """
        Iterate through the CSW results, create harvest objects,
        and return the ids.

        :param harvest_url: first GetRecords URL; subsequent pages come
            from _get_next_url.
        :param limit: stop once this many objects have been created.
        :param timeout: per-request timeout in seconds.
        :returns: list of HarvestObject ids.
        """
        ids = []
        new_counter = 0
        update_counter = 0
        base_url = self.source_config.get('source_url')

        while len(ids) < limit and harvest_url:
            # We'll limit ourselves to one request per second
            start_request = time.time()

            soup = self._make_request(harvest_url, timeout)
            if not soup:
                return ids

            search_results = soup.find('csw:searchresults',
                                       elementset="summary")
            records_returned = search_results['numberofrecordsreturned']
            next_record = search_results['nextrecord']
            number_records_matched = search_results['numberofrecordsmatched']

            # The CSW attributes are numeric strings; int() parses them
            # safely — unlike the eval() of remote data this replaces.
            if next_record != '0':
                current_record = str(int(next_record) - int(records_returned))
            else:
                current_record = str(int(number_records_matched) -
                                     int(records_returned))

            # Get the URL for the next loop, or None to break the loop
            # Only works if StartPosition is last URL parameter
            harvest_url = self._get_next_url(harvest_url, records_returned,
                                             next_record, limit)

            # Get the entries from the results
            entries = self._get_entries_from_results(soup, current_record,
                                                     next_record)

            # Create a harvest object for each entry
            for entry in entries:
                entry_guid = entry['guid']
                entry_name = 'saeon_csag_' + entry['identifier'].lower().replace('.', '_').replace('/', '-')  # noqa: E501

                full_content = {}
                full_content['extra_content'] = self._get_entry_time_and_author(base_url, entry['identifier'], timeout)  # noqa: E501
                full_content['raw_content'] = entry['content']

                package = Session.query(Package) \
                    .filter(Package.name == entry_name).first()

                if package:
                    # Meaning we've previously harvested this,
                    # but we may want to reharvest it now.
                    previous_obj = model.Session.query(HarvestObject) \
                        .filter(HarvestObject.guid == entry_guid) \
                        .filter(HarvestObject.current == True) \
                        .first()  # noqa: E712
                    if previous_obj:
                        previous_obj.current = False
                        previous_obj.save()

                    if self.update_all:
                        log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                        status = 'change'
                        update_counter += 1
                    else:
                        log.debug('{} already exists and will not be updated.'.format(entry_name))  # noqa: E501
                        status = 'unchanged'
                else:
                    # It's a product we haven't harvested before.
                    log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
                    status = 'new'
                    new_counter += 1

                # Object construction is identical for every status.
                obj = HarvestObject(guid=entry_guid, job=self.job,
                                    extras=[HOExtra(key='status',
                                            value=status),
                                            HOExtra(key='restart_record',
                                            value=entry['restart_record'])])  # noqa: E501
                obj.content = json.dumps(full_content)
                obj.package = package  # None when the product is new
                obj.save()
                ids.append(obj.id)

            # Throttle to at most one request per second.
            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(harvester_msg.format(self.provider,
                                       timestamp, self.job.id, new_counter, update_counter))  # noqa: E128, E501

        return ids
    def _crawl_results(self, harvest_url, limit=100, timeout=5):  # noqa: E501
        """
        Iterate through the CSW results, create harvest objects,
        and return the ids.

        :param harvest_url: first GetRecords URL; subsequent pages come
            from _get_next_url.
        :param limit: stop once this many objects have been created.
        :param timeout: per-request timeout in seconds.
        :returns: list of HarvestObject ids.
        """
        ids = []
        new_counter = 0
        update_counter = 0

        while len(ids) < limit and harvest_url:
            # We'll limit ourselves to one request per second
            start_request = time.time()

            # Make a request to the website
            timestamp = str(datetime.utcnow())
            log_message = '{:<12} | {} | {} | {}s'
            try:
                r = requests.get(harvest_url, timeout=timeout)
            except Timeout as e:
                self._save_gather_error('Request timed out: {}'.format(e),
                                        self.job)  # noqa: E501
                status_code = 408
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(
                        log_message.format(self.provider, timestamp,
                                           status_code, timeout))  # noqa: E128
                return ids
            if r.status_code != 200:
                self._save_gather_error('{} error: {}'.format(
                    r.status_code, r.text), self.job)  # noqa: E501
                elapsed = 9999
                if hasattr(self, 'provider_logger'):
                    self.provider_logger.info(
                        log_message.format(self.provider, timestamp,
                                           r.status_code,
                                           elapsed))  # noqa: E128
                return ids

            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(
                        self.provider, timestamp, r.status_code,
                        r.elapsed.total_seconds()))  # noqa: E128, E501

            soup = Soup(r.content, 'lxml')

            search_results = soup.find('csw:searchresults',
                                       elementset="summary")
            records_returned = search_results['numberofrecordsreturned']
            next_record = search_results['nextrecord']
            number_records_matched = search_results['numberofrecordsmatched']

            # The CSW attributes are numeric strings; int() parses them
            # safely — unlike the eval() of remote data this replaces.
            if next_record != '0':
                current_record = str(int(next_record) - int(records_returned))
            else:
                current_record = str(int(number_records_matched) -
                                     int(records_returned))

            # Get the URL for the next loop, or None to break the loop
            # Only works if StartPosition is last URL parameter
            harvest_url = self._get_next_url(harvest_url, records_returned,
                                             next_record, limit)  # noqa: E501

            # Get the entries from the results
            entries = self._get_entries_from_results(soup, current_record,
                                                     next_record)  # noqa: E501

            # Create a harvest object for each entry
            for entry in entries:
                entry_guid = entry['guid']
                entry_name = entry['identifier']

                package = Session.query(Package) \
                    .filter(Package.name == entry_name).first()

                if package:
                    # Meaning we've previously harvested this,
                    # but we may want to reharvest it now.
                    previous_obj = model.Session.query(HarvestObject) \
                        .filter(HarvestObject.guid == entry_guid) \
                        .filter(HarvestObject.current == True) \
                        .first()  # noqa: E712
                    if previous_obj:
                        previous_obj.current = False
                        previous_obj.save()

                    if self.update_all:
                        log.debug(
                            '{} already exists and will be updated.'.format(
                                entry_name))  # noqa: E501
                        status = 'change'
                        update_counter += 1
                    else:
                        log.debug('{} already exists and will not be updated.'.
                                  format(entry_name))  # noqa: E501
                        status = 'unchanged'
                else:
                    # It's a product we haven't harvested before.
                    log.debug(
                        '{} has not been harvested before. Creating a new harvest object.'
                        .format(entry_name))  # noqa: E501
                    status = 'new'
                    new_counter += 1

                # Object construction is identical for every status.
                obj = HarvestObject(
                    guid=entry_guid,
                    job=self.job,
                    extras=[
                        HOExtra(key='status', value=status),
                        HOExtra(key='restart_record',
                                value=entry['restart_record'])
                    ])  # noqa: E501
                obj.content = entry['content']
                obj.package = package  # None when the product is new
                obj.save()
                ids.append(obj.id)

            # Throttle to at most one request per second.
            end_request = time.time()
            request_time = end_request - start_request
            if request_time < 1.0:
                time.sleep(1 - request_time)

        harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
        if hasattr(self, 'harvester_logger'):
            timestamp = str(datetime.utcnow())
            self.harvester_logger.info(
                harvester_msg.format(self.provider, timestamp, self.job.id,
                                     new_counter,
                                     update_counter))  # noqa: E128, E501

        return ids