def _gather_entry(self, entry, auth=None):
    """Create and save a harvest object for one result entry.

    Returns the id of the saved HarvestObject. The `auth` parameter is
    kept for interface compatibility (unused here).
    """
    entry_guid = entry['guid']
    log.debug('gathering %s', entry_guid)
    # Dataset names never carry the version prefix or the file suffix.
    entry_name = entry['identifier'].replace('v101_', '').replace('.hdf5', '')  # noqa: E501
    entry_restart_date = entry['restart_date']

    package = Session.query(Package) \
        .filter(Package.name == entry_name).first()

    if package:
        # Previously harvested: retire the old "current" harvest object
        # so the one we create below becomes current.
        previous_obj = Session.query(HarvestObject) \
            .filter(HarvestObject.guid == entry_guid) \
            .filter(HarvestObject.current == True) \
            .first()  # noqa: E712
        if previous_obj:
            previous_obj.current = False
            previous_obj.save()
        if self.update_all:
            log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
            status = 'change'
        else:
            log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
            status = 'unchanged'
    else:
        # It's a product we haven't harvested before.
        log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
        status = 'new'

    # The object construction was duplicated in both branches of the
    # original; build it once from the computed status.
    obj = HarvestObject(guid=entry_guid,
                        job=self.job,
                        extras=[
                            HOExtra(key='status', value=status),
                            HOExtra(key='restart_date',
                                    value=entry_restart_date)
                        ])
    obj.content = entry['content']
    obj.package = package if package else None
    obj.save()
    return obj.id
def _gather_object(self, job, url, size, start_date, forecast_date):
    """Create and save a harvest object for one FTP product URL.

    Returns the id of the saved HarvestObject.
    """
    filename = parse_filename(url)
    # Strip every known version/format suffix to get a stable
    # identifier; order matters (longest compound token first).
    filename_id = filename
    for version_token in ('-v02.0-fv02.0', '-fv02.0', '-sv01.00',
                          '-sv05.00', '-v02', '-sv10.00', '-sv09.00',
                          '-sv07.00'):
        filename_id = filename_id.replace(version_token, '')
    status, package = self._was_harvested(filename_id, self.update_all)
    extras = [HOExtra(key='status', value=status)]
    assert start_date
    payload = {
        'identifier': filename_id,
        'ftp_link': url,
        'size': size,
        'start_date': start_date,
        'forecast_date': forecast_date,
        'restart_date': start_date
    }
    obj = HarvestObject(job=job, guid=url, extras=extras,
                        content=json.dumps(payload, default=str))
    obj.package = package
    obj.save()
    return obj.id
def _gather_object(self, job, url, start_date):
    """Create and save a harvest object for one HTTP product link.

    Returns the id of the saved HarvestObject.
    """
    filename_id = parse_filename(url)
    status, package = self._was_harvested(filename_id, self.update_all)
    extras = [HOExtra(key='status', value=status)]
    assert start_date
    # restart_date mirrors start_date so the next job resumes here.
    payload = {
        'identifier': filename_id,
        'http_link': url,
        'start_date': start_date,
        'restart_date': start_date
    }
    obj = HarvestObject(job=job, guid=url, extras=extras,
                        content=json.dumps(payload, default=str))
    obj.package = package
    obj.save()
    return obj.id
def _gather_object(self, job, product, resources, manifest_content,
                   last_harvest_date):
    """Create and save a harvest object for one manifest product.

    Returns the id of the saved HarvestObject.
    """
    name = parse_filename(product).lower()
    status, package = self._was_harvested(name, self.update_all)
    extras = [HOExtra(key='status', value=status)]
    # restart_date records where the next harvesting job resumes from.
    payload = {
        'name': name,
        'restart_date': last_harvest_date.strftime('%Y-%m-%d'),
        'manifest_content': manifest_content,
        'resources': resources
    }
    obj = HarvestObject(job=job,
                        guid=unicode(uuid.uuid4()),
                        extras=extras,
                        content=json.dumps(payload, default=str))
    obj.package = package
    obj.save()
    return obj.id
def _crawl_results(self, harvest_url, timeout=5, limit=100, provider=None):  # noqa: E501
    """
    Iterate through the results, create harvest objects,
    and return the ids.

    Pages through `harvest_url` (JSON results wrapped in HTML), creating
    one HarvestObject per entry, until `limit` ids are collected or no
    next-page URL remains. Requests are throttled to one per second.
    """
    ids = []
    new_counter = 0
    first_query = True
    while len(ids) < limit and harvest_url:
        # We'll limit ourselves to one request per second
        start_request = time.time()

        # Make a request to the website
        timestamp = str(datetime.utcnow())
        log_message = '{:<12} | {} | {} | {}s'
        try:
            r = requests.get(harvest_url, verify=False, timeout=timeout)
        except Timeout as e:
            # On timeout: record the gather error, log, and give up with
            # whatever ids were already collected.
            self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
            status_code = 408
            elapsed = 9999
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(self.provider,
                                       timestamp, status_code, timeout))  # noqa: E128
            return ids
        if r.status_code != 200:
            # Non-OK responses also abort the crawl with partial ids.
            self._save_gather_error('{} error: {}'.format(
                r.status_code, r.text), self.job)  # noqa: E501
            elapsed = 9999
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(self.provider,
                                       timestamp, r.status_code, elapsed))  # noqa: E128
            return ids

        if hasattr(self, 'provider_logger'):
            self.provider_logger.info(
                log_message.format(
                    self.provider, timestamp, r.status_code,
                    r.elapsed.total_seconds()))  # noqa: E128, E501

        soup = Soup(r.content, 'lxml')
        json_content = json.loads(soup.text)

        # Get the URL for the next loop, or None to break the loop
        log.debug(harvest_url)
        harvest_url = self._get_next_url(harvest_url, json_content)

        # Get the entries from the results
        entry_list = self._get_entries_from_results(json_content)
        # After the first page, skip the first entry — presumably pages
        # overlap by one record (TODO confirm against the API).
        if first_query:
            entries = entry_list
        else:
            entries = entry_list[1:]
        first_query = False

        # Create a harvest object for each entry
        for entry in entries:
            entry_guid = entry['guid']
            entry_name = entry['identifier']
            entry_restart_date = entry['restart_date']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = model.Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug(
                        '{} already exists and will be updated.'.format(
                            entry_name))  # noqa: E501
                    status = 'change'
                else:
                    log.debug('{} will not be updated.'.format(
                        entry_name))  # noqa: E501
                    status = 'unchanged'

                obj = HarvestObject(guid=entry_guid,
                                    job=self.job,
                                    extras=[
                                        HOExtra(key='status', value=status),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                obj.content = json.dumps(entry['content'])
                obj.package = package
                obj.save()
                ids.append(obj.id)
            elif not package:
                # It's a product we haven't harvested before.
                log.debug(
                    '{} has not been harvested before. Creating a new harvest object.'
                    .format(entry_name))  # noqa: E501
                obj = HarvestObject(guid=entry_guid,
                                    job=self.job,
                                    extras=[
                                        HOExtra(key='status', value='new'),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                new_counter += 1
                obj.content = json.dumps(entry['content'])
                obj.package = None
                obj.save()
                ids.append(obj.id)

        # Throttle: sleep out the remainder of the one-second budget.
        end_request = time.time()
        request_time = end_request - start_request
        if request_time < 1.0:
            time.sleep(1 - request_time)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(
            harvester_msg.format(self.provider, timestamp,
                                 self.job.id, new_counter, 0))  # noqa: E128, E501
    return ids
def _parse_products(self, products):  # noqa: E501
    """
    Iterate through the results, create harvest objects,
    and return the ids.
    """
    ids = []
    new_counter = 0

    # Create a harvest object for each entry
    for entry in products:
        # guid and name are the same composite identifier; the original
        # computed the identical expression twice — build it once.
        entry_guid = entry['imgtif'].split('/')[1].lower(
        ) + "_" + entry['type'] + "_" + str(entry['intid'])
        entry_name = entry_guid
        entry_restart_date = entry['master']

        package = Session.query(Package) \
            .filter(Package.name == entry_name).first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = model.Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()
            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(
                    entry_name))  # noqa: E501
                status = 'change'
            else:
                log.debug('{} will not be updated.'.format(
                    entry_name))  # noqa: E501
                status = 'unchanged'
        else:
            # It's a product we haven't harvested before.
            log.debug(
                '{} has not been harvested before. Creating a new harvest object.'
                .format(entry_name))  # noqa: E501
            status = 'new'
            new_counter += 1

        obj = HarvestObject(guid=entry_guid,
                            job=self.job,
                            extras=[
                                HOExtra(key='status', value=status),
                                HOExtra(key='restart_date',
                                        value=entry_restart_date)
                            ])
        obj.content = json.dumps(entry)
        obj.package = package if status != 'new' else None
        obj.save()
        ids.append(obj.id)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(
            harvester_msg.format(self.provider, timestamp, self.job.id,
                                 new_counter, 0))  # noqa: E128, E501
    return ids
def gather_stage(self, harvest_job):
    """Gather OSCAR stations over OAI-PMH and return harvest-object ids.

    Resumes from the previous job via four persisted indexes
    (last_token, next_token, next_station, restart_date) and pages
    through ListIdentifiers until at least one station with a valid
    deployment has been gathered or the token chain runs out.
    """
    # Cached requests session avoids re-fetching the same OAI-PMH
    # documents within one job.
    requests_cache.install_cache()
    requests_cache.clear()
    session = requests_cache.CachedSession()
    self.log = logging.getLogger(__file__)
    self.log.debug('OSCAR Harvester gather_stage for job: %r', harvest_job)

    self.job = harvest_job
    self.source_config = self._get_config(harvest_job)
    base_url = self.source_config.get('oai_pmh_url')
    metadata_prefix = self.source_config.get('metadata_prefix')
    start_date = self.source_config.get('start_date', None)
    self.update_all = self.source_config.get('update_all', False)
    # Saved harvesting state from the previous run.
    last_token = self._get_last_harvesting_index(self.job.source_id,
                                                 'last_token')
    next_token = self._get_last_harvesting_index(self.job.source_id,
                                                 'next_token')
    next_station = self._get_last_harvesting_index(self.job.source_id,
                                                   'next_station')
    restart_date = self._get_last_harvesting_index(self.job.source_id,
                                                   'restart_date')
    # restart_date only applies once a previous token exists.
    restart_date = restart_date if last_token else None

    ids = []
    first_query = True
    while (ids == [] and next_token) or first_query:
        first_query = False

        # Mid-page resume: when a station is pending, re-query the page
        # it came from (last_token) instead of advancing to next_token.
        current_token = last_token if next_station else next_token

        if current_token:
            query_url = "{}?verb=ListIdentifiers&resumptionToken={}".format(
                base_url, current_token)
        elif restart_date:
            query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                base_url, metadata_prefix, restart_date)
        elif start_date:
            query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                base_url, metadata_prefix, start_date)
        else:
            query_url = "{}?verb=ListIdentifiers&metadataPrefix={}".format(
                base_url, metadata_prefix)
        self.log.debug('Querying: {}.'.format(query_url))

        raw_list_ids = self.get_list_identifiers(session, query_url)
        list_stations, largest_datastamp = self.get_station_ids(
            raw_list_ids)
        next_token = self.get_resumption_token(raw_list_ids)
        last_token = current_token
        # Keep the newest datestamp seen so the next job can restart
        # from it (lexicographic compare — assumes ISO-formatted dates,
        # TODO confirm).
        restart_date = restart_date if restart_date else ''
        restart_date = largest_datastamp if largest_datastamp > restart_date else restart_date

        if list_stations == []:
            next_station = None
        else:
            valid_deployment = None
            station_index = 0
            # Walk stations until one yields a valid deployment.
            while not valid_deployment and station_index <= len(
                    list_stations) - 1:
                station = list_stations[station_index]
                # Clear the resume marker once we reach that station.
                next_station = None if (next_station == station) else next_station
                if not next_station:
                    station_query = '{}?verb=GetRecord&metadataPrefix={}&identifier={}'.format(
                        base_url, metadata_prefix, station)
                    print('Querying station: {}.'.format(station))
                    record = self.get_record(session, station_query)
                    if record:
                        station_info = StationInfo(record)
                        if station_info.isValid():
                            station_info.id = station
                            observation_list = station_info.get_observations()
                            station_dict = station_info.get_dict()
                            # Drop the parsed helpers promptly; the
                            # records can be large.
                            station_info = None
                            for observation in observation_list:
                                observation_info = ObservationInfo(
                                    session, observation)
                                deployments_list = observation_info.get_deployments()
                                observation_dict = observation_info.get_dict()
                                observation_info = None
                                for deployment in deployments_list:
                                    deployment_info = DeploymentInfo(
                                        session, deployment)
                                    if deployment_info.isValid():
                                        deployment_dict = deployment_info.get_dict()
                                        deployment_info = None
                                        valid_deployment = True
                                        # Remember where the next job
                                        # resumes within this page.
                                        if station_index + 1 <= len(
                                                list_stations) - 1:
                                            next_station = list_stations[
                                                station_index + 1]
                                        else:
                                            next_station = None
                                        entry_guid = unicode(uuid.uuid4())
                                        entry_id = '{}_{}'.format(
                                            station_dict['id'],
                                            deployment_dict['id'])
                                        entry_name = clean_snakecase(
                                            entry_id)
                                        self.log.debug(
                                            'Gathering %s', entry_name)
                                        content = {}
                                        content['station'] = station_dict
                                        content[
                                            'observation'] = observation_dict
                                        content[
                                            'deployment'] = deployment_dict
                                        package_query = Session.query(
                                            Package)
                                        query_filtered = package_query.filter(
                                            Package.name == entry_name)
                                        package = query_filtered.first()
                                        if package:
                                            # Meaning we've previously harvested this,
                                            # but we may want to reharvest it now.
                                            # NOTE(review): entry_guid is a
                                            # fresh uuid, so this lookup
                                            # appears unable to match —
                                            # confirm guid semantics.
                                            previous_obj = Session.query(HarvestObject) \
                                                .filter(HarvestObject.guid == entry_guid) \
                                                .filter(HarvestObject.current == True) \
                                                .first()  # noqa: E712
                                            if previous_obj:
                                                previous_obj.current = False
                                                previous_obj.save()
                                            if self.update_all:
                                                self.log.debug(
                                                    '{} already exists and will be updated.'
                                                    .format(entry_name)
                                                )  # noqa: E501
                                                status = 'change'
                                            else:
                                                self.log.debug(
                                                    '{} will not be updated.'
                                                    .format(entry_name)
                                                )  # noqa: E501
                                                status = 'unchanged'
                                        elif not package:
                                            # It's a product we haven't harvested before.
                                            self.log.debug(
                                                '{} has not been harvested before. Creating a new harvest object.'
                                                .format(entry_name))  # noqa: E501
                                            status = 'new'
                                        # Persist full resume state on
                                        # the object's extras.
                                        obj = HarvestObject(
                                            guid=entry_guid,
                                            job=self.job,
                                            extras=[
                                                HOExtra(key='status',
                                                        value=status),
                                                HOExtra(key='last_token',
                                                        value=last_token),
                                                HOExtra(key='next_token',
                                                        value=next_token),
                                                HOExtra(
                                                    key='next_station',
                                                    value=next_station),
                                                HOExtra(key='restart_date',
                                                        value=restart_date)
                                            ])
                                        obj.content = json.dumps(content)
                                        obj.package = None if status == 'new' else package
                                        obj.save()
                                        ids.append(obj.id)
                                if not valid_deployment:
                                    self.log.debug(
                                        'Station {} does not have valid deployments.'
                                        .format(station))
                        else:
                            self.log.debug(
                                'Station {} is not valid.'.format(station))
                station_index += 1
    return ids
def gather_stage(self, harvest_job):
    """Gather SCENT WFS features and return harvest-object ids."""
    self.log = logging.getLogger(__file__)
    self.log.debug('SCENT Harvester gather_stage for job: %r', harvest_job)

    self.job = harvest_job
    self.source_config = self._get_config(harvest_job)
    max_dataset = self.source_config.get('max_dataset', 100)
    wfs_url = self.source_config.get('wfs_url')
    wfs_version = self.source_config.get('wfs_version')
    collection = self.source_config.get('collection')
    typename = COLLECTION[collection].get('collection_typename')
    tag_typename = COLLECTION[collection].get('tag_typename', None)
    self.update_all = self.source_config.get('update_all', False)

    # Resume one past the last harvested feature index.
    last_product_index = (
        self._get_last_harvesting_index(harvest_job.source_id)
    )
    if last_product_index:
        last_product_index = last_product_index + 1
    else:
        last_product_index = 0

    wfs = WFS(url=wfs_url, version=wfs_version)
    wfs.set_collection(typename)
    sortby = ['When']

    result = wfs.make_request(max_dataset, sortby, last_product_index)
    entries = result['features']
    name = '{}_{}'.format(collection.lower(), '{}')
    ids = []
    for entry in entries:
        entry_guid = unicode(uuid.uuid4())
        entry_name = name.format(convert_to_clean_snakecase(entry['id']))
        log.debug('gathering %s', entry_name)

        content = {}
        content['collection_content'] = entry
        if tag_typename:
            wfs.set_collection(tag_typename)
            filterxml = wfs.set_filter_equal_to('image_id', entry['id'])
            # BUG FIX: a redundant wfs.make_request(constraint=filterxml)
            # call was removed here — its result was immediately
            # overwritten by get_request(), wasting one request per entry.
            result = wfs.get_request(constraint=filterxml)
            content['tag_url'] = result

        package = Session.query(Package) \
            .filter(Package.name == entry_name).first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()
            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(
                    entry_name))  # noqa: E501
                status = 'change'
            else:
                log.debug(
                    '{} will not be updated.'.format(entry_name))  # noqa: E501
                status = 'unchanged'
        else:
            # It's a product we haven't harvested before.
            log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
            status = 'new'

        obj = HarvestObject(
            guid=entry_guid,
            job=self.job,
            extras=[
                HOExtra(key='status', value=status),
                HOExtra(key='index', value=last_product_index)
            ])
        obj.content = json.dumps(content)
        obj.package = None if status == 'new' else package
        obj.save()
        last_product_index += 1
        ids.append(obj.id)
    return ids
def _crawl_results(self, harvest_url, limit=100, timeout=5, username=None, password=None, provider=None):  # noqa: E501
    """
    Iterate through the results, create harvest objects,
    and return the ids.

    Pages through `harvest_url` with HTTP basic auth until `limit` ids
    are collected or no next page exists; one request per second max.
    """
    ids = []
    new_counter = 0
    update_counter = 0

    while len(ids) < limit and harvest_url:
        # We'll limit ourselves to one request per second
        start_request = time.time()

        # Make a request to the website
        timestamp = str(datetime.utcnow())
        log_message = '{:<12} | {} | {} | {}s'
        try:
            r = requests.get(harvest_url,
                             auth=HTTPBasicAuth(username, password),
                             verify=False, timeout=timeout)
        except Timeout as e:
            # Timeout: record gather error and return partial ids.
            self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
            status_code = 408
            elapsed = 9999
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(log_message.format(self.provider,
                                          timestamp, status_code, timeout))  # noqa: E128
            return ids
        if r.status_code != 200:
            # Non-OK response: same abort-with-partial-ids behavior.
            self._save_gather_error('{} error: {}'.format(r.status_code, r.text), self.job)  # noqa: E501
            elapsed = 9999
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(log_message.format(self.provider,
                                          timestamp, r.status_code, elapsed))  # noqa: E128
            return ids

        if hasattr(self, 'provider_logger'):
            self.provider_logger.info(log_message.format(self.provider,
                                      timestamp, r.status_code,
                                      r.elapsed.total_seconds()))  # noqa: E128, E501

        soup = Soup(r.content, 'lxml')

        # Get the URL for the next loop, or None to break the loop
        harvest_url = self._get_next_url(soup)

        # Get the entries from the results
        entries = self._get_entries_from_results(soup)

        # Create a harvest object for each entry
        for entry in entries:
            entry_guid = entry['guid']
            entry_name = entry['identifier']
            entry_restart_date = entry['restart_date']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                # We need package_show to ensure that all the conversions
                # are carried out.
                context = {"user": "******", "ignore_auth": True,
                           "model": model, "session": Session}
                pkg_dict = logic.get_action('package_show')(context, {"id": package.name})  # noqa: E501
                previous_obj = model.Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                    status = 'change'
                    update_counter += 1
                # E.g., a Sentinel dataset exists,
                # but doesn't have a NOA resource yet.
                elif self.flagged_extra and not get_pkg_dict_extra(pkg_dict, self.flagged_extra):  # noqa: E501
                    log.debug('{} already exists and will be extended.'.format(entry_name))  # noqa: E501
                    status = 'change'
                    update_counter += 1
                else:
                    log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                    status = 'unchanged'

                obj = HarvestObject(guid=entry_guid, job=self.job,
                                    extras=[HOExtra(key='status',
                                                    value=status),
                                            HOExtra(key='restart_date',
                                                    value=entry_restart_date)])
                obj.content = entry['content']
                obj.package = package
                obj.save()
                ids.append(obj.id)
            elif not package:
                # It's a product we haven't harvested before.
                log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
                obj = HarvestObject(guid=entry_guid, job=self.job,
                                    extras=[HOExtra(key='status',
                                                    value='new'),
                                            HOExtra(key='restart_date',
                                                    value=entry_restart_date)])
                new_counter += 1
                obj.content = entry['content']
                obj.package = None
                obj.save()
                ids.append(obj.id)

        # Throttle: sleep out the remainder of the one-second budget.
        end_request = time.time()
        request_time = end_request - start_request
        if request_time < 1.0:
            time.sleep(1 - request_time)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(harvester_msg.format(self.provider,
                                   timestamp, self.job.id,
                                   new_counter, update_counter))  # noqa: E128, E501
    return ids
def gather_stage(self, harvest_job):
    """Gather SatcenBetter products and return harvest-object ids."""
    self.log = logging.getLogger(__file__)
    self.log.debug('SatcenBetter Harvester gather_stage for job: %r',
                   harvest_job)

    self.job = harvest_job
    self.source_config = self._get_config(harvest_job)
    self.update_all = self.source_config.get('update_all', False)
    interface = INTERFACE(self.source_config, COLLECTION)

    # Resume pagination where the previous job stopped.
    last_product_index = (self._get_last_harvesting_index(
        harvest_job.source_id, interface))
    interface.update_index(last_product_index)
    interface.build_url()
    log.debug('URL: {}'.format(interface.current_url))  # noqa: E501

    ids = []
    try:
        results = interface.get_results()
    except Timeout as e:
        self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
        return ids
    # Error responses come back as a dict carrying status/message.
    if not isinstance(results, list):
        self._save_gather_error('{} error: {}'.format(
            results['status_code'], results['message']), self.job)  # noqa: E501
        return ids

    for entry in results:
        name_path = interface.get_name_path()

        name_url = get_field(entry,
                             name_path['relative_location'].split(","),
                             name_path['fixed_attributes'])
        entry_name = parse_name(name_url).lower()
        entry_guid = unicode(uuid.uuid4())

        package = Session.query(Package) \
            .filter(Package.name == entry_name).first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()
            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(
                    entry_name))  # noqa: E501
                status = 'change'
            else:
                log.debug('{} will not be updated.'.format(
                    entry_name))  # noqa: E501
                status = 'unchanged'
        else:
            # It's a product we haven't harvested before.
            log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
            status = 'new'

    obj = HarvestObject(
        guid=entry_guid,
        job=self.job,
        extras=[
            HOExtra(key='status', value=status),
            HOExtra(key=interface.get_pagination_mechanism(),
                    value=interface.get_index())
        ])
def _gather_entry(self, entry, path, row, update_all=False):
    """Create and save one harvest object for a path/row scene entry.

    Returns the id of the saved HarvestObject.
    """
    entry_guid = unicode(uuid.uuid4())
    entry_name = entry.lower()  # noqa: E501
    log.debug('gathering %s', entry)

    package = Session.query(Package) \
        .filter(Package.name == entry_name).first()

    if package:
        # Previously harvested: retire the old "current" object.
        # NOTE(review): entry_guid is a fresh uuid, so this lookup seems
        # unable to match an older object — confirm guid semantics.
        previous_obj = Session.query(HarvestObject) \
            .filter(HarvestObject.guid == entry_guid) \
            .filter(HarvestObject.current == True) \
            .first()  # noqa: E712
        if previous_obj:
            previous_obj.current = False
            previous_obj.save()
        if update_all:
            log.debug('{} already exists and will be updated.'.format(
                entry_name))  # noqa: E501
            status = 'change'
        else:
            log.debug(
                '{} will not be updated.'.format(entry_name))  # noqa: E501
            status = 'unchanged'
    else:
        # It's a product we haven't harvested before.
        log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
        status = 'new'

    # The object construction was copy-pasted in all three branches of
    # the original; build it once from the computed status.
    obj = HarvestObject(
        guid=entry_guid,
        job=self.job,
        extras=[
            HOExtra(key='status', value=status),
            HOExtra(key='path', value=path),
            HOExtra(key='row', value=row)
        ])
    obj.content = entry
    obj.package = package if status != 'new' else None
    obj.save()
    return obj.id
def _gather(self, job, config):
    """Gather Deimos FTP products and return harvest-object ids."""
    ftp_user = config['username']
    ftp_passwd = config['password']
    source_type = config['harvester_type']
    ftp_source = create_ftp_source(source_type)
    if not hasattr(self, 'harvester_logger'):
        self.harvester_logger = self.make_harvester_logger()
    self.provider = 'deimos_imaging'

    existing_files = ftp_source._get_ftp_urls(ftp_user, ftp_passwd)

    # Cache per-raw-id metadata so each L0R id is fetched only once.
    metadata_dict = {}
    ids = []
    new_counter = 0
    for ftp_url in existing_files:
        filename = self.parse_filename(ftp_url)
        product_type = self.parse_filedirectory(ftp_url)
        identifier = filename

        content = {'identifier': identifier, 'product_type': product_type, 'ftp_link': ftp_url}  # noqa: E501

        raw_id = identifier.replace(product_type, 'L0R')
        metadata = metadata_dict.get(raw_id)
        if metadata is None:
            metadata = self._get_metadata(raw_id)
            metadata_dict[raw_id] = metadata
        content.update(metadata)
        content = json.dumps(content, default=str)

        package = Session.query(Package) \
            .filter(Package.name == identifier.lower()).first()

        if package:
            log.debug('{} will not be updated.'.format(identifier))  # noqa: E501
            status = 'unchanged'
        else:
            log.debug('{} has not been harvested before. Creating a new harvest object.'.format(identifier))  # noqa: E501
            status = 'new'
            new_counter += 1

        obj = HarvestObject(job=job, guid=ftp_url,
                            extras=[HOExtra(key='status', value=status)])
        obj.content = content
        obj.package = package if package else None
        obj.save()
        ids.append(obj.id)

    harvester_msg = '{:<12} | {} | Job ID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(harvester_msg.format(self.provider,
                                   timestamp, job.id,
                                   new_counter, '0'))  # noqa: E128, E501
    return ids
def _parse_products(self, products, mosquito_type):  # noqa: E501
    """
    Iterate through the results, create harvest objects,
    and return the ids.

    Each product dict is annotated in place ('mosquito_type',
    'filename', possibly 'dt_corrected') before serialization.
    """
    ids = []
    new_counter = 0

    # Create a harvest object for each entry
    for entry in products:
        # Add mosquito type on object
        entry['mosquito_type'] = mosquito_type
        # Correct Date
        # A leading '00' looks like a truncated year ('00YY-…');
        # rewrite it as '20YY-…' — presumably all data is post-2000
        # (TODO confirm with the data provider).
        if entry['dt_placement'].startswith('00'):
            entry['dt_corrected'] = '20' + entry['dt_placement'][2:]
            filename = "{}_{}_{}".format(mosquito_type,
                                         entry['station_id'],
                                         entry['dt_corrected'])
        else:
            filename = "{}_{}_{}".format(mosquito_type,
                                         entry['station_id'],
                                         entry['dt_placement'])
        # Sanitize filename
        filename = self._sanitize_filename(filename)
        # Add coast_mean on aedes for uniqueness
        if mosquito_type == 'aedes':
            filename = filename + '_' + str(
                int(entry['coast_mean_dist_1000']))
        entry_guid = filename
        entry_name = filename
        entry['filename'] = filename
        entry_restart_date = entry['dt_placement']

        package = Session.query(Package) \
            .filter(Package.name == entry_name).first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = model.Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()
            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(
                    entry_name))  # noqa: E501
                status = 'change'
            else:
                log.debug('{} will not be updated.'.format(
                    entry_name))  # noqa: E501
                status = 'unchanged'

            obj = HarvestObject(guid=entry_guid,
                                job=self.job,
                                extras=[
                                    HOExtra(key='status', value=status),
                                    HOExtra(key='restart_date',
                                            value=entry_restart_date)
                                ])
            obj.content = json.dumps(entry)
            obj.package = package
            obj.save()
            ids.append(obj.id)
        elif not package:
            # It's a product we haven't harvested before.
            log.debug(
                '{} has not been harvested before. Creating a new harvest object.'
                .format(entry_name))  # noqa: E501
            obj = HarvestObject(guid=entry_guid,
                                job=self.job,
                                extras=[
                                    HOExtra(key='status', value='new'),
                                    HOExtra(key='restart_date',
                                            value=entry_restart_date)
                                ])
            new_counter += 1
            obj.content = json.dumps(entry)
            obj.package = None
            obj.save()
            ids.append(obj.id)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(
            harvester_msg.format(self.provider, timestamp, self.job.id,
                                 new_counter, 0))  # noqa: E128, E501
    return ids
def gather_stage(self, harvest_job):
    """Gather VITO products and return harvest-object ids."""
    self.log = logging.getLogger(__file__)
    self.log.debug('VITO Harvester gather_stage for job: %r', harvest_job)

    self.job = harvest_job
    self.source_config = self._get_config(harvest_job)
    self.update_all = self.source_config.get('update_all', False)
    interface = INTERFACE(self.source_config, COLLECTION)

    # Resume pagination where the previous job stopped.
    last_product_index = (
        self._get_last_harvesting_index(harvest_job.source_id, interface)
    )
    interface.update_index(last_product_index)
    interface.build_url_date()
    path_to_entries = interface.get_entries_path()

    ids = []
    try:
        results = interface.get_results()
        if results:
            entries = self.get_field(results, path_to_entries[:])
        else:
            return ids
    except Timeout as e:
        self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
        return ids
    except Exception as e:
        # Best-effort behavior is kept (return the ids gathered so far),
        # but the failure is no longer swallowed silently.
        self.log.warning('Gathering aborted by unexpected error: %s', e)
        return ids
    if entries is None:
        return ids
    elif not isinstance(entries, list):
        # A single-entry response arrives as a bare dict; normalize it.
        entries = [entries]

    identifier_path = interface.get_identifier_path()
    for entry in entries:
        entry_id = self.clean_snakecase(
            self.get_field(entry, identifier_path[:])[0])
        entry_guid = unicode(uuid.uuid4())

        package = Session.query(Package) \
            .filter(Package.name == entry_id).first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()
            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(
                    entry_id))  # noqa: E501
                status = 'change'
            else:
                log.debug(
                    '{} will not be updated.'.format(entry_id))  # noqa: E501
                status = 'unchanged'
        else:
            # It's a product we haven't harvested before.
            log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_id))  # noqa: E501
            status = 'new'

        obj = HarvestObject(
            guid=entry_guid,
            job=self.job,
            extras=[
                HOExtra(key='status', value=status),
                HOExtra(key=interface.get_pagination_mechanism(),
                        value=interface.get_index())
            ])
        obj.content = json.dumps(entry)
        obj.package = None if status == 'new' else package
        obj.save()
        interface.increment_index()
        ids.append(obj.id)
    return ids
def _crawl_results(self, harvest_url, limit=100, timeout=5):  # noqa: E501
    """Iterate through the CSW results, create harvest objects,
    and return the ids.

    :param harvest_url: first results page URL; follow-up pages come from
        ``_get_next_url`` until it returns None or ``limit`` is reached.
    :param limit: maximum number of harvest objects to create.
    :param timeout: per-request timeout in seconds.
    """
    ids = []
    new_counter = 0
    update_counter = 0
    base_url = self.source_config.get('source_url')
    while len(ids) < limit and harvest_url:
        # We'll limit ourselves to one request per second
        start_request = time.time()

        soup = self._make_request(harvest_url, timeout)
        if not soup:
            return ids

        search_results = soup.find('csw:searchresults', elementset="summary")
        # Attribute values are strings from a remote response; convert
        # with int(), never eval() (eval on remote input is code
        # injection).
        records_returned = search_results['numberofrecordsreturned']
        next_record = search_results['nextrecord']
        number_records_matched = search_results['numberofrecordsmatched']
        if next_record != '0':
            current_record = str(int(next_record) - int(records_returned))  # noqa: E501
        else:
            current_record = str(int(number_records_matched) - int(records_returned))  # noqa: E501

        # Get the URL for the next loop, or None to break the loop
        # Only works if StartPosition is last URL parameter
        harvest_url = self._get_next_url(harvest_url, records_returned, next_record, limit)  # noqa: E501

        # Get the entries from the results
        entries = self._get_entries_from_results(soup, current_record, next_record)  # noqa: E501

        # Create a harvest object for each entry
        for entry in entries:
            entry_guid = entry['guid']
            entry_name = 'saeon_csag_' + entry['identifier'].lower().replace('.', '_').replace('/', '-')  # noqa: E501
            full_content = {
                'extra_content': self._get_entry_time_and_author(base_url, entry['identifier'], timeout),  # noqa: E501
                'raw_content': entry['content'],
            }
            # Use the imported Session consistently (the original mixed
            # Session and model.Session, which are the same session).
            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()
            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()
                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                    status = 'change'
                    update_counter += 1
                else:
                    log.debug('{} already exists and will not be updated.'.format(entry_name))  # noqa: E501
                    status = 'unchanged'
            else:
                # It's a product we haven't harvested before.
                log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
                status = 'new'
                new_counter += 1
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value=status),
                    HOExtra(key='restart_record',
                            value=entry['restart_record'])
                ])  # noqa: E501
            obj.content = json.dumps(full_content)
            obj.package = None if status == 'new' else package
            obj.save()
            ids.append(obj.id)

        end_request = time.time()
        request_time = end_request - start_request
        if request_time < 1.0:
            time.sleep(1 - request_time)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(harvester_msg.format(self.provider, timestamp, self.job.id, new_counter, update_counter))  # noqa: E128, E501

    return ids
def _crawl_results(self, harvest_url, limit=100, timeout=5):  # noqa: E501
    """Iterate through the CSW results, create harvest objects,
    and return the ids.

    :param harvest_url: first results page URL; follow-up pages come from
        ``_get_next_url`` until it returns None or ``limit`` is reached.
    :param limit: maximum number of harvest objects to create.
    :param timeout: per-request timeout in seconds.
    """
    ids = []
    new_counter = 0
    update_counter = 0
    while len(ids) < limit and harvest_url:
        # We'll limit ourselves to one request per second
        start_request = time.time()

        # Make a request to the website
        timestamp = str(datetime.utcnow())
        log_message = '{:<12} | {} | {} | {}s'
        try:
            r = requests.get(harvest_url, timeout=timeout)
        except Timeout as e:
            self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
            if hasattr(self, 'provider_logger'):
                # 408 = request timeout; no response was received, so log
                # the configured timeout in the elapsed slot.
                self.provider_logger.info(
                    log_message.format(self.provider, timestamp,
                                       408, timeout))  # noqa: E128
            return ids
        if r.status_code != 200:
            self._save_gather_error('{} error: {}'.format(
                r.status_code, r.text), self.job)  # noqa: E501
            elapsed = 9999
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(self.provider, timestamp,
                                       r.status_code, elapsed))  # noqa: E128
            return ids
        if hasattr(self, 'provider_logger'):
            self.provider_logger.info(
                log_message.format(self.provider, timestamp, r.status_code,
                                   r.elapsed.total_seconds()))  # noqa: E128, E501

        soup = Soup(r.content, 'lxml')
        search_results = soup.find('csw:searchresults', elementset="summary")
        # Attribute values are strings from a remote response; convert
        # with int(), never eval() (eval on remote input is code
        # injection).
        records_returned = search_results['numberofrecordsreturned']
        next_record = search_results['nextrecord']
        number_records_matched = search_results['numberofrecordsmatched']
        if next_record != '0':
            current_record = str(
                int(next_record) - int(records_returned))  # noqa: E501
        else:
            current_record = str(
                int(number_records_matched) - int(records_returned))  # noqa: E501

        # Get the URL for the next loop, or None to break the loop
        # Only works if StartPosition is last URL parameter
        harvest_url = self._get_next_url(harvest_url, records_returned, next_record, limit)  # noqa: E501

        # Get the entries from the results
        entries = self._get_entries_from_results(soup, current_record, next_record)  # noqa: E501

        # Create a harvest object for each entry
        for entry in entries:
            entry_guid = entry['guid']
            entry_name = entry['identifier']
            # Use the imported Session consistently (the original mixed
            # Session and model.Session, which are the same session).
            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()
            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()
                if self.update_all:
                    log.debug(
                        '{} already exists and will be updated.'.format(
                            entry_name))  # noqa: E501
                    status = 'change'
                    update_counter += 1
                else:
                    log.debug('{} already exists and will not be updated.'.
                              format(entry_name))  # noqa: E501
                    status = 'unchanged'
            else:
                # It's a product we haven't harvested before.
                log.debug(
                    '{} has not been harvested before. Creating a new harvest object.'  # noqa: E501
                    .format(entry_name))  # noqa: E501
                status = 'new'
                new_counter += 1
            obj = HarvestObject(
                guid=entry_guid,
                job=self.job,
                extras=[
                    HOExtra(key='status', value=status),
                    HOExtra(key='restart_record',
                            value=entry['restart_record'])
                ])  # noqa: E501
            obj.content = entry['content']
            obj.package = None if status == 'new' else package
            obj.save()
            ids.append(obj.id)

        end_request = time.time()
        request_time = end_request - start_request
        if request_time < 1.0:
            time.sleep(1 - request_time)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(
            harvester_msg.format(self.provider, timestamp, self.job.id,
                                 new_counter, update_counter))  # noqa: E128, E501

    return ids