Example No. 1
def harvest_object_create(context, data_dict):
    """ Create a new harvest object

    :type guid: string (optional)
    :type content: string (optional)
    :type job_id: string
    :type source_id: string (optional)
    :type package_id: string (optional)
    :type extras: dict (optional)
    """
    check_access('harvest_object_create', context, data_dict)
    data, errors = _validate(
        data_dict, harvest_object_create_schema(), context)

    if errors:
        raise logic.ValidationError(errors)

    obj = HarvestObject(
        guid=data.get('guid'),
        content=data.get('content'),
        job=data['job_id'],
        harvest_source_id=data.get('source_id'),
        package_id=data.get('package_id'),
        extras=[HarvestObjectExtra(key=k, value=v)
                for k, v in data.get('extras', {}).items()]
    )

    obj.save()
    return harvest_object_dictize(obj, context)
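
The action above validates data_dict against harvest_object_create_schema(), stores one HarvestObject row (plus a HarvestObjectExtra row per entry in extras) and returns it dictized. A minimal, hypothetical usage sketch via CKAN's action interface follows; the job id and field values are placeholders, not taken from the example:

# Hypothetical usage sketch: invoking the action through CKAN's toolkit.
# The ids and values are placeholders; a real call needs an existing
# HarvestJob id and an appropriate user context.
from ckan.plugins import toolkit

context = {'user': 'harvest-admin'}
data_dict = {
    'guid': 'remote-dataset-1234',        # remote identifier (optional)
    'content': '<harvested metadata>',    # raw harvested content (optional)
    'job_id': 'existing-harvest-job-id',  # required
    'extras': {'status': 'new'},
}
harvest_object = toolkit.get_action('harvest_object_create')(context, data_dict)
print(harvest_object['id'])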
Example No. 2
    def gather_stage(self, harvest_job):
        log.debug('In SocrataHarvester 2 gather_stage (%s)' %
                  harvest_job.source.url)
        get_all_packages = True

        dcatUrl = "%s/api/dcat.rdf" % harvest_job.source.url.rstrip('/')
        log.debug(dcatUrl)

        adaptorInstance = socrataAdaptor()
        package_ids = adaptorInstance.listDatasetIds(dcatUrl)
        print('****')
        print(len(package_ids))
        print(package_ids)

        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:
                    if "http" not in package_id:
                        # Create a new HarvestObject for this identifier
                        obj = HarvestObject(guid=package_id, job=harvest_job)
                        obj.save()
                        object_ids.append(obj.id)

                return object_ids

            else:
                self._save_gather_error('No packages received for URL: %s' %
                                        dcatUrl, harvest_job)
                return None
        except Exception as e:
            self._save_gather_error('%r' % e, harvest_job)
    def gather_stage(self, harvest_job):

        print('Html Harvest Gather Stage')
        db2 = client.odm
        collection = db2.html_jobs
        backupi = 0
        ## Get source URL
        source_url = harvest_job.source.url
        ## mongoDb connection
        document = collection.find_one({"cat_url": source_url})
        id1 = document['_id']
        if 'btn_identifier' in document.keys():
            if document['btn_identifier'] is not None and document[
                    'btn_identifier'] != '':
                cat_url = document['cat_url']
                dataset_identifier = document['identifier']
                btn_identifier = document['btn_identifier']
                action_type = document['action_type']
                # Default to a 3-second delay when no sleep_time is configured
                sleep_time = document.get('sleep_time', 3)
                package_ids = javascript_case.ParseJavascriptPages(
                    cat_url, dataset_identifier, btn_identifier, action_type,
                    sleep_time)
                print(package_ids)
            else:
                package_ids = harvester_final.read_data(id1, backupi)
        else:
            package_ids = harvester_final.read_data(id1, backupi)
        print(package_ids)
        #print(len(package_ids))
        #package_ids=[]
        #package_ids.append('http://data.belgium.be/dataset/mortality-tables-gender')
        #package_ids.append('test')
        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)

                return object_ids

            else:
                self._save_gather_error(
                    'No packages received for URL: %s' % source_url,
                    harvest_job)
                return None
        except Exception as e:
            self._save_gather_error('%r' % e, harvest_job)
    def gather_stage(self, harvest_job):

        if harvest_job.source.url.startswith('basic_test'):
            obj = HarvestObject(guid='test1', job=harvest_job)
            obj.extras.append(HarvestObjectExtra(key='key', value='value'))
            obj2 = HarvestObject(guid='test2', job=harvest_job)
            obj3 = HarvestObject(guid='test_to_delete', job=harvest_job)
            obj.add()
            obj2.add()
            obj3.save()  # save() commits the session, persisting all three objects
            return [obj.id, obj2.id, obj3.id]

        return []
    def gather_stage(self, harvest_job):

        print("Html Harvest Gather Stage")
        db2 = client.odm
        collection = db2.html_jobs
        backupi = 0
        ## Get source URL
        source_url = harvest_job.source.url
        ## mongoDb connection
        document = collection.find_one({"cat_url": source_url})
        id1 = document["_id"]
        if "btn_identifier" in document.keys():
            if document["btn_identifier"] != None and document["btn_identifier"] != "":
                cat_url = document["cat_url"]
                dataset_identifier = document["identifier"]
                btn_identifier = document["btn_identifier"]
                action_type = document["action_type"]
                # Default to a 3-second delay when no sleep_time is configured
                sleep_time = document.get("sleep_time", 3)
                package_ids = javascript_case.ParseJavascriptPages(
                    cat_url, dataset_identifier, btn_identifier, action_type, sleep_time
                )
                print(package_ids)
            else:
                package_ids = harvester_final.read_data(id1, backupi)
        else:
            package_ids = harvester_final.read_data(id1, backupi)
        print(package_ids)
        # print(len(package_ids))
        # package_ids=[]
        # package_ids.append('http://data.belgium.be/dataset/mortality-tables-gender')
        # package_ids.append('test')
        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)

                return object_ids

            else:
                self._save_gather_error("No packages received for URL: %s" % source_url, harvest_job)
                return None
        except Exception as e:
            self._save_gather_error("%r" % e, harvest_job)
Example No. 6
    def _create_harvest_objects(self, remote_ids, harvest_job):
        '''
        Given a list of remote ids and a Harvest Job, create as many Harvest Objects and
        return a list of their ids to be passed to the fetch stage.

        TODO: Not sure it is worth keeping this function
        '''
        try:
            object_ids = []
            if len(remote_ids):
                for remote_id in remote_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=remote_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
                return object_ids
            else:
                self._save_gather_error('No remote datasets could be identified',
                                        harvest_job)
        except Exception as e:
            self._save_gather_error('%r' % e, harvest_job)
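
_create_harvest_objects captures the pattern repeated in the gather_stage examples above: one HarvestObject per remote identifier, with the list of object ids returned for the fetch stage. A hedged sketch of how a harvester's gather_stage might delegate to it; _get_remote_dataset_ids is an assumed helper, not something defined in these examples:

    def gather_stage(self, harvest_job):
        # Sketch only: list remote identifiers, then delegate object creation.
        # _get_remote_dataset_ids is a hypothetical helper for this harvester.
        log.debug('In gather_stage (%s)', harvest_job.source.url)
        try:
            remote_ids = self._get_remote_dataset_ids(harvest_job.source.url)
        except Exception as e:
            self._save_gather_error('Unable to list remote datasets: %r' % e,
                                    harvest_job)
            return None
        return self._create_harvest_objects(remote_ids, harvest_job)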
    def gather_stage(self, harvest_job):
        log.debug('In CustomHarvester gather_stage (%s)' % harvest_job.source.url)
        get_all_packages = True

        db = client.odm
        db_jobs = db.jobs
        config = db_jobs.find_one({"cat_url": harvest_job.source.url})
        datasets_list_url = config['datasets_list_url']
        datasets_list_identifier = config['datasets_list_identifier']
        dataset_id = config['dataset_id']
        api_key = config['apikey']
        if "data.norge.no" in harvest_job.source.url.rstrip('/'):
            many_datasets_list = ['/api/dcat/data.json?page=1',
                                  '/api/dcat/data.json?page=2',
                                  '/api/dcat/data.json?page=3',
                                  '/api/dcat/data.json?page=4']
        else:
            many_datasets_list = [datasets_list_url]

        all_datasets = []
        for list_path in many_datasets_list:
            url = harvest_job.source.url.rstrip('/') + list_path.replace('{api}', api_key)
            print(url)
            result = urllib2.urlopen(url)
            try:
                datasets = json.load(result)
                if datasets_list_identifier != "":
                    datasets = datasets[datasets_list_identifier]
            except Exception:
                try:
                    # Retry with an explicit Accept header
                    headers = {'Accept': 'application/json'}
                    r = urllib2.Request(url, headers=headers)
                    datasets = json.loads(urllib2.urlopen(r).read())
                    if datasets_list_identifier != "":
                        datasets = datasets[datasets_list_identifier]
                except Exception:
                    # JSONP-style response: strip the "null(...)" wrapper and evaluate it
                    result = urllib2.urlopen(url)
                    read = result.read()
                    read = read.replace("null(", "datasets=").rstrip(')')
                    exec(read)
            all_datasets.extend(datasets)

        package_ids = []
        for dataset in all_datasets:
            package_ids.append(dataset[dataset_id])

        #print('****package ids****')
        #print(package_ids)
        #print(len(package_ids))

        ### load existing dataset ids from mongoDb
        datasets = list(custom_db.find({'catalogue_url': harvest_job.source.url}))
        datasets_ids = []
        for dataset in datasets:
            datasets_ids.append(dataset['id'])

        ### check for deleted datasets that still exist in mongo
        for package_id in package_ids:
            if package_id in datasets_ids:
                datasets_ids.remove(package_id)
        for deleted_id in datasets_ids:
            for dataset in datasets:
                if deleted_id in dataset['id']:
                    document = dataset
                    document.update({"deleted_dataset": True})
                    custom_db.save(document)

        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)

                return object_ids

            else:
                self._save_gather_error('No packages received for URL: %s' % url,
                                        harvest_job)
                return None
        except Exception as e:
            self._save_gather_error('%r' % e, harvest_job)
    def gather_stage(self, harvest_job):
        log.debug('In OpendatasoftHarvester 2 gather_stage (%s)' % harvest_job.source.url)
        get_all_packages = True

        url = harvest_job.source.url.rstrip('/') + "/api/datasets/1.0/search/?rows=100000000"
        result = json.load(urllib2.urlopen(url))
        datasets = result['datasets']
        package_ids = []
        for dataset in datasets:
            package_ids.append(dataset['datasetid'])

        #print('****package ids****')
        #print(package_ids)

        ### load existing dataset ids from mongoDb
        datasets = list(opendatasoft_db.find({'catalogue_url': harvest_job.source.url}))
        datasets_ids = []
        for dataset in datasets:
            datasets_ids.append(dataset['id'])

        ### check for deleted datasets that still exist in mongo
        for package_id in package_ids:
            if package_id in datasets_ids:
                datasets_ids.remove(package_id)
        for deleted_id in datasets_ids:
            for dataset in datasets:
                if deleted_id in dataset['id']:
                    document = dataset
                    document.update({"deleted_dataset": True})
                    opendatasoft_db.save(document)

        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)

                return object_ids

            else:
                self._save_gather_error('No packages received for URL: %s' % url,
                                        harvest_job)
                return None
        except Exception as e:
            self._save_gather_error('%r' % e, harvest_job)
    def gather_stage(self, harvest_job):
        log.debug('In SocrataHarvester 2 gather_stage (%s)' % harvest_job.source.url)
        get_all_packages = True

        dcatUrl = "%s/api/dcat.rdf" % harvest_job.source.url.rstrip('/')
        log.debug(dcatUrl)

        adaptorInstance = socrataAdaptor()
        package_ids = adaptorInstance.listDatasetIds(dcatUrl)
        #print('****')
        #print(len(package_ids))
        #print(package_ids)

        ## load existing dataset names and ids from mongoDb
        datasets = list(socrata_db.find({'catalogue_url': harvest_job.source.url.rstrip('/')}))
        datasets_ids = []
        datasets_names = []
        for dataset in datasets:
            datasets_ids.append(dataset['id'])
            datasets_names.append(dataset['name'])

        ## check for deleted datasets that still exist in mongo
        for package_id in package_ids:
            if package_id in datasets_ids:
                datasets_ids.remove(package_id)
            if package_id in datasets_names:
                datasets_names.remove(package_id)
        if len(datasets_names) < len(datasets_ids):
            for name in datasets_names:
                for dataset in datasets:
                    if name in dataset['name']:
                        document = dataset
                        document.update({"deleted_dataset": True})
                        socrata_db.save(document)
        else:
            for deleted_id in datasets_ids:
                for dataset in datasets:
                    if deleted_id in dataset['id']:
                        document = dataset
                        document.update({"deleted_dataset": True})
                        socrata_db.save(document)

        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:
                    if "http" not in package_id:
                        # Create a new HarvestObject for this identifier
                        obj = HarvestObject(guid=package_id, job=harvest_job)
                        obj.save()
                        object_ids.append(obj.id)

                return object_ids

            else:
                self._save_gather_error('No packages received for URL: %s' % dcatUrl,
                                        harvest_job)
                return None
        except Exception as e:
            self._save_gather_error('%r' % e, harvest_job)
    def gather_stage(self, harvest_job):

        print('Html Harvest Gather Stage')
        db2 = client.odm
        odm = db2.odm
        collection = db2.html_jobs
        backupi = 0
        mainurl = ""
        ctlg_url = harvest_job.source.url
        if 'http://' in ctlg_url:
            mainurl1 = ctlg_url[ctlg_url.find('http://') + 7:]
            mainurl = 'http://' + mainurl1[0:mainurl1.find('/')]
        if 'https://' in ctlg_url:
            mainurl1 = ctlg_url[ctlg_url.find('https://') + 8:]
            mainurl = 'https://' + mainurl1[0:mainurl1.find('/')]

        ## load existing dataset urls from mongoDb
        datasets = list(odm.find({'catalogue_url': mainurl}))
        if len(datasets) == 0:
            datasets = list(odm.find({'catalogue_url': harvest_job.source.url.rstrip('/') + '/'}))
        datasets_ids = []
        for dataset in datasets:
            datasets_ids.append(dataset['url'])

        ## Get source URL
        source_url = harvest_job.source.url
        ## look up the harvest configuration in mongoDb
        document = collection.find_one({"cat_url": source_url})
        id1 = document['_id']
        if 'btn_identifier' in document.keys():
            if document['btn_identifier'] is not None and document['btn_identifier'] != '':
                cat_url = document['cat_url']
                dataset_identifier = document['identifier']
                btn_identifier = document['btn_identifier']
                action_type = document['action_type']
                # Default to a 3-second delay when no sleep_time is configured
                sleep_time = document.get('sleep_time', 3)
                package_ids = javascript_case.ParseJavascriptPages(
                    cat_url, dataset_identifier, btn_identifier, action_type,
                    sleep_time)
            else:
                package_ids = harvester_final.read_data(id1, backupi)
        else:
            package_ids = harvester_final.read_data(id1, backupi)

        ## check for deleted datasets that still exist in mongo
        if len(package_ids) > 0:
            for package_id in package_ids:
                if package_id in datasets_ids:
                    datasets_ids.remove(package_id)
            for deleted_url in datasets_ids:
                for dataset in datasets:
                    if deleted_url == dataset['url']:
                        document = dataset
                        document.update({"deleted_dataset": True})
                        odm.save(document)

        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)

                return object_ids

            else:
                self._save_gather_error('No packages received for URL: %s' % source_url,
                                        harvest_job)
                return None
        except Exception as e:
            self._save_gather_error('%r' % e, harvest_job)


    def fetch_stage(self,harvest_object):
        log.debug('In CKANHarvester fetch_stage')
        self._set_config(harvest_object.job.source.config)