def check_structure(dict):
    """Print a report comparing the keys of a package dict with the schema.

    The expected field names are every schema field whose type is neither
    ``fixed`` nor ``calculated``, plus ``resources`` and
    ``validation_override``.  Nothing is returned; the report is written to
    standard output.

    :param dict: mapping of field name to value for one package.
    """
    # Field names the package dict is expected to carry.
    expected = []
    for ckan, pilot, field in schema.dataset_all_fields():
        if field['type'] not in [u'fixed', u'calculated']:
            expected.append(ckan)

    # Field names the schema flags as mandatory for all records.
    required = []
    for ckan, pilot, field in schema.dataset_all_fields():
        if field['mandatory'] == u'all':
            required.append(ckan)

    expected.extend(['resources', 'validation_override'])
    expected_set = set(expected)

    # NOTE: this is a symmetric difference, so it holds names present on
    # exactly one side (expected-but-absent AND present-but-unexpected).
    missing_fields = set(dict).symmetric_difference(expected_set)
    mandatory_fields = set(required) & expected_set

    print("%s %s" % ("Missing Mandatory Fields",
                     missing_fields & mandatory_fields))
    print("%s %s" % ("Missing Values ",
                     [key for key, value in dict.items() if value == 'MISSING']))
    print("------------- Details ---------------")
    print("Fields Missing from Package_dict")
    pprint(list(missing_fields))
    print("Mandatory Fields that are not fixed or calculated")
    pprint(mandatory_fields)
def table():
    """Print a left-aligned table listing every dataset field of the schema.

    One row is emitted per entry of
    ``schema_description.dataset_all_fields()``: a running number, the CKAN
    name, the English description and the pilot name.  Nothing is returned.
    """
    t = PrettyTable(['No.', 'CKAN Name', 'Description', 'Pilot Name'])
    t.padding_width = 1  # One space between column edges and contents (default)

    # Debug output kept from the original: writes a UTF-8 encoded right single
    # quotation mark before the table (presumably a terminal encoding probe).
    print(u'\u2019'.encode('utf-8'))

    for i, (ckan_name, pilot_name, field) in enumerate(
            schema_description.dataset_all_fields()):
        description = field['description']['eng']
        t.add_row([i, str(ckan_name), description, str(pilot_name)])

    # Left-align every column; set once, after the rows are in place.
    t.align = 'l'
    print(t)
def _process_node(self, node):
    """Convert one pilot record node into a package dict and write it out.

    Every schema field is looked up in ``node`` through its pilot name.
    Values of the form ``<label>|<uuid>`` are translated through the field's
    ``choices_by_pilot_uuid`` table.  The resulting dict is pretty-printed
    and appended to ``self.out`` as one JSON line.

    Fields listed in ``schema_description.all_resource_fields`` are also
    collected into a single resource dict, which becomes the only entry of
    ``package_dict['resources']`` (it may be empty).

    :param node: element supporting ``xpath()`` for one pilot record.
    """
    package_dict = {'resources': [], 'tags': []}
    # One resource per record, filled in across all schema fields.  It is
    # created once here so that values are not discarded between iterations.
    resource_dict = {}

    for ckan_name, pilot_name, field in schema_description.dataset_all_fields():
        # 'id' and 'tags' are not taken from the pilot record.
        if ckan_name in ('id', 'tags'):
            continue
        try:
            if ckan_name == 'name':
                # temporary hack because name has not been mapped to
                # thisformid in the schema
                form_id = str(node.xpath("FORM[NAME='thisformid']/A/text()")[0])
                package_dict['name'] = "pilot-" + form_id.split("-")[0].lower()
                continue

            print("%s %s" % (ckan_name, pilot_name))
            # the simplest case, one to one mapping of values
            value = node.xpath("FORM[NAME='%s']/A/text()" % pilot_name)[0]
            if "|" in value:
                split_value = value.split("|")[1]
                value = field['choices_by_pilot_uuid'][split_value]['key']
            package_dict[ckan_name] = value

            # Reuse the mapped value instead of repeating the lookup, so an
            # unknown choice code cannot escape the KeyError handler below.
            if ckan_name in schema_description.all_resource_fields:
                resource_dict[ckan_name] = value
        except IndexError:
            # when None, eg. same as elif pilot_name is None:
            package_dict[ckan_name] = ''
            print("INDEX ERROR")
        except KeyError:
            print("KEY ERROR")

    package_dict['resources'] = [resource_dict]
    pprint(package_dict)
    self.out.write(json.dumps(package_dict) + "\n")
def process_record(node, out_path='/Users/peder/Desktop/data.json'):
    """Build a CKAN-style dict from one pilot record, dump it and exit.

    Each schema field is read from ``node`` through its pilot name.  Values
    whose prefix before ``|`` is a key of ``mappings.code_mapping_strategies``
    are translated by that strategy; fields absent from the node receive a
    default.  Extra package fields are moved under ``extras`` and resource
    fields are given placeholder values on a single resource.

    WARNING: this is exploratory code.  After writing the JSON file it calls
    ``sys.exit()``, so it never returns to its caller.

    :param node: element supporting ``xpath()`` for one pilot record.
    :param out_path: file the resulting dict is written to as JSON.
    """
    data = {}
    extras = {}
    resource = {}

    for ckan_name, pilot_name, field in schema_description.dataset_all_fields():
        try:
            # the simplest case, one to one mapping of values
            value = node.xpath("FORM[NAME='%s']/A/text()" % pilot_name)[0]
            pilot_code = value.split('|')
            if pilot_code[0] in mappings.code_mapping_strategies:
                strategy = mappings.code_mapping_strategies[pilot_code[0]]
                data[ckan_name] = strategy(pilot_code[1])
            else:
                data[ckan_name] = value
        except UnicodeDecodeError:
            # No value is stored for this field in this case.
            print("UNICODE ERROR")
        except IndexError:
            # same as elif pilot_name is None:
            if ckan_name == "name":
                data['name'] = "statcan-" + mappings.random_id()
                print("")
            elif ckan_name in mappings.default_strategies:
                data[ckan_name] = mappings.default_strategies[ckan_name]()
            else:
                data[ckan_name] = "default_" + ckan_name

        # reorganize dict for CKAN
        if ckan_name in schema_description.extra_package_fields:
            # The value may be absent (UnicodeDecodeError above); only move
            # it when it exists instead of failing with KeyError.
            if ckan_name in data:
                extras[ckan_name] = data.pop(ckan_name)
        # now populate packages
        elif ckan_name == 'url':
            resource['url'] = "http://www.statcan.gc.ca/cgi-bin/sum-som/fl/cstsaveascsv.cgi?filename=arts63a-fra.htm&lan=fre"
        elif ckan_name in schema_description.all_resource_fields:
            resource[ckan_name] = "default_package_value " + ckan_name

    data['extras'] = extras
    data['resources'] = [resource]
    data['groups'] = ["statcan"]

    # --- encoding experiments (debug output only, independent of `data`) ---
    s = "some\x00string. with\x15 funny characters"
    foo = "".join(ch for ch in s if ch in string.printable)
    print(foo)
    valid_utf8 = True
    try:
        foo.decode('utf-8')
    except UnicodeDecodeError:
        valid_utf8 = False
    print(valid_utf8)

    # `whatisthis` is defined elsewhere; presumably it reports the type of
    # its argument -- TODO confirm.
    whatisthis(data)
    data2 = json.dumps(data, encoding="utf-8")
    whatisthis(data2)
    data3 = "".join(ch for ch in s if ch in string.printable)
    whatisthis(data3)

    with open(out_path, 'w') as outfile:
        json.dump(data, outfile)
    sys.exit()
def _parse_fields(self, node):
    """Fill ``self.fields`` and ``self.resources`` from one pilot record.

    Three things are extracted from ``node``:

    * every schema field that has a pilot name, stored in ``self.fields``
      under that pilot name (``<label>|<uuid>`` values are translated
      through the field's ``choices_by_pilot_uuid`` table, missing values
      become ``""``);
    * ``language`` and, when both bounding-box corners are present and the
      lower-left one is not ``"N/A"``, a ``spatial`` polygon;
    * resources: ``dataset_link_en_1`` .. ``dataset_link_en_4`` (stopping at
      the first missing link) followed by the supplementary documents.

    :param node: element supporting ``xpath()`` for one pilot record.
    :raises ValueError: if a bounding-box corner cannot be split in two.
    """

    def split_corner(text):
        # Split "a b" into its two values.  Input such as '84 - 43' or
        # '41.5 - 141' has a detached dash that is really a minus sign:
        # re-attach it to the second value and keep it.
        try:
            first, second = text.split(" ")
        except ValueError:
            first, second = text.replace(" - ", " -").split(" ")
        return first, second

    # package fields
    for ckan_name, pilot_name, field in schema_description.dataset_all_fields():
        if not pilot_name:
            # ckan_name / field does not belong at PilotRecord level.
            # Process in CanadaRecord
            continue
        element = node.xpath("FORM[NAME='%s']/A/text()" % pilot_name)
        try:
            value = element[0].strip()
        except IndexError:
            self.fields[pilot_name] = ""
            continue
        if "|" in value:
            # Deal with Pilot UUID CODES
            split_value = value.split("|")[1]
            value = field['choices_by_pilot_uuid'][split_value]['key']
        self.fields[pilot_name] = value

    # Grab data that is not defined in schema
    self.fields['language'] = common.language(node)
    geo_lower_left = node.xpath("FORM[NAME='geo_lower_left']/A/text()")
    geo_upper_right = node.xpath("FORM[NAME='geo_upper_right']/A/text()")
    if geo_lower_left and geo_upper_right and geo_lower_left[0] != "N/A":
        print("%s %s %s" % ("GEO ", geo_upper_right, geo_lower_left))
        left, bottom = split_corner(geo_lower_left[0])
        right, top = split_corner(geo_upper_right[0])
        coordinates = [[left, bottom], [left, top], [right, top], [right, bottom]]
        self.fields['spatial'] = {'type': 'Polygon', 'coordinates': coordinates}

    # resources
    for i in range(1, 5):
        url = node.xpath("FORM[NAME='dataset_link_en_%d']/A/text()" % i)
        if not url:
            break
        resource_dict = {'url': url[0]}
        if "http://data.gc.ca/commonwebsol/fileuploads/C/4/0/C4060F22-17EB-450D-9B5E-A1216E75DF47/Dictionnaire" in resource_dict['url']:
            print("STOP")
        # Force a language from parent
        resource_dict['language'] = self.fields['language']
        fmt = node.xpath("FORM[NAME='dataset_format_%d']/A/text()" % i)
        if fmt:
            resource_dict['format'] = fmt[0].split("|")[1]
        self.resources.append(PilotResource(resource_dict, 'dataset_link_en_'))

    extras = ['supplementary_documentation_en',
              'supplementary_documentation_fr',
              'data_dictionary_fr',
              'dictionary_list:_en']
    for extra in extras:
        url = node.xpath("FORM[NAME='%s']/A/text()" % extra)
        if url:
            self.resources.append(PilotResource({'url': url[0]}, extra))
def process_node(self, count, node, language):
    """Build the package dict for one pilot record.

    The record's fields are mapped through the schema description, then a
    series of clean-ups is applied (region names, dates, keywords, and
    department-specific title and format fixes).

    :param count: running record number supplied by the caller (not used in
        the computation).
    :param node: element supporting ``xpath()`` / ``find()`` for one record.
    :param language: passed through to ``self.node_resources``.
    :returns: the package dict.
    """

    def reformat_date(date_string):
        # Accept "MM/DD/YYYY" or "YYYY/MM/DD" and return ISO "YYYY-MM-DD".
        try:
            timepoint = datetime.strptime(date_string.strip(), "%m/%d/%Y")
        except ValueError:
            timepoint = datetime.strptime(date_string.strip(), "%Y/%m/%d")
        return timepoint.date().isoformat()

    # Report records without an id; the id itself is stored in the loop below.
    if not node.xpath("FORM[NAME='thisformid']/A/text()"):
        print("%s %s" % ("======NO ID=========", node.xpath("DC.TITLE")[0].text))

    # Bounding box -> polygon.  `spatial` is bound before anything can fail
    # so that it is always defined when stored in the package dict.
    spatial = ''
    try:
        geo_lower_left = node.xpath("FORM[NAME='geo_lower_left']/A/text()")
        geo_upper_right = node.xpath("FORM[NAME='geo_upper_right']/A/text()")
        if geo_lower_left and geo_upper_right:
            left, bottom = geo_lower_left[0].split(" ")
            right, top = geo_upper_right[0].split(" ")
            coordinates = [[left, bottom], [left, top], [right, top], [right, bottom]]
            spatial = {'type': 'Polygon', 'coordinates': coordinates}
            print(spatial)
    except ValueError:
        # A corner did not consist of exactly two space-separated values.
        print("NO GEO")

    package_dict = {
        'resources': self.node_resources(node, language),
        'spatial': spatial,
    }

    for ckan_name, pilot_name, field in schema_description.dataset_all_fields():
        # These two are not taken from the pilot record.
        if ckan_name in ('name', 'tags'):
            continue
        try:
            if ckan_name == "id":
                package_dict['id'] = str(node.xpath("FORM[NAME='thisformid']/A/text()")[0]).lower()
                continue
            if ckan_name == 'title':
                t = node.xpath("FORM[NAME='title_en']/A/text()")[0]
                package_dict['title'] = self.strip_title(t)
                continue
            if ckan_name == 'title_fra':
                t_fr = node.xpath("FORM[NAME='title_fr']/A/text()")[0]
                # Filter out -version anglaise etc: keep the text before the
                # first matching marker, or the full title when none matches.
                for marker in common.language_markers_fra:
                    if marker in t_fr:
                        package_dict['title_fra'] = t_fr.split(marker)[0]
                        break
                else:
                    package_dict['title_fra'] = t_fr
                continue

            value = ''
            if pilot_name:
                if pilot_name == "url_fra":
                    print(pilot_name)
                result = node.xpath("FORM[NAME='%s']/A/text()" % pilot_name)
                if result:
                    value = result[0]

            if "|" in value:
                # "<label>|<uuid>": translate the uuid through the schema.
                split_value = value.split("|")[1]
                rval = field['choices_by_pilot_uuid'][split_value]
                package_dict[ckan_name] = rval['key']
                if pilot_name == "department":
                    package_dict['owner_org'] = rval['key']
            elif pilot_name == 'frequency':
                # An empty value is looked up under the '' key.
                package_dict['maintenance_and_update_frequency'] = pilot_frequency_list[value or '']
            else:
                package_dict[ckan_name] = value
        except IndexError:
            # when None, eg. same as elif pilot_name is None:
            package_dict[ckan_name] = ''
        except KeyError as e:
            print("%s %s %s %s" % ("KEY ERROR : ", ckan_name, pilot_name, e))
            package_dict[ckan_name] = ''

    # Filter out things that will not pass validation
    if package_dict['geographic_region'] == "Canada Canada":
        package_dict['geographic_region'] = ''
    region = package_dict['geographic_region']
    package_dict['geographic_region'] = region.replace(
        "Yukon Territory Territoire du Yukon", "Yukon Yukon")
    package_dict['author_email'] = '*****@*****.**'
    package_dict['catalog_type'] = schema_description.dataset_field_by_id['catalog_type']['choices'][0]['key']

    # Override validation
    package_dict['validation_override'] = validation_override

    # Fix dates
    try:
        t = common.time_coverage_fix(package_dict['time_period_coverage_start'],
                                     package_dict['time_period_coverage_end'])
        package_dict['time_period_coverage_start'] = common.timefix(t[0])
        package_dict['time_period_coverage_end'] = common.timefix(t[1])
    except KeyError:
        # Times were never set
        package_dict['time_period_coverage_start'] = "1000-01-01"
        package_dict['time_period_coverage_end'] = "3000-01-01"

    package_dict['date_published'] = package_dict['date_published'].replace("/", "-")
    for date_key in ('time_period_coverage_start',
                     'time_period_coverage_end',
                     'date_published'):
        package_dict[date_key] = check_date(package_dict[date_key])

    # Pending records carry no release date.
    package_dict['portal_release_date'] = '2013-05-24'
    if node.find("FLOWSTATUS").text == "pending":
        package_dict['portal_release_date'] = ''
    package_dict['ready_to_publish'] = True
    package_dict['license_id'] = 'ca-ogl-lgo'

    if "/" in package_dict['date_modified']:
        package_dict['date_modified'] = reformat_date(package_dict['date_modified'])

    # Normalise keyword punctuation, then drop keywords whose length is not
    # between 2 and 99 characters.
    key_eng = (package_dict['keywords']
               .replace("\n", " ").replace("/", "-").replace("(", "")
               .replace(")", "").replace(":", "-").replace(u"´", "'")
               .split(","))
    key_fra = (package_dict['keywords_fra']
               .replace("\n", " ").replace("/", "-").replace('"', '')
               .replace("(", "").replace(":", "-").replace(")", "")
               .split(","))
    package_dict['keywords'] = ",".join(
        [k.strip() for k in key_eng if len(k) < 100 and len(k) > 1])
    package_dict['keywords_fra'] = ",".join(
        [k for k in key_fra if len(k) < 100 and len(k) > 1])

    # Agriculture titles: keep only the text after the first matching marker.
    if package_dict['owner_org'] == 'aafc-aac':
        for title_key in ('title', 'title_fra'):
            for marker in agriculture_title_markers:
                if marker in package_dict[title_key]:
                    remainder = package_dict[title_key].split(marker)[1]
                    package_dict[title_key] = remainder.lstrip(" ")
                    break

    # Health Canada file resources are forced to TXT.
    if package_dict['owner_org'] == 'hc-sc':
        for resource in package_dict['resources']:
            if resource['resource_type'] == 'file':
                resource['format'] = 'TXT'

    return package_dict