def get_stats(self):
    import pprint
    printer = pprint.PrettyPrinter(indent=4)
    db = DatabasePlainFiles('stats/')
    stats = db.loadDbase('stats17028')
    #tag cloud
    """
    tag_cloud = []
    for tag in stats['tags']:
        if stats['tags'][tag] > 15: #5 is okay
            for i in range(int(stats['tags'][tag] / 15)):
                tag_cloud.append(tag)
    import json
    db.saveDbaseRaw('tag_cloud', json.dumps(tag_cloud))
    """
    #tags overall: total tag assignments and number of distinct tags
    tag_usage = 0
    tag_count = 0
    for tag in stats['tags']:
        tag_usage = tag_usage + stats['tags'][tag]
        tag_count = tag_count + 1
    print tag_usage
    print tag_count
    #format statistics
def process_download_all_log(self):
    db_logs = DatabasePlainFiles(self.log_folder)
    download_all_log = db_logs.loadDbaseRaw('download_all_log.txt')
    download_all_log = download_all_log.split('\n')
    resources_success = []
    resources_check = []
    resources_fail = []
    for line in download_all_log:
        if re.match("^Could not download", line):
            resources_fail.append(line)
            continue
        if line == '':
            continue
        # a log line carries the resource id in field 1 and the HTTP
        # status code in field 3 (whitespace-separated)
        resource_id = line.split()[1]
        status_code = int(line.split()[3])
        if status_code == 200:
            resources_success.append({resource_id: status_code})
        else:
            resources_check.append({resource_id: status_code})
    print len(resources_success)
    print len(resources_check)
    import pprint
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(resources_check)
def delete_bad_response(self):
    db_logs = DatabasePlainFiles(self.log_folder)
    download_all_log = db_logs.loadDbaseRaw('download_all_log.txt')
    download_all_log = download_all_log.split('\n')
    resources_success = []
    resources_check = []
    resources_fail = []
    for line in download_all_log:
        if re.match("^Could not download", line):
            resources_fail.append(line)
            continue
        if line == '':
            continue
        resource_id = line.split()[1]
        status_code = int(line.split()[3])
        if status_code == 200:
            resources_success.append(resource_id)
        else:
            resources_check.append(resource_id)
    # remove downloaded files for resources with a non-200 status code
    for resource in resources_check:
        if os.path.exists('files/' + resource):
            os.remove('files/' + resource)
    print 'resources clean-up complete!'
def get_failed_resources_ckan_urls(self):
    db_logs = DatabasePlainFiles(self.log_folder)
    resources_fail = db_logs.loadDbaseRaw('resources_fail.csv')
    resources_fail = resources_fail.split('\n')
    for line in resources_fail:
        resource_id = line.strip()
        if resource_id == '':
            # skip blank lines left by the trailing newline
            continue
        resource = ckaninterface.Resource(resource_id)
        print resource_id + ' ' + resource.ckan_url
def delete_html_pages(self):
    db_logs = DatabasePlainFiles(self.log_folder)
    html_pages = db_logs.loadDbaseRaw('html_pages.txt')
    html_pages = html_pages.split('\n')
    for resource in html_pages:
        if os.path.exists('files/' + resource) and resource != '':
            os.remove('files/' + resource)
    print "clean-up complete!"
def download_all_csv_resources(self):
    """Download all CSV resources.

    If a resource is inaccessible (404 or 503), it is added to the list.
    Post-processing: check the mimetype of each file; if it is not CSV,
    report it (a sketch of this check follows below).
    """
    db = DatabasePlainFiles(self.log_folder)
    download_all_log = "download_all_log.txt"
    ckan = ckaninterface.CKAN_Application()
    csv_resource_list = ckan.get_csv_resource_list()
    # iterate over every resource in the list
    for i in range(len(csv_resource_list)):
        resource = ckaninterface.Resource(csv_resource_list[i])
        db.addDbaseRaw(download_all_log, resource._download())
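# The docstring above lists a mimetype check as a post-processing step,
# but no such check is implemented in this class. A minimal sketch of
# what it could look like, assuming the files/<resource_id> layout used
# by the other methods here; the method name and the csv.Sniffer
# approach are illustrative, not part of the original codebase.
def report_non_csv_files(self, sample_size=4096):
    import csv
    for resource_id in self.get_files():
        path = 'files/' + resource_id
        if not os.path.isfile(path):
            continue
        f = open(path, 'rb')
        sample = f.read(sample_size)
        f.close()
        try:
            # sniff the first few KB; raises csv.Error if no dialect fits
            csv.Sniffer().sniff(sample)
        except csv.Error:
            print resource_id + ' does not look like CSV'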
def download_n_random_csv(self, n):
    db = DatabasePlainFiles(self.log_folder)
    random_csv_filename = "random_csv.txt"
    import random
    ckan = ckaninterface.CKAN_Application()
    csv_resource_list = ckan.get_csv_resource_list()
    csv_resource_list_max = len(csv_resource_list) - 1
    for i in range(n):
        rand = random.randint(0, csv_resource_list_max)
        db.addDbaseRaw(random_csv_filename, str(rand) + "\n")
        resource = ckaninterface.Resource(csv_resource_list[rand])
        try:
            resource._download()
        except Exception:
            # skip resources that fail to download
            pass
def choose_n_random(self, n=10):
    db = DatabasePlainFiles('files/.analyzed/')
    analyzed_ids = db.loadDbaseRaw('100_analyze_ids')
    analyzed_ids = analyzed_ids.split('\n')
    all_ids = self.get_files()
    csv_resource_list_max = len(all_ids) - 1
    output = []
    import random
    for i in range(n):
        rand = random.randint(0, csv_resource_list_max)
        # duplicates and already-analyzed picks are not retried, so the
        # output may contain fewer than n ids
        if all_ids[rand] not in analyzed_ids:
            output.append(all_ids[rand])
    import pprint
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(output)
def check_good_response(self):
    db_logs = DatabasePlainFiles(self.log_folder)
    download_all_log = db_logs.loadDbaseRaw('download_all_log.txt')
    download_all_log = download_all_log.split('\n')
    resources_success = []
    resources_check = []
    resources_fail = []
    for line in download_all_log:
        if re.match("^Could not download", line):
            resources_fail.append(line)
            continue
        if line == '':
            continue
        resource_id = line.split()[1]
        status_code = int(line.split()[3])
        if status_code == 200:
            resources_success.append(resource_id)
        else:
            resources_check.append(resource_id)
    bad_resources = []
    max_size_bytes = 1048576  # 1 MB
    for resource in resources_success:
        statinfo = os.stat('files/' + resource)
        if statinfo.st_size > max_size_bytes:
            print str(resource) + ' larger than 1Mb!'
            continue
        f = open('files/' + resource, 'rb')
        content = f.read()
        f.close()
        # search the whole file for 'html', including across newlines
        if re.search('html', content, flags=re.I):
            print str(resource) + ' html page!'
            bad_resources.append(resource)
        else:
            print str(resource) + ' ok!'
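# check_good_response collects bad_resources but never persists it,
# while delete_html_pages reads an html_pages.txt from the log folder.
# A minimal sketch of the missing hand-off, assuming the saveDbaseRaw
# method used in the commented-out block of get_stats; whether this is
# how html_pages.txt was originally produced is an assumption:
#
#     db_logs.saveDbaseRaw('html_pages.txt', '\n'.join(bad_resources))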
def read_data_folder(self):
    import pickle
    #get data folder list
    data_folder = 'data/'
    file_list = os.listdir(data_folder)
    stats = {
        'maintainer': {},
        'isopen': {},
        'author': {},
        'version': {},
        'license_id': {},
        'type': {},
        'mimetype': {},
        'format': {},
        'resource_type': {},
        'tags': {},
        'groups': {},
        'license': {},
        'license_title': {},
        'geographic_coverage': {},
        'geographical_granularity': {},
        'temporal_coverage-from': {},
        'temporal_coverage-to': {},
        'temporal_granularity': {},
        'national_statistic': {},
        'precision': {},
        'series': {},
        'date_released': {},
        'categories': {}
    }
    import pprint
    printer = pprint.PrettyPrinter(indent=4)
    db = DatabasePlainFiles('stats/')
    # resume from the checkpoint saved after package 14061; the dict
    # literal above documents the expected shape of the stats object
    stats = db.loadDbase('stats14061')
    for num, filename in enumerate(file_list):
        print num
        if num < 14061 or filename == "package_list":
            continue
        f = open(data_folder + filename)
        package = pickle.load(f)
        f.close()
        self.add_to_stats(package['maintainer'], 'maintainer', stats)
        self.add_to_stats(package['isopen'], 'isopen', stats)
        self.add_to_stats(package['author'], 'author', stats)
        self.add_to_stats(package['version'], 'version', stats)
        self.add_to_stats(package['type'], 'type', stats)
        for resource in package['resources']:
            self.add_to_stats(resource['mimetype'], 'mimetype', stats)
            self.add_to_stats(resource['format'], 'format', stats)
            self.add_to_stats(resource['resource_type'], 'resource_type', stats)
        for tag in package['tags']:
            self.add_to_stats(tag, 'tags', stats)
        for group in package['groups']:
            self.add_to_stats(group, 'groups', stats)
        self.add_to_stats(package['license'], 'license', stats)
        self.add_to_stats(package['license_title'], 'license_title', stats)
        # 'extras' keys are optional; skip packages that lack them
        try:
            self.add_to_stats(package['extras']['geographic_coverage'], 'geographic_coverage', stats)
            self.add_to_stats(package['extras']['geographical_granularity'], 'geographical_granularity', stats)
            self.add_to_stats(package['extras']['temporal_coverage-from'], 'temporal_coverage-from', stats)
            self.add_to_stats(package['extras']['temporal_coverage-to'], 'temporal_coverage-to', stats)
            self.add_to_stats(package['extras']['temporal_granularity'], 'temporal_granularity', stats)
            self.add_to_stats(package['extras']['series'], 'series', stats)
            self.add_to_stats(package['extras']['precision'], 'precision', stats)
            self.add_to_stats(package['extras']['national_statistic'], 'national_statistic', stats)
            self.add_to_stats(package['extras']['date_released'], 'date_released', stats)
            self.add_to_stats(package['extras']['categories'], 'categories', stats)
        except BaseException as e:
            pass #print str(e)
        # checkpoint the stats after every package
        db.saveDbase('stats' + str(num), stats)
    #output stats to file
    print 'script executed!'
def convert_all_to_rdf(self, start_from=0):
    conversion_log = DatabasePlainFiles(self.conversion_log_folder)
    process_log = DatabasePlainFiles(self.log_folder)
    process_log_filename = "rdf_conversion.log"
    all_ids = self.get_files()
    overall = len(all_ids)
    for num, resource_id in enumerate(all_ids):
        if num < start_from:
            continue
        print "Converting resource to RDF " + str(num) + " out of " + str(overall)
        print str(resource_id)
        string = "Converting resource to RDF " + str(num) + " out of " + str(overall) + "\n"
        process_log.addDbaseRaw(process_log_filename, string)
        string = str(resource_id) + "\n"
        process_log.addDbaseRaw(process_log_filename, string)
        #Skip folders
        if resource_id in (".analyzed", ".all-resources", ".broken_retrieved", "files.tar.gz"):
            continue
        #Init the resource
        resource = ckaninterface.Resource(resource_id)
        #create wiki-page for resource
        string = "creating wiki page for resource" + "\n"
        process_log.addDbaseRaw(process_log_filename, string)
        wiki_page = resource.generate_default_wiki_page()
        string = str(resource.create_wiki_page(wiki_page))
        process_log.addDbaseRaw(process_log_filename, string)
        #transform resource to RDF
        sparqlify_message, returncode = resource.transform_to_rdf('default-tranformation-configuration')
        conversion_log.addDbaseRaw(resource_id + '.log', sparqlify_message + "\n" + str(returncode))
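# Usage sketch: start_from allows resuming a long conversion run after
# an interruption, e.g. picking up again at resource 2500; 'app' stands
# for an instance of this class (the name is illustrative only):
#
#     app.convert_all_to_rdf(start_from=2500)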