column_number = 0
data_values_tuple_list = []

for country, data_value in data_values_dict.items():
    for year, value in data_value.items():
        data_values_tuple_list.append((value, year, country, newvariable.pk))

with connection.cursor() as c:
    c.executemany(insert_string, data_values_tuple_list)

newimport = ImportHistory(
    import_type='clioinfra',
    import_time=timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
    import_notes='Importing file %s' % one_file,
    import_state=json.dumps({
        'file_hash': file_checksum(file),
        'file_name': one_file
    }))
newimport.save()

for eachdataset in new_datasets_list:
    write_dataset_csv(eachdataset.pk, eachdataset.name, None, 'clioinfra_fetcher', '')
for eachdataset in old_datasets_list:
    write_dataset_csv(eachdataset.pk, eachdataset.name, eachdataset.name, 'clioinfra_fetcher', '')
                c.executemany(insert_string, data_values_tuple_list)
                data_values_tuple_list = []

    if len(data_values_tuple_list):  # insert any leftover data_values
        with connection.cursor() as c:
            c.executemany(insert_string, data_values_tuple_list)

    newimport = ImportHistory(
        import_type='unwpp',
        import_time=timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
        import_notes='Importing file %s' % file_to_parse,
        import_state=json.dumps({
            'file_hash': file_checksum(os.path.join(wpp_downloads_save_location, file_to_parse)),
            'file_name': file_to_parse
        }))
    newimport.save()

    write_dataset_csv(newdataset.pk, newdataset.name, None, 'unwpp_fetcher', '')
else:
    if imported_before_hash == file_checksum(os.path.join(wpp_downloads_save_location, file_to_parse)):
        sys.exit('No updates available.')

    country_name_entity_ref = process_entities(country_names_dict)

    existing_categories = DatasetCategory.objects.values('name')
    existing_categories_list = {item['name'] for item in existing_categories}
    if un_wpp_category_name_in_db not in existing_categories_list:
        the_category = DatasetCategory(name=un_wpp_category_name_in_db, fetcher_autocreated=True)
        the_category.save()
    else:
import os
import sys
sys.path.insert(1, os.path.join(sys.path[0], '../..'))
import grapher_admin.wsgi
from grapher_admin.views import write_dataset_csv
from grapher_admin.models import Dataset, Variable
from django.conf import settings

# Use this script to make the initial csv and metadata export of all datasets to the repo

all_datasets = Dataset.objects.all()

for each in all_datasets:
    last_updated_by = Variable.objects.filter(datasetId=each).order_by('-updated_at')
    if last_updated_by:
        committer = last_updated_by.first()
        if not committer.uploaded_by:
            committer_name = settings.DATASETS_REPO_USERNAME
            committer_email = settings.DATASETS_REPO_EMAIL
        else:
            committer_name = committer.uploaded_by.get_full_name()
            committer_email = committer.uploaded_by.email
        write_dataset_csv(each.pk, each.name, None, committer_name, committer_email)
            column_number = 0

            if row_number % 10 == 0:
                time.sleep(0.001)  # sleep 1 ms after every 10th row so the import does not keep the CPU busy the whole time

    if len(data_values_tuple_list):  # insert any leftover data_values
        with connection.cursor() as c:
            c.executemany(insert_string, data_values_tuple_list)

    logger.info("Dumping data values...")

    newimport = ImportHistory(
        import_type='povstats',
        import_time=timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
        import_notes='Initial import of POVSTATS datasets',
        import_state=json.dumps({'file_hash': file_checksum(povstats_downloads_save_location + 'povstats.zip')}))
    newimport.save()

    for dataset in datasets_list:
        write_dataset_csv(dataset.pk, dataset.name, None, 'povstats_fetcher', '')

    logger.info("Import complete.")
else:
    last_import = import_history.last()
    deleted_indicators = {}  # keeps track of which variables' data values were already deleted before writing new values

    if json.loads(last_import.import_state)['file_hash'] == file_checksum(povstats_downloads_save_location + 'povstats.zip'):
        logger.info('No updates available.')
        sys.exit('No updates available.')

    logger.info('New data is available.')

    available_variables = Variable.objects.filter(datasetId__in=Dataset.objects.filter(namespace='povstats'))
    available_variables_list = []

    for each in available_variables.values('code'):
                data_values_tuple_list.append((str(float(row[str(i)])), i,
                                               c_name_entity_ref[row['Country or Area Name']].pk,
                                               variable_name_to_object[variable_name].pk))
            except:
                pass  # skip cells whose value is missing or not numeric

        if len(data_values_tuple_list) > 3000:  # insert once the list grows beyond 3000 values
            with connection.cursor() as c:
                c.executemany(insert_string, data_values_tuple_list)
            data_values_tuple_list = []

        if row_number % 100 == 0:
            time.sleep(0.001)  # sleep 1 ms after every 100th row so the import does not keep the CPU busy the whole time

if len(data_values_tuple_list):  # insert any leftover data_values
    with connection.cursor() as c:
        c.executemany(insert_string, data_values_tuple_list)
    data_values_tuple_list = []

for dataset in existing_datasets_list:
    write_dataset_csv(dataset.pk, dataset.name, dataset.name, 'un_sdg_fetcher', '')
for dataset in new_datasets_list:
    write_dataset_csv(dataset.pk, dataset.name, None, 'un_sdg_fetcher', '')

newimport = ImportHistory(
    import_type='un_sdg',
    import_time=timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
    import_notes='A un_sdg import was performed',
    import_state='There are a total of %s un_sdg variables after the import' % Variable.objects.filter(fk_dst_id__namespace='un_sdg').count())
newimport.save()

print("--- %s seconds ---" % (time.time() - start_time))
                )  # sleep 1 ms after every 10th row so the import does not keep the CPU busy the whole time

    if len(data_values_tuple_list):  # insert any leftover data_values
        with connection.cursor() as c:
            c.executemany(insert_string, data_values_tuple_list)

    logger.info("Dumping data values...")

    newimport = ImportHistory(
        import_type='climatech',
        import_time=timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
        import_notes='Initial import of climatech datasets',
        import_state=json.dumps({'file_hash': file_checksum(excel_filename)}))
    newimport.save()

    for dataset in datasets_list:
        write_dataset_csv(dataset.pk, dataset.name, None, 'climatech_fetcher', '')

    logger.info("Import complete.")
else:
    last_import = import_history.last()
    deleted_indicators = {}  # keeps track of which variables' data values were already deleted before writing new values

    if json.loads(last_import.import_state)['file_hash'] == file_checksum(excel_filename):
        logger.info('No updates available.')
        sys.exit('No updates available.')

    logger.info('New data is available.')

    available_variables = Variable.objects.filter(fk_dst_id__in=Dataset.objects.filter(namespace='climatech'))
    for oneimport in import_history:
        if json.loads(oneimport.import_state)['file_name'] == os.path.basename(eachfile):
            file_imported_before = True
            imported_before_hash = json.loads(oneimport.import_state)['file_hash']

    if not file_imported_before:
        process_csv_file_insert(eachfile, os.path.basename(eachfile))
        newimport = ImportHistory(
            import_type='faostat',
            import_time=timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
            import_notes='Importing file %s' % os.path.basename(eachfile),
            import_state=json.dumps({
                'file_hash': file_checksum(eachfile),
                'file_name': os.path.basename(eachfile)
            }))
        newimport.save()
    else:
        if imported_before_hash == file_checksum(eachfile):
            print('No updates available for file %s.' % os.path.basename(eachfile))

for eachdataset in datasets_list:
    write_dataset_csv(eachdataset.pk, eachdataset.name, None, 'faostat_fetcher', '')

print("Script execution time: %s" % (datetime.now() - start_time))
            with connection.cursor() as c:
                c.executemany(insert_string, data_values_tuple_list)
            data_values_tuple_list = []

        if row_number % 100 == 0:
            time.sleep(0.001)  # sleep 1 ms after every 100th row so the import does not keep the CPU busy the whole time

    if len(data_values_tuple_list):  # insert any leftover data_values
        with connection.cursor() as c:
            c.executemany(insert_string, data_values_tuple_list)
        data_values_tuple_list = []

    print('################################################################################################')

    newimport = ImportHistory(
        import_type='ilostat',
        import_time=timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
        import_notes='Importing file %s' % one_file,
        import_state=json.dumps({
            'file_hash': file_checksum(file),
            'file_name': one_file
        }))
    newimport.save()

    os.remove(file.replace('.gz', ''))

    for onedataset in new_datasets_list:
        write_dataset_csv(onedataset.pk, onedataset.name, None, 'ilostat_fetcher', '')
    for onedataset in old_datasets_list:
        write_dataset_csv(onedataset.pk, onedataset.name, onedataset.name, 'ilostat_fetcher', '')
            column_number = 0

            if row_number % 10 == 0:
                time.sleep(0.001)  # sleep 1 ms after every 10th row so the import does not keep the CPU busy the whole time

    if len(data_values_tuple_list):  # insert any leftover data_values
        with connection.cursor() as c:
            c.executemany(insert_string, data_values_tuple_list)

    logger.info("Dumping data values...")

    newimport = ImportHistory(
        import_type='findex',
        import_time=timezone.now().strftime('%Y-%m-%d %H:%M:%S'),
        import_notes='Initial import of Findex datasets',
        import_state=json.dumps({'file_hash': file_checksum(findex_downloads_save_location + 'findex.zip')}))
    newimport.save()

    for dataset in datasets_list:
        write_dataset_csv(dataset.pk, dataset.name, None, 'findex_fetcher', '')

    logger.info("Import complete.")
else:
    last_import = import_history.last()
    deleted_indicators = {}  # keeps track of which variables' data values were already deleted before writing new values

    if json.loads(last_import.import_state)['file_hash'] == file_checksum(findex_downloads_save_location + 'findex.zip'):
        logger.info('No updates available.')
        sys.exit('No updates available.')

    logger.info('New data is available.')

    available_variables = Variable.objects.filter(fk_dst_id__in=Dataset.objects.filter(namespace='findex'))
    available_variables_list = []

    for each in available_variables.values('code'):