def download_file(url_to_file):
    """Download a Dataverse file; return the temp file path and file extension"""
    file_handle, filepath = tempfile.mkstemp()

    msgt('download file: %s' % url_to_file)
    r = requests.get(url_to_file, stream=True)
    if r.status_code != 200:
        msg('bad status: %s' % r.status_code)
        if isfile(filepath):
            make_sure_file_deleted(filepath)
        return None, None

    # Pull the file extension from the content-disposition header
    file_ext = None
    content_dict = r.headers['content-disposition']
    fname = format_file_name(content_dict)
    if fname:
        file_ext = fname.split('.')[-1].lower()
        print 'file_ext', file_ext

    # Stream the response to the temp file
    with os.fdopen(file_handle, 'wb') as tmp:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                tmp.write(chunk)

    msg('File downloaded: %s' % filepath)
    return filepath, file_ext
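# Hedged usage sketch (not from the source): the access URL and file id below
# are made up for illustration; assumes download_file and msg are defined as
# above and that the module-level imports (requests, tempfile, etc.) are present.
if __name__ == '__main__':
    test_url = 'https://dataverse.harvard.edu/api/access/datafile/12345'
    filepath, file_ext = download_file(test_url)
    if filepath is None:
        msg('download failed')
    else:
        msg('saved to: %s (extension: %s)' % (filepath, file_ext))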
def compare_dicts(self, section, new_dict, old_dict, **kwargs):
    """Compare two dicts, noting whether a key/value was
    Removed, Added, or Modified

    Note: Attempts to preserve key order--dict is usually an OrderedDict
    """
    # optional: get keys to skip
    skip_list = kwargs.get('skip_list', [])

    old_dict_keys = [key for key, val in old_dict.items()]
    new_dict_keys = [key for key, val in new_dict.items()]

    # ------------------------------
    # Added attributes
    # ------------------------------
    added = [key for key in new_dict_keys if key not in old_dict_keys]
    for added_key in added:
        if added_key not in skip_list:
            self.record_diff_desc_added(
                section, added_key, new_dict[added_key])

    # ------------------------------
    # Removed attributes
    # ------------------------------
    removed = [key for key in old_dict_keys if key not in new_dict_keys]
    for removed_key in removed:
        if removed_key not in skip_list:
            self.record_diff_desc_removed(
                section, removed_key, old_dict[removed_key])

    # ------------------------------
    # Modified attributes
    # ------------------------------
    intersect_keys = [key for key in new_dict_keys
                      if key not in removed and key not in added]
    mod_keys = [shared_key for shared_key in intersect_keys
                if old_dict[shared_key] != new_dict[shared_key]]

    for mod_key in mod_keys:
        msg('...> %s %s' % (mod_key, type(old_dict[mod_key])))
        # Is the value another dict?
        #
        if mod_key not in skip_list:
            self.compare_items(
                section, mod_key, new_dict[mod_key], old_dict[mod_key])
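# Hedged standalone sketch (not from the source): illustrates the Added/Removed/
# Modified key split that compare_dicts performs, using made-up metadata values.
from collections import OrderedDict

old_md = OrderedDict([('title', 'Old Title'), ('author', 'Smith')])
new_md = OrderedDict([('title', 'New Title'), ('subject', 'Economics')])

added = [k for k in new_md if k not in old_md]
removed = [k for k in old_md if k not in new_md]
modified = [k for k in new_md
            if k in old_md and old_md[k] != new_md[k]]

print 'added: %s' % added        # ['subject']
print 'removed: %s' % removed    # ['author']
print 'modified: %s' % modified  # ['title']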
def write_files_to_mongo(self, **kwargs):
    """Write the saved dataset files to Mongo"""
    client = MongoClient()
    db = client.dataverse_database
    collection = db.datasets

    # look at kwargs
    #
    dataset_start_id = kwargs.get('dataset_start_id', 0)
    delete_all = kwargs.get('delete_all', False)

    # If appropriate, delete existing records
    #
    if delete_all:
        msgt('Deleting current records')
        result = collection.delete_many({})
        msg('result.deleted_count: %s' % result.deleted_count)
        return

    # Gather the saved dataset JSON files
    fnames = os.listdir(self.output_dir)
    fnames = [x for x in fnames
              if x.endswith('.json') and x.startswith('ds_')]
    fnames.sort()

    start_time = int(time.time())  # epoch seconds
    cnt = 0
    for fname in fnames:
        cnt += 1
        ds_id = int(fname.split('.')[0].split('_')[1])
        msgt('(%d) process dataset %s (%s)' % (cnt, ds_id, fname))
        if ds_id < dataset_start_id:
            msg('skipping it')
            continue

        # Load the JSON, preserving key order, and tag it for Mongo
        content = open(join(self.output_dir, fname), 'r').read()
        content = update_json_text(content)
        content_doc = json.loads(content, object_pairs_hook=OrderedDict)
        content_doc['_id'] = ds_id
        content_doc['dtype'] = 'dataset'

        doc_id = collection.save(content_doc)

        if cnt % 500 == 0:
            self.show_elapsed_time(start_time)

    self.show_elapsed_time(start_time)
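# Hedged check sketch (not from the source): assumes a local MongoDB instance
# and the database/collection names used above; run it after write_files_to_mongo
# to spot-check what was loaded.
from pymongo import MongoClient

client = MongoClient()
collection = client.dataverse_database.datasets
print 'dataset docs loaded: %s' % collection.count()
print 'sample doc: %s' % collection.find_one({'dtype': 'dataset'})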
def run_comparison(self):
    """Compare the two JSON datasets"""
    msgt(self.run_comparison.__doc__)

    # Run a quick check to see if the dicts are the same.
    #
    if cmp(self.old_ds, self.new_ds) == 0:
        msg('No differences!')
        return

    # Compare the file lists separately from the rest of the metadata
    new_files_list = self.new_ds.pop('files', [])
    old_files_list = self.old_ds.pop('files', [])

    self.compare_dicts('', self.new_ds, self.old_ds)

    self.compare_file_lists(new_files_list, old_files_list)
def download_file(url_to_file):
    """Download a Dataverse file; return the path to the downloaded temp file"""
    file_handle, filepath = tempfile.mkstemp()

    msgt('download file: %s' % url_to_file)
    r = requests.get(url_to_file, stream=True)
    if r.status_code != 200:
        msg('bad status: %s' % r.status_code)
        if isfile(filepath):
            make_sure_file_deleted(filepath)
        return None

    # Stream the response to the temp file
    with os.fdopen(file_handle, 'wb') as tmp:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                tmp.write(chunk)

    msg('File downloaded: %s' % filepath)
    return filepath
def show(self, show_section=True):
    """print info"""
    if show_section:
        msgt('%s: [%s] %s' % (self.section, self.attr_name, self.note))
    msg('attribute: %s' % self.attr_name)
    msg('\nnew: %s' % self.new_val)
    msg('\nold: %s' % self.old_val)
    dashes()
def get_count_broken_notifications():
    """Query each object type and make sure notifications aren't broken

    Example map:
        {
         'DvObject': [1],
         'Dataverse': [2],
         'Dataset': [14, 11],
         'DatasetVersion': [13, 12, 7],
         'DataFile': [9]
        }
    """
    broken_cnt = 0
    user_ids = []
    for model_name, type_id_list in get_dv_object_to_object_id_map().items():
        # Get a list of object ids for this model type
        # that were not emailed--e.g. should show up
        # on the notifications pages
        #
        msgt('check: %s %s' % (model_name, type_id_list))
        model_user_id_list = UserNotification.objects.select_related('user'
            ).filter(object_type__in=type_id_list
            ).values_list('objectid', 'user__id')

        model_id_list = [x[0] for x in model_user_id_list]
        user_ids += [x[1] for x in model_user_id_list]
        msg('model_id_list len: %s' % len(model_id_list))
        if len(model_id_list) == 0:
            continue

        # Used for later "bad notice" counts
        notice_counter = Counter(model_id_list)
        msg('notice_counter len: %s' % len(notice_counter))

        unique_id_list = list(set(model_id_list))
        msg('unique_id_list len: %s' % len(unique_id_list))

        # Need to upgrade apps files and not use this method
        model_class = eval(model_name)

        if model_name in ['DvObject', 'DatasetVersion', 'FileMetadata']:
            existing_ids = model_class.objects.filter(id__in=unique_id_list
                ).values_list('id', flat=True
                ).distinct()
        else:
            existing_ids = model_class.objects.select_related('dvobject'
                ).filter(dvobject__id__in=unique_id_list
                ).values_list('dvobject__id', flat=True
                ).distinct()

        msg('existing_ids len: %s' % len(existing_ids))

        if len(unique_id_list) == len(existing_ids):
            # Looks good!
            continue

        # Count the notifications whose target object no longer exists
        missing_ids = list(set(unique_id_list) - set(existing_ids))
        for missing_id in missing_ids:
            broken_cnt += notice_counter.get(missing_id, 0)

    unique_user_ids = len(set(user_ids))

    return (broken_cnt, unique_user_ids)
def make_json_files(self):
    """Serialize the latest version of each dataset to a JSON file in OUTPUT_DIR"""
    # Set publication status
    #
    filters = {}
    if self.published_only:
        filters.update(query_helper.get_is_published_filter_param())

    # Query for dataset ids
    #
    ds_id_query = Dataset.objects.filter(**filters
        ).annotate(ds_id=F('dvobject__id')
        ).values_list('ds_id', flat=True
        ).order_by('ds_id')

    # Iterate through dataset ids
    #
    start_time = int(time.time())  # epoch seconds
    cnt = 0
    no_versions_found_list = [45900]
    for ds_id in ds_id_query:
        cnt += 1
        msgt('(%d) Checking dataset id %s' % (cnt, ds_id))
        if ds_id < self.dataset_start_id:
            msg('skipping...(start at dataset id: %d)' % self.dataset_start_id)
            continue

        # Create file name
        #
        fname = 'ds_%s.json' % (str(ds_id).zfill(8))
        full_fname = join(OUTPUT_DIR, fname)

        # Should we overwrite the existing file?
        #
        if isfile(full_fname) and not self.overwrite_existing_files:
            msg('skipping...file already exists')
            continue

        dataset_version = get_latest_dataset_version(ds_id)
        if dataset_version is None:
            msg("Could not find dataset_version!")
            no_versions_found_list.append(ds_id)
            continue

        dataset_as_json = DatasetSerializer(dataset_version).as_json()

        open(full_fname, 'w').write(json.dumps(dataset_as_json, indent=4))
        msg('File written: %s' % full_fname)

        if cnt % 500 == 0:
            self.show_elapsed_time(start_time)

    self.show_elapsed_time(start_time)
    print 'no_versions_found_list: %s' % no_versions_found_list
def get_basic_stats():
    """Gather notification counts and return them as a dict of NamedStat objects"""
    cnt_read_notifications = UserNotification.objects.filter(
        readnotification=True,
    ).count()

    cnt_unread_notifications = UserNotification.objects.filter(
        readnotification=False,
    ).count()

    cnt_undated_notifications = UserNotification.objects.filter(
        senddate__isnull=True
    ).count()

    day_cnt_1 = 365
    day_cnt_1_date = datetime.now() - timedelta(days=day_cnt_1)

    day_cnt_2 = 180
    day_cnt_2_date = datetime.now() - timedelta(days=day_cnt_2)

    cnt_old_unread_notifications = UserNotification.objects.filter(
        readnotification=False,
        senddate__lt=day_cnt_1_date
    ).count()

    cnt_old_unread_notifications2 = UserNotification.objects.filter(
        readnotification=False,
        senddate__lt=day_cnt_2_date
    ).count()

    broken_cnt, impacted_users = \
        NotificationStats.get_count_broken_notifications()
    msg('broken_cnt: %s' % broken_cnt)
    msg('impacted_users: %s' % impacted_users)

    file_stats = dict(
        cnt_broken_notifications=NamedStat(
            'Broken Notifications / Impacted Users',
            broken_cnt,
            ('The notification refers to an object that'
             ' no longer exists. These notifications should'
             ' be deleted from the database. (May be'
             ' responsible for some users who receive an'
             ' error when clicking on the notifications'
             ' tab.)'),
            'view_broken_notifications',
            **dict(stat2=impacted_users)),

        cnt_read_notifications=NamedStat(
            'Read Notifications',
            cnt_read_notifications,
            'Count of read notifications',
            None),

        cnt_unread_notifications=NamedStat(
            'All Unread Notifications',
            cnt_unread_notifications,
            'Count of unread notifications.',
            None),

        cnt_unread_old_notifications=NamedStat(
            'Unread: Older than %s Days' % day_cnt_1,
            cnt_old_unread_notifications,
            ('Count of notifications <b>older'
             ' than %d days</b>') % day_cnt_1,
            None),

        cnt_old_unread_notifications2=NamedStat(
            'Unread: Older than %s Days' % day_cnt_2,
            cnt_old_unread_notifications2,
            ('Count of notifications <b>older'
             ' than %d days</b>') % day_cnt_2,
            None),

        cnt_undated_notifications=NamedStat(
            'Undated Notifications',
            cnt_undated_notifications,
            'Count of undated notifications',
            None),
    )

    return file_stats
def get_data_rows(self, as_json=False, pretty_print=False):
    """Return the table preview as a dict (or JSON string), e.g.:

    {
        "total_row_count": 117,
        "preview_row_count": 50,
        "column_names": ["Name", "Position", "Office"],
        "rows": [
            ["Tiger Nixon", "System Architect", "Edinburgh"],
            ["Garrett Winters", "Accountant", "Tokyo"]
        ]
    }
    """
    if self.has_error():
        return None

    # Read the table
    try:
        if self.is_excel:
            msgt('Excel!')
            df = pd.read_excel(self.filepath)
        else:
            df = pd.read_table(self.filepath)
    except Exception as ex_obj:
        msg(ex_obj)
        msgt('Failed to open file via pandas!')
        temp_file_helper.make_sure_file_deleted(self.filepath)
        if self.is_excel:
            self.add_error('Failed to open Excel file via pandas. [%s]' % ex_obj)
        else:
            self.add_error(('<b>Probably not a tabular file!</b>'
                            ' Failed to open file via pandas. [%s]') % ex_obj)
        return None

    # Summary statistics, as HTML and as an OrderedDict
    self.describe_as_html = df.describe().to_html()
    json_string = df.describe().to_json()
    self.describe_as_dict = json.loads(json_string,
                                       object_pairs_hook=OrderedDict)

    # Retrieve the columns
    self.column_names = df.columns.tolist()

    # Retrieve the preview rows
    self.data_rows = df[:self.num_preview_rows].values.tolist()

    # Format the response
    info_dict = OrderedDict()
    info_dict['total_row_count'] = len(df.index)
    info_dict['preview_row_count'] = len(self.data_rows)
    info_dict['column_names'] = self.column_names
    info_dict['rows'] = self.data_rows
    info_dict['describe_as_html'] = self.describe_as_html
    info_dict['describe_as_dict'] = self.describe_as_dict

    if as_json:
        if pretty_print:
            return json.dumps(info_dict, indent=4)
        return json.dumps(info_dict)

    return info_dict
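# Hedged standalone sketch (not from the source): mirrors the preview that
# get_data_rows builds, outside the class, for a quick local check. The file
# name and preview size are made up for illustration.
import json
from collections import OrderedDict

import pandas as pd

df = pd.read_table('example.tab')  # hypothetical tab-delimited file
preview = OrderedDict()
preview['total_row_count'] = len(df.index)
preview['preview_row_count'] = min(50, len(df.index))
preview['column_names'] = df.columns.tolist()
preview['rows'] = df[:50].values.tolist()
print json.dumps(preview, indent=4)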