def notify_editor_inactivity():
    """Notify editors who made fewer than 3 contributions in the last week.

    For every editor, counts paper submissions (SUBMITTER contributions)
    and comments (COMMENTER contributions) created in the past 7 days.
    Editors whose combined total is below 3 get
    ``notify_inactivity(paper_count, comment_count)`` called on them, and
    a summary tuple per editor is passed to ``log_info``.
    """
    User = apps.get_model('user.User')
    last_week = timezone.now() - timedelta(days=7)
    editors = User.objects.editors()
    inactive_contributors = editors.annotate(
        paper_count=Count(
            'id',
            filter=Q(
                contributions__contribution_type=Contribution.SUBMITTER,
                contributions__created_date__gte=last_week
            )
        ),
        comment_count=Count(
            'id',
            filter=Q(
                contributions__contribution_type=Contribution.COMMENTER,
                contributions__created_date__gte=last_week
            )
        ),
        total_contributions=F('paper_count') + F('comment_count')
    ).filter(
        total_contributions__lt=3
    )
    # Named `log_entries` rather than `logging` so the stdlib logging
    # module is not shadowed inside this function.
    log_entries = []
    # .iterator() streams rows instead of caching the whole queryset.
    for contributor in inactive_contributors.iterator():
        paper_count = contributor.paper_count
        comment_count = contributor.comment_count
        log_entries.append(
            (
                contributor.email,
                f'Paper count: {paper_count}',
                f'Comment count: {comment_count}'
            )
        )
        contributor.notify_inactivity(
            paper_count,
            comment_count
        )
    log_info(log_entries)
def radiologist_labels_cleaning(radiologist_labels_df):
    """Distill radiologist labels to a binary Normal/Abnormal flag and
    drop ambiguous duplicates.

    Some rows have duplicates even once distilled to Normal vs. Abnormal;
    in the absence of any distinguishing features (such as an obvious
    indication that one is a correction) any rows with these accession
    numbers must be removed.

    Parameters
    ----------
    radiologist_labels_df : pandas.DataFrame
        Must contain 'classes' (indexable; first element compared to
        'Normal'), 'dicom_elements.name' and 'dicom_elements.value'.
        Mutated in place.

    Returns
    -------
    (pandas.DataFrame, list of int)
        Frame indexed by 'dicom_elements.value' with an added
        'is_normal_radiologist' column, plus row counts recorded
        before cleaning and after each deduplication step.
    """
    # Named `row_counts` rather than `logging` so the stdlib logging
    # module is not shadowed inside this function.
    row_counts = []
    row_counts.append(len(radiologist_labels_df))
    # First element of 'classes' carries the label: 1 = Normal, 0 = Abnormal.
    radiologist_labels_df['is_normal_radiologist'] = radiologist_labels_df['classes'].apply(
        lambda x: 1 if x[0] == 'Normal' else 0
    )
    radiologist_labels_df.drop(columns=['classes', 'dicom_elements.name'], inplace=True)
    # gets distinct rows in cases where the row is perfectly duplicated
    radiologist_labels_df.drop_duplicates(keep='first', inplace=True)
    row_counts.append(len(radiologist_labels_df))
    # two of the same sample with different labels -> impossible to know which is the 'correct' label.
    radiologist_labels_df.drop_duplicates(subset=['dicom_elements.value'], keep=False, inplace=True)
    row_counts.append(len(radiologist_labels_df))
    radiologist_labels_df.set_index('dicom_elements.value', inplace=True)
    return radiologist_labels_df, row_counts
def hospital_records_cleaning(hospital_records_df):
    """Remove duplicates from the hospital records and standardize the
    hard (boolean) classification column name.

    Parameters
    ----------
    hospital_records_df : pandas.DataFrame
        Must contain 'Accession Number' and 'Normal' columns.
        Mutated in place.

    Returns
    -------
    (pandas.DataFrame, list of int)
        Frame indexed by 'Accession Number' with 'Normal' renamed to
        'Normal_hard_class', plus row counts recorded before cleaning
        and after each deduplication step.
    """
    # Named `row_counts` rather than `logging` so the stdlib logging
    # module is not shadowed inside this function.
    row_counts = []
    row_counts.append(len(hospital_records_df))
    # select single row for 'true duplicates' (perfectly identical rows)
    hospital_records_df.drop_duplicates(keep='first', inplace=True)
    row_counts.append(len(hospital_records_df))
    # same accession number with conflicting labels -> impossible to know
    # which is 'correct', so drop every such row. (Historically this had
    # no effect on the production data.)
    hospital_records_df.drop_duplicates(subset=['Accession Number'], keep=False, inplace=True)
    row_counts.append(len(hospital_records_df))
    hospital_records_df.rename(columns={'Normal': 'Normal_hard_class'}, inplace=True)
    hospital_records_df.set_index('Accession Number', inplace=True)
    return hospital_records_df, row_counts
def model_outputs_cleaning(model_outputs_df):
    """Standardize model outputs to a probabilistic 'Normal' score and
    drop ambiguous duplicates.

    Parameters
    ----------
    model_outputs_df : pandas.DataFrame
        Must contain 'Abnormal', 'Normal' and 'accession_number' columns.

    Returns
    -------
    (pandas.DataFrame, list of int, pandas.DataFrame)
        Cleaned frame indexed by 'accession_number' with a single
        'Normal' column, row counts recorded after each cleaning step,
        and the pre-deduplication frame sorted by accession number
        (kept so the rows dropped as conflicts can be inspected).
    """
    # Named `row_counts` rather than `logging` so the stdlib logging
    # module is not shadowed inside this function.
    row_counts = []
    # This step is needed because hospital_records_df has boolean output,
    # not probabilities: keep the model's 'Normal' score when present,
    # otherwise derive it as 1 - 'Abnormal'. The vectorized fillna
    # replaces a row-wise apply that used deprecated positional Series
    # indexing (x[1]) and was O(n) Python-level calls.
    model_outputs_df['Normal'] = model_outputs_df['Normal'].fillna(
        1 - model_outputs_df['Abnormal']
    )
    # isolate useful columns; reassignment (not inplace mutation of the
    # slice) avoids SettingWithCopyWarning.
    model_outputs_df = model_outputs_df[['Normal', 'accession_number']]
    row_counts.append(len(model_outputs_df))
    # salvage single row for 'true duplicates' (perfectly identical rows)
    model_outputs_df = model_outputs_df.drop_duplicates(keep='first')
    row_counts.append(len(model_outputs_df))
    model_outputs_dirty = model_outputs_df.sort_values('accession_number')
    # two of the same sample with different labels -> impossible to know which is the 'correct' label.
    model_outputs_df = model_outputs_df.drop_duplicates(subset=['accession_number'], keep=False)
    row_counts.append(len(model_outputs_df))
    model_outputs_df = model_outputs_df.set_index('accession_number')
    return model_outputs_df, row_counts, model_outputs_dirty
def _logger(line):
    """Callback function to log embeddedqemu output.

    Appends ``line`` to the module-level ``log`` list, which is defined
    elsewhere in this file (not visible in this chunk).
    NOTE(review): presumably one line of process output per call —
    confirm against the code that registers this callback.
    """
    log.append(line)
def f(v):
    """Append ``v`` to the module-level list ``l`` (defined elsewhere
    in this file, not visible in this chunk).

    NOTE(review): `f` and `l` are uninformative names that also risk
    shadowing/confusion — consider renaming here and at the call sites.
    """
    l.append(v)
def _logger(line):
    """Callback function to log libvirtd output.

    Appends ``line`` to the module-level ``log`` list, which is defined
    elsewhere in this file (not visible in this chunk).
    NOTE(review): presumably one line of process output per call —
    confirm against the code that registers this callback.
    """
    log.append(line)