def _organize_matrix(self):
    """Gather the per-tax-ID confusion counts and pad the missing entries.

    The four count mappings are obtained from the ``_get_*`` helpers and are
    indexed as ``mapping[tax_id][matrix_name]``.  For every tax ID found in
    any of the four mappings and every name from ``get_matrix_names()``:

    * if the name is absent from all four mappings for that tax ID, the
      true-negative entry is set to ``Juice.matrix_sum()``;
    * any absent true-positive, false-negative or false-positive entry is
      set to 0.

    A tax ID that is missing altogether from one of the mappings gets an
    empty row created first, so the padding cannot fail with ``KeyError``.
    NOTE(review): this assumes the helpers return plain dict-like mappings
    of dict-like rows -- confirm against their implementations.

    Returns:
        The tuple ``(TP, FN, FP, TN, p, r)``; ``p`` and ``r`` are passed
        through unchanged from ``_get_name_and_rank()``.
    """
    Juice = cm.Confusion(os.path.join(self.input_path, self.cm_truth), "")
    tp, fn, fp, tn, p, r = self._get_name_and_rank()
    names = self.get_matrix_names()

    TP = self._get_true_positives(tp)
    FN = self._get_false_negatives(fn)
    FP = self._get_false_positives(fp)
    TN = self._get_true_negatives(tn)

    all_tax_ids = set(TP.keys()) | set(FN.keys()) | set(FP.keys()) | set(
        TN.keys())
    matrix_sum = Juice.matrix_sum()

    for tax_id in all_tax_ids:
        # The union above may contain IDs that only some mappings know
        # about; give the others an empty row instead of raising KeyError.
        for counts in (TP, FN, FP, TN):
            if tax_id not in counts:
                counts[tax_id] = {}
        tp_row, fn_row = TP[tax_id], FN[tax_id]
        fp_row, tn_row = FP[tax_id], TN[tax_id]

        for name in names:
            missing_everywhere = (name not in tp_row and name not in fn_row
                                  and name not in fp_row
                                  and name not in tn_row)
            if missing_everywhere:
                tn_row[name] = matrix_sum
            if name not in tp_row:
                tp_row[name] = 0
            if name not in fn_row:
                fn_row[name] = 0
            if name not in fp_row:
                fp_row[name] = 0

    return TP, FN, FP, TN, p, r
def save_matrices_as_csv(self, file_path):
    """Write one CSV table per matrix whose ``self.saved`` flag equals False.

    For each such matrix the table is (re)built through ``create_table`` and
    handed to ``Confusion.save_matrix_table`` together with a target path of
    the form ``<file_path>/<cm_truth> <matrix name>``.

    Args:
        file_path: Directory part of the path the tables are saved under.
    """
    truth_path = os.path.join(self.input_path, self.cm_truth)
    writer = cm.Confusion(truth_path, "")

    for matrix_name in self.matrix_dict:
        # Equality (not truthiness) is kept on purpose: the value type of
        # self.saved is not visible here.
        if not (self.saved[matrix_name] == False):
            continue

        writer.set_file_name(os.path.join(self.input_path, matrix_name))
        self.create_table(matrix_name)

        target = os.path.join(file_path, self.cm_truth + " " + matrix_name)
        table = self.matrix_tables[matrix_name]
        writer.save_matrix_table(table, target)
def create_table(self, name=""):
    """Build the matrix table(s) and store them in ``self.matrix_tables``.

    Args:
        name: Key in ``self.matrix_dict``.  With the default ``""`` a table
            is built for every matrix that has no entry in
            ``self.matrix_tables`` yet.  With a known name that one table is
            built (replacing any existing entry).  With an unknown name a
            message is printed and nothing is built.
    """
    builder = cm.Confusion(os.path.join(self.input_path, self.cm_truth), "")

    def build(matrix_name):
        # Point the helper at the matrix file, then run the pipeline
        # add_other_info -> reformat_matrix -> create_matrix_table.
        builder.set_file_name(os.path.join(self.input_path, matrix_name))
        enriched = builder.add_other_info(self.matrix_dict[matrix_name])
        reshaped = builder.reformat_matrix(enriched)
        self.matrix_tables[matrix_name] = builder.create_matrix_table(
            reshaped)

    if name == "":
        for matrix_name in self.matrix_dict:
            if matrix_name not in self.matrix_tables:
                build(matrix_name)
        return

    if name in self.matrix_dict:
        build(name)
        return

    print("There is no matrix by the name \'{}\'".format(name))
def main(self,
         gnd_truth,
         excel_name="TaxaPerformanceMetrics_byTool",
         gen_dir="",
         file_path="",
         csv=0,
         dendros=0):
    """Run the whole pipeline: load profiles, export Excel, optional extras.

    Every ``*.profile`` file in ``gen_dir`` that is not already a key of
    ``self.matrix_dict`` is run through ``Confusion.main("no")`` and
    registered with ``add_matrix`` under its base file name.  The results
    are then written with ``save_as_excel``.

    Args:
        gnd_truth: Ground-truth file name, joined onto ``gen_dir``.
        excel_name: Workbook name without the ``.xlsx`` extension.
        gen_dir: Directory searched for ``*.profile`` files; stored as
            ``self.input_path``.
        file_path: Output directory; stored as ``self.output_path``.
        csv: When equal to 1, the matrices are also saved as CSV.
        dendros: When equal to 1, dendrograms are created for each of the
            listed sheets and each rank returned by ``read_excel`` plus the
            empty rank ``""``.
    """
    profile_paths = glob(os.path.join(gen_dir, "*.profile"))

    self.input_path = gen_dir
    self.output_path = file_path
    self.output_name = excel_name

    confusion = cm.Confusion(os.path.join(self.input_path, gnd_truth), "")
    self.set_truth(gnd_truth)

    for profile in profile_paths:
        label = os.path.basename(profile)
        if label in self.matrix_dict:
            continue
        confusion.set_file_name(profile)
        self.add_matrix(label, confusion.main("no"))

    if csv == 1:
        self.save_matrices_as_csv(self.output_path)
    self.save_as_excel(self.output_path, excel_name)

    if dendros != 1:
        return

    def workbook():
        # Re-evaluated on every use, exactly as the path expression was.
        return os.path.join(self.output_path, excel_name + ".xlsx")

    # Dendrograms
    sheet_names = [
        "True Positives", "False Negatives", "False Positives",
        "True Negatives", "Precall"
    ]
    for sheet in sheet_names:
        ranks = self.read_excel(sheet, workbook())
        ranks.append("")
        for rank in ranks:
            self.create_dendrogram(sheet, rank, workbook())

    print("\nThe Dendrograms have been saved in {}.".format(
        self.output_path))
def get_top_taxid(self, x, metric='tp', difficulty='least'):
    """Save the ``x`` top-ranked ground-truth tax IDs for one metric.

    Reads the workbook ``<output_path>/<output_name>.xlsx``, keeps only the
    rows whose ``Tax ID`` occurs in the ground-truth profile, sorts them by
    the chosen aggregate column and writes the first ``x`` rows to
    ``Top_<Difficulty>-<METRIC>_taxid.xlsx`` inside ``self.output_path``.

    Args:
        x: Number of rows to keep after sorting.
        metric: Case-insensitive; one of ``'tp'``, ``'fn'``, ``'fp'``,
            ``'tn'`` (the ``Aggregate`` column of the matching sheet) or
            ``'precall'`` (the mean of the Precision and Recall
            ``Aggregate`` columns).
        difficulty: Case-insensitive; ``'least'`` sorts ascending,
            ``'most'`` sorts descending.  Missing values sort last.

    Returns:
        The path of the workbook that was written.

    Raises:
        ValueError: If ``metric`` or ``difficulty`` is not one of the
            supported values.  This is checked before any file is read.
    """
    aggregate_sheets = {
        'tp': ('TP-Agg', 'True Positives'),
        'fn': ('FN-Agg', 'False Negatives'),
        'fp': ('FP-Agg', 'False Positives'),
        'tn': ('TN-Agg', 'True Negatives'),
    }
    ascending_for = {'most': False, 'least': True}

    # Validate first: previously a bad value surfaced only later as an
    # UnboundLocalError on `base` / `order`.
    metric_key = metric.lower()
    if metric_key != 'precall' and metric_key not in aggregate_sheets:
        raise ValueError(
            "metric must be one of 'tp', 'fn', 'fp', 'tn' or 'precall', "
            "got {!r}".format(metric))
    difficulty_key = difficulty.lower()
    if difficulty_key not in ascending_for:
        raise ValueError(
            "difficulty must be 'most' or 'least', got {!r}".format(
                difficulty))
    order = ascending_for[difficulty_key]

    excel_name = os.path.join(self.output_path, self.output_name) + '.xlsx'
    loaded_sheets = {}

    def column(sheet, label):
        # Each sheet is parsed at most once per call.
        if sheet not in loaded_sheets:
            loaded_sheets[sheet] = pd.read_excel(excel_name,
                                                 sheet_name=sheet,
                                                 engine='openpyxl')
        return loaded_sheets[sheet][label]

    metric_df = pd.DataFrame()
    metric_df['Tax ID'] = column('Precision', 'Tax ID')
    # Keep only the last '|'-separated component of each name.
    metric_df['Names'] = [
        re.split(r'\|', name)[-1]
        for name in column('True Positives', 'name')
    ]

    if metric_key == 'precall':
        metric_df['Pre-Agg'] = column('Precision', 'Aggregate')
        metric_df['Re-Agg'] = column('Recall', 'Aggregate')
        metric_df['Average'] = (metric_df['Pre-Agg'] +
                                metric_df['Re-Agg']) / 2
        base = 'Average'
    else:
        base, sheet = aggregate_sheets[metric_key]
        metric_df[base] = column(sheet, 'Aggregate')

    # Filtering out taxids not in ground truth
    Juice = cm.Confusion(self.cm_truth, '')
    Tea = cm.comp.Comparator()
    Chai = cm.comp.pp.Parser()
    true_taxids = set(
        Juice.dictionary_to_set(
            Tea.save_tax_ID(
                Chai.main(os.path.join(self.input_path, self.cm_truth)))))
    # A membership mask replaces the former symmetric difference, which
    # also contained truth IDs absent from the sheet and then failed with
    # IndexError when their (non-existent) row was looked up.
    metric_df = metric_df[metric_df['Tax ID'].isin(true_taxids)]

    needed_df = metric_df.sort_values(by=base,
                                      ascending=order,
                                      na_position='last').iloc[0:x, :]

    fn = 'Top_' + difficulty.capitalize() + '-' + metric.upper(
    ) + '_taxid.xlsx'
    out_path = os.path.join(self.output_path, fn)
    needed_df.to_excel(out_path, index=False)
    print('\nSaved as {}'.format(out_path))
    return out_path
def trace_back(self, metric):
    """Recompute confusion data from the parsed profiles for one metric.

    The ground-truth profile and every profile named by
    ``get_matrix_names()`` are parsed with ``Parser.main(path, 1)``.  For
    each column of the parsed data, the last element of every iterable cell
    is split on ``'|'`` and the non-empty pieces are collected into a set.
    Each prediction is then compared with the truth through
    ``Comparator.combine_tax_ID`` and ``Confusion.confusion_matrix``.

    Args:
        metric: One of ``'True Positives'``, ``'False Negatives'``,
            ``'False Positives'``, ``'True Negatives'`` (positions 0-3 of
            each confusion-matrix cell), ``'Precision'`` (TP / (TP + FP))
            or ``'Recall'`` (TP / (TP + FN)).

    Returns:
        A DataFrame with one column per matrix name and one row per key of
        the confusion matrices.  Count metrics have missing values filled
        with 0; Precision and Recall are NaN where both operands are 0 and
        are left unfilled elsewhere.  Any other ``metric`` yields an empty
        DataFrame.
    """

    def collect_taxids(frame):
        # Map each column label to the set of tax IDs found in its cells.
        collected = {}
        for col in frame.columns:
            for ind in frame.index:
                if col not in collected:
                    collected[col] = set()
                cell = frame.loc[ind, col]
                # Non-iterable cells (e.g. NaN padding) carry no tax IDs.
                if isinstance(cell, Iterable):
                    collected[col].update(
                        tax for tax in re.split(r'\|', cell[-1])
                        if tax != '')
        return collected

    def tabulate(matrix_df, cell_value):
        # Build {matrix name: {key: cell_value(cell)}} from iterable cells.
        table = {}
        for key in matrix_df.columns:
            for matrix_name in matrix_df.index:
                if matrix_name not in table:
                    table[matrix_name] = {}
                if key not in table[matrix_name]:
                    cell = matrix_df.loc[matrix_name, key]
                    if isinstance(cell, Iterable):
                        table[matrix_name][key] = cell_value(cell)
        return table

    def ratio_of(numerator_slot, other_slot):
        # Return a cell function computing a / (a + b), NaN when both are 0.
        def compute(cell):
            hit = cell[numerator_slot]
            other = cell[other_slot]
            if (hit == 0) and (other == 0):
                return np.nan
            return (hit) / (hit + other)

        return compute

    Chai = cm.comp.pp.Parser()
    true_samples = pd.DataFrame.from_dict(
        Chai.main(os.path.join(self.input_path, self.cm_truth), 1))
    preds = self.get_matrix_names()

    # to get true data
    true_data = collect_taxids(true_samples)

    # to get predicted data
    data = {}
    for name in preds:
        matrix = pd.DataFrame.from_dict(
            Chai.main(os.path.join(self.input_path, name), 1))
        data[name] = collect_taxids(matrix)

    # turn predicted data into confusion matrices
    Tea = cm.comp.Comparator()
    Juice = cm.Confusion('', '')
    new_matrix = {}
    for name in data:
        combined_taxid = Tea.combine_tax_ID(true_data, data[name])
        new_matrix[name] = Juice.confusion_matrix(true_data, data[name],
                                                  combined_taxid)
    matrix_df = pd.DataFrame.from_dict(new_matrix, orient='index')

    # make a data frame for the correct metric
    count_slots = {
        'True Positives': 0,
        'False Negatives': 1,
        'False Positives': 2,
        'True Negatives': 3,
    }
    ratio_slots = {
        'Precision': (0, 2),  # TP / (TP + FP)
        'Recall': (0, 1),  # TP / (TP + FN)
    }

    if metric in count_slots:
        slot = count_slots[metric]
        counts = tabulate(matrix_df, lambda cell: cell[slot])
        return pd.DataFrame.from_dict(counts).fillna(0)

    if metric in ratio_slots:
        ratios = tabulate(matrix_df, ratio_of(*ratio_slots[metric]))
        return pd.DataFrame.from_dict(ratios)

    return pd.DataFrame()