def get_bin_nrd(filestats: str) -> pd.DataFrame:
    """
    Compute NRD and per-genotype discordance for every row of the "GCsAF" table.

    :param filestats: path to file, handed to bcfstats.get_table_dataframe
    :return: the "GCsAF" table cast to float, extended with the columns
        'tot matches', 'tot mismatches', 'NRD' and one '<genotype> discordance'
        column for each of 'RR Hom', 'RA Het' and 'AA Hom'

    # With m .. number of matches and x .. number of mismatches:
    # NRD = (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)
    # RR discordance = xRR / (xRR + mRR)
    # RA discordance = xRA / (xRA + mRA)
    # AA discordance = xAA / (xAA + mAA)
    """
    classes = ('RR Hom', 'RA Het', 'AA Hom')
    table = bcfstats.get_table_dataframe(filestats, "GCsAF").astype(float)

    rr_hit, ra_hit, aa_hit = [table[f'{label} matches'] for label in classes]
    rr_miss, ra_miss, aa_miss = [
        table[f'{label} mismatches'] for label in classes
    ]

    all_miss = rr_miss + ra_miss + aa_miss
    table['tot matches'] = rr_hit + ra_hit + aa_hit
    table['tot mismatches'] = all_miss
    # RR matches are deliberately absent from the NRD denominator.
    table['NRD'] = all_miss / (all_miss + ra_hit + aa_hit)

    per_class = zip(classes,
                    (rr_miss, ra_miss, aa_miss),
                    (rr_hit, ra_hit, aa_hit))
    for label, miss, hit in per_class:
        table[f'{label} discordance'] = miss / (miss + hit)

    return table
def get_overall_nrd(filestats: str) -> pd.DataFrame:
    """
    Fetch the "NRDs" table from a stats file.

    The table is returned exactly as bcfstats.get_table_dataframe delivers it;
    nothing is cast or recomputed here.

    :param filestats: path to file
    :return: whatever bcfstats.get_table_dataframe yields for "NRDs"

    # NOTE(review): the definitions below were attached to this function
    # originally; presumably they describe how the table's values are
    # defined upstream -- confirm against the producer of the file.
    # m .. number of matches
    # x .. number of mismatches
    # NRD = (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)
    # RR discordance = xRR / (xRR + mRR)
    # RA discordance = xRA / (xRA + mRA)
    # AA discordance = xAA / (xAA + mAA)
    """
    return bcfstats.get_table_dataframe(filestats, "NRDs")
def get_bin_accuracy(filestats: str) -> pd.DataFrame:
    """
    Add a per-genotype accuracy column to every row of the "GCsAF" table.

    For each of 'RR Hom', 'RA Het' and 'AA Hom' the accuracy is computed as
    '<genotype> matches' / 'number of genotypes'.

    :param filestats: path to file
    :return: the "GCsAF" table cast to float with the three
        '<genotype> accuracy' columns appended

    # NOTE(review): originally described as tp / (tp + fp + tn + fn) for
    # each class -- this relies on 'number of genotypes' being that total;
    # confirm.
    """
    table = bcfstats.get_table_dataframe(filestats, "GCsAF").astype(float)
    denominator = table['number of genotypes']

    for label in ('RR Hom', 'RA Het', 'AA Hom'):
        table[f'{label} accuracy'] = table[f'{label} matches'] / denominator

    return table
def get_bin_recall(filestats: str) -> pd.DataFrame:
    """
    Add a per-genotype recall column to every row of the "GCsAF" table.

    Recall is the per-class concordance: for each of 'RR Hom', 'RA Het' and
    'AA Hom' it is matches / (mismatches + matches), i.e. tp / (tp + fn)
    with matches taken as tp and mismatches as fn.

    :param filestats: path to file
    :return: the "GCsAF" table cast to float with the three
        '<genotype> recall' columns appended
    """
    table = bcfstats.get_table_dataframe(filestats, "GCsAF").astype(float)

    for label in ('RR Hom', 'RA Het', 'AA Hom'):
        hits = table[f'{label} matches']
        misses = table[f'{label} mismatches']
        table[f'{label} recall'] = hits / (misses + hits)

    return table
def get_bin_precision(filestats: str) -> pd.DataFrame:
    """
    Placeholder for per-genotype precision, tp / (tp + fp); not implemented.

    The "GCsAF" table is still loaded and cast to float, so any error raised
    by those steps surfaces to the caller, but no precision is computed and
    the function always returns None despite its annotation.

    :param filestats: path to file
    :return: None
    """
    # Precision cannot be derived yet: it is unclear which counts of the
    # table are the false positives of a given class.
    table = bcfstats.get_table_dataframe(filestats, "GCsAF")
    table.astype(float)

    # Unverified ideas left by the original author:
    #   '<geno> matches' / 'number of genotypes'
    # or, for RR Hom:
    #   'RR Hom matches' / ('RA Het mismatches' + 'AA Hom mismatches'
    #                       - 'RR Hom mismatches' + 'RR Hom matches')
    return None
def get_bin_maf_nrd(filestats: str) -> pd.DataFrame:
    """
    Calculates NRD and per-genotype discordance for each bin of minor
    allele frequency (MAF).

    The "GCsAF" table is read, its 'allele frequency' column is folded to
    MAF (values above 0.5 become 1 - AF) and rounded to two decimals, and
    the counts of all rows sharing a MAF value are summed before the
    ratios are computed.

    :param filestats: path to file
    :return: frame indexed by 'minor allele frequency' holding the summed
        counts plus the columns 'tot matches', 'tot mismatches', 'NRD' and
        one '<genotype> discordance' column for each of 'RR Hom', 'RA Het'
        and 'AA Hom'

    # NRD and discordance is calculated as follows:
    # m .. number of matches
    # x .. number of mismatches
    # NRD = (xRR + xRA + xAA) / (xRR + xRA + xAA + mRA + mAA)
    # RR discordance = xRR / (xRR + mRR)
    # RA discordance = xRA / (xRA + mRA)
    # AA discordance = xAA / (xAA + mAA)
    """
    genos = ('RR Hom', 'RA Het', 'AA Hom')
    gcsaf = bcfstats.get_table_dataframe(filestats, "GCsAF")
    gcsaf = gcsaf.astype(float)

    # Create MAF values from AF: keep AF where AF <= 0.5, else use 1 - AF.
    af = gcsaf['allele frequency']
    gcsaf['minor allele frequency'] = af.where(af <= 0.5, 1 - af).round(
        decimals=2)

    # AF itself and the dosage r-squared are dropped before aggregating so
    # that they are not summed along with the counts.
    gcsaf = gcsaf.drop(labels=['allele frequency', 'dosage r-squared'],
                       axis=1)

    # Aggregate the counts per MAF value. The deprecated ``axis`` keyword of
    # groupby is omitted on purpose; grouping rows is the default.
    gcsaf = gcsaf.groupby(['minor allele frequency']).sum()

    # Compute number of (mis)matches
    gcsaf['tot matches'] = (gcsaf['RR Hom matches']
                            + gcsaf['RA Het matches']
                            + gcsaf['AA Hom matches'])
    tot_mismatches = (gcsaf['RR Hom mismatches']
                      + gcsaf['RA Het mismatches']
                      + gcsaf['AA Hom mismatches'])
    gcsaf['tot mismatches'] = tot_mismatches

    # RR matches are deliberately absent from the NRD denominator.
    gcsaf['NRD'] = tot_mismatches / (tot_mismatches
                                     + gcsaf['RA Het matches']
                                     + gcsaf['AA Hom matches'])

    for geno in genos:
        mismatches = gcsaf['{} mismatches'.format(geno)]
        matches = gcsaf['{} matches'.format(geno)]
        gcsaf['{} discordance'.format(geno)] = mismatches / (mismatches
                                                             + matches)

    return gcsaf