def test_figure(self):
    """
    Save every p-value figure of the "head" and "full" data sets as a PNG.

    For each data set folder below ``basepath``, the ``*.vcf.gz`` file
    selected by ``unlist1`` is opened with ``ReadVCF`` and each figure
    yielded by ``figure_pvalues`` is written into that folder, named after
    the figure's ``'name proposal'`` entry.
    """
    from idiva.stat.vcf_to_fisher import figure_pvalues

    for dataset in ("head", "full"):
        folder = basepath / dataset
        # NOTE(review): unlist1 presumably expects exactly one match — confirm.
        archive = unlist1(folder.glob("*.vcf.gz"))

        with ReadVCF.open(archive) as reader:
            for plot in figure_pvalues(reader):
                target = folder / plot.info['name proposal']
                plot.f.savefig(target.with_suffix(".png"))
def post(vcf_file: Path):
    """
    Postprocessing stage: export the p-value figures and their data.

    Every item yielded by ``figure_pvalues`` for ``vcf_file`` is saved next
    to that file, using the item's ``'name proposal'`` as the file stem:
    the figure as ``.png`` and the ``'df'`` dataframe as ``.csv``
    (separated by ``idiva.io.vcf.SEP``).

    Args:
        vcf_file: Path of the VCF file to read; output goes to its parent
            directory.
    """
    log.info("=> Entering the postprocessing stage.")

    from idiva.stat.vcf_to_fisher import figure_pvalues
    from idiva.io.vcf import SEP

    with ReadVCF.open(vcf_file) as reader:
        for plot in figure_pvalues(reader):
            stem = vcf_file.parent / plot.info['name proposal']
            log.info(F"Saving figure and data to {stem}.* .")

            # Figure first, then the table it was drawn from.
            plot.f.savefig(stem.with_suffix(".png"))
            plot.info['df'].to_csv(stem.with_suffix(".csv"), sep=SEP)
def translate_vcf(self, vcf) -> pd.DataFrame:
    """
    Load the leading columns of a vcf file and keep only SNP variants.

    The data lines of ``vcf`` are parsed into the ``DataHandler.INIT_COLS``
    columns. Rows whose REF or ALT is not exactly one of A, C, G, T are
    dropped, CHROM is mapped through ``self.translate_chrom``, the frame is
    passed to ``self.encode_ref_alt`` and exact duplicate rows are removed.

    Args:
        vcf: Anything ``ReadVCF.open`` accepts.

    Returns:
        The dataframe returned by ``self.encode_ref_alt``, without duplicate
        rows. The sample output kept in the comments below shows the columns
        CHROM, POS, ID, REF, ALT, VAR.

    Raises:
        AssertionError: If the download cache directory does not exist, or
            if an ALT entry lists several comma-separated alleles.
    """
    # Only the existence of the cache directory is checked here; nothing in
    # this method reads from it.
    cache = (Path(__file__).parent.parent.parent.parent /
             "input/download_cache").resolve()
    assert cache.is_dir()

    with ReadVCF.open(vcf) as reader:
        with seek_then_rewind(reader.fd, seek=reader.dataline_start_pos) as fd:
            dataframe = pd.read_csv(
                fd,
                sep='\t',
                usecols=range(len(DataHandler.INIT_COLS)),
                header=None,
                names=DataHandler.INIT_COLS,
                # Builtin types: the np.int / np.str aliases were removed
                # from NumPy (they were plain aliases of int / str).
                dtype={
                    'CHROM': int,
                    'POS': int,
                    'ID': str,
                    'REF': str,
                    'ALT': str,
                },
            )

    # Check that ALT holds a single value, not several values separated by ','
    assert not any(',' in alt for alt in dataframe['ALT'].unique())

    # Store only SNP variants: REF and ALT must both be a single base.
    bases = {'A', 'C', 'G', 'T'}
    is_snp = dataframe['REF'].isin(list(bases)) & dataframe['ALT'].isin(list(bases))
    # Copy so that the CHROM assignment below writes to an independent frame
    # instead of a slice of the parsed one.
    dataframe = dataframe[is_snp].copy()

    # Check if only SNP (every allowed value has length 1, so this also
    # covers the single-character requirement).
    assert set(dataframe['REF'].unique()).issubset(bases)
    assert set(dataframe['ALT'].unique()).issubset(bases)

    dataframe['CHROM'] = pd.to_numeric(
        dataframe[['CHROM']].apply(self.translate_chrom, axis=1))

    dataframe = self.encode_ref_alt(dataframe)

    # drop_duplicates() returns a new frame; keep the result.
    dataframe = dataframe.drop_duplicates()

    # TODO: same CHROM POS and rsID but not same REF & ALT
    #       consequence of real world data (Kjong Nov 30)
    #       => identify samples by CHROM, POS and VAR
    #
    #       same CHROM rsID REF ALT but not same POS
    #       => rsIDs are not completely unique!
    #       Ignore rsID (Kjong Nov 23)
    #
    # Sample rows showing both cases (found by comparing the number of
    # unique IDs with the total number of IDs):
    #
    #          CHROM       POS           ID REF ALT  VAR
    # 56638       17   1649616  rs544719440   A   G    2
    # 576511      17  19159733  rs540831825   A   G    2
    # 717227      17  27196477  rs202111951   T   C   10
    # 919995      17  34642425  rs568794696   C   A    3
    # 2105598     17  77663493  rs148485780   C   T    5
    #
    #          CHROM       POS           ID REF ALT  VAR
    # 56637       17   1649616  rs544719440   A   C    1
    # 576510      17  19159733  rs540831825   A   C    1
    # 717226      17  27196477  rs202111951   T   A    9
    # 919587      17  34540858  rs568794696   C   A    3
    # 2105592     17  77663435  rs148485780   C   T    5

    return dataframe
dataframe.to_csv(file_path, sep='\t', index=False) return dataframe if __name__ == '__main__': dh = DataHandler() print(dh.preprocess_clinvar()) cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve() assert cache.is_dir() with ReadVCF.open(cache / 'control_v2.vcf') as ctrl_vcf: with ReadVCF.open(cache / 'case_processed_v2.vcf') as case_vcf: test_set = dh.create_test_set_v2( case_vcf=case_vcf, ctrl_vcf=ctrl_vcf, ) print(test_set) """ print(dataframe) cache = (Path(__file__).parent.parent.parent.parent / "input/download_cache").resolve() assert cache.is_dir() file_path = str(cache) + "/cadd_full.vcf" dataframe = dataframe.fillna(value=".")