def get_vcf(self, processed_dir: str = "", real_path: str = "") -> [str]:
    '''Get a list of vcf files.

    Picks the VCF belonging to this case, passes it to ``move_vcf`` together
    with the destination ``<processed_dir>/<case_id>.vcf.gz`` and returns
    that destination.

    Args:
        processed_dir: Directory in which the destination path is built.
        real_path: Optional explicit VCF path. When non-empty it is used
            instead of the document entries stored in ``self._js``.

    Returns:
        A one-element list holding the destination path, or an empty list
        when the case has no VCF document or no matching file is found.
    '''
    if real_path:
        vcfs = [real_path]
    else:
        documents = self._js.get('documents')
        if not documents:
            return []
        # vcfs are saved inside documents and marked by is_vcf; entries that
        # are empty or carry no flag at all are treated as "not a vcf"
        vcfs = [
            d['document_name']
            for d in documents
            if d and d.get('is_vcf')
        ]

    # return empty if no vcfs present
    if not vcfs:
        return []

    case_id = self.get_case_id()
    destination_vcf = os.path.join(processed_dir, case_id + ".vcf.gz")

    if os.path.exists(vcfs[0]):
        # the recorded name is already a usable path
        vcf_path = vcfs[0]
        LOGGER.info("Case %s, VCF file %s is found.", case_id, vcfs[0])
    else:
        # fall back to the raw vcf directory below the base directory
        vcf_dir = os.path.join(self._base_dir, "vcfs")
        # NOTE(review): substring match - a case id that is a substring of
        # another file name would also match; confirm the naming scheme.
        case_vcfs = [v for v in os.listdir(vcf_dir) if case_id in v]
        if not case_vcfs:
            LOGGER.info("Case %s, VCF file %s could not be found.",
                        case_id, vcfs[0])
            return []
        vcf_path = os.path.join(vcf_dir, case_vcfs[0])

    # The listing of processed_dir was dropped: its result was never used
    # and it raised FileNotFoundError for the default processed_dir="".
    move_vcf(vcf_path, destination_vcf)

    return [destination_vcf]
def get_vcf(self, processed_dir: str = "data/PEDIA/vcfs/original") -> [str]:
    '''Get a list of vcf files.

    If ``<processed_dir>/<case_id>.vcf.gz`` is not present yet, the raw VCF
    of the case is looked up in ``<base_dir>/vcfs`` and passed to
    ``move_vcf`` together with its guessed mime type.

    Args:
        processed_dir: Directory holding the processed ``.vcf.gz`` files.

    Returns:
        A one-element list holding the destination path, or an empty list
        when the case has no VCF document or no raw file is found.

    Raises:
        OSError: If ``processed_dir`` (or, when needed, the raw vcf
            directory) cannot be listed.
    '''
    suffix = ".vcf.gz"

    documents = self._js.get('documents')
    if not documents:
        return []
    # vcfs are saved inside documents and marked by is_vcf; entries that are
    # empty or carry no flag at all are treated as "not a vcf"
    vcfs = [
        d['document_name']
        for d in documents
        if d and d.get('is_vcf')
    ]

    # return empty if no vcfs present
    if not vcfs:
        return []

    case_id = self.get_case_id()
    destination_vcf = os.path.join(processed_dir, case_id + suffix)

    # Case ids that already have a processed file. The suffix is cut off
    # explicitly: str.strip(".vcf.gz") would remove any of the characters
    # ".vcfgz" from both ends and mangle ids such as "f1z".
    processed_ids = {
        name[:-len(suffix)]
        for name in os.listdir(processed_dir)
        if name.endswith(suffix)
    }

    # convert and save vcfs to specified location if not already present
    if case_id not in processed_ids:
        vcf_dir = os.path.join(self._base_dir, "vcfs")
        # NOTE(review): substring match - a case id that is a substring of
        # another file name would also match; confirm the naming scheme.
        case_vcfs = [v for v in os.listdir(vcf_dir) if case_id in v]
        if not case_vcfs:
            LOGGER.info("Case %s, VCF file %s could not be found.",
                        case_id, vcfs[0])
            return []
        vcf_path = os.path.join(vcf_dir, case_vcfs[0])
        kind = filetype.guess(vcf_path)
        # get mimetype, falling back to "text" when it cannot be guessed
        mime = kind.mime if kind is not None else "text"
        move_vcf(vcf_path, destination_vcf, mime)

    return [destination_vcf]
def dump_vcf(self, path: str, recreate: bool = False) -> None:
    '''Dump the vcf of this case into the directory ``path``.

    When no vcf has been created yet (or ``recreate`` is set) generation is
    triggered first; the result is stored in ``self.vcf`` and the method
    calls itself again to write it. A string in ``self.vcf`` is treated as
    the error message of a failed generation and is only logged.

    Args:
        path: Directory the ``<case_id>.vcf`` file is written to.
        recreate: Ignore an existing ``self.vcf`` and generate it again.
    '''
    use_existing = hasattr(self, 'vcf') and not recreate

    if not use_existing:
        # catches cases without genomic entries
        if not self.hgvs_models or not self.get_variants():
            LOGGER.debug('VCF generation for case %s not possible, Error message: No variants',self.case_id)
            return
        LOGGER.debug("Generating VCF for case %s", self.case_id)
        self.vcf = self.create_vcf(path)
        self.dump_vcf(path)
        return

    if isinstance(self.vcf, str):
        LOGGER.debug(
            "VCF generation for case %s failed. Error message:%s",
            self.case_id, self.vcf)
        return

    # meta lines written in front of the variant table
    header_lines = (
        '##fileformat=VCFv4.1\n##INFO=<ID=HGVS,Number=1,Type=String,Description="HGVS-Code">\n##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
        '##contig=<ID=1,assembly=b37,length=249250621>\n',
        '##contig=<ID=2,assembly=b37,length=243199373>\n',
        '##contig=<ID=3,assembly=b37,length=198022430>\n',
        '##contig=<ID=4,assembly=b37,length=191154276>\n',
        '##contig=<ID=5,assembly=b37,length=180915260>\n',
        '##contig=<ID=6,assembly=b37,length=171115067>\n',
        '##contig=<ID=7,assembly=b37,length=159138663>\n',
        '##contig=<ID=8,assembly=b37,length=146364022>\n',
        '##contig=<ID=9,assembly=b37,length=141213431>\n',
        '##contig=<ID=10,assembly=b37,length=135534747>\n',
        '##contig=<ID=11,assembly=b37,length=135006516>\n',
        '##contig=<ID=12,assembly=b37,length=133851895>\n',
        '##contig=<ID=13,assembly=b37,length=115169878>\n',
        '##contig=<ID=14,assembly=b37,length=107349540>\n',
        '##contig=<ID=15,assembly=b37,length=102531392>\n',
        '##contig=<ID=16,assembly=b37,length=90354753>\n',
        '##contig=<ID=17,assembly=b37,length=81195210>\n',
        '##contig=<ID=18,assembly=b37,length=78077248>\n',
        '##contig=<ID=19,assembly=b37,length=59128983>\n',
        '##contig=<ID=20,assembly=b37,length=63025520>\n',
        '##contig=<ID=21,assembly=b37,length=48129895>\n',
        '##contig=<ID=22,assembly=b37,length=51304566>\n',
        '##contig=<ID=X,assembly=b37,length=155270560>\n',
        '##contig=<ID=Y,assembly=b37,length=59373566>\n',
    )

    outputpath = os.path.join(path, self.case_id + '.vcf')
    # add header to vcf
    with open(outputpath, 'w') as outfile:
        outfile.writelines(header_lines)

    # append the table below the header once the header file is closed
    # (self.vcf is presumably a pandas DataFrame - confirm against create_vcf)
    self.vcf.to_csv(outputpath, mode='a', sep='\t', index=False,
                    header=True, quoting=csv.QUOTE_NONE)
    # hand the plain file to move_vcf, then drop the uncompressed copy
    move_vcf(outputpath, outputpath + '.gz', 'text')
    os.remove(outputpath)