Пример #1
0
    def get_vcf(self,
            processed_dir: str = "",
            real_path: str = "") \
            -> [str]:
        '''Locate the VCF file of this case and hand it to move_vcf.

        Args:
            processed_dir: Directory in which the resulting
                "<case_id>.vcf.gz" file is placed.
            real_path: Explicit path of a VCF file. When given, the documents
                stored in the case JSON are not consulted.

        Returns:
            A one-element list holding the destination path, or an empty list
            if the case has no VCF document or no matching file can be found.
        '''
        if real_path:
            vcfs = [real_path]
        else:
            if 'documents' not in self._js:
                return []
            # vcfs are saved inside documents and marked by is_vcf
            vcfs = [
                d['document_name'] for d in self._js['documents']
                if d and d['is_vcf']
            ]
            # return empty if no vcfs present
            if not vcfs:
                return []

        case_id = self.get_case_id()
        # The destination is the same regardless of where the source is found.
        destination_vcf = os.path.join(processed_dir, case_id + ".vcf.gz")

        if os.path.exists(vcfs[0]):
            vcf_path = vcfs[0]
            LOGGER.info("Case %s, VCF file %s is found.", case_id, vcfs[0])
        else:
            # Fall back to the case's "vcfs" directory and pick the first file
            # whose name contains the case id.
            # NOTE: processed_dir is deliberately not listed here; it was never
            # needed for the lookup and listing it fails for the default "".
            vcf_dir = os.path.join(self._base_dir, "vcfs")
            case_vcfs = [v for v in os.listdir(vcf_dir) if case_id in v]
            if not case_vcfs:
                LOGGER.info("Case %s, VCF file %s could not be found.",
                            case_id, vcfs[0])
                return []
            vcf_path = os.path.join(vcf_dir, case_vcfs[0])

        # move_vcf is defined elsewhere; presumably it copies/compresses the
        # source file to the destination -- confirm against its definition.
        move_vcf(vcf_path, destination_vcf)

        return [destination_vcf]
Пример #2
0
    def get_vcf(self, processed_dir: str = "data/PEDIA/vcfs/original") \
            -> [str]:
        '''Return the processed VCF path of this case, creating it if needed.

        Args:
            processed_dir: Directory holding the processed
                "<case_id>.vcf.gz" files. It must exist, since it is listed
                to check whether the case has already been processed.

        Returns:
            A one-element list holding the destination path, or an empty list
            if the case has no VCF document or no raw file matches the case id.
        '''
        if 'documents' not in self._js:
            return []
        # vcfs are saved inside documents and marked by is_vcf
        vcfs = [
            d['document_name'] for d in self._js['documents']
            if d and d['is_vcf']
        ]
        # return empty if no vcfs present
        if not vcfs:
            return []

        case_id = self.get_case_id()
        destination_name = case_id + ".vcf.gz"
        destination_vcf = os.path.join(processed_dir, destination_name)

        # convert and save vcfs to specified location if not already present.
        # Compare whole file names: str.strip(".vcf.gz") removes a *set of
        # characters* from both ends and can therefore mangle case ids.
        if destination_name not in os.listdir(processed_dir):
            vcf_dir = os.path.join(self._base_dir, "vcfs")
            case_vcfs = [v for v in os.listdir(vcf_dir) if case_id in v]
            if not case_vcfs:
                LOGGER.info("Case %s, VCF file %s could not be found.",
                            case_id, vcfs[0])
                return []
            vcf_path = os.path.join(vcf_dir, case_vcfs[0])
            kind = filetype.guess(vcf_path)
            # get mimetype; fall back to "text" when the type is not recognised
            mime = kind.mime if kind is not None else "text"
            move_vcf(vcf_path, destination_vcf, mime)

        return [destination_vcf]
Пример #3
0
    def dump_vcf(self, path: str, recreate: bool = False) -> None:
        '''Write the case's vcf into the directory given by path.

        If no vcf has been created yet (or recreate is set), one is generated
        via create_vcf, stored in self.vcf, and the dump is retried. A string
        stored in self.vcf is treated as the error message of a failed
        generation and only logged.
        '''
        if not hasattr(self, 'vcf') or recreate:
            # catches cases without genomic entries
            if not self.hgvs_models or not self.get_variants():
                LOGGER.debug('VCF generation for case %s not possible, Error message: No variants',self.case_id)
            else:
                LOGGER.debug("Generating VCF for case %s", self.case_id)
                self.vcf = self.create_vcf(path)
                self.dump_vcf(path)
            return

        if isinstance(self.vcf, str):
            LOGGER.debug(
                "VCF generation for case %s failed. Error message:%s", self.case_id, self.vcf)
            return

        # VCF meta-information lines written ahead of the tabular data
        header_lines = (
            '##fileformat=VCFv4.1\n##INFO=<ID=HGVS,Number=1,Type=String,Description="HGVS-Code">\n##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n',
            '##contig=<ID=1,assembly=b37,length=249250621>\n',
            '##contig=<ID=2,assembly=b37,length=243199373>\n',
            '##contig=<ID=3,assembly=b37,length=198022430>\n',
            '##contig=<ID=4,assembly=b37,length=191154276>\n',
            '##contig=<ID=5,assembly=b37,length=180915260>\n',
            '##contig=<ID=6,assembly=b37,length=171115067>\n',
            '##contig=<ID=7,assembly=b37,length=159138663>\n',
            '##contig=<ID=8,assembly=b37,length=146364022>\n',
            '##contig=<ID=9,assembly=b37,length=141213431>\n',
            '##contig=<ID=10,assembly=b37,length=135534747>\n',
            '##contig=<ID=11,assembly=b37,length=135006516>\n',
            '##contig=<ID=12,assembly=b37,length=133851895>\n',
            '##contig=<ID=13,assembly=b37,length=115169878>\n',
            '##contig=<ID=14,assembly=b37,length=107349540>\n',
            '##contig=<ID=15,assembly=b37,length=102531392>\n',
            '##contig=<ID=16,assembly=b37,length=90354753>\n',
            '##contig=<ID=17,assembly=b37,length=81195210>\n',
            '##contig=<ID=18,assembly=b37,length=78077248>\n',
            '##contig=<ID=19,assembly=b37,length=59128983>\n',
            '##contig=<ID=20,assembly=b37,length=63025520>\n',
            '##contig=<ID=21,assembly=b37,length=48129895>\n',
            '##contig=<ID=22,assembly=b37,length=51304566>\n',
            '##contig=<ID=X,assembly=b37,length=155270560>\n',
            '##contig=<ID=Y,assembly=b37,length=59373566>\n',
        )

        target = os.path.join(path, self.case_id + '.vcf')
        # add header to vcf
        with open(target, 'w') as handle:
            handle.writelines(header_lines)

        # append the variant table below the header, tab separated
        self.vcf.to_csv(target, mode='a', sep='\t', index=False,
                        header=True, quoting=csv.QUOTE_NONE)
        # hand the plain-text file to move_vcf to produce the .gz file, then
        # drop the uncompressed intermediate
        move_vcf(target, target + '.gz', 'text')
        os.remove(target)