Exemplo n.º 1
0
def Vcf(fh, sv, variant_id):
    """Load structural-variant calls from a VCF file into ``sv``.

    Args:
        fh: Path to the VCF file. A path ending in ``.gz`` is opened with
            ``gzip``; anything else is opened as plain text.
        sv: List that receives one ``(chrom, start, end, svtype)`` tuple per
            accepted record. ``start`` is the POS column minus one
            (presumably converting 1-based VCF positions to 0-based starts).
        variant_id: Dict that maps the same tuple to the record's ID column.

    Header lines (starting with ``#``) and lines that ``tokenize`` rejects
    (returns ``0``) are ignored. Records with fewer than 8 columns, without
    an ``END=`` INFO entry, or without an ``SVTYPE=`` INFO entry are reported
    on stderr and skipped. Remaining records are kept only when
    ``checkCall`` returns ``True``.
    """
    errFH(fh)
    if fh.endswith('.gz'):
        import gzip
        vcf_fh = gzip.open(fh, 'r')
    else:
        vcf_fh = open(fh, 'r')
    # The with-block closes the handle; no explicit close() is needed.
    with vcf_fh as f:
        for l in f:
            if l.startswith('#'):
                continue
            r = tokenize(l)
            if r == 0:
                continue
            # Validate the column count before indexing so that truncated
            # lines are skipped with a warning instead of raising IndexError.
            if len(r) < 8:
                sys.stderr.write(
                    'WARNING: Skipping bad entry {} in VCF file {}. INFO column is required: {}\n'
                    .format(l.rstrip(), fh, __help_url__))
                continue
            c, s = str(r[0]), int(r[1])
            s -= 1
            # Reset per record: the SV type must never leak from a previous
            # line into a record that has no SVTYPE of its own.
            cl, e = None, -9
            for i in r[7].split(';'):
                if i.startswith('SVTYPE='):
                    cl = i[len('SVTYPE='):]
                if i.startswith('END=') and 'CI' not in i:
                    e = int(i.replace('END=', ''))
            if s == -9 or e == -9:
                sys.stderr.write(
                    'WARNING: Skipping bad entry {} in VCF file {}. SV entry missing positions: {}\n'
                    .format(l.rstrip(), fh, __help_url__))
                continue
            if cl is None:
                sys.stderr.write(
                    'WARNING: Skipping bad entry {} in VCF file {}. SV entry missing SVTYPE: {}\n'
                    .format(l.rstrip(), fh, __help_url__))
                continue
            if checkCall(cl, l, fh) == True:
                sv.append((c, s, e, cl))
                variant_id[(str(c), int(s), int(e), str(cl))] = str(r[2])
Exemplo n.º 2
0
 def load_clf(self, jsonfh=None):
     """Install the classifiers listed in a JSON file and record them.

     The JSON file maps a classifier name to a mapping of keys to
     classifier file paths. Each path that is an existing file is copied
     into ``<get_path()>/resources/training_sets/<name>/`` (the directory
     is created when absent). When the destination already exists the user
     is asked through ``query_yes_no`` (default answer ``'no'``) whether to
     replace it. Copied files are registered in ``self.clfs`` and the
     registry is finally written with ``dump_json(self.json, self.clfs)``.

     Paths that are not files are reported on stderr and skipped.
     """
     import shutil
     errFH(jsonfh)
     with open(jsonfh) as handle:
         requested = json.load(handle)
     installed = []
     for name in requested:
         print('loading {} classifier ...'.format(name))
         clf_dir = get_path() + '/resources/training_sets/' + name + '/'
         if not os.path.exists(clf_dir):
             os.makedirs(clf_dir)
         for key in requested[name]:
             source = str(requested[name][key])
             if not os.path.isfile(source):
                 sys.stderr.write(
                     'WARNING: {} does not exist. Please check the paths in {}\n'
                     .format(source, jsonfh))
                 continue
             # Destination keeps the source's file name (text after the
             # last '/').
             target = clf_dir + source.split('/')[-1]
             if os.path.isfile(target):
                 overwrite = query_yes_no(
                     'WARNING: {} exists... Replace?'.format(target), 'no')
             else:
                 overwrite = True
             if overwrite == True:
                 if self.clfs.get(name) == None:
                     self.clfs[name] = {}
                 self.clfs[name][key] = target
                 installed.append(target)
                 shutil.copyfile(source, target)
     for path in installed:
         print('installed classifier {}'.format(path))
     print('appending to {}... DO NOT ALTER {}'.format(self.json, self.json))
     dump_json(self.json, self.clfs)
Exemplo n.º 3
0
 def __init__(self, peds=None):
     """Collect sample IDs and sexes from zero or more PED files.

     Args:
         peds: Iterable of PED file paths. ``None`` (the default) is
             treated as "no files" and yields an empty object.

     Attributes set:
         sex: dict mapping sample ID (column 2) to sex code (column 5),
             where 1 is male and 2 is female.
         males: dict with an entry (value ``1``) for every male sample ID.
         ids: de-duplicated sample IDs sorted case-insensitively.

     Lines that ``tokenize`` rejects (returns ``0``) are ignored. Lines
     with fewer than 5 columns, or whose sex column is not 1 or 2, are
     reported on stderr and skipped.
     """
     self.sex = {}
     self.males = {}
     self.ids = []
     # `peds or ()` keeps the declared default (None) usable instead of
     # failing with "NoneType is not iterable".
     for fh in peds or ():
         errFH(fh)
         with open(fh, 'r') as f:
             for l in f:
                 r = tokenize(l)
                 if r == 0:
                     continue
                 if len(r) < 5:
                     sys.stderr.write(
                         'WARNING {} does not contain 5 elements. PED files are formatted as:\nFamily ID  Individual ID  Paternal ID  Maternal ID  Sex(1=male;2=female;)\n'
                         .format(l))
                     continue
                 sample_id = str(r[1])
                 try:
                     sex = int(r[4])
                 except ValueError:
                     # A non-numeric sex column (e.g. a header row) keeps
                     # its raw text so it is reported by the warning below
                     # rather than aborting the whole load.
                     sex = r[4]
                 if sex != 1 and sex != 2:
                     sys.stderr.write(
                         'WARNING {} is not an accepted sex entry. Accepted entries: 1=male; 2=female. Error found here:{}\n'
                         .format(sex, l))
                     continue
                 self.sex[sample_id] = sex
                 self.ids.append(sample_id)
                 if sex == 1:
                     self.males[sample_id] = 1
     self.ids = sorted(set(self.ids), key=str.lower)
Exemplo n.º 4
0
def Bed(fh, sv):
    """Append the SV calls found in a BED file to ``sv``.

    Each accepted line contributes a ``(chrom, start, end, svtype)`` tuple
    built from its first four columns. Lines starting with ``#`` are
    ignored. Lines that ``tokenize`` rejects (returns ``0``) or that have
    fewer than four columns are reported on stderr and skipped. Lines for
    which ``checkCall`` returns ``False`` are skipped silently here.
    """
    errFH(fh)
    with open(fh, 'r') as bed:
        for line in bed:
            if line.startswith('#'):
                continue
            fields = tokenize(line)
            if fields == 0 or len(fields) < 4:
                sys.stderr.write(
                    'WARNING: Skipping bad entry {} in BED file {}. BED format requirements: {}\n'
                    .format(line.rstrip(), fh, __help_url__))
                continue
            svtype = str(fields[3])
            if checkCall(svtype, line, fh) != False:
                chrom = str(fields[0])
                start = int(fields[1])
                end = int(fields[2])
                sv.append((chrom, start, end, str(fields[3])))
Exemplo n.º 5
0
def bam_init(args=None, Ped=None, Snv=None, gen=None):
    """Build ``Bam`` objects for every usable BAM file in ``args``.

    Args:
        args: Iterable of BAM file paths.
        Ped: Passed through to ``Bam``.
        Snv: Sequence of SNV objects; each exposes an ``id`` mapping that is
            queried with the BAM sample name to obtain ``snv_index``.
        gen: Passed through to ``Bam``.

    Returns:
        List of ``Bam`` objects that have a sample name and a matching SNV
        entry.

    A file is skipped with a warning on stderr when it is not indexed
    (``ValueError``), appears to be SAM (``AttributeError``), has no sample
    name, or has a sample name absent from every SNV object. The process
    exits with status 1 when ``Snv`` is empty/None or when no BAM file
    survives.
    """
    bams = []
    for f in args:
        errFH(f)
        try:
            aln = pysam.AlignmentFile(f)
            try:
                aln.check_index()
            finally:
                # Only the index check is needed; release the handle so one
                # file descriptor is not leaked per BAM.
                aln.close()
        except ValueError:
            sys.stderr.write(
                'WARNING: {} is not indexed with samtools index. Skipping ...\n'
                .format(f))
            continue
        except AttributeError:
            sys.stderr.write(
                'WARNING: {} appears to be in SAM format. Convert to BAM with `samtools view -bh {}` and index with samtools index\n'
                .format(f, f))
            continue
        bam = Bam(f, Ped, gen)
        # Snv=None (the declared default) is treated like an empty list
        # rather than failing on len(None).
        if Snv is None or len(Snv) < 1:
            snv_msg = 'FATAL ERROR: SNV file(s) were not formatted correctly. See https://github.com/dantaki/SV2/wiki/input#snv-vcf for details'
            print(snv_msg)
            sys.stderr.write(snv_msg + '\n')
            sys.exit(1)
        for snv in Snv:
            if snv.id.get(bam.id) is not None:
                bam.Snv, bam.snv_index = snv, snv.id[bam.id]
        if bam.id is None:
            # The placeholder is now filled in and the line is terminated;
            # previously a literal "{}" was printed with no newline.
            sys.stderr.write(
                'WARNING: Skipping BAM file {}. No sample name (@RG SM:<sample_id>). See https://github.com/dantaki/SV2/wiki/input#bam for details\n'
                .format(f))
        elif bam.Snv is None:
            sys.stderr.write(
                'WARNING: BAM file {} sample name (@RG  SM:<sample_id>):{} not found in SNV VCFs. Skipping {} ...\n'
                .format(f, bam.id, f))
        else:
            bams.append(bam)
    if len(bams) < 1:
        bam_msg = 'FATAL ERROR: BAM file(s) were not formatted correctly. See https://github.com/dantaki/SV2/wiki/input#bam for details'
        print(bam_msg)
        sys.stderr.write(bam_msg + '\n')
        sys.exit(1)
    return bams
Exemplo n.º 6
0
 def write_config(self, hg19=None, hg38=None, mm10=None):
     """Resolve default classifier paths and write the configuration file.

     Step 1: every entry of ``self.clfs['default']`` that is not an
     existing file is looked up under
     ``<get_path()>/resources/training_sets/``; when found the entry is
     replaced by that path, otherwise a fatal error is printed and the
     process exits with status 1. The registry is then saved with
     ``dump_json(self.json, self.clfs)``.

     Step 2: the INI file ``self.fh`` is written. When it does not exist
     it is created with the three FASTA paths and the resource directory
     exactly as given. When it exists it is read and only the arguments
     that are not ``None`` (each validated with ``errFH``) and a
     non-``None`` ``self.resource`` are updated.

     Args:
         hg19, hg38, mm10: FASTA paths stored in the ``FASTA_PATHS``
             section under the option of the same name.
     """
     conf = ConfigParser()

     def set_option(section, option, value):
         # ConfigParser.set raises NoSectionError for an unknown section,
         # so create the section on first use.
         if not conf.has_section(section):
             conf.add_section(section)
         conf.set(section, option, value)

     for x in self.clfs['default']:
         clf_fh = str(self.clfs['default'][x])
         if os.path.isfile(clf_fh):
             continue
         realpath = get_path() + '/resources/training_sets/' + clf_fh
         if os.path.isfile(realpath):
             self.clfs['default'][x] = realpath
         else:
             fatal = 'FATAL ERROR: {} pickle file not found. If this file is missing, reinstall sv2: pip uninstall sv2 -y && pip install sv2-VERSION.tar.gz'.format(
                 realpath)
             print(fatal)
             sys.stderr.write(fatal + '\n')
             sys.exit(1)
     dump_json(self.json, self.clfs)
     if not os.path.isfile(self.fh):
         set_option('FASTA_PATHS', 'hg19', hg19)
         set_option('FASTA_PATHS', 'hg38', hg38)
         set_option('FASTA_PATHS', 'mm10', mm10)
         # The RESOURCE_DIR section was never added before, so creating a
         # fresh config always failed with NoSectionError.
         set_option('RESOURCE_DIR', 'sv2_resource', self.resource)
     else:
         conf.read(self.fh)
         if hg19 is not None:
             errFH(hg19)
             set_option('FASTA_PATHS', 'hg19', hg19)
         if hg38 is not None:
             errFH(hg38)
             set_option('FASTA_PATHS', 'hg38', hg38)
         if mm10 is not None:
             errFH(mm10)
             set_option('FASTA_PATHS', 'mm10', mm10)
         if self.resource is not None:
             set_option('RESOURCE_DIR', 'sv2_resource', self.resource)
     # Open the file only once the configuration is fully built, so a
     # failure above cannot leave an empty or truncated file behind.
     with open(self.fh, 'w') as conf_fh:
         conf.write(conf_fh)