def Vcf(fh, sv, variant_id):
    """Parse structural-variant entries from a VCF file.

    Appends (chrom, start, end, svtype) tuples to `sv` and records the VCF
    ID column (r[2]) in `variant_id`, keyed by (chrom, start, end, svtype).
    The 1-based VCF POS is converted to a 0-based start. Malformed entries
    are skipped with a warning on stderr.

    Args:
        fh: path to a VCF file, optionally gzip-compressed (.gz)
        sv: list to append SV call tuples to (mutated in place)
        variant_id: dict mapping call tuples to VCF IDs (mutated in place)
    """
    errFH(fh)
    # Transparently handle gzip-compressed VCFs.
    if fh.endswith('.gz'):
        import gzip
        vcf_fh = gzip.open(fh, 'r')
    else:
        vcf_fh = open(fh, 'r')
    # BUG FIX: the original called vcf_fh.close() after the `with` block,
    # which had already closed the handle; the redundant close is removed.
    with vcf_fh as f:
        for l in f:
            if l.startswith('#'):
                continue  # header / meta line
            # -9 sentinels mean "position not yet found"
            s, e, r = -9, -9, tokenize(l)
            if r != 0:
                c, s = str(r[0]), int(r[1])
                s -= 1  # VCF POS is 1-based; convert to 0-based start
                if len(r) < 8:
                    sys.stderr.write(
                        'WARNING: Skipping bad entry {} in VCF file {}. INFO column is required: {}\n'
                        .format(l.rstrip(), fh, __help_url__))
                else:
                    # Extract SVTYPE and END from the INFO column.
                    for i in r[7].split(';'):
                        if 'SVTYPE=' in i:
                            cl = i.replace('SVTYPE=', '')
                        # 'CI' guard skips confidence-interval tags (CIPOS/CIEND)
                        if i.startswith('END=') and 'CI' not in i:
                            e = int(i.replace('END=', ''))
                    if s == -9 or e == -9:
                        sys.stderr.write(
                            'WARNING: Skipping bad entry {} in VCF file {}. SV entry missing positions: {}\n'
                            .format(l.rstrip(), fh, __help_url__))
                    elif checkCall(cl, l, fh) == True:
                        sv.append((c, s, e, cl))
                        variant_id[(str(c), int(s), int(e), str(cl))] = str(r[2])
def load_clf(self, jsonfh=None):
    """Install classifier pickle files listed in a JSON manifest.

    Copies each referenced classifier into this package's
    resources/training_sets/<name>/ directory (prompting before replacing
    an existing file), records the installed paths in self.clfs, and dumps
    the updated mapping back to self.json.

    Args:
        jsonfh: path to a JSON manifest of {name: {key: classifier_path}}
    """
    import shutil
    errFH(jsonfh)
    clfs = {}
    with open(jsonfh) as f:
        clfs = json.load(f)
    realpaths = []
    for name in clfs:
        # print(single_arg) behaves identically under py2 and py3
        print('loading {} classifier ...'.format(name))
        clf_dir = get_path() + '/resources/training_sets/' + name + '/'
        if not os.path.exists(clf_dir):
            os.makedirs(clf_dir)
        for x in clfs[name]:
            clffh = str(clfs[name][x])
            if not os.path.isfile(clffh):
                sys.stderr.write(
                    'WARNING: {} does not exist. Please check the paths in {}\n'
                    .format(clffh, jsonfh))
            else:
                clfname = clffh.split('/').pop()
                newpath = clf_dir + clfname
                clfreplace = True
                if os.path.isfile(newpath):
                    # ask before clobbering an already-installed classifier
                    clfreplace = query_yes_no(
                        'WARNING: {} exists... Replace?'.format(newpath), 'no')
                if clfreplace == True:
                    if self.clfs.get(name) is None:
                        self.clfs[name] = {}
                    self.clfs[name][x] = newpath
                    realpaths.append(newpath)
                    shutil.copyfile(clffh, newpath)
    for x in realpaths:
        print('installed classifier {}'.format(x))
    print('appending to {}... DO NOT ALTER {}'.format(self.json, self.json))
    dump_json(self.json, self.clfs)
def __init__(self, peds=None):
    """Load sample sex assignments from one or more PED files.

    Populates self.sex ({sample_id: 1|2}), self.males ({sample_id: 1} for
    males only), and self.ids (sorted, case-insensitive, de-duplicated
    sample IDs). Rows with fewer than 5 columns or an invalid sex code are
    skipped with a warning on stderr.
    """
    self.sex = {}
    self.males = {}
    self.ids = []
    for ped_fh in peds:
        errFH(ped_fh)
        with open(ped_fh, 'r') as ped:
            for line in ped:
                row = tokenize(line)
                if row == 0:
                    continue
                if len(row) < 5:
                    sys.stderr.write(
                        'WARNING {} does not contain 5 elements. PED files are formatted as:\nFamily ID Individual ID Paternal ID Maternal ID Sex(1=male;2=female;)\n'
                        .format(line))
                    continue
                sample_id, sex = str(row[1]), int(row[4])
                if sex not in (1, 2):
                    sys.stderr.write(
                        'WARNING {} is not an accepted sex entry. Accepted entries: 1=male; 2=female. Error found here:{}\n'
                        .format(sex, line))
                    continue
                self.sex[sample_id] = sex
                self.ids.append(sample_id)
                if sex == 1:
                    self.males[sample_id] = 1
    self.ids = sorted(set(self.ids), key=str.lower)
def Bed(fh, sv):
    """Read SV calls from a BED file.

    Appends (chrom, start, end, svtype) tuples to `sv` for every entry
    that has at least 4 columns and passes checkCall; anything else is
    skipped with a warning on stderr.
    """
    errFH(fh)
    with open(fh, 'r') as bed:
        for line in bed:
            if line.startswith('#'):
                continue  # comment / header line
            fields = tokenize(line)
            if fields == 0 or len(fields) < 4:
                sys.stderr.write(
                    'WARNING: Skipping bad entry {} in BED file {}. BED format requirements: {}\n'
                    .format(line.rstrip(), fh, __help_url__))
            elif checkCall(str(fields[3]), line, fh) != False:
                sv.append((str(fields[0]), int(fields[1]),
                           int(fields[2]), str(fields[3])))
def bam_init(args=None, Ped=None, Snv=None, gen=None):
    """Validate input BAM files and wrap each in a Bam object.

    Skips files that are unindexed, in SAM format, lack a sample name
    (@RG SM), or whose sample name is not present in the SNV VCFs. Exits
    fatally (status 1) when the SNV list is empty or no usable BAM remains.

    Returns:
        list of Bam objects with their Snv/snv_index attributes attached.
    """
    bams = []
    for f in args:
        errFH(f)
        try:
            pysam.AlignmentFile(f).check_index()
        except ValueError:
            sys.stderr.write(
                'WARNING: {} is not indexed with samtools index. Skipping ...\n'
                .format(f))
            continue
        except AttributeError:
            # check_index is unavailable for SAM-format inputs
            sys.stderr.write(
                'WARNING: {} appears to be in SAM format. Convert to BAM with `samtools view -bh {}` and index with samtools index\n'
                .format(f, f))
            continue
        bam = Bam(f, Ped, gen)
        if len(Snv) < 1:
            print('FATAL ERROR: SNV file(s) were not formatted correctly. See https://github.com/dantaki/SV2/wiki/input#snv-vcf for details')
            sys.stderr.write(
                'FATAL ERROR: SNV file(s) were not formatted correctly. See https://github.com/dantaki/SV2/wiki/input#snv-vcf for details\n'
            )
            sys.exit(1)
        # Attach the SNV VCF that contains this BAM's sample name.
        for snv in Snv:
            if snv.id.get(bam.id) is not None:
                bam.Snv, bam.snv_index = snv, snv.id[bam.id]
        if bam.id is not None and bam.Snv is None:
            sys.stderr.write(
                'WARNING: BAM file {} sample name (@RG SM:<sample_id>):{} not found in SNV VCFs. Skipping {} ...\n'
                .format(f, bam.id, f))
        if bam.id is None:
            # BUG FIX: the original left a literal '{}' in this warning
            # (missing .format(f)) and omitted the trailing newline.
            sys.stderr.write(
                'WARNING: Skipping BAM file {}. No sample name (@RG SM:<sample_id>). See https://github.com/dantaki/SV2/wiki/input#bam for details\n'
                .format(f))
        if bam.id is not None and bam.Snv is not None:
            bams.append(bam)
    if len(bams) < 1:
        print('FATAL ERROR: BAM file(s) were not formatted correctly. See https://github.com/dantaki/SV2/wiki/input#bam for details')
        sys.stderr.write(
            'FATAL ERROR: BAM file(s) were not formatted correctly. See https://github.com/dantaki/SV2/wiki/input#bam for details\n'
        )
        sys.exit(1)
    return bams
def write_config(self, hg19=None, hg38=None, mm10=None):
    """Write or update the sv2 config file (FASTA paths + resource dir).

    First verifies the default classifier pickles exist, rewriting relative
    manifest entries to absolute package paths and exiting fatally if a
    pickle is missing, then dumps the manifest and writes self.fh.

    Args:
        hg19, hg38, mm10: optional FASTA paths; when updating an existing
            config only non-None values overwrite the stored entries.
    """
    conf = ConfigParser()
    for x in self.clfs['default']:
        clf_fh = str(self.clfs['default'][x])
        if not os.path.isfile(clf_fh):
            # manifest entries may be relative to the package resource dir
            realpath = get_path() + '/resources/training_sets/' + clf_fh
            if os.path.isfile(realpath):
                self.clfs['default'][x] = realpath
            else:
                print('FATAL ERROR: {} pickle file not found. If this file is missing, reinstall sv2: pip uninstall sv2 -y && pip install sv2-VERSION.tar.gz'.format(realpath))
                sys.stderr.write(
                    'FATAL ERROR: {} pickle file not found. If this file is missing, reinstall sv2: pip uninstall sv2 -y && pip install sv2-VERSION.tar.gz\n'
                    .format(realpath))
                sys.exit(1)
    dump_json(self.json, self.clfs)
    if not os.path.isfile(self.fh):
        # Fresh config file: create every section before setting options.
        with open(self.fh, 'w') as conf_fh:
            conf.add_section('FASTA_PATHS')
            conf.set('FASTA_PATHS', 'hg19', hg19)
            conf.set('FASTA_PATHS', 'hg38', hg38)
            conf.set('FASTA_PATHS', 'mm10', mm10)
            # BUG FIX: the original called conf.set('RESOURCE_DIR', ...)
            # without adding the section first, raising NoSectionError.
            conf.add_section('RESOURCE_DIR')
            conf.set('RESOURCE_DIR', 'sv2_resource', self.resource)
            conf.write(conf_fh)
    else:
        conf.read(self.fh)
        if hg19 is not None:
            errFH(hg19)
            conf.set('FASTA_PATHS', 'hg19', hg19)
        if hg38 is not None:
            errFH(hg38)
            conf.set('FASTA_PATHS', 'hg38', hg38)
        if mm10 is not None:
            errFH(mm10)
            conf.set('FASTA_PATHS', 'mm10', mm10)
        if self.resource is not None:
            conf.set('RESOURCE_DIR', 'sv2_resource', self.resource)
        with open(self.fh, 'w') as conf_fh:
            conf.write(conf_fh)