def replace_genome(filename, tp, outfile1, outfile2, BWA):
    """Write two base-converted copies of a FASTA genome and bwa-index them.

    Header lines (starting with '>') are copied unchanged to both outputs.
    Every other line is upper-cased; in ``outfile1`` the base ``tp[0]`` is
    replaced by ``tp[1]``, in ``outfile2`` ``BT[tp[0]]`` is replaced by
    ``BT[tp[1]]`` (``BT`` is a module-level mapping defined outside this
    block -- presumably the base-complement table; confirm).

    Both files are then indexed with ``<BWA> index`` (the two processes run
    concurrently) and the FASTA copies are deleted, leaving only the index
    files next to them.

    Args:
        filename: input genome, opened through ``shell.IsGzipFile``.
        tp: two-item sequence ``(from_base, to_base)``.
        outfile1: path of the first converted FASTA.
        outfile2: path of the second converted FASTA.
        BWA: path of the bwa executable.

    Returns:
        0 on success.  Exits the interpreter with status 1 when either
        ``bwa index`` run fails.
    """
    f = shell.IsGzipFile(filename)
    try:
        with open(outfile1, 'wt') as out1, open(outfile2, 'wt') as out2:
            for line in f:
                if line.startswith('>'):
                    out1.write(line)
                    out2.write(line)
                else:
                    upper = line.upper()
                    out1.write(upper.replace(tp[0], tp[1]))
                    out2.write(upper.replace(BT[tp[0]], BT[tp[1]]))
    finally:
        f.close()

    # The output of bwa is never read, so send it to DEVNULL.  Using PIPE
    # together with wait() can dead-lock once the OS pipe buffer fills up.
    p1 = subprocess.Popen([BWA, 'index', outfile1],
                          stdout=subprocess.DEVNULL,
                          stderr=subprocess.DEVNULL)
    p2 = subprocess.Popen([BWA, 'index', outfile2],
                          stdout=subprocess.DEVNULL,
                          stderr=subprocess.DEVNULL)
    p1.wait()
    p2.wait()
    if p1.returncode != 0 or p2.returncode != 0:
        shell.eprint('[' + PROGRAM + '] Error: bwa index failed')
        sys.exit(1)

    # Only the index files are needed afterwards; mirror `rm -f` without
    # going through a shell (paths may contain spaces or metacharacters).
    for path in (outfile1, outfile2):
        try:
            os.remove(path)
        except OSError:
            pass
    return 0
def bwaaln(tp, genomefile, fq1, fq2, phred, outdir, config):
    """Align reads with ``bwa aln`` + ``samse``/``sampe`` and write a BAM.

    Args:
        tp: library type key used to index the config ('DNA'/'RNA' in the
            callers visible in this file) and to name the output.
        genomefile: bwa-indexed reference.
        fq1: first (or only) FASTQ file.
        fq2: second FASTQ file, or '' for single-end data.
        phred: quality encoding, 33 or 64 (64 adds ``-I`` to ``bwa aln``).
        outdir: output directory prefix; it is concatenated directly, so it
            must end with a path separator.
        config: dict holding ``config['bwa']['aln'][tp]`` and
            ``config['bwa']['sampe'][tp]`` option mappings.

    Returns:
        Path of the written BAM file, ``outdir + tp + '_aln.bam'``.
        Exits the interpreter with status 1 on an unsupported ``phred``
        value or when a ``bwa aln`` run fails.
    """
    if phred == 33:
        args = []
    elif phred == 64:
        args = ['-I']
    else:
        shell.eprint('[' + PROGRAM + '] Error: phred error')
        sys.exit(1)
    for k, v in config['bwa']['aln'][tp].items():
        if v != '':
            args += [k, v]

    paired = fq2 != ''
    sai1 = outdir + os.path.basename(fq1) + '_1.sai'
    sai2 = outdir + os.path.basename(fq2) + '_2.sai' if paired else None

    # Both `bwa aln` runs are started before either is waited on so they
    # execute concurrently.
    p1 = subprocess.Popen([BWA, 'aln'] + args +
                          [genomefile, fq1, '-f', sai1],
                          stdout=subprocess.DEVNULL,
                          stderr=subprocess.DEVNULL)
    p2 = None
    if paired:
        p2 = subprocess.Popen([BWA, 'aln'] + args +
                              [genomefile, fq2, '-f', sai2],
                              stdout=subprocess.DEVNULL,
                              stderr=subprocess.DEVNULL)
        p2.wait()
    p1.wait()
    # Fail when *any* run failed.  The previous `and` test ignored a single
    # failure and raised NameError for single-end input (p2 undefined).
    if p1.returncode != 0 or (p2 is not None and p2.returncode != 0):
        shell.eprint('[' + PROGRAM + '] Error: bwa aln error')
        sys.exit(1)

    args = []
    for k, v in config['bwa']['sampe'][tp].items():
        args += [k, v]
    if paired:
        cmd = [BWA, 'sampe'] + args + [genomefile, sai1, sai2, fq1, fq2]
    else:
        # The sampe options are not applied to samse.
        cmd = [BWA, 'samse', genomefile, sai1, fq1]
    p = subprocess.Popen(cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.DEVNULL)

    # Stream the SAM produced by bwa straight into a BAM file.
    bamfile = outdir + tp + '_aln.bam'
    infile = pysam.AlignmentFile(p.stdout, 'r')
    oufile = pysam.AlignmentFile(bamfile, 'wb', template=infile)
    for r in infile:
        oufile.write(r)
    p.wait()
    oufile.close()
    infile.close()

    # Remove the intermediate .sai files (equivalent of `rm -f`, no shell).
    for sai in (sai1, sai2):
        if sai is None:
            continue
        try:
            os.remove(sai)
        except OSError:
            pass
    return bamfile
def spawnSourceFiles(entries, isTarget=False, systemHeaders=False):
    """Dispatch every path in *entries* to the file or directory spawner.

    Regular files go to ``spawnSourceFile``, directories to
    ``spawnSourceDirectory``; anything else is reported through ``eprint``.
    A falsy *entries* (``None`` or an empty container) is a no-op.
    """
    for entry in entries or ():
        if os.path.isfile(entry):
            handler = spawnSourceFile
        elif os.path.isdir(entry):
            handler = spawnSourceDirectory
        else:
            eprint("Error: couldn't process path '%s'." % entry)
            continue
        handler(entry, isTarget, systemHeaders)
def _transfer_cigar(cigartuples, start, query, qual): ''' Judge the extend end of scaffold by read if out, and return ''' #shell.eprint(cigartuples) d = {} # record handle of each cigar offset = 0 # the offset on read i = start # i for Delete position in genome #shell.eprint(query) for operation, num in cigartuples: #shell.eprint(offset) #shell.eprint(query[offset: offset+num]) if operation == 0: ## M try: d['M'].append( (start, start + num, query[offset:offset + num], qual[offset:offset + num])) except KeyError: d['M'] = [(start, start + num, query[offset:offset + num], qual[offset:offset + num])] #shell.eprint(d) start += num - 1 i += num offset += num elif operation == 1: ## I try: d['I'].append((start, query[offset:offset + num], qual[offset:offset + num])) except KeyError: d['I'] = [(start, query[offset:offset + num], qual[offset:offset + num])] start += 1 offset += num elif operation == 2: ## D try: d['D'].append((i, num)) except KeyError: d['D'] = [(i, num)] start += num + 1 i += num elif operation == 3: ## N start += num + 1 i += num offset += num elif operation == 4: ## S offset += num else: shell.eprint( '[' + PROGRAM + '] Error: the cigar string may have unsupported char, should only have [MISDN]' ) return start, d
def bamsortindex(bamfile):
    """Sort *bamfile* with pysam, index the result and delete the input.

    The sorted file is written next to the input as
    ``<name without extension>.sorted.bam``.

    Returns:
        Path of the sorted BAM.  Exits the interpreter with status 1 when
        sorting fails.  An indexing failure is only reported, the function
        still removes the input and returns.
    """
    # splitext only looks at the file name, so a dot inside a directory
    # name no longer truncates the path.
    prefix = os.path.splitext(bamfile)[0]
    sortedbam = prefix + '.sorted.bam'
    try:
        pysam.sort('-o', sortedbam, bamfile)
    except Exception:
        # `except Exception` keeps KeyboardInterrupt/SystemExit untouched.
        shell.eprint('[' + PROGRAM + '] Error: ' + bamfile + ' sort error')
        sys.exit(1)
    try:
        pysam.index(sortedbam)
    except Exception:
        shell.eprint('[' + PROGRAM + '] Error: ' + bamfile + ' index error')
    # Equivalent of `rm -f` without building a shell command line.
    try:
        os.remove(bamfile)
    except OSError:
        pass
    return sortedbam
def _trim(cigartuples, t, start, seq, qual): ''' Trim the reads' 5' and 3' end. ''' newcigarlist, flag, tr = [], 1, 0 ## tr for operation, num in cigartuples: if flag: if operation in (0, 1, 4): ## 0 for M, 1 for I, 4 for S if num + tr > t[0]: #shell.eprint(num, tr, t[0]) newcigarlist.append((operation, num + tr - t[0])) if operation == 0 and len(t) == 2: start += t[0] - tr flag = 0 else: tr += num if operation == 0 and len(t) == 2: start += num #shell.eprint(newcigarlist, 'ok') elif operation in (2, 3): ## 2 for D, 3 for N if len(t) == 2: start += num else: shell.eprint( '[' + PROGRAM + '] Error: the cigar string may have unsupported char, should only have [MISDN]' ) else: newcigarlist.append((operation, num)) #shell.eprint(newcigarlist, t) if len(t) == 2: return _trim(newcigarlist[::-1], [t[1]], start, seq[t[0]:-t[1]], qual[t[0]:-t[1]]) elif len(t) == 1: ## if newcigarlist[0][0] == 1: newcigarlist[0] = (4, newcigarlist[0][1]) elif newcigarlist[0][0] in (2, 3): newcigarlist = newcigarlist[1:] ## if newcigarlist[-1][0] == 1: newcigarlist[-1] = (4, newcigarlist[-1][1]) elif newcigarlist[-1][0] in (2, 3): newcigarlist = newcigarlist[:-1] return newcigarlist[::-1], start, seq, qual
def getDotPicture(graph, engine):
    """Render a graph description to PNG with *engine* and return the image.

    Args:
        graph: graph source text (``str`` or ``bytes``).
        engine: name or path of the rendering executable; it is invoked as
            ``<engine> -Tpng -o <png> <dot>`` through ``runCommand``.

    Returns:
        The raw contents of the generated PNG file as bytes.

    A missing executable is reported through ``eprint``; rendering is
    still attempted afterwards, as before.
    """
    if not executableExists(engine):
        eprint("No 'dot' executable!")
    dotFileName = None
    pngFileName = None
    try:
        with tempfile.NamedTemporaryFile(delete=False) as dotFile:
            dotFileName = os.path.abspath(dotFile.name)
            # NamedTemporaryFile is opened in binary mode, so text has to
            # be encoded before it is written.
            if isinstance(graph, bytes):
                dotFile.write(graph)
            else:
                dotFile.write(graph.encode('utf-8'))
        with tempfile.NamedTemporaryFile(delete=False) as pngFile:
            pngFileName = os.path.abspath(pngFile.name)
        runCommand("%s -Tpng -o %s %s" % (engine, pngFileName, dotFileName))
        # PNG is binary data; text mode would try to decode it.
        with open(pngFileName, "rb") as graphFile:
            return graphFile.read()
    finally:
        # delete=False leaves the files behind, remove them explicitly.
        for tmpName in (dotFileName, pngFileName):
            if tmpName is not None:
                try:
                    os.remove(tmpName)
                except OSError:
                    pass
def get_config():
    """Parse the command line of the sam2base tool.

    Options:
        -h, --help          print the help text and exit with status 0.
        -t, --trim  A,B     two comma separated integers, stored as the
                            tuple ``config['sam2base']['trim']``.
        -p, --prefix NAME   output prefix, stored as
                            ``config['sam2base']['output']``; must not end
                            with '.'.

    Exactly two positional arguments are required: the genome file and the
    BAM file.

    Returns:
        ``(config, genomefile, bamfile)``.  Exits the interpreter with
        status 2 on an unknown option and 1 on an invalid value or a wrong
        number of positional arguments.
    """
    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'hp:t:',
                                      ['help', 'trim=', 'prefix='])
    except getopt.GetoptError as e:
        shell.eprint('[' + PROGRAM + '] Error: ' + str(e))
        sys.exit(2)
    if optlist == [] and args == []:
        print_help()
        sys.exit(0)
    config = {'sam2base': {}}
    for opt, value in optlist:
        if opt in ('-h', '--help'):
            print_help()
            sys.exit(0)
        elif opt in ('-t', '--trim'):
            parts = value.split(',')
            try:
                config['sam2base']['trim'] = (int(parts[0]), int(parts[1]))
            except (ValueError, IndexError):
                # IndexError: only one number was given (e.g. `--trim 5`).
                shell.eprint('[' + PROGRAM +
                             '] Error: --trim parameter should be integer')
                sys.exit(1)
        elif opt in ('-p', '--prefix'):
            # getopt reports short options with their dash ('-p'); the
            # former test against 'p' never matched.
            if value.endswith('.'):
                shell.eprint(
                    '[' + PROGRAM +
                    '] Error: --prefix parameter should not end with \'.\'')
                sys.exit(1)
            else:
                config['sam2base']['output'] = value
        else:
            assert False, 'unhandled option'
    try:
        genomefile, bamfile = args
    except ValueError:
        shell.eprint('[' + PROGRAM +
                     '] Error: only two input files should be provided')
        sys.exit(1)
    return config, genomefile, bamfile
def soapnuke(tp, fq1, fq2, phred, config):
    """Run ``SOAPnuke filter`` on single- or paired-end FASTQ files.

    Args:
        tp: library type key; selects ``config['soapnuke']['filter'][tp]``
            and names the output directory ``soapnuke/<tp>``.
        fq1: first (or only) FASTQ file.
        fq2: second FASTQ file, or '' for single-end data.
        phred: 33 (passes ``-Q 2``) or 64 (passes ``-Q 1``).
        config: dict holding extra SOAPnuke options; options whose value is
            '' are skipped.

    Returns:
        ``(clean_fq1, clean_fq2, 33)`` -- the paths of the cleaned files
        (``clean_fq2`` stays '' for single-end data) and the constant 33.
        Exits the interpreter with status 1 on an unsupported ``phred``
        value or when SOAPnuke returns a non-zero status.
    """
    if phred == 33:
        args = ['-Q', '2']
    elif phred == 64:
        args = ['-Q', '1']
    else:
        shell.eprint('[' + PROGRAM + '] Error: phred error')
        sys.exit(1)

    outdir = 'soapnuke/' + tp
    if fq2 == '':
        clean1 = os.path.basename(fq1) + '.clean.fq.gz'
        args += ['-1', fq1, '-o', outdir, '-C', clean1]
        fq1 = outdir + '/' + clean1
    else:
        clean1 = os.path.basename(fq1) + '_1.clean.fq.gz'
        clean2 = os.path.basename(fq2) + '_2.clean.fq.gz'
        args += [
            '-1', fq1, '-2', fq2, '-o', outdir, '-C', clean1, '-D', clean2
        ]
        fq1, fq2 = outdir + '/' + clean1, outdir + '/' + clean2
    for k, v in config['soapnuke']['filter'][tp].items():
        if v != '':
            args += [k, v]

    # Keep the log handle in a with-block so it is closed (and flushed)
    # before the log is read back on failure.
    with open('log', 'w') as log:
        p = subprocess.Popen([SOAPNUKE, 'filter'] + args + ['-G', '-5', '1'],
                             stdout=log,
                             stderr=subprocess.STDOUT)
        p.wait()
    if p.returncode != 0:
        shell.eprint('[' + PROGRAM + '] Error: soapnuke run error')
        with open('log', 'r') as log:
            shell.eprint(log.read())
        sys.exit(1)
    return fq1, fq2, 33
def sam2base(genomefile,
             sortbamfile,
             trim=(0, 0),
             output='',
             suffix='.sb.gz'):
    '''
    Bam to sb file.
    sb -> singlebase, for the future, sbz for the sb specific compressed file, sbi for the sb index file.

    genomefile  -- reference genome, iterated with shell.Fa2Geno which is
                   expected to yield (name, sequence, length) per scaffold.
    sortbamfile -- coordinate sorted and indexed BAM; must end with '.bam'.
    trim        -- (5' bases, 3' bases) to remove from every read; the pair
                   is swapped for reverse-strand reads (flag 16).
    output      -- output prefix; defaults to sortbamfile.
    suffix      -- appended to the prefix to build the output file name.

    Each written line holds, tab separated: scaffold, 1-based position,
    reference base, the piled-up bases ("[M]:..;[I]:..;[D]:.."), the
    matching qualities and the number of matched bases.  Only positions
    covered by at least one matched base are written.

    Returns the output path.  Exits the interpreter on a non-'.bam' input
    (status 1), a negative trim (status 255) or when fetching reads raises
    ValueError (status 1, the partial output is removed).
    '''

    def _trim(cigartuples, t, start, seq, qual):
        '''
        Trim the reads' 5' and 3' end.
        '''
        newcigarlist, flag, tr = [], 1, 0  ## tr: read bases trimmed so far
        for operation, num in cigartuples:
            if not flag:
                newcigarlist.append((operation, num))
                continue
            if operation in (0, 1, 4):  ## 0 for M, 1 for I, 4 for S
                if num + tr > t[0]:
                    newcigarlist.append((operation, num + tr - t[0]))
                    if operation == 0 and len(t) == 2:
                        start += t[0] - tr
                    flag = 0
                else:
                    tr += num
                    if operation == 0 and len(t) == 2:
                        start += num
            elif operation in (2, 3):  ## 2 for D, 3 for N
                if len(t) == 2:
                    start += num
            else:
                shell.eprint(
                    '[' + PROGRAM +
                    '] Error: the cigar string may have unsupported char, should only have [MISDN]'
                )
        if len(t) == 2:
            # Explicit end index: `seq[a:-b]` is empty when b == 0.
            seq_end = max(len(seq) - t[1], 0)
            qual_end = max(len(qual) - t[1], 0)
            return _trim(newcigarlist[::-1], [t[1]], start,
                         seq[t[0]:seq_end], qual[t[0]:qual_end])
        elif len(t) == 1:
            # The list is reversed here; an empty list means the whole
            # read was trimmed away.
            if newcigarlist:
                if newcigarlist[0][0] == 1:
                    newcigarlist[0] = (4, newcigarlist[0][1])
                elif newcigarlist[0][0] in (2, 3):
                    newcigarlist = newcigarlist[1:]
            if newcigarlist:
                if newcigarlist[-1][0] == 1:
                    newcigarlist[-1] = (4, newcigarlist[-1][1])
                elif newcigarlist[-1][0] in (2, 3):
                    newcigarlist = newcigarlist[:-1]
            return newcigarlist[::-1], start, seq, qual

    def _transfer_cigar(cigartuples, start, query, qual):
        '''
        Split a read into 'M', 'I' and 'D' records according to its CIGAR
        and return (running reference coordinate, records).
        '''
        d = {}  # records collected per operation type
        offset = 0  # the offset on read
        i = start  # i for Delete position in genome
        for operation, num in cigartuples:
            if operation == 0:  ## M
                d.setdefault('M', []).append(
                    (start, start + num, query[offset:offset + num],
                     qual[offset:offset + num]))
                start += num - 1
                i += num
                offset += num
            elif operation == 1:  ## I
                d.setdefault('I', []).append(
                    (start, query[offset:offset + num],
                     qual[offset:offset + num]))
                start += 1
                offset += num
            elif operation == 2:  ## D
                d.setdefault('D', []).append((i, num))
                start += num + 1
                i += num
            elif operation == 3:  ## N
                # N consumes reference only; the read offset stays put.
                start += num + 1
                i += num
            elif operation == 4:  ## S
                offset += num
            else:
                shell.eprint(
                    '[' + PROGRAM +
                    '] Error: the cigar string may have unsupported char, should only have [MISDN]'
                )
        return start, d

    def _update(seq, dw, **kw):
        '''
        Merge the records of one read (kw) into the per-position pile dw.
        dw maps position -> {'M'|'I'|'D': [bases, quals]}; M entries are
        concatenated, I and D entries are joined with ','.
        '''
        for begin, end, bases, quals in kw.get('M', ()):
            for j, pos in enumerate(range(begin, end)):
                base, q = bases[j], quals[j]
                site = dw.setdefault(pos, {})
                if 'M' in site:
                    site['M'][0] += base
                    site['M'][1] += q
                else:
                    site['M'] = [base, q]
        for pos, bases, quals in kw.get('I', ()):
            site = dw.setdefault(pos, {})
            if 'I' in site:
                site['I'][0] += ',' + bases
                site['I'][1] += ',' + quals
            else:
                site['I'] = [bases, quals]
        for pos, length in kw.get('D', ()):
            # Deleted bases are taken from the reference; their quality is
            # written as a run of '0' characters.
            st = ''.join(seq[pos + k - 1] for k in range(length))
            site = dw.setdefault(pos, {})
            if 'D' in site:
                site['D'][0] += ',' + st
                site['D'][1] += ',' + '0' * length
            else:
                site['D'] = [st, '0' * length]
        return dw

    def _write2gzipfile(dw, fw):
        '''
        Write all positions of dw that have matched bases and return a
        fresh empty pile.  Uses `sca` and `seq` of the scaffold currently
        processed by the enclosing loop.
        '''
        wr = []
        for key in sorted(dw.keys()):
            site = dw[key]
            if 'M' not in site:
                continue
            tags = ['M'] + [tag for tag in ('I', 'D') if tag in site]
            bases = ';'.join('[' + tag + ']:' + site[tag][0] for tag in tags)
            quals = ';'.join('[' + tag + ']:' + site[tag][1] for tag in tags)
            wr.append(sca + '\t' + str(key) + '\t' + seq[key - 1] + '\t' +
                      bases + '\t' + quals + '\t' + str(len(site['M'][0])) +
                      '\n')
        fw.write(''.join(wr))
        return {}

    if not sortbamfile.endswith('.bam'):
        shell.eprint('[' + PROGRAM +
                     '] Error: input should be sorted and indexed bam')
        sys.exit(1)
    if trim[0] < 0 or trim[1] < 0:
        shell.eprint(
            'Are you serious ? --trim should not be negative integer.')
        sys.exit(255)
    if output == '':
        output = sortbamfile
    outname = output + suffix
    # Get a chromosome or scaffold from genome then do with bam
    ## If scaffold has no mapping reads, could it raise error?
    with gzip.open(outname, 'wt') as fw, \
            pysam.AlignmentFile(sortbamfile, 'rb') as f:
        for sca, seq, seqlen in shell.Fa2Geno(genomefile):
            dw, newend = {}, 1
            try:
                for read in f.fetch(contig=sca):
                    # Everything left of this read can no longer change:
                    # flush the pile collected so far.
                    if 1 < newend < read.reference_start + 1:
                        dw = _write2gzipfile(dw, fw)
                    if len(read.query_sequence) != len(read.qual):
                        continue
                    if trim == (0, 0):
                        newstart, d = _transfer_cigar(
                            read.cigartuples, read.reference_start + 1,
                            read.query_sequence, read.qual)
                    else:
                        # Swap the ends for reverse-strand reads *for this
                        # read only*.  Re-assigning `trim` itself flipped
                        # the setting for every following read.
                        read_trim = trim[::-1] if read.flag & 16 else trim
                        newstart, d = _transfer_cigar(
                            *_trim(list(read.cigartuples), read_trim,
                                   read.reference_start + 1,
                                   read.query_sequence, read.qual))
                    # if seqlen < newstart: continue
                    if newstart > newend:
                        newend = newstart
                    dw = _update(seq, dw, **d)
                _write2gzipfile(dw, fw)
            except ValueError:
                shell.eprint(
                    '[' + PROGRAM +
                    '] Error: input should be sorted and indexed bam')
                # Close before deleting the partial output (no shell).
                fw.close()
                try:
                    os.remove(outname)
                except OSError:
                    pass
                sys.exit(1)
    return outname
def merge_RES(Config,
              Genomefile,
              Phred_DNA=33,
              Phred_RNA=33,
              Qual_cutoff=30,
              HomoPrior=0.99,
              Rate=2,
              Method='Bayesian',
              Ploidy=2,
              Intron=None,
              DNAdepth=10,
              RNAdepth=3,
              Bayesian_Posterior_Probability=0.95,
              FDR_DNA_Heterozygosis=0.05,
              Non_Ref_BaseCount=0,
              Paralogous_D=1,
              Intronic=6,
              Homopolymer=1,
              out_path='./'):
    '''
    merge the result of scanner and check

    Config is a whitespace separated table with one sample per line:
      column 1  sample name
      column 2  DNA single-base file (gzip); "<column 2>.stat" must exist
      column 3  RNA single-base file (gzip) used for strands '+' and '.'
      column 4  scanner result table for that RNA file
      column 5  (optional) RNA single-base file used for strand '-'
      column 6  (optional) scanner result table for that RNA file

    All sites found in the result tables are merged into one table keyed
    by (chromosome, coordinate, strand).  Sites are then filtered
    (conflicting edit types between samples, intron boundaries when Intron
    is given, homopolymers, excessive DNA depth, multiple RNA alternative
    alleles, lower-covered strand of a double-stranded site), missing
    DNA/RNA counts are filled in from the single-base files, the edit
    p-values are FDR adjusted per sample and the result is written to
    <out_path>/RES.txt.

    Method must be 'Bayesian', 'Binomial' or 'Frequency'; HomoPrior and
    Rate are accepted but not used by this function.

    Returns 0.  Exits the interpreter with status 1 on an invalid Method,
    a malformed Config line, a missing .stat file or inconsistent RNA
    counts.
    '''
    BT = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
    BASE = ('A', 'C', 'G', 'T')

    def read(f, s, table, several_tp):
        '''Load one scanner result table for sample s into table.'''
        for line in f:
            if line[0] == '#':
                continue
            fd = line.strip().split()
            key1 = (fd[0], fd[1], fd[2])
            site = table.setdefault(key1, {})
            site[(s, 'DNA')] = fd[5]
            site[(s, 'RNA')] = fd[9]
            if Method == 'Bayesian':
                site[(s, 'SNPvalue')] = (fd[6], fd[7])
            else:
                site[(s, 'SNPvalue')] = fd[6]
            site[(s, 'Editvalue')] = fd[12]
            site['gbase'] = fd[3].upper()
            # Remember sites whose edit type differs between tables.
            if 'type' not in site:
                site['type'] = fd[10]
            elif site['type'] != fd[10]:
                several_tp.add(key1)
        return 0

    def deleteintron(sca, l, table):
        '''Drop the positions in l of scaffold sca for every strand.'''
        for i in l:
            for strand in ('+', '-', '.'):
                table.pop((sca, str(i), strand), None)
        return 0

    def caculus(f, table, errorRate, st):
        '''
        Fill in RNA base counts and edit p-value of the current sample
        (`s` of the enclosing loop) for table sites that lack them.
        '''
        for line in f:
            fd = line.strip().split()
            # Upper-cased like the DNA pass so a soft-masked reference
            # base still compares equal to BASE.
            ref = fd[2].upper()
            for strand in st:
                key1 = (fd[0], fd[1], strand)
                if key1 not in table:
                    continue
                key2 = (s, 'RNA')
                if key2 in table[key1]:
                    continue
                editV = (s, 'Editvalue')
                # Fields are strings; the former `== 0` was never true.
                if fd[5] == '0':
                    table[key1][key2] = '0,0,0,0'
                    table[key1][editV] = '1'
                else:
                    seq, qual = shell._DealBasequality(
                        fd[3][4:4 + int(fd[-1])].upper(),
                        fd[4][4:4 + int(fd[-1])], Phred_RNA)
                    # Count each base (was `seq.count(i) for b in BASE`).
                    l = [seq.count(b) for b in BASE]
                    table[key1][key2] = '{},{},{},{}'.format(*l)
                    tol, editdep = 0, 0
                    for i in range(4):
                        tol += l[i]
                        if BASE[i] != ref and l[i] > editdep:
                            editdep = l[i]
                    table[key1][editV] = shell._SNVvalue_Binomial(
                        (editdep, tol), errorRate, RNA=True)
        return 0

    if Method not in ('Bayesian', 'Binomial', 'Frequency'):
        print_help()
        sys.exit(1)
    elif Method == 'Bayesian' and Genomefile is None:
        print_help()
        sys.exit(1)

    ## read config file
    # Per-sample dict keys: 0 DNA sb file, 3 its .stat file, 1.1/1.2 first
    # RNA sb file and result table, 2.1/2.2 optional second pair.
    sample, order = {}, []
    with open(Config, 'r') as f:
        for line in f:
            fd = line.strip().split()
            if not fd:
                continue  # tolerate blank lines
            if len(fd) < 4:
                shell.eprint('[bigtable] Error: config file format error')
                sys.exit(1)
            order.append(fd[0])
            if not os.path.isfile(fd[1] + '.stat'):
                shell.eprint(
                    '[bigtable] Error: DNA single base file do not have .stat file in the same directory'
                )
                sys.exit(1)
            sample[fd[0]] = {3: fd[1] + '.stat'}
            sample[fd[0]][0] = fd[1]
            sample[fd[0]][1.1] = fd[2]
            sample[fd[0]][1.2] = fd[3]
            if len(fd) == 6:
                sample[fd[0]][2.1] = fd[4]
                sample[fd[0]][2.2] = fd[5]

    ## read table
    several_tp, table, peak_dep = set(), {}, {}
    for s, v in sample.items():
        f = shell.IsGzipFile(v[1.2])
        f.readline()
        read(f, s, table, several_tp)
        f.close()
        with open(v[3], 'r') as f:
            fd = f.readlines()[1].strip().split()
            peak_dep[s] = max(float(fd[3]), float(fd[4]))
        # The second result table is optional.  Test for it explicitly
        # instead of hiding every error behind a bare except.
        if 2.2 in v:
            f = shell.IsGzipFile(v[2.2])
            f.readline()
            read(f, s, table, several_tp)
            f.close()

    ## delete sites have different editing type in different sample
    for key in several_tp:
        del table[key]

    ## filter sites locating near junctions
    if Intron is not None:
        f = shell.IsGzipFile(Intron)
        for line in f:
            fd = line.strip().split()
            if int(fd[3]) < int(fd[4]):
                beg, end = int(fd[3]), int(fd[4])
            else:
                beg, end = int(fd[4]), int(fd[3])
            deleteintron(fd[1], range(beg, beg + Intronic), table)
            deleteintron(fd[1], range(end - Intronic + 1, end + 1), table)
        f.close()

    ## filter homopolymer
    if Homopolymer:
        seq, leng = {}, {}
        m = [re.compile(b * 5) for b in BASE]
        name, chunks = None, []
        f = shell.IsGzipFile(Genomefile)
        try:
            for line in f:
                if line[0] == '>':
                    st = ''.join(chunks)
                    if st:
                        seq[name] = st
                        leng[name] = len(st)
                    chunks = []
                    name = line.strip().split()[0][1:]
                else:
                    chunks.append(line.strip().upper())
        finally:
            f.close()
        st = ''.join(chunks)
        if st:
            seq[name] = st
            leng[name] = len(st)
        delkey = set()
        for key in table:
            sca, pos, strand = key
            beg = int(pos) - 4 if int(pos) - 4 > 1 else 1
            end = int(pos) + 4 if int(pos) + 4 < leng[sca] else leng[sca]
            nt = seq[sca][beg - 1:end]
            for m1 in m:
                # NOTE(review): match() only detects a run that starts at
                # the first base of the window; search() would detect any
                # 5-base run covering the site.  Kept as is -- confirm the
                # intended filter before changing results.
                if m1.match(nt):
                    delkey.add(key)
                    break
        for key in delkey:
            del table[key]
        del seq
        del leng
        del delkey

    if Method == 'Bayesian':
        basecontent = shell._BasePercent(Genomefile)
        weight = 0.5
        weight_other = (1 - weight) / 2
        ## adjust substitution rate for illumina
        FixedError = {
            'A': {
                'C': weight,
                'T': weight_other,
                'G': weight_other
            },
            'C': {
                'A': weight,
                'T': weight_other,
                'G': weight_other
            },
            'G': {
                'T': weight,
                'A': weight_other,
                'C': weight_other
            },
            'T': {
                'G': weight,
                'A': weight_other,
                'C': weight_other
            }
        }
        FixedKey = []
        for key in FixedError.keys():
            for ke in FixedError[key].keys():
                if (key, ke) not in FixedKey and (ke, key) not in FixedKey:
                    FixedKey.append((key, ke))

    errorRate = 10**(-1 * Qual_cutoff / 10)
    for s, v in sample.items():
        # DNA counts / SNP value for sites a sample's table did not report.
        with gzip.open(v[0], 'rt') as f:
            for line in f:
                fd = line.strip().split()
                ref = fd[2].upper()
                for strand in ('+', '-', '.'):
                    key1 = (fd[0], fd[1], strand)
                    if key1 not in table:
                        continue
                    key2 = (s, 'DNA')
                    if key2 in table[key1]:
                        continue
                    snpV = (s, 'SNPvalue')
                    if fd[5] == '0':
                        table[key1][snpV] = 'NA'
                        table[key1][key2] = '0,0,0,0'
                        continue
                    seq, qual = shell._DealBasequality(
                        fd[3][4:4 + int(fd[-1])].upper(),
                        fd[4][4:4 + int(fd[-1])], Phred_DNA)
                    # Count each base (was `seq.count(i) for b in BASE`).
                    counts = ','.join([str(seq.count(b)) for b in BASE])
                    table[key1][key2] = counts
                    if seq == '':
                        table[key1][snpV] = 'NA'
                        continue
                    if Method == 'Bayesian':
                        result = shell._SNPvalue_Bayesian(
                            seq, qual, Ploidy, FixedKey, FixedError,
                            basecontent)
                        table[key1][snpV] = (result[0], result[1])
                    elif Method == 'Binomial':
                        table[key1][snpV] = shell._SNVvalue_Binomial(
                            counts, ref)
                    elif Method == 'Frequency':
                        table[key1][snpV] = shell._SNPvalue_Frequency(
                            seq, ref)[1]
                    else:
                        shell.eprint(
                            '[bigtable] Error: --method not recognize')
                        sys.exit(1)
        with gzip.open(v[1.1], 'rt') as f:
            caculus(f, table, errorRate, ['+', '.'])
        if 2.1 not in v:
            continue
        with gzip.open(v[2.1], 'rt') as f:
            caculus(f, table, errorRate, ['-'])

    ## Complement DNA and RNA information for all sites in the table
    for k, v in table.items():
        for spl in order:
            keyDNA, keysnpV, keyRNA, keyeditV = (spl, 'DNA'), (
                spl, 'SNPvalue'), (spl, 'RNA'), (spl, 'Editvalue')
            if keyDNA not in v:
                v[keyDNA] = '0,0,0,0'
                v[keysnpV] = 'NA'
            if keyRNA not in v:
                v[keyRNA] = '0,0,0,0'
                v[keyeditV] = '1'

    ## Remove sites with high DNA depth and multiple RNA editing types
    delkey = set()
    for key1 in table:
        for s in order:
            fd = [int(i) for i in table[key1][(s, 'DNA')].split(',')]
            if Paralogous_D:
                dep = sum(fd)
                if dep > 2 * peak_dep[s]:
                    delkey.add(key1)
                    break
            fd = [int(i) for i in table[key1][(s, 'RNA')].split(',')]
            rna_count = {}
            for i in range(4):
                if BASE[i] != table[key1]['gbase']:
                    rna_count[BASE[i]] = fd[i]
            key3 = sorted(rna_count.keys(),
                          key=lambda x: rna_count[x],
                          reverse=True)
            if len(key3) != 3:
                shell.eprint('[bigtable] Error: rna depth error')
                sys.exit(1)
            if rna_count[key3[0]] > 0 and (rna_count[key3[1]] /
                                           rna_count[key3[0]]) > 0.01:
                delkey.add(key1)
                break
    for key in delkey:
        del table[key]

    ## A site reported on both strands keeps only the deeper strand.
    delkey = set()
    for key1 in table:
        # Build the strand keys from the site being visited; the former
        # code used `k`, a stale variable left over from an earlier loop.
        kplus, kminus = (key1[0], key1[1], '+'), (key1[0], key1[1], '-')
        if kplus in table and kminus in table:
            plusdep, miusdep = 0, 0
            for s in order:
                plusdep += sum(
                    [int(i) for i in table[kplus][(s, 'RNA')].split(',')])
                miusdep += sum(
                    [int(i) for i in table[kminus][(s, 'RNA')].split(',')])
            if plusdep > miusdep:
                delkey.add(kminus)
            elif plusdep < miusdep:
                delkey.add(kplus)
    for key in delkey:
        del table[key]
    del delkey

    ## FDR adjustment, per sample
    fdr, binomial = {}, {}
    for key1 in table:
        for s in order:
            fdr.setdefault(s, []).append(
                [key1, table[key1][(s, 'Editvalue')]])
            # 'NA' marks sites without DNA information; it cannot take
            # part in the numeric adjustment (float('NA') raises).
            if Method == 'Binomial' and table[key1][(s,
                                                     'SNPvalue')] != 'NA':
                binomial.setdefault(s, []).append(
                    [key1, table[key1][(s, 'SNPvalue')]])
    for s, v in fdr.items():
        fd = sorted(v, key=lambda x: float(x[1]), reverse=True)
        fd_fdr = shell._Fdr([float(i[1]) for i in fd])
        for i in range(len(fd_fdr)):
            table[fd[i][0]][(s, 'Editvalue')] = fd_fdr[i]
    least_dep = 1
    if Method == 'Binomial':
        for s, v in binomial.items():
            fd = sorted(v, key=lambda x: float(x[1]), reverse=True)
            fd_fdr = shell._Fdr([float(i[1]) for i in fd])
            for i in range(len(fd_fdr)):
                table[fd[i][0]][(s, 'SNPvalue')] = shell._FormatP(fd_fdr[i])
        # Smallest depth at which zero alternative reads is significant,
        # bounded by the lowest peak depth.  The bound has to be the depth
        # value; the former code compared against the sample *name*.
        min_peak = min(peak_dep.values(), default=0)
        ratio = 1 / Ploidy
        while least_dep < min_peak:
            if shell._SNVvalue_Binomial(
                (0, least_dep), ratio, RNA=True) < 0.05:
                break
            least_dep += 1

    ## write result
    title = '#1.Chromosome\t2.Coordinate\t3.Strand\t4.Gbase\t5.EditType'
    for idx, s in enumerate(order):
        title += '\t' + str(
            6 + idx * 2) + '.' + s + '.DNA_baseCount[A,C,G,T]\t' + str(
                7 + idx * 2) + '.' + s + '.RNA_basecount[A,C,G,T];P_value'
    title += '\n'
    with open(os.path.join(out_path, 'RES.txt'), 'w') as fw:
        fw.write(title)
        for key1 in sorted(table.keys(), key=lambda x: (x[0], int(x[1]))):
            site = table[key1]
            info = []
            flag = 1  # cleared when any sample looks like a DNA variant
            for s in order:
                rnainfo = site[(s, 'RNA')].split(',')
                rna_dep = sum([int(i) for i in rnainfo])
                dna_dep = sum([int(i) for i in site[(s, 'DNA')].split(',')])
                snp = site[(s, 'SNPvalue')]
                if snp != 'NA':
                    if Method == 'Bayesian':
                        if len(set(snp[0]) - set(site['gbase'])
                               ) != 0 or float(
                                   snp[1]) < Bayesian_Posterior_Probability:
                            flag = 0
                    elif Method == 'Binomial':
                        if dna_dep < least_dep:
                            n = sum(1 for i in rnainfo if int(i) == 0)
                            if n < 3:
                                flag = 0
                        else:
                            # float(): the value may be text (taken from a
                            # result table or formatted by _FormatP).
                            if float(snp) < FDR_DNA_Heterozygosis:
                                flag = 0
                    elif Method == 'Frequency':
                        if float(snp) < Non_Ref_BaseCount:
                            flag = 0
                entry = site[(s, 'DNA')] + '\t' + site[(s, 'RNA')] + ';' + str(
                    site[(s, 'Editvalue')])
                if dna_dep >= DNAdepth and rna_dep >= RNAdepth and float(
                        site[(s, 'Editvalue')]) < 0.05:
                    alt_base = site['type'].split('->')[1]
                    if key1[2] == '-':
                        alt_base = BT[alt_base]
                    base_dep = {}
                    for i in range(4):
                        base_dep[BASE[i]] = int(rnainfo[i])
                    # '*' marks a sample that supports the edit.
                    if base_dep[alt_base] > 0:
                        entry += '*'
                info.append(entry)
            if flag == 1:
                fw.write(('{}\t' * 5).format(
                    *(list(key1) + [site['gbase'], site['type']])) +
                         '\t'.join(info) + '\n')
    return 0
def get_config():
    """Parse the command line, resolve bwa and enter the output directory.

    Options:
        -h, --help          print the help text and exit (status 1).
        -o, --outdir DIR    output directory (stored in global OUTDIR).
        --bwa PATH          bwa executable (stored in global BWA).

    The first positional argument is the genome file and must exist.  When
    ``--bwa`` is not given, bwa is looked up with ``which``.  When no
    output directory is given, ``<cwd>/regeo`` is used.  The output
    directory is created if necessary and becomes the working directory.

    Returns:
        Absolute path of the genome file.  Exits the interpreter with
        status 2 on an unknown option and 1 on any other problem.
    """
    global BWA, OUTDIR
    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'ho:',
                                      ['help', 'outdir=', 'bwa='])
    except getopt.GetoptError as e:
        shell.eprint('[' + PROGRAM + '] Error: ' + str(e))
        sys.exit(2)
    if optlist == [] and args == []:
        print_help()
        sys.exit(0)
    for opt, value in optlist:
        if opt in ('-h', '--help'):
            print_help()
            sys.exit(1)
        elif opt in ('-o', '--outdir'):
            OUTDIR = os.path.abspath(value) + '/'
        elif opt == '--bwa':
            if os.path.exists(value):
                BWA = os.path.abspath(value)
            else:
                shell.eprint('[' + PROGRAM +
                             '] Error: bwa path does not exist')
                sys.exit(1)

    # A missing positional argument raises IndexError (not ValueError),
    # so check for it explicitly.
    if not args:
        shell.eprint('[' + PROGRAM + '] Error: genome file does not exist')
        sys.exit(1)
    genomefile = args[0]
    if os.path.isfile(genomefile):
        genomefile = os.path.abspath(genomefile)
    else:
        shell.eprint('[' + PROGRAM + '] Error: genome file does not exist')
        sys.exit(1)

    if BWA == '':
        r = subprocess.getstatusoutput('which bwa')
        # Any non-zero status means bwa was not found, not only status 1.
        if r[0] != 0:
            shell.eprint('[' + PROGRAM + '] Warning: lack bwa program')
            sys.exit(1)
        else:
            BWA = r[1]

    # Apply the default *before* creating the directory: os.makedirs('')
    # always fails, which made the default unreachable.
    if OUTDIR == '':
        OUTDIR = os.getcwd() + '/regeo'
    try:
        os.makedirs(OUTDIR)
    except FileExistsError:
        pass
    except OSError:
        shell.eprint('[' + PROGRAM +
                     '] Error: outdir could not be created, please check')
        sys.exit(1)
    os.chdir(OUTDIR)
    return genomefile
def main():
    """Run the alignment pipeline configured on the command line.

    Steps, in order:
      1. optional read filtering with soapnuke (FLAG bit 2),
      2. optional SNP correction of the genome with pilon (FLAG bit 4):
         bwa mem alignment of the DNA reads, per-chunk BAM extraction,
         one pilon run per genome chunk, concatenation of the results,
      3. bwa index of the genome used for alignment if an index file is
         missing,
      4. bwa aln alignment of DNA and RNA reads unless a BAM was supplied,
      5. best/unique read selection (bestuniqbam) plus sort and index.

    Inputs come from module-level globals (DBAM, RBAM, DFQ1, ...) set
    outside this block and from get_config().  All intermediate paths
    ('log', 'pilon/', 'aln/') are relative to the working directory.

    Returns 0; exits the interpreter with status 1 when an external tool
    fails.
    """
    global DBAM, RBAM, DFQ1, DFQ2, RFQ1, RFQ2, PDFQ1, PRFQ1
    config, genomefile = get_config()
    logging.info('Program Start')
    # --- step 1: read filtering ------------------------------------------
    if FLAG & 2:
        logging.info('do soapnuke')
        if DFQ1 != '':
            DFQ1, DFQ2, PDFQ1 = soapnuke('DNA', DFQ1, DFQ2, PDFQ1, config)
        if RFQ1 != '':
            RFQ1, RFQ2, PRFQ1 = soapnuke('RNA', RFQ1, RFQ2, PRFQ1, config)
        logging.info('soapnuke has done')
    # --- step 2: genome correction with pilon ----------------------------
    if FLAG & 4:
        logging.info('do pilon')
        logging.info('check genome index')
        # Build the bwa index once if any of its files is missing.
        for suffix in ['.amb', '.ann', '.bwt', '.pac', '.sa']:
            if os.path.isfile(genomefile + suffix) == False:
                logging.info('genome index not found')
                # NOTE(review): the handle returned by open('log', 'w') is
                # never closed explicitly (same pattern further below).
                p = subprocess.Popen([BWA, 'index', genomefile],
                                     stdout=open('log', 'w'),
                                     stderr=subprocess.STDOUT)
                p.wait()
                if p.returncode != 0:
                    shell.eprint('[' + PROGRAM +
                                 '] Error: bwa index run error')
                    shell.eprint(''.join(open('log', 'r').read()))
                    sys.exit(1)
                logging.info('genome index done')
                break
        try:
            os.makedirs('pilon/')
        except FileExistsError:
            pass
        except:
            shell.eprint('[' + PROGRAM + '] Error: mkdir pilon/ error')
            sys.exit(1)
        args = []
        for k, v in config['bwa']['mem'].items():
            args += (k, v)
        args += [genomefile, DFQ1] if DFQ2 == '' else [genomefile, DFQ1, DFQ2]
        # Stream the SAM written by `bwa mem` into a BAM file.
        p = subprocess.Popen([BWA, 'mem'] + args,
                             stdout=subprocess.PIPE,
                             stderr=open('log', 'w'))
        infile = pysam.AlignmentFile(p.stdout, 'r')
        oufile = pysam.AlignmentFile('pilon/mem_just.bam',
                                     'wb',
                                     template=infile)
        for r in infile:
            oufile.write(r)
        infile.close()
        oufile.close()
        p.wait()
        if p.returncode != 0:
            shell.eprint('[' + PROGRAM + '] Error: bwa mem run error')
            shell.eprint(''.join(open('log', 'r').read()))
            sys.exit(1)
        DBAM_mem = bamsortindex('pilon/mem_just.bam')
        try:
            os.makedirs('pilon/sub/')
        except FileExistsError:
            pass
        except:
            shell.eprint('[' + PROGRAM + '] Error: mkdir pilon/sub/ error')
            sys.exit(1)
        # splitFa is expected to leave .fa/.bed pairs in pilon/sub/ (only
        # files with these extensions are picked up below).
        shell.splitFa(genomefile, 'pilon/sub/')
        f = pysam.AlignmentFile(DBAM_mem, 'rb')
        # One BAM per .bed file, holding the primary alignments (flags 256
        # and 2048 are skipped) of the contigs named in its first column.
        for fn in [
                'pilon/sub/' + bed for bed in os.listdir('pilon/sub/')
                if bed.endswith('.bed')
        ]:
            fw = pysam.AlignmentFile('.'.join(fn.split('.')[:-1]) + '.bam',
                                     'wb',
                                     template=f)
            with open(fn, 'r') as fg:
                for line in fg:
                    for r in f.fetch(contig=line.strip().split()[0]):
                        if r.flag & 256 or r.flag & 2048:
                            continue
                        fw.write(r)
            fw.close()
            try:
                pysam.index('.'.join(fn.split('.')[:-1]) + '.bam')
            except:
                # An index failure is reported but does not stop the run.
                shell.eprint('[' + PROGRAM + '] Error: ' +
                             '.'.join(fn.split('.')[:-1]) + '.bam' +
                             ' index error')
        f.close()
        # Run pilon once per genome chunk.
        for fn in [
                'pilon/sub/' + fa for fa in os.listdir('pilon/sub/')
                if fa.endswith('.fa')
        ]:
            # Sum of column 3 of the chunk's .bed file decides the Java
            # heap size (8g up to 100,000,000, otherwise 16g).
            maxn = int(
                subprocess.getstatusoutput(
                    'awk \'BEGIN{x};{x+=$3};END{print x}\' ' +
                    '.'.join(fn.split('.')[:-1]) + '.bed')[1])
            maxmem = '-Xmx8g' if maxn <= 100000000 else '-Xmx16g'
            p = subprocess.Popen([
                JAVA, maxmem, '-jar', PILON, '--fix', 'snps', '--genome', fn,
                '--bam', '.'.join(fn.split('.')[:-1]) + '.bam', '--output',
                fn + '_fix_snps'
            ],
                                 stdout=open('log', 'w'),
                                 stderr=subprocess.STDOUT)
            p.wait()
            if p.returncode != 0:
                shell.eprint('[' + PROGRAM + '] Error: pilon run error')
                shell.eprint(''.join(open('log', 'r').read()))
                sys.exit(1)
        # NOTE(review): these shell command lines embed genomefile
        # unquoted; a path with spaces or metacharacters will break them.
        os.system('cat pilon/sub/*_fix_snps.fasta > ' + genomefile +
                  '.fix_snps.fa')
        os.system('sed -i s/_pilon// ' + genomefile + '.fix_snps.fa')
        os.system('rm -rf pilon/sub/*')
        # From here on the corrected genome is the reference.
        genomefile = genomefile + '.fix_snps.fa'
        logging.info('pilon has done')
    # --- steps 3 and 4: index check and bwa aln --------------------------
    # check and do bwa aln
    logging.info('do bwa aln')
    logging.info('check fixed genome index')
    for suffix in ['.amb', '.ann', '.bwt', '.pac', '.sa']:
        if os.path.isfile(genomefile + suffix) == False:
            logging.info('fixed genome not found')
            p = subprocess.Popen([BWA, 'index', genomefile],
                                 stdout=open('log', 'w'),
                                 stderr=subprocess.STDOUT)
            p.wait()
            if p.returncode != 0:
                shell.eprint(
                    '[' + PROGRAM +
                    '] Error: bwa index for fixed snps genome run error')
                shell.eprint(''.join(open('log', 'r').read()))
                sys.exit(1)
            logging.info('fixed genome index done')
            break
    try:
        os.makedirs('aln/')
    except FileExistsError:
        pass
    except:
        shell.eprint('[' + PROGRAM + '] Error: make aln directory error')
        sys.exit(1)
    # Align only when reads were given and no BAM was supplied.
    if DFQ1 != '' and DBAM == '':
        DBAM = bwaaln('DNA', genomefile, DFQ1, DFQ2, PDFQ1, 'aln/', config)
    if RFQ1 != '' and RBAM == '':
        RBAM = bwaaln('RNA', genomefile, RFQ1, RFQ2, PRFQ1, 'aln/', config)
    logging.info('bwa aln has done')
    # --- step 5: best/unique selection, sort and index -------------------
    logging.info('do get best bam')
    if DBAM:
        bestuniqbam(DBAM, DNA=True, **config['bestuniqbam']['DNA'])
        # presumably bestuniqbam writes <basename>.best.bam into the
        # working directory -- verify against its implementation
        DBAM = bamsortindex('.'.join(os.path.basename(DBAM).split('.')[:-1]) +
                            '.best.bam')
        logging.info('The output file of DNA alignment is ' + DBAM)
    if RBAM:
        bestuniqbam(RBAM, RNA=True, **config['bestuniqbam']['RNA'])
        # NOTE(review): this tests only whether the 'SS' key is present,
        # not its value, so an explicit SS=False also takes this branch.
        if 'SS' in config['bestuniqbam']['RNA']:
            bamsortindex('.'.join(os.path.basename(RBAM).split('.')[:-1]) +
                         '.negative.bam')
            bamsortindex('.'.join(os.path.basename(RBAM).split('.')[:-1]) +
                         '.positive.bam')
            logging.info(
                'The output files of strand-specific RNA alignment are ' +
                '.'.join(os.path.basename(RBAM).split('.')[:-1]) +
                '.negative.bam and ' +
                '.'.join(os.path.basename(RBAM).split('.')[:-1]) +
                '.positive.bam')
        else:
            bamsortindex('.'.join(os.path.basename(RBAM).split('.')[:-1]) +
                         '.best.bam')
            logging.info('The output file of RNA alignment is ' +
                         '.'.join(os.path.basename(RBAM).split('.')[:-1]) +
                         '.best.bam')
    os.system('rm -f log')
    logging.info('All things have been done! Have a good day!')
    return 0
def _existing_abspath(value, label):
    '''
    Return the absolute form of ``value``, or report that ``label`` does not
    exist and terminate the program with exit status 1.
    '''
    if not os.path.exists(value):
        shell.eprint('[' + PROGRAM + '] Error: ' + label + ' does not exist')
        sys.exit(1)
    return os.path.abspath(value)


def get_config():
    '''
    Parse the command line, fill the module-level settings and return the
    run configuration.

    Side effects visible in this function:
      * assigns the module globals for the input files, tool paths, OUTDIR
        and FLAG (FLAG gains 2 when soapnuke is given and 4 when pilon is
        given);
      * creates OUTDIR if needed and symlinks the genome file into it;
      * changes the working directory to OUTDIR and configures ``logging``
        to write ``OUTDIR + PROGRAM + '.log'``.

    Returns:
        (config, genomefile): ``config`` is the dict built from the
        ``--config`` JSON file plus the ``bestuniqbam`` command-line
        overrides; ``genomefile`` is the basename of the genome file (valid
        relative to OUTDIR, which is the working directory on return).

    Exits with status 2 on a getopt/unsupported-option error and with
    status 1 on any validation error.

    NOTE: options are applied in command-line order, so a ``--config`` file
    that defines ``bestuniqbam`` replaces values set by options given
    before it.
    '''
    global DBAM, DFQ1, DFQ2, RBAM, RFQ1, RFQ2
    global PDFQ1, PDFQ2, PRFQ1, PRFQ2
    global BWA, SOAPNUKE, JAVA, PILON
    global FLAG, OUTDIR

    shortopts = 'ho:'
    longopts = [
        'help', 'outdir=', 'DNA-fq1=', 'DNA-fq2=', 'RNA-fq1=', 'RNA-fq2=',
        'DNA-bam=', 'RNA-bam=', 'bwa=', 'soapnuke=', 'java=', 'pilon=',
        'config=', 'DNA-mapQ=', 'RNA-mapQ=', 'rmdup=', 'uniq=', 'ss=',
        'DNA-I=', 'RNA-I='
    ]
    try:
        optlist, args = getopt.getopt(sys.argv[1:], shortopts, longopts)
    except getopt.GetoptError as e:
        shell.eprint('[' + PROGRAM + '] Error: ' + str(e))
        sys.exit(2)
    if not optlist and not args:
        print_help()
        sys.exit(0)

    config = {'bestuniqbam': {'DNA': {}, 'RNA': {}}}
    tobool = {'T': True, 'F': False}
    # option -> section of config['bestuniqbam'] receiving an integer mapQ
    mapq_section = {'--DNA-mapQ': 'DNA', '--RNA-mapQ': 'RNA'}
    # option -> key of config['bestuniqbam']['RNA'] receiving a T/F switch
    rna_switch = {'--rmdup': 'RmDup', '--uniq': 'Uniq', '--ss': 'SS'}
    # soapnuke filter parameters that the user must not supply in the config
    reserved_soapnuke = {'-C', '-D', '-o', '-1', '-2'}

    for opt, value in optlist:
        if opt in ('-h', '--help'):
            print_help()
            sys.exit(0)
        elif opt in ('-o', '--outdir'):
            OUTDIR = os.path.abspath(value) + '/'
        elif opt == '--DNA-fq1':
            DFQ1 = _existing_abspath(value, 'dna fq1 file')
        elif opt == '--DNA-fq2':
            DFQ2 = _existing_abspath(value, 'dna fq2 file')
        elif opt == '--RNA-fq1':
            RFQ1 = _existing_abspath(value, 'rna fq1 file')
        elif opt == '--RNA-fq2':
            RFQ2 = _existing_abspath(value, 'rna fq2 file')
        elif opt == '--DNA-bam':
            DBAM = _existing_abspath(value, 'dna bam file')
        elif opt == '--RNA-bam':
            RBAM = _existing_abspath(value, 'rna bam file')
        elif opt == '--bwa':
            BWA = _existing_abspath(value, 'bwa path')
        elif opt == '--soapnuke':
            SOAPNUKE = _existing_abspath(value, 'soapnuke path')
        elif opt == '--java':
            JAVA = _existing_abspath(value, 'java path')
        elif opt == '--pilon':
            PILON = _existing_abspath(value, 'pilon path')
        elif opt == '--config':
            with open(_existing_abspath(value, 'config file'), 'r') as f:
                config.update(json.load(f))
            # The file may have replaced 'bestuniqbam'; keep both sections
            # present so later --*-mapQ/--rmdup/--uniq/--ss options work.
            best = config.setdefault('bestuniqbam', {})
            best.setdefault('DNA', {})
            best.setdefault('RNA', {})
            # A config without a soapnuke filter section has nothing to
            # conflict with, so treat it as empty instead of raising KeyError.
            soapnuke_filter = config.get('soapnuke', {}).get('filter', {})
            if reserved_soapnuke & soapnuke_filter.keys():
                shell.eprint(
                    '[' + PROGRAM +
                    '] Error: soapnuke parameters -C -D -o -1 -2 do not needed to provided'
                )
                sys.exit(1)
        elif opt in mapq_section:
            try:
                config['bestuniqbam'][mapq_section[opt]]['mapQ'] = int(value)
            except ValueError:
                shell.eprint('[' + PROGRAM + '] Error: ' + opt +
                             ' should be integer')
                sys.exit(1)
        elif opt in rna_switch:
            try:
                config['bestuniqbam']['RNA'][rna_switch[opt]] = tobool[value]
            except KeyError:
                shell.eprint('[' + PROGRAM + '] Error: ' + opt +
                             ' should be T or F')
                sys.exit(1)
        else:
            # Reached by options that getopt accepts but that have no
            # handler here (currently --DNA-I / --RNA-I). An explicit exit
            # is used because ``assert`` is stripped under ``python -O``.
            shell.eprint('[' + PROGRAM + '] Error: unhandled option ' + opt)
            sys.exit(2)

    if BWA == '':
        shell.eprint('[' + PROGRAM + '] Error: bwa is necessary')
        sys.exit(1)
    if PILON != '' and JAVA == '':
        shell.eprint('[' + PROGRAM + '] Error: pilon process needs java')
        sys.exit(1)

    if not os.path.exists(OUTDIR):
        try:
            os.makedirs(OUTDIR)
        except FileExistsError:
            # Only possible if the directory appeared between the check and
            # the creation attempt.
            shell.eprint(
                '[' + PROGRAM +
                '] Warning: outdir have existed, may have some conflict')
        except OSError:
            shell.eprint('[' + PROGRAM +
                         '] Error: outdir could not be created, please check')
            sys.exit(1)

    # The genome file is the first positional argument; without it args[0]
    # would raise IndexError, so report the problem explicitly.
    if not args:
        shell.eprint('[' + PROGRAM + '] Error: genome file is necessary')
        sys.exit(1)
    genome_path = os.path.abspath(args[0])
    try:
        os.symlink(genome_path, OUTDIR + os.path.basename(genome_path))
    except FileExistsError:
        pass
    except OSError:
        shell.eprint('[' + PROGRAM +
                     '] Error: could not create soft link of genomefile')
        sys.exit(1)
    genomefile = os.path.basename(genome_path)

    # Exactly one of fastq / bam must be supplied for each of DNA and RNA.
    if (DFQ1 != '') == (DBAM != ''):
        shell.eprint(
            '[' + PROGRAM +
            '] Error: DNA fastq file or bamfile should be provided only one')
        sys.exit(1)
    if (RFQ1 != '') == (RBAM != ''):
        shell.eprint(
            '[' + PROGRAM +
            '] Error: RNA fastq file or bamfile should be provided only one')
        sys.exit(1)
    if (DBAM != '' and RBAM != '') and (SOAPNUKE != '' or PILON != ''):
        shell.eprint(
            '[' + PROGRAM +
            '] Error: there are some logical error. Offering bam file means there is no need to do soapnuke or pilon'
        )
        sys.exit(1)

    # NOTE(review): checkFqQuality presumably returns the phred offset of
    # the fastq file -- confirm against the shell module.
    if DFQ1 != '':
        PDFQ1 = shell.checkFqQuality(DFQ1)
    if RFQ1 != '':
        PRFQ1 = shell.checkFqQuality(RFQ1)

    if SOAPNUKE != '':
        FLAG += 2
    if PILON != '':
        FLAG += 4

    os.chdir(OUTDIR)
    logging.basicConfig(level=logging.INFO,
                        filename=OUTDIR + PROGRAM + '.log',
                        filemode='w',
                        format='%(asctime)s : %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')
    return config, genomefile
def _rna_strand(flag, template_length):
    '''
    Decide which strand-specific output an RNA read belongs to.

    Returns 'negative', 'positive', or None when the read matches none of
    the rules (such reads are dropped by the caller). The rules are checked
    in order and use the SAM flag bits 16 (read reverse), 32 (mate reverse),
    64 (first in pair) and 128 (second in pair).
    '''
    first, second = flag & 64, flag & 128
    reverse, mate_reverse = flag & 16, flag & 32
    if first and mate_reverse and template_length >= 0:
        return 'negative'
    if second and reverse and template_length <= 0:
        return 'negative'
    if first and reverse and template_length <= 0:
        return 'positive'
    if second and mate_reverse and template_length >= 0:
        return 'positive'
    if first and not reverse and not mate_reverse:
        return 'negative'
    if second and not reverse and not mate_reverse:
        return 'positive'
    if flag == 16:
        return 'positive'
    if flag == 0:
        return 'negative'
    return None


def bestuniqbam(bamfile,
                mapQ=20,
                DNA=None,
                RNA=None,
                SS=False,
                RmDup=True,
                Uniq=True,
                outdir='./'):
    '''
    Filter a bam/sam file down to its best, unique alignments.

    Parameters:
        bamfile: path of the input; must end in ``.bam`` or ``.sam``.
        mapQ:    reads with a mapping quality below this value are dropped.
        DNA/RNA: exactly one of the two must be given (not None).
        SS:      RNA only; split the output into ``*.negative.bam`` and
                 ``*.positive.bam``. Ignored (with a warning) for DNA.
        RmDup:   drop reads flagged as duplicates (flag bit 1024).
        Uniq:    RNA only; keep reads tagged XT=U, X0=1 and X1=0.
        outdir:  directory receiving the output; created when missing.

    Returns:
        ``(negative_path, positive_path)`` when strand-specific output is
        written, otherwise the path of the single ``*.best.bam`` file.

    Exits with status 1 on invalid arguments or an unreadable input file.

    ##eg: bestuniqbam(bamfile, mapQ=20, RNA=True, SS=True, RmDup=True, Uniq=True, outdir='./scanner')
    '''
    # check bamfile
    extension = bamfile.split('.')[-1]
    if extension == 'bam':
        file_type_tag = 'rb'
    elif extension == 'sam':
        file_type_tag = 'r'
    else:
        shell.eprint(
            '[' + PROGRAM +
            '] Error: the input bamfile/samfile should be *.bam or *.sam.')
        sys.exit(1)

    # parameter check: exactly one of DNA / RNA has to be supplied
    if (RNA is None) == (DNA is None):
        shell.eprint('[' + PROGRAM + '] Error: --DNA or --RNA is needed!')
        sys.exit(1)
    if DNA and SS:
        shell.eprint(
            '[' + PROGRAM +
            '] Warning: DNA do not have --ss, but it\'s ok to run this program.'
        )
        # DNA reads always go to the single *.best.bam output; without this
        # the strand-specific writers were opened and the DNA branch then
        # wrote to a writer that was never created.
        SS = False

    if outdir and not os.path.exists(outdir):
        try:
            os.makedirs(outdir)
        except OSError:
            shell.eprint('[' + PROGRAM + '] Error: could not create ' +
                         outdir)
            sys.exit(1)

    # core
    try:
        f = pysam.AlignmentFile(bamfile, file_type_tag)
    except Exception:
        shell.eprint('[' + PROGRAM + '] Error: check the bam file ' +
                     os.path.basename(bamfile) + ' please!')
        sys.exit(1)

    ## DNA and RNA's bamfile name should be different by yourself.
    stem = '.'.join(os.path.basename(bamfile).split('.')[:-1])
    # os.path.join keeps the result correct whether or not outdir ends with
    # a path separator.
    fn_negative = os.path.join(outdir, stem + '.negative.bam')
    fn_positive = os.path.join(outdir, stem + '.positive.bam')
    fn_best = os.path.join(outdir, stem + '.best.bam')

    writers = {}
    try:
        if SS:
            writers['negative'] = pysam.AlignmentFile(fn_negative, 'wb',
                                                      template=f)
            writers['positive'] = pysam.AlignmentFile(fn_positive, 'wb',
                                                      template=f)
        else:
            writers['best'] = pysam.AlignmentFile(fn_best, 'wb', template=f)

        for read in f.fetch(until_eof=True):
            if int(read.mapping_quality) < mapQ:
                continue
            flag = int(read.flag)
            if RmDup and flag & 1024:
                continue
            if RNA:
                if Uniq:
                    tags = read.get_tags()
                    if (('XT', 'U') not in tags or ('X0', 1) not in tags
                            or ('X1', 0) not in tags):
                        continue
                if flag & 256:
                    continue
                if flag in (67, 131, 115, 179):
                    continue
                if flag & 32 and flag & 16:
                    continue
                if SS:
                    strand = _rna_strand(flag, read.template_length)
                    if strand is not None:
                        writers[strand].write(read)
                else:
                    writers['best'].write(read)
            elif DNA:
                writers['best'].write(read)
    finally:
        # Close the outputs and the input even when filtering fails.
        for writer in writers.values():
            writer.close()
        f.close()

    if SS:
        return fn_negative, fn_positive
    return fn_best