def make_ad_dict(fn):
    """Map every sample listed in *fn* to its adaptor sequence.

    The lookup table is read from a hard-coded file whose lines hold three
    whitespace-separated fields: index, barcode and adaptor sequence.

    Each line of *fn* is expected to hold two whitespace-separated fields:
    the sample name and either an index or a barcode.  Lines that do not
    split into exactly two fields are skipped.  A second field made only of
    digits is looked up by index, anything else by barcode.

    Returns:
        dict mapping sample name -> adaptor sequence.

    Raises:
        KeyError: if an all-digit index is not in the lookup table.
        SystemExit: if a barcode is not in the lookup table (an error
            message is printed first).
    """
    fi = '/work/biophysics/mtang/SNP_calling/scripts/data/adaptor/index_and_adaptor'
    # adaptor keyed by the index
    adict = {}
    # adaptor keyed by the barcode
    bdict = {}
    for line in cmn.file2lines(fi):
        index, barcode, seq = line.strip().split()
        adict[index] = seq
        bdict[barcode] = seq

    rdict = {}
    for line in cmn.file2lines(fn):
        try:
            sp, index = line.strip().split()
        except ValueError:
            # Not exactly two fields: ignore the line.
            continue
        if index.isdigit():
            ad1 = adict[index]
        else:
            try:
                ad1 = bdict[index]
            except KeyError:
                print('Error! can not find index info for %s' % line)
                sys.exit()
        rdict[sp] = ad1
    return rdict
def parse_popDef(fpopdef):
    """Load the population definitions listed in *fpopdef*.

    Every line of *fpopdef* is treated as the path of an ID file.  The
    population name is ``cmn.lastName`` of that path with each 'IDs'
    substring removed; it maps to the lines of the ID file as returned by
    ``cmn.file2lines``.  Each path is printed as it is processed.
    """
    populations = {}
    for id_file in cmn.file2lines(fpopdef):
        print(id_file)
        label = cmn.lastName(id_file).replace('IDs', '')
        populations[label] = cmn.file2lines(id_file)
    return populations
def parse_popDef(fpopdef, freceipt, inclusion):
    """Load population definitions, restricted to the IDs in *inclusion*.

    Every line of *fpopdef* is treated as the path of an ID file and is
    printed as it is processed.  A file whose ``cmn.lastName`` equals that
    of *freceipt* is skipped.  For the others, the population name is the
    last-name with each 'IDs' substring removed and trailing underscores
    stripped; it maps to the set of first whitespace-separated fields of
    the file's lines, intersected with *inclusion*.
    """
    populations = {}
    receipt_name = cmn.lastName(freceipt)
    for id_file in cmn.file2lines(fpopdef):
        print(id_file)
        base = cmn.lastName(id_file)
        if base == receipt_name:
            continue
        label = base.replace('IDs','').rstrip('_')
        first_fields = set()
        for row in cmn.file2lines(id_file):
            first_fields.add(row.split()[0])
        populations[label] = first_fields & inclusion
    return populations
def read_indel_info(fn):
    """Read a four-column, tab-separated indel table from *fn*.

    The first column is a key whose first and last characters are dropped
    and whose remainder is split on ', ' into two integers; the fourth
    column is the stored value.  The two middle columns are ignored.

    Returns:
        dict mapping (i, j) integer tuples -> fourth-column string.
    """
    indels = {}
    for row in cmn.file2lines(fn):
        key, _second, _third, char = row.strip().split('\t')
        start, end = [int(part) for part in key[1:-1].split(', ')]
        indels[(start, end)] = char
    return indels
def read_baits():
    """Load every file matching ``baits/bait*.fa`` into a dictionary.

    Each file must yield exactly two lines from ``cmn.file2lines``: a name
    line and a sequence line.  The name line's first character is dropped
    to form the key; the value is the sequence as a list of characters.
    """
    baits = {}
    for path in cmn.cmd2lines('ls baits/bait*.fa'):
        header, sequence = cmn.file2lines(path)
        baits[header[1:]] = list(sequence)
    return baits
def read_baits(fn):
    """Read a bait table, adding primers to 658-long sequences.

    Every non-blank line of *fn* must hold three whitespace-separated
    fields: sp, name and sequence.  The length of each sequence is printed.
    Sequences of length 698 are kept as they are, sequences of length 658
    are passed through ``add_primer``, and any other length prints an
    error and exits.  If at least one sequence was not 698 long, *fn* is
    overwritten with the tab-separated, revised table.

    Returns:
        (by_key, by_name): sequences keyed by 'sp_name' and by name.
    """
    by_key = {}
    by_name = {}
    rewritten = []
    needs_rewrite = False
    for raw in cmn.file2lines(fn):
        if not raw.strip():
            continue
        sp, name, seq = raw.split()
        seq_len = len(seq)
        print(seq_len)
        if seq_len != 698:
            needs_rewrite = True
            if seq_len != 658:
                print('Error! didn\'t recognize the length of the bait %s %s' % (sp, name))
                sys.exit()
            # 658-long baits are fixable
            seq = add_primer(seq)
        rewritten.append('%s\t%s\t%s\n' % (sp, name, seq))
        by_key['%s_%s' % (sp, name)] = seq
        by_name[name] = seq

    if needs_rewrite:
        print('revise the input baits to add primer...')
        cmn.write_file(''.join(rewritten), fn)
    return by_key, by_name
def read_baits(fn):
    """Read a three-column bait table from *fn*.

    Each line must hold exactly three whitespace-separated fields: sp, name
    and sequence.

    Returns:
        dict mapping 'sp_name' -> sequence.
    """
    rows = (raw.split() for raw in cmn.file2lines(fn))
    return {'%s_%s' % (sp, name): seq for sp, name, seq in rows}
def isSameGenus(fn):
    """Classify the report in *fn* as ',takenD', ',diffGenus' or ',unknown'.

    The first line of *fn* is skipped.  For every remaining line, column 0
    is a name, column 2 an integer, and columns 3 and 4 are passed to
    ``guessGenus`` to obtain the found and the reference genus.

    Returns:
        ',takenD'    as soon as a row whose name contains '_denovo' has
                     matching found/reference genera, or when every row has
                     an integer column <= 20 and all found genera agree;
        ',diffGenus' when every row has an integer column <= 20 but the
                     found genera differ;
        ',unknown'   otherwise.
    """
    lines = cmn.file2lines(fn)[1:]
    genus_list = []
    #There are two checks:
    # 1. if the denovo barcode has a same genus with ref, take it
    # 2. otherwise, check if denovo barcode has a same one as refbase
    # 3. if both not satisfied, then unknown
    for line in lines:
        items = line.strip().split()
        found_genus = guessGenus(items[3])
        ref_genus = guessGenus(items[4])
        N = int(items[2])
        if '_denovo' in items[0]:
            if found_genus == ref_genus:
                return ',takenD'
        # NOTE(review): the nesting here was reconstructed from a
        # whitespace-collapsed source; confirm that this check is meant to
        # apply to every row and not only to '_denovo' rows.
        if N <= 20:
            genus_list.append(found_genus)
    # Only decide by genus agreement when every row passed the N <= 20 check.
    if len(genus_list) == len(lines):
        checkSet = set(genus_list)
        if len(checkSet) == 1:
            return ',takenD'
        elif len(checkSet) > 1:
            return ',diffGenus'
    return ',unknown'
def read_aln(fn):
    """Read a two-column alignment file.

    Each line of *fn* must hold exactly two whitespace-separated fields:
    a name and its sequence.

    Returns:
        dict mapping name -> sequence.
    """
    pairs = (raw.strip().split() for raw in cmn.file2lines(fn))
    return {label: sequence for label, sequence in pairs}
def read_baitInfo(fadd):
    """Read a three-column bait file keyed by the parsed definition line.

    Each line of *fadd* must hold exactly three whitespace-separated
    fields.  The first is ignored, the second is passed through
    ``parse_name`` to form the key, and the third is the stored sequence.
    """
    baits = {}
    for raw in cmn.file2lines(fadd):
        _sp, header, sequence = raw.split()
        baits[parse_name(header)] = sequence
    return baits
def parse_blast_output(fn, rset):
    """Pick the first acceptable blast hit for every query in *fn*.

    Expected columns:
        qseqid sseqid pident evalue qlen qstart qend slen sstart send

    Blank lines are skipped.  A hit is accepted when its e-value is at most
    0.001 and its identity is at least 50; only the first accepted hit per
    query is kept.  A hit is forward when sstart <= send, and that
    orientation is inverted for queries contained in *rset*.

    Returns:
        dict mapping qseqid -> (sseqid, isForward).
    """
    hits = {}
    for raw in cmn.file2lines(fn):
        fields = raw.strip().split()
        if not fields:
            continue
        qseqid, sseqid = fields[:2]
        if qseqid in hits:
            continue
        if float(fields[3]) > 0.001:
            continue
        if float(fields[2]) < 50:
            continue
        sstart, send = [int(value) for value in fields[8:10]]
        forward = not (sstart > send)
        if qseqid in rset:
            forward = not forward
        hits[qseqid] = (sseqid, forward)
    return hits
def nonGap_char(fn):
    """Count the characters in *fn* that are neither '-' nor 'N'.

    Blank lines and lines starting with '>' are ignored; the remaining
    lines are stripped and concatenated before counting.
    """
    pieces = []
    for raw in cmn.file2lines(fn):
        text = raw.strip()
        if text == '' or raw[0] == '>':
            continue
        pieces.append(text)
    # Treat 'N' as a gap so a single count covers both.
    residues = ''.join(pieces).replace('N', '-')
    return len(residues) - residues.count('-')
def read_genus_info(fn):
    """Map each sample in *fn* to the second '_'-separated token of its name.

    Each line must hold exactly two whitespace-separated fields: the sample
    and a definition line.  The stored value is element 1 of the definition
    line split on '_'.
    """
    genera = {}
    for raw in cmn.file2lines(fn):
        sample, header = raw.split()
        genera[sample] = header.split('_')[1]
    return genera
def parse_receiptInfo(fn, alist):
    """Return the lines of *fn* for the samples in *alist*, in that order.

    Lines are indexed by their first whitespace-separated field; when a
    sample occurs on several lines the last one wins.

    Raises:
        KeyError: if a sample of *alist* has no line in *fn*.
    """
    line_of = {raw.strip().split()[0]: raw for raw in cmn.file2lines(fn)}
    selected = []
    for wanted in alist:
        selected.append(line_of[wanted])
    return selected
def parse_fastqlist(fn):
    """Group the lines of *fn* by sample.

    The sample of a line is the part of its ``cmn.lastName`` before the
    first '_'.

    Returns:
        dict mapping sample -> list of lines, in file order.
    """
    grouped = {}
    for entry in cmn.file2lines(fn):
        key = cmn.lastName(entry).split('_')[0]
        grouped.setdefault(key, []).append(entry)
    return grouped
def read_correct_info(fn):
    """Read per-position corrections from a tab-separated file.

    The last three tab-separated fields of each line are taken as index,
    character and a third value that is ignored.  The index is converted
    to an integer and shifted down by one to form the key.

    Returns:
        dict mapping (index - 1) -> character.
    """
    # TODO: assuming only one scaffold (mito) is present; the scaffold
    # name is not part of the key.
    corrections = {}
    for raw in cmn.file2lines(fn):
        position, char, _info = raw.strip().split('\t')[-3:]
        corrections[int(position) - 1] = char
    return corrections
def check_NA(label):
    """Tell whether the stat report for *label* deserves a warning.

    The report is read from ``<label>_stat.report``.  Only its last line is
    inspected: a warning is raised when that line does not hold exactly ten
    whitespace-separated fields or when one of them is 'NA'.  A report
    without any line is also treated as a warning instead of failing with
    an IndexError.

    Returns:
        True if the report should be flagged, False otherwise.
    """
    lines = cmn.file2lines(label + '_stat.report')
    if not lines:
        # Nothing to inspect: an empty report cannot be trusted.
        return True
    items = lines[-1].strip().split()
    return len(items) != 10 or 'NA' in items
def read_aln(fn):
    """Read a two-column alignment file and report the sequence length.

    Blank lines are skipped; every other line must hold exactly two
    whitespace-separated fields: a name and its sequence.

    Returns:
        (seqs, length): dict mapping name -> sequence, and the length of
        the last sequence read (0 when no line was read).
    """
    alignment = {}
    last_seq = ''
    for raw in cmn.file2lines(fn):
        fields = raw.strip().split()
        if not fields:
            continue
        label, last_seq = fields
        alignment[label] = last_seq
    return alignment, len(last_seq)
def read_length_info(fn):
    """Read scaffold lengths and identify a trailing 'mito' scaffold.

    Each line of *fn* must hold exactly two whitespace-separated fields:
    the scaffold name and its integer length.

    Returns:
        (lengths, mito): dict mapping scaffold -> length, and the name of
        the last scaffold when it contains 'mito', otherwise ''.  An input
        without any line yields ({}, '').
    """
    adict = {}
    # Pre-set so that an empty input does not leave the name unbound.
    scaf = ''
    for line in cmn.file2lines(fn):
        scaf, length = line.strip().split()
        adict[scaf] = int(length)

    # assume the last scaffold is mito
    if 'mito' not in scaf:
        scaf = ''
    return adict, scaf
def old_get_goodIDs():
    """Collect the IDs flagged 'takenD' in ``compare.check``.

    The ID is the first whitespace-separated field of every line that
    contains the substring 'takenD'.

    Returns:
        set of IDs.
    """
    return {
        row.strip().split()[0]
        for row in cmn.file2lines('compare.check')
        if 'takenD' in row
    }
def get_names():
    """Build a sample -> normalized name table from the *.sampleData files.

    Files are listed with ``ls -tr`` and read in that order, so entries
    from files listed later overwrite earlier ones.  For each line the
    sample is the part of the first field after its last '-'.  The name is
    the stripped line with the first field replaced by the sample, '-'
    turned into '_', parentheses removed and whitespace runs joined by '_'.
    """
    names = {}
    listing = cmn.cmd2lines('ls -tr /project/biophysics/Nick_lab/wli/sequencing/scripts/data/*.sampleData')
    for path in listing:
        for raw in cmn.file2lines(path):
            text = raw.strip()
            first = text.split()[0]
            sp = first.split('-')[-1]
            cleaned = text.replace(first, sp)
            for old, new in (('-', '_'), ('(', ''), (')', '')):
                cleaned = cleaned.replace(old, new)
            names[sp] = '_'.join(cleaned.split())
    return names
def parse_protein_query(fn):
    """List the sequence IDs found on the '>' lines of *fn*.

    The ID is the first whitespace-separated token after the leading '>'.
    Blank lines and lines not starting with '>' are ignored.

    Returns:
        list of IDs in file order.
    """
    return [
        row[1:].split()[0]
        for row in cmn.file2lines(fn)
        if row.startswith('>')
    ]
def parse_blastDict(fn):
    """Turn blast hits into per-gene results of ``find_start_and_end``.

    Expected columns:
        qseqid sseqid evalue pident qlen qstart qend slen sstart send

    Columns 4 to 9 are converted to integers and handed to
    ``find_start_and_end``.  When a gene occurs on several lines the last
    one wins.

    Returns:
        dict mapping gene -> (i, j, isReverse).
    """
    placements = {}
    for raw in cmn.file2lines(fn):
        fields = raw.strip().split()
        qlen, qstart, qend, slen, sstart, send = [int(value) for value in fields[4:10]]
        begin, end, reverse = find_start_and_end(qlen, qstart, qend, slen, sstart, send)
        placements[fields[0]] = (begin, end, reverse)
    return placements
def parse_bait_file(fbait):
    """Read a three-column bait file (ID, sp, sequence) into two dicts.

    Every line must hold exactly three whitespace-separated fields.  Entries
    are keyed by the second field only.

    Returns:
        (new, target): two dicts mapping sp -> sequence.
    """
    new = {}
    target = {}
    for line in cmn.file2lines(fbait):
        ID, sp, seq = line.strip().split()
        #name = '%s_%s' % (ID, sp)
        name = sp
        # NOTE(review): the nesting was reconstructed from a
        # whitespace-collapsed source.  As laid out here only the first
        # line of the file is stored, in both dicts; confirm which of the
        # two assignments really belongs under this condition.
        if len(target) == 0:
            new[name] = seq
            target[name] = seq
    return new, target
def parse_indel_file(fn):
    """Sum the lengths of the last field of every line in *fn*.

    The last whitespace-separated field of each line is measured and the
    lengths are added up.

    Returns:
        int, 0 when *fn* yields no line.
    """
    return sum(
        len(row.strip().split()[-1])
        for row in cmn.file2lines(fn)
    )
def read_letter_info(fn):
    """Read per-position letter pairs and their phase label.

    Example line: ``3614_mito 107 T C 63 3614_mito_1``

    Blank lines are skipped.  The first four whitespace-separated fields
    are scaffold, index and two characters; the last field is the phase.

    Returns:
        dict mapping (index - 1) -> (char1, char2, phase).
    """
    # TODO: currently assumes only mito is present; the scaffold name is
    # not part of the key.
    letters = {}
    for raw in cmn.file2lines(fn):
        fields = raw.strip().split()
        if not fields:
            continue
        _scaf, position, first, second = fields[:4]
        letters[int(position) - 1] = (first, second, fields[-1])
    return letters
def parse_refTable(fn):
    """Map every sample to the largest size among its reference files.

    Each line of *fn* holds a sample followed by any number of reference
    paths.  Each path is measured on its own with
    ``compute_fileSize([path])``; the stored value is the largest result,
    or 0 when the sample has no reference.
    """
    largest = {}
    for raw in cmn.file2lines(fn):
        fields = raw.strip().split()
        biggest = 0
        for ref_path in fields[1:]:
            biggest = max(compute_fileSize([ref_path]), biggest)
        largest[fields[0]] = biggest
    return largest
def backup_vcf_coverage(wdir):
    """Copy the ``*_vcf.cov`` files of *wdir* into the archive directory.

    Only files whose last line has six whitespace-separated fields are
    backed up; others are reported as old format and skipped.  When the
    archive already holds a file of the same name, it is overwritten only
    if the new file's second-to-last field on the last line is greater
    than the archived one.  Progress is printed along the way.
    """
    archive = '/project/biophysics/Nick_lab/mtang/archive/step4_postprocessing/check_vcf_coverage'
    for cov_file in cmn.cmd2lines('ls %s/*_vcf.cov' % wdir):
        print('processing %s...' % cov_file)
        fields = cmn.file2lines(cov_file)[-1].strip().split()
        # 1. only back up the new version of the cov file
        if len(fields) != 6:
            print('skip old format file %s' % cov_file)
            continue

        base = cmn.lastName(cov_file)
        dest = '%s/%s' % (archive, base)
        if os.path.exists(dest):
            print('merging new and old data for %s' % base)
            archived_cov = float(cmn.file2lines(dest)[-1].split()[-2])
            fresh_cov = float(fields[-2])
            if not fresh_cov > archived_cov:
                # The archived copy is at least as good; keep it.
                continue
        cmn.run('cp %s %s' % (cov_file, dest))
def read_nickMade_barcodes(fn):
    """Read a fasta-like barcode file into a name -> line dictionary.

    Blank lines and lines starting with '#' are skipped.  A line starting
    with '>' sets the current name to the part of the header (after '>')
    before the first '_'.  Every kept line, the header included, is stored
    stripped under the current name, so the last line seen for a name is
    the one that remains.
    """
    barcodes = {}
    for raw in cmn.file2lines(fn):
        text = raw.strip()
        if text == '' or raw[0] == '#':
            continue
        if raw[0] == '>':
            name = raw[1:].strip().split('_')[0]
        barcodes[name] = text
    return barcodes
def parse_inserted_gap(ID, seq, label): fn = 'sampleRun_%s/bait_insertion' % ID #if cmn.filexist(fn) or ('N' in seq.replace('-', 'N').strip('N')): if cmn.filexist(fn): #lines = cmn.file2lines(fn) #lines = sorted(lines, key=lambda x: int(x.split(',')[0][1:])) #Ngap = 0 #for line in lines: # items = line.strip().split() # Ngap += len(items[-1]) #check what is the right range of sequence print('runing blast to fix %s' % ID) checkSeq = seq.replace('-', 'N').strip('N') fquery = 'tmpInput.fa' fasta = '>input\n%s\n' % checkSeq cmn.write_file(fasta, fquery) dn = 'tmpBr_%s.txt' % label cmd = 'blastn -query %s -db /project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_4verify.fa ' % fquery cmd += '-task blastn-short -dust no -outfmt \'6 qseqid sseqid qstart qend sstart send evalue pident qseq sseq\' -out %s' % dn cmn.run(cmd) isFixed = False for line in cmn.file2lines(dn): items = line.strip().split() #print items qstart, qend, sstart, send = list(map(int, items[2:6])) if sstart == 1 and send == 658 and qstart == 21: qseq, sseq = items[-2:] new = [ char1 for char1, char2 in zip(qseq, sseq) if char2 != '-' ] if len(new) == 658: seq = seq[:qstart - 1] + ''.join(new) + seq[qend:] print('solution found for %s' % ID) isFixed = True break if sstart == 2 and send == 655 and qstart == 22: qseq, sseq = items[-2:] new = [ char1 for char1, char2 in zip(qseq, sseq) if char2 != '-' ] if len(new) == 654: seq = seq[:qstart - 1] + ''.join(new) + seq[qend:] print('solution found for %s' % ID) isFixed = True break if not isFixed: cmn.append_file('%s\t%s\n' % (ID, label), 'cannot_fixed_indel.txt') return seq