def parse_scaf_length(line):
    """Pull the scaffold name and its length out of a ``##contig`` header line.

    Example of the expected input::

        ##contig=<ID=scaffold1_len454574_cov98,length=458273>

    Args:
        line: Header line containing ``ID=<name>,length=<number>>``.

    Returns:
        A single-entry dict mapping the text found between ``ID=`` and
        ``,length`` to the integer found between ``,length=`` and ``>``.
    """
    scaf_name = cmn.find_between(line, 'ID=', ',length')
    scaf_length = int(cmn.find_between(line, ',length=', '>'))
    return {scaf_name: scaf_length}
def check_difference(seq1, seq2):
    """Count the differences between two sequences.

    The two lengths are always printed first. Sequences of equal length are
    compared position by position; a position is ignored when either
    character is in the module-level ``gapChars``. Sequences of different
    length are written to ``tmpSeq1.fa`` / ``tmpSeq2.fa`` and compared with
    ``blastn``; the result is then derived from the ``Identities = x/y``
    field of the blast report, which is also saved to ``checkTmp<ID>.br``
    (``ID`` is a module-level name).

    Args:
        seq1: First sequence string.
        seq2: Second sequence string.

    Returns:
        The number of mismatching positions (equal lengths), or
        ``total - identical`` as reported by blastn (different lengths).
    """
    print(len(seq1), len(seq2))

    if len(seq1) == len(seq2):
        mismatches = 0
        for base1, base2 in zip(seq1, seq2):
            # Positions with a gap character on either side are not scored.
            if base1 in gapChars or base2 in gapChars:
                continue
            if base1 != base2:
                mismatches += 1
        return mismatches

    # Lengths differ: let blastn align the two sequences.
    cmn.write_file(seq1, 'tmpSeq1.fa')
    cmn.write_file(seq2, 'tmpSeq2.fa')
    report = cmn.cmd2info('blastn -query tmpSeq1.fa -subject tmpSeq2.fa')

    # The report carries a field such as: Identities = 656/656 (100%)
    identity_field = cmn.find_between(report, 'Identities = ', ' (')
    matched, aligned = [int(part) for part in identity_field.split('/')]

    # Keep the raw report so the comparison can be inspected later.
    cmn.write_file(report, 'checkTmp%s.br' % (ID))
    return aligned - matched
#different chars and not a gap seq.append('X') fasta = '>%s\n%s\n' % (Id, ''.join(seq)) refBaseDict[Id] = ''.join(seq) new.append(fasta) cmn.write_file(''.join(new), 'sum_barcodes.fa') #cmn.run('rm -r sampleRun_fake') #check denovo pipeline one fns = cmn.cmd2lines('ls sampleRun_*/denovo_barcode.fa') denovoDict = {} new = [] for fn in fns: Id = cmn.find_between(fn, 'sampleRun_', '/') lines = cmn.file2lines(fn) seq = ''.join(lines[1:]) if seq > 658: tmp = seq.replace('N', '') if len(tmp) == 658: seq = tmp denovoDict[Id] = seq fasta = '>%s\n%s\n' % (Id, seq) new.append(fasta) cmn.write_file(''.join(new), 'sum_denovo.fa') new = [] for Id in clean_lines:
#options=parse_options() try: wdir = sys.argv[1] except: print("Usage: *.py t100_highQ", file=sys.stderr) sys.exit() cmd = 'grep "Estimated Ln Prob of Data" %s/*/r*/*.log' % wdir lines = cmn.cmd2lines(cmd) print('\n'.join(lines)) rdict = {} countK = {} for line in lines: K = cmn.find_between(line, 'structureK', '/') K = int(K) lnL = float(line.strip().split()[-1]) try: rdict[K].append(lnL) except KeyError: rdict[K] = [lnL] try: countK[K] += 1 except: countK[K] = 1 keys = list(rdict.keys()) keys.sort() for K in keys:
if __name__ == '__main__':
    # Command-line entry point: group the input files by library label and
    # derive the per-library labels used together with the given assembly.
    #options=parse_options()
    try:
        odir, f_ass = sys.argv[1:3]
    except ValueError:
        # Unpacking fails only when fewer than two arguments were supplied.
        print("Usage: *.py filelist assembly_v0.fa", file=sys.stderr)
        print("you should index assembly_v0.fa first with -p assembly_v0", file=sys.stderr)
        print("using command /home2/wli/local/bwa-0.7.12/bwa index ", file=sys.stderr)
        sys.exit()

    #fns = cmn.cmd2lines('ls %s/*.fq' % odir)
    fns = cmn.getid(odir)
    group_dict = separate_by_label(fns)
    # Text between 'assembly_' and '.fa' in the assembly's base name.
    ass_label = cmn.find_between(cmn.lastName(f_ass), 'assembly_', '.fa')
    cmn.mkdir('job_files')
    cmn.mkdir('cmd_files')

    for plabel, each in group_dict.items():
        print('processing lib %s' % plabel)
        #also parse the files inside this function
        #return the file name after parsing
        paired, unpaired = separate_by_pair(plabel, each)
        if paired is None:
            # No paired result for this library: skip it.
            continue
        label = '%s_%s' % (plabel, ass_label)
        #index_label = cmn.lastName(f_ass).replace('.fa', '')
        index_label = f_ass.replace('.fa', '')
try: fn=sys.argv[1] except: print("Usage: *.py link.file", file=sys.stderr) sys.exit() dn = 'retrieved_barcodes.fa' dp = open(dn, 'w') for link in cmn.file2lines(fn): if link[0] == '#': continue print('processing ' + link) info = cmn.link2info(link) seq = cmn.find_between(info, "generateBarcode ('#barcodeImg_", "');").split('\'')[-1] takeName = False takeSp = False for line in info.split('\n'): if 'Sequence ID' in line: takeName = True continue if '<td>Species:</td>' in line: takeSp = True continue if takeName: takeName = False #<td style="width:160px;">ANICE505-10.COI-5P</td> name = cmn.find_between(line, '>', '<').split('.COI')[0] if takeSp:
def scaf2numb(scaf):
    """Return the scaffold number embedded in a scaffold name.

    The number is whatever ``cmn.find_between`` returns for the text between
    ``'scaffold'`` and ``'_cov'``, converted with ``int`` (presumably
    ``scaffold1_cov51`` -> ``1``; confirm against ``cmn.find_between``).

    Args:
        scaf: Scaffold name.

    Returns:
        The extracted number as an ``int``.
    """
    return int(cmn.find_between(scaf, 'scaffold', '_cov'))
# Make the project helper library importable before importing cmn.
if python_lib not in sys.path:
    sys.path.append(python_lib)
import cmn
import requests as rq

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Look up the species recorded for one voucher ID and print it.
    #options=parse_options()
    try:
        ID = sys.argv[1]
    except IndexError:
        # No voucher ID was given on the command line.
        print("Usage: *.py SRNPnumber", file=sys.stderr)
        sys.exit()

    url = 'http://janzen.sas.upenn.edu/Wadults/resultsexpressVOUCHB.lasso'
    form_data = {'submitButtonName': 'SUBMIT', 'voucher': ID}
    r = rq.post(url, data=form_data)

    # Use the decoded body (str): r.content is bytes under Python 3, which
    # cannot be searched with the str delimiters passed below.
    sp = cmn.find_between(r.text, 'species:<b>', '</b>').strip()

    # Nothing is printed when the extracted text still contains '<title>'.
    if '<title>' not in sp:
        print(sp)
print("Usage: *.py t100_highQ", file=sys.stderr) sys.exit() #cmd = 'grep "Estimated Ln Prob of Data" %s/*/r*/*.log' % wdir cmd = 'ls %s/*/r*/structure*f' % wdir fns = cmn.cmd2lines(cmd) print(fns) outdir = 'harvest_%s' % wdir cmn.mkdir(outdir) cmd_dict = {} for fn in fns: #cov_3/structureK10/r0/structure.output_f K = cmn.find_between(fn, 'structureK', '/') rep = fn.split('/')[-2] dn = '%s/out_K%s_%s_f' % (outdir, K, rep) cmd = 'cp %s %s' % (fn, dn) try: cmd_dict[K].append(cmd) except KeyError: cmd_dict[K] = [cmd] for K in cmd_dict: cmds = cmd_dict[K] if len(cmds) < 3: print('insufficent replicates for K=%s, skip' % K) continue for cmd in cmds: cmn.run(cmd)
except: print("Usage: *.py vcf", file=sys.stderr) sys.exit() length_dict = {} current_scaf = '' seqDict = {} order_list = [] with open(fn) as fp: for line in fp: line = line.strip() # ##contig=<ID=scaffold1_cov51,length=30279> if line.startswith('##contig='): scaf = cmn.find_between(line, '<ID=', ',') length = int(cmn.find_between(line, ',length=', '>')) length_dict[scaf] = length if line[0] != '#': items = line.strip().split() scaf = items[0] if scaf != current_scaf: order_list.append(scaf) #start a new scaffold expect_index = 1 current_scaf = scaf else: expect_index += 1 index = int(items[1])