bfa = '>%s\n%s\n' % (subjct, seq) difference = compare_itself(fasta, bfa) paired_fa.append(bfa) keys = list(difference.keys()) keys.sort() for pos in keys: info.append('%s\t%s\t%s\t%s\n' % (name, subjct, pos, difference[pos])) if isInBlast: #itself is found by blast pass else: #this is very special #the barcode is not in the blast result difference, subjct = compare_top_hit(br_result) bfa = '>%s(addBack_closest_barcode)\n%s\n' % ( subjct, barCodeDict[subjct]) paired_fa.append(bfa) keys = list(difference.keys()) keys.sort() for pos in keys: info.append('%s\t%s\t%s\t%s\n' % (name, subjct, pos, difference[pos])) info.append('#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#\n') dn = cmn.lastName(fn) + '.report' cmn.write_file(''.join(info), dn) dn = cmn.lastName(fn) + '_paired.fa' cmn.write_file(''.join(paired_fa), dn)
if __name__ == '__main__':
    # Driver: for every "sample genus species" line of the input list, fetch
    # the query barcode, BLAST it, pick baits from the hits and collect the
    # formatted bait records into "<lastName(fn)>.baits".
    # The helpers called here (read_fa, get_query_sequence, do_barcode_blast,
    # pick_barcode_baits, format_baits) are defined elsewhere in this file.
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # the only failure possible here is a missing argument
        print("Usage: *.py", file=sys.stderr)
        sys.exit()

    fall = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/species_barcodes_4mapping.fa'
    #fall = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes.fasta'
    seqDict = read_fa(fall)

    info = []
    for line in cmn.file2lines(fn):
        #5077 Autochton zarex
        items = line.split()
        if not items:
            # a blank line would otherwise break the three-way unpack below
            continue
        sample, genus, sp = items[:3]
        query_sequence, qlen = get_query_sequence(seqDict, genus, sp)
        br_result = do_barcode_blast(query_sequence)
        print(br_result)
        #print '\n'.join(br_result)
        baits = pick_barcode_baits(br_result, qlen, seqDict)
        info += format_baits(sample, baits)

    dn = cmn.lastName(fn) + '.baits'
    cmn.write_file(''.join(info), dn)
fn, fadd = sys.argv[1:3] except: print("Usage: *.py aln repID.file", file=sys.stderr) sys.exit() #fID = '/work/biophysics/wli/introgression2/4_filterIntro/rep_sps' #fID = '/project/biophysics/Nick_lab/wli/sequencing/myAnalysis/clean_ref_bias/4_build_tree/pureIDs' #goodIDs = set([i.split()[0] for i in cmn.file2lines(fID) # if i.strip() != '']) #fadd = 'added_sps' #if cmn.filexist(fadd): # print 'found local list, add them in' goodIDs = set(cmn.getid(fadd)) dn = cmn.lastName(fn).replace('.fasta', '').replace('.fa', '') + '_taken.fa' dp = open(dn, 'w') new = [] leftIDs = set(goodIDs) with open(fn) as fp: for line in fp: if line[0] == '>': name = line[1:].strip().split('_')[0].split('-')[0] if name in goodIDs: isGood = True if name in leftIDs: leftIDs.remove(name) else: isGood = False
for each in contig: scaf, position1, char1, char2, phase = each phase += '[swap]' newlist.append((scaf, position1, char2, char1, phase)) return newlist if __name__ == '__main__': #options=parse_options() try: fsam, fletter = sys.argv[1:] except: print("Usage: *.py *.sam *.letters", file=sys.stderr) sys.exit() outlabel = cmn.lastName(fletter)[:-8] print(outlabel) #{read_query_name: [record1, record2]} paired_samDict = read_samfile(fsam) #covDict[scaf][index][char] covDict = compute_coverage_from_sam(paired_samDict) cons_seq = make_cons_from_covDict(covDict) #adict = {'scaf': position1: [A, T, phase]} letter_dict, inconsistent_positions = read_letter_file(fletter) #still save inconsistent letter in letter_dict because we need to break contigs by them letter_dict, corrected_dict = correct_false_snp_call( letter_dict, covDict, inconsistent_positions) new = [] for scaf in corrected_dict:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__=='__main__': #options=parse_options() try: fR1, fR2 = sys.argv[1:3] except: print("Usage: *.py R1 R2", file=sys.stderr) sys.exit() sample = cmn.lastName(fR1).split('_')[0] spacing_list = [2, 3, 5, 10] count = 0 for spacing in spacing_list: fpR1 = open(fR1) fpR2 = open(fR2) dnlabel = '%s.spacing%s' % (sample, spacing) print('making %s' % dnlabel) dnR1 = open('%s_R1.fastq' % dnlabel, 'w') dnR2 = open('%s_R2.fastq' % dnlabel, 'w') for i, line1 in enumerate(fpR1): line2 = fpR2.readline()
adict[sp] = ref rset.add(ref) if isbad: sys.exit() return rset, adict #1. read in data fns = cmn.getid(sys.argv[1]) bwa_dirs = [line.strip().rstrip('/') for line in cmn.getid(sys.argv[2])] #2. check which reference they used #sp is unique vcf_dict = {cmn.lastName(fn).replace('_snp_step2.vcf', ''): fn for fn in fns} #6188_3842_assembly_v2_snp_step2.vcf #vcf_dict = {cmn.lastName(fn).replace('_snp_step2.vcf', ''): fn for fn in fns} sps = list(vcf_dict.keys()) #ref_genomes, refmapping = detect_ref_genomes(sps, bwa_dirs) ref_genomes, refmapping = set([]), {} for fn in fns: #../../step3_gatk/5729_3614_assembly_v1/5729_3614_assembly_v1_snp_step2.vcf fnlabel = cmn.lastName(fn).replace('_snp_step2.vcf', '') items = fnlabel.split('_') sp = items[0] ref = '_'.join(items[1:]) ref_genomes.add(ref) refmapping[fnlabel] = ref
import sys

python_lib = '/work/00412/mtang/sequencing/scripts'
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == '__main__':
    # Convert a whitespace-separated table of "exon species sequence" rows
    # into FASTA records named ">species_exon"; the species field is cut at
    # its first '.'. The result is written via cmn.write_file.
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        print("Usage: *.py *.clw", file=sys.stderr)
        sys.exit()

    new = []
    with open(fn) as fp:
        for line in fp:
            fields = line.split()
            if not fields:
                # blank lines carry no record and would break the unpack
                continue
            exon, sp, seq = fields
            sp = sp.split('.')[0]
            new.append('>%s_%s\n%s\n' % (sp, exon, seq))

    # NOTE(review): the usage text mentions *.clw but the suffix removed
    # here is '.sum' -- confirm which input naming is intended.
    dn = cmn.lastName(fn).replace('.sum', '') + '.fa'
    cmn.write_file(''.join(new), dn)
adict = {} fastas = cmn.txt_read(fa).split('>')[1:] print(fastas) for each in fastas: lines = each.strip().split('\n') defline = lines[0] seq = ''.join([line.strip() for line in lines[1:]]) #seq = seq.replace('N', '-') adict[defline] = seq return adict if __name__ == '__main__': #options=parse_options() try: fn, Range = sys.argv[1:3] i, j = list(map(int, Range.split('-'))) except: print("Usage: *.py aln 0-10000", file=sys.stderr) sys.exit() new = [] seqDict = read_fa(fn) for name in seqDict: seq = seqDict[name] fasta = '>%s\n%s\n' % (name, seq[i:j]) new.append(fasta) dn = '%s_%s.fa' % (cmn.lastName(fn).replace('.fa', ''), Range) cmn.write_file(''.join(new), dn)
import cmn

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == '__main__':
    # Keep only the lines of a SAM file that belong to one scaffold:
    #   * header lines (starting with '@') that mention ref_scaf anywhere,
    #   * other lines whose third whitespace-separated field equals ref_scaf.
    # Output goes to 'filtered' + cmn.lastName(fn).
    #options=parse_options()
    try:
        fn = sys.argv[1]
        ref_scaf = sys.argv[2]
    except IndexError:
        print("Usage: *.py samfile scaf", file=sys.stderr)
        sys.exit()

    dn = 'filtered' + cmn.lastName(fn)
    # both handles are context-managed so the output is flushed and closed
    # even if reading fails part-way through
    with open(fn) as fp, open(dn, 'w') as dp:
        for line in fp:
            if line[0] == '@':
                # NOTE(review): substring match -- 'scaf1' also keeps the
                # header line of 'scaf10'; confirm this is acceptable.
                if ref_scaf in line:
                    dp.write(line)
            else:
                fields = line.split()
                # blank or short lines have no third field; skip them
                # instead of failing with IndexError
                if len(fields) > 2 and fields[2] == ref_scaf:
                    dp.write(line)
if __name__ == '__main__': #options=parse_options() try: fqlist, fmitolist = sys.argv[1:] except: print("Usage: *.py fqlist refMitoList", file=sys.stderr) sys.exit() #ftemplate = '/work/biophysics/wli/Eudamine/wholeMito_run2/mito_denovo.template' ftemplate = '/project/biophysics/Nick_lab/wli/sequencing/scripts/mito_scripts/mito_refDenovo.template' template = cmn.txt_read(ftemplate) fqlist = cmn.file2lines(fqlist) groupDict = {} for fq in fqlist: fq = os.path.abspath(fq) ID = cmn.lastName(fq).split('_')[0] try: groupDict[ID].append(fq) except KeyError: groupDict[ID] = [fq] fmitolist = os.path.abspath(fmitolist) for sample in groupDict: fqlist = groupDict[sample] wdir = 'mitoRef_%s' % sample cmn.mkdir(wdir) os.chdir(wdir) cwd = os.getcwd() info = template.replace('[cwd]', cwd) cmn.write_lines(fqlist, 'fqlist') cmd = 'cat %s|xargs cat > ref_mito.fa; module add bwa; bwa index ref_mito.fa' % fmitolist
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == '__main__': #options=parse_options() try: fn = sys.argv[1] except: print("Usage: *.py *.fq", file=sys.stderr) sys.exit() taken = set([]) dn = cmn.lastName(fn).split('.')[0] + '_unified.fastq' isGood = True #new = [] dp = open(dn, 'w') with open(fn) as fp: for i, line in enumerate(fp): if i % 4 == 0: ID = line.strip() if ID not in taken: isGood = True taken.add(ID) else: isGood = False print('duplcated ID: %s' % ID) if isGood:
import sys

python_lib = '/home2/wli/my_programs/python_lib'
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == '__main__':
    # Convert a two-column "name sequence" table into FASTA records and
    # write them to "<lastName(fn) without '.clw'>.fa" via cmn.write_file.
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        print("Usage: *.py *.clw", file=sys.stderr)
        sys.exit()

    new = []
    with open(fn) as fp:
        for line in fp:
            fields = line.split()
            if not fields:
                # blank lines carry no record and would break the unpack
                continue
            name, seq = fields
            new.append('>%s\n%s\n' % (name, seq))

    dn = cmn.lastName(fn).replace('.clw', '') + '.fa'
    cmn.write_file(''.join(new), dn)
] fmis = cmn.cmd2lines('ls rescued_read_assembled_mis1*.txt')[0] fns.append(fmis) for fn in fns: cmd = 'chmod a+w %s' % fn cmn.run(cmd) cmd = "ssh [email protected] 'rm /data/www/wenlin/html/transfer/barcode_lineup_files/%s_rescued_read_assembled_mis1*.txt'" % sp cmn.run(cmd) cmd = 'rm /project/biophysics/Nick_lab/wli/archive/BWA_barcodes/lineup_files/%s_rescued_read_assembled_mis1*.txt' % sp cmn.run(cmd) ddirs = [ '/project/biophysics/Nick_lab/wli/archive/BWA_barcodes/lineup_files', '[email protected]:/data/www/wenlin/html/transfer/barcode_lineup_files/other_data' ] cmd = 'rsync -av %s [email protected]:/data/www/wenlin/html/transfer/barcode_lineup_files/%s_%s' % ( fmis, sp, fmis) cmn.run(cmd) for ddir in ddirs: for fn in fns: if 'rescued_read_assembled_mis1' in fn and '/other_data' in ddir: continue cmd = 'rsync -av %s %s/%s_%s' % (fn, ddir, sp, cmn.lastName(fn)) cmn.run(cmd)
def scaf_header_name(olabel):
    """Return the header-file name for the assembly file name *olabel*.

    The last '.'-separated extension is dropped and '_scaf.header' is
    appended. A name without any extension keeps its full stem (the former
    split/join expression reduced it to an empty string).
    """
    return olabel.rsplit('.', 1)[0] + '_scaf.header'


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == '__main__':
    # Write one "scaffold<TAB>position" line (1-based) for every base of
    # every sequence returned by read_fa, in the order read_fa reports.
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        print("Usage: *.py assembly", file=sys.stderr)
        sys.exit()

    seqDict, order_scafs = read_fa(fn)

    # write into the same directory as the assembly
    olabel = cmn.lastName(fn)
    dnlabel = scaf_header_name(olabel)
    dn = fn.replace(olabel, dnlabel)

    with open(dn, 'w') as fdn:
        for name in order_scafs:
            # only the first whitespace-delimited token of the defline is used
            shortname = name.split()[0]
            length = len(seqDict[name])
            fdn.writelines(
                '%s\t%s\n' % (shortname, position)
                for position in range(1, length + 1)
            )
with open(fn) as fp: for line in fp: if line[0] == '>': defline = line.strip() if sampleID in defline: isTaken = True else: isTaken = False else: if isTaken: takenSeq = line.strip() print('take the base seq as %s for %s' % (defline, sampleID)) break goodP = [i for i in range(len(takenSeq)) if takenSeq[i] not in gapChars] f_label = cmn.lastName(fn).replace('.fasta', '').replace('.fa', '') dn = f_label + '_base%s.fa' % sampleID dp = open(dn, 'w') with open(fn) as fp: for line in fp: if line[0] == '>': defline = line.strip() else: goodSeq = ''.join([line[i] for i in goodP]) dp.write('%s\n%s\n' % (defline, goodSeq)) dp.close()
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == '__main__': #options=parse_options() try: fn = sys.argv[1] except: print("Usage: *.py sam", file=sys.stderr) sys.exit() dnH = cmn.lastName(fn).replace('.sam', '') + '.HighQmapStat' dnL = cmn.lastName(fn).replace('.sam', '') + '.mapStat' rdict = {} hdict = {} samfile = pysam.AlignmentFile(fn) for record in samfile: if record.is_unmapped: continue scaf = record.reference_name aligns = record.get_aligned_pairs() N = len([each for each in aligns if None not in each]) if scaf not in rdict: rdict[scaf] = [0, 0]
adict[defline] = seq.upper() return adict #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__=='__main__': #options=parse_options() try: fn=sys.argv[1] except: print("Usage: *.py aln.fa", file=sys.stderr) sys.exit() seqDict = read_fa(fn) newDict = {key: ''.join([rdict[char] for char in seqDict[key][::-1]]) for key in seqDict} dn = cmn.lastName(fn) + '.reverse' fastas = ['>%s_reverse\n%s\n' % (name, newDict[name]) for name in newDict] cmn.write_file(''.join(fastas), dn)
adict[defline] = seq.upper() return adict #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__=='__main__': #options=parse_options() try: fn=sys.argv[1] except: print("Usage: *.py aln.fa", file=sys.stderr) sys.exit() seqDict = read_fa(fn) newDict = {key: ''.join([rdict[char] for char in seqDict[key]]) for key in seqDict} dn = cmn.lastName(fn) + '.cmpl' fastas = ['>%s_reverse\n%s\n' % (name, newDict[name]) for name in newDict] cmn.write_file(''.join(fastas), dn)
if copy_counts[0] > copy_counts[1]: char2 = char1 else: char1 = char2 else: print('unrecognized line: %s' % line, file=sys.stderr) sys.exit() seq1.append(char1) seq2.append(char2) #output the last phased_blocks.append('%s\t%s\t%s\t%s\t%s\t%s\n' % (lastPhase, lastScaf, lastPosition[1], right[1], lastPosition[2], right[2])) dnlabel = cmn.lastName(fn).replace('.vcf', '') sp = dnlabel.split('_')[1] dn = dnlabel + '_phased.fa' with open(dn, 'w') as dp: dp.write('>%s_ref_or_phase1\n' % sp) dp.write(''.join(seq1)) dp.write('\n') dp.write('>%s_called_or_phase2\n' % sp) dp.write(''.join(seq2)) dp.write('\n') dn = dnlabel + 'phased.blocks' cmn.write_file(''.join(phased_blocks), dn)
sys.exit() fhead = '/work/biophysics/mtang/SNP_calling/indexed_references/Junonia_v2_scaf.header' #fhead = 'Calycopis_cecrops_assembly_V1.1_scaf.header' print('loading header info...') headDict = {} with open(fhead) as fp: for i, line in enumerate(fp): scaf, index = line.strip().split() try: headDict[scaf].append(i) except KeyError: headDict[scaf] = [i] print('finish loading header, begin parsing fasta...') outdir = '%s_scafs' % cmn.lastName(fn) cmn.mkdir(outdir) seqDict = read_fa(fn) for scaf in headDict: indexes = headDict[scaf] new = [] for name in seqDict: seq = seqDict[name] newSeq = ''.join([seq[i] for i in indexes]) fasta = '>%s\n%s\n' % (name, newSeq) new.append(fasta) dn = '%s/%s.fa' % (outdir, scaf) cmn.write_file(''.join(new), dn)
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == '__main__':
    # Print, for one VCF file: its label, total line count, number of lines
    # containing 'HaplotypeScore', how many of those also contain 'LowQual',
    # and the two ratios SNPs/total and lowqual/SNPs.
    #options=parse_options()
    import shlex

    try:
        fn = sys.argv[1]
    except IndexError:
        print("Usage: *.py vcf", file=sys.stderr)
        sys.exit()

    # the path is spliced into shell command lines, so quote it to survive
    # spaces and shell metacharacters
    quoted_fn = shlex.quote(fn)
    quoted_tmp = shlex.quote('%s.tmp' % fn)

    # assumes cmn.cmd2info runs the command through a shell and returns its
    # stdout as text -- TODO confirm
    total = cmn.cmd2info('wc -l %s' % quoted_fn).split()[0]
    SNPs = cmn.cmd2info('grep HaplotypeScore %s > %s; wc -l %s'
                        % (quoted_fn, quoted_tmp, quoted_tmp)).split()[0]
    lowqual = cmn.cmd2info('grep LowQual %s|wc -l ; rm %s'
                           % (quoted_tmp, quoted_tmp)).split()[0]

    # an empty file or a file without SNP lines would divide by zero;
    # report the undefined ratio as nan instead of crashing
    nan = float('nan')
    snp_ratio = int(SNPs) / float(total) if int(total) else nan
    lowqual_ratio = int(lowqual) / float(SNPs) if int(SNPs) else nan

    print(cmn.lastName(fn), total, SNPs, lowqual, snp_ratio, lowqual_ratio)
seq = [] else: seq.append(line.strip()) #last seq seqDict[sp].append(''.join(seq)) new = [] for sp in seqDict: seq1, seq2 = seqDict[sp] diffN = sum([seq1[i] != seq2[i] for i in range(len(seq1))]) if diffN < cutoff: if diffN == 0: seq = seq1 defline = '%s_unique' % sp else: seq = collapse_seqs(seq1, seq2) defline = '%s_diff%s' % (sp, diffN) fasta = '>%s\n%s\n' % (defline, seq) else: #need to keep both copy label = '%s_diff%s' % (sp, diffN) fasta = '>%s_cp1\n%s\n>%s_cp2\n%s\n' % (label, seq1, label, seq2) new.append(fasta) dn = '%s_collapse_cut%s.fa' % (cmn.lastName(fn).replace('.fa', '') , cutoff) cmn.write_file(''.join(new), dn)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__=='__main__': #options=parse_options() try: fnlist, outdir = sys.argv[1:] except: print("Usage: *.py falist outdirName", file=sys.stderr) sys.exit() outlabel = cmn.lastName(fnlist) #scan through to see the total IDlist IDs = set([]) fns = cmn.file2lines(fnlist) for fn in fns: with open(fn) as fp: for line in fp: if line[0] == '>': ID = name2ID(line[1:].strip()) IDs.add(ID) #read in sequence and partition shift = 0 final = {} setList = [] for fn in fns:
adict = {} alist = [] fastas = cmn.txt_read(fa).split('>')[1:] for each in fastas: lines = each.strip().split('\n') defline = lines[0] alist.append(defline) seq = ''.join(lines[1:]) adict[defline] = seq return adict, alist #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == '__main__': #options=parse_options() try: fn = sys.argv[1] except: print("Usage: *.py", file=sys.stderr) sys.exit() seqDict, orderlist = read_fa(fn) new = ['>%s\n%s\n' % (name, seqDict[name].upper()) for name in orderlist] dn = cmn.lastName(fn).replace('.fa', '') + '_4tree.fa' cmn.write_file(''.join(new), dn)
continue if N == 0 or N == 1: #skip the all gapped positions #also skip same character lines continue else: for i, char in enumerate(chars): if char in missing_data: result[i].append(-9) continue try: code = char_label[char] except KeyError: code = current_count char_label[char] = current_count current_count += 1 result[i].append(code) dn = cmn.lastName(fn) + 'STRUCTUREinput.txt' new = ['\t'.join(map(str, line)) for line in result] new.append('') cmn.write_lines(new, dn) print('number of loci: %s' % (len(line) - 1))
if __name__=='__main__': #options=parse_options() try: #fn, f_table = sys.argv[1:3] fn = sys.argv[1] except: print("Usage: *.py fqlist", file=sys.stderr) sys.exit() cmn.mkdir('tmpStat') IDlist = set([]) fq_groups = {} for line in cmn.file2lines(fn): Id = cmn.lastName(line).split('_')[0] Id = Id.replace('NVG-', '').replace('11-BOA-','').replace('LEP-', 'LEP') IDlist.add(Id) fq = os.path.abspath(line) try: fq_groups[Id].append(fq) except KeyError: fq_groups[Id] = [fq] nameDict = get_names_4barcode() fall = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/species_barcodes_4mapping.fa' #fall = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes.fasta' seqDict = read_fa(fall) fadd = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/addedBaits_fromPipeline.fa' if cmn.filexist(fadd):
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == '__main__': #fns = cmn.cmd2lines('ls ../0_libs/*/*.fq') #fns += cmn.cmd2lines('ls ../0_libs/*/*.fastq') #fns = cmn.file2lines('../fqlist') #fns = cmn.cmd2lines('ls /project/biophysics/Nick_lab/wli/sequencing/Eudamine/BEAST_timing/tmp_link_fastq/*q') fns = cmn.file2lines(sys.argv[1]) #skip_list = set(['5316', '5721']) gdict = {} for fn in fns: #items = fn.split('/') #sp = items[-2] sp = cmn.lastName(fn).split('_')[0] try: gdict[sp].append(fn) except: gdict[sp] = [fn] formatcmds = '\n\n' for sp in gdict: #if sp in skip_list: # continue cmd = '' fns = gdict[sp] for fn in fns: cmd += 'fq2fa %s >> %s.fa; ' % (fn, sp)
seq = ''.join(lines[1:]) adict[defline] = seq return adict, len(seq) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__=='__main__': #options=parse_options() try: fn=sys.argv[1] except: print("Usage: *.py", file=sys.stderr) sys.exit() seqDict, length = read_fa(fn) new = ['%s\t%s' % (len(seqDict), length)] for name in seqDict: new.append('%s %s' % (name, seqDict[name])) dn = cmn.lastName(fn) + '.phylip' cmn.write_lines(new, dn)
def read_fasta_records(lines):
    """Collect FASTA records from an iterable of text lines.

    Returns a dict mapping each stripped header line (including the leading
    '>') to the text "<header>\\n<sequence>\\n". Sequence lines that belong
    to one record are concatenated, so wrapped (multi-line) FASTA keeps its
    full sequence. Blank lines are ignored. A header that is never followed
    by a sequence line produces no entry. When a header occurs twice, the
    later record replaces the earlier one.

    Raises ValueError if sequence text appears before the first header.
    """
    chunks = {}
    label = None
    for line in lines:
        text = line.strip()
        if line.startswith('>'):
            label = text
            # a repeated header starts over: the last record wins
            chunks.pop(label, None)
        elif not text:
            continue
        elif label is None:
            raise ValueError('sequence line found before any FASTA header')
        else:
            chunks.setdefault(label, []).append(text)
    return {
        header: '%s\n%s\n' % (header, ''.join(parts))
        for header, parts in chunks.items()
    }


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == '__main__':
    # Write ten copies of the input FASTA into shuffle_genome/, each with
    # its records in a freshly shuffled order.
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        print("Usage: *.py", file=sys.stderr)
        sys.exit()

    with open(fn) as fp:
        adict = read_fasta_records(fp)

    times = 10
    keys = list(adict.keys())
    cmn.mkdir('shuffle_genome')
    for each in range(times):
        # shuffles in place; every round starts from the previous order
        random.shuffle(keys)
        new = [adict[key] for key in keys]
        dn = 'shuffle_genome/%s_shuffle%s' % (cmn.lastName(fn), each)
        cmn.write_file(''.join(new), dn)
def is_missing_only(seq):
    """Return True when *seq* is empty or consists only of '-' and 'N'.

    Both characters are stripped in a single pass so that mixed runs such
    as 'NN--NN' are recognised; stripping '-' first and 'N' second left the
    inner run behind and treated such a sequence as informative.
    """
    return seq.strip('-N') == ''


if __name__ == '__main__':
    # For every gene in the range file, slice [i:j] out of each sequence and
    # write the slices that contain real data to "<outdir>/<gene>.fa".
    #options=parse_options()
    try:
        fn, frange = sys.argv[1:]
    except ValueError:
        # wrong number of command-line arguments
        print("Usage: *.py", file=sys.stderr)
        sys.exit()

    geneRange = read_gene_range(frange)
    seqDict, order_list = read_fa(fn)

    stat = []
    outdir = '%s_gene_fasta' % cmn.lastName(fn)
    cmn.mkdir(outdir)

    for gene in geneRange:
        i, j = geneRange[gene]
        print(gene, i, j)
        stat.append('%s\t%s\n' % (gene, j - i))

        dn = '%s/%s.fa' % (outdir, gene)
        with open(dn, 'w') as dp:
            for name in order_list:
                seq = seqDict[name][i:j]
                if is_missing_only(seq):
                    # nothing but gaps / unknown bases in this window
                    continue
                dp.write('>%s\n%s\n' % (name, seq))