def split_and_rename(infile, outdir):
    """Write every record of a FASTA file to its own ``<name>.fasta`` file.

    The output name is the first whitespace-separated token of the record
    header with its first character dropped (presumably the leading ``>``;
    confirm against ``fastaparser.read_fasta``).  Each output path is
    printed as it is written.

    infile -- path of the multi-record FASTA file to split
    outdir -- directory that receives the per-record files
    """
    for contig in fastaparser.read_fasta(infile):
        record_id = contig[0].split()[0][1:]
        filename = join(outdir, record_id + ".fasta")
        # Single-argument parenthesised print: same output on Python 2 and 3.
        print(filename)
        fastaparser.write_fasta_to_file(filename, [contig])
def parse_mash(contig_file, table):
    """Pick contigs that have many close neighbours in a distance table.

    ``table`` is read as whitespace-separated rows: column 0 is a contig
    short name, column 1 another name, column 2 a float distance.  A row
    with distance below 0.1 records column 1 as "similar" to column 0.

    Contigs are then visited from most to fewest similar entries.  A contig
    already listed as similar to an earlier one is skipped; otherwise all of
    its similar entries are marked as used and, if it has more than 10 of
    them, it is reported and kept.  The kept contigs are written to
    ``interesting.fasta`` next to ``contig_file``.

    contig_file -- FASTA file with the contigs
    table       -- path of the distance table

    Raises KeyError if column 0 of a row names a contig that is not present
    in ``contig_file``.
    """
    contigs = fastaparser.read_fasta(contig_file)

    # Index contigs by short name once instead of rescanning the list for
    # every hit; setdefault keeps the first record for a repeated name,
    # which is what a front-to-back scan would have found.
    contig_by_name = {}
    similar_lists = {}
    for contig in contigs:
        short_name = get_short_name(contig[0])
        contig_by_name.setdefault(short_name, contig)
        similar_lists[short_name] = []

    with open(table, 'r') as table_file:
        for line in table_file:
            arr = line.split()
            if not arr:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            if float(arr[2]) < 0.1:
                similar_lists[arr[0]].append(arr[1])
    print("processed input")

    # Entries stay two-element lists because they are printed verbatim below.
    to_sort = [[name, len(hits)] for name, hits in similar_lists.items()]
    sorted_similar = sorted(to_sort, key=itemgetter(1), reverse=True)

    outcontigs = []
    used = set()
    for contig_info in sorted_similar:
        name, hit_count = contig_info
        if name in used:
            continue
        used.update(similar_lists[name])
        if hit_count > 10:
            print(contig_info)
            print(similar_lists[name])
            outcontigs.append(contig_by_name[name])

    result_f = join(os.path.dirname(contig_file), "interesting.fasta")
    # Drop any stale result first (presumably write_fasta_to_file appends --
    # confirm).  os.remove avoids building a shell command from a path.
    if os.path.isfile(result_f):
        os.remove(result_f)
    fastaparser.write_fasta_to_file(result_f, outcontigs)
def extract_circular_from_file(file, indir, outdir):
    """Copy contigs whose two ends are identical into ``<stem>.circular.fasta``.

    A contig of at least 500 bases is reported as circular when, for some k
    from 200 down to 51, its first k bases equal its last k bases.  The
    largest such k is printed and the contig is kept once.

    file   -- name of the FASTA file, relative to ``indir``
    indir  -- directory containing ``file``
    outdir -- directory receiving the ``.circular.fasta`` output
    """
    out_file = join(outdir, os.path.splitext(file)[0] + ".circular.fasta")
    # Read from the directory the caller passed in, not from sys.argv[1].
    contigs = fastaparser.read_fasta(join(indir, file))
    circulars = []
    for contig in contigs:
        seq = contig[1]
        if len(seq) < 500:
            continue
        # len(seq) >= 500 here, so every k in 200..51 is a valid overlap size.
        for kval in range(200, 50, -1):
            if seq[:kval] == seq[-kval:]:
                print(contig[0] + " is circular " + str(kval))
                circulars.append(contig)
                break
    fastaparser.write_fasta_to_file(out_file, circulars)
def glue_and_rename(indir, outfile):
    """Concatenate FASTA files from a directory, tagging each record name.

    Only files whose name has at least four dot-separated parts are used.
    Each record header gets ``" <part0>.<part1>"`` of the file name
    appended, is printed, and is written to ``outfile`` one record at a
    time (so ``fastaparser.write_fasta_to_file`` is presumably appending --
    confirm).

    indir   -- directory to scan
    outfile -- path of the combined FASTA file
    """
    # Sorted so the order of the glued output does not depend on the
    # arbitrary order os.listdir happens to return.
    for file_name in sorted(os.listdir(indir)):
        parts = file_name.split('.')
        if len(parts) < 4:
            continue
        suffix = " " + parts[0] + "." + parts[1]
        for contig in fastaparser.read_fasta(join(indir, file_name)):
            new_name = contig[0] + suffix
            print(new_name)
            # An explicit one-record list, not a one-shot zip iterator.
            fastaparser.write_fasta_to_file(outfile, [(new_name, contig[1])])
def extract_not_listed(infasta, list):
    """Write the records of ``infasta`` whose name is not in a list file.

    The list file holds one name per line (first whitespace-separated
    token); names are compared against the first token of each FASTA header
    after prefixing them with ``>``.  The number of listed names, of input
    records and of kept records is printed, and the kept records go to
    ``<infasta without extension>.unknown.fasta``.

    infasta -- path of the input FASTA file
    list    -- path of the file with names to exclude
    """
    listed = set()
    with open(list, 'r') as list_file:
        for line in list_file:
            fields = line.split()
            if fields:  # skip blank lines instead of failing on them
                listed.add(">" + fields[0])
    print(len(listed))

    contigs = fastaparser.read_fasta(infasta)
    print(len(contigs))

    outcontigs = [contig for contig in contigs
                  if contig[0].split()[0] not in listed]
    print(len(outcontigs))

    # Same result as slicing off six characters for "*.fasta" inputs, but
    # other extensions (".fa", ".fna", ...) no longer get mangled.
    if infasta.endswith(".fasta"):
        base = infasta[:-len(".fasta")]
    else:
        base = os.path.splitext(infasta)[0]
    outfasta = base + ".unknown.fasta"

    # Drop any stale result first (presumably write_fasta_to_file appends --
    # confirm).  os.remove avoids building a shell command from a path.
    if os.path.isfile(outfasta):
        os.remove(outfasta)
    fastaparser.write_fasta_to_file(outfasta, outcontigs)
def _iter_n_runs(seq):
    """Yield (start, end) for every maximal run of 'N' in seq (end exclusive)."""
    start = seq.find("N")
    while start != -1:
        end = start + 1
        while end < len(seq) and seq[end] == 'N':
            end += 1
        yield start, end
        start = seq.find("N", end)


def break_scaffolds(argv):
    """Report N-run sizes of a scaffold FASTA, or break scaffolds on long N-runs.

    argv -- command-line style list:
        [prog, input_fasta]
            print each N-run length with its count (ascending length),
            then the number of runs and their average length;
        [prog, input_fasta, threshold, output_fasta]
            split every scaffold at each N-run of at least ``threshold``
            bases and write the pieces, named ``<first header token>_<i>``
            with i starting at 1, to ``output_fasta``.

    Any other argument count prints the usage text and exits via
    ``sys.exit()``.  Only upper-case 'N' is treated as a gap.  A qualifying
    run at the very start or end of a scaffold produces an empty piece.
    """
    if len(argv) not in (2, 4):
        print("Usage: " + argv[0] + " <input fasta (scaffolds)> (to get stats on sizes of Ns regions)")
        print("Usage: " + argv[0] + " <input fasta (scaffolds)> <THRESHOLD> <output fasta (contigs)> (to break contigs on Ns regions of size >= THRESHOLD)")
        sys.exit()

    do_break = len(argv) == 4
    threshold = int(argv[2]) if do_break else None

    sizes_of_Ns_regions = dict()
    new_fasta = []
    for name, seq in fastaparser.read_fasta(argv[1]):
        cur_contig_number = 1
        cur_contig_start = 0
        for start, end in _iter_n_runs(seq):
            run_len = end - start
            if not do_break:
                sizes_of_Ns_regions[run_len] = sizes_of_Ns_regions.get(run_len, 0) + 1
            elif run_len >= threshold:
                new_fasta.append((name.split()[0] + "_" + str(cur_contig_number),
                                  seq[cur_contig_start:start]))
                cur_contig_number += 1
                cur_contig_start = end
        if do_break:
            # Whatever follows the last qualifying run (or the whole scaffold).
            new_fasta.append((name.split()[0] + "_" + str(cur_contig_number),
                              seq[cur_contig_start:]))

    if do_break:
        fastaparser.write_fasta_to_file(argv[3], new_fasta)
        return

    avg_len = 0.0
    nruns = 0
    # Walk the sizes in ascending order; iterating the dict directly only
    # yields keys, so the count has to be looked up explicitly.
    for size in sorted(sizes_of_Ns_regions):
        occurrences = sizes_of_Ns_regions[size]
        avg_len += size * occurrences
        nruns += occurrences
        print(str(size) + " " + str(occurrences))
    if nruns:
        # No N-runs at all would otherwise divide by zero.
        avg_len /= nruns
    print("N-runs: " + str(nruns) + ", avg. len: " + str(avg_len))
if not os.path.isdir(output_dir): os.makedirs(output_dir) # creating single-entry references and chains params_subst_dict = dict() input_fasta = fastaparser.read_fasta(sys.argv[1]) cwd = os.getcwd() os.chdir(ideal_assembler_bin_dir) for K in K_list: print("Starting with K=" + str(K)) result_fasta = [] for id, fasta_entry in enumerate(input_fasta): cur_ref_name = os.path.join(output_dir, 'chr_' + str(id) + '.fasta') cur_chain_name = os.path.join(output_dir, 'chr_' + str(id) + '_K' + str(K) + '_chain') log_filename = os.path.join(output_dir, 'chr_' + str(id) + '_K' + str(K) + '.log') fastaparser.write_fasta_to_file(cur_ref_name, [fasta_entry]) shutil.copy(chain_template, cur_chain_name) cur_params_subst_dict = dict(params_subst_dict) cur_params_subst_dict['OUT_BASE'] = 'chr_' + str(id) + '_K' + str(K) tmp_dir = os.path.join(ideal_assembler_bin_dir, 'data/cap/cache/env_' + cur_params_subst_dict['OUT_BASE']) cur_params_subst_dict['REFERENCE'] = cur_ref_name cur_params_subst_dict['KMER_SIZE'] = str(K) update_template_params(cur_chain_name, cur_params_subst_dict) cmd_line = ideal_assembler_bin + ' ' + cur_chain_name + ' >> ' + log_filename + ' 2>> ' + log_filename print('running with ' + os.path.basename(cur_ref_name) + ' on K=' + str(K)) return_code = os.system(cmd_line) if return_code: print("Error happened when executing cmd_line " + cmd_line) sys.exit(1) catch_phrase = 'Outputting contigs to' for line in open(log_filename):
#!/usr/bin/python

############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# Copyright (c) 2011-2014 Saint Petersburg Academic University
# All Rights Reserved
# See file LICENSE for details.
############################################################################

# Convert contigs (i.e. a reference) for the experiment of running SPAdes on E. coli MC reads in "IonTorrent" mode
# (all series of repeated nucleotides are changed to single nucleotides).

import sys
import os
import fastaparser


def collapse_repeats(seq):
    """Return seq with every run of identical adjacent characters reduced to one.

    An empty sequence is returned unchanged.
    """
    # Keep a character only when it differs from its predecessor; joining
    # once avoids rebuilding the string for every kept base.
    return "".join(base for i, base in enumerate(seq)
                   if i == 0 or seq[i - 1] != base)


# MAIN
if len(sys.argv) < 3:
    print("Usage: " + sys.argv[0] + " <input fasta> <output fasta>")
    sys.exit()

new_fasta = []
for name, seq in fastaparser.read_fasta(sys.argv[1]):
    new_fasta.append((name, collapse_repeats(seq)))
fastaparser.write_fasta_to_file(sys.argv[2], new_fasta)