Пример #1
0
def split_and_rename(infile, outdir):
    """Write every record of a FASTA file into its own file under ``outdir``.

    The output file name is the first whitespace-delimited token of the
    record header with its first character dropped, followed by ``.fasta``.
    Each output path is printed before the record is written.

    Args:
        infile: path of the FASTA file to split.
        outdir: directory that receives one FASTA file per record.
    """
    for record in fastaparser.read_fasta(infile):
        first_token = record[0].split()[0]
        # Drop the first character of the token
        # (presumably the leading '>' of the FASTA header -- confirm).
        target = join(outdir, first_token[1:] + ".fasta")
        print(target)
        fastaparser.write_fasta_to_file(target, [record])
Пример #2
0
def parse_mash(contig_file, table, max_dist=0.1, min_similar=10):
    """Select contigs that have many close neighbours in a distance table.

    Every line of ``table`` is split on whitespace; field 0 is the query
    name, field 1 the neighbour name and field 2 the distance.  Pairs with
    a distance below ``max_dist`` are collected per query.  Queries are then
    visited from the largest neighbour list to the smallest; a query that
    already appeared in the neighbour list of an earlier visited query is
    skipped, and queries with more than ``min_similar`` neighbours are
    written to ``interesting.fasta`` next to ``contig_file``.

    Args:
        contig_file: FASTA file with the contigs.
        table: whitespace-separated distance table.
        max_dist: exclusive upper bound on the distance of a "similar" pair.
        min_similar: a contig is reported only if it has more than this
            many similar entries.

    Raises:
        KeyError: if the table mentions a query name that is not the short
            name of any contig in ``contig_file``.
        ValueError: if the distance field is not a number.
    """
    contigs = fastaparser.read_fasta(contig_file)

    similar_lists = {}
    # Short name -> record, so selected contigs are fetched without
    # rescanning the whole list.  The first record wins on duplicate names,
    # which is what a first-match linear scan would return.
    contig_by_name = {}
    for contig in contigs:
        short_name = get_short_name(contig[0])
        similar_lists[short_name] = []
        contig_by_name.setdefault(short_name, contig)

    with open(table, 'r') as table_file:
        for line in table_file:
            arr = line.split()
            if len(arr) < 3:
                # Blank or truncated line: nothing to record.
                continue
            if float(arr[2]) < max_dist:
                similar_lists[arr[0]].append(arr[1])
    print("processed input")

    to_sort = [[name, len(similar_lists[name])] for name in similar_lists]
    sorted_similar = sorted(to_sort, key=itemgetter(1), reverse=True)

    outcontigs = []
    used = set()
    for contig_info in sorted_similar:
        name, similar_count = contig_info
        if name in used:
            continue
        # Everything similar to this contig is considered covered by it.
        used.update(similar_lists[name])
        if similar_count > min_similar:
            print(contig_info)
            print(similar_lists[name])
            outcontigs.append(contig_by_name[name])

    result_f = join(os.path.dirname(contig_file), "interesting.fasta")
    # Drop a stale result first, best-effort (a missing file is fine).
    # NOTE(review): presumably needed because write_fasta_to_file appends
    # to an existing file -- confirm.
    try:
        os.remove(result_f)
    except OSError:
        pass
    fastaparser.write_fasta_to_file(result_f, outcontigs)
Пример #3
0
def extract_circular_from_file(file, indir, outdir):
    """Copy contigs whose two ends overlap exactly into a separate FASTA file.

    A contig of at least 500 bases is reported as circular when its first
    ``k`` bases equal its last ``k`` bases for some ``k`` from 200 down to
    51; the largest such ``k`` is printed together with the contig header.
    The matching contigs are written to
    ``<outdir>/<file without extension>.circular.fasta``.

    Args:
        file: name of the FASTA file inside ``indir``.
        indir: directory containing ``file``.
        outdir: directory that receives the ``.circular.fasta`` file.
    """
    out_file = join(outdir, os.path.splitext(file)[0] + ".circular.fasta")
    # Read from the directory passed by the caller, not from sys.argv[1].
    contigs = fastaparser.read_fasta(join(indir, file))
    circulars = []
    for contig in contigs:
        seq = contig[1]
        if len(seq) < 500:
            continue
        # The sequence has at least 500 bases here, so every k below
        # (at most 200) is shorter than the sequence.
        for kval in range(200, 50, -1):
            if seq[:kval] == seq[-kval:]:
                print(contig[0] + " is circular " + str(kval))
                circulars.append(contig)
                break
    fastaparser.write_fasta_to_file(out_file, circulars)
Пример #4
0
def glue_and_rename(indir, outfile):
    """Write the records of selected FASTA files in ``indir`` to ``outfile``.

    Only directory entries whose name has at least four dot-separated parts
    are processed.  Each record header gets a space and the first two parts
    of the file name (joined by a dot) appended; the new header is printed
    and the record is passed to ``fastaparser.write_fasta_to_file`` one at a
    time.

    Args:
        indir: directory scanned for input FASTA files.
        outfile: path handed to every ``write_fasta_to_file`` call.
    """
    for entry in os.listdir(indir):
        name_parts = entry.split('.')
        if len(name_parts) >= 4:
            for record in fastaparser.read_fasta(join(indir, entry)):
                header = record[0] + " " + name_parts[0] + "." + name_parts[1]
                print(header)
                renamed = [(header, record[1])]
                fastaparser.write_fasta_to_file(outfile, renamed)
Пример #5
0
def extract_not_listed(infasta, list):
    """Write the contigs whose name is not in a list file to a new FASTA file.

    The first whitespace-delimited token of every non-blank line of ``list``
    is prefixed with ``>`` and compared with the first token of each contig
    header.  Contigs without a match are written to
    ``<infasta without extension>.unknown.fasta``.  The number of listed
    names, of input contigs and of kept contigs is printed along the way.

    Args:
        infasta: input FASTA file.
        list: text file with one contig name per line (the parameter name
            shadows the builtin but is kept for existing callers).
    """
    listed = set()
    with open(list, 'r') as list_file:
        for line in list_file:
            fields = line.split()
            if fields:  # skip blank lines instead of failing on them
                listed.add(">" + fields[0])
    print(len(listed))

    contigs = fastaparser.read_fasta(infasta)
    print(len(contigs))
    outcontigs = [contig for contig in contigs
                  if contig[0].split()[0] not in listed]
    print(len(outcontigs))

    # Strip the input extension; ".fasta" is the expected one, anything
    # else falls back to a generic extension split so the name is not
    # truncated by a fixed six characters.
    if infasta.endswith(".fasta"):
        base = infasta[:-len(".fasta")]
    else:
        base = os.path.splitext(infasta)[0]
    outfasta = base + ".unknown.fasta"

    # Drop a stale result first, best-effort (a missing file is fine).
    # NOTE(review): presumably needed because write_fasta_to_file appends
    # to an existing file -- confirm.
    try:
        os.remove(outfasta)
    except OSError:
        pass
    fastaparser.write_fasta_to_file(outfasta, outcontigs)
Пример #6
0
def break_scaffolds(argv):
    """Break scaffolds on runs of ``N`` or report statistics on those runs.

    With two arguments (``argv[1]`` = input FASTA) the sizes of all N-runs
    are counted and printed in increasing order of size, followed by the
    number of runs and their average length.

    With four arguments (``argv[1]`` = input FASTA, ``argv[2]`` = threshold,
    ``argv[3]`` = output FASTA) every scaffold is cut at each N-run whose
    length is at least the threshold.  The pieces are named
    ``<first header token>_<piece number>`` and written to the output file.

    Any other argument count prints the usage and exits.

    Args:
        argv: command-line style argument list, program name first.
    """
    if len(argv) not in (2, 4):
        print(
            "Usage: " + argv[0] +
            " <input fasta (scaffolds)> (to get stats on sizes of Ns regions)")
        print(
            "Usage: " + argv[0] +
            " <input fasta (scaffolds)> <THRESHOLD> <output fasta (contigs)> (to break contigs on Ns regions of size >= THRESHOLD)"
        )
        sys.exit()

    BREAK_SCAFFOLDS = len(argv) == 4
    N_NUMBER = int(argv[2]) if BREAK_SCAFFOLDS else None

    sizes_of_Ns_regions = dict()
    new_fasta = []
    for name, seq in fastaparser.read_fasta(argv[1]):
        cur_contig_number = 1
        cur_contig_start = 0
        start = seq.find("N")
        while start != -1:
            # Extend the run to the first non-N position (or the end).
            end = start + 1
            while (end != len(seq)) and (seq[end] == 'N'):
                end += 1
            run_length = end - start

            if BREAK_SCAFFOLDS:
                if run_length >= N_NUMBER:
                    new_fasta.append(
                        (name.split()[0] + "_" + str(cur_contig_number),
                         seq[cur_contig_start:start]))
                    cur_contig_number += 1
                    cur_contig_start = end
            else:
                sizes_of_Ns_regions[run_length] = \
                    sizes_of_Ns_regions.get(run_length, 0) + 1

            # seq[end] is not 'N' (or end is past the sequence), so the
            # next run, if any, starts at or after this position.
            start = seq.find("N", end)

        if BREAK_SCAFFOLDS:
            # Remainder after the last break (the whole scaffold if none).
            new_fasta.append((name.split()[0] + "_" + str(cur_contig_number),
                              seq[cur_contig_start:]))

    if BREAK_SCAFFOLDS:
        fastaparser.write_fasta_to_file(argv[3], new_fasta)
    else:
        total_len = 0
        nruns = 0
        # Iterate over the sizes themselves; unpacking the dict's int keys
        # as (key, value) pairs fails.
        for size in sorted(sizes_of_Ns_regions):
            occurrences = sizes_of_Ns_regions[size]
            total_len += size * occurrences
            nruns += occurrences
            print(str(size) + " " + str(occurrences))
        # No N-runs at all: report an average of 0.0 instead of dividing
        # by zero.
        avg_len = float(total_len) / nruns if nruns else 0.0
        print("N-runs: " + str(nruns) + ", avg. len: " + str(avg_len))
Пример #7
0
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# creating single-entry references and chains
params_subst_dict = dict()
input_fasta = fastaparser.read_fasta(sys.argv[1])
cwd = os.getcwd()
os.chdir(ideal_assembler_bin_dir)
for K in K_list:
    print("Starting with K=" + str(K))
    result_fasta = []
    for id, fasta_entry in enumerate(input_fasta):
        cur_ref_name = os.path.join(output_dir, 'chr_' + str(id) + '.fasta')
        cur_chain_name = os.path.join(output_dir, 'chr_' + str(id) + '_K' + str(K) + '_chain')
        log_filename = os.path.join(output_dir, 'chr_' + str(id) + '_K' + str(K) + '.log')
        fastaparser.write_fasta_to_file(cur_ref_name, [fasta_entry])
        shutil.copy(chain_template, cur_chain_name)
        cur_params_subst_dict = dict(params_subst_dict)
        cur_params_subst_dict['OUT_BASE'] = 'chr_' + str(id) + '_K' + str(K)
        tmp_dir = os.path.join(ideal_assembler_bin_dir, 'data/cap/cache/env_' + cur_params_subst_dict['OUT_BASE'])
        cur_params_subst_dict['REFERENCE'] = cur_ref_name
        cur_params_subst_dict['KMER_SIZE'] = str(K)
        update_template_params(cur_chain_name, cur_params_subst_dict)
        cmd_line = ideal_assembler_bin + ' ' + cur_chain_name + ' >> ' + log_filename + ' 2>> ' +  log_filename
        print('running with ' + os.path.basename(cur_ref_name) + ' on K=' + str(K))
        return_code = os.system(cmd_line)
        if return_code:
            print("Error happened when executing cmd_line " + cmd_line)
            sys.exit(1)
        catch_phrase = 'Outputting contigs to'
        for line in open(log_filename):
#!/usr/bin/python

############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# Copyright (c) 2011-2014 Saint Petersburg Academic University
# All Rights Reserved
# See file LICENSE for details.
############################################################################

# Convert contigs (i.e., a reference) for the experiment of running SPAdes on E. coli MC reads in "IonTorrent" mode
# (every run of repeated nucleotides is collapsed to a single nucleotide).

import sys
import os
import fastaparser

# MAIN
# Usage: <script> <input fasta> <output fasta>
# Collapses every run of identical consecutive characters in each sequence
# to a single character and writes the result to the output FASTA file.
if len(sys.argv) < 3:
    print("Usage: " + sys.argv[0] + " <input fasta> <output fasta>")
    sys.exit()

new_fasta = []
for name, seq in fastaparser.read_fasta(sys.argv[1]):
    # Keep a character only when it starts a new run.  Building the result
    # with join avoids repeated string concatenation, and an empty sequence
    # simply yields an empty result instead of failing on seq[0].
    new_seq = "".join(
        seq[i] for i in range(len(seq))
        if i == 0 or seq[i - 1] != seq[i])
    new_fasta.append((name, new_seq))

fastaparser.write_fasta_to_file(sys.argv[2], new_fasta)