예제 #1
0
def standardize_flanks(flank_file_path, index_dict, flank, genome_dict2):
    """Find the index position of the start and end of the DNA match in the sequences with flanks. If either flank is not as long as the flank setting, add N's to reach that number. If the index is -1 (not found), go back into the genome sequence to get the correct locus"""

    seq_dict = {}
    seq_order = []
    modified = 0
    flank_in = open(flank_file_path, "r")
    adj_flank_path = flank_file_path + "_adj"
    print "In standardize_flanks, flank =", flank
    base_path = os.path.splitext(flank_file_path)[0]
    genomic_path = base_path + ".genomic"
    genomic_out = open(genomic_path, "w")

    for title, seq in fastaIO.FastaGeneralIterator(flank_in):
        add_left = ''
        add_right = ''
        title = title.strip()
        seq_order.append(title)
        seq_dict[title] = seq
        strand = title.split("Direction:")[1]
        strand = strand.strip()
        seq_len = len(seq)
        contig = title.split("Sbjct:")[1].split(" ")[0]
        locus_str = title.split("Location:(")[1].split(" - ")
        start = int(locus_str[0])
        end = int(locus_str[1].split(")")[0])
        name = title
        #get genomic copy without flanks as PHI only report hit seq not genomic, then proceed
        genomic_seq = fastaIO.sequence_retriever(contig, start, end, 0,
                                                 genome_dict2)
        if strand == 'minus':
            genomic_seq = fastaIO.reverse_complement(genomic_seq)
        print >> genomic_out, ">" + title + "\n" + genomic_seq

        if args.Type == 'nucl':
            name = title.split(' ')[0]
        if not seq_len or seq_len == 0:
            left_flank_len = -1
            right_flank_index = -1
        else:
            left_flank_len = seq.upper().find(index_dict[name]['left'])
            if left_flank_len < flank and left_flank_len != -1:
                retry = seq.upper().find(index_dict[name]['left'], flank - 20)
                if retry == flank:
                    left_flank_len = retry

            right_flank_index = seq.upper().rfind(index_dict[name]['right'])
            right_flank_start = right_flank_index + 25  #first nt of flank, 1st nt after search string
            if right_flank_start > (seq_len -
                                    flank) and right_flank_index != -1:
                retry = seq.upper().rfind(index_dict[name]['right'], 0,
                                          (seq_len - flank) + 20)
                if retry == (seq_len - flank):
                    right_flank_start = retry + 25

        if left_flank_len == -1 or right_flank_index == -1:
            new_seq = fastaIO.sequence_retriever(contig, start, end, flank,
                                                 genome_dict2)
            if strand == 'minus':
                new_seq = fastaIO.reverse_complement(new_seq)
            seq_dict[title] = new_seq
            modified += 1
            continue
        right_flank_len = seq_len - right_flank_start
        if left_flank_len < flank:
            needed_left = flank - left_flank_len
            add_left = "N" * needed_left
        if right_flank_len < flank:
            needed_right = flank - right_flank_len
            add_right = "N" * needed_right
        if add_left or add_right:
            new_seq = add_left + seq + add_right
            seq_dict[title] = new_seq
            modified += 1
    flank_in.close()
    genomic_out.close()
    if modified != 0:
        flank_out = open(adj_flank_path, "w")
        for title in seq_order:
            print >> flank_out, ">" + title + "\n" + seq_dict[title]
        flank_out.close()
        return (adj_flank_path)
    else:
        return (flank_file_path)
# Command-line arguments with the script name (sys.argv[0]) dropped.
args = sys.argv[1:]

# Print the usage text for this script and terminate through sys.exit(-1).
def usage():
    print """
    usage:
    python summarize_protein_matches.py <nonredundant_pep_file> <output_file>
    
    
    """
    sys.exit(-1)

# Anything other than exactly two arguments, or a help flag in first
# position, shows the usage text (usage() does not return).
if len(args) != 2 or sys.argv[1] in ('-h', '-help', '-H', '-Help', '--h',
                                     '--help'):
    usage()

# Number of sequences seen per hit class.
track_dict = defaultdict(int)

# The output file is opened line-buffered (third argument 1).
with open(sys.argv[1], "r") as pep_in, open(sys.argv[2], "w", 1) as out:
    for title, seq in fastaIO.FastaGeneralIterator(pep_in):
        # The class label is the text after the last strand marker; "plus_"
        # takes precedence over "minus_".  Titles carrying neither marker
        # are tallied under the empty label.
        hit_class = ''
        for marker in ("plus_", "minus_"):
            if marker in title:
                hit_class = title.rsplit(marker, 1)[1]
                break
        if hit_class == "?":
            hit_class = "Undetermined"
        track_dict[hit_class] += 1

    # One tab-separated "<class>\t<count>" line per class.
    for key, count in track_dict.items():
        out.write(key + "\t" + str(count) + "\n")
        
예제 #3
0
def runTarget(query, blast_out, blast_file_out, path):
    # Run the search/alignment/tree pipeline for a single query.
    #
    # Stages, in order: BLAST search -> BLAST drawing -> PHI homolog
    # extraction and drawing -> flank standardization -> optional length
    # filtering -> MAFFT alignment -> FastTreeMP tree -> tree image export.
    # The module-level option args.S stops the run early after the 'Blast',
    # 'PHI' or 'MSA' stage.
    #
    # Parameters:
    #   query          - path of the query FASTA file (opened for reading
    #                    below to obtain the query length).
    #   blast_out      - output directory; created with os.mkdir, which
    #                    raises if the directory already exists.
    #   blast_file_out - path prefix from which all output file names are
    #                    derived (".blast", ".list", ".dna", ".flank", ...).
    #   path           - directory that contains parse_target_list.py.
    #
    # Returns None in every case.
    #
    # Reads the module-level `args` (Type, S, E, W, genome, p_f, a, f, P).
    # NOTE(review): the two PHI_draw calls pass a bare `Type`, which is not
    # assigned in this function - presumably a module-level global holding
    # the same value as args.Type; confirm it exists.
    # NOTE(review): return codes of the external commands (cp, python,
    # FastTreeMP, treebest, convert) are not checked here.

    #make output directory
    os.mkdir(blast_out)

    #make command log file
    log_out = open(os.path.join(blast_out, "log.txt"), "w")
    print >> log_out, " ".join(sys.argv)
    log_out.close()

    #use blastn if DNA
    if args.Type == 'nucl':
        print "Using BLASTN\n"
        BLASTN(query, blast_file_out)

    #use tblastn if protein
    elif args.Type == 'prot':
        print "Using TBLASTN\n"
        TBLASTN(query, blast_file_out)

    #make svg drawing(s)
    print "Making svg image of blast results\n"
    Blast_draw(blast_file_out)

    #convert svg image to jpg
    print "Converting svg to jpg\n"
    for svg_file in glob.glob(str(blast_file_out) + "*.svg"):
        jpg_file = os.path.splitext(svg_file)[0]
        jpg_file = jpg_file + ".jpg"
        img_convert(svg_file, jpg_file)

    #stop here when only the BLAST stage was requested
    if args.S == 'Blast':
        return

    blast_in = str(blast_file_out) + ".blast"
    PHI_out = str(blast_file_out)
    print "Blast in:", blast_in + "  PHI out:", PHI_out
    print "Running PHI"
    PHI(blast_in, PHI_out, query)
    print "PHI finished!\n"

    filter_list = str(blast_file_out) + ".list"
    #print "filter list path:", filter_list
    filter_path = os.path.join(path, "parse_target_list.py")
    #print "filter script path:", filter_path

    #print args.E
    #when args.E is set: keep a copy of the PHI list as "<list>_ori.list",
    #run parse_target_list.py on the list, then draw the original copy
    if args.E == True:
        #print "E is true!"
        subp.call(["cp", filter_list, filter_list + "_ori.list"])
        subp.call(["python", filter_path, filter_list, str(args.W)])
        time.sleep(1)
        PHI_draw(filter_list + "_ori", Type)
        img_convert(filter_list + "_ori" + ".tcf_drawer.svg",
                    filter_list + "_ori" + ".tcf_drawer.pdf")

    #make svg image of PHI homologs
    print "Making svg image of homologs\n"
    PHI_draw(PHI_out, Type)

    #convert svg to pdf
    print "Coverting svg image to pdf\n"
    img_convert(
        str(PHI_out) + ".tcf_drawer.svg",
        str(PHI_out) + ".tcf_drawer.pdf")

    #get query length
    #(if the query file holds several records, the length of the last one wins)
    query_in = open(query, "r")
    query_len = 0
    for title, seq in fastaIO.FastaGeneralIterator(query_in):
        query_len = len(seq)
    query_in.close()

    #check that two or more copies were found and setup index checker to check for correct length flanks and for the correct locus sequence
    print "Building index dict"
    copies = 0
    index_dict = defaultdict(dict)
    dna_copies_in = open(PHI_out + ".dna", "r")
    for title, seq in fastaIO.FastaGeneralIterator(dna_copies_in):
        #print "Seq_name:", title, "\nSeq_len:", len(seq), "\n"
        #first and last 25 nt of each hit are the search keys used by
        #standardize_flanks (shorter when the hit is under 25 nt)
        index_dict[title]['left'] = seq[:25].upper()
        index_dict[title]['right'] = seq[-25:].upper()
    dna_copies_in.close()
    flank_file_path = PHI_out + ".flank"

    #load the whole genome into memory, keyed by the first word of each title
    genome_dict = {}
    genome_in = open(str(args.genome), "r")
    for title, seq in fastaIO.FastaGeneralIterator(genome_in):
        title2 = title.split(" ")[0]
        genome_dict[title2] = seq
    genome_in.close()

    #from here on flank_file_path is whatever standardize_flanks returned
    flank_file_path = standardize_flanks(flank_file_path, index_dict, args.p_f,
                                         genome_dict)

    #count the copies in the (possibly adjusted) flank file
    flank_copies_in = open(flank_file_path, "r")
    for title, seq in fastaIO.FastaGeneralIterator(flank_copies_in):
        copies += 1
    flank_copies_in.close()

    #stop here when only the PHI stage was requested
    if args.S == 'PHI':
        return

    if copies >= 2:
        #filter_list is rebound here from the ".list" path string to a list
        #of [input path, output path base] pairs that need length filtering;
        #in_list collects the files that go to MAFFT
        filter_list = []
        in_list = []
        if args.a == 'hits' or args.a == 'both':
            print "hits will be aligned"
            if args.f > 0:
                print "Filtering flagged for hits\n"
                #only nucleotide hits are filtered; protein hits are queued as-is
                if args.Type == 'nucl':
                    filter_list.append([
                        PHI_out + ".dna",
                        PHI_out + ".dna_filter-" + str(args.f)
                    ])
                else:
                    in_list.append(str(PHI_out) + ".aa")
            else:
                if args.Type == 'nucl':
                    in_list.append(str(PHI_out) + ".dna")
                else:
                    in_list.append(str(PHI_out) + ".aa")

        if args.a == 'flanks' or args.a == 'both':
            print "Flanks will be aligned"
            if args.f > 0:
                print "Filtering flagged for flanks\n"
                if args.Type == 'nucl':
                    filter_list.append([
                        flank_file_path,
                        PHI_out + ".flank_filter-" + str(args.f)
                    ])
                else:
                    in_list.append(flank_file_path)
            else:
                in_list.append(flank_file_path)
        print "Entries in filter list:  ", len(filter_list), "\n"
        if len(filter_list) != 0:
            #print "in_list =\n", in_list
            #split each file into "_under" (copy length <= query_len * args.f)
            #and "_over"; only the "_under" file is aligned
            for in_path, out_path_base in filter_list:
                in_file = open(in_path, "r")
                out_path = out_path_base + "_under"
                out_path2 = out_path_base + "_over"
                out_file = open(out_path, "w")
                out_file2 = open(out_path2, "w")
                for title, seq in fastaIO.FastaGeneralIterator(in_file):
                    #2 * p_f is subtracted from every sequence length,
                    #including the ".dna" hit file
                    copy_len = len(seq) - (int(args.p_f) * 2)
                    if copy_len <= (query_len * args.f):
                        print >> out_file, ">" + title + "\n" + seq
                    else:
                        print >> out_file2, ">" + title + "\n" + seq
                in_list.append(out_path)
                in_file.close()
                out_file.close()
                out_file2.close()

        #Run Mafft

        in_count = len(in_list)
        processed = 0
        for in_path in in_list:
            split_list = []
            in_file = open(in_path, "r")
            copies = 0
            for title, seq in fastaIO.FastaGeneralIterator(in_file):
                copies += 1
            in_file.close()
            print str(copies) + " copies in " + in_path, "\n"
            #files with 601 or more copies are shuffled and split
            #(second argument 350) and each part is aligned separately;
            #`copies` is replaced by the value shuffle_split returns
            if copies >= 601:
                print "Shuffling and splitting file for seperate alignments\n"
                split_list, copies = fastaIO.shuffle_split(in_path, 350)
                print "Length of split list in:", len(split_list)
            print "Length of split list out:", len(split_list)
            if len(split_list) > 0:
                #note: this loop rebinds the `path` parameter
                for path in split_list:
                    msa_out = path + ".msa"
                    print "Running Mafft"
                    if args.Type == 'nucl':
                        MAFFT_NT(path, msa_out)
                    else:
                        MAFFT_P(path, msa_out)
                    if not os.path.exists(msa_out):
                        print "MAFFT alignment failed and is most likely because of not enough RAM. Please rerun TARGeT on this query with increased RAM and/or fewer processors. TARGeT is now exiting."
                        exit(1)
                processed += 1
                #with args.S == 'MSA' no tree is built: return after the
                #last input file, otherwise move on to the next one
                if args.S == 'MSA':
                    if (in_count - processed) == 0:
                        return
                    else:
                        continue

            else:
                msa_out = in_path + ".msa"
                print "Running Mafft"
                if args.Type == 'nucl':
                    MAFFT_NT(in_path, msa_out)
                else:
                    MAFFT_P(in_path, msa_out)
                processed += 1

                if not os.path.exists(msa_out):
                    print "MAFFT alignment failed and is most likely because of not enough RAM. Please rerun TARGeT on this query with increased RAM and/or fewer processors. TARGeT is now exiting."
                    exit(1)
                #print "in_count - processed = ", in_count - processed, "\n"
                if args.S == 'MSA':
                    if (in_count - processed) == 0:
                        return
                    else:
                        continue

            #Run FastTreeMP

            #every file matching "<in_path>*.msa" gets a tree
            #NOTE(review): presumably the split parts share the in_path
            #prefix so this glob picks them up - confirm in shuffle_split
            msa_list = glob.glob(in_path + "*.msa")
            print "FastTreeMP will run on", len(msa_list), " MSA(s)\n"
            c = 0
            for msa_out in msa_list:
                print "Running FastTreeMP on MSA", c
                tree_out = msa_out + ".nw"
                print "Output tree path: ", tree_out, "\n\n"

                #Can only limit FastTree processor use through OMP_NUM_THREADS. Otherwise, it will use all processors available.
                current_env = os.environ.copy()
                #print "current_env before change:  ", current_env
                current_env['OMP_NUM_THREADS'] = str(args.P)
                #print "OMP_NUM_THREADS after change:  ", current_env['OMP_NUM_THREADS'], "\n\n"

                #"-nt" is passed for nucleotide runs and for flank
                #alignments of protein runs; otherwise it is omitted
                if args.Type == 'nucl' or (args.Type == 'prot'
                                           and ".flank" in msa_out):
                    proc = subp.Popen([
                        "FastTreeMP", "-nt", "-gamma", "-out", tree_out,
                        msa_out
                    ],
                                      env=current_env)
                    proc.wait()
                    print "\nFastTreeMP finished.\n"
                else:
                    proc = subp.Popen(
                        ["FastTreeMP", "-gamma", "-out", tree_out, msa_out],
                        env=current_env)
                    proc.wait()
                    print "\nFastTreeMP finished.\n"

                print "Converting output tree file to eps image\n"
                out = open(tree_out + ".eps",
                           "w")  #open output file for redirected stdout
                #print "Eps image out path: ", out

                #trees with more than 45 copies get an explicit image size
                #that grows with the copy count
                #NOTE(review): round() returns a float under Python 2, so
                #str(width) looks like "195.0" - confirm treebest accepts it
                if copies > 45:
                    height = copies * 13
                    width = round(height / 3)
                    print "Image height: ", height, "\twidth: ", width, "\n"
                    subp.call([
                        "treebest", "export", "-y",
                        str(height), "-x",
                        str(width), "-b", "4.5", "-f", "13", "-m", "40",
                        tree_out
                    ],
                              stdout=out)
                else:
                    subp.call(["treebest", "export", tree_out], stdout=out)
                out.close()  #close output file

                print "Coverting eps image to pdf\n"
                subp.call(["convert", tree_out + ".eps", tree_out + ".pdf"])
                c += 1
    else:
        print "Less than two copies found. Multiple alignment and tree building will not be performed.\n"
예제 #4
0
"""
    sys.exit(-1)


# Accumulator list; nothing is appended to it within the lines shown here.
bed_list = []

# Pulls four groups out of a FASTA title:
#   1 = text after "Sbjct:" up to "_Length"/" Length"/"|Length" (contig),
#   2 = start and 3 = end inside "Location:(start - end)",
#   4 = everything after "Direction:" (strand word).
pat = re.compile(
    r"Sbjct:(.+)[_| ]Length.+Location:\(([0-9]*)[_|\s]*-[_|\s]*([0-9]*)\).*Direction:(.+)"
)

# Walk the directory tree given as the first command-line argument and parse
# the titles of every "*.flank" FASTA file found.
for root, dirs, files in os.walk(sys.argv[1]):
    for filename in files:
        if fnmatch.fnmatch(filename, '*.flank'):
            fpath = os.path.join(root, filename)
            # NOTE(review): in_handle is not closed in the visible lines.
            in_handle = open(fpath, "r")
            for title, seq in fastaIO.FastaGeneralIterator(in_handle):
                m = pat.search(title)
                if m:
                    contig = m.group(1)
                    # keep only the part of the contig name before the
                    # first underscore
                    if "_" in contig:
                        contig = contig.split("_")[0]
                    start = m.group(2)
                    # positive starts are lowered by one (presumably to get
                    # a 0-based BED start - confirm); the value stays a str
                    # NOTE(review): the pattern allows an empty start, for
                    # which int() would raise ValueError
                    if int(start) > 0:
                        start = str(int(start) - 1)
                    end = m.group(3)
                    strand = m.group(4)
                    # translate the strand word into the "+"/"-" symbol;
                    # any other value is left unchanged
                    if strand == 'plus':
                        strand = "+"
                    elif strand == 'minus':
                        strand = "-"
예제 #5
0
import os.path
import fnmatch
import fastaIO

# Fixed header of every generated job script; the directory to process is
# appended directly after the trailing space of the last line.
top = '''#!/bin/bash
#PBS -l nodes=1:ppn=2,walltime=03:00:00
module load stajichlab
module load stajichlab-python
module load cd-hit

python /rhome/bradc/cd-hit_protein_dna2.py '''

# Running number used in the name of the next job script.
c = 1
for root, dirs, files in os.walk(sys.argv[1]):
    for filename in fnmatch.filter(files, '*_fix.dna'):
        fpath = os.path.join(root, filename)
        # Empty files contain nothing to cluster.
        if os.stat(fpath).st_size == 0:
            continue

        # A job is only worth submitting for two or more sequences.
        with open(fpath, "r") as in_handle:
            record_count = sum(
                1 for _ in fastaIO.FastaGeneralIterator(in_handle))
        if record_count < 2:
            continue

        # One script per qualifying file: "<prefix><n>.sh", where the prefix
        # is the second command-line argument.
        with open(sys.argv[2] + str(c) + ".sh", "w") as out_handle:
            out_handle.write(top + root + "\n")
        c += 1
예제 #6
0
            out_contents = []
            #go through rest of the group output folder contents, selecting files to be combined
            out_contents = os.listdir(i)
            #print "out content length:", len(out_contents)
            for files in out_contents:

                if fnmatch.fnmatch(files, '*.fasta'):
                    fpath = os.path.join(i, files)
                    in_file = open(fpath, "r")
                    for line in in_file:
                        line = line.strip()
                        fasta.append(line)
                    in_file.close()
                    in_file = open(fpath, "r")
                    for title, seq in fastaIO.FastaGeneralIterator(in_file):
                        #print "copy title:", title
                        element_info_dict['total_len'] += len(seq)
                        sanity_copy_count += 1
                        copy_dict[title] = 1
                    print "copy count now:", sanity_copy_count
                    in_file.close()
                    break

            out_contents = []
            #go through rest of the group output folder contents, selecting files to be combined
            out_contents = os.listdir(i)
            for files in out_contents:
                for keys in element_info_dict['tsd_len']:
                    if fnmatch.fnmatch(files,
                                       '*.insertion-site' + str(keys) + '.fa'):
예제 #7
0
    if c < 4:
        print ' '.join(["Unknown:", name, new_name])
    if name not in wanted_dict:
        wanted_dict[name] = new_name
    c += 1
no_match_in.close()
print "\n"
#for key in wanted_dict:
#    print key, "  ", wanted_dict[key]

if sys.argv[5] != "na":
    c = 0
    #import hit sequence file
    hit_in = open(sys.argv[5], "r")
    hit_track = OrderedDict()
    for title, seq in fastaIO.FastaGeneralIterator(hit_in):
        if arg_len == 9:
            if "plus_" in title:
                title = title.rsplit("_plus", 1)[0]
            elif "minus_" in title:
                title = title.rsplit("_minus", 1)[0]

        else:
            if "plus_" in title:
                title = title.rsplit("plus_", 1)[0] + "plus"
            elif "minus_" in title:
                title = title.rsplit("minus_", 1)[0] + "minus"

        if c < 4:
            print "hit title:", title
        if title in wanted_dict: