Пример #1
0
def glompX_blast_out(genomes, run_ref, blast_mode, r_root_dir, run_dirs,
                     run_id, fixed_dirs, blast_dtypes, references,
                     min_nt_match, min_nt_score, min_nt_idp, min_aa_match,
                     min_aa_score, min_aa_idp, capture_span, timestamp):
    """Collect Blast results and extract match contigs."""
    # load inputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    match_root = run_root+run_dirs['match_out_dir']+ref_n+"/"
    capture_root = run_root+run_dirs['capture_dir']+ref_n+"/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Collect Blast results @", timestamp, "\n\n"])
    run_ref.log(logstring)
    # collect results
    ref_hits = {}
    control_scores = []
    run_ref.log("Segs/Gs\t")
    run_ref.log("\t".join([genome['name'] for genome in genomes]))
    for seg in run_ref.segs:
        seg_n = seg['name']
        print "\t", seg_n, "...",
        run_ref.log("".join(["\n", seg_n]))
        blast_dir = run_root+run_dirs['blast_out_dir']+ref_n+"/"+seg_n+"/"
        capture_dir = capture_root+"/"+seg_n+"/"
        ensure_dir([blast_dir, capture_dir])
        ref_flag = True
        for genome in genomes:
            g_name = genome['name']
            print "|",
            # process
            if g_name not in ref_hits.keys():
                ref_hits[g_name] = {}
            matches_dir = match_root+g_name+"/"
            ensure_dir([matches_dir])
            blast_infile = blast_dir+g_name+"_out.txt"
            genome_ctg_dir = fixed_dirs['fas_contigs_dir']+g_name+"/"
            rec_array = read_array(blast_infile, blast_dtypes)
            if len(rec_array) > 0:  # take qualified hits
                p_cnt = 0
                n_cnt = 0
                if g_name in [ref['name'] for ref in references]:
                    copyfile(genome_ctg_dir+g_name+"_1.fas",
                             matches_dir+g_name+".fas")
                    if ref_flag:
                        # positive control TODO: better solution
                        control_scores.append(rec_array[0][11])
                        ref_flag = False
                for line in rec_array:
                    idp = line[2]
                    q_start, q_stop = line[8], line[9]
                    score = line[11]
                    length = abs(q_stop-q_start)
                    # check the blast mode to use the right thresholds
                    if blast_mode == 'n' or blast_mode == 'tx':
                        min_match = min_nt_match
                        min_score = min_nt_score
                        min_idp = min_nt_idp
                    elif blast_mode == 'tn':
                        min_match = min_aa_match
                        min_score = min_aa_score
                        min_idp = min_aa_idp
                    else: # default to nucleotide mode
                        min_match = min_nt_match
                        min_score = min_nt_score
                        min_idp = min_nt_idp
                    if length>min_match and score>min_score and idp>min_idp:
                        print "+",
                        p_cnt +=1
                        contig_id = line[1]
                        if contig_id not in ref_hits[g_name].keys():
                            ref_hits[g_name][contig_id] = {seg_n: score}
                        else:
                            ref_hits[g_name][contig_id][seg_n] = score
                        pattern = re.compile(r'('+contig_id+')\.fas')
                        for item in listdir(genome_ctg_dir):
                            match = re.match(pattern, item)
                            if match:
                                fas_file = matches_dir+match.group(1)+".fas"
                                if not path.exists(fas_file):
                                    copyfile(genome_ctg_dir+item, fas_file)
                        # context capture
                        capture_flag = False
                        while True:
                            try:
                                if int(seg_n) in run_ref.capture:
                                    capture_flag = True
                                else:
                                    break
                            except ValueError:
                                if seg_n in run_ref.capture:
                                    capture_flag = True
                                else:
                                    break
                            else:
                                break
                        if capture_flag:
                            # load the sequence
                            contig_file = matches_dir+contig_id+".fas"
                            contig_rec = load_fasta(contig_file)
                            # check orientation
                            if q_start < q_stop:
                                c_start = q_start-capture_span
                                c_stop = q_stop+capture_span
                            else:
                                c_start = q_stop-capture_span
                                c_stop = q_start+capture_span
                            print c_start, c_stop
                            # check limits
                            if c_start < 0:
                                c_start = 1
                            if c_stop > len(contig_rec.seq):
                                c_stop = len(contig_rec.seq)
                            # proceed
                            cxt_file = capture_dir+g_name+"_"+contig_id+".fas"
                            cxt_rec = SeqRecord(id=contig_id+"_"
                                                    +str(c_start)+"_"
                                                    +str(c_stop),
                                                seq=contig_rec.seq
                                                    [c_start:c_stop])
                            write_fasta(cxt_file, cxt_rec)
                    else:
                        print "-",
                        n_cnt +=1
                if n_cnt > 0:
                    logstring = "".join(["\t", str(p_cnt), " (",
                                         str(n_cnt), ")"])
                else:
                    logstring = "".join(["\t", str(p_cnt)])
                run_ref.log(logstring)
            else:
                print "-",
                run_ref.log("".join(["\t", "0"]))
        print ""
    return ref_hits, control_scores
Пример #2
0
def glomp_good_reads(dataset, bin_type):
    """Use matching reads lists to glomp good reads."""
    # Identify dataset
    nickname = dataset['nickname']
    # Identify input directory and filenames root
    match_dir_root = dirs['match_dir']+nickname+"/"
    # Signal process start
    print "-- Glomping matches against all references for", nickname, "--"
    print datetime.now()
    filter_files = []
    rescue_files = []
    for ref_nick in dataset['ref_nicks']:
        infile = match_dir_root+ref_nick+"/"+nickname+bin_type+"_match.npy"
        ref_type = [reference['type'] for reference in references if
                    reference['nickname'] is ref_nick]
        if ref_type[0] is 'filter':
            filter_files.append({'ref_nick': ref_nick, 'matches': infile})
        elif ref_type[0] is 'rescue':
            rescue_files.append({'ref_nick': ref_nick, 'matches': infile})
    # process filter files
    print "\tprocessing filter references", [filter_file['ref_nick'] for
                                             filter_file in filter_files]
    filter_arrays = []
    for file in filter_files:
        data_array = numpy.load(file['matches'])
        filter_arrays.append(data_array)
    array_index = 0
    filter_IRA = filter_arrays[array_index]
    filter_URA = filter_arrays[array_index]
    while array_index < len(filter_arrays)-1:
        array_index +=1
        filter_IRA = numpy.intersect1d(filter_IRA, filter_arrays[array_index])
        filter_URA = numpy.union1d(filter_URA, filter_arrays[array_index])
    print "\t\t"+str(len(filter_IRA)), "present in all filter references"
    print "\t\t"+str(len(filter_URA)), "matching reads all together (union)"
    # process rescue files
    print "\tprocessing rescue references", [rescue_file['ref_nick'] for
                                             rescue_file in rescue_files]
    rescue_arrays = []
    for file in rescue_files:
        data_array = numpy.load(file['matches'])
        rescue_arrays.append(data_array)
    array_index = 0
    rescue_IRA = rescue_arrays[array_index]
    rescue_URA = rescue_arrays[array_index]
    while array_index < len(rescue_arrays)-1:
        array_index +=1
        rescue_IRA = numpy.intersect1d(rescue_IRA, rescue_arrays[array_index])
        rescue_URA = numpy.union1d(rescue_URA, rescue_arrays[array_index])
    print "\t\t"+str(len(rescue_IRA)), "present in all rescue references"
    print "\t\t"+str(len(rescue_URA)), "matching reads all together (union)"
    # prepare for masking
    print "\tpreparing selection masks"
    q2a_file = dirs['mft_dir']+nickname+"/"+nickname+bin_type+"_track.txt"
    dtype = numpy.dtype([('title', 'S50'), ('bincode', 'S15')])
    pair_array = read_array(q2a_file, dtype, separator='\t')
    # create masking arrays
    mask = numpy.zeros(len(pair_array), bool)
    mask = numpy.invert(mask)
    # filter out baddies - flip to False
    for item in filter_URA:
        mask[item-1] = False    # False means reject
        if item%2==0:
            mask[item-2] = False    # even numbers are /2, flip previous
        else:
            mask[item] = False      # odd numbers are /1, flip next

    # rescue goodies - flip to True
    for item in rescue_URA:
        mask[item-1] = True     # True means accept
        if item%2==0:
            mask[item-2] = True     # even numbers are /2, flip previous
        else:
            mask[item] = True       # odd numbers are /1, flip next
    # save mask to file (where True means keep, False means reject)
    mask_file = dirs['trim_dir']+nickname+bin_type+"_mask.npy"
    numpy.save(mask_file, mask)
    # separate the two sets and write to file
    bin_accept_titles = pair_array[mask]
    accept_file = dirs['select_dir']+nickname+bin_type+"_accept.npy"
    numpy.save(accept_file, bin_accept_titles)
    inv_mask = numpy.invert(mask)
    bin_reject_titles = pair_array[inv_mask]
    reject_file = dirs['select_dir']+nickname+bin_type+"_reject.npy"
    numpy.save(reject_file, bin_reject_titles)
    print "\t\t"+str(len(bin_accept_titles)), "reads to accept"
    print "\t\t"+str(len(bin_reject_titles)), "reads to reject"
    print "-- Done! --"
    print datetime.now()
Пример #3
0
def glompX_blast_out(genomes, run_ref, blast_mode, r_root_dir, run_dirs,
                     run_id, fixed_dirs, blast_dtypes, references,
                     min_nt_match, min_nt_score, min_nt_idp, min_aa_match,
                     min_aa_score, min_aa_idp, capture_span, timestamp):
    """Collect Blast results and extract match contigs."""
    # load inputs
    ref_n = run_ref.name
    run_root = r_root_dir + run_id + "/"
    match_root = run_root + run_dirs['match_out_dir'] + ref_n + "/"
    capture_root = run_root + run_dirs['capture_dir'] + ref_n + "/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Collect Blast results @", timestamp, "\n\n"])
    run_ref.log(logstring)
    # collect results
    ref_hits = {}
    control_scores = []
    run_ref.log("Segs/Gs\t")
    run_ref.log("\t".join([genome['name'] for genome in genomes]))
    for seg in run_ref.segs:
        seg_n = seg['name']
        print "\t", seg_n, "...",
        run_ref.log("".join(["\n", seg_n]))
        blast_dir = run_root + run_dirs[
            'blast_out_dir'] + ref_n + "/" + seg_n + "/"
        capture_dir = capture_root + "/" + seg_n + "/"
        ensure_dir([blast_dir, capture_dir])
        ref_flag = True
        for genome in genomes:
            g_name = genome['name']
            print "|",
            # process
            if g_name not in ref_hits.keys():
                ref_hits[g_name] = {}
            matches_dir = match_root + g_name + "/"
            ensure_dir([matches_dir])
            blast_infile = blast_dir + g_name + "_out.txt"
            genome_ctg_dir = fixed_dirs['fas_contigs_dir'] + g_name + "/"
            rec_array = read_array(blast_infile, blast_dtypes)
            if len(rec_array) > 0:  # take qualified hits
                p_cnt = 0
                n_cnt = 0
                if g_name in [ref['name'] for ref in references]:
                    copyfile(genome_ctg_dir + g_name + "_1.fas",
                             matches_dir + g_name + ".fas")
                    if ref_flag:
                        # positive control TODO: better solution
                        control_scores.append(rec_array[0][11])
                        ref_flag = False
                for line in rec_array:
                    idp = line[2]
                    q_start, q_stop = line[8], line[9]
                    score = line[11]
                    length = abs(q_stop - q_start)
                    # check the blast mode to use the right thresholds
                    if blast_mode == 'n' or blast_mode == 'tx':
                        min_match = min_nt_match
                        min_score = min_nt_score
                        min_idp = min_nt_idp
                    elif blast_mode == 'tn':
                        min_match = min_aa_match
                        min_score = min_aa_score
                        min_idp = min_aa_idp
                    else:  # default to nucleotide mode
                        min_match = min_nt_match
                        min_score = min_nt_score
                        min_idp = min_nt_idp
                    if length > min_match and score > min_score and idp > min_idp:
                        print "+",
                        p_cnt += 1
                        contig_id = line[1]
                        if contig_id not in ref_hits[g_name].keys():
                            ref_hits[g_name][contig_id] = {seg_n: score}
                        else:
                            ref_hits[g_name][contig_id][seg_n] = score
                        pattern = re.compile(r'(' + contig_id + ')\.fas')
                        for item in listdir(genome_ctg_dir):
                            match = re.match(pattern, item)
                            if match:
                                fas_file = matches_dir + match.group(
                                    1) + ".fas"
                                if not path.exists(fas_file):
                                    copyfile(genome_ctg_dir + item, fas_file)
                        # context capture
                        capture_flag = False
                        while True:
                            try:
                                if int(seg_n) in run_ref.capture:
                                    capture_flag = True
                                else:
                                    break
                            except ValueError:
                                if seg_n in run_ref.capture:
                                    capture_flag = True
                                else:
                                    break
                            else:
                                break
                        if capture_flag:
                            # load the sequence
                            contig_file = matches_dir + contig_id + ".fas"
                            contig_rec = load_fasta(contig_file)
                            # check orientation
                            if q_start < q_stop:
                                c_start = q_start - capture_span
                                c_stop = q_stop + capture_span
                            else:
                                c_start = q_stop - capture_span
                                c_stop = q_start + capture_span
                            print c_start, c_stop
                            # check limits
                            if c_start < 0:
                                c_start = 1
                            if c_stop > len(contig_rec.seq):
                                c_stop = len(contig_rec.seq)
                            # proceed
                            cxt_file = capture_dir + g_name + "_" + contig_id + ".fas"
                            cxt_rec = SeqRecord(
                                id=contig_id + "_" + str(c_start) + "_" +
                                str(c_stop),
                                seq=contig_rec.seq[c_start:c_stop])
                            write_fasta(cxt_file, cxt_rec)
                    else:
                        print "-",
                        n_cnt += 1
                if n_cnt > 0:
                    logstring = "".join(
                        ["\t", str(p_cnt), " (",
                         str(n_cnt), ")"])
                else:
                    logstring = "".join(["\t", str(p_cnt)])
                run_ref.log(logstring)
            else:
                print "-",
                run_ref.log("".join(["\t", "0"]))
        print ""
    return ref_hits, control_scores
Пример #4
0
def glomp_blast_out(dataset, ref_nick):
    """Consolidate Blast output files."""
    # Identify the genome
    nickname = dataset['nickname']
    # Determine the input file root
    root_dir = dirs['blast_out_dir']+nickname+"/"+ref_nick+"/"
    file_root = root_dir+nickname
    # Signal process start
    print "-- Consolidating B_out for", nickname, "against", ref_nick, "--"
    print datetime.now()
    # Cycle through bin types
    series_index = 0
    averages = []       # for comparing series later
    binned_pos = []
    for bin_type in bin_types:
        index = 1
        bin_arrays =[]
        while os.path.isfile(file_root+bin_type+"_"+str(index)+"_blast.out"):
            infile = file_root+bin_type+"_"+str(index)+"_blast.out"
            rec_array = read_array(infile, blast_dtypes)
            if len(rec_array) > 0:
                bin_arrays.append(rec_array)
            index +=1
        print "\t\t"+str(len(bin_arrays)), "arrays for", \
        nickname+bin_type, "series"
        if len(bin_arrays) > 0:
            series = numpy.hstack(bin_arrays)
        else:
            series = []
        print "\t\t"+str(len(series)), "total records in", \
        nickname+bin_type, "series"
        # Save to file
        cons_outfile = file_root+bin_type+"_cons_out.npy"
        numpy.save(cons_outfile, series)
        # Evaluate match positions on reference
        positions = []
        match_read = []
        for row in series:
            # collect match read info while we're at it
            # use regex to extract query index
            query_pattern = re.compile(r'\w*_(\d*)')
            query_match = query_pattern.match(row[0])
            query_index = int(query_match.group(1))
            match_read.append(query_index)
            # use regex to extract ref coords
            ref_pattern = re.compile(r'\w*_\d*_(\d*)')
            ref_match = ref_pattern.match(row[1])
            ref_pos = int(ref_match.group(1))
            pos_scaled = ref_pos/cpm['size']   # adjust to db segment length
            positions.append(pos_scaled)
        # uniquify the match read array
        unique_matches = numpy.unique(match_read)
        print "\t"+str(len(unique_matches)), "unique matches for", bin_type
        # write to file for future use
        match_dir_root = dirs['match_dir']+nickname+"/"+ref_nick+"/"
        ensure_dir(match_dir_root)
        match_outfile = match_dir_root+nickname+bin_type+"_match.npy"
        numpy.save(match_outfile, unique_matches)
        # now count ocurrences per position
        pos_np = numpy.array(positions)
        binned = numpy.bincount(pos_np)
        binned_pos.append(binned)
        pos_count_average = numpy.average(binned)
        averages.append((pos_count_average, series_index))
        series_index +=1
    # compare series
    averages.sort()
    averages.reverse()
    order_indices = []
    for pair in averages:
        order_indices.append(pair[1])
    # identify reference
    ref_name = [reference['full_name'] for reference in references if
                reference['nickname'] is ref_nick]
    # prep directory & file
    fig_root = dirs['reports_dir']+"match_figs/"
    fig_file = fig_root+nickname+"_"+ref_nick+".png"
    ensure_dir(fig_root)
    # generate a figure
    pylot.autoscale(enable=True, axis='both', tight=True)
    pylot.xlabel('Position on the chromosome (/'+str(cpm['size'])+')')
    pylot.ylabel('Number of matches (includes multiples)')
    pylot.title(nickname+' matches to '+ref_name)
    pylot.grid(True)
    for index in order_indices:
        label_root = nickname+bin_types[index]
        label_str = label_root+" ("+str(numpy.sum(binned_pos[index]))+")"
        pylot.plot(binned_pos[index], label=label_str)
    pylot.legend(loc=1)
    pylot.savefig(fig_file, dpi=None, facecolor='w', edgecolor='w',
                  orientation='portrait', papertype=None, format=None)
    pylot.clf()
    print "\t"+str(series_index), "series consolidated and parsed"
    print "-- Done, see plot --"
    print datetime.now()
    return "OK"