예제 #1
0
def main(input_dir, canonical_file, output_prefix, hamm_dist):
    """Collect, per input file, the sequences similar to each canonical one.

    Every ``*_cleaved.txt`` file found in ``input_dir`` is read and compared
    with each sequence read from ``canonical_file``.  One CSV row is written
    for every (file, canonical sequence) pair that has at least one match:
    the file name, the canonical sequence, then the matching sequences.

    Args:
        input_dir: Directory searched for ``*_cleaved.txt`` files.
        canonical_file: Path handed to ``seq_IO.read_sequences`` to obtain the
            canonical sequences.
        output_prefix: Prefix prepended verbatim to the output file name.
        hamm_dist: Integer maximum Hamming distance for a match.  The special
            value ``-1`` selects ``chem_sim`` as the similarity test instead.
    """
    list_seq_files = glob.glob(os.path.join(input_dir, "*_cleaved.txt"))
    canonical_sequences = seq_IO.read_sequences(canonical_file)

    dict_sequences = {}
    for filename in list_seq_files:
        sequences = seq_IO.read_sequences(filename)
        for can in canonical_sequences:
            if hamm_dist == -1:
                seq_sim = [seq for seq in sequences if chem_sim(seq, can)]
            else:
                seq_sim = [
                    seq for seq in sequences
                    if conv.hamdist(seq, can) <= hamm_dist
                ]
            # Pairs without any match are left out of the report entirely.
            if seq_sim:
                dict_sequences[(filename, can)] = seq_sim

    outfile_canon = '%scanonical_sim_cleaved%d.csv' % (output_prefix,
                                                       hamm_dist)

    # The context manager guarantees the report is flushed and closed, even
    # if one of the writes fails.
    with open(outfile_canon, "w") as canon_out:
        for (filename, can), seqs in dict_sequences.items():
            canon_out.write(
                filename + "," + can + "," + ','.join(seqs) + "\n")
예제 #2
0
def main(data_file, output_prefix, degree_file, width, height):
    """Plot the mean "Frac" value per sequence as a histogram and by degree.

    Two figures are saved through ``pconv.save_fig``: a histogram (tag
    ``"hist"``) of the mean ``"Frac"`` of every sequence, and a bar chart
    (tag ``"bar"``) of the mean and standard deviation of those values
    grouped by each sequence's ``"Degree"`` entry.

    Args:
        data_file: Path read with ``header=True, list_vals=True``; each entry
            is looked up by its ``"Frac"`` key.
        output_prefix: Passed through to ``pconv.save_fig``.
        degree_file: Path read with ``header=True``; each entry is looked up
            by its ``"Degree"`` key.
        width: Figure width passed to ``pconv.save_fig``.
        height: Figure height passed to ``pconv.save_fig``.
    """
    sequences = seq_IO.read_sequences(data_file,
                                      additional_params=True,
                                      header=True,
                                      list_vals=True)
    seq_degree = seq_IO.read_sequences(degree_file,
                                       additional_params=True,
                                       header=True)

    # One pass: the per-sequence mean feeds both the flat histogram data and
    # the per-degree grouping.
    data = []
    degree_frac = defaultdict(list)
    for seq, seq_dict in sequences.items():
        mean_frac = np.mean(seq_dict["Frac"])
        degree_frac[seq_degree[seq]['Degree']].append(mean_frac)
        data.append(mean_frac)

    # values() and keys() walk the unmodified dict in the same order, so the
    # bars stay aligned with the tick labels passed below.
    degree_frac_avg = []
    degree_frac_std = []
    for list_fracs in degree_frac.values():
        degree_frac_avg.append(np.mean(list_fracs))
        degree_frac_std.append(np.std(list_fracs))

    fig, axarr = pconv.create_ax(1, 1, shx=False, shy=False)
    hist.draw_actual_plot(axarr[0, 0],
                          data,
                          "",
                          "",
                          normed=False,
                          nbins=30,
                          edgecolor=None,
                          log=False)
    pconv.save_fig(fig, output_prefix, "hist", width, height,
                   tight=True, size=10)

    fig2, axarr2 = pconv.create_ax(1, 1, shx=True, shy=True)
    bar.draw_actual_plot(axarr2[0, 0],
                         degree_frac_avg,
                         'g',
                         "",
                         "Degree",
                         "Fraction Shortest Path Uncleaved",
                         tick_label=degree_frac.keys(),
                         yerr=degree_frac_std)
    pconv.save_fig(fig2, output_prefix, "bar", width, height,
                   tight=True, size=10)
예제 #3
0
def main(sequence_list, canonical_seq_list, known_cleaved):
    """Select extreme sequences closest to / farthest from the canonical set.

    Rows of ``sequence_list`` are filtered into a cleaved group (status
    ``'CLEAVED'``, third column > 2.0) and an uncleaved group (status
    ``'UNCLEAVED'``, third column < -2.0), skipping any sequence listed in
    ``known_cleaved``.  Each kept row is extended with its minimum Hamming
    distance to the canonical sequences.  For each group the four rows with
    the lowest and the four with the highest distance are written to
    ``<sequence_list without extension>_selected.csv``.

    Args:
        sequence_list: Path of the rows to select from; rows are indexed as
            (sequence, status, value).
        canonical_seq_list: Path of the canonical sequences.
        known_cleaved: Path of sequences to exclude from both groups.

    Raises:
        ValueError: From ``max``/``min`` when a group is empty, or when there
            are no canonical sequences to measure a distance against.
    """
    sequences = seq_IO.read_sequences(sequence_list, additional_params=True)
    canonical_seqs = seq_IO.read_sequences(canonical_seq_list)
    # A set makes the exclusion test O(1) per row instead of a list scan.
    # assumes the entries are hashable sequence strings - TODO confirm
    known_cleaved_set = set(seq_IO.read_sequences(known_cleaved))

    base = os.path.splitext(sequence_list)[0]

    def with_min_dist(row):
        """Return (sequence, status, value, min distance to a canonical)."""
        closest = min([conv.hamdist(row[0], c) for c in canonical_seqs])
        return (row[0], row[1], row[2], closest)

    cleaved_seqs = [
        with_min_dist(s) for s in sequences
        if s[1] == 'CLEAVED' and s[2] > 2.0 and s[0] not in known_cleaved_set
    ]
    uncleaved_seqs = [
        with_min_dist(s) for s in sequences
        if s[1] == 'UNCLEAVED' and s[2] < -2.0
        and s[0] not in known_cleaved_set
    ]

    # Diagnostics: most extreme value kept in each group.
    print(max([s[2] for s in cleaved_seqs]))
    print(min([s[2] for s in uncleaved_seqs]))

    # Ties on distance are broken by the most extreme value: largest for
    # cleaved rows, smallest for uncleaved rows.
    sections = [
        ("Cleaved_seqs_low_hamming_distance\n",
         sorted(cleaved_seqs, key=lambda x: (x[3], -x[2]))[0:4]),
        ("\nCleaved_seqs_high_hamming_distance\n",
         sorted(cleaved_seqs, key=lambda x: (-x[3], -x[2]))[0:4]),
        ("\nUncleaved_seqs_low_hamming_distance\n",
         sorted(uncleaved_seqs, key=lambda x: (x[3], x[2]))[0:4]),
        ("\nUncleaved_seqs_high_hamming_distance\n",
         sorted(uncleaved_seqs, key=lambda x: (-x[3], x[2]))[0:4]),
    ]

    outfile = '%s_selected.csv' % (base)

    # The context manager closes (and flushes) the report even on error.
    with open(outfile, "w") as out:
        for heading, rows in sections:
            out.write(heading)
            out.write("\n".join([",".join(map(str, s)) for s in rows]))
예제 #4
0
def main(sequence_list, trained_cleaved, trained_uncleaved):
    """Annotate classified sequences with their distance to the training set.

    Rows whose status is ``'CLEAVED'`` get the minimum Hamming distance to
    the sequences in ``trained_cleaved``; rows whose status is
    ``'UNCLEAVED'`` get the minimum distance to those in
    ``trained_uncleaved``.  Both groups are written to
    ``<sequence_list without extension>_selected_hamm.csv``.

    Args:
        sequence_list: Path of the rows to annotate; rows are indexed as
            (sequence, status, ...).
        trained_cleaved: Path of the cleaved training sequences.
        trained_uncleaved: Path of the uncleaved training sequences.

    Raises:
        ValueError: From ``min`` when a row must be measured against an empty
            training list.
    """
    sequences = seq_IO.read_sequences(sequence_list, additional_params=True)
    trained_cleaved_list = seq_IO.read_sequences(trained_cleaved)
    trained_uncleaved_list = seq_IO.read_sequences(trained_uncleaved)

    base = os.path.splitext(sequence_list)[0]

    cleaved_seqs = [
        (s[0], s[1],
         min([conv.hamdist(s[0], c) for c in trained_cleaved_list]))
        for s in sequences if s[1] == 'CLEAVED'
    ]
    uncleaved_seqs = [
        (s[0], s[1],
         min([conv.hamdist(s[0], c) for c in trained_uncleaved_list]))
        for s in sequences if s[1] == 'UNCLEAVED'
    ]

    outfile = '%s_selected_hamm.csv' % (base)

    # The context manager closes (and flushes) the report even on error.
    with open(outfile, "w") as out:
        out.write("Cleaved_seqs\n")
        out.write("\n".join([",".join(map(str, s)) for s in cleaved_seqs]))
        out.write("\nUncleaved_seqs\n")
        out.write("\n".join([",".join(map(str, s)) for s in uncleaved_seqs]))
예제 #5
0
def main(input_file, canonical_file, output_prefix, hamm_dist, less_than,
         more_than):
    """Select sequences by Hamming distance to the canonical sequences.

    Exactly one of ``less_than`` / ``more_than`` must be truthy.  The
    selection set is threaded through one helper call per canonical
    sequence and finally written, one sequence per line, to
    ``<output_prefix>canonical_sim_cleaved_<less|more>_<hamm_dist>.csv``.

    Args:
        input_file: Path of the candidate sequences.
        canonical_file: Path of the canonical sequences.
        output_prefix: Prefix prepended verbatim to the output file name.
        hamm_dist: Integer distance threshold handed to the helpers.
        less_than: Select with ``find_seqs_less_than``.
        more_than: Select with ``find_seqs_more_than_first`` for the first
            canonical sequence and ``find_seqs_more_than`` for the rest.

    Raises:
        ValueError: If both or neither of ``less_than`` and ``more_than``
            are set.
    """
    # The flags do not depend on the data, so validate them once up front.
    # Doing it inside the loop would skip the check for an empty canonical
    # list and silently write an empty result.
    if less_than and more_than:
        raise ValueError('Cannot set both --less_than and --more_than')
    if not less_than and not more_than:
        raise ValueError(
            'Cannot have both --less_than and --more_than as false')

    canonical_sequences = seq_IO.read_sequences(canonical_file)
    sequences = seq_IO.read_sequences(input_file)

    set_sequences = set()
    for ind, can in enumerate(canonical_sequences):
        if less_than:
            set_sequences = find_seqs_less_than(can, sequences, set_sequences,
                                                hamm_dist)
        elif ind == 0:
            # The first canonical sequence uses its own helper.
            set_sequences = find_seqs_more_than_first(can, sequences,
                                                      set_sequences, hamm_dist)
        else:
            set_sequences = find_seqs_more_than(can, sequences, set_sequences,
                                                hamm_dist)

    less_v_more = "less" if less_than else "more"

    outfile_canon = '%scanonical_sim_cleaved_%s_%d.csv' % (
        output_prefix, less_v_more, hamm_dist)

    with open(outfile_canon, "w") as canon_out:
        canon_out.write('\n'.join(set_sequences))
예제 #6
0
def main(list_sequence_names, output_prefix, canonical_file):
    """Write one epistasis report per canonical sequence.

    The (sequence, fitness) rows of every input file are pooled.  For each
    canonical sequence a file ``<output_prefix>_<canonical>_epi.csv`` is
    written containing: the number of rows at Hamming distance 2 from the
    canonical sequence, one line per classified double mutant, and one line
    per sequence with its ``calc_epi`` value and the fitness values used.

    Args:
        list_sequence_names: Iterable of ``[filename, label]`` pairs; only
            the file name is used here.
        output_prefix: Prefix of every output file name.
        canonical_file: Path of the canonical sequences.
    """
    canonical_seqs = seq_IO.read_sequences(canonical_file)

    # Flat list of (sequence, fitness) rows pooled from every input file.
    extended_list_sequences = []
    for [filename, label] in list_sequence_names:
        sequences = seq_IO.read_sequences(filename,
                                          additional_params=True,
                                          ind_type={1: float})
        extended_list_sequences.extend(sequences)

    dict_sequences = {
        seq: fitness for (seq, fitness) in extended_list_sequences
    }

    for canonical_seq in canonical_seqs:
        # Double mutants bucketed by how many intermediates are functional,
        # kept separately for functional and non-functional end states.
        mut_func = {
            "Both_Functional": [],
            "Both_Nonfunctional": [],
            "One_Functional": []
        }
        mut_nonfunc = {
            "Both_Functional": [],
            "Both_Nonfunctional": [],
            "One_Functional": []
        }

        outfile_epi = '%s_%s_epi.csv' % (output_prefix, canonical_seq)
        # The context manager closes the report even if a helper raises.
        with open(outfile_epi, "w") as epi_out:
            print(canonical_seq)
            epi = {}
            double_mut = [
                row for row in extended_list_sequences
                if conv.hamdist(canonical_seq, row[0]) == 2
            ]
            # A row belongs to double_mut exactly when its sequence is at
            # distance 2, so a set of sequences replaces the per-row scan
            # of the list.
            double_mut_seqs = set(row[0] for row in double_mut)

            for seq_fit in extended_list_sequences:
                seq = seq_fit[0]
                fit = seq_fit[1]
                # A fitness of exactly 1000 marks a functional end state.
                mut_dict = mut_func if fit == 1000 else mut_nonfunc
                list_fit = get_inter_fitness(canonical_seq, seq,
                                             dict_sequences)
                if len(list_fit) <= 1:
                    continue
                # NOTE(review): all() also rejects intermediates whose
                # fitness is 0, which would make the "Both_Nonfunctional"
                # branch below unreachable - confirm what
                # get_inter_fitness returns for missing intermediates.
                if not all(list_fit):
                    continue
                if seq in double_mut_seqs:
                    sum_fit = sum(list_fit)
                    print(sum_fit)
                    if sum_fit == 2000:
                        mut_dict["Both_Functional"].append(
                            (canonical_seq, seq))
                    elif sum_fit == 0:
                        mut_dict["Both_Nonfunctional"].append(
                            (canonical_seq, seq))
                    elif sum_fit == 1000:
                        mut_dict["One_Functional"].append(
                            (canonical_seq, seq))
                epi[seq] = (calc_epi(list_fit, fit), list_fit + [fit])

            epi_out.write("Total Double Mutants,%s\n" % (len(double_mut)))

            for label, list_muts in mut_func.items():
                for (can, seq) in list_muts:
                    epi_out.write(
                        "End Functional,%s,%s,%s\n" % (label, can, seq))
            # These rows used to carry the copy-pasted "End Functional" tag,
            # making them indistinguishable from the functional ones.
            for label, list_muts in mut_nonfunc.items():
                for (can, seq) in list_muts:
                    epi_out.write(
                        "End Nonfunctional,%s,%s,%s\n" % (label, can, seq))

            # Distinct loop names so the comprehension cannot rebind `epi`
            # (Python 2 list comprehensions leak their loop variables).
            epi_out.write("\n".join([
                "{0},{1},{2}".format(epi_seq, epi_val,
                                     ",".join([str(f) for f in fits]))
                for epi_seq, (epi_val, fits) in epi.items()
            ]))
예제 #7
0
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file):
    """Export a sequence network as node and edge CSV files.

    Every pair of pooled sequences at Hamming distance ``hamming_dist`` is an
    edge.  For each canonical sequence an edge file
    ``<output_prefix>_<canonical>_edges.csv`` is written in which each edge
    points from the endpoint closer to that canonical sequence to the other
    one, weighted by the ratio of their fitness values.  A single node file
    ``<output_prefix>_nodes.csv`` lists every sequence with its label,
    fitness and whether it lies within distance 2 of any canonical sequence.

    Args:
        list_sequence_names: Iterable of ``[filename, label]`` pairs.
        hamming_dist: Hamming distance that defines an edge.
        output_prefix: Prefix of every output file name.
        canonical_file: Path of the canonical sequences.
    """
    list_sequences = []  # one list of node rows per label
    extended_list_sequences = []  # every node row, flattened
    labels = []  # labels, parallel to list_sequences

    canonical_seqs = seq_IO.read_sequences(canonical_file)

    for [filename, label] in list_sequence_names:
        sequences = seq_IO.read_sequences(filename,
                                          additional_params=True,
                                          ind_type={1: float})
        # Node row: (sequence, fitness, within distance 2 of a canonical).
        new_seqs = [(seq, fitness,
                     min([conv.hamdist(seq, can)
                          for can in canonical_seqs]) <= 2)
                    for seq, fitness in sequences]
        list_sequences.append(new_seqs)
        extended_list_sequences.extend(new_seqs)
        labels.append(label)

    outfile_nodes = '%s_nodes.csv' % (output_prefix)

    edges = [
        (seq2, seq)
        for seq, seq2 in itertools.combinations(extended_list_sequences, 2)
        if conv.hamdist(seq2[0], seq[0]) == hamming_dist
    ]

    for canonical_seq in canonical_seqs:
        outfile_edges = '%s_%s_edges.csv' % (output_prefix, canonical_seq)
        # The context manager closes the file even if a write fails.
        with open(outfile_edges, "w") as edges_out:
            edges_out.write("Source,Target,Weight\n")
            print(canonical_seq)
            for (seq1, fit1, can1), (seq2, fit2, can2) in edges:
                dist_seq1 = conv.hamdist(canonical_seq, seq1)
                dist_seq2 = conv.hamdist(canonical_seq, seq2)
                # "lower" is the endpoint strictly closer to the canonical
                # sequence; on a tie the second endpoint is "lower".
                if dist_seq1 < dist_seq2:
                    seq_lower, fit_lower = seq1, fit1
                    seq_upper, fit_upper = seq2, fit2
                else:
                    seq_lower, fit_lower = seq2, fit2
                    seq_upper, fit_upper = seq1, fit1
                # Avoid dividing by a zero or negative fitness.
                fit_upper = fit_upper if fit_upper > 0 else 0.001
                # NOTE(review): the original author questioned whether this
                # edge direction is the intended one.
                edges_out.write("{0},{1},{2}\n".format(
                    seq_lower, seq_upper, fit_lower / float(fit_upper)))

    # A set keeps the "already written" test O(1) per node.
    already_written_nodes = set()

    with open(outfile_nodes, "w") as nodes_out:
        nodes_out.write("Id,Label,Type,Fitness,Canonical\n")
        for seqs, label in zip(list_sequences, labels):
            nodes_out.write("\n".join(
                "{0},{0},{1},{2},{3}".format(x, label, fitness, can)
                for (x, fitness, can) in seqs
                if x not in already_written_nodes))
            # Updated only after the group is written, so a sequence is
            # filtered against earlier groups but not against its own.
            already_written_nodes.update(s[0] for s in seqs)
            nodes_out.write("\n")
예제 #8
0
def main(list_sequence_names, canonical_list, output_prefix, func_labels,
         unfunc_labels):
    """Plot the functional fraction of variants per number of mutations.

    For each canonical sequence and each Hamming distance 1..5, the
    sequences at that distance are counted per label.  The plotted value is
    functional / (functional + non-functional), where a label counts as
    functional if it is in ``func_labels`` and as non-functional if it is in
    ``unfunc_labels``.  One series per canonical sequence is drawn and the
    figure is saved through ``pconv.save_fig``.

    Args:
        list_sequence_names: Iterable of ``[filename, label]`` pairs.
        canonical_list: Path of the canonical sequences.
        output_prefix: Passed through to ``pconv.save_fig``.
        func_labels: Container of labels counted as functional.
        unfunc_labels: Container of labels counted as non-functional.
    """
    canonical_list_seq = seq_IO.read_sequences(canonical_list)

    # The input files do not depend on the canonical sequence, so read each
    # one once instead of once per canonical sequence.
    labelled_sequences = [(label, list(seq_IO.read_sequences(filename)))
                          for filename, label in list_sequence_names]

    mutation_counts = range(1, 6)
    series = []

    for canonical in canonical_list_seq:

        dict_sequences = {}
        for label, sequences in labelled_sequences:
            distances = [conv.hamdist(seq, canonical) for seq in sequences]
            # Count the sequences at each distance.  Summing the distances
            # themselves gave i * count, which only cancelled out because
            # every term of the ratio below carried the same factor i.
            dict_sequences[label] = {
                i: distances.count(i) for i in mutation_counts
            }

        x = []
        y = []
        for i in mutation_counts:
            func = 0.0
            unfunc = 0.0
            for label, dict_counts in dict_sequences.items():
                if label in func_labels:
                    func = func + dict_counts[i]
                elif label in unfunc_labels:
                    unfunc = unfunc + dict_counts[i]
            # NOTE(review): distances with functional variants but no
            # non-functional ones (fraction 1.0) are skipped too - confirm
            # this is intended rather than a plain division-by-zero guard.
            if unfunc != 0:
                x.append(i)
                y.append(func / (func + unfunc))
        print(x)
        print(y)
        series.append([x, y, canonical])

    fig, ax = pconv.create_ax(1, 1)

    scatterplot.plot_series(ax[0, 0],
                            series,
                            title="",
                            x_axis="# of Mutations",
                            y_axis="Fraction of Variants that are Functional",
                            alpha=1.0,
                            connect_dots=True,
                            size=30,
                            edgecolors='k')
    ax[0, 0].set_xlim(xmin=1, xmax=5)
    ax[0, 0].set_xticks(mutation_counts)
    # NOTE(review): the figure tag uses whichever canonical sequence was
    # processed last, and raises NameError when the canonical list is empty.
    pconv.save_fig(fig,
                   output_prefix,
                   canonical + "_fraction_func_mutant",
                   6,
                   6,
                   size=15)
예제 #9
0
def main(json_file, output_prefix, novel_seqs_file, canonical_file):
    """Report the uncleaved fraction along paths from novel to canonical.

    A node-link graph is loaded from ``json_file``.  For every novel
    sequence, a shortest path is computed to each canonical sequence that
    ties for the minimum Hamming distance, and the fraction of intermediate
    path nodes whose ``"status"`` attribute is ``"UNCLEAVED"`` is recorded.
    Results are written as ``novel,canonical,fraction...`` lines to
    ``<output_prefix>_frac_paths_<novel file base>_<canonical file base>.txt``.

    Args:
        json_file: Path of the JSON node-link graph; nodes carry
            ``"sequence"`` and ``"status"`` attributes.
        output_prefix: Prefix of the output file name.
        novel_seqs_file: Path of the novel sequences.
        canonical_file: Path of the canonical sequences.

    Raises:
        KeyError: If a novel or canonical sequence is not a node of the
            graph.
        IndexError: If there are no canonical sequences.
    """
    print("Started Script: {0}".format(datetime.datetime.now()))

    with open(json_file) as data_file:
        data = json.load(data_file)

    G = json_graph.node_link_graph(data, directed=False)

    print("Finished Reading in Graph: {0}".format(datetime.datetime.now()))

    id_seq = networkx.get_node_attributes(G, "sequence")
    id_status = networkx.get_node_attributes(G, "status")
    # Inverse lookup: sequence -> node id.
    seq_id = {seq: node_id for node_id, seq in id_seq.items()}

    print("Created inverse lookup table: {0}".format(datetime.datetime.now()))

    novel_seqs = seq_IO.read_sequences(novel_seqs_file)
    canonical_seqs = seq_IO.read_sequences(canonical_file)

    novel_fracs = {}

    print("Ready to enter loop: {0}".format(datetime.datetime.now()))

    for n in novel_seqs:
        novel_fracs[n] = {}
        hamm_dist = sorted([(conv.hamdist(n, c), c) for c in canonical_seqs])
        min_hamm_dist = hamm_dist[0][0]
        print("Found hamming distances: {0}".format(datetime.datetime.now()))

        for hamm, c in hamm_dist:
            # Only analyse the canonical sequences at the minimum distance.
            if hamm != min_hamm_dist:
                continue
            novel_fracs[n][c] = []
            # A single shortest path is used per pair.
            paths = [networkx.shortest_path(G, seq_id[n], seq_id[c])]

            for path in paths:
                inter_nodes = path[1:-1]
                if inter_nodes:
                    uncleaved = sum(1 for node_id in inter_nodes
                                    if id_status[node_id] == "UNCLEAVED")
                    frac = float(uncleaved) / len(inter_nodes)
                else:
                    # Directly connected (or identical) endpoints have no
                    # intermediate nodes; the fraction is undefined, so
                    # record NaN instead of dividing by zero.
                    frac = float("nan")
                novel_fracs[n][c].append(frac)

    base_n_file = os.path.basename(os.path.splitext(novel_seqs_file)[0])
    base_c_file = os.path.basename(os.path.splitext(canonical_file)[0])

    out_name = "{0}_frac_paths_{1}_{2}.txt".format(output_prefix, base_n_file,
                                                   base_c_file)
    with open(out_name, 'w') as o:
        for n, c_dict in novel_fracs.items():
            for c, fracs_list in c_dict.items():
                o.write("{0},{1},".format(n, c))
                o.write(",".join(map(str, fracs_list)))
                o.write("\n")

    print("Output paths: {0}".format(datetime.datetime.now()))
예제 #10
0
def main(sequence_list, canonical_seq_list, known_cleaved):
    """Select extreme sequences closest to / farthest from the canonical set.

    Rows of ``sequence_list`` are filtered into a cleaved group (status
    ``'CLEAVED'``, third column > 2.0) and an uncleaved group (status
    ``'UNCLEAVED'``, third column < -2.0), skipping any sequence listed in
    ``known_cleaved``.  Each kept row is extended with its minimum Hamming
    distance to the canonical sequences.  For each group the four rows with
    the lowest and the four with the highest distance are written to
    ``<sequence_list without extension>_selected.csv``.

    Args:
        sequence_list: Path of the rows to select from; rows are indexed as
            (sequence, status, value).
        canonical_seq_list: Path of the canonical sequences.
        known_cleaved: Path of sequences to exclude from both groups.

    Raises:
        ValueError: From ``max``/``min`` when a group is empty, or when there
            are no canonical sequences to measure a distance against.
    """
    sequences = seq_IO.read_sequences(sequence_list, additional_params=True)
    canonical_seqs = seq_IO.read_sequences(canonical_seq_list)
    # A set makes the exclusion test O(1) per row instead of a list scan.
    # assumes the entries are hashable sequence strings - TODO confirm
    known_cleaved_set = set(seq_IO.read_sequences(known_cleaved))

    base = os.path.splitext(sequence_list)[0]

    def with_min_dist(row):
        """Return (sequence, status, value, min distance to a canonical)."""
        closest = min([conv.hamdist(row[0], c) for c in canonical_seqs])
        return (row[0], row[1], row[2], closest)

    cleaved_seqs = [
        with_min_dist(s) for s in sequences
        if s[1] == 'CLEAVED' and s[2] > 2.0 and s[0] not in known_cleaved_set
    ]
    uncleaved_seqs = [
        with_min_dist(s) for s in sequences
        if s[1] == 'UNCLEAVED' and s[2] < -2.0
        and s[0] not in known_cleaved_set
    ]

    # Diagnostics: most extreme value kept in each group.
    print(max([s[2] for s in cleaved_seqs]))
    print(min([s[2] for s in uncleaved_seqs]))

    # Ties on distance are broken by the most extreme value: largest for
    # cleaved rows, smallest for uncleaved rows.
    cleaved_seqs_low_ham = sorted(cleaved_seqs,
                                  key=lambda x: (x[3], -x[2]))[0:4]
    cleaved_seqs_hi_ham = sorted(cleaved_seqs,
                                 key=lambda x: (-x[3], -x[2]))[0:4]
    uncleaved_seqs_low_ham = sorted(uncleaved_seqs,
                                    key=lambda x: (x[3], x[2]))[0:4]
    uncleaved_seqs_hi_ham = sorted(uncleaved_seqs,
                                   key=lambda x: (-x[3], x[2]))[0:4]

    def as_csv(rows):
        """Render rows as newline-separated CSV lines (no trailing newline)."""
        return "\n".join([",".join(map(str, s)) for s in rows])

    outfile = '%s_selected.csv' % (base)

    # The context manager closes (and flushes) the report even on error.
    with open(outfile, "w") as out:
        out.write("Cleaved_seqs_low_hamming_distance\n")
        out.write(as_csv(cleaved_seqs_low_ham))
        out.write("\nCleaved_seqs_high_hamming_distance\n")
        out.write(as_csv(cleaved_seqs_hi_ham))
        out.write("\nUncleaved_seqs_low_hamming_distance\n")
        out.write(as_csv(uncleaved_seqs_low_ham))
        out.write("\nUncleaved_seqs_high_hamming_distance\n")
        out.write(as_csv(uncleaved_seqs_hi_ham))
예제 #11
0
def main(sequence_ratio_file, width, height, pattern, legend):
    """Draw a bar chart of the per-shell median with standard deviation.

    Column 0 of every row read from ``sequence_ratio_file`` is skipped; each
    remaining column is treated as one shell.  The bar height is the median
    of the column and the error bar its standard deviation.  The figure is
    saved through ``pconv.save_fig`` with the tag ``"plot"``.

    Args:
        sequence_ratio_file: Path of the input rows; also passed to
            ``pconv.save_fig`` as the output prefix.
        width: Figure width passed to ``pconv.save_fig``.
        height: Figure height passed to ``pconv.save_fig``.
        pattern: Not used by this function.
        legend: Not used by this function.
    """
    sequences = seq_IO.read_sequences(sequence_ratio_file,
                                      additional_params=True)

    # Transpose: one list of values per shell column (columns 1..end).
    n_columns = len(sequences[0])
    shell_data = [[row[shell] for row in sequences]
                  for shell in xrange(1, n_columns)]

    # Tick labels are fixed at 1..3, matching the three bar colours below.
    label = xrange(1, 4)

    avg = [np.median(column) for column in shell_data]
    std = [np.std(column) for column in shell_data]

    # The standard deviation is used as a symmetric error bar, without
    # clipping the lower end at zero.
    err = std

    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)

    bar.draw_actual_plot(axarr[0, 0],
                         avg,
                         ['lightsteelblue', 'lightblue', 'darkgray'],
                         "",
                         "Shell",
                         "Fraction Cleaved",
                         tick_label=label,
                         yerr=err)
    pconv.save_fig(fig, sequence_ratio_file, "plot", width, height,
                   tight=True, size=10)
def main(list_sequence_names, output_prefix):
    """Histogram the neighbour composition around every cleaved sequence.

    For each sequence in the ``"CLEAVED"`` group, the sequences at Hamming
    distance 1 are counted in the cleaved, uncleaved and middle groups.
    Sequences with at least one cleaved or uncleaved neighbour contribute
    the fraction of neighbours in each group to one of three histograms,
    saved through ``pconv.save_fig`` with the tag ``"fraction_neighbors"``.

    Args:
        list_sequence_names: Iterable of ``[filename, label]`` pairs; the
            labels ``"CLEAVED"``, ``"MIDDLE"`` and ``"UNCLEAVED"`` must all
            be present.
        output_prefix: Passed through to ``pconv.save_fig``.

    Raises:
        ValueError: From ``list.index`` if one of the three labels is
            missing.
    """
    list_sequences = []  # one list of sequences per label
    extended_list_sequences = []  # flat list; not used further here
    labels = []  # labels, parallel to list_sequences

    for [filename, label] in list_sequence_names:
        sequences = seq_IO.read_sequences(filename)
        list_sequences.append(sequences)
        extended_list_sequences.extend(sequences[:])
        labels.append(label)

    cleaved_group = list_sequences[labels.index("CLEAVED")]
    middle_group = list_sequences[labels.index("MIDDLE")]
    uncleaved_group = list_sequences[labels.index("UNCLEAVED")]

    def count_neighbors(seq, group):
        """Count the members of `group` at Hamming distance 1 from `seq`."""
        return sum(1 for other in group if conv.hamdist(seq, other) == 1)

    frac_uncleaved = {}
    frac_cleaved = {}
    frac_middle = {}
    for seq in cleaved_group:
        n_cleaved = count_neighbors(seq, cleaved_group)
        n_uncleaved = count_neighbors(seq, uncleaved_group)
        n_middle = count_neighbors(seq, middle_group)
        # Sequences with only "middle" neighbours (or none) are skipped.
        if not (n_cleaved > 0 or n_uncleaved > 0):
            continue
        total = n_uncleaved + n_middle + n_cleaved
        frac_uncleaved[seq] = float(n_uncleaved) / total
        frac_cleaved[seq] = float(n_cleaved) / total
        frac_middle[seq] = float(n_middle) / total

    fig, ax = pconv.create_ax(3, 1)

    panels = [
        (ax[0, 0], frac_cleaved, "Fraction of Neighbors Cleaved"),
        (ax[0, 1], frac_middle, "Fraction of Neighbors Middle"),
        (ax[0, 2], frac_uncleaved, "Fraction of Neighbors Uncleaved"),
    ]
    for axis, fractions, x_label in panels:
        hist.draw_actual_plot(axis,
                              fractions.values(),
                              "Landscape Near Cleaved Sequences",
                              x_label,
                              log=False,
                              normed=False,
                              nbins=20)

    pconv.save_fig(fig, output_prefix, "fraction_neighbors", 15, 5, size=10)
예제 #13
0
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file):
    """Dump the one-mutation sequence network as a JSON node/link file.

    Sequences from every input file become nodes; two nodes are linked when
    one is among the ``conv.gen_hamdist_one`` neighbours of the other.  Each
    link points from the endpoint closer to the canonical sequence to the
    other one and is weighted by the ratio of their fitness values.  The
    result is written to ``<output_prefix>nodes_edges.json``.

    Args:
        list_sequence_names: Iterable of ``[filename, label]`` pairs.
        hamming_dist: Not used by this function.
        output_prefix: Prefix prepended verbatim to the output file name.
        canonical_file: Not read; the canonical sequence is hard-coded.
    """
    # Only this one canonical sequence is considered.
    canonical_seqs = ["DEMEE"]

    list_sequences = []  # one list of node rows per label
    extended_list_sequences = []  # every node row, flattened
    labels = []  # labels, parallel to list_sequences
    dict_sequences = {}  # sequence -> its node row, across all files

    for [filename, label] in list_sequence_names:
        sequences = seq_IO.read_sequences(filename,
                                          additional_params=True,
                                          ind_type={1: float})
        # Node row: (sequence, fitness, within distance 2 of a canonical).
        new_seqs = []
        for seq, fitness in sequences:
            closest = min([conv.hamdist(seq, can) for can in canonical_seqs])
            new_seqs.append((seq, fitness, closest <= 2))
        list_sequences.append(new_seqs)
        extended_list_sequences.extend(new_seqs)
        for row in new_seqs:
            dict_sequences[row[0]] = row
        labels.append(label)

    edges = []
    edges_set = set()
    print("Read in Data: {0}".format(datetime.datetime.now()))

    for row in extended_list_sequences:
        seq = row[0]
        neighbors = conv.gen_hamdist_one(seq)
        # First pass: register every (seq, neighbour) pair as seen.
        for n in neighbors:
            if n in dict_sequences:
                edges_set.add((seq, n))
        # Second pass: keep an edge only if the reverse pair has not been
        # registered, so each undirected edge is stored once.
        for n in neighbors:
            if n in dict_sequences and (n, seq) not in edges_set:
                edges.append((row, dict_sequences[n]))

    print("Generated Edges: {0}".format(datetime.datetime.now()))
    print(edges[0:10])

    seq_id = {}
    for ind, row in enumerate(extended_list_sequences):
        seq_id[row[0]] = ind

    nodes = []
    for seqs, label in zip(list_sequences, labels):
        for row in seqs:
            nodes.append({ "id" : seq_id[row[0]], "sequence" : row[0], "status" : label, "fitness" : row[1], "canonical_like" : row[2] })

    print("Generated List of Nodes: {0}".format(datetime.datetime.now()))
    links = []

    for canonical_seq in canonical_seqs:
        print(canonical_seq)
        for ((seq1, fit1, can1), (seq2, fit2, can2)) in edges:
            dist_seq1 = conv.hamdist(canonical_seq, seq1)
            dist_seq2 = conv.hamdist(canonical_seq, seq2)
            # "lower" is the endpoint strictly closer to the canonical
            # sequence; on a tie the second endpoint is "lower".
            if dist_seq1 < dist_seq2:
                seq_lower, fit_lower = seq1, fit1
                seq_upper, fit_upper = seq2, fit2
            else:
                seq_lower, fit_lower = seq2, fit2
                seq_upper, fit_upper = seq1, fit1
            # Avoid dividing by a zero or negative fitness.
            if not fit_upper > 0:
                fit_upper = 0.001
            links.append({ "source" : seq_id[seq_lower], "target" : seq_id[seq_upper], "weight" : fit_lower/float(fit_upper) })

    print("Generated List of Edges: {0}".format(datetime.datetime.now()))

    output = { "nodes" : nodes, "links" : links }

    with open('{0}nodes_edges.json'.format(output_prefix), 'w') as fp:
        json.dump(output, fp)

    print("Dumped Nodes and Edges Lists: {0}".format(datetime.datetime.now()))
예제 #14
0
def main(list_sequence_names, output_prefix):
    """Write neighbour-fraction tables for the three sequence groups.

    ``conv.fraction_neighbors_all`` is evaluated once per group (cleaved,
    uncleaved, middle) and each result is written to its own CSV file:
    ``<output_prefix>_cleaved.csv``, ``<output_prefix>_middle.csv`` and
    ``<output_prefix>_uncleaved.csv``.

    Args:
        list_sequence_names: Iterable of ``[filename, label]`` pairs; the
            labels ``"CLEAVED"``, ``"MIDDLE"`` and ``"UNCLEAVED"`` must all
            be present.
        output_prefix: Prefix of the three output file names.

    Raises:
        ValueError: From ``list.index`` if one of the three labels is
            missing.
    """
    list_sequences = []  # one list of sequences per label
    labels = []  # labels, parallel to list_sequences

    for [filename, label] in list_sequence_names:
        list_sequences.append(seq_IO.read_sequences(filename))
        labels.append(label)

    cleaved = list_sequences[labels.index("CLEAVED")]
    middle = list_sequences[labels.index("MIDDLE")]
    uncleaved = list_sequences[labels.index("UNCLEAVED")]

    # Same three reference groups every time; only the last argument (the
    # group being evaluated) changes.
    fracs_cleaved = conv.fraction_neighbors_all(cleaved, uncleaved, middle,
                                                cleaved)
    fracs_uncleaved = conv.fraction_neighbors_all(cleaved, uncleaved, middle,
                                                  uncleaved)
    fracs_middle = conv.fraction_neighbors_all(cleaved, uncleaved, middle,
                                               middle)

    reports = (
        ("{0}_cleaved.csv", fracs_cleaved),
        ("{0}_middle.csv", fracs_middle),
        ("{0}_uncleaved.csv", fracs_uncleaved),
    )
    for name_template, fracs in reports:
        with open(name_template.format(output_prefix), 'w') as f:
            f.write("Sequence,Frac_Cleaved,Frac_Middle,Frac_Uncleaved\n")
            rows = []
            for k, v in fracs.items():
                rows.append("{0},{1},{2},{3}\n".format(
                    k, str(v[0]), str(v[1]), str(v[2])))
            f.write("".join(rows))
예제 #15
0
def main(list_sequence_names, conversion_type="alpha"):
    """Convert sequences to feature rows and write them as CSV.

    The output goes to
    ``<list_sequence_names without extension>_sequence_features_<type>.csv``
    with one line per sequence, sorted by sequence: the sequence followed by
    its features.

    Args:
        list_sequence_names: Path of the sequences to convert, or the literal
            string ``"random"`` to use ``conv.generate_random_seqs(5)``.
        conversion_type: ``"alpha"`` (uses ``conv_alpha_seq``) or
            ``"binary"`` (uses ``conv_binary_seq``).

    Raises:
        Exception: If ``conversion_type`` is neither ``"alpha"`` nor
            ``"binary"``.
    """
    if list_sequence_names == "random":
        # 5 is the sequence length currently needed; could be a parameter.
        sequences = conv.generate_random_seqs(5)
    else:
        sequences = seq_IO.read_sequences(list_sequence_names)

    if conversion_type == "alpha":
        sequence_features = conv_alpha_seq(sequences)
    elif conversion_type == "binary":
        sequence_features = conv_binary_seq(sequences)
    else:
        raise Exception("Conversion type must be binary or alpha")

    base = os.path.splitext(list_sequence_names)[0]

    outfile = '%s_sequence_features_%s.csv' % (base, conversion_type)

    # The context manager closes (and flushes) the file even on error.
    with open(outfile, "w") as out:
        for seq, features in sorted(sequence_features.items()):
            # `features` is concatenated to a list of str, so it must be a
            # list of strings itself.
            out.write(",".join([seq] + (features)))
            out.write("\n")
예제 #16
0
def main(list_sequence_names, conversion_type="alpha"):
    """Convert sequences to feature rows and write them as CSV.

    The output goes to
    ``<list_sequence_names without extension>_sequence_features_<type>.csv``
    with one line per sequence, sorted by sequence: the sequence followed by
    its features.

    Args:
        list_sequence_names: Path of the sequences to convert, or the literal
            string ``"random"`` to use ``conv.generate_random_seqs(5)``.
        conversion_type: ``"alpha"`` (uses ``conv_alpha_seq``) or
            ``"binary"`` (uses ``conv_binary_seq``).

    Raises:
        Exception: If ``conversion_type`` is neither ``"alpha"`` nor
            ``"binary"``.
    """
    if list_sequence_names == "random":
        # 5 is the sequence length currently needed; could be a parameter.
        sequences = conv.generate_random_seqs(5)
    else:
        sequences = seq_IO.read_sequences(list_sequence_names)

    if conversion_type == "alpha":
        sequence_features = conv_alpha_seq(sequences)
    elif conversion_type == "binary":
        sequence_features = conv_binary_seq(sequences)
    else:
        raise Exception("Conversion type must be binary or alpha")

    base = os.path.splitext(list_sequence_names)[0]

    outfile = '%s_sequence_features_%s.csv' % (base, conversion_type)

    # The context manager closes (and flushes) the file even on error.
    with open(outfile, "w") as out:
        for seq, features in sorted(sequence_features.items()):
            # `features` is concatenated to a list of str, so it must be a
            # list of strings itself.
            out.write(",".join([seq] + (features)))
            out.write("\n")
예제 #17
0
def main(sequences_ratio_file):
    """Plot the mean of three ratio columns per sequence with min/max bars.

    Each row is indexed as (sequence, ratio 1, ratio 2, ratio 3, fifth
    column).  The bar height is the mean of the three ratios; the error bars
    reach down to the smallest and up to the largest of them.  The figure is
    saved through ``pconv.save_fig`` with the tag ``"plot"``.

    Args:
        sequences_ratio_file: Path of the input rows; also passed to
            ``pconv.save_fig`` as the output prefix.
    """
    sequences_ratio = seq_IO.read_sequences(sequences_ratio_file,
                                            additional_params=True)

    seqs = []
    seq_ratio_dict = []  # despite the name, a list of [r1, r2, r3] lists
    seq_cleaved_dict = []  # fifth column; collected but not plotted
    for row in sequences_ratio:
        seq_ratio_dict.append([row[1], row[2], row[3]])
        seq_cleaved_dict.append(row[4])
        seqs.append(row[0])

    avg_ratio = []
    min_ratio = []  # distance from the mean down to the smallest ratio
    max_ratio = []  # distance from the mean up to the largest ratio
    for ratios in seq_ratio_dict:
        mean = sum(ratios) / 3.0
        avg_ratio.append(mean)
        min_ratio.append(mean - min(ratios))
        max_ratio.append(max(ratios) - mean)

    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)
    bar.draw_actual_plot(axarr[0, 0], avg_ratio, 'c', "", "Sequence",
                         "FLAG/HA Ratio", tick_label=seqs,
                         yerr=[min_ratio, max_ratio])
    pconv.save_fig(fig, sequences_ratio_file, "plot", 4, 4,
                   tight=True, size=12)
예제 #18
0
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file):
    """Dump the one-mutation sequence network as a JSON node/link file.

    Sequences from every input file become nodes; two nodes are linked when
    one is among the ``conv.gen_hamdist_one`` neighbours of the other.  Each
    link points from the endpoint closer to the canonical sequence to the
    other one and is weighted by the ratio of their fitness values.  The
    result is written to ``<output_prefix>nodes_edges.json``.

    Args:
        list_sequence_names: Iterable of ``[filename, label]`` pairs.
        hamming_dist: Not used by this function.
        output_prefix: Prefix prepended verbatim to the output file name.
        canonical_file: Not read; the canonical sequence is hard-coded.
    """
    list_sequences = []  # one list of node rows per label
    extended_list_sequences = []  # every node row, flattened
    labels = []  # labels, parallel to list_sequences

    # Only this one canonical sequence is considered; read canonical_file
    # here instead to run against all canonical sequences.
    canonical_seqs = ["DEMEE"]

    # Sequence -> node row for ALL input files.  Rebinding this dict per
    # file would leave only the last file's sequences in it, silently
    # dropping every edge into an earlier file (and raising NameError when
    # no files are given).
    dict_sequences = {}

    for [filename, label] in list_sequence_names:
        sequences = seq_IO.read_sequences(filename,
                                          additional_params=True,
                                          ind_type={1: float})
        # Node row: (sequence, fitness, within distance 2 of a canonical).
        new_seqs = [(seq, fitness,
                     min([conv.hamdist(seq, can)
                          for can in canonical_seqs]) <= 2)
                    for seq, fitness in sequences]
        list_sequences.append(new_seqs)
        extended_list_sequences.extend(new_seqs)
        dict_sequences.update({n[0]: n for n in new_seqs})
        labels.append(label)

    edges = []
    edges_set = set()
    print("Read in Data: {0}".format(datetime.datetime.now()))

    for seq, fitness, canonical_like in extended_list_sequences:
        # Materialised once so the neighbours can be walked twice safely.
        known_neighbors = [n for n in conv.gen_hamdist_one(seq)
                           if n in dict_sequences]
        edges_set.update([(seq, n) for n in known_neighbors])
        # Keep an edge only if the reverse pair has not been registered, so
        # each undirected edge is stored once.
        edges += [((seq, fitness, canonical_like), dict_sequences[n])
                  for n in known_neighbors if (n, seq) not in edges_set]

    print("Generated Edges: {0}".format(datetime.datetime.now()))
    print(edges[0:10])
    seq_id = {
        seq[0]: ind for ind, seq in enumerate(extended_list_sequences)
    }

    nodes = []
    for seqs, label in zip(list_sequences, labels):
        nodes.extend([{
            "id": seq_id[seq[0]],
            "sequence": seq[0],
            "status": label,
            "fitness": seq[1],
            "canonical_like": seq[2]
        } for seq in seqs])

    print("Generated List of Nodes: {0}".format(datetime.datetime.now()))
    links = []

    for canonical_seq in canonical_seqs:
        print(canonical_seq)
        for ((seq1, fit1, can1), (seq2, fit2, can2)) in edges:
            dist_seq1 = conv.hamdist(canonical_seq, seq1)
            dist_seq2 = conv.hamdist(canonical_seq, seq2)
            # "lower" is the endpoint strictly closer to the canonical
            # sequence; on a tie the second endpoint is "lower".
            if dist_seq1 < dist_seq2:
                seq_lower, fit_lower = seq1, fit1
                seq_upper, fit_upper = seq2, fit2
            else:
                seq_lower, fit_lower = seq2, fit2
                seq_upper, fit_upper = seq1, fit1
            # Avoid dividing by a zero or negative fitness.
            fit_upper = fit_upper if fit_upper > 0 else 0.001
            links.append({
                "source": seq_id[seq_lower],
                "target": seq_id[seq_upper],
                "weight": fit_lower / float(fit_upper)
            })

    print("Generated List of Edges: {0}".format(datetime.datetime.now()))

    output = {"nodes": nodes, "links": links}

    with open('{0}nodes_edges.json'.format(output_prefix), 'w') as fp:
        json.dump(output, fp)

    print("Dumped Nodes and Edges Lists: {0}".format(datetime.datetime.now()))
예제 #19
0
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file):
    """Write node and edge CSV tables describing a sequence network.

    Every pair of loaded records whose sequences are exactly ``hamming_dist``
    apart becomes an edge.  One edge file is written per canonical sequence
    (``<output_prefix>_<canonical>_edges.csv``) and one node file
    (``<output_prefix>_nodes.csv``) lists the loaded sequences.

    Args:
        list_sequence_names: iterable of ``[filename, label]`` pairs.  Each
            file is read with ``seq_IO.read_sequences`` and its records are
            unpacked as ``(sequence, fitness)``.
        hamming_dist: exact ``conv.hamdist`` value two sequences must have to
            be connected.
        output_prefix: prefix used for every output file name.
        canonical_file: file of canonical sequences, read with
            ``seq_IO.read_sequences``.
    """
    list_sequences = []  # one list of records per label
    extended_list_sequences = []  # all records, flattened
    labels = []  # labels matching list_sequences

    canonical_seqs = seq_IO.read_sequences(canonical_file)

    for filename, label in list_sequence_names:
        sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1: float})
        # Third field: is the sequence within distance 2 of any canonical?
        new_seqs = [
            (seq, fitness, min([conv.hamdist(seq, can) for can in canonical_seqs]) <= 2)
            for seq, fitness in sequences
        ]
        list_sequences.append(new_seqs)
        extended_list_sequences.extend(new_seqs)
        labels.append(label)

    outfile_nodes = '%s_nodes.csv' % (output_prefix)

    edges = [
        (seq2, seq)
        for seq, seq2 in itertools.combinations(extended_list_sequences, 2)
        if conv.hamdist(seq2[0], seq[0]) == hamming_dist
    ]

    for canonical_seq in canonical_seqs:
        outfile_edges = '%s_%s_edges.csv' % (output_prefix, canonical_seq)
        # ``with`` guarantees the file is closed even if a row fails.
        with open(outfile_edges, "w") as edges_out:
            edges_out.write("Source,Target,Weight\n")
            print(canonical_seq)
            for (seq1, fit1, can1), (seq2, fit2, can2) in edges:
                dist_seq1 = conv.hamdist(canonical_seq, seq1)
                dist_seq2 = conv.hamdist(canonical_seq, seq2)
                if dist_seq1 < dist_seq2:
                    seq_lower, fit_lower = seq1, fit1
                    seq_upper, fit_upper = seq2, fit2
                else:
                    # Ties land here: seq2 is treated as the closer endpoint.
                    seq_lower, fit_lower = seq2, fit2
                    seq_upper, fit_upper = seq1, fit1
                # Avoid dividing by a zero or negative fitness.
                if not fit_upper > 0:
                    fit_upper = 0.001
                # NOTE(review): original author questioned whether this
                # source -> target direction is the intended one.
                edges_out.write("{0},{1},{2}\n".format(
                    seq_lower, seq_upper, fit_lower / float(fit_upper)))

    # A set gives O(1) membership tests; sequences already emitted for an
    # earlier label are not written again.
    already_written_nodes = set()

    with open(outfile_nodes, "w") as nodes_out:
        nodes_out.write("Id,Label,Type,Fitness,Canonical\n")
        for seqs, label in zip(list_sequences, labels):
            nodes_out.write("\n".join(
                "{0},{0},{1},{2},{3}".format(x, label, fitness, can)
                for (x, fitness, can) in seqs
                if x not in already_written_nodes))
            already_written_nodes.update(s[0] for s in seqs)
            nodes_out.write("\n")
예제 #20
0
def main(list_nodes, output_prefix, metric, create_keys=False):
    """Plot histograms of node metrics, split by cleavage class.

    Records are grouped by their ``"type"`` field into CLEAVED, MIDDLE and
    UNCLEAVED, and one histogram panel is drawn per metric.

    Args:
        list_nodes: node table read with ``seq_IO.read_sequences``
            (``additional_params=True, header=True``).
        output_prefix: passed to ``pconv.save_fig``.
        metric: name of the single field to plot, or ``"metrics"`` to plot
            every field of the ``"YNYIN"`` record that is not a bookkeeping
            column.
        create_keys: when true, ``create_keys=True`` is forwarded to
            ``seq_IO.read_sequences``.

    Raises:
        NotImplementedError: if ``"Fraction_Cleaved"`` is requested.  The code
            that computed its data is disabled, so the original either hit a
            ``NameError`` or silently re-plotted the previous metric's data.
    """
    if not create_keys:
        sequences = seq_IO.read_sequences(list_nodes, additional_params=True, header=True)
    else:
        sequences = seq_IO.read_sequences(list_nodes, additional_params=True, header=True, create_keys=True)

    def _of_type(type_name):
        # Sub-dictionary of the records whose "type" equals type_name.
        return {key: val for key, val in sequences.items() if val["type"] == type_name}

    cleaved_seq = _of_type("CLEAVED")
    middle_seq = _of_type("MIDDLE")
    uncleaved_seq = _of_type("UNCLEAVED")

    print(len(cleaved_seq))
    if metric == "metrics":
        labels_non_plot = ["label", "fitness", "type", "canonical", "timeset"]
        # NOTE(review): the field list is taken from the hard-coded "YNYIN"
        # record; a table without that key raises KeyError here.
        labels_to_plot = sorted([key for key in sequences["YNYIN"].keys() if key not in labels_non_plot])
    else:
        labels_to_plot = [metric]

    if "Fraction_Cleaved" in labels_to_plot:
        raise NotImplementedError(
            "The 'Fraction_Cleaved' metric has no data computation enabled")

    n_to_plot = len(labels_to_plot)
    fig, axarr = pconv.create_ax(n_to_plot, 1, shx=False, shy=False)

    nbins = 10

    for ind, key in enumerate(labels_to_plot):
        data = [
            get_data_from_dict(cleaved_seq, key),
            get_data_from_dict(middle_seq, key),
            get_data_from_dict(uncleaved_seq, key),
        ]
        print(key)
        # Only the "pageranks" panel uses a log scale.
        hist.draw_actual_plot(axarr[0, ind], data, "", key.capitalize(),
                              log=(key == "pageranks"), normed=True,
                              label=["Cleaved", "Middle", "Uncleaved"], nbins=nbins)
        axarr[0, ind].ticklabel_format(axis='x', style='sci', scilimits=(-2, 2))

    pconv.save_fig(fig, output_prefix, metric, n_to_plot * 2.5, 2.5, tight=True, size=9)
예제 #21
0
def main(sequence_list, trained_cleaved, trained_uncleaved):
    """Report each sequence's minimum distance to its own training set.

    Records labelled ``'CLEAVED'`` are compared against the trained cleaved
    sequences, records labelled ``'UNCLEAVED'`` against the trained uncleaved
    ones.  The result is written next to the input as
    ``<sequence_list without extension>_selected_hamm.csv``.

    Args:
        sequence_list: file read with ``seq_IO.read_sequences``
            (``additional_params=True``); field 0 is used as the sequence and
            field 1 as its label.
        trained_cleaved: file of trained cleaved sequences.
        trained_uncleaved: file of trained uncleaved sequences.
    """
    sequences = seq_IO.read_sequences(sequence_list, additional_params=True)

    trained_cleaved_list = seq_IO.read_sequences(trained_cleaved)

    trained_uncleaved_list = seq_IO.read_sequences(trained_uncleaved)

    base = os.path.splitext(sequence_list)[0]

    # Each row: (sequence, label, smallest distance to a trained sequence).
    cleaved_seqs = [
        (s[0], s[1], min([conv.hamdist(s[0], c) for c in trained_cleaved_list]))
        for s in sequences if s[1] == 'CLEAVED'
    ]
    uncleaved_seqs = [
        (s[0], s[1], min([conv.hamdist(s[0], c) for c in trained_uncleaved_list]))
        for s in sequences if s[1] == 'UNCLEAVED'
    ]

    outfile = '%s_selected_hamm.csv' % (base)

    # ``with`` closes (and therefore flushes) the file; the original never
    # closed it.
    with open(outfile, "w") as out:
        out.write("Cleaved_seqs\n")
        out.write("\n".join([",".join(map(str, s)) for s in cleaved_seqs]))
        out.write("\nUncleaved_seqs\n")
        out.write("\n".join([",".join(map(str, s)) for s in uncleaved_seqs]))
예제 #22
0
def read_sequence_lists(list_sequence_names):
    """Load every labelled sequence file.

    Args:
        list_sequence_names: iterable of ``[filename, label]`` pairs.

    Returns:
        A 3-tuple ``(per_label, flat, labels)``: the records of each file as
        returned by ``seq_IO.read_sequences``, all records concatenated in
        file order, and the labels in the same order as ``per_label``.
    """
    per_label = []
    flat = []
    labels = []

    for entry in list_sequence_names:
        filename, label = entry
        # Fields 1 and 2 of every record are requested as floats.
        records = seq_IO.read_sequences(filename, additional_params=True,
                                        ind_type={1: float, 2: float})
        per_label.append(records)
        flat += records[:]
        labels.append(label)

    return per_label, flat, labels
예제 #23
0
def main(data_file, title, output_prefix):
    """Draw a normalised 30-bin histogram of the ``"Degree"`` column.

    Args:
        data_file: table read with ``seq_IO.read_sequences``
            (``additional_params=True, header=True``).
        title: capitalised and passed to ``hist.draw_actual_plot``; also
            passed unchanged to ``pconv.save_fig``.
        output_prefix: passed to ``pconv.save_fig``.
    """
    records = seq_IO.read_sequences(data_file, additional_params=True, header=True)

    degrees = []
    for _, record in records.items():
        degrees.append(record["Degree"])

    fig, axes = pconv.create_ax(1, 1, shx=False, shy=False)
    panel = axes[0, 0]

    hist.draw_actual_plot(panel, degrees, "", title.capitalize(),
                          normed=True, nbins=30, edgecolor=None, log=False)

    pconv.save_fig(fig, output_prefix, title, 5, 5, tight=True, size=10)
예제 #24
0
def main(list_sequence_names, canonical_list, output_prefix):
    """Histogram the distances from cleaved sequences to the uncleaved set.

    For every cleaved sequence the minimum, mean and maximum ``conv.hamdist``
    to all uncleaved sequences is collected; the three distributions are drawn
    on separate panels and saved as ``dist_from_bounds``.  Statistics of
    cleaved sequences that appear in the canonical list are also printed.

    Args:
        list_sequence_names: iterable of ``(filename, label)`` pairs; the
            first file labelled ``"CLEAVED"`` and the first labelled
            ``"UNCLEAVED"`` are used.
        canonical_list: file of canonical sequences.
        output_prefix: passed to ``pconv.save_fig``.
    """
    canonical_list_seq = seq_IO.read_sequences(canonical_list)

    cleaved_file = [name for name, tag in list_sequence_names if tag == "CLEAVED"][0]
    cleaved_seqs = seq_IO.read_sequences(cleaved_file)

    uncleaved_file = [name for name, tag in list_sequence_names if tag == "UNCLEAVED"][0]
    uncleaved_seqs = seq_IO.read_sequences(uncleaved_file)

    min_dist, avg_dist, max_dist = [], [], []

    for seq in cleaved_seqs:
        distances = [conv.hamdist(seq, unc) for unc in uncleaved_seqs]

        closest = min(distances)
        min_dist.append(closest)
        mean = numpy.mean(distances)
        avg_dist.append(mean)
        farthest = max(distances)
        max_dist.append(farthest)

        if seq in canonical_list_seq:
            print(seq)
            print(closest)
            print(mean)
            print(farthest)

    fig, ax = pconv.create_ax(1, 3)

    panels = [
        (0, min_dist, "Min. Distance from Boundary", "Minimum Distances"),
        (1, avg_dist, "Avg. Distance from Boundary", "Average Distances"),
        (2, max_dist, "Max. Distance from Boundary", "Maximum Distances"),
    ]
    for row, values, first_text, second_text in panels:
        hist.draw_actual_plot(ax[row, 0], values, first_text, second_text,
                              log=False, normed=True, label=None, nbins=15,
                              stacked=False)

    pconv.save_fig(fig, output_prefix, "dist_from_bounds", 18, 6, size=15)
예제 #25
0
def main(list_sequence_names, canonical_list, output_prefix, func_labels, unfunc_labels):
    """Plot the functional fraction of variants against mutation count.

    For each canonical sequence, and for each distance 1..5 from it, the
    sequences at that distance are split into functional and non-functional
    by their file label; ``func / (func + unfunc)`` is plotted per distance.

    Args:
        list_sequence_names: iterable of ``[filename, label]`` pairs.
        canonical_list: file of canonical sequences.
        output_prefix: passed to ``pconv.save_fig``.
        func_labels: labels counted as functional.
        unfunc_labels: labels counted as non-functional.
    """
    series = []

    canonical_list_seq = seq_IO.read_sequences(canonical_list)

    # The input files do not depend on the canonical sequence, so read each
    # once instead of once per canonical.
    labelled_sequences = [
        (label, seq_IO.read_sequences(filename))
        for filename, label in list_sequence_names
    ]

    mutation_counts = range(1, 6)

    for canonical in canonical_list_seq:

        counts_by_label = {}

        for label, sequences in labelled_sequences:
            distances = [conv.hamdist(seq, canonical) for seq in sequences]
            # Number of sequences at each distance.  The original summed the
            # distances themselves (i * count); the common factor i cancels
            # in the ratio below, so the plotted values are unchanged.
            counts_by_label[label] = {i: distances.count(i) for i in mutation_counts}

        x = []
        y = []
        for i in mutation_counts:
            func = 0.0
            unfunc = 0.0
            for label, counts in counts_by_label.items():
                if label in func_labels:
                    func = func + counts[i]
                elif label in unfunc_labels:
                    unfunc = unfunc + counts[i]
            # NOTE(review): distances with no non-functional variants are
            # skipped even when functional ones exist - confirm intended.
            if unfunc != 0:
                x.append(i)
                y.append(func / (func + unfunc))
        print(x)
        print(y)
        series.append([x, y, canonical])

    fig, ax = pconv.create_ax(1, 1)

    scatterplot.plot_series(ax[0, 0], series, title="", x_axis="# of Mutations",
                            y_axis="Fraction of Variants that are Functional",
                            alpha=1.0, connect_dots=True, size=30, edgecolors='k')
    ax[0, 0].set_xlim(xmin=1, xmax=5)
    ax[0, 0].set_xticks(range(1, 6))
    # The figure name uses the last canonical sequence processed (NameError
    # if the canonical list is empty, as in the original).
    pconv.save_fig(fig, output_prefix, canonical + "_fraction_func_mutant", 6, 6, size=15)
예제 #26
0
def main(list_sequence_names, output_prefix):
    """Draw a Venn diagram of the sequence sets with ``venn3`` and save it.

    Args:
        list_sequence_names: iterable of ``[filename, label]`` pairs; the
            sequences of each file are turned into a set.
        output_prefix: passed to ``pconv.save_fig``; the figure name is the
            labels joined by ``_`` followed by ``_venn``.
    """
    loaded = [
        (set(seq_IO.read_sequences(filename)), label)
        for filename, label in list_sequence_names
    ]
    sequence_sets = [members for members, _ in loaded]
    labels = [label for _, label in loaded]

    fig, ax = pconv.create_ax(1, 1)
    panel = ax[0, 0]

    venn3(sequence_sets, set_labels=labels, ax=panel)

    figure_name = '_'.join(labels) + "_venn"
    pconv.save_fig(fig, output_prefix, figure_name, 10, 10, size=12)
예제 #27
0
def main(list_nodes, output_prefix):
    """Write one text file of node keys per frequent modularity class.

    A class is kept when it accounts for more than 1% of all
    ``"modularity_class"`` values; its node keys are written one per line to
    ``<output_prefix>_<class>.txt``.

    Args:
        list_nodes: node table read with ``seq_IO.read_sequences``
            (``additional_params=True, header=True``).
        output_prefix: prefix of every output file name.
    """
    sequences = seq_IO.read_sequences(list_nodes, additional_params=True, header=True)

    class_counts = Counter(get_data_from_dict(sequences, "modularity_class"))
    n_total = float(sum(class_counts.values()))

    for mod_class, count in class_counts.items():
        if count / n_total <= 0.01:
            continue  # too rare to report

        members = []
        for key, val in sequences.items():
            if val["modularity_class"] == mod_class:
                members.append(key)

        out_name = output_prefix + "_{0}.txt".format(mod_class)
        with open(out_name, 'w') as f:
            f.write('\n'.join(members))
def main(list_sequence_names, output_prefix, index):
    """Compute per-shell fractions for one chunk of uncleaved sequences.

    The uncleaved sequences are processed in chunks of 10000; ``index``
    selects the chunk.  For every sequence of the chunk,
    ``find_fraction_for_shell`` is applied three times, each call being fed
    the neighbours returned by the previous one, and the three fractions are
    written to ``<output_prefix>_uncleaved_<index>.csv``.

    Args:
        list_sequence_names: iterable of ``[filename, label]`` pairs; must
            contain the labels ``"CLEAVED"`` and ``"UNCLEAVED"``.
        output_prefix: prefix of the output file name.
        index: 1-based chunk number; chunk ``k`` covers uncleaved sequences
            ``[(k-1)*10000, k*10000)``.

    Returns:
        None.  When ``index`` starts beyond the end of the uncleaved list a
        message is printed and the function returns without writing a file.
        (The original used a bare ``exit`` expression, which does nothing, so
        execution carried on regardless.)
    """
    list_sequences = []  # one list of sequences per label
    labels = []  # labels matching list_sequences

    for filename, label in list_sequence_names:
        sequences = seq_IO.read_sequences(filename)
        list_sequences.append(sequences)
        labels.append(label)

    print("Read in Sequences at: {0}".format(datetime.datetime.now()))

    cleaved_ind = labels.index("CLEAVED")
    uncleaved_ind = labels.index("UNCLEAVED")

    # Validate the requested chunk before doing any expensive work.
    chunk_size = 10000
    n_uncleaved = len(list_sequences[uncleaved_ind])
    start_ind = (index - 1) * chunk_size
    end_ind = min(index * chunk_size, n_uncleaved)
    if start_ind > n_uncleaved:
        print("This index is not valid")
        return

    adj_list_cleaved = conv.adj_list(set(list_sequences[cleaved_ind]), set(list_sequences[uncleaved_ind]), set(), set(list_sequences[cleaved_ind]), ignore_middle=False)
    adj_list_uncleaved = conv.adj_list(set(list_sequences[cleaved_ind]), set(list_sequences[uncleaved_ind]), set(), set(list_sequences[uncleaved_ind]), ignore_middle=False)

    fracs_cleaved = conv.fraction_neighbors_all(list_sequences[cleaved_ind], list_sequences[uncleaved_ind], [], list_sequences[cleaved_ind], ignore_middle=True)
    fracs_uncleaved = conv.fraction_neighbors_all(list_sequences[cleaved_ind], list_sequences[uncleaved_ind], [], list_sequences[uncleaved_ind], ignore_middle=True)

    print("Created Adj List and Fracs at: {0}".format(datetime.datetime.now()))

    # Merge the uncleaved entries into the cleaved tables so a single lookup
    # covers both classes.
    adj_list_cleaved.update(adj_list_uncleaved)
    fracs_cleaved.update(fracs_uncleaved)

    fracs_per_seq = {}

    for seq in list_sequences[uncleaved_ind][start_ind:end_ind]:
        new_neighbors = [seq]
        fracs_per_seq[seq] = []
        for _ in range(3):
            frac, new_neighbors = find_fraction_for_shell(new_neighbors, adj_list_cleaved, fracs_cleaved)
            fracs_per_seq[seq].append(frac)

    print("Found Fracs for Uncleaved Sequences at: {0}".format(datetime.datetime.now()))

    with open("{0}_uncleaved_{1}.csv".format(output_prefix, index), 'w') as f:
        f.write("Sequence,1,2,3\n")
        f.write("".join(["{0},{1},{2},{3}\n".format(k, str(v[0]), str(v[1]), str(v[2])) for k, v in fracs_per_seq.items()]))
예제 #29
0
def main(list_sequence_names, output_prefix, source):
    """Write the cleaved fraction of each source's one-step DNA neighbours.

    Each entry of ``source`` is reverse-translated; the union of the
    ``dna_conv.gen_hamdist_one`` neighbours of its DNA sequences is
    intersected with the reverse-translated cleaved set.  Two fractions are
    stored per source: cleaved neighbours over all neighbours, and over the
    neighbours whose translation contains no ``'_'``.  Results go to
    ``<output_prefix>_frac_neighbors_dna.csv``.

    Args:
        list_sequence_names: iterable of ``[filename, label]`` pairs; must
            contain the labels ``"CLEAVED"`` and ``"UNCLEAVED"``.
        output_prefix: prefix of the output file name.
        source: iterable of sequences passed to ``dna_conv.rev_translate``.
    """
    labels = []
    list_sequences = []
    for filename, label in list_sequence_names:
        list_sequences.append(seq_IO.read_sequences(filename))
        labels.append(label)

    print("Read in Sequences at: {0}".format(datetime.datetime.now()))

    cleaved_ind = labels.index("CLEAVED")
    # Result unused; the lookup is kept because it raises ValueError when the
    # "UNCLEAVED" label is missing.
    labels.index("UNCLEAVED")

    cleaved_dna = set()
    for aa_seq in list_sequences[cleaved_ind]:
        cleaved_dna.update(dna_conv.rev_translate(aa_seq))

    print("Converted to dna at: {0} for # sequences: {1}".format(
        datetime.datetime.now(), len(cleaved_dna)))

    fracs = {}

    for s in source:
        source_dna = dna_conv.rev_translate(s)

        per_dna_neighbors = [set(dna_conv.gen_hamdist_one(seq)) for seq in source_dna]
        neighbors_set = set.union(*per_dna_neighbors)
        neighbors_nostop = {
            n for n in neighbors_set if '_' not in dna_conv.translate(n)
        }
        n_cleaved = float(len(neighbors_set & cleaved_dna))

        fracs[s] = (n_cleaved / len(neighbors_set),
                    n_cleaved / len(neighbors_nostop))

    print("Found Fracs for Cleaved Sequences at: {0}".format(
        datetime.datetime.now()))

    rows = []
    for s, (frac1, frac2) in fracs.items():
        rows.append("{0},{1},{2}".format(s, str(frac1), str(frac2)))

    with open("{0}_frac_neighbors_dna.csv".format(output_prefix), 'w') as f:
        f.write("\n".join(rows))
예제 #30
0
def main(input_file, canonical_file, output_prefix, hamm_dist, less_than, more_than):
    """Select sequences by their distance to the canonical sequences.

    The selection is accumulated over all canonical sequences by the
    ``find_seqs_*`` helpers and written, one sequence per line, to
    ``<output_prefix>canonical_sim_cleaved_<less|more>_<hamm_dist>.csv``.

    Args:
        input_file: file of candidate sequences.
        canonical_file: file of canonical sequences.
        output_prefix: prefix of the output file name.
        hamm_dist: integer distance threshold handed to the helpers.
        less_than: select with ``find_seqs_less_than``.
        more_than: select with ``find_seqs_more_than_first`` for the first
            canonical sequence and ``find_seqs_more_than`` for the rest.

    Raises:
        ValueError: if ``less_than`` and ``more_than`` are both set or both
            unset.  The check now runs once, before any file is read; the
            original repeated it per canonical sequence and therefore never
            ran it when the canonical file was empty.
    """
    if less_than and more_than:
        raise ValueError('Cannot set both --less_than and --more_than')
    if not (less_than or more_than):
        raise ValueError('Cannot have both --less_than and --more_than as false')

    canonical_sequences = seq_IO.read_sequences(canonical_file)
    sequences = seq_IO.read_sequences(input_file)

    set_sequences = set()
    for ind, can in enumerate(canonical_sequences):
        if less_than:
            set_sequences = find_seqs_less_than(can, sequences, set_sequences, hamm_dist)
        elif ind == 0:
            # The first canonical sequence uses a dedicated helper.
            set_sequences = find_seqs_more_than_first(can, sequences, set_sequences, hamm_dist)
        else:
            set_sequences = find_seqs_more_than(can, sequences, set_sequences, hamm_dist)

    less_v_more = "less" if less_than else "more"

    outfile_canon = '%scanonical_sim_cleaved_%s_%d.csv' % (output_prefix, less_v_more, hamm_dist)

    with open(outfile_canon, "w") as canon_out:
        canon_out.write('\n'.join(set_sequences))
예제 #31
0
def main(input_dir, canonical_file, output_prefix, hamm_dist):
    """List, per input file, the sequences similar to each canonical one.

    Every ``*_cleaved.txt`` file in ``input_dir`` is compared against every
    canonical sequence.  Non-empty matches are written as
    ``filename,canonical,seq1,seq2,...`` rows to
    ``<output_prefix>canonical_sim_cleaved<hamm_dist>.csv``.

    Args:
        input_dir: directory searched for ``*_cleaved.txt`` files.
        canonical_file: file of canonical sequences.
        output_prefix: prefix of the output file name.
        hamm_dist: maximum ``conv.hamdist`` for a match; the special value
            ``-1`` selects with ``chem_sim`` instead.
    """
    list_seq_files = glob.glob(os.path.join(input_dir, "*_cleaved.txt"))

    dict_sequences = {}
    canonical_sequences = seq_IO.read_sequences(canonical_file)

    for filename in list_seq_files:
        sequences = seq_IO.read_sequences(filename)
        for can in canonical_sequences:
            if hamm_dist == -1:
                seq_sim = [seq for seq in sequences if chem_sim(seq, can)]
            else:
                seq_sim = [seq for seq in sequences if conv.hamdist(seq, can) <= hamm_dist]
            # Only (file, canonical) pairs with at least one match are kept.
            if seq_sim:
                dict_sequences[(filename, can)] = seq_sim

    outfile_canon = '%scanonical_sim_cleaved%d.csv' % (output_prefix, hamm_dist)

    # ``with`` closes (and flushes) the file; the original never closed it.
    with open(outfile_canon, "w") as canon_out:
        for (filename, can), seqs in dict_sequences.items():
            canon_out.write(filename + "," + can + "," + ','.join(seqs) + "\n")
예제 #32
0
def main(sequences_ratio_file):
    """Draw a bar chart of the mean of three ratio columns per sequence.

    Fields 1-3 of every record are averaged; the asymmetric error bars reach
    down to the smallest and up to the largest of the three values.

    Args:
        sequences_ratio_file: file read with ``seq_IO.read_sequences``
            (``additional_params=True``); also passed to ``pconv.save_fig``.
    """
    rows = seq_IO.read_sequences(sequences_ratio_file, additional_params=True)

    triples = [[row[1], row[2], row[3]] for row in rows]
    # Not used below; retained so records lacking a fifth field still fail
    # here exactly as before.
    seq_cleaved_dict = [row[4] for row in rows]
    names = [row[0] for row in rows]

    avg_ratio = []
    below_avg = []
    above_avg = []
    for triple in triples:
        mean = sum(triple) / 3.0
        avg_ratio.append(mean)
        below_avg.append(mean - min(triple))
        above_avg.append(max(triple) - mean)

    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)
    bar.draw_actual_plot(axarr[0, 0], avg_ratio, 'c', "", "Sequence",
                         "FLAG/HA Ratio", tick_label=names,
                         yerr=[below_avg, above_avg])
    pconv.save_fig(fig, sequences_ratio_file, "plot", 4, 4, tight=True, size=12)
예제 #33
0
def main(list_sequence_names, output_prefix):
    """Write neighbour-fraction tables for each sequence class.

    ``conv.fraction_neighbors_all`` is evaluated for the cleaved, uncleaved
    and middle sequences, and each result is written to its own CSV file
    (``<output_prefix>_cleaved.csv``, ``_middle.csv``, ``_uncleaved.csv``).

    Args:
        list_sequence_names: iterable of ``[filename, label]`` pairs; must
            contain the labels ``"CLEAVED"``, ``"MIDDLE"`` and
            ``"UNCLEAVED"``.
        output_prefix: prefix of every output file name.
    """
    list_sequences = []
    labels = []
    for filename, label in list_sequence_names:
        list_sequences.append(seq_IO.read_sequences(filename))
        labels.append(label)

    cleaved = list_sequences[labels.index("CLEAVED")]
    middle = list_sequences[labels.index("MIDDLE")]
    uncleaved = list_sequences[labels.index("UNCLEAVED")]

    # All three tables are computed before any file is opened.
    fracs_cleaved = conv.fraction_neighbors_all(cleaved, uncleaved, middle, cleaved)
    fracs_uncleaved = conv.fraction_neighbors_all(cleaved, uncleaved, middle, uncleaved)
    fracs_middle = conv.fraction_neighbors_all(cleaved, uncleaved, middle, middle)

    outputs = [
        ("{0}_cleaved.csv", fracs_cleaved),
        ("{0}_middle.csv", fracs_middle),
        ("{0}_uncleaved.csv", fracs_uncleaved),
    ]

    for name_template, fracs in outputs:
        with open(name_template.format(output_prefix), 'w') as f:
            f.write("Sequence,Frac_Cleaved,Frac_Middle,Frac_Uncleaved\n")
            for k, v in fracs.items():
                f.write("{0},{1},{2},{3}\n".format(k, str(v[0]), str(v[1]), str(v[2])))
예제 #34
0
def main(list_sequence_names, output_prefix):
    """Bar-plot how many sequences each sample has in each class.

    Args:
        list_sequence_names: iterable of ``[filename, label, sample]``
            triples; ``label`` must be ``"CLEAVED"``, ``"MIDDLE"`` or
            ``"UNCLEAVED"``.
        output_prefix: passed to ``pconv.save_fig``.
    """
    counts = {"CLEAVED": {}, "UNCLEAVED": {}, "MIDDLE": {}}

    for filename, label, sample in list_sequence_names:
        counts[label][sample] = len(seq_IO.read_sequences(filename))

    # One series per class, values ordered by sample name.
    lines = []
    for label in ("CLEAVED", "MIDDLE", "UNCLEAVED"):
        per_sample = counts[label]
        ordered = [per_sample[sample] for sample in sorted(per_sample)]
        lines.append((ordered, label))

    fig, ax = pconv.create_ax(1, 1)

    # Tick labels come from the samples present in the CLEAVED class.
    sample_names = sorted(counts["CLEAVED"])
    bar.plot_series(ax[0, 0], lines, title="", x_axis="Variant Name",
                    y_axis="Number of Substrate Sequences Sampled",
                    tick_label=sample_names)
    pconv.save_fig(fig, output_prefix, "cleaved_uncleaved_middle", 6, 6, tight=True, size=10)
예제 #35
0
def main(json_file, output_prefix, nbunch_file):
    """Write the graph degree of selected sequences.

    The graph is loaded from node-link JSON, the sequences listed in
    ``nbunch_file`` are mapped to node ids through the ``"sequence"`` node
    attribute, and ``sequence,degree`` rows are written to
    ``<output_prefix>_degree.txt``.

    Args:
        json_file: node-link JSON file of the graph.
        output_prefix: prefix of the output file name.
        nbunch_file: file of sequences whose degree is wanted; each must be
            present in the graph (``KeyError`` otherwise).
    """
    with open(json_file) as data_file:
        data = json.load(data_file)

    G = json_graph.node_link_graph(data)

    sequences = seq_IO.read_sequences(nbunch_file)

    id_seq = networkx.get_node_attributes(G, "sequence")

    # Invert node-id -> sequence so sequences can be looked up.
    seq_id = {seq: node_id for node_id, seq in id_seq.items()}

    nbunch = [seq_id[seq] for seq in sequences]

    # NetworkX 1.x returns a dict here, 2.x a DegreeView without ``.items()``;
    # ``dict(...)`` accepts both.
    degrees = dict(networkx.degree(G, nbunch))

    with open("{0}_degree.txt".format(output_prefix), 'w') as o:
        o.write("\n".join(["{0},{1}".format(id_seq[k], str(d)) for k, d in degrees.items()]))
예제 #36
0
def main(json_file, output_prefix, nbunch_file):
    """Look up the degree of the listed sequences in a node-link graph.

    Output is one ``sequence,degree`` line per requested node, written to
    ``<output_prefix>_degree.txt``.

    Args:
        json_file: node-link JSON description of the graph.
        output_prefix: prefix of the output file name.
        nbunch_file: file of sequences to report; a sequence that is not a
            ``"sequence"`` attribute of some node raises ``KeyError``.
    """
    with open(json_file) as data_file:
        data = json.load(data_file)

    G = json_graph.node_link_graph(data)

    sequences = seq_IO.read_sequences(nbunch_file)

    id_seq = networkx.get_node_attributes(G, "sequence")

    # Reverse mapping: sequence -> node id.
    seq_id = {seq: node_id for node_id, seq in id_seq.items()}

    nbunch = [seq_id[seq] for seq in sequences]

    # ``networkx.degree`` yields a plain dict on NetworkX 1.x but a
    # DegreeView (no ``.items()``) on 2.x; normalise to a dict.
    degrees = dict(networkx.degree(G, nbunch))

    with open("{0}_degree.txt".format(output_prefix), 'w') as o:
        o.write("\n".join(
            ["{0},{1}".format(id_seq[k], str(d)) for k, d in degrees.items()]))
예제 #37
0
def main(list_sequence_names, output_prefix, source):
    """Write the fraction returned for the first two DNA shells of a source.

    The cleaved sequences are reverse-translated to DNA and an adjacency list
    is built with ``dna_conv.adj_list_cleaved``.  Starting from the
    reverse-translated ``source``, ``find_fraction_for_shell`` is applied for
    shells 1 and 2; shell 0 is recorded as ``1/total``.  Rows ``shell,frac``
    are written to ``<output_prefix>_<source>.csv``.

    Args:
        list_sequence_names: iterable of ``[filename, label]`` pairs; must
            contain the labels ``"CLEAVED"`` and ``"UNCLEAVED"``.
        output_prefix: prefix of the output file name.
        source: sequence passed to ``dna_conv.rev_translate``.
    """
    labels = []
    list_sequences = []
    for filename, label in list_sequence_names:
        list_sequences.append(seq_IO.read_sequences(filename))
        labels.append(label)

    print("Read in Sequences at: {0}".format(datetime.datetime.now()))

    cleaved_ind = labels.index("CLEAVED")
    # Result unused; kept because it raises ValueError when the label is
    # missing.
    labels.index("UNCLEAVED")

    cleaved_dna = set()
    for aa_seq in list_sequences[cleaved_ind]:
        cleaved_dna.update(dna_conv.rev_translate(aa_seq))

    print("Converted to dna at: {0} for # sequences: {1}".format(datetime.datetime.now(), len(cleaved_dna)))

    adj_list_cleaved = dna_conv.adj_list_cleaved(cleaved_dna, cleaved_dna)

    print("Created Adj List and Fracs at: {0}".format(datetime.datetime.now()))

    total = float(len(cleaved_dna))

    # Shell 0 is the source itself.
    shells = [0]
    shell_fracs = [1 / total]

    frontier = dna_conv.rev_translate(source)

    for shell in (1, 2):
        frac, frontier = find_fraction_for_shell(frontier, adj_list_cleaved, total)
        shells.append(shell)
        shell_fracs.append(frac)

    print("Found Fracs for Cleaved Sequences at: {0}".format(datetime.datetime.now()))

    rows = []
    for x, y in zip(shells, shell_fracs):
        rows.append("{0},{1}".format(str(x), str(y)))

    with open("{0}_{1}.csv".format(output_prefix, source), 'w') as f:
        f.write("\n".join(rows))
예제 #38
0
def main(sequence_ratio_file, width, height, pattern, legend):
    """Draw a bar chart of per-sequence ratios with error bars.

    Record fields are used as: 0 sequence (tick label), 1 bar height,
    2 error bar, 3 label, and optionally 4 colour.  Without a fifth field the
    colour is derived from the label with ``convert_label_color``.

    Args:
        sequence_ratio_file: file read with ``seq_IO.read_sequences``
            (``additional_params=True``); also passed to ``pconv.save_fig``.
        width: figure width passed to ``pconv.save_fig``.
        height: figure height passed to ``pconv.save_fig``.
        pattern: forwarded to ``bar.draw_actual_plot``.
        legend: when true, a legend is drawn for labels other than
            ``"CLEAVED"``, ``"MIDDLE"`` and ``"UNCLEAVED"``.
    """
    rows = seq_IO.read_sequences(sequence_ratio_file, additional_params=True)

    seqs = [row[0] for row in rows]
    avg_ratio = [row[1] for row in rows]
    std = [row[2] for row in rows]
    label = [row[3] for row in rows]

    has_color_column = len(rows[0]) > 4
    if has_color_column:
        color = [row[4] for row in rows]
    else:
        color = [convert_label_color(l) for l in label]

    # The error bars are the supplied third field, used unmodified.
    err = std

    fig, axarr = pconv.create_ax(1, 1, shx=True, shy=True)

    plot_kwargs = dict(tick_label=seqs, yerr=err, pattern=pattern)
    lgd = None

    if legend:
        # The three class labels get no legend entry.
        plot_kwargs["label"] = [
            None if l in ["CLEAVED", "MIDDLE", "UNCLEAVED"] else l
            for l in label
        ]
        patches, labels = bar.draw_actual_plot(
            axarr[0, 0], avg_ratio, color, "", "", "FLAG/HA Ratio",
            **plot_kwargs)
        lgd = axarr[0, 0].legend(patches,
                                 labels,
                                 loc="upper center",
                                 bbox_to_anchor=(0.5, 1.05),
                                 borderaxespad=0.,
                                 prop={'size': 9},
                                 ncol=2,
                                 fancybox=True)
        print(patches)
        print(labels)
    else:
        bar.draw_actual_plot(
            axarr[0, 0], avg_ratio, color, "", "", "FLAG/HA Ratio",
            **plot_kwargs)

    axarr[0, 0].set_ylim([0, 1.3])
    pconv.save_fig(fig, sequence_ratio_file, "plot", width, height,
                   tight=True, size=10, extra_artists=lgd)
예제 #39
0
def main(seq_file, canonical_file, output_prefix):
    """Plot, per canonical sequence, the fraction returned after 1..22 steps.

    For every canonical sequence a graph is built over the sequences in
    ``seq_file`` (edges join sequences with ``gsconv.hamdist < 2``).  If the
    canonical sequence is not in the file, it and its one-mutation neighbours
    are appended.  The matrix from ``trans_matrix`` is advanced step by step
    with ``square_matrix`` and the fraction reported at each step is
    collected and plotted.

    Args:
        seq_file: text file with one sequence per line.
        canonical_file: file of canonical sequences.
        output_prefix: passed to ``conv.save_fig``.
    """
    import sys  # local import: only needed for the print threshold below

    series = []

    canonical_list_seq = seq_IO.read_sequences(canonical_file)

    print("Beginning Script: {0}".format(datetime.datetime.now()))

    # The sequence file is the same for every canonical: read it once.
    with open(seq_file) as strings:
        seq_list = strings.read().splitlines()
    orig_len = len(seq_list)

    # Never abbreviate printed arrays.  The former ``threshold='nan'`` is
    # rejected by current NumPy, which requires a numeric threshold.
    numpy.set_printoptions(threshold=sys.maxsize)

    for canonical in canonical_list_seq:

        seq_ind_list = [(seq, ind) for ind, seq in enumerate(seq_list)]
        if canonical not in seq_list:
            # Append the one-mutation neighbours, then the canonical itself,
            # numbering them after the sequences from the file.
            one_away = gsconv.gen_hamdist_one(canonical)
            one_away = [o for o in one_away if o != canonical] + [canonical]
            seq_ind_list = seq_ind_list + [
                (o, ind) for (ind, o) in enumerate(one_away, orig_len)
            ]

        edges = [(seq2, seq)
                 for seq, seq2 in itertools.combinations(seq_ind_list, 2)
                 if gsconv.hamdist(seq2[0], seq[0]) < 2]
        print(len(seq_ind_list))
        print("Generated Edges: {0}".format(datetime.datetime.now()))

        canon_ind = [i for (s, i) in seq_ind_list if s == canonical][0]

        T_mat = trans_matrix(seq_ind_list, edges)

        x = [0]
        y = [0]

        print("Transformed Matrix: {0}".format(datetime.datetime.now()))

        x.append(1)
        y.append(find_frac(T_mat, canon_ind, orig_len))

        T_mat_new = T_mat

        for i in range(2, 23):
            x.append(i)
            T_mat_new, frac = square_matrix(T_mat_new, T_mat, canon_ind,
                                            orig_len)
            y.append(frac)

            print("Raised Matrix {0}: {1}".format(i, datetime.datetime.now()))

        series.append([x, y, canonical])

    fig, ax = conv.create_ax(1, 1)

    color = ['orange', 'palevioletred', 'mediumaquamarine', 'deepskyblue']

    scatterplot.plot_series(ax[0, 0],
                            series,
                            title="",
                            x_axis="Number of Steps",
                            colors=color,
                            y_axis="Fraction Cleaved Variants Reached",
                            alpha=0.85,
                            connect_dots=True,
                            size=15,
                            edgecolors='k',
                            linewidth=0)
    ax[0, 0].set_xlim(xmin=1)
    ax[0, 0].set_ylim(ymin=0.0, ymax=1.0)
    ax[0, 0].set_xticks(range(1, 23, 3))
    lgd = conv.add_legend(ax[0, 0],
                          location='upper center',
                          bbox_to_anchor=(0.5, 1.05),
                          ncol=2,
                          size=8)
    conv.save_fig(fig,
                  output_prefix,
                  "fraction_func",
                  2.5,
                  3,
                  size=9.5,
                  extra_artists=lgd)

    print("Outputted Figure: {0}".format(datetime.datetime.now()))
예제 #40
0
def main(list_sequence_names, hamming_dist, output_prefix, canonical_file):
    """Write node/edge CSV tables for a randomly rewired sequence network.

    Edges are first found between records whose sequences are exactly
    ``hamming_dist`` apart.  Each edge then keeps its source, but its target
    is redrawn: a target fitness is sampled from the observed
    source-fitness -> target-fitness frequencies, and a random record is
    picked from the list whose label is ``conv_fitness_label(fitness)``.
    Edge and node files are written as
    ``<output_prefix>_<canonical>_edges.csv`` and
    ``<output_prefix>_nodes.csv``.

    Args:
        list_sequence_names: iterable of ``[filename, label]`` pairs; records
            are unpacked as ``(sequence, fitness)`` and fitness is expected to
            be one of 2, 1.5 or 1 (other values raise ``KeyError``).
        hamming_dist: exact ``conv.hamdist`` required for an edge.
        output_prefix: prefix of every output file name.
        canonical_file: currently unused; the canonical list is hard-coded to
            ``['DEMEE']``.
    """
    list_sequences = []  # one list of records per label
    extended_list_sequences = []  # all records, flattened
    labels = []  # labels matching list_sequences

    canonical_seqs = ['DEMEE']

    for filename, label in list_sequence_names:
        sequences = seq_IO.read_sequences(filename, additional_params=True, ind_type={1: float})
        # Third field: is the sequence within distance 2 of any canonical?
        new_seqs = [
            (seq, fitness, min([conv.hamdist(seq, can) for can in canonical_seqs]) <= 2)
            for seq, fitness in sequences
        ]
        list_sequences.append(new_seqs)
        extended_list_sequences.extend(new_seqs)
        labels.append(label)

    outfile_nodes = '%s_nodes.csv' % (output_prefix)

    edges = [
        (seq2, seq)
        for seq, seq2 in itertools.combinations(extended_list_sequences, 2)
        if conv.hamdist(seq2[0], seq[0]) == hamming_dist
    ]

    fitness_levels = [2, 1.5, 1]

    # tallies[source_fitness][target_fitness] = number of observed edges.
    tallies = {src: {dst: 0 for dst in fitness_levels} for src in fitness_levels}

    for edge in edges:
        tallies[edge[0][1]][edge[1][1]] += 1

    frequencies = {src: {} for src in fitness_levels}

    for source, tallies_dict in tallies.items():
        n_tallies = float(sum(tallies_dict.values()))
        if n_tallies == 0:
            # No edge starts at this fitness, so its frequencies are never
            # consulted below; skipping avoids a division by zero.
            continue
        frequencies[source] = {k: v / n_tallies for k, v in tallies_dict.items()}

    new_edges = []

    for edge in edges:
        fitness_source = edge[0][1]
        fitness_target = np.random.choice(
            fitness_levels,
            p=[frequencies[fitness_source][level] for level in fitness_levels])
        seqs = list_sequences[labels.index(conv_fitness_label(fitness_target))]
        # ``randint``'s upper bound is exclusive: ``len(seqs)`` lets every
        # record be drawn (``len(seqs) - 1`` excluded the last one and failed
        # for single-element lists).
        new_edges.append((edge[0], seqs[np.random.randint(0, len(seqs))]))

    edges = new_edges

    for canonical_seq in canonical_seqs:
        outfile_edges = '%s_%s_edges.csv' % (output_prefix, canonical_seq)
        # ``with`` guarantees the file is closed even if a row fails.
        with open(outfile_edges, "w") as edges_out:
            edges_out.write("Source,Target,Weight\n")
            print(canonical_seq)
            for (seq1, fit1, can1), (seq2, fit2, can2) in edges:
                dist_seq1 = conv.hamdist(canonical_seq, seq1)
                dist_seq2 = conv.hamdist(canonical_seq, seq2)
                if dist_seq1 < dist_seq2:
                    seq_lower, fit_lower = seq1, fit1
                    seq_upper, fit_upper = seq2, fit2
                else:
                    # Ties land here: seq2 is treated as the closer endpoint.
                    seq_lower, fit_lower = seq2, fit2
                    seq_upper, fit_upper = seq1, fit1
                # Avoid dividing by a zero or negative fitness.
                if not fit_upper > 0:
                    fit_upper = 0.001
                # NOTE(review): original author questioned whether this
                # source -> target direction is the intended one.
                edges_out.write("{0},{1},{2}\n".format(
                    seq_lower, seq_upper, fit_lower / float(fit_upper)))

    # Sequences already emitted for an earlier label are not written again.
    already_written_nodes = set()

    with open(outfile_nodes, "w") as nodes_out:
        nodes_out.write("Id,Label,Type,Fitness,Canonical\n")
        for seqs, label in zip(list_sequences, labels):
            nodes_out.write("\n".join(
                "{0},{0},{1},{2},{3}".format(x, label, fitness, can)
                for (x, fitness, can) in seqs
                if x not in already_written_nodes))
            already_written_nodes.update(s[0] for s in seqs)
            nodes_out.write("\n")
예제 #41
0
def main(list_sequence_names, canonical_list, output_prefix):
    """Histogram the distances from each cleaved sequence to the uncleaved set.

    For every sequence read from the file labelled "CLEAVED", conv.hamdist is
    evaluated against every sequence from the file labelled "UNCLEAVED".  The
    smallest, mean and largest of those distances are collected per cleaved
    sequence and drawn as three histograms saved as "dist_from_bounds".

    Args:
        list_sequence_names: (filename, label) pairs.  Only the first file
            labelled "CLEAVED" and the first labelled "UNCLEAVED" are read.
        canonical_list: file of sequences; a cleaved sequence that also occurs
            in it has its three statistics printed to stdout.
        output_prefix: prefix handed to pconv.save_fig.
    """

    def first_file_with(label):
        # Raises IndexError when no entry carries the requested label.
        return [name for name, tag in list_sequence_names if tag == label][0]

    canonical_list_seq = seq_IO.read_sequences(canonical_list)
    cleaved_seqs = seq_IO.read_sequences(first_file_with("CLEAVED"))
    uncleaved_seqs = seq_IO.read_sequences(first_file_with("UNCLEAVED"))

    min_dist, avg_dist, max_dist = [], [], []

    for seq in cleaved_seqs:
        distances = [conv.hamdist(seq, other) for other in uncleaved_seqs]

        nearest = min(distances)
        min_dist.append(nearest)
        average = numpy.mean(distances)
        avg_dist.append(average)
        farthest = max(distances)
        max_dist.append(farthest)

        if seq in canonical_list_seq:
            for value in (seq, nearest, average, farthest):
                print(value)

    fig, ax = pconv.create_ax(1, 3)

    # One panel per statistic; the two strings are forwarded positionally to
    # hist.draw_actual_plot in the order given here.
    panels = (
        (min_dist, "Min. Distance from Boundary", "Minimum Distances"),
        (avg_dist, "Avg. Distance from Boundary", "Average Distances"),
        (max_dist, "Max. Distance from Boundary", "Maximum Distances"),
    )
    for row, (values, first_text, second_text) in enumerate(panels):
        hist.draw_actual_plot(ax[row, 0],
                              values,
                              first_text,
                              second_text,
                              log=False,
                              normed=True,
                              label=None,
                              nbins=15,
                              stacked=False)

    #ax[0,0].set_xlim(xmin=1,xmax=5)
    #ax[0,0].set_xticks(xrange(1,6))
    pconv.save_fig(fig, output_prefix, "dist_from_bounds", 18, 6, size=15)
예제 #42
0
def main(list_nodes, output_prefix, metric):
    """Plot metric histograms for cleaved sequences grouped by recurrence.

    Cleaved sequences are grouped by the number of input data sets (1 to 4)
    in which they are labelled "CLEAVED"; one histogram panel is drawn per
    plotted metric with one series per group.  A second, narrow figure holding
    a "Blues" gradient is saved as "colorbar".

    Args:
        list_nodes: (nodes_file, label) pairs; each file is read with
            seq_IO.read_sequences(..., additional_params=True, header=True).
        output_prefix: prefix handed to pconv.save_fig.
        metric: "metrics" to plot every attribute of the reference sequence
            except the bookkeeping ones; otherwise the single attribute to
            plot.  "Fraction_Cleaved" is computed with
            average_fraction_neighbors_cleaved instead of being looked up.
    """
    cleaved_seq = {}
    uncleaved_seq = {}
    middle_seq = {}

    for nodes, label in list_nodes:
        sequences = seq_IO.read_sequences(nodes, additional_params=True, header=True)

        cleaved_seq[label] = {key: val for key, val in sequences.items() if val["type"] == "CLEAVED"}
        middle_seq[label] = {key: val for key, val in sequences.items() if val["type"] == "MIDDLE"}
        uncleaved_seq[label] = {key: val for key, val in sequences.items() if val["type"] == "UNCLEAVED"}

    if metric == "metrics":
        labels_non_plot = ["label", "fitness", "type", "canonical"]
        # NOTE(review): attribute names come from the hard-coded sequence
        # "DEMEE" of the LAST file read; this needs a non-empty list_nodes
        # and that sequence to be present in the final file.
        orig_labels_to_plot = sorted([key for key in sequences["DEMEE"].keys() if key not in labels_non_plot])
        labels_to_plot = list(orig_labels_to_plot)
    else:
        orig_labels_to_plot = [metric]
        labels_to_plot = [metric]

    n_to_plot = len(labels_to_plot)
    fig, axarr = pconv.create_ax(n_to_plot, 1, shx=False, shy=False)

    nbins = 10

    # A sequence cleaved in k data sets occurs k times in list_seqs (and so in
    # its group list); the duplicates are kept on purpose to leave the inputs
    # of list_metrics / average_fraction_neighbors_cleaved unchanged.
    list_seqs = [k for d in cleaved_seq.values() for k in d.keys()]
    count_seqs = Counter(list_seqs)
    seqs_by_count = {}
    for n_sets in (4, 3, 2, 1):
        seqs_by_count[n_sets] = [s for s in list_seqs if count_seqs[s] == n_sets]

    metrics_by_count = {}
    if metric != "Fraction_Cleaved":
        for n_sets in (4, 3, 2, 1):
            metrics_by_count[n_sets] = list_metrics(cleaved_seq, seqs_by_count[n_sets], orig_labels_to_plot)

    for ind, key in enumerate(labels_to_plot):
        log = key == "pageranks"
        if key == "Fraction_Cleaved":
            group_order = (4, 3, 2, 1)
            data = [average_fraction_neighbors_cleaved(cleaved_seq, uncleaved_seq, middle_seq, seqs_by_count[n_sets])
                    for n_sets in group_order]
        else:
            # NOTE(review): this branch lists the groups in ascending order
            # while the branch above uses descending order, so the same colour
            # stands for different groups in the two cases - confirm intent.
            group_order = (1, 2, 3, 4)
            data = [get_data_from_dict(metrics_by_count[n_sets], key) for n_sets in group_order]
        normed = True

        # Labels are derived from the series actually passed so that each one
        # names its own group (the former fixed list still started with a
        # "Cl. by 5" entry whose data is no longer plotted).
        series_labels = ["Cl. by {0}".format(n_sets) for n_sets in group_order]
        colors = [tuple(c) for c in plt.cm.Blues(np.linspace(0.2, 1, 4)).tolist()]

        hist.draw_actual_plot(axarr[0, ind], data, "", key.capitalize(), colors=colors, log=log,
                              normed=normed, label=series_labels, nbins=nbins)
        axarr[0, ind].ticklabel_format(axis='x', style='sci', scilimits=(-2, 2))

        #pconv.add_legend(axarr[0,ind], location="upper right")
    pconv.save_fig(fig, output_prefix, metric, n_to_plot*3, 3, tight=True, size=9)

    fig_bar, axarr_bar = pconv.create_ax(1, 1, shx=False, shy=False)

    # Two identical columns of 256 values running from 1 down to 0.2.
    gradient = np.linspace(1, 0.2, 256)
    gradient = np.array(list(zip(gradient, gradient)))
    axarr_bar[0, 0].imshow(gradient, aspect='auto', cmap=plt.get_cmap('Blues'))
    #axarr_bar[0,0].set_axis_off()
    plt.tick_params(
        axis='both',        # changes apply to both axes
        which='both',       # both major and minor ticks are affected
        bottom='off',       # ticks along the bottom edge are off
        top='off',          # ticks along the top edge are off
        labelbottom='off',  # labels along the bottom edge are off
        left='off',         # ticks along the left edge are off
        right='off',        # ticks along the right edge are off
        labelright='off')   # labels along the right edge are off

    pconv.save_fig(fig_bar, output_prefix, "colorbar", 0.3, 3, tight=True)
예제 #43
0
def main(list_sequence_names, output_prefix):
    """Compute epistasis from each cleaved sequence to every other sequence.

    Records are read as (sequence, fitness, ratio) tuples.  Every record of
    the "CLEAVED" file is paired with every record whose sequence differs.
    Each unordered pair is processed once, and the record drawn from the
    "CLEAVED" file is always the starting point.  Pairs with conv.hamdist <= 1
    are skipped, as are pairs for which get_inter_fitness reports a None
    fitness.  One row per remaining pair is written to
    "<output_prefix>_epi.csv"; "<output_prefix>_epi_double.csv" is created
    but left empty.

    Args:
        list_sequence_names: (filename, label) pairs; one label must be
            "CLEAVED".
        output_prefix: prefix of the two output files.
    """
    list_sequences = []  # list of list of records, one entry per label
    extended_list_sequences = []  # flat list of all records
    labels = []  # labels for list_sequences

    for filename, label in list_sequence_names:
        sequences = seq_IO.read_sequences(filename,
                                          additional_params=True,
                                          ind_type={
                                              1: float,
                                              2: float
                                          })
        print(sequences[0:10])
        list_sequences.append(sequences)
        extended_list_sequences.extend(sequences[:])
        labels.append(label)

    print(len(extended_list_sequences))
    dict_seq_fit = {}
    dict_seq_ratio = {}
    for seq, fitness, ratio in extended_list_sequences:
        dict_seq_fit[seq] = fitness
        dict_seq_ratio[seq] = ratio
    print(len(dict_seq_fit))

    # Both files are opened up front so an unwritable prefix fails before the
    # long computation, and closed even if that computation raises.
    with open('%s_epi_double.csv' % (output_prefix), "w"), \
            open('%s_epi.csv' % (output_prefix), "w") as epi_out:

        # Double mutants (distance 2) sorted by the state of their two
        # intermediates.  Only the disabled double-mutant report consumed
        # these; they are still collected.
        mut_func = {
            "Both_Functional": [],
            "Both_Nonfunctional": [],
            "One_Functional": []
        }
        mut_nonfunc = {
            "Both_Functional": [],
            "Both_Nonfunctional": [],
            "One_Functional": []
        }

        # Ordered (start, end) pairs.  A plain tuple keeps the cleaved record
        # in first position; unpacking a frozenset would hand back its two
        # members in arbitrary order.  The reverse-orientation test still
        # collapses (a, b) / (b, a) when both records are cleaved.
        pairs = set()
        counter = 0
        counter_prod = 0
        cleaved_records = list_sequences[labels.index("CLEAVED")]
        for x, y in itertools.product(cleaved_records, extended_list_sequences):
            counter_prod += 1
            if x[0] == y[0]:
                continue
            counter += 1
            if (y, x) not in pairs:
                pairs.add((x, y))
        print(counter_prod)
        print(len(pairs))
        print(counter)
        print("done making set")

        epi = {}
        for can, seq_fit in pairs:
            canonical_seq = can[0]
            seq = seq_fit[0]
            fit = seq_fit[1]
            mut_dict = mut_func if fit == 1 else mut_nonfunc

            dist = conv.hamdist(canonical_seq, seq)
            if dist <= 1:
                continue
            list_inter, list_fit = get_inter_fitness(canonical_seq, seq,
                                                     dict_seq_fit)
            if None in list_fit:
                continue
            if dist == 2:
                sum_fit = sum(list_fit)
                if sum_fit > 1.95:
                    status = "Both_Functional"
                elif sum_fit < 0.05:
                    status = "Both_Nonfunctional"
                else:  # either one uncleaved or one middle
                    status = "One_Functional"
                mut_dict[status].append(
                    (canonical_seq, seq, list_inter, list_fit))
            epi[(canonical_seq, seq)] = (calc_epi(list_fit, fit), fit,
                                         list_fit, list_inter)
        print("done calc epi")

        # The double-mutant report (header "Starting,Starting_Ratio,Ending,
        # Ending_Ratio,Status_Ending,Status_Intermediates,Inter1_Seq,
        # Inter1_Fit,Inter1_Ratio,Inter2_Seq,Inter2_Fit,Inter2_Ratio", one row
        # per entry of mut_func / mut_nonfunc tagged End_Cleaved /
        # End_Uncleaved) is disabled, so its file stays empty.

        epi_out.write(
            "Starting,Starting_Ratio,Ending,Ending_Ratio,Ending_Fitness,Epistasis,List_Seqs_Fitnesses_Ratios_Intermediates\n"
        )
        rows = []
        for (can, seq), (e, fit, list_fit, list_inter) in epi.items():
            intermediates = ",".join([
                "{0},{1},{2}".format(s, fitness_to_str(f), dict_seq_ratio[s])
                for f, s in zip(list_fit, list_inter)
            ])
            rows.append("{0},{1},{2},{3},{4},{5},{6}".format(
                can, dict_seq_ratio[can], seq, dict_seq_ratio[seq],
                fitness_to_str(fit), e, intermediates))
        # Rows are newline-separated with no trailing newline.
        epi_out.write("\n".join(rows))

    print("done writing epi")
예제 #44
0
def main(list_sequence_names, output_prefix, index):
    """Write per-shell fractions for one 10000-sequence chunk of cleaved data.

    Chunk ``index`` (1-based) covers cleaved sequences
    [(index - 1) * 10000, index * 10000).  For each sequence in the chunk,
    find_fraction_for_shell is applied three times, starting from the
    sequence itself and feeding each call the neighbours returned by the
    previous one.  The three fractions are written to
    "<output_prefix>_cleaved_<index>.csv".

    Args:
        list_sequence_names: (filename, label) pairs; the labels "CLEAVED"
            and "UNCLEAVED" must both be present.
        output_prefix: prefix of the output CSV.
        index: 1-based chunk number.  If it selects no sequences, a message
            is printed and the function returns without writing a file.
    """
    chunk_size = 10000

    list_sequences = []  # list of list of sequences, one entry per label
    labels = []  # labels for list_sequences

    for filename, label in list_sequence_names:
        list_sequences.append(seq_IO.read_sequences(filename))
        labels.append(label)

    print("Read in Sequences at: {0}".format(datetime.datetime.now()))

    cleaved_ind = labels.index("CLEAVED")
    #middle_ind = labels.index("MIDDLE")
    uncleaved_ind = labels.index("UNCLEAVED")

    # Validate the chunk before the expensive adjacency / fraction build.
    # A bare `exit` expression does not stop anything, and a chunk starting
    # exactly at len() is empty too, hence `>=` and an explicit return.
    n_cleaved = len(list_sequences[cleaved_ind])
    start_ind = (index - 1) * chunk_size
    if index < 1 or start_ind >= n_cleaved:
        print("This index is not valid")
        return
    end_ind = min(index * chunk_size, n_cleaved)

    # NOTE(review): adjacency is built with ignore_middle=False while the
    # fractions use ignore_middle=True; kept as found - confirm intent.
    adj_list_cleaved = conv.adj_list(set(list_sequences[cleaved_ind]),
                                     set(list_sequences[uncleaved_ind]),
                                     set(),
                                     set(list_sequences[cleaved_ind]),
                                     ignore_middle=False)
    adj_list_uncleaved = conv.adj_list(set(list_sequences[cleaved_ind]),
                                       set(list_sequences[uncleaved_ind]),
                                       set(),
                                       set(list_sequences[uncleaved_ind]),
                                       ignore_middle=False)

    fracs_cleaved = conv.fraction_neighbors_all(list_sequences[cleaved_ind],
                                                list_sequences[uncleaved_ind],
                                                [],
                                                list_sequences[cleaved_ind],
                                                ignore_middle=True)
    fracs_uncleaved = conv.fraction_neighbors_all(
        list_sequences[cleaved_ind],
        list_sequences[uncleaved_ind], [],
        list_sequences[uncleaved_ind],
        ignore_middle=True)
    #fracs_middle = conv.fraction_neighbors_all(list_sequences[cleaved_ind], list_sequences[uncleaved_ind], list_sequences[middle_ind], list_sequences[middle_ind])

    print("Created Adj List and Fracs at: {0}".format(datetime.datetime.now()))

    # Merge the uncleaved entries into the cleaved dictionaries so a single
    # lookup table covers both kinds of sequence.
    adj_list_cleaved.update(adj_list_uncleaved)
    fracs_cleaved.update(fracs_uncleaved)

    fracs_per_seq = {}
    for seq in list_sequences[cleaved_ind][start_ind:end_ind]:
        new_neighbors = [seq]
        fracs_per_seq[seq] = []
        for _ in range(3):
            frac, new_neighbors = find_fraction_for_shell(
                new_neighbors, adj_list_cleaved, fracs_cleaved)
            fracs_per_seq[seq].append(frac)

    print("Found Fracs for Cleaved Sequences at: {0}".format(
        datetime.datetime.now()))

    with open("{0}_cleaved_{1}.csv".format(output_prefix, index), 'w') as f:
        f.write("Sequence,1,2,3\n")
        f.write("".join([
            "{0},{1},{2},{3}\n".format(k, str(v[0]), str(v[1]), str(v[2]))
            for k, v in fracs_per_seq.items()
        ]))
예제 #45
0
def main(seq_file, canonical_file, output_prefix):
    """Plot, per canonical sequence, a fraction value against step count.

    For each canonical sequence, the sequences of ``seq_file`` are indexed.
    If the canonical one is not among them, it and the results of
    gsconv.gen_hamdist_one are appended.  Pairs with gsconv.hamdist < 2
    become edges, trans_matrix builds a matrix from them, and find_frac /
    square_matrix supply one y value per step for steps 1..22.  All series
    are drawn in one scatter plot saved as "fraction_func".

    Args:
        seq_file: text file with one sequence per line.
        canonical_file: file read with seq_IO.read_sequences; one series is
            produced per entry.
        output_prefix: prefix handed to conv.save_fig.
    """
    import sys  # function-scope import: only needed for set_printoptions

    series = []

    canonical_list_seq = seq_IO.read_sequences(canonical_file)

    print("Beginning Script: {0}".format(datetime.datetime.now()))

    # The sequence file does not change between canonical sequences, so it is
    # read and indexed once instead of once per loop iteration.
    with open(seq_file) as strings:
        seq_list = strings.read().splitlines()
    base_seq_ind_list = [(seq, ind) for ind, seq in enumerate(seq_list)]
    orig_len = len(base_seq_ind_list)

    # Print arrays in full.  sys.maxsize is the value NumPy documents for
    # this; the string 'nan' is rejected by current NumPy releases.
    numpy.set_printoptions(threshold=sys.maxsize)

    for canonical in canonical_list_seq:
        # Fresh list per canonical sequence so additions never leak into the
        # next iteration.
        seq_ind_list = list(base_seq_ind_list)
        if canonical not in seq_list:
            one_away = gsconv.gen_hamdist_one(canonical)
            # The canonical sequence itself goes last, after its neighbours.
            one_away = [o for o in one_away if o != canonical] + [canonical]
            seq_ind_list += [(o, ind) for (ind, o) in enumerate(one_away, orig_len)]

        edges = [(seq2, seq)
                 for seq, seq2 in itertools.combinations(seq_ind_list, 2)
                 if gsconv.hamdist(seq2[0], seq[0]) < 2]
        print(len(seq_ind_list))
        print("Generated Edges: {0}".format(datetime.datetime.now()))

        canon_ind = [i for (s, i) in seq_ind_list if s == canonical][0]

        T_mat = trans_matrix(seq_ind_list, edges)
        #print raise_matrix(T_mat,1)
        #print raise_matrix(T_mat,3)
        #T = raise_matrix(T_mat,10)
        #T = raise_matrix(T_mat,20)
        x = [0]
        y = [0]

        print("Transformed Matrix: {0}".format(datetime.datetime.now()))

        x.append(1)
        y.append(find_frac(T_mat, canon_ind, orig_len))

        T_mat_new = T_mat

        for i in range(2, 23):
            x.append(i)
            T_mat_new, frac = square_matrix(T_mat_new, T_mat, canon_ind, orig_len)
            y.append(frac)

            print("Raised Matrix {0}: {1}".format(i, datetime.datetime.now()))

        series.append([x, y, canonical])

    fig, ax = conv.create_ax(1, 1)

    color = ['orange', 'palevioletred', 'mediumaquamarine', 'deepskyblue']

    scatterplot.plot_series(ax[0, 0], series, title="", x_axis="Number of Steps", colors=color,
                            y_axis="Fraction Cleaved Variants Reached", alpha=0.85, connect_dots=True,
                            size=15, edgecolors='k', linewidth=0)
    ax[0, 0].set_xlim(xmin=1)
    ax[0, 0].set_ylim(ymin=0.0, ymax=1.0)
    ax[0, 0].set_xticks(range(1, 23, 3))
    lgd = conv.add_legend(ax[0, 0], location='upper center', bbox_to_anchor=(0.5, 1.05), ncol=2, size=8)
    conv.save_fig(fig, output_prefix, "fraction_func", 2.5, 3, size=9.5, extra_artists=lgd)

    print("Outputted Figure: {0}".format(datetime.datetime.now()))
예제 #46
0
def main(data_file, output_prefix, degree_file, width, height):
    """Draw a histogram of mean "Frac" values and a per-degree bar chart.

    Each sequence's "Frac" values are averaged.  The averages are plotted as
    a 30-bin histogram (saved as "hist").  They are also grouped by the
    sequence's 'Degree' entry from ``degree_file``; the group means are drawn
    as bars with the group standard deviations as error bars (saved as "bar").

    Args:
        data_file: file read with additional_params, header and list_vals
            enabled; every entry must provide a "Frac" collection.
        output_prefix: prefix handed to pconv.save_fig.
        degree_file: file read with additional_params and header enabled;
            must hold a 'Degree' entry for every sequence of ``data_file``.
        width: figure width forwarded to pconv.save_fig.
        height: figure height forwarded to pconv.save_fig.
    """
    sequences = seq_IO.read_sequences(data_file,
                                      additional_params=True,
                                      header=True,
                                      list_vals=True)
    seq_degree = seq_IO.read_sequences(degree_file,
                                       additional_params=True,
                                       header=True)

    # Single pass: the per-sequence mean feeds both the overall histogram
    # data and the per-degree groups.
    degree_frac = defaultdict(list)
    data = []
    for seq, seq_dict in sequences.items():
        bucket = degree_frac[seq_degree[seq]['Degree']]
        mean_frac = np.mean(seq_dict["Frac"])
        bucket.append(mean_frac)
        data.append(mean_frac)

    # Fix the degree order once so bar heights, error bars and tick labels
    # all line up.
    degrees = list(degree_frac.keys())
    degree_frac_avg = [np.mean(degree_frac[degree]) for degree in degrees]
    degree_frac_std = [np.std(degree_frac[degree]) for degree in degrees]

    fig, axarr = pconv.create_ax(1, 1, shx=False, shy=False)
    hist.draw_actual_plot(axarr[0, 0], data, "", "",
                          normed=False, nbins=30, edgecolor=None, log=False)
    #axarr[0,0].ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
    pconv.save_fig(fig, output_prefix, "hist", width, height,
                   tight=True, size=10)

    fig2, axarr2 = pconv.create_ax(1, 1, shx=True, shy=True)
    bar.draw_actual_plot(axarr2[0, 0], degree_frac_avg, 'g', "", "Degree",
                         "Fraction Shortest Path Uncleaved",
                         tick_label=degrees,
                         yerr=degree_frac_std)
    #axarr[0,0].set_ylim([0,1.3])
    pconv.save_fig(fig2, output_prefix, "bar", width, height,
                   tight=True, size=10)
예제 #47
0
def main(json_file, output_prefix, novel_seqs_file, canonical_file):
    """Report the uncleaved fraction along shortest paths to canonical seqs.

    A node-link JSON graph is loaded.  For every novel sequence, the
    canonical sequence(s) at minimal conv.hamdist are selected and one
    shortest path between the two graph nodes is computed.  The fraction of
    that path's intermediate nodes (endpoints excluded) whose "status" is
    "UNCLEAVED" is written to
    "<output_prefix>_frac_paths_<novel basename>_<canonical basename>.txt",
    one "novel,canonical,fraction" line per pair.

    Args:
        json_file: node-link JSON whose nodes carry "sequence" and "status".
        output_prefix: prefix of the output file.
        novel_seqs_file: file of novel sequences (must be nodes of the graph).
        canonical_file: file of canonical sequences (must be nodes of the
            graph and must not be empty).
    """
    print("Started Script: {0}".format(datetime.datetime.now()))

    with open(json_file) as data_file:
        data = json.load(data_file)

    G = json_graph.node_link_graph(data, directed=False)

    print("Finished Reading in Graph: {0}".format(datetime.datetime.now()))

    id_seq = networkx.get_node_attributes(G, "sequence")
    id_status = networkx.get_node_attributes(G, "status")
    seq_id = {seq: node_id for node_id, seq in id_seq.items()}

    print("Created inverse lookup table: {0}".format(datetime.datetime.now()))

    novel_seqs = seq_IO.read_sequences(novel_seqs_file)
    canonical_seqs = seq_IO.read_sequences(canonical_file)

    novel_fracs = {}

    print("Ready to enter loop: {0}".format(datetime.datetime.now()))

    for n in novel_seqs:
        novel_fracs[n] = {}
        hamm_dist = sorted([(conv.hamdist(n, c), c) for c in canonical_seqs])
        min_hamm_dist = hamm_dist[0][0]
        print("Found hamming distances: {0}".format(datetime.datetime.now()))

        for hamm, c in hamm_dist:
            # Only the closest canonical sequences are analysed; the list is
            # sorted by distance, so the first larger one ends the scan.
            if hamm != min_hamm_dist:
                break
            novel_fracs[n][c] = []
            #generate list of 5 paths
            #paths = itertools.islice(networkx.all_shortest_paths(G, seq_id[n], seq_id[c]), 5)
            paths = [networkx.shortest_path(G, seq_id[n], seq_id[c])]

            for path in paths:
                inter_nodes = path[1:-1]
                n_uncleaved = sum(1 for node_id in inter_nodes
                                  if id_status[node_id] == "UNCLEAVED")
                # Adjacent or identical sequences have no intermediate nodes.
                # No uncleaved node is crossed, so report 0.0 instead of
                # dividing by zero.
                if inter_nodes:
                    frac = float(n_uncleaved) / len(inter_nodes)
                else:
                    frac = 0.0
                novel_fracs[n][c].append(frac)

    base_n_file = os.path.basename(os.path.splitext(novel_seqs_file)[0])
    base_c_file = os.path.basename(os.path.splitext(canonical_file)[0])

    out_name = "{0}_frac_paths_{1}_{2}.txt".format(output_prefix, base_n_file,
                                                   base_c_file)
    with open(out_name, 'w') as o:
        for n, c_dict in novel_fracs.items():
            for c, fracs_list in c_dict.items():
                o.write("{0},{1},".format(n, c))
                o.write(",".join(map(str, fracs_list)))
                o.write("\n")

    print("Output paths: {0}".format(datetime.datetime.now()))
예제 #48
0
def main(list_nodes, output_prefix, metric, create_keys=False):
    """Plot per-metric histograms split into cleaved / middle / uncleaved.

    The nodes file is read once and partitioned by each entry's "type"
    ("CLEAVED", "MIDDLE", "UNCLEAVED").  One histogram panel with those three
    series is drawn per plotted metric, and the figure is saved under the
    name ``metric``.

    Args:
        list_nodes: nodes file handed to seq_IO.read_sequences.
        output_prefix: prefix handed to pconv.save_fig.
        metric: "metrics" to plot every attribute of the reference sequence
            except the bookkeeping ones; otherwise the single attribute to
            plot.
        create_keys: when true, ``create_keys=True`` is forwarded to
            seq_IO.read_sequences; otherwise the argument is not passed.

    Raises:
        NotImplementedError: if "Fraction_Cleaved" is among the metrics to
            plot (its data computation is disabled).
    """
    read_kwargs = {"additional_params": True, "header": True}
    if create_keys:
        read_kwargs["create_keys"] = True
    sequences = seq_IO.read_sequences(list_nodes, **read_kwargs)

    cleaved_seq = {
        key: val
        for key, val in sequences.items() if val["type"] == "CLEAVED"
    }
    middle_seq = {
        key: val
        for key, val in sequences.items() if val["type"] == "MIDDLE"
    }
    uncleaved_seq = {
        key: val
        for key, val in sequences.items() if val["type"] == "UNCLEAVED"
    }

    print(len(cleaved_seq))
    if metric == "metrics":
        labels_non_plot = ["label", "fitness", "type", "canonical", "timeset"]
        # NOTE(review): attribute names are taken from the hard-coded
        # sequence "YNYIN", which must therefore be present in the file.
        labels_to_plot = sorted([
            key for key in sequences["YNYIN"].keys()
            if key not in labels_non_plot
        ])
    else:
        labels_to_plot = [metric]

    # The data for "Fraction_Cleaved" used to come from
    # conv.fraction_neighbors_cleaved, but that computation is commented out.
    # Without it the plot loop would hit an unbound `data` or silently re-plot
    # the previous metric's data, so fail clearly before any figure is made.
    if "Fraction_Cleaved" in labels_to_plot:
        raise NotImplementedError(
            '"Fraction_Cleaved" cannot be plotted: its data computation is '
            'disabled in this script')

    n_to_plot = len(labels_to_plot)
    fig, axarr = pconv.create_ax(n_to_plot, 1, shx=False, shy=False)

    nbins = 10

    for ind, key in enumerate(labels_to_plot):
        log = key == "pageranks"
        data = [
            get_data_from_dict(cleaved_seq, key),
            get_data_from_dict(middle_seq, key),
            get_data_from_dict(uncleaved_seq, key)
        ]
        normed = True
        print(key)
        hist.draw_actual_plot(axarr[0, ind],
                              data,
                              "",
                              key.capitalize(),
                              log=log,
                              normed=normed,
                              label=["Cleaved", "Middle", "Uncleaved"],
                              nbins=nbins)
        axarr[0, ind].ticklabel_format(axis='x',
                                       style='sci',
                                       scilimits=(-2, 2))

    #pconv.add_legend(axarr[0,ind], location="middle right")
    pconv.save_fig(fig,
                   output_prefix,
                   metric,
                   n_to_plot * 2.5,
                   2.5,
                   tight=True,
                   size=9)