def get_flank_distributions(kplets_2d_list, neighborhood_path, target_profiles):

    org2weights = t.map_genome2weight()

    flanking_genes_count = []
    cog2gids = []
    gid2weight = dict()

    for kplets_list in kplets_2d_list:

        cur_flanking_genes_count = dict()
        cur_cog2gids = dict()

        for kplet in kplets_list:

            neighborhoods = [Neighborhood(os.path.join(neighborhood_path, f)) for f in kplet.files]

            for neighborhood in neighborhoods:
                for gene in neighborhood.genes:

                    gid2weight[int(gene.gid)] = org2weights[gene.organism]

                    for cogid in gene.cogid.split():
                        # if cogid in target_profiles:
                        #     continue
                        t.update_dictionary(cur_flanking_genes_count, cogid, org2weights[gene.organism])
                        t.update_dictionary_set(cur_cog2gids, cogid, set([int(gene.gid)]))

        flanking_genes_count.append(cur_flanking_genes_count)
        cog2gids.append(cur_cog2gids)

    return flanking_genes_count, cog2gids, gid2weight

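# ---------------------------------------------------------------------------
# The functions in this file are excerpts from a larger Python 2 code base and
# rely on module-level names that are not shown here: t (lib.utils.tools),
# gv (project paths), os, np (numpy), Neighborhood, and so on. Below is a
# minimal sketch of the two tools helpers used throughout, inferred purely
# from how they are called in this file; the real implementations in
# lib.utils.tools may differ.
# ---------------------------------------------------------------------------

def update_dictionary(d, key, value):
    # Accumulate value onto d[key] with "+=" semantics: numeric weights and
    # counts add up, and list values (as in kplet_list_to_file_summaries)
    # concatenate.
    if key in d:
        d[key] += value
    else:
        d[key] = value


def update_dictionary_set(d, key, value):
    # Union value into the set stored at d[key]. Call sites pass both a set
    # (get_flank_distributions) and a bare string (kplet_list_to_file_summaries),
    # so scalars are normalized to one-element sets here.
    if not isinstance(value, (set, frozenset)):
        value = set([value])
    if key in d:
        d[key].update(value)
    else:
        d[key] = set(value)
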
def arcog_profiles_pool_into_classes_pool(profile_community):

    _arcog2class = t.map_arcog2class()
    class_community = list()

    for profiles in profile_community:
        classes = dict()
        for k in profiles:
            if k in _arcog2class:
                t.update_dictionary(classes, _arcog2class[k], profiles[k])
        class_community.append(classes)

    return class_community

def merge_into_file_summaries(kplets, neighborhood_files_path, file2src_src2org_map, data_type='bacteria'):

    _org2weight = t.map_genome2weight()

    _file2kplets = dict()
    for kplet in kplets:
        for f in kplet.files:
            if f in _file2kplets:
                _file2kplets[f].append(kplet)
            else:
                _file2kplets[f] = [kplet]

    kplet_files = _file2kplets.keys()
    _file2src, _src2org = file2src_src2org_map(kplet_files)

    file_summaries = list()
    for f in kplet_files:
        _neighborhood = Neighborhood(os.path.join(neighborhood_files_path, f))
        _src = _file2src[f]
        _org = _src2org[_src]
        _weight = _org2weight[_org]
        # Renamed from kplets to avoid shadowing the function argument.
        _kplets = _file2kplets[f]
        _neighborhood.extend_flanks(10, os.path.join(gv.pty_data_path, _org, "%s.pty" % _src), _gid2arcog_cdd)
        file_summaries.append(NeighborhoodFileSummary(f, _kplets, _neighborhood, _org, _src, _weight))

    # file_summaries = trim_file_summary_list(file_summaries, data_type)
    # file_summaries = [fs for fs in file_summaries if fs]

    # Updating the map _file2src after trimming.
    # new_file_list = [fs.file_name for fs in file_summaries]
    # for _file_name in _file2src.keys():
    #     if _file_name not in new_file_list:
    #         del _file2src[_file_name]

    # if len(file_summaries) < 2:
    #     return None, None, None, None, None, None

    file_summaries.sort(key=lambda x: x.weight, reverse=True)

    community_count_with_flanks = {}
    community_count = {}
    total_weight = 0

    for cur_file_summary in file_summaries:
        _weight = _org2weight[cur_file_summary.org]
        total_weight += _weight
        for gene in cur_file_summary.neighborhood.genes:
            if gene.tag == 'flank':
                for k in gene.cogid.split():
                    t.update_dictionary(community_count_with_flanks, k, _weight)
            else:
                for k in gene.cogid.split():
                    t.update_dictionary(community_count_with_flanks, k, _weight)
                    t.update_dictionary(community_count, k, _weight)

    community = []
    return _src2org, file_summaries, community, community_count, community_count_with_flanks, total_weight

def cdd_profile_count_into_class_count(community_count):

    _cdd2class = t.map_cdd2class()

    class_count = dict()
    class2profiles = dict()

    for k in community_count:
        if k in _cdd2class:
            _classes = _cdd2class[k]
            for _class in _classes:
                t.update_dictionary(class_count, _class, community_count[k])
                t.update_dictionary(class2profiles, _class, [k])
        else:
            _class = 'Unclassified'
            t.update_dictionary(class_count, _class, community_count[k])
            t.update_dictionary(class2profiles, _class, [k])

    return class_count, class2profiles

def arcog_profile_count_into_class_count(community_count):

    _arcog2class = t.map_arcog2class()

    class2count = dict()
    class2profiles = dict()

    for k in community_count:
        if k in _arcog2class:
            _classes = _arcog2class[k]
            for _class in _classes:
                t.update_dictionary(class2count, _class, community_count[k])
                # t.update_dictionary_list_value(class2profiles, _class, k)
                t.update_dictionary(class2profiles, _class, [k])
        else:
            _class = 'Unclassified'
            t.update_dictionary(class2count, _class, community_count[k])
            # t.update_dictionary_list_value(class2profiles, _class, k)
            t.update_dictionary(class2profiles, _class, [k])

    return class2count, class2profiles

from lib.utils import tools as t

cdd_file = '/Users/hudaiber/data/CDD/all_Prok1402.ccp.csv'

gnm2weight = t.map_genome2weight()

profile2count = {}
profile2weight = {}
missing = []

for l in open(cdd_file):
    terms = l.split(',')
    org = terms[1]
    profile = terms[6]
    if org in gnm2weight:
        t.update_dictionary(profile2count, profile, 1)
        t.update_dictionary(profile2weight, profile, gnm2weight[org])
    else:
        missing.append(org)

print "Missing from weights:"
for gnm in set(missing):
    print gnm
print "Finished scanning"

with open('/Users/hudaiber/data/CDD/profile2weight.tab', 'w') as outf:
    outf.write("#Profile\tweight\tcount\n")
    for k, v in profile2weight.items():
        outf.write("%s\t%f\t%s\n" % (k, v, profile2count[k]))

print "Done"

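# Column layout assumed for all_Prok1402.ccp.csv, inferred from the indexing
# above rather than from a documented schema: terms[1] is the genome/organism
# identifier (it must match the keys of map_genome2weight) and terms[6] is the
# CDD profile identifier; the remaining fields are ignored.
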
def generate_community_reports(nodes_pool, reports_dir, locus2weight, file2locus, profile2def, feature_profiles_file=None):

    # if not feature_labels:
    #     local_features = True
    # else:
    #     local_features = False
    # thr_occ, thr_crisp, cluster_threshold = thresholds_pack

    summary_file = os.path.join(reports_dir, 'summary.xlsx')
    workbook = x.Workbook(summary_file)
    worksheet = workbook.add_worksheet()
    header_format = workbook.add_format()
    header_format.set_font_size(12)
    header_format.set_bold()
    header_format.set_align('center')
    # worksheet.set_column(4, 5, 50)
    worksheet.write_row(0, 0, ["File name", "Size", "Effective size", "Genes"], header_format)

    print "Generating report files"

    ind = 1
    for nodes in nodes_pool:

        loci_size = len([node for node in nodes if node.type == 2])
        loci_esize = sum(node.weight for node in nodes if node.type == 2)
        # if loci_esize < 5:
        #     continue
        loci = [file2locus[node.file_name] for node in nodes if node.type == 2]

        xls_file_name = os.path.join(reports_dir, '%d.xlsx' % ind)
        loci_file_name = os.path.join(reports_dir, '%d.tab' % ind)

        with open(loci_file_name, 'w') as outf:
            loci_files = ",".join(os.path.basename(locus.file_name) for locus in loci)
            outf.write(loci_files + "\n")

        gene2cnt = {}
        profile2cnt = {}
        for locus in loci:
            weight = locus2weight[os.path.basename(locus.file_name)]
            for gene_name in locus.gene_names:
                t.update_dictionary(gene2cnt, gene_name, weight)
            for cl in locus.clusters:
                t.update_dictionary(profile2cnt, cl, weight)

        sorted_gene2count = sorted(gene2cnt.items(), key=lambda x: x[1], reverse=True)
        gene_counts = ";".join(["%s:%.2f" % (gene_name, count)
                                for (gene_name, count) in sorted_gene2count[:10]])

        worksheet.write_row(ind + 1, 0, ['%d.xlsx' % ind, loci_size, loci_esize, gene_counts])

        args = {}
        args['xls_file_name'] = xls_file_name
        args['loci'] = loci
        args['profile_code2def'] = profile2def

        if not feature_profiles_file:
            args['feature_labels'] = [k for k, v in profile2cnt.items() if v >= loci_esize / 2]
        else:
            args['feature_labels'] = [l.strip() for l in open(feature_profiles_file)]

        try:
            r.write_to_xls_loci_plain(args)
        except:
            sys.exit()

        ind += 1

    # Close the workbook so xlsxwriter actually flushes summary.xlsx to disk;
    # the original never closed it.
    workbook.close()

def dull_gene_name():

    cas_gene_names = [l.strip() for l in
                      open(os.path.join(gv.project_data_path, 'cas1402/all_gene_names.txt'))]
    gene_name2gids = {gene: set() for gene in cas_gene_names}

    with open(os.path.join(gv.project_data_path, 'cas1402/cas1402.arrisl.lst')) as inf:
        for in_line in inf:
            if in_line.startswith("==="):
                continue
            parts = in_line.strip().split('\t')
            if len(parts) < 9:
                continue
            _gene = parts[8]
            if _gene in cas_gene_names:
                gene_name2gids[_gene].update([parts[0]])

    cdd_gid2profiles = t.map_gid2cdd()
    cas_gene2profile = {gene: {} for gene in cas_gene_names}

    for _cas_gene in cas_gene_names:
        for _gid in gene_name2gids[_cas_gene]:
            if _gid not in cdd_gid2profiles:
                # t.update_dictionary(cas_gene2profile[_cas_gene], "NA", 1)
                continue
            for _profile in cdd_gid2profiles[_gid].split():
                t.update_dictionary(cas_gene2profile[_cas_gene], _profile, 1)

    work_dir = os.path.join(gv.project_data_path, 'cas1402/crispricity/')

    with open(os.path.join(work_dir, 'gene_name2profiles.txt'), 'w') as outf:
        for _gene_name in cas_gene_names:
            for _profile in cas_gene2profile[_gene_name]:
                outf.write("%s\t%s\t%d\n" % (_gene_name, _profile, cas_gene2profile[_gene_name][_profile]))

    cas_related_profiles = set([_profile for _gene in cas_gene_names
                                for _profile in cas_gene2profile[_gene].keys()])

    cr_occurrence = []
    cr_crispricity = []
    ncr_occurrence = []
    ncr_crispricity = []

    for l in open(os.path.join(work_dir, 'crispricity.tab')).readlines()[1:]:
        if not l:
            continue
        parts = l.split('\t')
        if parts[0] in cas_related_profiles:
            cr_occurrence.append(parts[1])
            cr_crispricity.append(parts[2])
        else:
            ncr_occurrence.append(parts[1])
            ncr_crispricity.append(parts[2])

    cr_occurrence = np.asarray(cr_occurrence, dtype=np.float)
    cr_occurrence = np.log(cr_occurrence)
    # dtype=np.float added to the crispricity arrays as well: the file stores
    # the values as text, and scatter needs numbers.
    cr_crispricity = np.asarray(cr_crispricity, dtype=np.float)
    ncr_occurrence = np.asarray(ncr_occurrence, dtype=np.float)
    ncr_occurrence = np.log(ncr_occurrence)
    ncr_crispricity = np.asarray(ncr_crispricity, dtype=np.float)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(cr_occurrence, cr_crispricity, color='r', s=1, label="Cas related")
    ax.scatter(ncr_occurrence, ncr_crispricity, color='b', s=1, label="Not Cas related")
    ax.axvline(1.6, color='g', linewidth=0.5)
    ax.axhline(0.5, color='g', linewidth=0.5)
    plt.xlabel("Effective occurrence in CRISPR loci (log)")
    plt.ylabel("Crispricity")
    plt.legend(loc="upper left", fontsize=7)
    plt.savefig(os.path.join(work_dir, 'crispricity_log.png'))

def sub_classify_by_scores_cas4(M, threshold, loci, inf_default=50):

    if not (M == np.transpose(M)).all():
        M += np.transpose(M)

    M = np.negative(np.log(M))
    np.fill_diagonal(M, 0)
    inf_idx = np.isinf(M)
    M[inf_idx] = inf_default

    M_array = ssd.squareform(M)
    Z = linkage(M_array, method='average')
    root = to_tree(Z)

    leaf_names = [os.path.basename(l.file_name) for l in loci]
    newick = tree_to_newick(root, "", root.dist, leaf_names)

    fname = gv.project_data_path + '/cas4/tmp.nw'
    outf = open(fname, 'w')
    outf.write(newick)
    outf.close()

    proc = sp.Popen(['tree_listnodes', '-o=4', fname], stdout=sp.PIPE, stderr=open(os.devnull, 'wb'))

    locus2weight = {}
    for line in proc.stdout:
        terms = line.strip().split()
        if len(terms) == 2:
            locus2weight[terms[0]] = float(terms[1])

    root = clone_graph(root)
    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}
    leaf_ids = leaves_list(Z)

    cnt = 0
    i = 0
    total_count = 1
    pool = []

    while True:
        cur_node = id2node[leaf_ids[i]]
        parent_dist = cur_node.parent.dist
        while parent_dist < threshold:
            cur_node = cur_node.parent
            parent_dist = cur_node.parent.dist
        cur_leaf_ids = get_leaves(cur_node)
        pool.append([id for id in cur_leaf_ids])
        total_count += cur_node.count
        i += len(cur_leaf_ids)
        if i >= len(leaf_ids):
            break
        cnt += 1

    to_collapse = [l for l in pool if len(l) > 1]
    singles = [l[0] for l in pool if len(l) == 1]
    to_collapse = sorted(to_collapse, key=lambda x: len(x), reverse=True)

    gene2cnt = {}
    for locus in loci:
        try:
            weight = locus2weight[os.path.basename(locus.file_name)]
        except KeyError:
            print "Skipping:", os.path.basename(locus.file_name)
            continue
        for gene_name in locus.gene_names:
            t.update_dictionary(gene2cnt, gene_name, weight)
            # t.update_dictionary(gene2cnt, gene_name, 1)

    return singles, to_collapse, gene2cnt

def classify_by_scores_cas1402(M, threshold, loci):

    M_array = ssd.squareform(M)
    Z = linkage(M_array, method='average')
    root = to_tree(Z)
    root = clone_graph(root)

    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}
    leaf_ids = leaves_list(Z)

    cnt = 0
    i = 0
    total_count = 1
    pool = []

    while True:
        cur_node = id2node[leaf_ids[i]]
        parent_dist = cur_node.parent.dist
        while parent_dist < threshold:
            cur_node = cur_node.parent
            parent_dist = cur_node.parent.dist
        cur_leaf_ids = get_leaves(cur_node)
        pool.append([id for id in cur_leaf_ids])
        total_count += cur_node.count
        i += len(cur_leaf_ids)
        if i >= len(leaf_ids) - 1:
            break
        cnt += 1

    to_collapse = [l for l in pool if len(l) > 1]
    singles = [l[0] for l in pool if len(l) == 1]
    to_collapse = sorted(to_collapse, key=lambda x: len(x), reverse=True)

    sum_errors = []
    entropies = []
    weights = []
    to_collapse_retval = []
    cluster_ind = 0

    for cluster in to_collapse:
        cluster_ind += 1
        type2cnt = {}
        type2wgt = {}
        cluster_files = [loci[id].file_name.split('/')[-1] for id in cluster]
        cluster_weight = 0

        for _f in cluster_files:
            file_weight = gnm2weight[file2org[_f]]
            cluster_weight += file_weight
            if _f not in file2crispr_type:
                t.update_dictionary(type2cnt, "NA", 1)
                t.update_dictionary(type2wgt, "NA", file_weight)
                continue
            for _type in file2crispr_type[_f]:
                t.update_dictionary(type2cnt, _type, 1)
                t.update_dictionary(type2wgt, _type, file_weight)

        _weights = np.array(type2wgt.values(), dtype=np.float)
        sum_errors.append(np.sum(np.square(_weights - np.mean(_weights))))

        _weights /= np.sum(_weights)
        entropy = -1 * np.sum(_weights * np.log(_weights))
        entropies.append(entropy)
        weights.append(cluster_weight)

        to_collapse_retval.append((cluster, type2cnt, type2wgt, entropy))

    # Weight the per-cluster error terms the same way as the entropies.
    # (The original collapsed sum_errors to a scalar with np.average first,
    # which made the subsequent weighting a no-op.)
    sum_errors = np.array(sum_errors)
    entropies = np.array(entropies)
    weights = np.array(weights)

    average_entropy = np.sum(entropies * weights) / np.sum(weights)
    sum_errors = np.sum(sum_errors * weights) / np.sum(weights)

    return singles, to_collapse_retval, sum_errors, average_entropy

def classify_by_scores_cas4(M, threshold, loci, inf_default=50, locus2weight=None):

    if not (M == np.transpose(M)).all():
        M += np.transpose(M)

    M = np.negative(np.log(M))
    np.fill_diagonal(M, 0)
    inf_idx = np.isinf(M)
    M[inf_idx] = inf_default

    M_array = ssd.squareform(M)
    Z = linkage(M_array, method='average')
    root = to_tree(Z)

    leaf_names = [os.path.basename(l.file_name) for l in loci]
    newick = tree_to_newick(root, "", root.dist, leaf_names)

    fname = gv.project_data_path + '/cas4/tmp.nw'
    outf = open(fname, 'w')
    outf.write(newick)
    outf.close()

    # Only derive the weights from the tree when the caller did not supply
    # them; the original overwrote the locus2weight argument unconditionally.
    if locus2weight is None:
        proc = sp.Popen(['tree_listnodes', '-o=4', fname], stdout=sp.PIPE, stderr=open(os.devnull, 'wb'))
        locus2weight = {}
        for line in proc.stdout:
            terms = line.strip().split()
            if len(terms) == 2:
                locus2weight[terms[0]] = float(terms[1])

    root = clone_graph(root)
    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}
    leaf_ids = leaves_list(Z)

    cnt = 0
    i = 0
    total_count = 1
    pool = []

    while True:
        cur_node = id2node[leaf_ids[i]]
        parent_dist = cur_node.parent.dist
        while parent_dist < threshold:
            cur_node = cur_node.parent
            parent_dist = cur_node.parent.dist
        cur_leaf_ids = get_leaves(cur_node)
        pool.append([id for id in cur_leaf_ids])
        total_count += cur_node.count
        i += len(cur_leaf_ids)
        # if i >= len(leaf_ids) - 1:
        if i >= len(leaf_ids):
            break
        cnt += 1

    to_collapse = [l for l in pool if len(l) > 1]
    singles = [l[0] for l in pool if len(l) == 1]
    to_collapse = sorted(to_collapse, key=lambda x: len(x), reverse=True)

    entropies = []
    to_collapse_retval = []
    cluster_ind = 0

    for cluster in to_collapse:
        cluster_ind += 1
        type2cnt = {}
        gene2cnt = {}

        for pos in cluster:
            t.update_dictionary(type2cnt, loci[pos].crispr_type, 1.0)
            _fname = os.path.basename(loci[pos].file_name)
            _weight = locus2weight[_fname] if _fname in locus2weight else 1
            for _gene_name in loci[pos].gene_names:
                t.update_dictionary(gene2cnt, _gene_name, _weight)

        _values = np.array(type2cnt.values(), dtype=np.float)
        _values /= np.sum(_values)
        entropy = -1 * np.sum(_values * np.log(_values))
        entropies.append(entropy)

        to_collapse_retval.append((cluster, type2cnt, entropy, gene2cnt))

    entropies = np.array(entropies)
    average_entropy = np.average(entropies)

    return singles, to_collapse_retval, average_entropy

def classify_by_scores(M, threshold, loci):

    # main linkage structure for upgma
    # print "Building linkage"
    M_array = ssd.squareform(M)
    Z = linkage(M_array, method='average')
    # Z = np.load(linkage_file).items()[0][1]

    # print "plotting dendogram"
    # plot_dendrogram(Z, report_path)

    root = to_tree(Z)
    root = clone_graph(root)

    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}
    leaf_ids = leaves_list(Z)

    cnt = 0
    i = 0
    total_count = 1
    pool = []

    # print "Starting merging"
    while True:
        cur_node = id2node[leaf_ids[i]]
        parent_dist = cur_node.parent.dist
        while parent_dist < threshold:
            cur_node = cur_node.parent
            parent_dist = cur_node.parent.dist
        cur_leaf_ids = get_leaves(cur_node)
        pool.append([id for id in cur_leaf_ids])
        total_count += cur_node.count
        i += len(cur_leaf_ids)
        if i >= len(leaf_ids) - 1:
            break
        cnt += 1

    to_collapse = [l for l in pool if len(l) > 1]
    singles = [l[0] for l in pool if len(l) == 1]
    to_collapse = sorted(to_collapse,
                         key=lambda x: sum(gnm2weight[loci[i].organism] for i in x),
                         reverse=True)

    sum_errors = []
    entropies = []
    weights = []
    to_collapse_retval = []
    cluster_ind = 0

    for cluster in to_collapse:
        cluster_ind += 1
        type2cnt = {}
        type2wgt = {}
        cluster_files = [loci[id].file_name.split('/')[-1] for id in cluster]
        cluster_weight = 0

        for _f in cluster_files:
            file_weight = gnm2weight[file2org[_f]]
            cluster_weight += file_weight
            if _f not in file2crispr_type:
                t.update_dictionary(type2cnt, "NA", 1)
                t.update_dictionary(type2wgt, "NA", file_weight)
                continue
            for _type in file2crispr_type[_f]:
                t.update_dictionary(type2cnt, _type, 1)
                t.update_dictionary(type2wgt, _type, file_weight)

        _weights = np.array(type2wgt.values(), dtype=np.float)
        sum_errors.append(np.sum(np.square(_weights - np.mean(_weights))))

        _weights /= np.sum(_weights)
        entropy = -1 * np.sum(_weights * np.log(_weights))
        entropies.append(entropy)
        weights.append(cluster_weight)

        to_collapse_retval.append((cluster, type2cnt, type2wgt, entropy))

    # Weight the per-cluster error terms the same way as the entropies.
    # (The original collapsed sum_errors to a scalar with np.average first,
    # which made the subsequent weighting a no-op.)
    sum_errors = np.array(sum_errors)
    entropies = np.array(entropies)
    weights = np.array(weights)

    average_entropy = np.sum(entropies * weights) / np.sum(weights)
    sum_errors = np.sum(sum_errors * weights) / np.sum(weights)

    return singles, to_collapse_retval, sum_errors, average_entropy

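# A compact restatement of the per-cluster entropy computed above, useful for
# sanity checks (a sketch; the type labels in the example are invented):

def cluster_entropy(type2wgt):
    # Shannon entropy (in nats) of the normalized per-type weight vector,
    # matching the formula inside classify_by_scores.
    w = np.array(type2wgt.values(), dtype=np.float)
    w /= np.sum(w)
    return -1 * np.sum(w * np.log(w))

# cluster_entropy({"I-E": 3.0, "I-F": 1.0}) gives about 0.562, while a cluster
# of a single CRISPR-Cas type gives 0.0; classify_by_scores then averages the
# per-cluster entropies weighted by cluster weight.
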
def generate_cluster_reports(cluster_packs, loci, reports_dir, feature_labels, method, thresholds_pack):

    if not feature_labels:
        local_features = True
    else:
        local_features = False

    thr_occ, thr_crisp, cluster_threshold = thresholds_pack

    summary_file = os.path.join(reports_dir,
                                'summary_%s_%d_%.2f_%.2f.xls' % (method, thr_occ, thr_crisp, cluster_threshold))
    workbook = x.Workbook(summary_file)
    worksheet = workbook.add_worksheet()
    header_format = workbook.add_format()
    header_format.set_font_size(12)
    header_format.set_bold()
    header_format.set_align('center')
    worksheet.set_column(4, 5, 50)
    worksheet.write_row(0, 0, ["File name", "Weight", "Loci", "Entropy", "systems weight", "systems count"],
                        header_format)

    print "Generating report files"

    ind = 0
    weights = np.zeros(len(cluster_packs))
    entropies = np.zeros(len(cluster_packs))

    for outer_i in range(len(cluster_packs)):
        (cluster, type2count, type2weight, entropy) = cluster_packs[outer_i]
        ind += 1

        cl_files = [os.path.basename(loci[i].file_name) for i in cluster]
        weight = sum([gnm2weight[file2org[file]] for file in cl_files])

        weights[outer_i] = weight
        entropies[outer_i] = entropy

        crispr_cas_types_count = " ; ".join([k + ":" + str(v) for (k, v) in
                                             sorted(type2count.items(), key=itemgetter(1), reverse=True)])
        crispr_cas_types_weight = " ; ".join([k + ":" + str(v) for (k, v) in
                                              sorted(type2weight.items(), key=itemgetter(1), reverse=True)])

        xls_file_name = os.path.join(reports_dir, '%d.xls' % ind)
        worksheet.write_row(ind + 1, 0, ['%d.xls' % ind, weight, len(cl_files), entropy,
                                         crispr_cas_types_weight, crispr_cas_types_count, " "])

        cl_loci = sorted([loci[_i] for _i in cluster], key=lambda x: gnm2weight[x.organism], reverse=True)

        local_profile2weight = {}
        for locus in cl_loci:
            for gene in locus.genes:
                for profile in gene.cogid.split(','):
                    t.update_dictionary(local_profile2weight, profile, gnm2weight[locus.organism])

        global_profile2weight = t.map_global_cdd_profile_count()

        if local_features:
            feature_labels = [k for k, v in local_profile2weight.items() if v / weight >= 0.5]

        params = {}
        params['xls_file_name'] = xls_file_name
        params['loci'] = cl_loci
        params['weight'] = weight
        params['profile_code2def'] = profile_code2def
        params['gnm2weight'] = gnm2weight
        params['feature_labels'] = feature_labels
        params['file2crispr_type'] = file2crispr_type
        params['local_profile2weight'] = local_profile2weight
        params['global_profile2weight'] = global_profile2weight

        r.write_to_xls_generic_loci(params)

    worksheet.write_row(ind + 3, 0, ['Average entropy'], header_format)
    worksheet.write_row(ind + 3, 1, [np.sum(weights * entropies) / np.sum(weights)])
    worksheet.write_row(ind + 4, 0, ['Exp(Average entropy)'], header_format)
    worksheet.write_row(ind + 4, 1, [np.exp(np.sum(weights * entropies) / np.sum(weights))])

    # Close the workbook so xlsxwriter flushes the summary file; the original
    # never closed it.
    workbook.close()

def classify_loci_hierarchically(loci, threshold=5, inf_default=50, dendrogram_file=None):

    M = scores.jackard_weighted_scores(loci)

    if not (M == np.transpose(M)).all():
        M += np.transpose(M)

    M = np.negative(np.log(M))
    np.fill_diagonal(M, 0)
    inf_idx = np.isinf(M)
    M[inf_idx] = inf_default

    M_array = ssd.squareform(M)
    Z = linkage(M_array, method='average')

    if dendrogram_file:
        plot_dendrogram(Z, dendrogram_file)
        return

    root = to_tree(Z)
    locus2weight = tree_to_weights(root, loci)
    root = clone_graph(root)

    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}
    leaf_ids = leaves_list(Z)

    cnt = 0
    i = 0
    total_count = 1
    pool = []

    while True:
        cur_node = id2node[leaf_ids[i]]
        parent_dist = cur_node.parent.dist
        while parent_dist < threshold:
            cur_node = cur_node.parent
            parent_dist = cur_node.parent.dist
        cur_leaf_ids = get_leaves(cur_node)
        pool.append([id for id in cur_leaf_ids])
        total_count += cur_node.count
        i += len(cur_leaf_ids)
        if i >= len(leaf_ids):
            break
        cnt += 1

    to_collapse = [l for l in pool if len(l) > 1]
    to_collapse = sorted(to_collapse, key=lambda x: len(x), reverse=True)
    singles = [l[0] for l in pool if len(l) == 1]

    entropies = []
    to_collapse_retval = []
    cluster_ind = 0

    for cluster in to_collapse:
        cluster_ind += 1
        type2cnt = {}
        gene2cnt = {}

        for pos in cluster:
            t.update_dictionary(type2cnt, loci[pos].crispr_type, 1.0)
            _fname = os.path.basename(loci[pos].file_name)
            _weight = locus2weight[_fname] if _fname in locus2weight else 1
            for _gene_name in loci[pos].gene_names:
                t.update_dictionary(gene2cnt, _gene_name, _weight)

        _values = np.array(type2cnt.values(), dtype=np.float)
        _values /= np.sum(_values)
        entropy = -1 * np.sum(_values * np.log(_values))
        entropies.append(entropy)

        to_collapse_retval.append((cluster, type2cnt, entropy, gene2cnt))

    entropies = np.array(entropies)
    average_entropy = np.average(entropies)

    return singles, to_collapse_retval, average_entropy

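# Expected call pattern for classify_loci_hierarchically, assuming loci objects
# expose file_name, crispr_type and gene_names as used above (a sketch, not
# part of the original module):
#
#     singles, clusters, avg_entropy = classify_loci_hierarchically(loci, threshold=5)
#     for cluster, type2cnt, entropy, gene2cnt in clusters:
#         ...
#
# Note that passing dendrogram_file short-circuits the function: it only plots
# the dendrogram and returns None, so the three-value unpacking above would
# fail in that mode.
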
def kplet_list_to_file_summaries(kplets, neighborhood_files_path, filter_weak_hits=True):

    file_summaries = list()
    organisms = set()

    _file2kplets = dict()
    _kplet2count_af = dict()  # kplet2count after filtration
    _kplet2count_bf = dict()  # kplet2count before filtration
    _profile2count_bf = dict()
    _profile2count_af = dict()
    _cas_type2count = dict()
    # filter_size = 5

    for kplet in kplets:
        for f in kplet.files:
            t.update_dictionary(_file2kplets, f, [kplet])

    initial_length = len(_file2kplets)

    for f in _file2kplets.keys():
        [t.update_dictionary(_kplet2count_bf, kplet.id, 1) for kplet in _file2kplets[f]]
    del f

    # if filter_weak_hits:
    #     _file2kplets = {k: v for (k, v) in _file2kplets.items() if len(v) > filter_size}

    if len(_file2kplets) < 2:
        return None

    _file2genes = {f: dt.get_pty_file_generic(os.path.join(neighborhood_files_path, f))
                   for f in _file2kplets.keys()}

    _files = set(_file2kplets.keys())

    for _f in _files:
        _genes = _file2genes[_f]
        _src = _genes[0].src
        _org = _genes[0].organism
        organisms.update([_org])

        _nfs = NeighborhoodFileSummary(_f, _file2kplets[_f], _genes, _org, _src, _org2weight[_org])

        for _gene in _genes:
            if _gene.gid in _gi2castype:
                _nfs.cas_type = ";".join(_gi2castype[_gene.gid])
                for _cas_type in _gi2castype[_gene.gid]:
                    t.update_dictionary(_cas_type2count, _cas_type, 1)
                break

        [t.update_dictionary(_kplet2count_af, kplet.id, 1) for kplet in _file2kplets[_f]]

        for _gene in _genes:
            for _c in _gene.cogid.split(','):
                t.update_dictionary(_profile2count_af, _c, 1)

        file_summaries.append(_nfs)

    # file_summaries = [fs for fs in file_summaries if len(fs.kplets) > 1]
    # _files = [fs.file_name for fs in file_summaries]
    # for _f in _files:
    #     [t.update_dictionary(_kplet2count_af, kplet.id, 1) for kplet in _file2kplets[_f]]
    #
    #     _gene_list = _file2genes[_f]
    #     for _gene in _gene_list:
    #         for _c in _gene.cogid.split(','):
    #             t.update_dictionary(_profile2count_af, _c, 1)

    file_summaries.sort(key=lambda x: x.org)

    retval = GenericMergingKplets2FsOutput()
    retval.file_summaries = file_summaries
    retval.organisms = organisms
    retval.profile2count_bf = _profile2count_bf
    retval.profile2count_af = _profile2count_af
    retval.kplet2count_af = _kplet2count_af
    retval.kplet2count_bf = _kplet2count_bf
    retval.weight = sum(fs.weight for fs in file_summaries)
    retval.cas_type2count = _cas_type2count

    return retval

def generate_jw_cluster_reports(cluster_packs, loci, reports_dir, threshold):

    # if not feature_labels:
    #     local_features = True
    # else:
    #     local_features = False
    # thr_occ, thr_crisp, cluster_threshold = thresholds_pack

    summary_file = os.path.join(reports_dir, 'summary_jw_%.2f.xls' % threshold)
    workbook = x.Workbook(summary_file)
    worksheet = workbook.add_worksheet()
    header_format = workbook.add_format()
    header_format.set_font_size(12)
    header_format.set_bold()
    header_format.set_align('center')
    worksheet.set_column(4, 5, 50)
    worksheet.write_row(0, 0, ["File name", "Weight", "Loci", "Entropy", "systems weight", "systems count"],
                        header_format)

    print "Generating report files"

    ind = 0
    weights = np.zeros(len(cluster_packs))
    entropies = np.zeros(len(cluster_packs))

    for outer_i in range(len(cluster_packs)):
        (cluster, type2count, type2weight, entropy) = cluster_packs[outer_i]
        ind += 1

        cl_files = [os.path.basename(loci[i].file_name) for i in cluster]
        weight = sum([gnm2weight[file2org[file]] for file in cl_files])

        weights[outer_i] = weight
        entropies[outer_i] = entropy

        crispr_cas_types_count = " ; ".join([k + ":" + str(v) for (k, v) in
                                             sorted(type2count.items(), key=itemgetter(1), reverse=True)])
        crispr_cas_types_weight = " ; ".join([k + ":" + str(v) for (k, v) in
                                              sorted(type2weight.items(), key=itemgetter(1), reverse=True)])

        xls_file_name = os.path.join(reports_dir, '%d.xls' % ind)
        worksheet.write_row(ind + 1, 0, ['%d.xls' % ind, weight, len(cl_files), entropy,
                                         crispr_cas_types_weight, crispr_cas_types_count, " "])

        cl_loci = sorted([loci[_i] for _i in cluster], key=lambda x: gnm2weight[x.organism], reverse=True)

        local_profile2weight = {}
        for locus in cl_loci:
            for gene in locus.genes:
                for profile in gene.cogid.split(','):
                    t.update_dictionary(local_profile2weight, profile, gnm2weight[locus.organism])

        global_profile2weight = t.map_global_cdd_profile_count()

        # if local_features:
        #     feature_labels = [k for k, v in local_profile2weight.items() if v / weight >= 0.5]

        params = {}
        params['xls_file_name'] = xls_file_name
        params['loci'] = cl_loci
        params['weight'] = weight
        params['profile_code2def'] = profile_code2def
        params['gnm2weight'] = gnm2weight
        # params['feature_labels'] = feature_labels
        params['feature_labels'] = []
        params['file2crispr_type'] = file2crispr_type
        params['local_profile2weight'] = local_profile2weight
        params['global_profile2weight'] = global_profile2weight

        r.write_to_xls_generic_loci(params)

    worksheet.write_row(ind + 3, 0, ['Average entropy'], header_format)
    worksheet.write_row(ind + 3, 1, [np.sum(weights * entropies) / np.sum(weights)])
    worksheet.write_row(ind + 4, 0, ['Exp(Average entropy)'], header_format)
    worksheet.write_row(ind + 4, 1, [np.exp(np.sum(weights * entropies) / np.sum(weights))])

    # Close the workbook so xlsxwriter flushes the summary file; the original
    # never closed it.
    workbook.close()

def kplet_list_to_file_summaries(kplets, neighborhood_files_path, filter_weak_hits=True, dataset=None):

    file_summaries = list()
    organisms = set()
    _crispr_type2files = dict()

    _file2kplets = dict()
    _kplet2count_af = dict()  # kplet2count after filtration
    _kplet2count_bf = dict()  # kplet2count before filtration
    _profile2count_bf = dict()
    _profile2count_af = dict()

    filter_size = 5
    singletons = get_singleton_loci(dataset)
    clusters = get_clustered_loci(dataset)

    for kplet in kplets:
        for f in kplet.files:
            t.update_dictionary(_file2kplets, f, [kplet])

    initial_length = len(_file2kplets)

    for f in _file2kplets.keys():
        [t.update_dictionary(_kplet2count_bf, kplet.id, 1) for kplet in _file2kplets[f]]
    del f

    kplet_ids = [k.id for k in kplets]

    if filter_weak_hits:
        _file2kplets = {k: v for (k, v) in _file2kplets.items() if len(v) > filter_size}

    if len(_file2kplets) < 2:
        return None

    _file2genes = {f: dt.get_wgs_file(os.path.join(neighborhood_files_path, f))
                   for f in _file2kplets.keys()}
    _files = set(_file2kplets.keys())

    for _gene_list in _file2genes.values():
        for _gene in _gene_list:
            for _c in _gene.cogid.split(','):
                t.update_dictionary(_profile2count_bf, _c, 1)
    del _gene_list, _gene, _c

    while _files:
        _f = _files.pop()

        if _f in singletons:
            _genes = _file2genes[_f]
            _src = _genes[0].src
            _org = _genes[0].organism
            _crispr_type = _genes[0].crispr_type
            t.update_dictionary_set(_crispr_type2files, _crispr_type, _f)
            file_summaries.append(WGSNeighborhoodFileSummary(_f, _file2kplets[_f], _genes, _org, _src, 'singleton'))
            organisms.update(set([_org]))
        else:
            _cluster = None
            for cl in clusters:
                if _f in cl.files:
                    _cluster = cl
                    break
            if not _cluster:
                continue
            del cl

            _cl_files = _cluster.files.intersection(_files)
            _representative = _f
            del _f

            for _cl_file in _cl_files:
                if len(_file2genes[_cl_file]) > len(_file2genes[_representative]):
                    _representative = _cl_file

            _genes = _file2genes[_representative]
            _src = _genes[0].src
            _org = _genes[0].organism
            _crispr_type = _genes[0].crispr_type
            t.update_dictionary_set(_crispr_type2files, _crispr_type, _representative)

            _file_summary = WGSNeighborhoodFileSummary(_representative, _file2kplets[_representative],
                                                       _genes, _org, _src, _cluster)
            _file_summary.cluster_local_count = len(_cl_files) + 1
            file_summaries.append(_file_summary)
            organisms.update(set([_org]))

            _files = _files.difference(_cl_files)

    file_summaries = [fs for fs in file_summaries if len(fs.kplets) > 1]

    _files = [fs.file_name for fs in file_summaries]
    for _f in _files:
        [t.update_dictionary(_kplet2count_af, kplet.id, 1) for kplet in _file2kplets[_f]]

        _gene_list = _file2genes[_f]
        for _gene in _gene_list:
            for _c in _gene.cogid.split(','):
                t.update_dictionary(_profile2count_af, _c, 1)

    file_summaries.sort(key=lambda x: x.org)

    retval = CrisprMergingKplets2FsOutput()
    retval.file_summaries = file_summaries
    retval.organisms = organisms
    retval.crispr_type2files = _crispr_type2files
    retval.kplet2count_af = _kplet2count_af
    retval.kplet2count_bf = _kplet2count_bf
    retval.initial_length = initial_length
    retval.kplets = kplets
    retval.profile2count_bf = _profile2count_bf
    retval.profile2count_af = _profile2count_af

    return retval

def tree_leaves():

    work_dir = os.path.join(gv.project_data_path, 'UvrD/prok1603')
    tree_dir = os.path.join(work_dir, 'clust_tree/')
    files_dir = os.path.join(work_dir, 'merged_files')

    profile2gene = t.map_cdd_profile2gene_name()

    gi2org = {l.split()[0]: l.rstrip().split()[1] for l in open(work_dir + '/gi_org.txt')}
    gi2weight = {l.split()[0].split('.')[0]: float(l.split()[1]) for l in open(work_dir + '/prok1603_weights.txt')}

    cl2size, cl2gis, cl2weight = {}, {}, {}

    for l in open(tree_dir + 'uvrd.cls'):
        terms = l.rstrip().split()
        cl2size[terms[1]] = terms[0]
        cl2gis[terms[1]] = terms[2:]
        cl2weight[terms[1]] = sum([gi2weight[gi] if gi in gi2weight else 0 for gi in terms[2:]])

    tree_string = open(tree_dir + 'uvrd.up.tre').readline()

    leave_file_names = [os.path.basename(l) for l in glob.glob(tree_dir + '*.sr')]

    for leave_file_name in leave_file_names:

        leave_file_gis = [l.split()[0] for l in open(os.path.join(tree_dir, leave_file_name))]

        system_gene_pool = []
        sgp_count = {}

        for gi in leave_file_gis:
            system_genes = get_system_genes(gi, files_dir, profile2gene)
            if not system_genes:
                continue
            system_gene_pool.append(system_genes)
            t.update_dictionary(sgp_count, system_genes, gi2weight[gi])

        sorted_sgp_count = sorted(sgp_count.items(), key=lambda x: x[1], reverse=True)

        leaf_name = os.path.splitext(leave_file_name)[0]
        gene_names = sorted_sgp_count[0][0] if sorted_sgp_count else ""
        representative = gi2org[random.choice(leave_file_gis)]

        total_weight = sum([v for k, v in sgp_count.items()])
        leaf_prefix = "%s|" % int(total_weight) if total_weight else "-"

        has_genes = False
        # for _gene_name in ["Cas4"]:
        for _gene_name in ["Cas4", "UvrA", "UvrB", "UvrC", "SbcS", "SbcD"]:
            _weight = sum([v for k, v in sgp_count.items() if _gene_name.lower() in k.lower()])
            if _weight:
                leaf_prefix += "%s=%d|" % (_gene_name, _weight)
                has_genes = True

        if has_genes:
            new_leaf_name = leaf_prefix + representative + "|" + leaf_name.split('.')[1]
        else:
            new_leaf_name = leaf_prefix + leaf_name.split('.')[1]

        # new_leaf_name = "cas4=%s/%s|%s|%s" % (int(cas4_weight) if total_weight else "-",
        #                                       int(total_weight) if total_weight else "-",
        #                                       representative,
        #                                       leaf_name.split('.')[1])

        print leaf_name, new_leaf_name
        tree_string = tree_string.replace(leaf_name + ":", new_leaf_name + ":")

        # new_file_name = os.path.join(tree_dir, os.path.splitext(leave_file_name)[0] + '.def')
        # with open(new_file_name, 'w') as new_file:
        #
        #     for k, v in sorted_sgp_count:
        #         new_file.write("#%s\t%f\n" % (k, v))
        #
        #     new_file.write("\n")
        #
        #     [new_file.write("%s\t%s\n" % (gi, gi2org[gi])) for gi in leave_file_gis]

    with open(tree_dir + 'uvrd.up_all_genes.tree', 'w') as outf:
        outf.write(tree_string)

def extract_all_duplets_from_prok1402():
    """
    Extraction of adjacent duplets is done by recording them in the
    dictionary pair2weight.
    The overall abundance of profiles is also needed; it is recorded in
    profile2weight.
    """
    pty_path = "/panfs/pan1/patternquest/data/Pty/genomes/"
    work_dir = os.path.join(data_path, 'prok1402/graph/graph_files/')

    print("Loading dictionaries")
    gi2profiles = t.map_gi2profiles()
    genome2weight = t.map_genome2weight()

    pair2weight = defaultdict(float)
    pair2count = defaultdict(int)
    profile2weight = defaultdict(float)

    print("Reading Prok1402")
    for root, dirs, files in os.walk(pty_path):
        for f in files:
            if not f.endswith(".pty"):
                continue
            file_name = os.path.join(root, f)
            genome = os.path.basename(root)

            genes = t.parse_pty_file(file_name)
            for gene in genes:
                gene.profiles = gi2profiles[gene.gid]
                for profile in gene.profiles:
                    t.update_dictionary(profile2weight, profile, genome2weight[genome])

            previous_profiles = genes[0].profiles

            # Within-gene domain duplets of the first gene.
            if len(previous_profiles) > 1:
                domain_duplets = list(combinations(previous_profiles, 2))
                for duplet in domain_duplets:
                    [kplet_1, kplet_2] = sorted(duplet)
                    key = "%s-%s" % (kplet_1, kplet_2)
                    t.update_dictionary(pair2weight, key, genome2weight[genome])
                    t.update_dictionary(pair2count, key, 1)

            for gene in genes[1:]:
                cur_profiles = gene.profiles

                if not previous_profiles:
                    previous_profiles = cur_profiles
                    continue

                # Within-gene domain duplets of the current gene. (The original
                # combined previous_profiles here while testing cur_profiles;
                # pairing the current gene's own domains is presumably what was
                # intended, mirroring the first-gene block above.)
                if len(cur_profiles) > 1:
                    domain_duplets = list(combinations(cur_profiles, 2))
                    for duplet in domain_duplets:
                        [kplet_1, kplet_2] = sorted(duplet)
                        key = "%s-%s" % (kplet_1, kplet_2)
                        t.update_dictionary(pair2weight, key, genome2weight[genome])
                        t.update_dictionary(pair2count, key, 1)

                # Duplets between the profiles of adjacent genes.
                adjacent_duplets = list(product(previous_profiles, cur_profiles))
                for duplet in adjacent_duplets:
                    [kplet_1, kplet_2] = sorted(duplet)
                    key = "%s-%s" % (kplet_1, kplet_2)
                    t.update_dictionary(pair2weight, key, genome2weight[genome])
                    t.update_dictionary(pair2count, key, 1)

                previous_profiles = cur_profiles

    print("Writing to files")
    with open(os.path.join(work_dir, "prok1402_adj_duplets_weights.txt"), "w") as outf:
        for (key, weight) in sorted(pair2weight.items(), key=lambda x: x[1], reverse=True):
            [kplet_1, kplet_2] = key.split("-")
            outf.write("%s\t%s\t%f\n" % (kplet_1, kplet_2, weight))

    with open(os.path.join(work_dir, "prok1402_profile_abundance.txt"), "w") as outf:
        for (profile, weight) in sorted(profile2weight.items(), key=lambda x: x[1], reverse=True):
            outf.write("%s\t%f\n" % (profile, weight))

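# The duplet keys above are order-normalized so that (A, B) and (B, A)
# accumulate into the same entry; for example, for the (hypothetical) pair
# ("pfam01930", "COG1518"), sorted(duplet) yields ["COG1518", "pfam01930"] and
# the key becomes "COG1518-pfam01930". Caveat: the writer splits keys on "-",
# so this scheme assumes profile identifiers themselves contain no hyphen.
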