def extract_and_analyze_domains(targets, baselines, input_state):
    """Extract domains from target and baseline sequences, score each
    domain (or domain cluster) for over-representation in the targets,
    and write or print the ranked results.

    Pipeline:
      1. Run DomainExtractionDriver to collect the domain set, per-set
         occurrence counts, and consensus sequences.
      2. If args['cluster'] is set, cluster the domains and aggregate
         counts per cluster; otherwise treat every domain as its own
         singleton cluster.
      3. Build a contingency matrix per cluster and compute a
         hypergeometric p-value.
      4. Emit results (CSV + consensus file when args['o'] is given,
         stdout otherwise).

    Args:
        targets: parsed target sequence records.
        baselines: parsed baseline sequence records.
        input_state: InputWrapperState providing args, node_types and
            the substitution matrix.

    Returns:
        None. Output goes to args['o'] (plus a ".consensuses.txt"
        sibling file) or to stdout.
    """
    args = input_state.get_args()
    node_types = input_state.node_types
    subsmat = input_state.subsmat
    num_runs = args['runs']
    output_file = args['o']

    # The driver populates its result attributes as a side effect of start().
    ded = DomainExtractionDriver(targets, baselines, node_types, subsmat,
                                 num_runs, input_state)
    ded.set_debug(1)
    ded.start()
    domain_set = ded.domain_set
    target_domains = ded.target_domains          # domain -> count in targets
    baseline_domains = ded.baseline_domains      # domain -> count in baselines
    target_consensuses = ded.target_consensuses
    baseline_consensuses = ded.baseline_consensuses

    if args['cluster']:
        # Cluster similar domains; a cluster's count in each set is the sum
        # of its member domains' counts. Shape: {length: {seed: [domains]}}.
        combined_occurrences = merge_counts(target_domains, baseline_domains)
        domain_clusters = cluster_domains(combined_occurrences, node_types,
                                          subsmat, args)
        target_counts = {
            length: {seed: sum(target_domains[domain]
                               for domain in domain_clusters[length][seed]
                               if domain in target_domains)
                     for seed in domain_clusters[length]}
            for length in domain_clusters}
        baseline_counts = {
            length: {seed: sum(baseline_domains[domain]
                               for domain in domain_clusters[length][seed]
                               if domain in baseline_domains)
                     for seed in domain_clusters[length]}
            for length in domain_clusters}
    else:
        # No clustering: every domain becomes its own singleton cluster,
        # keyed by length and then by the domain string itself so the
        # domain_clusters[length][seed] lookups below work unchanged.
        domain_clusters = {}
        target_counts = {}
        baseline_counts = {}
        min_length = min(len(domain) for domain in domain_set)
        max_length = max(len(domain) for domain in domain_set)
        # BUGFIX: the upper bound was an undefined name ('min_domain_size');
        # the unused max_length variable shows the intended full range.
        for length in range(min_length, max_length + 1):
            same_len = [d for d in domain_set if len(d) == length]
            # BUGFIX: dict keyed by domain (the original built a plain list,
            # which breaks the string-keyed lookups in the output section).
            domain_clusters[length] = {d: [d] for d in same_len}
            # .get(..., 0): a domain may occur in only one of the two sets.
            target_counts[length] = {d: target_domains.get(d, 0)
                                     for d in same_len}
            baseline_counts[length] = {d: baseline_domains.get(d, 0)
                                       for d in same_len}

    # One contingency matrix per cluster; matrices carry the cluster seed
    # as their name and expose the hypergeometric p-value.
    domain_matrices = []
    for length in domain_clusters:
        dab = DomainAbundanceBuilder(target_counts[length],
                                     baseline_counts[length])
        domain_matrices.extend(dab.build())

    # Collect (seed, p-value, target count, baseline count) and rank by
    # ascending p-value so the most over-represented clusters come first.
    domain_tuples = []
    for domain_mat in domain_matrices:
        cluster_name = domain_mat.name
        target_count = target_counts[len(cluster_name)][cluster_name]
        baseline_count = baseline_counts[len(cluster_name)][cluster_name]
        pval = domain_mat.get_hypergeometric_pval()
        domain_tuples.append((cluster_name, pval, target_count, baseline_count))
    sorted_tuples = sorted(domain_tuples, key=lambda tup: tup[1])

    if output_file is not None:
        # CSV of ranked clusters, plus a companion consensus-sequence file.
        # 'with' guarantees the handles are flushed and closed.
        with open(output_file, 'w') as handle:
            handle.write("Cluster Seed,pVal,Target Counts,Baseline Counts,Domains\n")
            for tup in sorted_tuples:
                handle.write(tup[0] + ',' + str(round(tup[1], 6)) + ',' +
                             str(tup[2]) + ',' + str(tup[3]) + ', "' +
                             str(domain_clusters[len(tup[0])][tup[0]]) + '"\n')
        with open(output_file + ".consensuses.txt", 'w') as handle:
            handle.write('TARGETS:\n')
            for consensus in target_consensuses:
                handle.write(consensus.replace('-', '') + '\n')
            handle.write('\nBASELINES:\n')
            for consensus in baseline_consensuses:
                handle.write(consensus.replace('-', '') + '\n')
    else:
        # No output file: print the same content to stdout.
        for tup in sorted_tuples:
            tuple_str = (tup[0] + ',' + str(round(tup[1], 6)) + ',' +
                         str(tup[2]) + ',' + str(tup[3]))
            print(tuple_str)
            print(domain_clusters[len(tup[0])][tup[0]])
            print()
        print('TARGET CONSENSUSES:\n')
        for consensus in target_consensuses:
            print(consensus.replace('-', ''))
        print('\nBASELINES:')
        for consensus in baseline_consensuses:
            print(consensus.replace('-', ''))
print('penne - v.' + str(version) + '\n=============') cons_query = XMLBuildReader(args['query']).parse().consensuses[0] cons_baseline = XMLBuildReader(args['baseline']).parse().consensuses[0] # next, yield domains for both query and baseline datasets. dsb_query = DomainSetBuilder(cons_query, args['win'], args['max_g'], args['strip'], is_enum=args['enumerate']) dsb_baseline = DomainSetBuilder(cons_baseline, args['win'], args['max_g'], args['strip'], is_enum=args['enumerate']) # dsb_baseline = DomainSetBuilder(win=args['win'], max_gap=args['max_g'], # is_enum=args['enumerate'], consensus=cons_baseline, # is_strip=args['strip']) domains_query = dsb_query.build() # build abundance counts domains_baseline = dsb_baseline.build() status_message('Identifying domains', 'OK') db = DomainAbundanceBuilder(query=domains_query, baseline=domains_baseline) domains = db.build() # build contingency matrices dpp = DomainPrettyPrinter(domains = domains, pval = args['p'], out=args['o']) dpp.display() # pretty-print domains status_message('Domain over-representation computation complete ', 'OK') else: args.update({'f':args['query'],'f2':args['baseline'],'a':None}) input_state = InputWrapperState(args) #input_state.assign_matrix() # parse in-built or custom matrix targets = input_state.parse_fasta(input_state.fname) baselines = input_state.parse_fasta(input_state.fname2) if not args['overlap']: target_names = list([target.name for target in targets]) baselines = list([baseline for baseline in baselines if baseline.name not in target_names])