                                      args['strip'], is_enum=args['enumerate'])
            dsb_baseline = DomainSetBuilder(cons_baseline, args['win'], args['max_g'],
                                            args['strip'], is_enum=args['enumerate'])
            domains_query = dsb_query.build()  # build abundance counts
            domains_baseline = dsb_baseline.build()
            status_message('Identifying domains', 'OK')
            db = DomainAbundanceBuilder(query=domains_query, baseline=domains_baseline)
            domains = db.build()  # build contingency matrices
            dpp = DomainPrettyPrinter(domains=domains, pval=args['p'], out=args['o'])
            dpp.display()  # pretty-print domains
            status_message('Domain over-representation computation complete', 'OK')
        else:
            args.update({'f': args['query'], 'f2': args['baseline'], 'a': None})
            input_state = InputWrapperState(args)
            targets = input_state.parse_fasta(input_state.fname)
            baselines = input_state.parse_fasta(input_state.fname2)
            if not args['overlap']:
                # drop baseline sequences whose names also appear among the targets
                target_names = [target.name for target in targets]
                baselines = [baseline for baseline in baselines
                             if baseline.name not in target_names]
            extract_and_analyze_domains(targets, baselines, input_state)
            status_message('Domain analysis via multiple runs complete', 'OK')
    except (IOError, KeyboardInterrupt, IndexError) as e:
        print(str(e))
def cluster_domains(domain_occurrences, node_types, subsmat, args):
    '''
    Finds pairwise edit distances between domains and clusters them, one
    domain length at a time. Returns a dictionary keyed by domain length;
    each value maps a seed domain to its cluster (a list of domains).
    '''
    # Minimum length at which domains are clustered; shorter domains each
    # form their own singleton cluster.
    min_domain_size = 7  # maybe should be 8
    # NOTE: max_cluster_dist (the merge threshold used below) is expected to
    # be defined at module level, alongside the other globals used here.
    print('Clustering domains')

    # Set up pairwise-alignment arguments
    domain_args = {'f': None, 'f2': None, 'a': None, 'subsmat': subsmat,
                   'gap': -1, 'gapopen': 0, 'matrix': None, 'custom': None,
                   'o': None, 'n': args['n'], 'node_types': None}
    input_state = InputWrapperState(domain_args)
    input_state.subsmat = subsmat
    domains = list(domain_occurrences.keys())

    # Split domains up by length first, then cluster those that are long enough
    print('Domains: ' + str(domains))
    min_length = min(len(domain) for domain in domains)
    max_length = max(len(domain) for domain in domains)
    clusters_by_length = {}

    # Move short domains into their own singleton clusters
    for length in range(min_length, min_domain_size):
        domains_sub = [domain for domain in domains if len(domain) == length]
        clusters_by_length[length] = {domain: [domain] for domain in domains_sub}

    # Cluster domains that are long enough, one length class at a time
    for length in range(min_domain_size, max_length + 1):
        domains_sub = [domain for domain in domains if len(domain) == length]
        domains_ns = [NeuriteSequence(domain, domain) for domain in domains_sub]
        domain_id_map = {domains_sub[i]: i for i in range(len(domains_sub))}
        driver = PairwiseDriver(domains_ns, domains_ns, input_state,
                                store_pairwise=True, score_type='num_gaps')
        driver.start()
        distance_mat = driver.get_score_matrix()

        # Greedy UCLUST-style clustering over an abundance-sorted list:
        # - the first sequence seeds the first cluster; each later sequence is
        #   merged into the closest seed within max_cluster_dist, otherwise it
        #   becomes a new seed;
        # - domains shorter than min_domain_size were already placed in their
        #   own singleton clusters above;
        # - exact sub-/super-sequences of already-clustered domains are meant
        #   to be excluded (not implemented in this fragment).
        # Sort domains by occurrence frequency, breaking ties by length.
        domain_tuples = [(domain, domain_occurrences[domain]) for domain in domains_sub]
        sorted_tuples = sorted(domain_tuples,
                               key=lambda tup: (tup[1], len(tup[0])),
                               reverse=True)
        sorted_domains = [tup[0] for tup in sorted_tuples]
        if len(sorted_domains) > 0:
            clusters = {sorted_domains[0]: [sorted_domains[0]]}
            for domain in sorted_domains[1:]:
                domain_pos = domain_id_map[domain]
                closest_cluster = (None, float('inf'))
                for seed in clusters:
                    seed_pos = domain_id_map[seed]
                    dist = max(distance_mat[domain_pos][seed_pos],
                               distance_mat[seed_pos][domain_pos])
                    if dist <= max_cluster_dist and dist < closest_cluster[1]:
                        closest_cluster = (seed, dist)
                if closest_cluster[0] is None:
                    # Not close enough to any existing seed: start a new cluster
                    clusters[domain] = [domain]
                else:
                    # Otherwise merge it into the closest seed's cluster
                    clusters[closest_cluster[0]].append(domain)
            clusters_by_length[length] = clusters
    return clusters_by_length
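# Illustrative sketch (not part of the pipeline): the greedy, UCLUST-style
# assignment rule used in cluster_domains, reduced to plain lists and a toy
# distance function. The names below ('greedy_cluster', 'toy_distance') are
# hypothetical and exist only to demonstrate the seed/merge logic.
def greedy_cluster(items, occurrences, distance, max_dist):
    '''Visit items most-abundant first; merge each into the closest existing
    seed within max_dist, otherwise promote it to a new seed.'''
    ordered = sorted(items, key=lambda it: (occurrences[it], len(it)), reverse=True)
    clusters = {}
    for item in ordered:
        best = (None, float('inf'))
        for seed in clusters:
            d = distance(item, seed)
            if d <= max_dist and d < best[1]:
                best = (seed, d)
        if best[0] is None:
            clusters[item] = [item]  # no seed close enough: item becomes a seed
        else:
            clusters[best[0]].append(item)
    return clusters

# Example: with a Hamming distance over equal-length strings and max_dist=1,
# 'AAAA' seeds a cluster that absorbs 'AAAT', while 'GGGG' starts its own:
#   toy_distance = lambda a, b: sum(x != y for x, y in zip(a, b))
#   greedy_cluster(['AAAA', 'AAAT', 'GGGG'],
#                  {'AAAA': 5, 'AAAT': 3, 'GGGG': 2}, toy_distance, 1)
#   -> {'AAAA': ['AAAA', 'AAAT'], 'GGGG': ['GGGG']}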
    consensus_object = msa_driver.build_consensus(args['thresh'], args['type'])
    # Write MSA and consensus to file
    consensus_fact = ConsensusFilterFactory(msa_driver, consensus_object)
    consensus_fact.write(fname=args['build'])


if __name__ == '__main__':
    try:
        args = AlignmentCommandParser().parse_args()
        AlignmentArgumentValidator(args)  # check that all arguments are valid
        print('spaghetti - v.' + str(version) + '\n=================')
        input_state = InputWrapperState(args)
        input_state.assign_matrix()  # parse in-built or custom matrix
        targets = input_state.parse_fasta(input_state.fname)  # next, parse FASTA file
        if input_state.fname2 is None:
            queries = targets
        else:
            queries = input_state.parse_fasta(input_state.fname2)
        if args['mode'] == 'local':
            run_local(targets, queries, input_state)
        elif args['mode'] == 'msa':
            # start multiple-sequence alignment (MSA)
            run_msa(queries, input_state)
    except (IOError, KeyboardInterrupt, IndexError) as e:
        print(str(e) + '\n')
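# Hypothetical invocations (a sketch only: flag names are inferred from the
# args keys used above and are not confirmed by AlignmentCommandParser):
#   python spaghetti.py --mode local -f targets.fasta -f2 queries.fasta
#   python spaghetti.py --mode msa -f queries.fasta --build consensus_out.txt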