def start(self):
    """Run the configured number of extraction rounds on a process pool.

    For every round, a target subsample and a baseline subsample are drawn
    and a domain-extraction job is submitted for each; results arrive via
    ``self._callback``.  Blocks until all jobs finish (``shutdown``), or
    shuts the pool down early on Ctrl-C.
    """
    status_message('Multiple alignment running','please wait')
    args = self.args
    executor = concurrent.futures.ProcessPoolExecutor(args['n'])
    try:
        for run_num in range(self.num_runs):
            if self.debug >= 1:
                print("DomainExtractionDriver: Run " + str(run_num + 1) +
                      " of " + str(self.num_runs))
            # Draw this round's target subsample and queue its extraction.
            targets_sub = generate_sequence_set(
                self.targets, args['subsample'], args['random_subset'],
                args['random_order'], args['subsample_start'])
            job = executor.submit(
                _extract_domains, targets_sub,
                is_baseline=False, input_state=self.input_state,
                threshold=args['thresh'], thresh_type=args['type'],
                max_domain_size=args['win'], min_domain_size=args['minwin'])
            job.add_done_callback(self._callback)
            # Optionally forbid baseline sequences already drawn as targets.
            disallowed = targets_sub if args['disjoint_subset'] else []
            baselines_sub = generate_sequence_set(
                self.baselines, args['subsample'], args['random_subset'],
                args['random_order'], args['subsample_start'],
                disallowed=disallowed)
            job = executor.submit(
                _extract_domains, baselines_sub,
                is_baseline=True, input_state=self.input_state,
                threshold=args['thresh'], thresh_type=args['type'],
                max_domain_size=args['win'], min_domain_size=args['minwin'])
            job.add_done_callback(self._callback)
        # Wait for every submitted job to finish before reporting success.
        executor.shutdown()
        #self.close_output_buffers()
        status_message('Analysis complete', 'OK')
    except KeyboardInterrupt:
        executor.shutdown()
def generate_consensus_newick(msa, consensus, filename):
    """Render *consensus* as a Newick tree string and write it to *filename*.

    Each consensus character becomes a labelled node of the form
    ``CHAR-<conservation>:<variability>`` (both rounded to 3 decimals), with
    tree topology driven by the character itself: ``A`` opens a new subtree,
    ``C`` adds a sibling leaf, and ``T`` closes pending subtrees off the
    stack.  The string is assembled right-to-left by prepending each node.

    Parameters:
        msa       -- multiple sequence alignment object passed to
                     ``calculate_conservation``.
        consensus -- consensus object; its ``.consensus`` string is walked.
        filename  -- path the finished Newick string is written to.

    Returns the Newick string (also written to *filename*).
    """
    newick_string = ""
    cons_str = consensus.consensus
    conservation, variability = calculate_conservation(msa, consensus)
    # Stack of (closed_flag, pending_open_paren_count) pairs tracking
    # subtrees that still need their '(' prefixes emitted.
    a_stack = [(1, 0)]
    print(cons_str)  # debug: echo the consensus being converted
    for position in range(len(cons_str)):
        if cons_str[position] == "A":
            # Start a new, not-yet-closed subtree.
            a_stack.append((0, 0))
            node_string = ")"
        elif cons_str[position] == "C":
            # Add a sibling; defer its '(' by bumping the pending count.
            node_string = ",:1)"
            a_val = a_stack.pop()
            a_stack.append((a_val[0], a_val[1] + 1))
        elif cons_str[position] == "T":
            # Close subtrees: unwind the stack, emitting deferred parens,
            # until an open (flag 0) entry absorbs this terminator.
            node_string = "(:1,:1)"
            not_done = 1
            while not_done and len(a_stack) > 0:
                a_val = a_stack.pop()
                if a_val[0] == 0:
                    node_string = "," + a_val[1] * "(" + node_string
                    a_stack.append((1, 1))
                    not_done = 0
                else:
                    node_string = a_val[1] * "(" + node_string
        # Prepend: the string is built from the tree's leaves back to its root.
        newick_string = (
            node_string
            + cons_str[position]
            + "-"
            + str(round(conservation[position], 3))
            + ":"
            + str(round(variability[position], 3))
            + newick_string
        )
    newick_string += ";"
    status_message("Generating newick string", "OK")
    # Bug fix: write to the *filename* argument; previously this opened the
    # global args["newick"], leaving the parameter unused.  Use a context
    # manager so the handle is closed even if the write raises.
    with open(filename, "w") as handle:
        handle.write(newick_string)
    return newick_string
def extract_and_run_stats(args):
    """Compute per-position conservation and variability for a build's
    first consensus, and optionally write a Newick tree file.

    Walks the composite (gapped) consensus: at gap positions, variability
    contributions (1 / ungapped length of each sequence that has a residue
    there) accumulate; at consensus-character positions, the accumulated
    sum is banked into ``variability_sums`` and the position's conservation
    (fraction of sequences with a residue) is recorded.

    Parameters:
        args -- dict with key 'build' (build XML path) and, optionally,
                'newick' (output path for the Newick string).
    """
    msa = XMLBuildReader(args['build']).parse()
    consensus = msa.consensuses[0]
    alignments = msa.alignments
    # A sequence's per-residue variability weight is the inverse of its
    # ungapped length, so short sequences weigh each residue more.
    contributions = [1 / len(alignment.replace('-', '')) for alignment in alignments]
    current_var_sum = 0
    variability_sums = []
    conservation = []
    num_alignments = len(alignments)
    # Walk each position of the composite (gapped) consensus.
    for position in range(len(msa.composite)):
        if consensus.seq[position] == '-':
            # Gap in the consensus: accumulate variability from every
            # alignment that has a residue (non-gap) at this column.
            for alignment, contribution in zip(alignments, contributions):
                if alignment[position] != '-':
                    current_var_sum += contribution
        else:
            # Consensus character: bank the variability accumulated since the
            # previous consensus character.  (Index within the ungapped
            # consensus == current length of variability_sums.)
            variability_sums.append(current_var_sum)
            current_var_sum = 0
            # Conservation = fraction of sequences with a residue here.
            conservation_sum = sum(
                1 for alignment in alignments if alignment[position] != '-')
            conservation.append(conservation_sum / num_alignments)
    if 'newick' in args:
        status_message('Generating newick string', 'OK')
        # NOTE(review): this call passes (consensus, conservation,
        # variability_sums), which does not match the 3-arg
        # (msa, consensus, filename) variant defined elsewhere in this file —
        # confirm which generate_consensus_newick is actually in scope.
        newick_string = generate_consensus_newick(consensus, conservation,
                                                  variability_sums)
        with open(args['newick'], 'w') as handle:
            handle.write(newick_string)
msa.add_consensus(threshold, msa.build_consensus(float(threshold))) print("Consensus at threshold " + threshold + ": " + msa.get_consensus(threshold).consensus) if args["newick"]: if args["threshold"]: generate_consensus_newick(msa, msa.get_consensus(float(args["threshold"])), args["newick"]) else: generate_consensus_newick(msa, list(msa.consensuses.values())[0], args["newick"]) if args["scores"]: generate_score_and_conserved_chars_file(msa, args["scores"]) if args["k"]: threshold, k = msa.find_conservation_boundary() print( "At threshold %.2f, average conservation and proportion of conserved characters are both %.2f%%" % (threshold, k * 100) ) if args["newbuild"]: # consensus_object = msa.build_consensus(args['threshold'],args['type']) # Write MSA and consensus to file consensus_fact = ConsensusFilterFactory(msa, msa.get_consensus(threshold)) consensus_fact.write(fname=args["newbuild"]) status_message("Consensus statistics computation complete ", "OK") except (IOError, KeyboardInterrupt, IndexError) as e: print(str(e))
elif cons_str[position] == 'T': node_string = '(:1,:1)' not_done = 1 while not_done and len(a_stack) > 0: a_val = a_stack.pop() if a_val[0] == 0: node_string = ','+a_val[1]*'('+node_string a_stack.append((1,1)) not_done = 0 else: node_string = a_val[1]*'('+node_string newick_string = node_string + cons_str[position] + '-' + str(round(conservation[position],3)) + ':' + str(round(variability_sums[position],3)) + newick_string newick_string += ';' return newick_string if __name__ == '__main__': try: args = ConsensusStatsCommandParser().parse_args() # ConsensusStatsArgumentValidator(args) # test all arguments are correct print('capellini - v.' + str(version) + '\n=============') extract_and_run_stats(args) status_message('Consensus statistics computation complete ', 'OK') except (IOError, KeyboardInterrupt, IndexError) as e: print(str(e))
if args['mode'] == 'single': print('penne - v.' + str(version) + '\n=============') cons_query = XMLBuildReader(args['query']).parse().consensuses[0] cons_baseline = XMLBuildReader(args['baseline']).parse().consensuses[0] # next, yield domains for both query and baseline datasets. dsb_query = DomainSetBuilder(cons_query, args['win'], args['max_g'], args['strip'], is_enum=args['enumerate']) dsb_baseline = DomainSetBuilder(cons_baseline, args['win'], args['max_g'], args['strip'], is_enum=args['enumerate']) # dsb_baseline = DomainSetBuilder(win=args['win'], max_gap=args['max_g'], # is_enum=args['enumerate'], consensus=cons_baseline, # is_strip=args['strip']) domains_query = dsb_query.build() # build abundance counts domains_baseline = dsb_baseline.build() status_message('Identifying domains', 'OK') db = DomainAbundanceBuilder(query=domains_query, baseline=domains_baseline) domains = db.build() # build contingency matrices dpp = DomainPrettyPrinter(domains = domains, pval = args['p'], out=args['o']) dpp.display() # pretty-print domains status_message('Domain over-representation computation complete ', 'OK') else: args.update({'f':args['query'],'f2':args['baseline'],'a':None}) input_state = InputWrapperState(args) #input_state.assign_matrix() # parse in-built or custom matrix targets = input_state.parse_fasta(input_state.fname) baselines = input_state.parse_fasta(input_state.fname2) if not args['overlap']: target_names = list([target.name for target in targets]) baselines = list([baseline for baseline in baselines if baseline.name not in target_names])
cons_query = XMLBuildReader(args["query"]).parse().consensuses[0] cons_baseline = XMLBuildReader(args["baseline"]).parse().consensuses[0] # next, yield domains for both query and baseline datasets. dsb_query = DomainSetBuilder( cons_query, args["win"], args["max_g"], args["strip"], is_enum=args["enumerate"] ) dsb_baseline = DomainSetBuilder( cons_baseline, args["win"], args["max_g"], args["strip"], is_enum=args["enumerate"] ) # dsb_baseline = DomainSetBuilder(win=args['win'], max_gap=args['max_g'], # is_enum=args['enumerate'], consensus=cons_baseline, # is_strip=args['strip']) domains_query = dsb_query.build() # build abundance counts domains_baseline = dsb_baseline.build() status_message("Identifying domains", "OK") db = DomainAbundanceBuilder(query=domains_query, baseline=domains_baseline) domains = db.build() # build contingency matrices dpp = DomainPrettyPrinter(domains=domains, pval=args["p"], out=args["o"]) dpp.display() # pretty-print domains status_message("Domain over-representation computation complete ", "OK") else: args.update({"f": args["query"], "f2": args["baseline"], "a": None}) input_state = InputWrapperState(args) # input_state.assign_matrix() # parse in-built or custom matrix targets = input_state.parse_fasta(input_state.fname) baselines = input_state.parse_fasta(input_state.fname2) if not args["overlap"]: target_names = list([target.name for target in targets]) baselines = list([baseline for baseline in baselines if baseline.name not in target_names])