def write_in_fastafile(filename, res, min_length=50, p_value_threshold=0.05):
    """Write the long, significant sequences of ``res`` to a FASTA file.

    Each kept sequence gets a label of the form ``"<sign> p-val:<pval>"``,
    where ``<sign>`` is ``+`` when the score is positive and ``-`` otherwise.

    Args:
        filename: Destination path handed to ``FileUtility.create_fasta_file``.
        res: Iterable of 5-item records ``(seq, score, pval, _, _)``; the last
            two items are ignored.
        min_length: A sequence is kept only if it is strictly longer than this.
        p_value_threshold: A sequence is kept only if its p-value is strictly
            below this value. Defaults to 0.05, the previously hard-coded
            cut-off.
    """
    corpus = []
    labels = []
    for seq, score, pval, _, _ in res:
        if len(seq) > min_length and pval < p_value_threshold:
            sign = '+' if score > 0 else '-'
            corpus.append(seq)
            labels.append(' '.join([sign, 'p-val:' + str(pval)]))
    FileUtility.create_fasta_file(filename, corpus, labels)
def align_markers(self, p_value_threshold):
    """BLAST every significant marker and store the taxonomy-sorted results.

    Iterates over ``self.seq_IDS`` (``idx -> (seq, description)``). The
    p-value is parsed from the text following the first ``:`` in
    ``description``. For each marker whose p-value is at most
    ``p_value_threshold`` the sequence is written to ``temp.fasta``, aligned
    with blastn (XML output in ``temp.xml``) and the hits are reduced to a
    taxonomy label. Both temp files live in the current working directory and
    are overwritten for every marker.

    Side effects:
        Sets ``self.aligned_markers`` to a list of
        ``(seq, taxonomy_label, pval)`` tuples sorted by ``taxonomy_label``,
        sets ``self.min_p_value`` and calls
        ``self.update_matrix_by_markers()``.

    Args:
        p_value_threshold: Inclusive upper bound on the marker p-value.
    """
    final_results = []
    for idx, (seq, description) in tqdm.tqdm(self.seq_IDS.items()):
        pval = float(description.split(':')[1])
        if pval <= p_value_threshold:
            FileUtility.create_fasta_file('temp.fasta', [seq], ['temp'])
            blastn_cline = NcbiblastnCommandline(
                query='temp.fasta',
                db="/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/EZ/raw/eztaxon_qiime_full.fasta",
                evalue=0.001,
                outfmt=5,
                out="temp.xml")
            blastn_cline()

            have_baseline = False
            score = -1
            alignment_length = -1
            results = []
            # NCBIXML.parse reads lazily from the handle, so every record
            # must be consumed before the ``with`` block closes the file.
            with open("temp.xml", 'r') as xml_handle:
                for blast_record in NCBIXML.parse(xml_handle):
                    for alignment in blast_record.alignments:
                        for hsp in alignment.hsps:
                            # The very first HSP seen defines the baseline
                            # that all kept hits have to match or exceed.
                            if not have_baseline:
                                score = hsp.score
                                alignment_length = hsp.align_length
                                have_baseline = True
                            if (hsp.score >= score
                                    and hsp.align_length >= alignment_length
                                    and 'Eukarya' not in
                                    self.ez_taxa_dict[alignment.hit_id]):
                                results.append(
                                    (self.ez_taxa_dict[alignment.hit_id],
                                     hsp.expect))

            # Without usable hits, or without a certain taxonomic level,
            # the marker is labelled as novel.
            res = self.lowest_certain_level(results) if results else None
            if res:
                label = self.refine_ez_taxonomy(res) + idx[-1]
            else:
                label = 'ZZZNOVEL' + idx[-1]
            final_results.append((seq, label, pval))

    # sorted markers by the taxonomy information of the last certain level
    self.aligned_markers = sorted(final_results,
                                  key=operator.itemgetter(1),
                                  reverse=False)
    self.min_p_value = p_value_threshold
    self.update_matrix_by_markers()
def _perform_alignment(self, idx__seq_discrpt):
    """BLAST a single marker and reduce its hits to a taxonomy label.

    The sequence is written to ``../tmp/temp<idx>.fasta``, aligned with
    blastn into ``../tmp/temp<idx>.xml`` and the XML is parsed for hits.
    Because the temp file names include ``idx``, different markers use
    different files.

    Args:
        idx__seq_discrpt: A pair ``(idx, (seq, description))``. The p-value
            is parsed from the text following the first ``:`` in
            ``description``.

    Returns:
        ``(seq, taxonomy_label, pval)`` when the p-value is at most
        ``self.p_value_threshold``; otherwise an empty list.
    """
    idx, (seq, description) = idx__seq_discrpt
    pval = float(description.split(':')[1])
    final_results = []
    if pval <= self.p_value_threshold:
        temp_prefix = '../tmp/temp' + str(idx)
        fasta_path = temp_prefix + '.fasta'
        xml_path = temp_prefix + '.xml'

        FileUtility.create_fasta_file(fasta_path, [seq], ['temp'])
        blastn_cline = NcbiblastnCommandline(
            query=fasta_path,
            db="/mounts/data/proj/asgari/dissertation/git_repos/16S_datasets/EZ/raw/eztaxon_qiime_full.fasta",
            evalue=0.001,
            outfmt=5,
            out=xml_path)
        blastn_cline()

        have_baseline = False
        score = -1
        alignment_length = -1
        results = []
        # NCBIXML.parse reads lazily from the handle, so every record must
        # be consumed before the ``with`` block closes the file.
        with open(xml_path, 'r') as xml_handle:
            for blast_record in NCBIXML.parse(xml_handle):
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        # The very first HSP seen defines the baseline that
                        # all kept hits have to match or exceed.
                        if not have_baseline:
                            score = hsp.score
                            alignment_length = hsp.align_length
                            have_baseline = True
                        if (hsp.score >= score
                                and hsp.align_length >= alignment_length
                                and 'Eukarya' not in
                                self.ez_taxa_dict[alignment.hit_id]):
                            results.append(
                                (self.ez_taxa_dict[alignment.hit_id],
                                 hsp.expect))

        # Without usable hits, or without a certain taxonomic level, the
        # marker is labelled as novel.
        res = self.lowest_certain_level(results) if results else None
        if res:
            final_results = (seq, self.refine_ez_taxonomy(res) + idx[-1],
                             pval)
        else:
            final_results = (seq, 'ZZZNOVEL' + idx[-1], pval)
    return final_results