def predict(input_sequence):
    """Run the serendip prediction pipeline for ``input_sequence``.

    Results are cached in ``settings["RESULTS_DIR"]`` in a file named after
    ``get_sequence_hash(input_sequence)``.  A ``FileLock`` on
    ``<results file>.lock`` is held for the whole call; if the results file
    already exists it is parsed and returned without recomputation.

    Otherwise the following steps run inside a temporary directory, which is
    removed again before returning (also on failure):

    1. ``NETSURF_EXE`` is run on the input sequence.
    2. ``input.blastout`` in the temp directory (presumably left there by
       the netsurf run -- verify) is parsed with ``PSIBlastParser`` and the
       hits of the last round are fetched with ``FASTACMD_EXE``.
    3. Each hit sequence is submitted to ``netsurf_hit`` and the collected
       results are written to ``output_other.myrsa``.
    4. The input sequence is appended to the hits and ``MUSCLE_EXE`` aligns
       them.
    5. Per-position entropies of the alignment are written to
       ``output.entropy``.
    6. ``DYNAMINE_EXE`` is run once per sequence in the hits file.
    7. ``RSCRIPT_EXE`` runs ``RF_SCRIPT``; its ``input.out`` file is copied
       to the results cache.

    Finally ``yasara_scene`` is scheduled with the parsed data.

    Args:
        input_sequence: The sequence to predict on; written verbatim as the
            body of a FASTA record.

    Returns:
        Whatever ``parse_serendip_results`` returns for the contents of the
        results file.

    Raises:
        Exception: if the PSI-BLAST record has no rounds, if the fastacmd
            call exits non-zero, or if the prediction script did not
            produce ``input.out``.
    """
    # Avoid growing sys.path by one duplicate entry per call.
    serendip_dir = settings["SERENDIP_DIR"]
    if serendip_dir not in sys.path:
        sys.path.append(serendip_dir)
    from sequence.entropy.lib.seq_lib import FastaParser
    from Bio.Blast import NCBIStandalone

    sequence_hash = get_sequence_hash(input_sequence)
    results_path = os.path.join(settings["RESULTS_DIR"], sequence_hash)
    lock_path = results_path + ".lock"

    with FileLock(lock_path):
        # Cached result from an earlier run.
        if os.path.isfile(results_path):
            with open(results_path, 'r') as f:
                return parse_serendip_results(f.read())

        input_id = 'input'
        out_dir = tempfile.mkdtemp()
        try:
            out_file = os.path.join(out_dir, 'output.myrsa')
            input_fasta_path = os.path.join(out_dir, input_id + '.fa')

            # Netsurf
            with open(input_fasta_path, 'w') as f:
                f.write(">%s\n%s" % (input_id, input_sequence))
            cmd = [settings["NETSURF_EXE"], "-i", input_fasta_path,
                   "-d", settings["NR70_DB"], "-a", "-k",
                   "-T", out_dir, "-o", out_file]
            _log.info(cmd)
            subprocess.call(cmd)

            blast_parser = NCBIStandalone.PSIBlastParser()
            blastout_path = os.path.join(out_dir, input_id + '.blastout')
            with open(blastout_path, 'r') as f:
                blast_record = blast_parser.parse(f)
            # ``rounds`` is a list: comparing it with ``<= 0`` never fired,
            # so an empty list surfaced as an IndexError on ``rounds[-1]``.
            if not blast_record.rounds:
                raise Exception("no netsurf hits")

            # Drop the first character of each title (presumably the
            # leading '>' -- verify against the blast output).
            hit_titles = [alignment.title[1:]
                          for alignment in blast_record.rounds[-1].alignments]
            id_path = os.path.join(out_dir, input_id + '.blastout_id')
            with open(id_path, 'w') as f:
                f.writelines(hit_title + '\n' for hit_title in hit_titles)

            blast_hits_path = os.path.join(out_dir, 'output_seqs.fa')
            cmd = [settings["FASTACMD_EXE"], "-d", settings["NR70_DB"],
                   '-i', id_path, '-o', blast_hits_path]
            _log.info(cmd)
            if subprocess.call(cmd) != 0:
                raise Exception("No blast hits for input sequence")

            # We have blast hits: netsurf on hits.  All subtasks are
            # submitted before any result is collected.
            netsurf_append_path = os.path.join(out_dir, 'output_other.myrsa')
            with open(blast_hits_path, 'r') as hits_file:
                subtasks = [netsurf_hit.delay(str(seq))
                            for seq in FastaParser(hits_file)]
            with open(netsurf_append_path, 'w') as f:
                for subtask in subtasks:
                    f.write(subtask.get())

            # Append input sequence to blast hits for alignment as input
            # for entropy and DynaMine
            with open(blast_hits_path, 'a') as f:
                f.write('>input\n' + input_sequence + '\n')

            # Make alignment using muscle
            alignment_path = os.path.join(out_dir, "output.ali")
            cmd = [settings["MUSCLE_EXE"], "-in", blast_hits_path,
                   "-out", alignment_path]
            _log.info(cmd)
            subprocess.call(cmd)

            # Alignment position entropies.  The alignment file stays open
            # until everything derived from the parser has been written, in
            # case the parser reads lazily.
            entropy_path = os.path.join(out_dir, "output.entropy")
            with open(alignment_path, 'r') as alignment_file:
                hit_sequences = FastaParser(alignment_file)
                hit_sequences.frequencies().normalize()
                hit_entropies = hit_sequences.frequencies().entropies()
                with open(entropy_path, 'w') as f:
                    for n, entropy in enumerate(hit_entropies, 1):
                        f.write(str(n) + ' ' + str(entropy) + '\n')

            # Run dynamine on each sequence
            dynamine_fasta_path = os.path.join(out_dir, "output_seq.fasta")
            dynamine_env = dict(
                os.environ,
                PYTHONPATH="/usr/local/lib/python2.7/site-packages/")
            with open(blast_hits_path, 'r') as hits_file:
                for seq in FastaParser(hits_file):
                    # We use this file name, to avoid confusing the rest of
                    # the script:
                    with open(dynamine_fasta_path, 'w') as f:
                        f.write(str(seq))
                    cmd = [settings["DYNAMINE_EXE"], "-a", dynamine_fasta_path]
                    _log.info(cmd)
                    subprocess.call(cmd, env=dynamine_env)

            # Run prediction script
            result_testing_path = os.path.join(
                settings["SERENDIP_DIR"], "sequence", "Result_Testing")
            combined_path = os.path.join(
                settings["SERENDIP_DIR"], "sequence", "five_models_combined")
            dynamine_path = os.path.splitext(dynamine_fasta_path)[0]
            cmd = [settings["RSCRIPT_EXE"], settings["RF_SCRIPT"], input_id,
                   alignment_path, entropy_path, netsurf_append_path,
                   dynamine_path, out_file, result_testing_path,
                   combined_path]
            _log.info(cmd)
            # Run the script *in* out_dir via ``cwd`` instead of os.chdir():
            # chdir changed the working directory of the whole process and
            # left it pointing at a directory that is deleted below.
            subprocess.call(cmd, cwd=out_dir)

            output_result_path = os.path.join(out_dir, input_id + '.out')
            if not os.path.isfile(output_result_path):
                raise Exception("No ouput generated")

            shutil.copyfile(output_result_path, results_path)
            with open(results_path, 'r') as f:
                data = parse_serendip_results(f.read())

            # Start making the scene:
            yasara_scene.delay(data)

            return data
        finally:
            if os.path.isdir(out_dir):
                shutil.rmtree(out_dir)
for test in all_tests: print "*" * 50, "TESTING %s" % test datafile = os.path.join("Blast", test) scanner.feed(open(datafile), ParserSupport.AbstractConsumer()) for test in detailed_tests: print "*" * 50, "TESTING %s" % test datafile = os.path.join("Blast", test) scanner.feed(open(datafile), ParserSupport.TaggingConsumer()) ### BlastParser print "Running tests on BlastParser" parser = NCBIStandalone.BlastParser() pb_parser = NCBIStandalone.PSIBlastParser() for test in all_tests: print "*" * 50, "TESTING %s" % test datafile = os.path.join("Blast", test) try: # First, try parsing it with the normal parser. rec = parser.parse(open(datafile)) except ValueError, x: # If it complains that the input is psiblast data, then # parse it with the psiblast parser. if string.find(str(x), 'PSI-BLAST data') >= 0: rec = pb_parser.parse(open(datafile)) else: raise ### Blast Record
def ReadBlast(self, file, OUT, iszipped=0, is_psiblast=None):
    """Parse a BLAST / PSI-BLAST report and write the accepted hits to OUT.

    For every record in the report, the first HSP of each alignment is
    checked against ``self.HSP_max_evalue``, ``self.HSP_minimal_positives``
    and ``self.HSP_minimal_coverage``.  Each hit that passes is written to
    ``OUT`` as one tab-separated line::

        query <TAB> hit id <TAB> similarity % <TAB> coverage % <TAB> score <TAB> e-value <TAB>

    and, when ``self.cB`` is truthy, also passed to
    ``self.createblastDict``.  Parsing stops at the first record whose query
    is shorter than ``self.min_size``.

    Side effects: sets ``self.selfhits``, ``self.parser``, ``self.iter``,
    ``self.blastDict``, ``self.query`` and ``self.length``.

    Args:
        file: Path of the BLAST report.
        OUT: Path of the output file (truncated on open).
        iszipped: Truthy to read ``file`` with gzip even if its name does
            not end in ``.gz``.
        is_psiblast: Truthy to parse with ``PSIBlastParser`` and use only
            the last round of each record.

    Returns:
        ``None`` on success, or the string ``'Error parsing <file>'`` when
        fetching the next record raised an exception.
    """
    output = open(OUT, "w")
    handle = None
    try:
        self.selfhits = []
        if is_psiblast:
            sys.stderr.write('Parsing PSI-Blast\n')
            self.parser = NCBIStandalone.PSIBlastParser()
        else:
            self.parser = NCBIStandalone.BlastParser()

        if file[-3:] == '.gz' or iszipped:
            handle = gzip.open(file)
        else:
            handle = open(file)

        self.iter = NCBIStandalone.Iterator(handle=handle, parser=self.parser)
        self.blastDict = {}

        while True:
            try:
                rec = self.iter.next()
            except Exception:
                # Report the parser failure to the caller as a string; the
                # ``finally`` below still closes both files on this path.
                sys.stderr.write(
                    'Can\'t iterate on blast records anymore. Abort.\n')
                import traceback
                traceback.print_exc()
                return 'Error parsing %s' % file
            if not rec:
                break

            self.query = rec.query.split(" ")[0]
            self.length = rec.query_letters
            if self.length < self.min_size:
                self.printer("Does not meet the minimum length " +
                             str(self.min_size))
                break

            if is_psiblast:
                rec = rec.rounds[-1]

            # each alignment is one potential hit
            for alignment in rec.alignments:
                hsp = alignment.hsps[0]  # no multiple hsps

                m = re.search(r"sp\|(.+?)\|(.+?) (.+)?", alignment.title)
                if m:  # pyphynr blast result
                    hit_sp_ac = m.group(1)
                    hit_sp_id = m.group(2)
                    hit_sp_note = m.group(3)
                elif alignment.title[0] == '>':
                    # result from additional blast databases
                    hit_sp_ac = None
                    hit_sp_id = alignment.title[1:].split()[0]
                    hit_sp_note = None
                else:
                    hit_sp_ac = None
                    hit_sp_id = None
                    hit_sp_note = None
                self.printer(hit_sp_id)

                similarity = hsp.positives[0] / float(hsp.positives[1]) * 100
                if not float(hsp.expect) <= float(self.HSP_max_evalue):
                    continue
                if not float(similarity) >= int(self.HSP_minimal_positives):
                    continue
                coverage = hsp.positives[1] / float(self.length) * 100
                if not float(coverage) >= int(self.HSP_minimal_coverage):
                    continue

                hitlist = [hit_sp_id, similarity, coverage,
                           hsp.score, hsp.expect]
                if self.cB:
                    # The original passed the undefined name ``query`` here
                    # (NameError); the query id is stored on ``self.query``.
                    self.createblastDict(self.query, hitlist)
                output.write("%s\t" % (self.query))
                for element in hitlist:
                    output.write("%s\t" % element)
                output.write("\n")
    finally:
        if handle is not None:
            handle.close()
        output.close()
    return None