def run_hmmer(program, assertEqual): test_dir = path.join(settings.BASE_DIR, 'test_hmmer') if not path.exists(test_dir): mkdir(test_dir) chmod(test_dir, Perm.S_IRWXU | Perm.S_IRWXG | Perm.S_IRWXO) if program == 'phmmer': input_file_dir = path.join(settings.BASE_DIR, 'example', 'blastdb') query_filename = path.join(test_dir, 'Cimex_sample_pep_query.faa') copyfile(path.join(input_file_dir, 'Cimex_sample_pep_query.faa'), query_filename) else: # program == 'hmmersearch' input_file_dir = path.join(settings.BASE_DIR, 'example', 'hmmer') query_filename = path.join(test_dir, 'example.MSA') copyfile(path.join(input_file_dir, 'example.MSA'), query_filename) db_file = path.join(test_dir, 'AGLA_new_ids.faa') copyfile(path.join(settings.BASE_DIR, 'example', 'blastdb', 'AGLA_new_ids.faa'), db_file) chmod(query_filename, Perm.S_IRWXU | Perm.S_IRWXG | Perm.S_IRWXO) chmod(db_file, Perm.S_IRWXU | Perm.S_IRWXG | Perm.S_IRWXO) bin_name = get_bin_name() program_path = path.join(settings.BASE_DIR, 'hmmer', bin_name, 'bin') option_params = ['--incE', u'0.01', '--incdomE', u'0.03', '-E', u'0.01', '--domE', u'0.03'] db_list = [db_file] args = generate_hmmer_args(program, program_path, query_filename, option_params, db_list) chdir(test_dir) try: run_commands(args, assertEqual) finally: rmtree(test_dir)
def makeblastdb(self): if not os.path.isfile(self.fasta_file.path_full): return 1, 'FASTA file not found', '' bin_name = get_bin_name() makeblastdb_path = os.path.join(settings.BASE_DIR, 'blast', bin_name, 'makeblastdb') args = [ makeblastdb_path, '-in', self.fasta_file.path_full, '-dbtype', self.type.molecule_type, '-hash_index' ] # , '-parse_seqids' TODO: make option if self.title: args += ['-title', self.title] if self.organism.tax_id: args += ['-taxid', str(self.organism.tax_id)] p = Popen(args, stdout=PIPE, stderr=PIPE) output, error = p.communicate() return p.returncode, error, output
def test_clustalw(self): test_dir = path.join(settings.BASE_DIR, 'test_clustal') if not path.exists(test_dir): mkdir(test_dir) bin_name = get_bin_name() if bin_name == 'bin_win' or bin_name == 'bin_mac': return program_path = path.join(settings.BASE_DIR, 'clustal', bin_name, 'clustalw2') example_file_path = path.join(settings.BASE_DIR, 'example', 'blastdb', 'Cimex_sample_pep_query.faa') out_file_path = path.join(test_dir, 'test.out') args = [program_path, '-infile=' + example_file_path, '-OUTFILE=' + out_file_path, '-type=protein'] try: p = Popen(args, stdin=PIPE, stdout=PIPE) p.wait() self.assertEqual(p.returncode, 0) finally: rmtree(test_dir)
def test_clustalo(self): test_dir = path.join(settings.BASE_DIR, 'test_clustal') if not path.exists(test_dir): mkdir(test_dir) bin_name = get_bin_name() if bin_name == 'win32': return program_path = path.join(settings.BASE_DIR, 'clustal', bin_name, 'clustalo') example_file_path = path.join(settings.BASE_DIR, 'example', 'blastdb', 'Cimex_sample_pep_query.faa') out_file_path = path.join(test_dir, 'test.out') ph_file_path = path.join(test_dir, 'test.ph') args = [program_path, '--infile=' + example_file_path, '--outfile=' + out_file_path, '--guidetree-out=' + ph_file_path, '--full', '--full-iter', '--iterations=0', '--outfmt=clu', '--output-order=tree-order'] try: p = Popen(args, stdin=PIPE, stdout=PIPE) p.wait() self.assertEqual(p.returncode, 0) finally: rmtree(test_dir)
clustalo_path) clustalw_tar_path = join( clustal_bin_path, 'clustalw-2.1-linux-x86_64-libcppstatic.tar.gz') clustalw_path = join(clustal_bin_path, 'clustalw2') urllib.request.urlretrieve( 'http://www.clustal.org/download/current/clustalw-2.1-linux-x86_64-libcppstatic.tar.gz', clustalw_tar_path) print('Installing clustalw ...') tar = tarfile.open(clustalw_tar_path, 'r:gz') for member in tar.getmembers(): if member.isreg(): member.name = basename(member.name) tar.extract(member, clustal_bin_path) tar.close() remove(clustalw_tar_path) chmod(clustalo_path, Perm.S_IXUSR | Perm.S_IXGRP | Perm.S_IXOTH) chmod(clustalw_path, Perm.S_IXUSR | Perm.S_IXGRP | Perm.S_IXOTH) if __name__ == '__main__': bin_name = get_bin_name() print('Installing blast ...') install_blast(bin_name) print('Installing hmmer ...') install_hmmer(bin_name) print('Installing clustal ...') install_clustal(bin_name)
def create(request, iframe=False): if request.method == 'GET': blastdb_list = sorted([[db.type.dataset_type, db.type.get_molecule_type_display(), db.title, db.organism.display_name, db.description] for db in BlastDb.objects.select_related('organism').select_related('type').filter(is_shown=True) if db.db_ready()], key=lambda x: (x[3], x[1], x[0], x[2])) blastdb_type_counts = dict([(k.lower().replace(' ', '_'), len(list(g))) for k, g in groupby(sorted(blastdb_list, key=lambda x: x[0]), key=lambda x: x[0])]) return render(request, 'blast/main.html', { 'title': 'BLAST Query', 'blastdb_list': json.dumps(blastdb_list), 'blastdb_type_counts': blastdb_type_counts, 'iframe': iframe }) elif request.method == 'OPTIONS': return HttpResponse("OPTIONS METHOD NOT SUPPORTED", status=202) elif request.method == 'POST': # setup file paths task_id = uuid4().hex # TODO: Create from hash of input to check for duplicate inputs file_prefix = path.join(settings.MEDIA_ROOT, 'blast', 'task', task_id, task_id) query_filename = file_prefix + '.in' asn_filename = file_prefix + '.asn' if not path.exists(path.dirname(query_filename)): makedirs(path.dirname(query_filename)) chmod(path.dirname(query_filename), Perm.S_IRWXU | Perm.S_IRWXG | Perm.S_IRWXO) # ensure the standalone dequeuing process can open files in the directory bin_name = get_bin_name() # write query to file if 'query-file' in request.FILES: with open(query_filename, 'wb') as query_f: for chunk in request.FILES['query-file'].chunks(): query_f.write(chunk) elif 'query-sequence' in request.POST: with open(query_filename, 'wb') as query_f: query_text = [x.encode('ascii','ignore').strip() for x in request.POST['query-sequence'].split('\n')] query_f.write('\n'.join(query_text)) else: return render(request, 'blast/invalid_query.html', {'title': 'Invalid Query'}) if (path.getsize(query_filename) > int(settings.BLAST_QUERY_SIZE_MAX) * 1024): return render(request, 'blast/invalid_query.html', {'title': 'Your query size is ' + str(path.getsize(query_filename)) + ' bytes, but exceeds our query size limit of ' + str(settings.BLAST_QUERY_SIZE_MAX) + ' kbytes, Please try again with a smaller query size.',}) chmod(query_filename, Perm.S_IRWXU | Perm.S_IRWXG | Perm.S_IRWXO) # ensure the standalone dequeuing process can access the file # build blast command db_list = ' '.join([db.fasta_file.path_full for db in BlastDb.objects.filter(title__in=set(request.POST.getlist('db-name'))) if db.db_ready()]) if not db_list: return render(request, 'blast/invalid_query.html', {'title': 'Invalid Query',}) # check if program is in list for security if request.POST['program'] in ['blastn', 'tblastn', 'tblastx', 'blastp', 'blastx']: with open(query_filename, 'r') as f: qstr = f.read() if(qstr.count('>') > int(settings.BLAST_QUERY_MAX)): query_cnt = str(qstr.count('>')) remove(query_filename) return render(request, 'blast/invalid_query.html', {'title': 'Your search includes ' + query_cnt + ' sequences, but blast allows a maximum of ' + str(settings.BLAST_QUERY_MAX) + ' sequences per submission.', }) # generate customized_options input_opt = [] max_target_seqs = request.POST.get('max_target_seqs', 50) for blast_option in blast_customized_options[request.POST['program']]: if blast_option == 'low_complexity': if request.POST['program'] == 'blastn': input_opt.extend(['-dust', request.POST['low_complexity']]) else: input_opt.extend(['-seg', request.POST['low_complexity']]) else: input_opt.extend(['-'+blast_option, request.POST[blast_option]]) program_path = path.join(settings.BASE_DIR, 'blast', bin_name, request.POST['program']) num_threads = '4' if cpu_count() >= 4 else str(cpu_count()) args_list = [[program_path, '-query', query_filename, '-db', db_list, '-outfmt', '11', '-out', asn_filename, '-num_threads', num_threads]] args_list[0].extend(input_opt) # convert to multiple formats blast_formatter_path = path.join(settings.BASE_DIR, 'blast', bin_name, 'blast_formatter') for ext, outfmt in blast_info['ext'].items(): args = [blast_formatter_path, '-archive', asn_filename, '-outfmt', outfmt, '-out', file_prefix + ext] if ext == '.html': args.append('-html') if int(outfmt.split()[0]) > 4: args.extend(['-max_target_seqs', max_target_seqs]) else: args.extend(['-num_descriptions', max_target_seqs, '-num_alignments', max_target_seqs]) args_list.append(args) record = BlastQueryRecord() record.task_id = task_id if request.user.is_authenticated(): record.user = request.user record.save() # generate status.json for frontend status checking with open(query_filename, 'r') as f: # count number of query sequence by counting '>' qstr = f.read() seq_count = qstr.count('>') if (seq_count == 0): seq_count = 1 with open(path.join(path.dirname(file_prefix), 'status.json'), 'wb') as f: json.dump({'status': 'pending', 'seq_count': seq_count}, f) run_blast_task.delay(task_id, args_list, file_prefix, blast_info) # debug # run_blast_task.delay(task_id, args_list, file_prefix, blast_info).get() return redirect('blast:retrieve', task_id) else: raise Http404
def create(request): ''' Main page of Clustal * Max number of query sequences: 600 sequences ''' if request.method == 'GET': return render(request, 'clustal/main.html', { 'title': 'Clustal Query', }) elif request.method == 'POST': # setup file paths task_id = uuid4().hex task_dir = path.join(settings.MEDIA_ROOT, 'clustal', 'task', task_id) # file_prefix only for task... file_prefix = path.join(settings.MEDIA_ROOT, 'clustal', 'task', task_id, task_id) if not path.exists(task_dir): makedirs(task_dir) chmod(task_dir, Perm.S_IRWXU | Perm.S_IRWXG | Perm.S_IRWXO) # ensure the standalone dequeuing process can open files in the directory # change directory to task directory query_filename = '' if 'query-file' in request.FILES: query_filename = path.join(settings.MEDIA_ROOT, 'clustal', 'task', task_id, request.FILES['query-file'].name) with open(query_filename, 'wb') as query_f: for chunk in request.FILES['query-file'].chunks(): query_f.write(chunk) elif 'query-sequence' in request.POST and request.POST[ 'query-sequence']: query_filename = path.join(settings.MEDIA_ROOT, 'clustal', 'task', task_id, task_id + '.in') with open(query_filename, 'wb') as query_f: query_text = [ x.encode('ascii', 'ignore').strip() for x in request.POST['query-sequence'].split('\n') ] query_f.write('\n'.join(query_text)) else: return render(request, 'clustal/invalid_query.html', { 'title': '', }) chmod(query_filename, Perm.S_IRWXU | Perm.S_IRWXG | Perm.S_IRWXO) # ensure the standalone dequeuing process can access the file bin_name = get_bin_name( ) # note that we didn't support Clustal on windows yet program_path = path.join(settings.BASE_DIR, 'clustal', bin_name) # count number of query sequence by counting '>' with open(query_filename, 'r') as f: qstr = f.read() seq_count = qstr.count('>') if (seq_count > 600): return render( request, 'clustal/invalid_query.html', { 'title': 'Clustal: Max number of query sequences: 600 sequences.', }) is_color = False # check if program is in list for security if request.POST['program'] in ['clustalw', 'clustalo']: option_params = [] args_list = [] if request.POST['program'] == 'clustalw': # clustalw option_params.append("-type=" + request.POST['sequenceType']) # parameters setting for full option or fast option if request.POST['pairwise'] == "full": if request.POST['sequenceType'] == "dna": if request.POST['PWDNAMATRIX'] != "": option_params.append('-PWDNAMATRIX=' + request.POST['PWDNAMATRIX']) if request.POST['dna-PWGAPOPEN'] != "": option_params.append('-PWGAPOPEN=' + request.POST['dna-PWGAPOPEN']) if request.POST['dna-PWGAPEXT'] != "": option_params.append('-PWGAPEXT=' + request.POST['dna-PWGAPEXT']) elif request.POST['sequenceType'] == "protein": if request.POST['PWMATRIX'] != "": option_params.append('-PWMATRIX=' + request.POST['PWMATRIX']) if request.POST['protein-PWGAPOPEN'] != "": option_params.append( '-PWGAPOPEN=' + request.POST['protein-PWGAPOPEN']) if request.POST['protein-PWGAPEXT'] != "": option_params.append( '-PWGAPEXT=' + request.POST['protein-PWGAPEXT']) elif request.POST['pairwise'] == "fast": option_params.append('-QUICKTREE') if request.POST['KTUPLE'] != "": option_params.append('-KTUPLE=' + request.POST['KTUPLE']) if request.POST['WINDOW'] != "": option_params.append('-WINDOW=' + request.POST['WINDOW']) if request.POST['PAIRGAP'] != "": option_params.append('-PAIRGAP=' + request.POST['PAIRGAP']) if request.POST['TOPDIAGS'] != "": option_params.append('-TOPDIAGS=' + request.POST['TOPDIAGS']) if request.POST['SCORE'] != "": option_params.append('-SCORE=' + request.POST['SCORE']) # prarmeters setting for mutliple alignment if request.POST['sequenceType'] == "dna": if request.POST['DNAMATRIX'] != "": option_params.append('-DNAMATRIX=' + request.POST['DNAMATRIX']) if request.POST['dna-GAPOPEN'] != "": option_params.append('-GAPOPEN=' + request.POST['dna-GAPOPEN']) if request.POST['dna-GAPEXT'] != "": option_params.append('-GAPEXT=' + request.POST['dna-GAPEXT']) if request.POST['dna-GAPDIST'] != "": option_params.append('-GAPDIST=' + request.POST['dna-GAPDIST']) if request.POST['dna-ITERATION'] != "": option_params.append('-ITERATION=' + request.POST['dna-ITERATION']) if request.POST['dna-NUMITER'] != "": option_params.append('-NUMITER=' + request.POST['dna-NUMITER']) if request.POST['dna-CLUSTERING'] != "": option_params.append('-CLUSTERING=' + request.POST['dna-CLUSTERING']) elif request.POST['sequenceType'] == "protein": if request.POST['MATRIX'] != "": option_params.append('-MATRIX=' + request.POST['MATRIX']) if request.POST['protein-GAPOPEN'] != "": option_params.append('-GAPOPEN=' + request.POST['protein-GAPOPEN']) if request.POST['protein-GAPEXT'] != "": option_params.append('-GAPEXT=' + request.POST['protein-GAPEXT']) if request.POST['protein-GAPDIST'] != "": option_params.append('-GAPDIST=' + request.POST['protein-GAPDIST']) if request.POST['protein-ITERATION'] != "": option_params.append('-ITERATION=' + request.POST['protein-ITERATION']) if request.POST['protein-NUMITER'] != "": option_params.append('-NUMITER=' + request.POST['protein-NUMITER']) if request.POST['protein-CLUSTERING'] != "": option_params.append( '-CLUSTERING=' + request.POST['protein-CLUSTERING']) # parameters setting of output is_color = True if request.POST[ 'OUTPUT'] == 'clustal' else False option_params.append('-OUTPUT=' + request.POST['OUTPUT']) option_params.append('-OUTORDER=' + request.POST['OUTORDER']) args_list.append([ path.join(program_path, 'clustalw2'), '-infile=' + query_filename, '-OUTFILE=' + path.join(settings.MEDIA_ROOT, 'clustal', 'task', task_id, task_id + '.aln'), '-type=protein' ] + option_params) args_list_log = [] args_list_log.append([ 'clustalw2', '-infile=' + path.basename(query_filename), '-OUTFILE=' + task_id + '.aln', '-type=protein' ] + option_params) else: # clustalo if request.POST['dealing_input'] == "yes": option_params.append("--dealign") if request.POST['clustering_guide_tree'] != "no": option_params.append("--full") if request.POST['clustering_guide_iter'] != "no": option_params.append("--full-iter") if request.POST['combined_iter'] != "": option_params.append("--iterations=" + request.POST['combined_iter']) if request.POST['max_gt_iter'] != "": option_params.append("--max-guidetree-iterations=" + request.POST['max_gt_iter']) if request.POST['max_hmm_iter'] != "": option_params.append("--max-hmm-iterations=" + request.POST['max_hmm_iter']) if request.POST['omega_output'] != "": option_params.append("--outfmt=" + request.POST['omega_output']) is_color = True if request.POST[ 'omega_output'] == 'clu' else False if request.POST['omega_order'] != "": option_params.append("--output-order=" + request.POST['omega_order']) args_list.append([ path.join(program_path, 'clustalo'), '--infile=' + query_filename, '--guidetree-out=' + path.join(settings.MEDIA_ROOT, 'clustal', 'task', task_id, task_id) + '.ph', '--outfile=' + path.join(settings.MEDIA_ROOT, 'clustal', 'task', task_id, task_id) + '.aln' ] + option_params) args_list_log = [] args_list_log.append([ 'clustalo', '--infile=' + path.basename(query_filename), '--guidetree-out=' + task_id + '.ph', '--outfile=' + task_id + '.aln' ] + option_params) record = ClustalQueryRecord() record.task_id = task_id if request.user.is_authenticated(): record.user = request.user record.save() # generate status.json for frontend status checking with open(query_filename, 'r' ) as f: # count number of query sequence by counting '>' qstr = f.read() seq_count = qstr.count('>') if (seq_count == 0): seq_count = 1 with open( path.join(settings.MEDIA_ROOT, 'clustal', 'task', task_id, 'status.json'), 'wb') as f: json.dump( { 'status': 'pending', 'seq_count': seq_count, 'program': request.POST['program'], 'cmd': " ".join(args_list_log[0]), 'is_color': is_color, 'query_filename': path.basename(query_filename) }, f) run_clustal_task.delay(task_id, args_list, file_prefix) return redirect('clustal:retrieve', task_id) else: raise Http404
def create(request): ''' Main page of Hmmer Use hmmsearch fast mode for format validation Input limitation: (1). Phmmer, Max number of query sequences: 10 sequences ''' if request.method == 'GET': hmmerdb_list = sorted([['Protein', "Protein", db.title, db.organism.display_name, db.description] for db in HmmerDB.objects.select_related('organism').filter(is_shown=True)], key=lambda x: (x[3], x[1], x[0], x[2])) hmmerdb_type_counts = dict([(k.lower().replace(' ', '_'), len(list(g))) for k, g in groupby(sorted(hmmerdb_list, key=lambda x: x[0]), key=lambda x: x[0])]) ''' Redirect from clustal result ''' clustal_content = [] if ("clustal_task_id" in request.GET): clustal_aln = path.join(settings.MEDIA_ROOT, 'clustal', 'task', request.GET['clustal_task_id'], request.GET['clustal_task_id'] + ".aln") with open(clustal_aln, 'r') as content_file: for line in content_file: clustal_content.append(line) return render(request, 'hmmer/main.html', { 'title': 'HMMER Query', 'hmmerdb_list': json.dumps(hmmerdb_list), 'hmmerdb_type_counts': hmmerdb_type_counts, 'clustal_content': "".join(clustal_content), }) elif request.method == 'POST': # setup file paths task_id = uuid4().hex task_dir = path.join(settings.MEDIA_ROOT, 'hmmer', 'task', task_id) # file_prefix only for task... file_prefix = path.join(settings.MEDIA_ROOT, 'hmmer', 'task', task_id, task_id) if not path.exists(task_dir): makedirs(task_dir) chmod(task_dir, Perm.S_IRWXU | Perm.S_IRWXG | Perm.S_IRWXO) # ensure the standalone dequeuing process can open files in the directory # change directory to task directory if 'query-file' in request.FILES: query_filename = path.join(settings.MEDIA_ROOT, 'hmmer', 'task', task_id, request.FILES['query-file'].name) with open(query_filename, 'wb') as query_f: for chunk in request.FILES['query-file'].chunks(): query_f.write(chunk) elif 'query-sequence' in request.POST and request.POST['query-sequence']: query_filename = path.join(settings.MEDIA_ROOT, 'hmmer', 'task', task_id, task_id + '.in') with open(query_filename, 'wb') as query_f: query_text = [x.encode('ascii', 'ignore').strip() for x in request.POST['query-sequence'].split('\n')] query_f.write('\n'.join(query_text)) else: return render(request, 'hmmer/invalid_query.html', {'title': '', }) chmod(query_filename, Perm.S_IRWXU | Perm.S_IRWXG | Perm.S_IRWXO) # ensure the standalone dequeuing process can access the file bin_name = get_bin_name() # Note that currently we didn't support HMMER on windows program_path = path.join(settings.BASE_DIR, 'hmmer', bin_name, 'bin') if request.POST['program'] == 'phmmer': with open(query_filename, 'r') as f: qstr = f.read() if qstr.count('>') > int(HMMER_QUERY_MAX): query_cnt = str(qstr.count('>')) remove(query_filename) return render(request, 'hmmer/invalid_query.html', {'title': 'Your search includes ' + query_cnt + ' sequences, but HMMER allows a maximum of ' + str(HMMER_QUERY_MAX) + ' sequences per submission.', }) elif request.POST['program'] == 'hmmsearch': ''' Format validation by hmmsearch fast mode If the machine can't perform it in short time, it could be marked. But you need find a good to check format in front-end ''' p = Popen([path.join(program_path, "hmmbuild"), "--fast", '--amino', path.join(settings.MEDIA_ROOT, 'hmmer', 'task', 'hmmbuild.test'), query_filename], stdout=PIPE, stderr=PIPE) p.wait() result = p.communicate()[1] if(result != ''): return render(request, 'hmmer/invalid_query.html', {'title': 'Invalid MSA format', 'info' :'<a href="http://toolkit.tuebingen.mpg.de/reformat/help_params#format" target="_blank"> \ Valid MSA format descriptions </a>' }) else: # check if program is in list for security raise Http404 # build hmmer command db_list = [db.fasta_file.path_full for db in HmmerDB.objects.filter(title__in=set(request.POST.getlist('db-name')))] for db in db_list: symlink(db, path.join(settings.MEDIA_ROOT, 'hmmer', 'task', task_id, db[db.rindex('/') + 1:])) if not db_list: return render(request, 'hmmer/invalid_query.html', {'title': '', }) if request.POST['cutoff'] == 'evalue': option_params = ['--incE', request.POST['s_sequence'], '--incdomE', request.POST['s_hit'], '-E', request.POST['r_sequence'], '--domE', request.POST['r_hit']] elif request.POST['cutoff'] == 'bitscore': option_params = ['--incT', request.POST['s_sequence'], '--incdomT', request.POST['s_hit'], '-T', request.POST['r_sequence'], '--domT', request.POST['r_hit']] else: raise Http404 record = HmmerQueryRecord() record.task_id = task_id if request.user.is_authenticated(): record.user = request.user record.save() # generate status.json for frontend statu checking with open(query_filename, 'r') as f: qstr = f.read() seq_count = qstr.count('>') if (seq_count == 0): seq_count = 1 with open(path.join(settings.MEDIA_ROOT, 'hmmer', 'task', task_id, 'status.json'), 'wb') as f: json.dump({'status': 'pending', 'seq_count': seq_count, 'db_list': [db[db.rindex('/') + 1:] for db in db_list], 'program': request.POST['program'], 'params': option_params, 'input': path.basename(query_filename)}, f) args = generate_hmmer_args(request.POST['program'], program_path, query_filename, option_params, db_list) run_hmmer_task.delay(task_id, args, file_prefix) return redirect('hmmer:retrieve', task_id)
def generate_blast_args(program): input_file_dir = path.join(settings.BASE_DIR, 'example', 'blastdb/') output_file_dir = path.join(settings.BASE_DIR, 'test_' + program + '/') asn_filename = path.join(output_file_dir, 'test_' + program + '.asn') if program == 'blastp': query_filename = path.join(input_file_dir, 'Cimex_sample_pep_query.faa') db_list = path.join(input_file_dir, 'clec_peptide_example_BLASTdb.fa') options = { 'max_target_seqs': '100', 'evalue': '10.0', 'word_size': '6', 'matrix': 'BLOSUM62', 'threshold': '11', 'gapopen': '11', 'gapextend': '1', 'low_complexity': 'no', 'soft_masking': 'false', } elif program == 'blastn': query_filename = path.join(input_file_dir, 'LFUL_sample_query.fna') db_list = path.join(input_file_dir, 'Ladonda_sample_CDS_BLASTdb.fna') options = { 'max_target_seqs': '100', 'evalue': '10.0', 'word_size': '11', 'reward': '2', 'penalty': '-3', 'gapopen': '5', 'gapextend': '2', 'strand': 'both', 'low_complexity': 'yes', 'soft_masking': 'true', } elif program == 'tblastn': query_filename = path.join(input_file_dir, 'Cimex_sample_pep_query.faa') db_list = path.join(input_file_dir, 'Ladonda_sample_CDS_BLASTdb.fna') options = { 'max_target_seqs': '100', 'evalue': '10.0', 'word_size': '6', 'matrix': 'BLOSUM62', 'threshold': '13', 'gapopen': '11', 'gapextend': '1', 'low_complexity': 'yes', 'soft_masking': 'false', } elif program == 'tblastx': query_filename = path.join(input_file_dir, 'LFUL_sample_query.fna') db_list = path.join(input_file_dir, 'Ladonda_sample_CDS_BLASTdb.fna') options = { 'max_target_seqs': '100', 'evalue': '10.0', 'word_size': '3', 'matrix': 'BLOSUM62', 'threshold': '13', 'strand': 'both', 'low_complexity': 'yes', 'soft_masking': 'false', } elif program == 'blastx': query_filename = path.join(input_file_dir, 'LFUL_sample_query.fna') db_list = path.join(input_file_dir, 'clec_peptide_example_BLASTdb.fa') options = { 'max_target_seqs': '100', 'evalue': '10.0', 'word_size': '6', 'matrix': 'BLOSUM62', 'threshold': '12', 'strand': 'both', 'gapopen': '11', 'gapextend': '1', 'low_complexity': 'no', 'soft_masking': 'false', } if path.exists(output_file_dir): rmtree(output_file_dir) makedirs(output_file_dir) bin_name = get_bin_name() program_path = path.join(settings.BASE_DIR, 'blast', bin_name, program) blast_customized_options = { 'blastn': [ 'max_target_seqs', 'evalue', 'word_size', 'reward', 'penalty', 'gapopen', 'gapextend', 'strand', 'low_complexity', 'soft_masking' ], 'tblastn': [ 'max_target_seqs', 'evalue', 'word_size', 'matrix', 'threshold', 'gapopen', 'gapextend', 'low_complexity', 'soft_masking' ], 'tblastx': [ 'max_target_seqs', 'evalue', 'word_size', 'matrix', 'threshold', 'strand', 'low_complexity', 'soft_masking' ], 'blastp': [ 'max_target_seqs', 'evalue', 'word_size', 'matrix', 'threshold', 'gapopen', 'gapextend', 'low_complexity', 'soft_masking' ], 'blastx': [ 'max_target_seqs', 'evalue', 'word_size', 'matrix', 'threshold', 'strand', 'gapopen', 'gapextend', 'low_complexity', 'soft_masking' ] } input_opt = [] max_target_seqs = options.get('max_target_seqs', '50') for blast_option in blast_customized_options[program]: if blast_option == 'low_complexity': if program == 'blastn': input_opt.extend(['-dust', options['low_complexity']]) else: input_opt.extend(['-seg', options['low_complexity']]) else: input_opt.extend(['-' + blast_option, options[blast_option]]) args_list = [[ program_path, '-query', query_filename, '-db', db_list, '-outfmt', '11', '-out', asn_filename, '-num_threads', '2' ]] args_list[0].extend(input_opt) blast_formatter_path = path.join(settings.BASE_DIR, 'blast', bin_name, 'blast_formatter') blast_col_name = 'qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qcovs qlen slen qframe sframe' blast_info = { 'col_types': [ 'str', 'str', 'float', 'int', 'int', 'int', 'int', 'int', 'int', 'int', 'float', 'float', 'int', 'int', 'int', 'int', 'int', 'int' ], 'col_names': blast_col_name.split(), 'ext': { '.0': '0', '.html': '0', '.1': '1', '.3': '3', '.xml': '5', '.tsv': '6 ' + blast_col_name, '.csv': '10 ' + blast_col_name, }, } for ext, outfmt in blast_info['ext'].items(): args = [ blast_formatter_path, '-archive', asn_filename, '-outfmt', outfmt, '-out', output_file_dir + 'test_' + program + ext ] if ext == '.html': args.append('-html') if int(outfmt.split()[0]) > 4: args.extend(['-max_target_seqs', max_target_seqs]) else: args.extend([ '-num_descriptions', max_target_seqs, '-num_alignments', max_target_seqs ]) args_list.append(args) return args_list