Пример #1
0
def run_hmmer(program, assertEqual):
    """Run an HMMER program against the bundled example data.

    A scratch directory ``test_hmmer`` is created under ``settings.BASE_DIR``,
    the example query and database files are copied into it, the command
    line is built with ``generate_hmmer_args`` and executed through
    ``run_commands``.  The scratch directory is always removed afterwards and
    the working directory that was active on entry is restored.

    Args:
        program: ``'phmmer'`` selects the example peptide FASTA query; any
            other value selects the example MSA (intended for hmmsearch).
        assertEqual: assertion callable forwarded to ``run_commands``.
    """
    from os import getcwd  # local import: only needed to restore the cwd

    all_perms = Perm.S_IRWXU | Perm.S_IRWXG | Perm.S_IRWXO
    example_dir = path.join(settings.BASE_DIR, 'example')
    test_dir = path.join(settings.BASE_DIR, 'test_hmmer')
    # Remember where we started: the scratch directory is deleted at the end,
    # and the process must not be left inside a directory that no longer exists.
    original_cwd = getcwd()
    if not path.exists(test_dir):
        mkdir(test_dir)
    try:
        chmod(test_dir, all_perms)
        if program == 'phmmer':
            query_source = path.join(example_dir, 'blastdb', 'Cimex_sample_pep_query.faa')
        else:  # program == 'hmmsearch'
            query_source = path.join(example_dir, 'hmmer', 'example.MSA')
        query_filename = path.join(test_dir, path.basename(query_source))
        copyfile(query_source, query_filename)
        db_file = path.join(test_dir, 'AGLA_new_ids.faa')
        copyfile(path.join(example_dir, 'blastdb', 'AGLA_new_ids.faa'), db_file)
        chmod(query_filename, all_perms)
        chmod(db_file, all_perms)
        program_path = path.join(settings.BASE_DIR, 'hmmer', get_bin_name(), 'bin')
        option_params = ['--incE', u'0.01', '--incdomE', u'0.03', '-E', u'0.01', '--domE', u'0.03']
        args = generate_hmmer_args(program, program_path, query_filename, option_params, [db_file])
        chdir(test_dir)
        run_commands(args, assertEqual)
    finally:
        # Leave the scratch directory before deleting it; this also cleans up
        # when copying the example files failed part-way through.
        chdir(original_cwd)
        rmtree(test_dir)
Пример #2
0
 def makeblastdb(self):
     """Build a BLAST database from this record's FASTA file.

     Returns a ``(returncode, stderr, stdout)`` tuple taken from the
     ``makeblastdb`` process, or ``(1, 'FASTA file not found', '')`` when the
     FASTA file does not exist on disk.
     """
     if not os.path.isfile(self.fasta_file.path_full):
         return 1, 'FASTA file not found', ''

     executable = os.path.join(settings.BASE_DIR, 'blast', get_bin_name(), 'makeblastdb')
     # TODO: make '-parse_seqids' an option
     command = [executable]
     command.extend(['-in', self.fasta_file.path_full])
     command.extend(['-dbtype', self.type.molecule_type])
     command.append('-hash_index')
     # Title and taxonomy id are only passed along when they are set.
     if self.title:
         command.extend(['-title', self.title])
     if self.organism.tax_id:
         command.extend(['-taxid', str(self.organism.tax_id)])

     process = Popen(command, stdout=PIPE, stderr=PIPE)
     stdout_data, stderr_data = process.communicate()
     return process.returncode, stderr_data, stdout_data
Пример #3
0
 def test_clustalw(self):
     """Run the bundled ``clustalw2`` binary on the example peptide FASTA.

     The test passes when the process exits with status 0.  Output is written
     to a scratch directory ``test_clustal`` under ``settings.BASE_DIR`` which
     is removed afterwards.  Nothing is executed when ``get_bin_name()``
     reports ``'bin_win'`` or ``'bin_mac'``.
     """
     bin_name = get_bin_name()
     # Decide about skipping before touching the filesystem so that a skipped
     # run does not leave an empty scratch directory behind.
     if bin_name == 'bin_win' or bin_name == 'bin_mac':
         return
     test_dir = path.join(settings.BASE_DIR, 'test_clustal')
     if not path.exists(test_dir):
         mkdir(test_dir)
     try:
         program_path = path.join(settings.BASE_DIR, 'clustal', bin_name, 'clustalw2')
         example_file_path = path.join(settings.BASE_DIR, 'example', 'blastdb', 'Cimex_sample_pep_query.faa')
         out_file_path = path.join(test_dir, 'test.out')
         args = [program_path, '-infile=' + example_file_path,
                 '-OUTFILE=' + out_file_path,
                 '-type=protein']
         p = Popen(args, stdin=PIPE, stdout=PIPE)
         # communicate() drains stdout while waiting; wait() alone can block
         # forever once the child has filled the pipe buffer.
         p.communicate()
         self.assertEqual(p.returncode, 0)
     finally:
         rmtree(test_dir)
Пример #4
0
 def test_clustalo(self):
     """Run the bundled ``clustalo`` binary on the example peptide FASTA.

     The test passes when the process exits with status 0.  The alignment and
     guide tree are written to a scratch directory ``test_clustal`` under
     ``settings.BASE_DIR`` which is removed afterwards.  Nothing is executed
     when ``get_bin_name()`` reports ``'win32'``.
     """
     bin_name = get_bin_name()
     # NOTE(review): the sibling clustalw test compares against 'bin_win' /
     # 'bin_mac'; confirm that 'win32' is a value get_bin_name() can return.
     # The check runs before any directory is created so that a skipped run
     # leaves nothing behind.
     if bin_name == 'win32':
         return
     test_dir = path.join(settings.BASE_DIR, 'test_clustal')
     if not path.exists(test_dir):
         mkdir(test_dir)
     try:
         program_path = path.join(settings.BASE_DIR, 'clustal', bin_name, 'clustalo')
         example_file_path = path.join(settings.BASE_DIR, 'example', 'blastdb', 'Cimex_sample_pep_query.faa')
         out_file_path = path.join(test_dir, 'test.out')
         ph_file_path = path.join(test_dir, 'test.ph')
         args = [program_path, '--infile=' + example_file_path,
                 '--outfile=' + out_file_path,
                 '--guidetree-out=' + ph_file_path,
                 '--full', '--full-iter', '--iterations=0',
                 '--outfmt=clu', '--output-order=tree-order']
         p = Popen(args, stdin=PIPE, stdout=PIPE)
         # communicate() drains stdout while waiting; wait() alone can block
         # forever once the child has filled the pipe buffer.
         p.communicate()
         self.assertEqual(p.returncode, 0)
     finally:
         rmtree(test_dir)
Пример #5
0
            clustalo_path)

        clustalw_tar_path = join(
            clustal_bin_path, 'clustalw-2.1-linux-x86_64-libcppstatic.tar.gz')
        clustalw_path = join(clustal_bin_path, 'clustalw2')
        urllib.request.urlretrieve(
            'http://www.clustal.org/download/current/clustalw-2.1-linux-x86_64-libcppstatic.tar.gz',
            clustalw_tar_path)

        print('Installing clustalw ...')
        tar = tarfile.open(clustalw_tar_path, 'r:gz')
        for member in tar.getmembers():
            if member.isreg():
                member.name = basename(member.name)
                tar.extract(member, clustal_bin_path)
        tar.close()
        remove(clustalw_tar_path)

    chmod(clustalo_path, Perm.S_IXUSR | Perm.S_IXGRP | Perm.S_IXOTH)
    chmod(clustalw_path, Perm.S_IXUSR | Perm.S_IXGRP | Perm.S_IXOTH)


if __name__ == '__main__':
    # Script entry point: resolve the platform binary directory name once,
    # then run each installer in turn, announcing it first.
    bin_name = get_bin_name()
    for label, installer in (('blast', install_blast),
                             ('hmmer', install_hmmer),
                             ('clustal', install_clustal)):
        print('Installing ' + label + ' ...')
        installer(bin_name)
Пример #6
0
def create(request, iframe=False):
    """Serve the BLAST query form (GET) or queue a new BLAST search (POST).

    GET renders ``blast/main.html`` with every shown database for which
    ``db_ready()`` is true, plus a per-dataset-type count.
    OPTIONS answers with a plain-text body and status 202.
    POST stores the query under ``MEDIA_ROOT/blast/task/<task_id>/``, rejects
    it when it exceeds ``settings.BLAST_QUERY_SIZE_MAX`` (kilobytes) or
    contains more than ``settings.BLAST_QUERY_MAX`` '>' characters, builds
    the search and ``blast_formatter`` command lines, writes ``status.json``,
    queues ``run_blast_task`` and redirects to ``blast:retrieve``.

    Args:
        request: the incoming Django request.
        iframe: passed through to the GET template context unchanged.

    Raises:
        Http404: when the posted ``program`` is not in the allowed list.
    """
    if request.method == 'GET':
        blastdb_list = sorted(
            [[db.type.dataset_type, db.type.get_molecule_type_display(), db.title,
              db.organism.display_name, db.description]
             for db in BlastDb.objects.select_related('organism').select_related('type').filter(is_shown=True)
             if db.db_ready()],
            key=lambda x: (x[3], x[1], x[0], x[2]))
        # groupby only merges adjacent items, hence the sort on the same key.
        blastdb_type_counts = dict(
            [(k.lower().replace(' ', '_'), len(list(g)))
             for k, g in groupby(sorted(blastdb_list, key=lambda x: x[0]), key=lambda x: x[0])])
        return render(request, 'blast/main.html', {
            'title': 'BLAST Query',
            'blastdb_list': json.dumps(blastdb_list),
            'blastdb_type_counts': blastdb_type_counts,
            'iframe': iframe
        })
    elif request.method == 'OPTIONS':
        return HttpResponse("OPTIONS METHOD NOT SUPPORTED", status=202)
    elif request.method == 'POST':
        all_perms = Perm.S_IRWXU | Perm.S_IRWXG | Perm.S_IRWXO
        # setup file paths
        task_id = uuid4().hex  # TODO: Create from hash of input to check for duplicate inputs
        file_prefix = path.join(settings.MEDIA_ROOT, 'blast', 'task', task_id, task_id)
        query_filename = file_prefix + '.in'
        asn_filename = file_prefix + '.asn'
        task_dir = path.dirname(query_filename)
        if not path.exists(task_dir):
            makedirs(task_dir)
        # ensure the standalone dequeuing process can open files in the directory
        chmod(task_dir, all_perms)
        bin_name = get_bin_name()
        # write query to file
        if 'query-file' in request.FILES:
            with open(query_filename, 'wb') as query_f:
                for chunk in request.FILES['query-file'].chunks():
                    query_f.write(chunk)
        elif 'query-sequence' in request.POST:
            with open(query_filename, 'wb') as query_f:
                query_text = [x.encode('ascii', 'ignore').strip() for x in request.POST['query-sequence'].split('\n')]
                # The lines are encoded, so the separator must be bytes too;
                # a text separator raises TypeError on Python 3.
                query_f.write(b'\n'.join(query_text))
        else:
            return render(request, 'blast/invalid_query.html', {'title': 'Invalid Query'})

        query_size = path.getsize(query_filename)
        if query_size > int(settings.BLAST_QUERY_SIZE_MAX) * 1024:
            return render(request, 'blast/invalid_query.html', {'title': 'Your query size is ' + str(query_size) + ' bytes, but exceeds our query size limit of ' + str(settings.BLAST_QUERY_SIZE_MAX) + ' kbytes,  Please try again with a smaller query size.',})

        # ensure the standalone dequeuing process can access the file
        chmod(query_filename, all_perms)

        # build blast command
        db_list = ' '.join([db.fasta_file.path_full for db in BlastDb.objects.filter(title__in=set(request.POST.getlist('db-name'))) if db.db_ready()])
        if not db_list:
            return render(request, 'blast/invalid_query.html', {'title': 'Invalid Query',})

        # check if program is in list for security
        if request.POST['program'] in ['blastn', 'tblastn', 'tblastx', 'blastp', 'blastx']:

            # count number of query sequences by counting '>'
            with open(query_filename, 'r') as f:
                qstr = f.read()
            seq_count = qstr.count('>')
            if seq_count > int(settings.BLAST_QUERY_MAX):
                remove(query_filename)
                return render(request, 'blast/invalid_query.html',
                        {'title': 'Your search includes ' + str(seq_count) + ' sequences, but blast allows a maximum of ' + str(settings.BLAST_QUERY_MAX) + ' sequences per submission.', })

            # generate customized_options
            input_opt = []
            # The value ends up in an argv list, so the default has to be a
            # string just like a posted value would be.
            max_target_seqs = request.POST.get('max_target_seqs', '50')
            for blast_option in blast_customized_options[request.POST['program']]:
                if blast_option == 'low_complexity':
                    if request.POST['program'] == 'blastn':
                        input_opt.extend(['-dust', request.POST['low_complexity']])
                    else:
                        input_opt.extend(['-seg', request.POST['low_complexity']])
                else:
                    input_opt.extend(['-'+blast_option, request.POST[blast_option]])

            program_path = path.join(settings.BASE_DIR, 'blast', bin_name, request.POST['program'])
            num_threads = '4' if cpu_count() >= 4 else str(cpu_count())
            args_list = [[program_path, '-query', query_filename, '-db', db_list, '-outfmt', '11', '-out', asn_filename, '-num_threads', num_threads]]
            args_list[0].extend(input_opt)

            # convert to multiple formats
            blast_formatter_path = path.join(settings.BASE_DIR, 'blast', bin_name, 'blast_formatter')
            for ext, outfmt in blast_info['ext'].items():
                args = [blast_formatter_path, '-archive', asn_filename, '-outfmt', outfmt, '-out', file_prefix + ext]
                if ext == '.html':
                    args.append('-html')
                if int(outfmt.split()[0]) > 4:
                    args.extend(['-max_target_seqs', max_target_seqs])
                else:
                    args.extend(['-num_descriptions', max_target_seqs, '-num_alignments', max_target_seqs])
                args_list.append(args)
            record = BlastQueryRecord()
            record.task_id = task_id
            if request.user.is_authenticated():
                record.user = request.user
            record.save()

            # generate status.json for frontend status checking; a query
            # without any '>' header is reported as a single sequence
            if seq_count == 0:
                seq_count = 1
            # json.dump emits text, so the file is opened in text mode
            with open(path.join(path.dirname(file_prefix), 'status.json'), 'w') as status_f:
                json.dump({'status': 'pending', 'seq_count': seq_count}, status_f)

            run_blast_task.delay(task_id, args_list, file_prefix, blast_info)

            # debug
            # run_blast_task.delay(task_id, args_list, file_prefix, blast_info).get()
            return redirect('blast:retrieve', task_id)
        else:
            raise Http404
Пример #7
0
def create(request):
    '''
    Main page of Clustal
    * Max number of query sequences: 600 sequences

    GET renders the query form.  POST stores the query under
    MEDIA_ROOT/clustal/task/<task_id>/, translates the posted form fields
    into a clustalw2 or clustalo command line, writes status.json, queues
    run_clustal_task and redirects to clustal:retrieve.

    Raises Http404 when the posted program is neither 'clustalw' nor
    'clustalo'.
    '''
    if request.method == 'GET':
        return render(request, 'clustal/main.html', {
            'title': 'Clustal Query',
        })
    elif request.method == 'POST':
        all_perms = Perm.S_IRWXU | Perm.S_IRWXG | Perm.S_IRWXO
        # setup file paths
        task_id = uuid4().hex
        task_dir = path.join(settings.MEDIA_ROOT, 'clustal', 'task', task_id)
        # file_prefix only for task...
        file_prefix = path.join(task_dir, task_id)
        if not path.exists(task_dir):
            makedirs(task_dir)
        # ensure the standalone dequeuing process can open files in the directory
        chmod(task_dir, all_perms)

        if 'query-file' in request.FILES:
            query_filename = path.join(task_dir, request.FILES['query-file'].name)
            with open(query_filename, 'wb') as query_f:
                for chunk in request.FILES['query-file'].chunks():
                    query_f.write(chunk)
        elif 'query-sequence' in request.POST and request.POST['query-sequence']:
            query_filename = path.join(task_dir, task_id + '.in')
            with open(query_filename, 'wb') as query_f:
                query_text = [
                    x.encode('ascii', 'ignore').strip()
                    for x in request.POST['query-sequence'].split('\n')
                ]
                # The lines are encoded, so the separator must be bytes too;
                # a text separator raises TypeError on Python 3.
                query_f.write(b'\n'.join(query_text))
        else:
            return render(request, 'clustal/invalid_query.html', {
                'title': '',
            })

        # ensure the standalone dequeuing process can access the file
        chmod(query_filename, all_perms)
        # note that we didn't support Clustal on windows yet
        bin_name = get_bin_name()
        program_path = path.join(settings.BASE_DIR, 'clustal', bin_name)

        # count number of query sequence by counting '>'
        with open(query_filename, 'r') as f:
            seq_count = f.read().count('>')
        if seq_count > 600:
            return render(
                request, 'clustal/invalid_query.html', {
                    'title':
                    'Clustal: Max number of query sequences: 600 sequences.',
                })

        # check if program is in list for security
        program = request.POST['program']
        if program not in ['clustalw', 'clustalo']:
            raise Http404

        is_color = False
        option_params = []
        args_list = []
        args_list_log = []
        aln_name = task_id + '.aln'

        def add_if_set(flag, field):
            # Append flag + value when the posted form field is not empty.
            value = request.POST[field]
            if value != "":
                option_params.append(flag + value)

        if program == 'clustalw':
            seq_type = request.POST['sequenceType']
            option_params.append("-type=" + seq_type)

            # parameters setting for full option or fast option
            pairwise = request.POST['pairwise']
            if pairwise == "full":
                if seq_type == "dna":
                    add_if_set('-PWDNAMATRIX=', 'PWDNAMATRIX')
                    add_if_set('-PWGAPOPEN=', 'dna-PWGAPOPEN')
                    add_if_set('-PWGAPEXT=', 'dna-PWGAPEXT')
                elif seq_type == "protein":
                    add_if_set('-PWMATRIX=', 'PWMATRIX')
                    add_if_set('-PWGAPOPEN=', 'protein-PWGAPOPEN')
                    add_if_set('-PWGAPEXT=', 'protein-PWGAPEXT')
            elif pairwise == "fast":
                option_params.append('-QUICKTREE')
                for name in ('KTUPLE', 'WINDOW', 'PAIRGAP', 'TOPDIAGS', 'SCORE'):
                    add_if_set('-' + name + '=', name)

            # parameters setting for multiple alignment
            if seq_type == "dna":
                add_if_set('-DNAMATRIX=', 'DNAMATRIX')
            elif seq_type == "protein":
                add_if_set('-MATRIX=', 'MATRIX')
            if seq_type in ("dna", "protein"):
                # these form fields are named '<sequenceType>-<OPTION>'
                for name in ('GAPOPEN', 'GAPEXT', 'GAPDIST', 'ITERATION',
                             'NUMITER', 'CLUSTERING'):
                    add_if_set('-' + name + '=', seq_type + '-' + name)

            # parameters setting of output
            is_color = request.POST['OUTPUT'] == 'clustal'
            option_params.append('-OUTPUT=' + request.POST['OUTPUT'])
            option_params.append('-OUTORDER=' + request.POST['OUTORDER'])

            # NOTE(review): '-type=protein' is fixed here although
            # option_params already starts with '-type=<sequenceType>';
            # confirm which of the two clustalw2 honours.
            args_list.append([
                path.join(program_path, 'clustalw2'),
                '-infile=' + query_filename,
                '-OUTFILE=' + path.join(task_dir, aln_name),
                '-type=protein',
            ] + option_params)
            # same command with bare file names, for display in status.json
            args_list_log.append([
                'clustalw2',
                '-infile=' + path.basename(query_filename),
                '-OUTFILE=' + aln_name,
                '-type=protein',
            ] + option_params)

        else:  # clustalo
            if request.POST['dealing_input'] == "yes":
                option_params.append("--dealign")
            if request.POST['clustering_guide_tree'] != "no":
                option_params.append("--full")
            if request.POST['clustering_guide_iter'] != "no":
                option_params.append("--full-iter")

            add_if_set("--iterations=", 'combined_iter')
            add_if_set("--max-guidetree-iterations=", 'max_gt_iter')
            add_if_set("--max-hmm-iterations=", 'max_hmm_iter')
            add_if_set("--outfmt=", 'omega_output')
            is_color = request.POST['omega_output'] == 'clu'
            add_if_set("--output-order=", 'omega_order')

            args_list.append([
                path.join(program_path, 'clustalo'),
                '--infile=' + query_filename,
                '--guidetree-out=' + file_prefix + '.ph',
                '--outfile=' + file_prefix + '.aln',
            ] + option_params)
            # same command with bare file names, for display in status.json
            args_list_log.append([
                'clustalo',
                '--infile=' + path.basename(query_filename),
                '--guidetree-out=' + task_id + '.ph',
                '--outfile=' + aln_name,
            ] + option_params)

        record = ClustalQueryRecord()
        record.task_id = task_id
        if request.user.is_authenticated():
            record.user = request.user
        record.save()

        # generate status.json for frontend status checking; a query without
        # any '>' header is reported as a single sequence
        if seq_count == 0:
            seq_count = 1
        # json.dump emits text, so the file is opened in text mode
        with open(path.join(task_dir, 'status.json'), 'w') as status_f:
            json.dump(
                {
                    'status': 'pending',
                    'seq_count': seq_count,
                    'program': program,
                    'cmd': " ".join(args_list_log[0]),
                    'is_color': is_color,
                    'query_filename': path.basename(query_filename)
                }, status_f)
        run_clustal_task.delay(task_id, args_list, file_prefix)

        return redirect('clustal:retrieve', task_id)
Пример #8
0
def create(request):
    '''
    Main page of Hmmer
    Use hmmsearch fast mode for format validation

    Input limitation:
    (1). Phmmer, Max number of query sequences: 10 sequences

    GET renders the query form, optionally pre-filled with the alignment of
    the Clustal task named by the "clustal_task_id" query parameter.  POST
    stores the query under MEDIA_ROOT/hmmer/task/<task_id>/, validates it,
    links the selected databases into the task directory, writes status.json,
    queues run_hmmer_task and redirects to hmmer:retrieve.

    Raises Http404 for an unknown program or cutoff, and for a
    "clustal_task_id" that is not a plain name or has no alignment file.
    '''
    if request.method == 'GET':
        hmmerdb_list = sorted([['Protein', "Protein", db.title, db.organism.display_name, db.description] for db in
                               HmmerDB.objects.select_related('organism').filter(is_shown=True)],
                              key=lambda x: (x[3], x[1], x[0], x[2]))
        hmmerdb_type_counts = dict([(k.lower().replace(' ', '_'), len(list(g))) for k, g in
                                    groupby(sorted(hmmerdb_list, key=lambda x: x[0]), key=lambda x: x[0])])
        # Redirect from clustal result
        clustal_content = ''
        if "clustal_task_id" in request.GET:
            clustal_task_id = request.GET['clustal_task_id']
            # The id comes straight from the query string and is used to
            # build a filesystem path: accept a single path component only.
            if clustal_task_id in ('', '.', '..') or path.basename(clustal_task_id) != clustal_task_id:
                raise Http404
            clustal_aln = path.join(settings.MEDIA_ROOT, 'clustal',
                                    'task', clustal_task_id,
                                    clustal_task_id + ".aln")
            if not path.isfile(clustal_aln):
                raise Http404
            with open(clustal_aln, 'r') as content_file:
                clustal_content = content_file.read()

        return render(request, 'hmmer/main.html', {
            'title': 'HMMER Query',
            'hmmerdb_list': json.dumps(hmmerdb_list),
            'hmmerdb_type_counts': hmmerdb_type_counts,
            'clustal_content': clustal_content,
        })

    elif request.method == 'POST':
        all_perms = Perm.S_IRWXU | Perm.S_IRWXG | Perm.S_IRWXO
        # setup file paths
        task_id = uuid4().hex
        task_dir = path.join(settings.MEDIA_ROOT, 'hmmer', 'task', task_id)
        # file_prefix only for task...
        file_prefix = path.join(task_dir, task_id)
        if not path.exists(task_dir):
            makedirs(task_dir)

        # ensure the standalone dequeuing process can open files in the directory
        chmod(task_dir, all_perms)

        if 'query-file' in request.FILES:
            query_filename = path.join(task_dir, request.FILES['query-file'].name)
            with open(query_filename, 'wb') as query_f:
                for chunk in request.FILES['query-file'].chunks():
                    query_f.write(chunk)
        elif 'query-sequence' in request.POST and request.POST['query-sequence']:
            query_filename = path.join(task_dir, task_id + '.in')
            with open(query_filename, 'wb') as query_f:
                query_text = [x.encode('ascii', 'ignore').strip() for x in request.POST['query-sequence'].split('\n')]
                # The lines are encoded, so the separator must be bytes too;
                # a text separator raises TypeError on Python 3.
                query_f.write(b'\n'.join(query_text))
        else:
            return render(request, 'hmmer/invalid_query.html', {'title': '', })

        # ensure the standalone dequeuing process can access the file
        chmod(query_filename, all_perms)

        bin_name = get_bin_name()  # Note that currently we didn't support HMMER on windows
        program_path = path.join(settings.BASE_DIR, 'hmmer', bin_name, 'bin')

        if request.POST['program'] == 'phmmer':
            # count number of query sequences by counting '>'
            with open(query_filename, 'r') as f:
                query_cnt = f.read().count('>')
            if query_cnt > int(HMMER_QUERY_MAX):
                remove(query_filename)
                return render(request, 'hmmer/invalid_query.html',
                              {'title': 'Your search includes ' + str(query_cnt) + ' sequences, but HMMER allows a maximum of ' + str(HMMER_QUERY_MAX) + ' sequences per submission.', })
        elif request.POST['program'] == 'hmmsearch':
            # Validate the MSA format by running hmmbuild in fast mode.
            # If the host cannot do this quickly enough the check could be
            # dropped, but the format would then need validating in the
            # front-end instead.
            # NOTE(review): every request writes the same 'hmmbuild.test'
            # output file; confirm concurrent submissions cannot interfere.
            p = Popen([path.join(program_path, "hmmbuild"), "--fast", '--amino',
                       path.join(settings.MEDIA_ROOT, 'hmmer', 'task', 'hmmbuild.test'), query_filename],
                      stdout=PIPE, stderr=PIPE)
            # communicate() drains both pipes while waiting; calling wait()
            # first can block forever once a pipe buffer is full.
            result = p.communicate()[1]
            # Anything on stderr means the input was rejected.  Truthiness
            # works for both text and bytes, unlike comparing with ''.
            if result:
                return render(request, 'hmmer/invalid_query.html',
                             {'title': 'Invalid MSA format',
                              'info' :'<a href="http://toolkit.tuebingen.mpg.de/reformat/help_params#format" target="_blank"> \
                                      Valid MSA format descriptions </a>' })
        else:  # check if program is in list for security
            raise Http404

        # build hmmer command
        db_list = [db.fasta_file.path_full for db in HmmerDB.objects.filter(title__in=set(request.POST.getlist('db-name')))]
        if not db_list:
            return render(request, 'hmmer/invalid_query.html', {'title': '', })
        # link every selected database into the task directory
        for db in db_list:
            symlink(db, path.join(task_dir, path.basename(db)))

        cutoff = request.POST['cutoff']
        if cutoff == 'evalue':
            option_params = ['--incE', request.POST['s_sequence'], '--incdomE', request.POST['s_hit'],
                             '-E', request.POST['r_sequence'], '--domE', request.POST['r_hit']]
        elif cutoff == 'bitscore':
            option_params = ['--incT', request.POST['s_sequence'], '--incdomT', request.POST['s_hit'],
                             '-T', request.POST['r_sequence'], '--domT', request.POST['r_hit']]
        else:
            raise Http404

        record = HmmerQueryRecord()
        record.task_id = task_id
        if request.user.is_authenticated():
            record.user = request.user
        record.save()

        # generate status.json for frontend status checking; a query without
        # any '>' header is reported as a single sequence
        with open(query_filename, 'r') as f:
            seq_count = f.read().count('>')
        if seq_count == 0:
            seq_count = 1
        # json.dump emits text, so the file is opened in text mode
        with open(path.join(task_dir, 'status.json'), 'w') as status_f:
            json.dump({'status': 'pending', 'seq_count': seq_count,
                       'db_list': [path.basename(db) for db in db_list],
                       'program': request.POST['program'],
                       'params': option_params,
                       'input': path.basename(query_filename)}, status_f)
        args = generate_hmmer_args(request.POST['program'], program_path, query_filename, option_params, db_list)
        run_hmmer_task.delay(task_id, args, file_prefix)
        return redirect('hmmer:retrieve', task_id)
Пример #9
0
def generate_blast_args(program):
    """Build the command lines for an example BLAST run and its conversions.

    The first command runs ``program`` against the bundled example query and
    database and writes a BLAST archive (``-outfmt 11``) into
    ``<BASE_DIR>/test_<program>/``.  Every following command invokes
    ``blast_formatter`` to convert that archive into one report format.

    Side effect: the output directory is removed if it already exists and is
    then recreated empty.

    Args:
        program: One of 'blastp', 'blastn', 'tblastn', 'tblastx', 'blastx'.

    Returns:
        A list of argument lists: the search command first, followed by one
        ``blast_formatter`` command per output extension.

    Raises:
        ValueError: If ``program`` is not one of the supported programs.
            Raised before any directory is deleted or created.
    """
    protein_query = 'Cimex_sample_pep_query.faa'
    nucleotide_query = 'LFUL_sample_query.fna'
    protein_db = 'clec_peptide_example_BLASTdb.fa'
    nucleotide_db = 'Ladonda_sample_CDS_BLASTdb.fna'

    # program -> (query file, database file, ordered (option, value) pairs).
    # The pair order is the order in which the flags appear on the command
    # line, so values and ordering cannot drift apart.
    test_cases = {
        'blastp': (protein_query, protein_db, (
            ('max_target_seqs', '100'),
            ('evalue', '10.0'),
            ('word_size', '6'),
            ('matrix', 'BLOSUM62'),
            ('threshold', '11'),
            ('gapopen', '11'),
            ('gapextend', '1'),
            ('low_complexity', 'no'),
            ('soft_masking', 'false'),
        )),
        'blastn': (nucleotide_query, nucleotide_db, (
            ('max_target_seqs', '100'),
            ('evalue', '10.0'),
            ('word_size', '11'),
            ('reward', '2'),
            ('penalty', '-3'),
            ('gapopen', '5'),
            ('gapextend', '2'),
            ('strand', 'both'),
            ('low_complexity', 'yes'),
            ('soft_masking', 'true'),
        )),
        'tblastn': (protein_query, nucleotide_db, (
            ('max_target_seqs', '100'),
            ('evalue', '10.0'),
            ('word_size', '6'),
            ('matrix', 'BLOSUM62'),
            ('threshold', '13'),
            ('gapopen', '11'),
            ('gapextend', '1'),
            ('low_complexity', 'yes'),
            ('soft_masking', 'false'),
        )),
        'tblastx': (nucleotide_query, nucleotide_db, (
            ('max_target_seqs', '100'),
            ('evalue', '10.0'),
            ('word_size', '3'),
            ('matrix', 'BLOSUM62'),
            ('threshold', '13'),
            ('strand', 'both'),
            ('low_complexity', 'yes'),
            ('soft_masking', 'false'),
        )),
        'blastx': (nucleotide_query, protein_db, (
            ('max_target_seqs', '100'),
            ('evalue', '10.0'),
            ('word_size', '6'),
            ('matrix', 'BLOSUM62'),
            ('threshold', '12'),
            ('strand', 'both'),
            ('gapopen', '11'),
            ('gapextend', '1'),
            ('low_complexity', 'no'),
            ('soft_masking', 'false'),
        )),
    }

    # Validate before touching the filesystem: an unknown program must not
    # cause a stray test_<program>/ directory to be wiped and recreated.
    if program not in test_cases:
        raise ValueError('Unsupported BLAST program: %r' % (program,))
    query_name, db_name, option_pairs = test_cases[program]

    input_file_dir = path.join(settings.BASE_DIR, 'example', 'blastdb/')
    output_file_dir = path.join(settings.BASE_DIR, 'test_' + program + '/')
    asn_filename = path.join(output_file_dir, 'test_' + program + '.asn')
    query_filename = path.join(input_file_dir, query_name)
    db_list = path.join(input_file_dir, db_name)

    # Start every run from an empty output directory.
    if path.exists(output_file_dir):
        rmtree(output_file_dir)
    makedirs(output_file_dir)

    bin_name = get_bin_name()
    program_path = path.join(settings.BASE_DIR, 'blast', bin_name, program)

    search_args = [
        program_path, '-query', query_filename, '-db', db_list, '-outfmt',
        '11', '-out', asn_filename, '-num_threads', '2'
    ]
    # 'low_complexity' is passed as -dust for blastn and as -seg for every
    # other program; all remaining options map to '-<option name>'.
    low_complexity_flag = '-dust' if program == 'blastn' else '-seg'
    for option_name, option_value in option_pairs:
        if option_name == 'low_complexity':
            flag = low_complexity_flag
        else:
            flag = '-' + option_name
        search_args.extend([flag, option_value])
    args_list = [search_args]

    max_target_seqs = dict(option_pairs).get('max_target_seqs', '50')
    blast_formatter_path = path.join(settings.BASE_DIR, 'blast', bin_name,
                                     'blast_formatter')
    blast_col_name = 'qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore nident qcovs qlen slen qframe sframe'
    # (file extension, blast_formatter -outfmt value), in emission order.
    output_formats = (
        ('.0', '0'),
        ('.html', '0'),
        ('.1', '1'),
        ('.3', '3'),
        ('.xml', '5'),
        ('.tsv', '6 ' + blast_col_name),
        ('.csv', '10 ' + blast_col_name),
    )
    for ext, outfmt in output_formats:
        args = [
            blast_formatter_path, '-archive', asn_filename, '-outfmt', outfmt,
            '-out', path.join(output_file_dir, 'test_' + program + ext)
        ]
        if ext == '.html':
            args.append('-html')
        # Formats numbered above 4 are limited with -max_target_seqs; the
        # others are limited with -num_descriptions / -num_alignments.
        if int(outfmt.split()[0]) > 4:
            args.extend(['-max_target_seqs', max_target_seqs])
        else:
            args.extend([
                '-num_descriptions', max_target_seqs, '-num_alignments',
                max_target_seqs
            ])
        args_list.append(args)
    return args_list