Пример #1
0
    def _save_aln_results(self, save_dir=None):
        '''
        Parse the BLAST result and write one alignment file per iteration.

        Every MSA produced by self.parse() is numbered from 1 and written
        with msa.build_psiblast_alignment_input() to a '.aln' profile file
        in the target directory. The written path is stored in
        self.aln_files and the MSA in self.msas, both keyed by that
        1-based number. self.msas is reset on every call.

        save_dir: directory to write into; falls back to self.save_dir.
            When neither is set, the method returns without doing anything.
        '''
        if save_dir is None:
            if self.save_dir is None:
                return
            save_dir = self.save_dir

        blast = self.parse()
        self.msas = {}

        # The base name depends only on self.input, so it is computed once
        # instead of once per iteration.
        _directory, basename, _ext = parse_sequence_filename(self.input)

        for iteration, msa in enumerate(blast, 1):
            profile_name = build_profile_filename(save_dir, basename,
                                                  iteration, '.aln')
            # The context manager closes the file even if the writer raises.
            with open(profile_name, 'w') as profile_fp:
                msa.build_psiblast_alignment_input(profile_fp)

            self.aln_files[iteration] = profile_name
            self.msas[iteration] = msa
Пример #2
0
    def __init__(self, input_file=None, output_file=None):
        '''
        Store the input and output file names.

        input_file: path of the input profile file; required.
        output_file: path of the output file. When omitted, it is rebuilt
            from the components parse_profile_filename() returns for
            input_file, with the extension replaced by '.pnp'.

        Raises TypeError when input_file is not given.
        '''
        if input_file is None:
            raise TypeError("No Input file is given.")

        if output_file is None:
            directory, base_filename, iteration, _ext = (
                parse_profile_filename(input_file))
            output_file = build_profile_filename(directory, base_filename,
                                                 iteration, ".pnp")

        self.input_file = input_file
        self.output_file = output_file
Пример #3
0
    def initial_iteration(self, echo=True):
        '''
        Run the initial BLAST search and write its alignment file.

        BLAST is run on self.processed_input, the last MSA of the parsed
        result is optionally purged (overlapping regions and/or back-blast),
        and the MSA is written as the iteration-1 '.aln' file in
        self.temp_dir. The runner, parser, output path and iteration number
        are stored as the "current" state and the output path is appended
        to self.msa_files.

        echo: when true, print a progress message before purging.

        Returns the (possibly purged) MSA of this iteration.
        '''
        # Run BLAST; raw output goes to "<output_base>.1<output_ext>".
        temp_output = os.path.join(
            self.temp_dir, '%s.1%s' % (self.output_base, self.output_ext))
        runner = BLASTRunner(input=self.processed_input,
                             output=temp_output,
                             **self.kwargs)
        runner.run()

        # Parse the BLAST output and keep the last MSA it contains.
        blast = BLAST.BLAST(self.processed_input, runner.output)
        msa = blast[-1]
        # Flag the MSA so that HSPs are combined for the input alignment.
        msa.set_combine_hsps()

        if self.use_overlapping_purging:
            if echo:
                print('purging overlapping regions...')

            # For the initial iteration the BLOSUM 62 matrix is used as the
            # scoring matrix.
            pssm = ScoreMat()
            pssm.set_blosum_mat()
            if self.number_of_processors > 1:
                msa.purge_overlapping_hsps_multithreading(
                    self.inserted_positions, pssm, self.number_of_processors)
            else:
                msa.purge_overlapping_hsps(self.inserted_positions, pssm)

        if self.use_backblast_purging:
            for neighbor_msa in self.neighboring_msas:
                # NOTE(review): the purger object is never used afterwards;
                # presumably construction itself purges msa - confirm.
                backblastpurger = BackblastPurger(msa, neighbor_msa,
                                                  **self.kwargs)

        msa_output = build_profile_filename(self.temp_dir, self.output_base, 1,
                                            '.aln')
        # The context manager closes the file even if the writer raises.
        with open(msa_output, 'w') as msa_output_fp:
            msa.build_psiblast_alignment_input(msa_output_fp)

        self.current_runner = runner
        self.current_msa_output = msa_output
        self.current_iteration = 1
        self.current_parser = blast

        self.msa_files.append(msa_output)
        return msa
Пример #4
0
    def __init__(self,
                 cmd=None,
                 input_file=None,
                 output_file=None,
                 calibration_db=None,
                 calibrate=True,
                 calibration_cmd=None):
        '''
        Configure the builder and prepare its temporary files.

        cmd: executable to run; defaults to Settings.get("hhmake").
        input_file: input profile file; required.
        output_file: result file; when omitted, it is rebuilt from the
            components parse_profile_filename() returns for input_file,
            with the extension replaced by '.hhm'.
        calibration_db: defaults to Settings.get("hhm_cal_db").
        calibrate: stored as self.calibrate.
        calibration_cmd: defaults to Settings.get("hhsearch_cmd").

        Raises TypeError when input_file is not given.
        '''
        if cmd is None:
            cmd = Settings.get("hhmake")

        if input_file is None:
            raise TypeError("Input_file should be given.")

        if calibration_cmd is None:
            calibration_cmd = Settings.get("hhsearch_cmd")

        if calibration_db is None:
            calibration_db = Settings.get("hhm_cal_db")

        if output_file is None:
            directory, base_filename, iteration, _ext = (
                parse_profile_filename(input_file))
            output_file = build_profile_filename(directory, base_filename,
                                                 iteration, '.hhm')

        self.cmd = cmd
        self.calibration_cmd = calibration_cmd
        self.calibration_db = calibration_db

        self.calibrate = calibrate

        self.input_file = input_file
        self.output_file = output_file

        # Workaround for an hhsearch bug with long input file names: the
        # input is copied to a temporary file, whose name is used instead.
        self.tmpinput = tempfile.NamedTemporaryFile()
        self.tmpinputname = self.tmpinput.name
        try:
            shutil.copy(self.input_file, self.tmpinputname)
        except IOError:
            # The copy failed; fall back to the original input path.
            self.tmpinputname = self.input_file

        # The temporary output needs to be copied to output_file after the
        # execution.
        self.tmpoutput = tempfile.NamedTemporaryFile()
        self.tmpoutputname = self.tmpoutput.name

        self.command_lines = self.get_command_lines()
Пример #5
0
    def next_iteration(self, echo=True):
        '''
        Run one more BLAST iteration seeded with the previous alignment.

        The current runner, alignment path and parser become the "previous"
        ones, self.current_iteration is incremented, and BLAST is run with
        the previous alignment file as input_alignment. The last MSA of the
        parsed result is optionally purged and written as the '.aln' file
        of the new iteration in self.temp_dir; the output path is appended
        to self.msa_files.

        echo: when true, print a progress message before purging.

        Returns the (possibly purged) MSA of this iteration.
        '''
        self.current_iteration += 1
        self.previous_runner = self.current_runner
        self.previous_msa_output = self.current_msa_output
        self.previous_parser = self.current_parser

        i = self.current_iteration
        # '%s' already applies str() to the iteration number.
        temp_output = os.path.join(
            self.temp_dir,
            '%s.%s%s' % (self.output_base, i, self.output_ext))
        runner = BLASTRunner(input=self.processed_input,
                             output=temp_output,
                             input_alignment=self.previous_msa_output,
                             **self.kwargs)
        runner.run()

        blast = BLAST.BLAST(self.processed_input, runner.output)
        msa = blast[-1]
        msa.set_combine_hsps()

        if self.use_overlapping_purging:
            if echo:
                print('purging overlapping regions...')
            # The scoring matrix is built from the last MSA of the previous
            # iteration's parser.
            pssm = ScoreMat()
            pssm.build_pssm(self.previous_parser[-1], **self.kwargs)
            msa.purge_overlapping_hsps(self.inserted_positions, pssm)

        if self.use_backblast_purging:
            for neighbor_msa in self.neighboring_msas:
                # NOTE(review): the purger object is never used afterwards;
                # presumably construction itself purges msa - confirm.
                backblastpurger = BackblastPurger(msa, neighbor_msa)

        # Write the alignment of this iteration.
        msa_output = build_profile_filename(self.temp_dir, self.output_base, i,
                                            '.aln')
        # The context manager closes the file even if the writer raises.
        with open(msa_output, 'w') as msa_output_fp:
            msa.build_psiblast_alignment_input(msa_output_fp)

        # Publish the new state only after the alignment was written.
        self.current_runner = runner
        self.current_msa_output = msa_output
        self.current_parser = blast

        self.msa_files.append(msa_output)

        return msa
Пример #6
0
        def set_result_files( self ) :
                '''
                Rename the a3m files found in save_dir and record them.

                Every file in self.save_dir matching "<basename>*.a3m",
                where basename comes from self.sequence, is moved to the
                name rebuilt from its own directory, iteration and
                extension combined with that basename. The sorted list of
                new paths is stored in self.a3m_files.

                Returns None. When no basename can be derived from
                self.sequence, nothing is done and self.a3m_files is left
                untouched.
                '''
                basename = parse_profile_filename( self.sequence )[1]
                if not basename :
                        return

                pattern = os.path.join( self.save_dir, basename + '*.a3m' )
                new_a3m_files = []
                for a3m in glob.glob( pattern ) :
                        directory, _found_basename, iteration, ext = parse_profile_filename( a3m )
                        newa3m = build_profile_filename( directory, basename, iteration, ext )
                        shutil.move( a3m, newa3m )
                        new_a3m_files.append( newa3m )

                new_a3m_files.sort()
                self.a3m_files = new_a3m_files
Пример #7
0
    def get_command_line(self):
        '''
        Build the argument list for the conversion command.

        Unset attributes are filled in (and stored back on self) first:
        input_type defaults to 'a3m', output_type to 'psi', and output_file
        is rebuilt from the components parse_profile_filename() returns for
        input_file, with '.' + output_type as its extension.

        Returns [cmd, input_type, output_type, input_file, output_file].
        '''
        if self.input_type is None:
            self.input_type = 'a3m'

        if self.output_type is None:
            self.output_type = 'psi'

        if self.output_file is None:
            directory, base_filename, iteration, _ext = (
                parse_profile_filename(self.input_file))
            self.output_file = build_profile_filename(directory,
                                                      base_filename,
                                                      iteration,
                                                      '.' + self.output_type)

        return [
            self.cmd, self.input_type, self.output_type, self.input_file,
            self.output_file
        ]
Пример #8
0
    def get_command_line(self):
        '''
        Build the argument list for the COMPASS builder command.

        When self.output_file is unset, it is rebuilt from the components
        parse_profile_filename() returns for self.input_file, with the
        extension replaced by '.cnp' (the components are printed when the
        module-level verbose flag is set). Each call creates a new named
        temporary file, kept on self, that is passed as the '-o' target,
        and obtains the '-i' list file from self.prepare_list_file().

        Returns [cmd, '-i', list_file, '-o', temporary_output_file].
        '''
        if self.output_file is None:
            directory, base_filename, iteration, ext = (
                parse_profile_filename(self.input_file))

            if verbose:
                print("COMPASS Builder file analysis")
                print('input_file', self.input_file)
                print("dir:", directory)
                print("base_filename:", base_filename)
                # "ieration" typo kept so the emitted text is unchanged.
                print("ieration:", iteration)
                print("ext:", ext)

            self.output_file = build_profile_filename(directory,
                                                      base_filename,
                                                      iteration, '.cnp')

        # The file object is kept on self so the temporary file stays alive.
        self.temporary_output_fp = tempfile.NamedTemporaryFile()
        self.temporary_output_file = self.temporary_output_fp.name
        list_fn = self.prepare_list_file()
        return [
            self.cmd, '-i', list_fn, '-o', self.temporary_output_file
        ]
Пример #9
0
    def build_file(self, dominfo):
        '''
        Write the profile search job script for one domain.

        The script is "prfs<uniqueid>.job" in self.job_dir and holds a
        single command line: self.cmd (taken from the 'profile_searcher'
        setting when unset) followed by the search options, the query
        profile path and the output path. The query profile uses the
        smaller of self.iteration and the value returned by
        check_profile_integrity(dominfo).

        dominfo: mapping that provides at least 'uniqueid' and
            'domain_path'.

        Returns the path of the written job script.
        Raises JobScriptWriteError when the job script already exists.
        '''
        from evdblib.DBTools import Settings
        filename = os.path.join(self.job_dir,
                                'prfs%s.job' % dominfo['uniqueid'])

        if not self.cmd:
            self.cmd = Settings.get('profile_searcher')

        domain_path = dominfo['domain_path']
        query_iteration = min(self.iteration, check_profile_integrity(dominfo))

        # Explicit mapping instead of locals(): the values fed into the
        # command line are visible here and renaming a local cannot
        # silently break the format string.
        options = {
            'profile_search_queryid': dominfo['uniqueid'],
            'iteration': self.iteration,
            'profile_search_method': self.alignment_method,
            'profile_search_db_size': self.search_db_size,
            'profile_search_db': self.search_db,
            'profile_search_query': build_profile_filename(
                domain_path, dominfo['uniqueid'] + '.prof', query_iteration,
                self.profile_extention),
            'profile_search_output': build_sequence_filename(
                domain_path, dominfo['uniqueid'], self.alignment_extention),
        }

        command = self.cmd
        command += ' -q %(profile_search_queryid)s -j %(iteration)s -m %(profile_search_method)s -u -s %(profile_search_db_size)s -d %(profile_search_db)s %(profile_search_query)s %(profile_search_output)s' % options

        if os.path.exists(filename):
            raise JobScriptWriteError(
                "Profile search job script %s already exists!" % filename)

        # The context manager closes the file even if the write fails.
        with open(filename, 'w') as fp:
            print(command, file=fp)

        return filename
Пример #10
0
def _prepare_compass_search_db(db_filename, domain_informations, iteration,
                               use_between, selected_iterations):
    '''
    Build the COMPASS search database by concatenating profile files.

    For every domain, the COMPASS profile of `iteration` is appended to
    db_filename. When that file is missing, the iteration returned by
    check_profile_integrity() is used instead; with use_between set, the
    domain is skipped unless that iteration is greater than the selected
    iteration preceding `iteration`. The sum of the numbers read from the
    per-profile ".len" files is written to db_filename + ".len".

    db_filename: path of the database file to (over)write.
    domain_informations: iterable of mappings with 'domain_path' and
        'uniqueid' entries; entries with an empty domain_path are skipped.
    iteration: requested iteration; must be in selected_iterations.
    use_between: whether a fallback iteration must be newer than the
        previously selected iteration.
    selected_iterations: sequence of the selected iterations.

    Returns the number of profiles written to the database.
    Raises SearchDatabasePreparationError when the chosen profile file
    does not exist, and ValueError when iteration is not in
    selected_iterations.
    '''
    # Resolved before any file is opened, so an invalid argument does not
    # truncate an existing database.
    ext = Settings.get('compass_suffix')
    previous_iteration = selected_iterations[max(
        selected_iterations.index(iteration) - 1, 0)]

    compass_db_size = 0
    number_of_records = 0

    # Both outputs are closed even when an error aborts the loop.
    with open(db_filename, 'w') as db_fp, \
            open(db_filename + ".len", 'w') as db_size_fp:
        for dominfo in domain_informations:

            domain_path = dominfo['domain_path']
            if not domain_path:
                if verbose:
                    print("WARNING: Dominfo does not have domain_path...")
                    print(dominfo)
                continue

            domid = dominfo['uniqueid']
            compass_file = build_profile_filename(
                domain_path, domid + '.prof', iteration, ext)

            if not os.path.exists(compass_file):
                # The requested iteration is missing: fall back to the last
                # available one.
                last_available_iteration = check_profile_integrity(dominfo)

                if use_between:
                    if not last_available_iteration:
                        print("WARNING: Profile is bad!", domain_path, domid)
                        continue

                    if not last_available_iteration > previous_iteration:
                        if verbose:
                            print("No between iteration available!",
                                  iteration, last_available_iteration)
                        continue

                compass_file = build_profile_filename(
                    domain_path, domid + '.prof', last_available_iteration,
                    ext)

            # Final check: the chosen profile has to exist.
            if not os.path.exists(compass_file):
                print(
                    "WARNING: COMPASS file should be available but not found!",
                    compass_file,
                    file=sys.stderr)
                raise SearchDatabasePreparationError(
                    "COMPASS numerical profile file is not availble!",
                    compass_file)

            with open(compass_file) as fp:
                db_fp.write(fp.read())
            number_of_records += 1

            # A missing or unreadable size file is only warned about; the
            # profile itself has already been added.
            compass_size_file = compass_file + ".len"
            try:
                with open(compass_size_file) as size_fp:
                    compass_db_size += int(size_fp.read().strip())
            except (IOError, OSError, ValueError):
                print("WARNING: Cannot read compass profile size file.",
                      compass_size_file)

        print(compass_db_size, file=db_size_fp)

    return number_of_records
Пример #11
0
def _prepare_hhsearch_search_db(db_filename, domain_informations, iteration,
                                use_between, selected_iterations):
    '''
    Build the HHsearch database by concatenating HMM profile files.

    For every domain, the HHsearch profile of `iteration` is appended to
    db_filename. When that file is missing, the iteration returned by
    check_profile_integrity() is used instead; with use_between set, the
    domain is skipped unless that iteration is greater than the selected
    iteration preceding `iteration`.

    db_filename: path of the database file to (over)write.
    domain_informations: iterable of mappings with 'domain_path' and
        'uniqueid' entries; entries with an empty domain_path are skipped.
    iteration: requested iteration; must be in selected_iterations.
    use_between: whether a fallback iteration must be newer than the
        previously selected iteration.
    selected_iterations: sequence of the selected iterations.

    Returns the number of records saved in the database file.
    Raises ValueError when a dominfo has no 'domain_path' key or when
    iteration is not in selected_iterations, and
    SearchDatabasePreparationError when the chosen profile file does not
    exist.
    '''
    # Resolved before the file is opened, so an invalid argument does not
    # truncate an existing database.
    ext = Settings.get('hhm_suffix')

    # Previously selected iteration; a fallback profile has to be newer.
    previous_iteration = selected_iterations[max(
        selected_iterations.index(iteration) - 1, 0)]

    number_of_records = 0

    # The database file is closed even when an error aborts the loop.
    with open(db_filename, 'w') as db_fp:
        for dominfo in domain_informations:

            if 'domain_path' not in dominfo:
                raise ValueError('domain_path does not exists', dominfo)
            domain_path = dominfo['domain_path']

            if not domain_path:
                if verbose:
                    print("WARNING: Dominfo does not have domain_path...")
                    print(dominfo)
                continue

            domid = dominfo['uniqueid']
            hhsearch_file = build_profile_filename(
                domain_path, domid + '.prof', iteration, ext)

            if not os.path.exists(hhsearch_file):
                # The requested iteration is missing: fall back to the last
                # available one.
                last_available_iteration = check_profile_integrity(dominfo)

                if use_between:
                    if not last_available_iteration:
                        print("WARNING: Profile is bad!", domain_path, domid)
                        continue

                    if not last_available_iteration > previous_iteration:
                        if verbose:
                            print("WARNING: No between iteration available!",
                                  iteration, last_available_iteration)
                        continue

                hhsearch_file = build_profile_filename(
                    domain_path, domid + '.prof', last_available_iteration,
                    ext)

            # Final check: the chosen profile has to exist.
            if not os.path.exists(hhsearch_file):
                print(
                    "Error: HHsearch HMM file should be available but not found!",
                    hhsearch_file,
                    file=sys.stderr)
                raise SearchDatabasePreparationError(
                    "HHsearch HHM file is not availble!", hhsearch_file)

            with open(hhsearch_file) as fp:
                db_fp.write(fp.read())
            number_of_records += 1

    return number_of_records