Example #1
    def __init__(self,
                 cmd=None,
                 dominfo=None,
                 save_dir=None,
                 sequence=None,
                 range=None,
                 max_iterations=None,
                 selected_iterations=None,
                 max_rerun=3,
                 stdout=PIPE,
                 stderr=PIPE,
                 timeout=0):
        if cmd == None:
            cmd = Settings.get("buildali")

        self.cmd = cmd

        #getting information out of dominfo
        if dominfo:
            self.save_dir = dominfo['domain_path']
            self.sequence = dominfo['profile_sequence_file']
            self.range = dominfo['profile_sequence_range']
        elif save_dir and sequence:
            self.save_dir = save_dir
            if not os.path.exists(self.save_dir):
                os.makedirs(self.save_dir)
            self.sequence = sequence
        else:
            raise ProfileBuildingError("DomainInformation should be given.")

        if not os.path.exists(self.sequence):
            raise ProfileBuildingError("Sequence file %s cannot be found!" %
                                       self.sequence)

        #getting profile iteration information from settings
        if max_iterations == None:
            max_iterations = int(Settings.get('max_iterations'))
        self.max_iterations = max_iterations

        if selected_iterations == None:
            selected_iterations = [
                int(i) for i in Settings.get('selected_iterations').split()
            ]
        self.selected_iterations = selected_iterations

        if not self.save_dir:
            raise ProfileBuildingError("Saving directory should be set.")

        self.range = range
        if self.range:
            self.prepare_fasta(range=self.range)

        self.max_rerun = max_rerun

        #getting command line arguments and setting timedrunner
        self.command_line = self.get_command_line()
        #Runner.TimedRunner.__init__( self, self.command_line, **kwargs )
        self.timeout = timeout
        self.stdout = stdout
        self.stderr = stderr
Example #2
    def __init__(self,
                 alignment_method='',
                 search_db='',
                 search_db_size=None,
                 **kwargs):
        from evdblib.DBTools import Settings
        self.alignment_method = alignment_method
        self.search_db = search_db
        self.search_db_size = search_db_size
        if self.search_db_size == None:
            raise ValueError("The search DB size is required!")

        if alignment_method == 'DaliLite':
            self.data_file_extention = Settings.get("dali_data_suffix")
        elif alignment_method == 'FAST' or alignment_method == 'TMalign':
            self.data_file_extention = Settings.get(
                "processed_ca_structure_suffix")
        else:
            if alignment_method in Settings.get(
                    "structure_comparison_methods").split():
                raise ValueError("The method is not implemented!",
                                 alignment_method)
            else:
                raise ValueError("Unknown structure comparison method!",
                                 alignment_method)

        self.alignment_extention = Settings.get("alignment_suffix")

        JobScriptBuilder.__init__(self, **kwargs)
Example #3
    def __init__(self,
                 alignment_method='',
                 search_db='',
                 search_db_size=None,
                 iteration='',
                 **kwargs):
        from evdblib.DBTools import Settings
        self.alignment_method = alignment_method
        self.search_db = search_db
        self.search_db_size = search_db_size
        self.iteration = int(iteration)

        if self.search_db_size == None:
            raise ValueError("The search DB size is required!")

        if alignment_method == 'COMPASS':
            self.profile_extention = Settings.get("compass_suffix")
        elif alignment_method == 'HHsearch':
            self.profile_extention = Settings.get("hhm_suffix")
        else:
            if alignment_method in Settings.get("profile_comparison_methods"):
                raise ValueError("The method is not implemented!")
            else:
                raise ValueError("Unknown Profile comparison method!")

        self.alignment_extention = Settings.get("alignment_suffix")

        JobScriptBuilder.__init__(self, **kwargs)
Example #4
    def build_file(self, dominfo):
        filename = os.path.join(self.job_dir,
                                'prfb%s.job' % dominfo['uniqueid'])

        if not self.cmd:
            self.cmd = Settings.get('profile_builder')

        command = self.cmd
        if dominfo.get('profile_sequence_range'):
            command += ' -r ' + str(dominfo['profile_sequence_range'])

        if Settings.get('profile_type'):
            command += ' -m %s ' % Settings.get('profile_type')

        if Settings.get('blast_db'):
            command += ' -d %s ' % Settings.get('blast_db')

        command += ' %(profile_sequence_file)s %(domain_path)s' % dominfo

        if os.path.exists(filename):
            raise JobScriptWriteError("Profile job script %s already exists!" %
                                      filename)

        fp = open(filename, 'w')
        print(command, file=fp)
        fp.close()

        dominfo['progress'] = 2

        return filename
Example #5
    def __init__(self,
                 cmd=None,
                 input_file=None,
                 output_file=None,
                 calibration_db=None,
                 calibrate=True,
                 calibration_cmd=None):

        if cmd == None:
            cmd = Settings.get("hhmake")

        if input_file == None:
            raise TypeError("Input_file should be given.")

        if calibration_cmd == None:
            calibration_cmd = Settings.get("hhsearch_cmd")

        if calibration_db == None:
            calibration_db = Settings.get("hhm_cal_db")

        if output_file == None:
            dir, base_filename, iteration, ext = parse_profile_filename(
                input_file)
            output_file = build_profile_filename(dir, base_filename, iteration,
                                                 '.hhm')

        self.cmd = cmd
        self.calibration_cmd = calibration_cmd
        self.calibration_db = calibration_db

        self.calibrate = calibrate

        self.input_file = input_file
        self.output_file = output_file

        #the following part is added due to an hhsearch bug:
        #it cannot handle long input file names!
        self.tmpinput = tempfile.NamedTemporaryFile()
        self.tmpinputname = self.tmpinput.name
        try:
            shutil.copy(self.input_file, self.tmpinputname)
        except IOError:
            self.tmpinputname = self.input_file

        self.tmpoutput = tempfile.NamedTemporaryFile()
        self.tmpoutputname = self.tmpoutput.name
        #need to be copied after the execution!

        self.command_lines = self.get_command_lines()
Example #6
    def __init__(self,
                 cmd=None,
                 inputfile=None,
                 outputfile=None,
                 dbfile=None,
                 cpu=1,
                 db_size=None):
        if cmd == None:
            cmd = Settings.get('hhsearch_cmd')

        if inputfile == None:
            raise AlignerError("No input file is given.")

        if outputfile == None:
            self.outputfile_fp = tempfile.NamedTemporaryFile()
            outputfile = self.outputfile_fp.name

        if dbfile == None:
            raise AlignerError("No DB file is given.")

        self.db_size = db_size

        DBAligner.__init__(self,
                           cmd=cmd,
                           inputfile=inputfile,
                           outputfile=outputfile,
                           dbfile=dbfile)
        #setting number of CPU's can be used.
        self.cpu = cpu
Example #7
    def build_file(self, dominfo):
        from evdblib.DBTools import Settings
        filename = os.path.join(self.job_dir,
                                'strs%s.job' % dominfo['uniqueid'])

        if not self.cmd:
            self.cmd = Settings.get('structure_searcher')

        command = self.cmd
        domain_path = dominfo['domain_path']
        search_method = self.alignment_method
        search_queryid = dominfo['uniqueid']

        search_query = build_sequence_filename(domain_path,
                                               dominfo['uniqueid'],
                                               self.data_file_extention)
        search_db = self.search_db
        search_db_size = self.search_db_size

        search_output = build_sequence_filename(domain_path,
                                                dominfo['uniqueid'],
                                                self.alignment_extention)

        command += ' -q %(search_queryid)s -m %(search_method)s -u -s %(search_db_size)s -d %(search_db)s %(search_query)s %(search_output)s' % locals(
        )

        if os.path.exists(filename):
            raise JobScriptWriteError(
                "Structure search job script %s already exists!" % filename)

        fp = open(filename, 'w')
        print(command, file=fp)
        fp.close()

        return filename
Example #8
    def get_command_line(self):
        #need to check self.cmd here to avoid writing
        #another version of __init__
        #probably not a good practice.

        if self.cmd == None:
            self.cmd = Settings.get("tmalign")

        return [self.cmd, self.inputfile2, self.inputfile1]
Example #9
    def __init__(self,
                 cmd=None,
                 save_dir=None,
                 sequence=None,
                 range=None,
                 max_iterations=1,
                 save_all_iteration_results=False,
                 msa_input_fn=None,
                 max_rerun=3,
                 stdout=PIPE,
                 stderr=PIPE,
                 timeout=0,
                 number_of_processors=1):
        '''
        sequence: fasta file for the query sequence
        range:
        '''

        self.temp_dir = None  #initialized up front
        if save_dir == None:
            self.save_dir = tempfile.mkdtemp()
            self.temp_dir = self.save_dir
        else:
            self.save_dir = save_dir

        if cmd == None:
            cmd = Settings.get("buildali")

        self.cmd = cmd
        self.number_of_processors = number_of_processors

        self.sequence = sequence
        self.msa_input_fn = msa_input_fn
        if msa_input_fn:
            self.sequence = msa_input_fn
        self.range = range

        print(self.sequence, self.msa_input_fn)

        if not (self.sequence or self.msa_input_fn):
            raise BuildAliRunnerError(
                "Query sequence or MSA is necessary to build profile.")

        if not ((self.sequence and os.path.exists(self.sequence)) or
                (self.msa_input_fn and os.path.exists(self.msa_input_fn))):
            raise ProfileBuildingError(
                "Query sequence file %s cannot be found!" % self.sequence)

        self.max_rerun = max_rerun
        self.save_all_iteration_results = save_all_iteration_results

        #buildali options
        #getting profile iteration information from settings
        self.max_iterations = max_iterations
        self.msa_input_fn = msa_input_fn

        #getting command line arguments and setting timedrunner
        self.command_line = self.get_command_line()
        #Runner.TimedRunner.__init__( self, self.command_line, **kwargs )
        self.timeout = timeout
        self.stdout = stdout
        self.stderr = stderr
Example #10
def _check_profile_alignment(dominfo, filtered_ids):

    if dominfo.get('profile_alignment_integrity'):
        return dominfo['profile_alignment_integrity']

    domid = dominfo['uniqueid']
    ext = Settings.get("alignment_suffix")
    aln_fn = build_sequence_filename(dominfo['domain_path'], domid, ext)

    if os.path.exists(aln_fn):
        alignments = PairwiseAlignmentRecords()
        alignments.parse(aln_fn)

        for alignment_method in Settings.get(
                "profile_comparison_methods").split():
            method_name = alignment_method.lower() + "_1"
            if not alignments.count(domid, filtered_ids,
                                    method_name) == len(filtered_ids):
                return 0
        else:
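            #for/else: reached only when the loop finishes without hitting the
            #early "return 0" above, i.e. every comparison method had
            #alignments for all of the filtered ids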
            return 1
    return 0
Example #11
    def __init__(self, cmd=None, input_file=None, output_file=None):

        if cmd == None:
            cmd = Settings.get("mk_compass_db")

        if input_file == None:
            raise TypeError("Input_file should be given.")

        self.input_file = input_file
        self.output_file = output_file
        #self.input_type = input_type
        #self.output_type = output_type
        self.cmd = cmd

        self.command_line = self.get_command_line()
Example #12
    def submit(self):
        '''
		Submit a job script to the queue and get the SGE Queue Job ID.
		'''
        command_line = [self.submit_cmd]
        if self.name:
            command_line.append('-N')
            command_line.append(self.name)

        if Settings.get('sge_job_queue'):
            command_line.append('-q')
            command_line.append(Settings.get('sge_job_queue'))

        command_line.append("-cwd")  #run in the current directory
        command_line.append(self.script_file)

        #sorting out submit directory issue
        #if the submission directory is not the same directory
        #of the script.
        if self.use_script_dir:
            script_dir = os.path.dirname(self.script_file)
        else:
            script_dir = None

        output = Popen(command_line, stdout=PIPE,
                       cwd=script_dir).communicate()[0]

        #saving previous job ids.
        if self.job_id:
            self.old_job_ids.append(self.job_id)

        try:
            self.job_id = output.split()[2]
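            #assumption: SGE's qsub typically prints a line like
            #  Your job 12345 ("script.job") has been submitted
            #so the third whitespace-separated token is the numeric job id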
        except IndexError:
            raise SunGridEngineError("Submission %s failed!" %
                                     self.script_file)
Example #13
    def __init__(self, cmd=None, inputfile=None, outputfile=None, dbfile=None):
        if cmd == None:
            cmd = Settings.get('compass_cmd')

        if inputfile == None:
            raise AlignerError("No input file is given.")

        if outputfile == None:
            self.outputfile_fp = tempfile.NamedTemporaryFile()
            outputfile = self.outputfile_fp.name

        if dbfile == None:
            raise AlignerError("No DB file is given.")

        DBAligner.__init__(self,
                           cmd=cmd,
                           inputfile=inputfile,
                           outputfile=outputfile,
                           dbfile=dbfile)
Example #14
    def __init__(self,
                 cmd=None,
                 input_file=None,
                 output_file=None,
                 input_type=None,
                 output_type=None,
                 remove_query_gap=True):

        if cmd == None:
            cmd = Settings.get("reformat")

        self.input_file = input_file
        self.output_file = output_file
        self.input_type = input_type
        self.output_type = output_type

        self.cmd = cmd

        self.command_line = self.get_command_line()
        self.remove_query_gap = remove_query_gap
Example #15
    def build_file(self, dominfo):
        from evdblib.DBTools import Settings
        filename = os.path.join(self.job_dir,
                                'prfs%s.job' % dominfo['uniqueid'])

        if not self.cmd:
            self.cmd = Settings.get('profile_searcher')

        command = self.cmd
        domain_path = dominfo['domain_path']
        profile_search_method = self.alignment_method
        profile_search_queryid = dominfo['uniqueid']
        iteration = self.iteration

        query_iteration = min(self.iteration, check_profile_integrity(dominfo))

        profile_search_query = build_profile_filename(
            domain_path, dominfo['uniqueid'] + '.prof', query_iteration,
            self.profile_extention)
        profile_search_db = self.search_db
        profile_search_db_size = self.search_db_size

        profile_search_output = build_sequence_filename(
            domain_path, dominfo['uniqueid'], self.alignment_extention)

        command += ' -q %(profile_search_queryid)s -j %(iteration)s -m %(profile_search_method)s -u -s %(profile_search_db_size)s -d %(profile_search_db)s %(profile_search_query)s %(profile_search_output)s' % locals(
        )

        if os.path.exists(filename):
            raise JobScriptWriteError(
                "Profile search job script %s already exists!" % filename)

        fp = open(filename, 'w')
        print(command, file=fp)
        fp.close()

        return filename
Example #16
def _prepare_compass_search_db(db_filename, domain_informations, iteration,
                               use_between, selected_iterations):
    '''
	prepare compass DB.
	'''

    db_fp = open(db_filename, 'w')
    ext = Settings.get('compass_suffix')

    db_size_fp = open(db_filename + ".len", 'w')  #need to be built
    compass_db_size = 0

    previous_iteration = selected_iterations[max(
        selected_iterations.index(iteration) - 1, 0)]

    number_of_records = 0
    for dominfo in domain_informations:

        #read domain path
        domain_path = dominfo['domain_path']
        if not domain_path:
            if verbose:
                print("WARNING: Dominfo does not have domain_path...")
                print(dominfo)
            continue

        domid = dominfo['uniqueid']
        compass_file = build_profile_filename(domain_path, domid + '.prof',
                                              iteration, ext)

        if not os.path.exists(compass_file) and use_between:
            #in case the COMPASS file of the iteration
            #does not exist
            #and the use_between flag is On...
            #find the last available iteration
            last_available_iteration = check_profile_integrity(dominfo)
            if not last_available_iteration:
                print("WARNING: Profile is bad!", domain_path, domid)
                continue

            if last_available_iteration > previous_iteration:
                compass_file = build_profile_filename(
                    domain_path, domid + '.prof', last_available_iteration,
                    ext)
            else:
                if verbose:
                    print("No between iteration available!", iteration,
                          last_available_iteration)
                continue

        elif not os.path.exists(compass_file) and not use_between:
            #when the COMPASS file is not available and the use_between flag is off.
            last_available_iteration = check_profile_integrity(dominfo)
            compass_file = build_profile_filename(domain_path, domid + '.prof',
                                                  last_available_iteration,
                                                  ext)

        #final check!
        if not os.path.exists(compass_file):
            #error!
            print("WARNING: COMPASS file should be available but not found!",
                  compass_file,
                  file=sys.stderr)
            raise SearchDatabasePreparationError(
                "COMPASS numerical profile file is not available!",
                compass_file)

        fp = open(compass_file)
        content = fp.read()
        fp.close()

        db_fp.write(content)
        number_of_records += 1

        compass_size_file = compass_file + ".len"
        try:
            fp = open(compass_size_file)
            compass_db_size += int(fp.read().strip())
            fp.close()
        except:
            print("WARNING: Cannot read compass profile size file.",
                  compass_size_file)

    db_fp.close()
    print(compass_db_size, file=db_size_fp)
    db_size_fp.close()

    return number_of_records
Example #17
def _prepare_hhsearch_search_db(db_filename, domain_informations, iteration,
                                use_between, selected_iterations):
    '''
	prepare HHsearch DB.
	and returns the number of records saved in the database file.
	'''
    db_fp = open(db_filename, 'w')
    ext = Settings.get('hhm_suffix')

    #getting the previous selected iteration,
    #used to require an iteration larger than the previous one.
    previous_iteration = selected_iterations[max(
        selected_iterations.index(iteration) - 1, 0)]
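    #e.g. iteration=3 with selected_iterations=[1, 3, 5, 8] gives previous_iteration=1;
    #max(..., 0) keeps the index valid for the first selected iteration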

    number_of_records = 0
    for dominfo in domain_informations:

        #read domain path
        if 'domain_path' in dominfo:
            domain_path = dominfo['domain_path']
        else:
            raise ValueError('domain_path does not exist', dominfo)

        if not domain_path:
            if verbose:
                print("WARNING: Dominfo does not have domain_path...")
                print(dominfo)
            continue

        domid = dominfo['uniqueid']
        hhsearch_file = build_profile_filename(domain_path, domid + '.prof',
                                               iteration, ext)

        if not os.path.exists(hhsearch_file) and use_between:
            #in case the hhsearch file of the iteration
            #does not exist
            #and the use_between flag is On...
            #a truthy return means the profile generation is good
            #and the value is the last available iteration!
            last_available_iteration = check_profile_integrity(dominfo)
            if not last_available_iteration:
                print("WARNING: Profile is bad!", domain_path, domid)
                continue

            if last_available_iteration > previous_iteration:
                hhsearch_file = build_profile_filename(
                    domain_path, domid + '.prof', last_available_iteration,
                    ext)
            else:
                if verbose:
                    print("WARNING: No between iteration available!",
                          iteration, last_available_iteration)
                continue

        elif not os.path.exists(hhsearch_file) and not use_between:
            #when the hhsearch file is not available and the use_between flag is off.
            last_available_iteration = check_profile_integrity(dominfo)
            hhsearch_file = build_profile_filename(domain_path,
                                                   domid + '.prof',
                                                   last_available_iteration,
                                                   ext)

        #final check!
        if not os.path.exists(hhsearch_file):
            #error!
            print(
                "Error: HHsearch HMM file should be available but not found!",
                hhsearch_file,
                file=sys.stderr)
            raise SearchDatabasePreparationError(
                "HHsearch HHM file is not availble!", hhsearch_file)

        fp = open(hhsearch_file)
        content = fp.read()
        fp.close()

        db_fp.write(content)
        number_of_records += 1

    db_fp.close()
    return number_of_records
Example #18
def prepare_structure_search_database(method,
                                      domain_informations=None,
                                      prefix='',
                                      dir=None,
                                      strict=False,
                                      compute_node_dir=None):
    '''
	Prepares a local condensed search database directory
	for convenient and fast searching.

	Method should be one of the search methods defined
	in the database configuration.

	Currently "DaliLite", "FAST" and "TMalign"
	can be used as a method keyword.

	An optional domain_informations list can be given to
	build a local db for a subset of the current content in the database.
	'''

    ##############################
    #preparation for options
    ##############################
    if domain_informations == None:
        domain_informations = DomainInformation.get_all_records()
    else:
        pass

    if strict and domain_informations == None:
        raise TypeError("Domain information fetch failed.")

    #specific directory to keep all data files in one place.
    if dir == None:
        dir = Settings.get("local_db_space")

    local_db_root = os.path.join(dir, prefix)

    if not os.path.exists(local_db_root):
        os.makedirs(local_db_root)

    #unlike the profile search case,
    #the main db path is a directory name,
    #not a filename,
    #since all of the methods are essentially pairwise
    db_dir = os.path.join(local_db_root, '.'.join([prefix, method]))
    if os.path.exists(db_dir):
        shutil.rmtree(db_dir)

    if not os.path.exists(db_dir):
        os.makedirs(db_dir)

    ##############################
    #actual db building code
    ##############################
    if method == 'DaliLite':
        number_of_records = _prepare_dalilite_search_db(
            db_dir, domain_informations)
    elif method == 'FAST':
        number_of_records = _prepare_fast_search_db(db_dir,
                                                    domain_informations)
    elif method == 'TMalign':
        number_of_records = _prepare_tmalign_search_db(db_dir,
                                                       domain_informations)
    else:
        raise TypeError(
            "Structure search database method should be DaliLite, FAST or TMalign.",
            method)

    ################################
    #copying LOCAL_SCRATCH if it is set in the db.config.
    if compute_node_dir == None:
        compute_node_db_dir = Settings.get("compute_node_db_space")
    else:
        compute_node_db_dir = compute_node_dir

    if compute_node_db_dir:
        compute_node_db_root = os.path.join(compute_node_db_dir, prefix)
        db_dir = _prepare_compute_node_db(db_dir, compute_node_db_root)

    return db_dir, number_of_records
Example #19
def prepare_input_sequence_for_profile_building(dominfo):
    '''
	build a sequence file for profile construction and save the information into
	dominfo.

	New items "profile_sequence_file" and "profile_sequence_range" will
	be added into the dominfo dictionary.

	Note that the profile sequence file and range will be the same as the input file
	if the db type is sequence. For structures, the profile sequence file will be the
	biologically relevant region defined by DBREF in the PDB.
	'''

    data_type = Settings.get("data_type")
    reference_sequence_suffix = Settings.get("reference_sequence_suffix")
    profile_sequence_suffix = Settings.get("profile_sequence_suffix")

    profile_sequence_file = os.path.join(
        dominfo['domain_path'], dominfo['uniqueid']) + profile_sequence_suffix
    #reference_sequence_file = os.path.join( dominfo['domain_path'], dominfo['uniqueid'] ) + profile_sequence_suffix

    if data_type == 'sequence':
        if dominfo['original_input_path']:
            try:
                shutil.copyfile(dominfo['original_input_path'],
                                profile_sequence_file)
                dominfo['profile_sequence_file'] = profile_sequence_file
                dominfo['profile_sequence_range'] = dominfo['range']

            except IOError:
                dominfo['profile_sequence_file'] = ''
                dominfo['profile_sequence_range'] = ''
                print("WARNING: Profile sequence file cannot be written.",
                      profile_sequence_file,
                      file=sys.stderr)
        else:
            dominfo[
                'profile_sequence_file'] = ''  #dominfo[ 'original_input_path' ]
            dominfo['profile_sequence_range'] = ''  #dominfo[ 'range' ]

    elif data_type == 'structure' and dominfo[
            'original_input_path'] and dominfo['domain_path']:
        pdb = PDB.parse(dominfo['original_input_path'])
        pdbrange = PDBRange()
        pdbrange.parse(dominfo['range'])

        chainrange = PDBRange()
        chainrange.parse(','.join(
            [cid + ':' for cid in pdbrange.get_unique_chain_ids()]))

        #Full SEQRES sequence
        profile_sequence = pdb.extract_sequence(chainrange,
                                                biological=False,
                                                standard_residue_name=False,
                                                atomrecord=True,
                                                backbone=False)[0]
        #convert the pdbrange into the sequence range matching the given
        #set of the residue indications.
        profile_sequence_range = pdb.pdbrange2sequencerange(
            pdbrange,
            biological=False,
            standard_residue_name=True,
            atomrecord=True,
            backbone=False)
        ###########################

        #print "*"*10, "dominfo"
        #print dominfo
        #print profile_sequence_range
        #for i,contig in enumerate(profile_sequence_range) :
        #print 'contig:%d'%i, contig.get_start(), contig.get_end()

        header = '>%s' % (dominfo['uniqueid'])

        try:
            if os.path.exists(profile_sequence_file):
                raise IOError

            fp = open(profile_sequence_file, 'w')
            print(header, file=fp)
            print(profile_sequence, file=fp)
            fp.close()

        except IOError:
            dominfo['profile_sequence_file'] = ''
            dominfo['profile_sequence_range'] = ''
            print("WARNING: Profile sequence file cannot be written.",
                  profile_sequence_file,
                  file=sys.stderr)

        else:
            dominfo['profile_sequence_file'] = profile_sequence_file
            dominfo['profile_sequence_range'] = str(profile_sequence_range)
    else:
        print("Error!", file=sys.stderr)
        print(dominfo, file=sys.stderr)
        raise Exception("Cannot build profile sequence file.")
Example #20
def prepare_profile_search_database(method,
                                    domain_informations=None,
                                    iteration=None,
                                    prefix='',
                                    dir=None,
                                    use_between=None,
                                    selected_iterations=None,
                                    strict=False):
    '''
	Prepares a local condensed search database file
	for convenient and fast searching.

	Method should be one of the search methods defined
	in the database configuration.

	Currently "COMPASS" and "HHsearch"
	can be used as a method keyword.

	An optional domain_informations list can be given to
	build a local db for a subset of the current content in the database.

	This function returns the filename of the composite local db
	and the number of records in the database.
	The number of records in the db is helpful especially for running
	HHsearch to print out all the available alignments.

	Note that setting the use_between option to True makes
	the local db building procedure use iterations in between the selected ones
	if the specified "iteration" does not exist.
	e.g. for iteration=3 and selected_iterations = [1,3,5,8],
	with a domain that converged at iteration 2,
	iteration 2 will be used if the "use_between" option is true.

	By default, the following options use values specified in the config file:
		use_between
		selected_iterations
		domain_informations
	'''
    ##############################
    #preparation for options
    ##############################
    if domain_informations == None:
        domain_informations = DomainInformation.get_all_records()

    if strict and domain_informations == None:
        raise TypeError("Domain information fetch failed.")

    if iteration == None:
        raise TypeError("Integer value of iteration should be given.")

    try:
        iteration = int(iteration)
    except ValueError:
        raise ValueError(
            "Iteration should be an integer or convertible to an integer value."
        )

    if dir == None:
        dir = Settings.get("local_db_space")

    local_db_root = os.path.join(dir, prefix)

    if not os.path.exists(local_db_root):
        os.makedirs(local_db_root)

    if use_between == None:
        use_between_string = Settings.get("use_between_selected_iterations")

        #print >>sys.stderr, 'use_between_string', use_between_string

        if use_between_string in ['true', 'True']:
            use_between = True
        elif use_between_string in ['false', 'False']:
            use_between = False
        else:
            raise ValueError(
                "database configuration of use_between_selected_iterations is wrong: %s"
                % use_between_string)

    if selected_iterations == None:
        selected_iterations = [
            int(i) for i in Settings.get("selected_iterations").split()
        ]

    db_filename = os.path.join(local_db_root,
                               '.'.join([prefix, method,
                                         str(iteration)]))
    ##############################
    #actual db building code
    ##############################
    if method == 'HHsearch':
        number_of_records = _prepare_hhsearch_search_db(
            db_filename, domain_informations, iteration, use_between,
            selected_iterations)
    elif method == 'COMPASS':
        number_of_records = _prepare_compass_search_db(db_filename,
                                                       domain_informations,
                                                       iteration, use_between,
                                                       selected_iterations)
    else:
        raise TypeError(
            "Profile search database method should be HHsearch or COMPASS.")

    return db_filename, number_of_records
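
A minimal sketch (not part of the library) of the iteration-selection rule described in the docstring above; the helper name and the last_available_iteration argument (assumed to come from check_profile_integrity, as in Examples #16 and #17) are hypothetical:

def _select_iteration_sketch(iteration, last_available_iteration,
                             selected_iterations, use_between):
    #hypothetical helper mirroring the fallback logic of the
    #_prepare_*_search_db functions shown in Examples #16 and #17
    if last_available_iteration >= iteration:
        return iteration  #the requested iteration is available
    if not use_between:
        return last_available_iteration  #fall back to the last completed iteration
    previous_iteration = selected_iterations[max(
        selected_iterations.index(iteration) - 1, 0)]
    if last_available_iteration > previous_iteration:
        #e.g. iteration=3, selected_iterations=[1, 3, 5, 8], converged at 2 -> use 2
        return last_available_iteration
    return None  #nothing between the previous selected iteration and this one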
Example #21
    def __init__(self,
                 cmd=None,
                 inputfile1=None,
                 inputfile2=None,
                 identifier1=None,
                 identifier2=None,
                 fakeid1='1domA',
                 fakeid2='2domA',
                 outputfile=None,
                 parser=None,
                 verbose=False):
        '''
		Aligner class for the DaliLite pairwise program.

		Currently the Parser does not work!
		If verbose is True, output from DaliLite will be
		printed out!
		Use symlink=True when the DAT files are on
		fast-access directories.
		For remote files, turn off symlink;
		this will make local copies of the .dat files for DaliLite.
		'''
        if cmd == None:
            cmd = Settings.get('dalilite')

        self.temp_dir = tempfile.mkdtemp()

        if inputfile1 == None:
            raise AlignerError("No input file 1 is given.")

        if inputfile2 == None:
            raise AlignerError("No input file 2 is given.")

        if outputfile == None:
            self.outputfile = os.path.join(
                self.temp_dir,
                os.path.basename(inputfile1).replace('.dat', '.dccp'))
        else:
            self.outputfile = outputfile

        if identifier1 == None:
            identifier1 = parse_sequence_filename(inputfile1)[1]
        if identifier2 == None:
            identifier2 = parse_sequence_filename(inputfile2)[1]

        if os.path.exists(self.outputfile):
            raise AlignerError(outputfile, "Outputfile already exists!")

        self.identifier1 = identifier1
        self.identifier2 = identifier2

        self.fakeid1 = fakeid1
        self.fakeid2 = fakeid2

        #preparing
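        #the inputs are rewritten as DAT files under the fake ids in a private
        #DAT directory, presumably because DaliLite expects short PDB-style
        #identifiers (see the identifier note in Example #22)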
        self.dat_dir = os.path.join(self.temp_dir, 'DAT')
        os.mkdir(self.dat_dir)
        self.dat_file1 = os.path.join(self.dat_dir, self.fakeid1 + '.dat')
        self.dat_file2 = os.path.join(self.dat_dir, self.fakeid2 + '.dat')

        dat1 = DaliLiteDAT(inputfile1)
        dat1.convert_identifier(output=self.dat_file1,
                                output_identifier=self.fakeid1)
        dat2 = DaliLiteDAT(inputfile2)
        dat2.convert_identifier(output=self.dat_file2,
                                output_identifier=self.fakeid2)

        self.temp_output = os.path.join(self.temp_dir, self.fakeid1 + ".dccp")

        PairAligner.__init__(self,
                             cmd=cmd,
                             cwd=self.temp_dir,
                             inputfile1=inputfile1,
                             inputfile2=inputfile2,
                             identifier1=identifier1,
                             identifier2=identifier2,
                             outputfile=self.temp_output,
                             parser=None,
                             verbose=verbose)
Example #22
    def __init__(self,
                 pdbfn,
                 cmd=None,
                 save_dir=None,
                 output_fn=None,
                 identifier=None,
                 echo=True):
        '''
		Run the DaliLite command and generate a DAT file
		for large-scale DaliLite structure comparisons.

		identifier is quite similar to a PDB ID (with the chain ID appended),
		except that DaliLite differentiates lower- vs upper-case characters and
		the first character does not have to be a numeric character.

		Note that the settings in this DAT generator are somewhat different
		from the original settings in the DaliLite program.
		In DaliLite the filename of the DAT file should (I guess) match
		the identifier and chain ID.

		echo is a flag that controls output from the program
		for logging purposes and the actual command run to generate
		the DAT file.

		WARNING: This class depends on a slightly modified version
		of DaliLite that saves DAT files into the DAT directory in
		the current working directory.
		'''

        self.cmd = cmd
        self.pdbfn = pdbfn
        self.save_dir = save_dir
        self.output_fn = output_fn
        self.identifier = identifier
        self.echo = echo

        self.temp_dir = tempfile.mkdtemp()

        if self.cmd == None:
            self.cmd = Settings.get("dalilite")

        if self.save_dir:
            if not os.path.exists(self.save_dir):
                os.makedirs(self.save_dir)
        else:
            self.save_dir = os.getcwd()

        #convert the data into absolute_dir.
        if self.pdbfn:
            shutil.copy(self.pdbfn, self.temp_dir)
            self.pdbfn = os.path.join(self.temp_dir,
                                      os.path.basename(self.pdbfn))

        if not self.output_fn:
            dir, basename, ext = parse_sequence_filename(
                os.path.basename(self.pdbfn))
            self.output_fn = basename + '.dat'

        if os.path.exists(os.path.join(self.save_dir, self.output_fn)):
            raise IOError("File already exists.", self.output_fn)

        #read settings file to get the default identifier
        #this is important for internal consistency
        if self.identifier == None:
            self.identifier = Settings.get("default_dali_id")

        #original DAT file that will be produced by the DaliLite program!
        self.dat_fn = os.path.join(self.temp_dir, "DAT",
                                   self.identifier + ".dat")
        self.dssp_fn = os.path.join(self.temp_dir,
                                    self.identifier[:4] + ".dssp")
Example #23
    def __init__(self,
                 inputpdb=None,
                 read_cmd=None,
                 buildbackbone_cmd=None,
                 dglp_list=None,
                 save_dir=None,
                 output_fn=None,
                 echo=True):
        '''
		Run the MaxSprout command buildbackbone to generate a new PDB file,
		filling in backbone atoms for CA-only residues.
		This class does not do full model building but only builds backbones;
		full sidechain optimization has some problems.

		echo is a flag that controls output from the program
		for logging purposes and the actual command run to generate
		the full-backbone PDB file.

		Note that inputpdb is assumed to be a single-chain PDB file.
		If the input PDB sequence and the output PDB sequence do not match,
		MaxSproutRunnerSequenceChangeError will be raised!

		Note that the final backbone-built model is not actually
		in a complete PDB file format;
		it has missing records like occupancies and B-factors.
		'''
        self.read_cmd = read_cmd
        self.buildbackbone_cmd = buildbackbone_cmd
        self.dglp_list = dglp_list  #Necessary input param for buildbackbone cmd.

        self.pdbfn = inputpdb
        self.save_dir = save_dir
        self.output_fn = output_fn
        self.echo = echo

        self.temp_dir = tempfile.mkdtemp()

        if self.read_cmd == None:
            self.read_cmd = Settings.get("maxsprout_readbrk")
        if self.buildbackbone_cmd == None:
            self.buildbackbone_cmd = Settings.get("maxsprout_buildbackbone")
        if self.dglp_list == None:
            self.dglp_list = Settings.get("maxsprout_dglp_list")

        if not (self.read_cmd and self.buildbackbone_cmd and self.dglp_list):

            for k, v in Settings.settings.items():
                print(k, ":", v)

            raise MaxSproutRunnerError(
                "Commands or dglp.list info was not retrieved!", self.read_cmd,
                self.buildbackbone_cmd, self.dglp_list)

        if self.save_dir:
            if not os.path.exists(self.save_dir):
                os.makedirs(self.save_dir)
        else:
            self.save_dir = os.getcwd()

        #convert the data into absolute_dir.
        if self.pdbfn:
            shutil.copy(self.pdbfn, self.temp_dir)
            self.pdbfn = os.path.join(self.temp_dir,
                                      os.path.basename(self.pdbfn))

        if not self.output_fn:
            dir, basename, ext = parse_sequence_filename(
                os.path.basename(self.pdbfn))
            self.output_fn = basename + '.maxsprout'

        if os.path.exists(os.path.join(self.save_dir, self.output_fn)):
            raise IOError("Maxsprouted file already exists.", self.output_fn)
Example #24
    def __init__(
        self,
        msa=None,
        query=None,  #main input
        number_of_processors=None,
        database=None,  #BLAST Setting if MSA should be built
        echo=True,  #echoing output option
        #commands
        formatdb=None,
        makemat=None,
        psipred=None,
        psipass2=None,
        #data directory containing weight file for psipred
        psipred_data_dir=None):
        '''
		Predict secondary structure using PSIPRED.

		If a multiple sequence alignment, msa (an MSA object), is given,
		the msa is used for the prediction.

		If query is given, blastpgp will be used
		to build a multiple sequence alignment and
		predict the secondary structure.

		Mode 1 of this class is basically copied from
		the runpsipred script.
		'''

        if formatdb == None:
            formatdb = Settings.get('formatdb')
        if makemat == None:
            makemat = Settings.get('makemat')
        if psipred == None:
            psipred = Settings.get('psipred')
        if psipass2 == None:
            psipass2 = Settings.get('psipass2')
        if psipred_data_dir == None:
            psipred_data_dir = Settings.get('psipred_data_dir')

        print("PSIPRED is running...")

        self.temp_dir = None  #tempfile.tempname()
        self.number_of_processors = number_of_processors

        #mode 2 stuff.
        self.database = database
        self.query = query

        #Mode 1. use the given MSA to predict 2nd Structures
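        #pipeline below (as in the runpsipred script): formatdb on the query,
        #one PSI-BLAST pass to write a checkpoint, makemat to build the .mtx,
        #then psipred and psipass2 to produce the .ss/.horiz predictions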
        if msa != None:

            input_string = str(msa.query)
            self.temp_dir = tempfile.mkdtemp()

            alignment_input = os.path.join(self.temp_dir, "query.aln")
            fp = open(alignment_input, 'w')
            msa.build_psiblast_alignment_input(fp)
            fp.close()

            dummy_db = os.path.join(self.temp_dir, 'query.seq')
            msa.query.save(dummy_db)
            os.system(formatdb + ' -i ' + dummy_db)

            checkpoint = os.path.join(self.temp_dir, 'query.chk')
            output = '/dev/null'

            if verbose:
                print('temp_dir:', self.temp_dir)
                print('input_string:', input_string)
                print('dummy_db', dummy_db)
                print('checkpoint', checkpoint)
                print('alignment_input', alignment_input)
                print(open(alignment_input).read())

            runner = PSIBLASTRunner(input_string=input_string,
                                    max_iterations=1,
                                    output=output,
                                    input_alignment=alignment_input,
                                    database=dummy_db,
                                    number_of_processors=number_of_processors,
                                    checkpoint=checkpoint)
            runner.run()

            basename = os.path.join(self.temp_dir, 'query')

            fp = open(basename + ".pn", 'w')
            print("query.chk", file=fp)
            fp.close()

            fp = open(basename + ".sn", 'w')
            print("query.seq", file=fp)
            fp.close()

            if verbose:
                print("basename:", basename)
                print(basename + '.pn')
                print(open(basename + '.pn').read())
                print(basename + '.sn')
                print(open(basename + '.sn').read())

            os.system(makemat + " -P " + basename)
            weight_file = os.path.join(psipred_data_dir, "weights.dat")
            os.system(
                psipred +
                " %(basename)s.mtx %(weight_file)s %(weight_file)s2 %(weight_file)s3 %(weight_file)s4 > %(basename)s.ss"
                % locals())

            os.system(
                "%(psipass2)s %(psipred_data_dir)s/weights_p2.dat 1 0.98 1.09 %(basename)s.ss2 %(basename)s.ss > %(basename)s.horiz"
                % locals())

            self.output = basename + '.horiz'  #important output !!!

        #Mode 2. build MSA and then predict 2nd structures
        elif query != None:
            raise NotYetImplementedError(
                'Building Query Mode has not yet been implemented!')

        else:
            raise PSIPREDRunnerError('MSA or query FASTA should be given!')
Example #25
'''
This subpackage contains modules to manage information about proteins 
or domains in the database.
'''
#debug = 1
verbose = 1

import os, shelve
from evdblib.DBTools import Settings

domain_info_db = Settings.get('domain_info_db')
classification_info_db = Settings.get('classification_info_db')
classification_levels = int(Settings.get('classification_levels'))

#########################################
#Using local database
#########################################
if Settings.get('use_local_db'):
    cwd = os.getcwd()
    domain_info_db = os.path.join(cwd, os.path.basename(domain_info_db))
    classification_info_db = os.path.join(
        cwd, os.path.basename(classification_info_db))

    ##################################
    #temporary blocking!!!
    ##################################
    #class DomInfoDB :
    #def __init__( self, domain_info_db ) :
    #	self.dominf = shelve.open( domain_info_db )
    '''
	def __del__( self ) :
Example #26
def generate_intermediate_dbs(domain_informations):
    #Settings.get( 'intermediate_result_dir' )
    intermediate_domain_info_db = Settings.get('intermediate_domain_info_db')
    intermediate_classification_info_db = Settings.get(
        'intermediate_classification_info_db')
Example #27
'''
This module contains wrappers for the blastpgp program.

BLASTRunner class will run protein blast or psiblast.
'''
import os, sys, time, tempfile, shutil, copy
from subprocess import Popen, PIPE
from io import StringIO
from evdblib.Utils.Parsers import FASTA, BLAST
from evdblib.Utils import parse_sequence_filename, find_command_in_path, build_profile_filename

from . import Runner

formatdb = 'formatdb'
if find_command_in_path(formatdb):
    from evdblib.DBTools import Settings
    formatdb = Settings.get('formatdb')

verbose = 0
default_max_iterations_for_neighbors = 1


class BLASTRunner:
    '''
	BLASTRunner class will run blastpgp program.
	'''
    def __init__(self,
                 input=None,
                 output=None,
                 database=None,
                 evalue_cutoff=None,
                 input_string=None,