示例#1
0
def run_agadir_on_1000_fastas():
    """
    Split each multi-fasta file found under the mutants-fastas output folder into one fasta file per sequence.

    The folder searched (recursively) is <Paths.OUTPUT>/<DIR_MUTANTS_FASTAS>/<startnum><DOTS3><endnum> and the
    matching files are ordered with natsort.natsorted. When running on the cluster, one job per file is written
    and submitted; otherwise the split is performed in-process with Paths.INPUT as destination.

    NOTE(review): `startnum` and `endnum` are read from the enclosing module scope, and no Agadir computation is
    invoked in this body despite the function name - confirm against the rest of the file.
    """
    dirname_3dots = str(startnum) + Str.DOTS3.value + str(endnum)
    path_output_mutants = os.path.join(Paths.OUTPUT,
                                       Paths.DIR_MUTANTS_FASTAS.value,
                                       dirname_3dots)
    glob_pattern = path_output_mutants + '/**/*' + Str.FSTAEXT.value
    path_fastafiles_found = glob.glob(glob_pattern, recursive=True)
    path_fastafiles_sorted = natsort.natsorted(path_fastafiles_found)
    # Instantiated as in the original flow, although it is not referenced again in this function.
    agadir = Agadir(Cond.INCELL_MAML.value)
    for path_fastafile in path_fastafiles_sorted:
        # Pause between files (1 second each).
        time.sleep(1)
        if not GUM.using_cluster():
            GUM.write_1_fastafile_per_fasta_from_multifastafile(
                path_dst=Paths.INPUT, path_fastafile=path_fastafile)
            continue
        # Cluster: the job is named after the fasta file and runs the splitting script on that file.
        fastafile = path_fastafile.split('/')[-1]
        jobname = Paths.PREFIX_WRITE.value + fastafile
        path_to_script = os.path.join(
            Paths.SRC,
            'write_1fastafile_per_fasta_from_multifastafile_zeus.py')
        script_with_paths = path_to_script + Str.SPCE.value + path_fastafile
        Cluster.write_job_q_bash(jobname,
                                 path_job_q_dir=Paths.CONFIG_JOBQ,
                                 python_script_with_paths=script_with_paths)
        Cluster.run_job_q(path_job_q_dir=Paths.CONFIG_JOBQ)
示例#2
0
 def test_write_mutants_2(self):
     """
     _write_mutants with write_fasta_per_mut=True must create the WT_SCI mutants folder and write one fasta
     file for the wild-type plus one per mutant.
     """
     # arrange
     path_tests_output_fastas_wtsci = os.path.join(
         TPLS.MC_TESTS_OUTPUT_FASTAS.value, 'WT_SCI')
     GUM.linux_remove_all_files_in_dir(path_tests_output_fastas_wtsci)
     title_titleSeq_w_mutants = {
         'WT_SCI': {
             'WT_SCI': 'SCI',
             'S1A': 'ACI',
             'C2A': 'SAI',
             'I3A': 'SCA'
         }
     }
     expected_mutant_fastafile_list = [
         'WT_SCI.fasta', 'S1A.fasta', 'C2A.fasta', 'I3A.fasta'
     ]
     # act
     self.mutateFasta._write_mutants(
         title_titleSeq_w_mutants,
         write_1_fasta_only=False,
         write_fasta_per_mut=True,
         path_output_3dots=TPLS.MC_TESTS_OUTPUT.value,
         write_csv=False,
         write_txt=False)
     path_tests_output_fastas_wtsci_mutants = os.path.join(
         TPLS.MC_TESTS_OUTPUT.value, TPLS.DIR_FASTAS.value, 'WT_SCI',
         TPLS.DIR_MUTANTS.value)
     mutant_fastafile_list = GUM.make_filelist_in_current_dir_and_subdirs_recursively(
         path_tests_output_fastas_wtsci, '.fasta')
     # assert
     # Check the folder on disk: asserting on the path string itself would pass for any non-empty string.
     self.assertTrue(os.path.isdir(path_tests_output_fastas_wtsci_mutants),
                     msg=path_tests_output_fastas_wtsci_mutants +
                     ' folder not found.')
     for exp_mutant_fastafile in expected_mutant_fastafile_list:
         # assertIn reports the missing name and the actual list on failure.
         self.assertIn(exp_mutant_fastafile, mutant_fastafile_list)
示例#3
0
 def test_copy_input_files_from_repo_to_input_2(
         self, mock_get_pdb_or_fastafile_list_from_subdirs):
     """
     With the repo listing mocked to five fasta files, copying a wanted subset must return exactly the wanted
     file names.
     """
     # arrange
     mocked_repo_listing = [
         '1_A.fasta', '1_B.fasta', '2_A.fasta', '3_A.fasta', '3_B.fasta'
     ]
     mock_get_pdb_or_fastafile_list_from_subdirs.return_value = mocked_repo_listing
     wanted_file_list = ['1_A.fasta', '3_A.fasta']
     path_repo = TPLS.REPO_PDB_FASTA.value
     path_dst = TPLS.MC_TESTS_INPUT.value
     copy_all_files_in_dir = False  # not passed to the call below
     # act
     copied_wanted_file_list = GUM.copy_files_from_repo_to_input_dirs(
         path_repo, path_dst, wanted_file_list)
     path_copied_file_list = []
     for copied_file in copied_wanted_file_list:
         path_copied_file_list.append(path_dst + '/' + copied_file)
     # assert
     self.assertEqual(wanted_file_list, copied_wanted_file_list)
     # NOTE(review): this asserts the truthiness of each path string, not that the file exists on disk -
     # confirm whether an existence check was intended.
     for path_copied_file in path_copied_file_list:
         self.assertTrue(path_copied_file)
__author__ = "Shahin Zibaee"
__copyright__ = "Copyright 2018, The Switch lab, KU Leuven"
__license__ = "GPL"
__version__ = "1.0.0"
__maintainer__ = "Shahin Zibaee"
__email__ = "*****@*****.**"
__status__ = "Development"
"""
1. Set up paths. ("use_cluster" is set to False by default.)  
"""
# Configure the project paths. Cluster mode is requested only when a first command-line argument is present
# and, after stripping surrounding spaces, equals 'use_cluster=True'.
Paths.set_up_paths(use_cluster=(
    len(sys.argv) > 1 and sys.argv[1].strip(' ') == 'use_cluster=True'))
"""
2. Get csv files you want to read and write to single file.
"""
using_cluster = GUM.using_cluster()
path_output_ac_or_bm_dir = ''
# Collect, at any depth under Paths.OUTPUT_BM, every file whose name ends with Str.CSVEXT.value.
path_output_csvfiles = glob.glob(Paths.OUTPUT_BM + '/**/*' + Str.CSVEXT.value,
                                 recursive=True)
"""
(some private functions defined here)
"""


def _extract_pdbname(filename: str):
    regex = re.compile(r"RepairPDB_[0-9]+_")
    res = regex.search(filename)
    pdbnme = filename[res.start():res.end() - 1]
    return pdbnme

示例#5
0
Delete files no longer needed from the output folders in order to reduce disk space usage (particularly important on the cluster).

:sys.argv[1] path_output_ac_pdb_fxmutant_dir: Absolute path to each pdb/mutant folder.
"""

# The last three components of the folder path are read as <ac-or-bm dir>/<pdbname>/<fxmutantname>.
_dir_components = path_output_ac_or_bm_pdb_fxmutant_dir.split('/')
ac_or_bm = _dir_components[-3]
pdbname = _dir_components[-2]
fxmutantname = _dir_components[-1]
fx = FoldX()
if ac_or_bm == Paths.DIR_AC.value:
    print(ac_or_bm + 'it is in the ac scope')
    ac = fx.AnalyseComplex(Cond.INCELL_MAML_FX.value)
    # Remove pdb files from the matching BuildModel folder as well as from this folder.
    path_output_bm_pdb_fxmutant_dir = os.path.join(Paths.OUTPUT_BM, pdbname,
                                                   fxmutantname)
    fx.rm_pdbfiles(path_output_bm_pdb_fxmutant_dir)
    fx.rm_pdbfiles(path_output_ac_or_bm_pdb_fxmutant_dir)
    ac.rm_all_sumry_except_1_0(path_output_ac_or_bm_pdb_fxmutant_dir)
elif ac_or_bm == Paths.DIR_BM.value:
    print(ac_or_bm + 'it is in the bm scope')
    bm = fx.BuildModel(Cond.INCELL_MAML_FX.value)
    pdbfile = pdbname + Str.PDBEXT.value
    path_pdbfile = os.path.join(path_output_ac_or_bm_pdb_fxmutant_dir,
                                pdbfile)
    num_of_chains = GUM.get_num_of_chains(path_pdbfile)
    if num_of_chains == 1:
        # BuildModel has no use for these mutant pdb files and, because this pdb has only 1 chain,
        # AnalyseComplex has no use for them either. Hence, they can already be removed.
        fx.rm_pdbfiles(path_output_ac_or_bm_pdb_fxmutant_dir)
else:
    print(ac_or_bm + ' something is not working !')
# These removals run whichever branch was taken above.
fx.rm_config_files(path_output_ac_or_bm_pdb_fxmutant_dir)
fx.rm_cluster_logfiles(path_output_ac_or_bm_pdb_fxmutant_dir,
                       rm_non_empty_err_files=False)
fx.rm_unnecessary_fxoutfiles(path_output_ac_or_bm_pdb_fxmutant_dir)
示例#6
0
# Positional command-line arguments 2-8. The four write_* flags are True only when the corresponding
# argument is exactly the string 'True'; any other value yields False.
path_output = sys.argv[2]
path_config_job = sys.argv[3]
path_output_blastp = sys.argv[4]
write_idmaps_for_mysldb = sys.argv[5] == 'True'
write_csv = sys.argv[6] == 'True'
write_xml = sys.argv[7] == 'True'
write_json = sys.argv[8] == 'True'


for path_fastafile in path_input_fastafile_list:
    with open(path_fastafile) as fastafile_opened:
        fastafile_name = path_fastafile.split('/')[-1].split('.')[0]
        jobname = 'BLSTP_' + fastafile_name
        Cluster.write_job_q_bash(jobname=jobname, path_job_q_dir=path_config_job, queue='all.q', memory_limit_GB='3',
                                 cluster_node='hodor1.vib')
        path_output_blastp_fastaname = GUM.os_makedirs(path_output_blastp, fastafile_name)
        os.chdir(path_output_blastp_fastaname)
        Cluster.run_job_q(path_job_q_dir=path_config_job)
        Cluster.wait_for_grid_engine_job_to_complete(grid_engine_job_prefix_or_full_name=jobname)
        path_raw_blstp_xml = IdProt._write_raw_blast_xml(path_output, fastafile_name,
                                                         blastp_result=NCBIWWW.qblast(
                                                             program=Biopy.BlastParam.BLST_P.value,
                                                             database=Biopy.BlastParam.SWSPRT.value,
                                                             sequence=fastafile_opened.read(),
                                                             entrez_query=Biopy.BlastParam.HOMSAP_ORG.value,
                                                             alignments=Biopy.BlastParam.MAX_ALIGN_20.value,
                                                             hitlist_size=Biopy.BlastParam.MAX_HIT_20.value))
        blastp_dict = Biopy.parse_filter_blastp_xml_to_dict(path_raw_blstp_xml, fastafile_name, path_fastafile)
        # blastp_dict_list.append(blastp_dict)
        if write_idmaps_for_mysldb:
            IdProt._write_idmaps_for_mysqldb(path_output, blastp_dict, write_csv=write_csv, write_xml=write_xml,
示例#7
0
    def test__write_idmaps_for_mysqldb(self):
        """
        _write_idmaps_for_mysqldb must write csv, xml and json id-map files whose contents match the expected
        csv/xml strings built here and the json reference file.
        """
        # arrange
        GUM.linux_remove_all_files_in_dir(
            os.path.join(TPLS.MC_TESTS_OUTPUT_BLASTP.value,
                         self.DIR_1_A + self.UNDR_SCR_ID_MAPS))
        fastafile_name = self.DIR_1_A
        query_length = 538
        database_used = 'swissprot'
        database_seqs_num = 20379
        accession_num = 'Q99985'
        length = 751
        sbjct_end = 565
        sbjct_start = 28
        identical_aligns_list = [{
            'accession_num':
            accession_num,
            'length':
            length,
            'hit_def':
            'RecName: Full=Semaphorin-3C; AltName: Full=Semaphorin-E; Short=Sema E; '
            'Flags: Precursor',
            'hsp_dict': {
                'align_length': 538,
                'gaps': 0,
                'identities': 538,
                'query_end': 538,
                'query_start': 1,
                'sbjct_end': sbjct_end,
                'sbjct_start': sbjct_start
            }
        }]
        blastp_dict = {
            'query_seq_id': fastafile_name,
            'query_length': query_length,
            'database': database_used,
            'database_seqs_num': database_seqs_num,
            'identical_aligns_list': identical_aligns_list
        }
        full_name = 'Semaphorin-3C'
        altname = 'Semaphorin-E'
        flags = 'Precursor'
        expected_csv = 'sequence_id,accession_num,length,full_name,altname,flags,start_pos,end_pos\n' + \
                       fastafile_name + ',' + accession_num + ',' + str(length) + ',' + full_name + ',' + \
                       altname + ',' + flags + ',' + str(sbjct_start) + ',' + str(sbjct_end) + '\n'
        expected_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<sequence_id id="1_A">\n\t<accession_num>' + \
                       accession_num + '</accession_num>\n\t<length>' + str(length) + '</length>\n\t<full_name>' + \
                       full_name + '</full_name>\n\t<altname>' + altname + '</altname>\n\t<flags>' + flags + \
                       '</flags>\n\t<start_pos>' + str(sbjct_start) + '</start_pos>\n\t<end_pos>' + str(sbjct_end) + \
                       '</end_pos>\n</sequence_id>\n'
        # All three id-map files are expected in the same '<name>_idmaps' folder.
        path_output_blastp_idmaps_dir = os.path.join(
            TPLS.MC_TESTS_OUTPUT_BLASTP.value, self.DIR_1_A + '_idmaps')
        expected_path_output_blastp_xmlfile = os.path.join(
            path_output_blastp_idmaps_dir, 'idmap_swsprt.xml')
        expected_path_output_blastp_csvfile = os.path.join(
            path_output_blastp_idmaps_dir, 'idmap_swsprt.csv')
        expected_path_output_blastp_jsonfile = os.path.join(
            path_output_blastp_idmaps_dir, 'idmap_swsprt.json')
        # The expected json is taken verbatim from the reference file.
        path_ref_files_blastp_1_A_jsonfile = os.path.join(
            TPLS.MC_TESTS_REFFILES.value, TPLS.DIR_BLASTP.value,
            fastafile_name + '_idmaps', 'idmap_swsprt.json')
        # act
        IdProt._write_idmaps_for_mysqldb(
            path_output=TPLS.MC_TESTS_OUTPUT.value,
            blastp_dict=blastp_dict,
            write_xml=True,
            write_csv=True,
            write_json=True)
        # assert
        # Local names avoid shadowing the csv / json module names.
        with open(expected_path_output_blastp_csvfile) as csv_opened:
            written_csv = csv_opened.read()
            self.assertEqual(expected_csv, written_csv)
        with open(expected_path_output_blastp_xmlfile) as xml_opened:
            written_xml = xml_opened.read()
            self.assertEqual(expected_xml, written_xml)
        with open(expected_path_output_blastp_jsonfile) as json_opened, \
                open(path_ref_files_blastp_1_A_jsonfile) as ref_json_opened:
            written_json = json_opened.read()
            expected_json = ref_json_opened.read()
            self.assertEqual(expected_json, written_json)
示例#8
0
    def start(operations: dict, use_multithread: bool, path_input: str,
              path_output: str, path_pdbfiles: list, path_fastafiles: list,
              specific_fxmutants: list, amino_acids: list,
              write_1_fasta_only: bool, write_fasta_per_mut: bool,
              write_to_csv_dumpfile_after_each_mutant: bool):
        """
        Iterate through a list of fasta files or pdb files and perform Agadir or FoldX computations as specified
        by 'operations'.

        Fasta files are processed first (mutate-fasta, then Agadir), followed by pdb files (BuildModel,
        AnalyseComplex, Repair per pdb file). Work is run directly, launched as a process/thread when
        use_multithread is True, and/or submitted as a cluster job when GUM.using_cluster() is True.
        :param operations: Each operation paired with True/False flag to indicate whether or not to perform it.
        :param use_multithread: True to employ parallel processing.
        :param path_input: Absolute path to input_data root dir. (Not read by this method.)
        :param path_output: Absolute path to output_data root dir.
        :param path_pdbfiles: Absolute paths to pdb input files.
        :param path_fastafiles: Absolute paths to fasta input files.
        :param specific_fxmutants: Given when specific mutants only should be calculated.
        :param amino_acids: Amino acids that mutation operations should use to mutate to.
        :param write_1_fasta_only: True to write any fasta output data to 1 fasta file, one sequence per line.
        :param write_fasta_per_mut: True to write any fasta output data as 1 fasta file per mutant. (Uses a lot
        of disk space).
        :param write_to_csv_dumpfile_after_each_mutant: True to write ddG values from fxout files to one csv file
        (for database dump).
        """
        if path_fastafiles:
            num_fastafiles = len(path_fastafiles)
            if operations[Scheduler.Strs.OPER_RUN_MUT_FSTA.value]:
                path_output_fastas_3dots = GUM.make_path_fastas_3dots_dirs(
                    path_output, path_fastafiles[0])
                mutate_fasta = MutateFasta(amino_acids)
                # Pause between files only for batches of 200 or more; the value is the same for every file.
                sleep_secs = 0 if num_fastafiles < 200 else num_fastafiles / 5000
                for path_fastafile in path_fastafiles:
                    time.sleep(sleep_secs)
                    using_cluster = GUM.using_cluster()
                    if use_multithread:
                        Scheduler._launch_process(
                            target=mutate_fasta.mutate_every_residue,
                            args=[
                                path_fastafile, write_1_fasta_only,
                                write_fasta_per_mut, path_output_fastas_3dots
                            ])
                    elif not using_cluster:
                        mutate_fasta.mutate_every_residue(
                            path_fastafile, write_1_fasta_only,
                            write_fasta_per_mut, path_output_fastas_3dots)
                    if using_cluster:
                        jobname = Paths.PREFIX_MUTFSTA.value + path_fastafile.split(
                            '/')[-1]
                        # NOTE(review): these overrides rebind the parameters, so they also apply to every
                        # later iteration of this loop - confirm that is intended.
                        write_1_fasta_only = True
                        write_fasta_per_mut = False
                        Cluster.write_job_q_bash(
                            jobname=jobname,
                            path_job_q_dir=Paths.SE_CONFIG_MUTFASTA_JOBQ.value,
                            python_script_with_paths=os.path.join(
                                Paths.SE_SRC.value, 'run_mutate_fasta_zeus.py')
                            + Str.SPCE.value + path_fastafile +
                            Str.SPCE.value + str(write_1_fasta_only) +
                            Str.SPCE.value + str(write_fasta_per_mut) +
                            Str.SPCE.value + path_output_fastas_3dots,
                            queue='',
                            n_slots='',
                            total_memory_GB='',
                            memory_limit_GB='3',
                            cluster_node='')
                        Cluster.run_job_q(
                            path_job_q_dir=Paths.SE_CONFIG_MUTFASTA_JOBQ.value)

            if operations[Scheduler.Strs.OPER_RUN_AGDR.value]:
                agadir = Agadir(Cond.INCELL_MAML_FX.value)
                sleep_secs = 0 if num_fastafiles < 200 else num_fastafiles / 1000
                for path_fastafile in path_fastafiles:
                    time.sleep(sleep_secs)
                    using_cluster = GUM.using_cluster()
                    if using_cluster:
                        print(
                            'Calling scheduler.do_agadir using_cluster condition'
                        )
                        jobname = Paths.PREFIX_AGADIR.value + path_fastafile.split(
                            '/')[-1]
                        # Script path followed by its two space-separated arguments.
                        Cluster.write_job_q_bash(
                            jobname=jobname,
                            path_job_q_dir=Paths.SE_CONFIG_AGAD_JOBQ.value,
                            python_script_with_paths=os.path.join(
                                Paths.SE_SRC.value,
                                'run_agadir_on_multifastas_zeus.py') +
                            Str.SPCE.value + path_fastafile +
                            Str.SPCE.value + Paths.SE_OUTPUT.value)
                        Cluster.run_job_q(
                            path_job_q_dir=Paths.SE_CONFIG_AGAD_JOBQ.value)

                    path_dst = GUM.make_path_agadir_3dots_filename_mutants_dirs(
                        path_output, path_fastafile, add_filename_subdir=True)
                    if use_multithread:
                        Scheduler._launch_process(
                            target=agadir.run_agadir_on_multifastas,
                            args=[path_fastafile, path_dst])
                    elif not using_cluster:
                        agadir.run_agadir_on_multifastas(
                            path_fastafile, path_dst)
        if path_pdbfiles:
            for path_pdbfile in path_pdbfiles:
                if operations[Scheduler.Strs.OPER_RUN_FX_BM.value]:
                    buildmodel = FoldX().BuildModel(Cond.INCELL_MAML_FX.value)
                    if use_multithread:
                        Scheduler._launch_thread(
                            target=buildmodel.mutate_protein_structure,
                            args=[
                                path_pdbfile, amino_acids, specific_fxmutants
                            ])
                    else:
                        buildmodel.mutate_protein_structure(
                            path_pdbfile,
                            amino_acids,
                            specific_fxmutants,
                            write_to_csv_dumpfile_after_each_mutant=
                            write_to_csv_dumpfile_after_each_mutant)
                if operations[Scheduler.Strs.OPER_RUN_FX_AC.value]:
                    analysecomplex = FoldX().AnalyseComplex(
                        Cond.INCELL_MAML_FX.value)
                    if use_multithread:
                        # args must be a sequence of arguments; a bare path string would be treated as one
                        # argument per character. Same positional arguments as the direct call below.
                        Scheduler._launch_thread(
                            target=analysecomplex.calculate_complex_energies,
                            args=[path_pdbfile, specific_fxmutants])
                    else:
                        analysecomplex.calculate_complex_energies(
                            path_pdbfile,
                            specific_fxmutants,
                            write_to_csv_dumpfile_after_each_mutant=
                            write_to_csv_dumpfile_after_each_mutant)
                if operations[Scheduler.Strs.OPER_RUN_FX_RPR.value]:
                    repair = FoldX().Repair(Cond.INCELL_MAML_FX.value)
                    if use_multithread:
                        # Wrapped in a list for the same reason as above.
                        Scheduler._launch_thread(target=repair.do_repair,
                                                 args=[path_pdbfile])
                    else:
                        repair.do_repair(path_pdbfile)
示例#9
0
 def setUpClass(cls):
     """Copy all files from TPLS.CONFIG_FOR_READ_ONLY into TPLS.MC_TESTS, recursively."""
     path_read_only_config_dir = TPLS.CONFIG_FOR_READ_ONLY.value
     path_tests_dir = TPLS.MC_TESTS.value
     GUM.linux_copy_all_files_in_dir(path_src_dir=path_read_only_config_dir,
                                     path_dst_dir=path_tests_dir,
                                     recursively=True)
示例#10
0
4. Select specific mutants if you are only interested in these.
BE SURE TO SET THIS TO EMPTY LIST IF YOU DON'T WANT ANY OF THE SUBSEQUENT ACTIONS BELOW TO BE SPECIFIC TO THIS/THESE MUTANTS ONLY.
"""
# The code below compares the first and last character of each entry to detect a mutation to the same
# (wild-type) residue. Use the commented-out empty list to disable mutant-specific behaviour.
specific_fxmutants = ['CA498L']
# specific_fxmutants = []
"""
5. Select which algorithm's unwanted files to remove: 
"""
# One switch per algorithm; only the outputs of those set to True are meant to be cleaned up.
delete_from_buildmodel_outputs = True
delete_from_analysecomplex_outputs = False
delete_from_agadir_outputs = False
"""
6. Remove unwanted FoldX-related input/output files:
"""

using_cluster = GUM.using_cluster()
if delete_from_buildmodel_outputs:
    fx = FoldX()
    path_output_bm_pdb_fxmutant_dirs = []
    for path_pdbfile in path_pdbfiles:
        pdbfile = os.path.basename(path_pdbfile)
        pdbname = pdbfile.split('.')[0]
        for specific_fxmutant in specific_fxmutants:
            if specific_fxmutant[0] == specific_fxmutant[len(specific_fxmutant)
                                                         - 1]:
                print(
                    'Mutations to wild-type residues are currently not being performed, therefore the results directory '
                    'should not exist for this: ' + str(pdbname) + '_' +
                    specific_fxmutant)
                continue
            path_output_bm_pdb_fxmutant_dirs.append(
示例#11
0
__status__ = "Development"

"""
1. Set up paths. ("use_cluster" is set to False by default.)
"""
# Cluster mode is requested only when the first command-line argument is exactly 'use_cluster=True'
# (surrounding spaces ignored).
Paths.set_up_paths(use_cluster=(len(sys.argv) > 1 and sys.argv[1].strip(' ') == 'use_cluster=True'))

"""
2. Select directory of files for packing & compressing: Agadir-related
"""
# path_dir_txt_files_to_pack = os.path.join(Paths.OUTPUT_AGADIR, '02064053-3f32-32e6-9660-aaaffc30db87')
path_output_agad_3dots_dir = os.path.join(Paths.OUTPUT_AGADIR, '1...250')
# Every direct child of the Agadir '1...250' folder is packed separately.
path_files_to_pack_dirs = glob.glob(path_output_agad_3dots_dir + '/*')

"""
3. Select pdbname(s), outputs of which are to be packed & compressed.
"""
pdbname = ['Repair_14']

"""
4. Select directory of files for packing & compressing: FoldX-related
"""
# os.path.join only accepts path strings, so the list of names is expanded into one BuildModel and one
# AnalyseComplex folder per pdb name. A single name given as a plain string is accepted too.
for _pdbname in ([pdbname] if isinstance(pdbname, str) else pdbname):
    path_files_to_pack_dirs.append(os.path.join(Paths.OUTPUT_BM, _pdbname))
    path_files_to_pack_dirs.append(os.path.join(Paths.OUTPUT_AC, _pdbname))

"""
5. Pack files in directory into tar.
"""
for path_files_to_pack_dir in path_files_to_pack_dirs:
    GUM.make_tarfile(path_files_to_pack_dir)
示例#12
0
    def _write_mutants(self,
                       title_titleSeq_w_mutants: dict,
                       write_1_fasta_only: bool,
                       write_fasta_per_mut: bool,
                       path_output_3dots: str,
                       write_csv=False,
                       write_txt=False):
        """
        Writes the mutants out to fastafiles and/or csv files and/or txt files. These can be written in one file
        containing all mutants or one file per mutant. Each wild-type in the input gets its own set of files.

        Files written per wild-type title <wt>, all below path_output_3dots:
        - write_1_fasta_only:  <wt>/<DIR_MUTANTS>/<wt>_mutants.fasta (wild-type first, then every mutant).
        - write_fasta_per_mut: <DIR_FASTAS>/<wt>/<wt><FSTAEXT> for the wild-type and
                               <DIR_FASTAS>/<wt>/<DIR_MUTANTS>/<mutant><FSTAEXT> for each mutant.
        - write_csv:           <DIR_SEQS_TXT_CSV>/<wt>/<wt>_mutants.csv as 'title:sequence,' entries.
        - write_txt:           <DIR_SEQS_TXT_CSV>/<wt>/<wt>_mutants.txt as one 'title:sequence' per line.
        :param title_titleSeq_w_mutants: Title of wild-type associated to every mutant title:sequence. The inner
        dict must also contain the wild-type's own title:sequence.
        :param write_1_fasta_only: True to write one fastafile containing all mutants, one sequence per line.
        :param write_fasta_per_mut: True to write one fastafile per mutant.
        :param path_output_3dots: Absolute path of output_data dir (where fasta, txt, csv written), includes to
        subdirs: /fastas/xxxx...yyyy/ e.g. 1001...2000
        :param write_csv: True to write 1 csv file for wt & mutants. False by default.
        :param write_txt: True to write 1 txt file for wt & mutants. False by default.
        :raises KeyError: If a wild-type title is missing from its own title:sequence dict.
        """
        if not (write_1_fasta_only or write_fasta_per_mut or write_csv
                or write_txt):
            # Nothing was requested, so there is nothing to create or write.
            return

        for wt_title, title_seq in title_titleSeq_w_mutants.items():
            if not title_seq:
                continue
            wt_seq = title_seq[wt_title]
            path_1_fastafile_open = None
            path_seqscsv_open = None
            path_seqstxt_open = None
            path_fastafilepermut_dir = None
            # Handles are tracked per wild-type so they are closed even if a write fails part-way.
            opened_files = []
            try:
                if write_1_fasta_only:
                    path_1_fastafile = GUM.os_makedirs(path_output_3dots,
                                                       wt_title,
                                                       Paths.DIR_MUTANTS.value)
                    path_1_fastafile = os.path.join(
                        path_1_fastafile, wt_title + '_mutants.fasta')
                    path_1_fastafile_open = open(path_1_fastafile, 'w')
                    opened_files.append(path_1_fastafile_open)
                    path_1_fastafile_open.write('>' + wt_title + '\n' +
                                                wt_seq + '\n')
                if write_fasta_per_mut:
                    path_wt_fastafile = GUM.os_makedirs(
                        path_output_3dots, Paths.DIR_FASTAS.value, wt_title)
                    path_wt_fastafile = os.path.join(
                        path_wt_fastafile, wt_title + Str.FSTAEXT.value)
                    with open(path_wt_fastafile, 'w') as wt_fastafile_open:
                        wt_fastafile_open.write('>' + wt_title + '\n' +
                                                wt_seq + '\n')
                    # The per-mutant files live in a 'mutants' subfolder, which must exist before writing.
                    path_fastafilepermut_dir = os.path.join(
                        path_output_3dots, Paths.DIR_FASTAS.value, wt_title,
                        Paths.DIR_MUTANTS.value)
                    os.makedirs(path_fastafilepermut_dir, exist_ok=True)
                if write_csv:
                    path_seqscsv = GUM.os_makedirs(
                        path_output_3dots, Paths.DIR_SEQS_TXT_CSV.value,
                        wt_title)
                    path_seqscsv = os.path.join(path_seqscsv,
                                                wt_title + '_mutants.csv')
                    path_seqscsv_open = open(path_seqscsv, 'w')
                    opened_files.append(path_seqscsv_open)
                    path_seqscsv_open.write(wt_title + ':' + wt_seq + ',')
                if write_txt:
                    path_seqstxt = GUM.os_makedirs(
                        path_output_3dots, Paths.DIR_SEQS_TXT_CSV.value,
                        wt_title)
                    path_seqstxt = os.path.join(path_seqstxt,
                                                wt_title + '_mutants.txt')
                    path_seqstxt_open = open(path_seqstxt, 'w')
                    opened_files.append(path_seqstxt_open)
                    path_seqstxt_open.write(wt_title + ':' + wt_seq + '\n')

                for mut_title, mut_seq in title_seq.items():
                    # The wild-type was already written above. Compare by value: two equal title strings
                    # are not guaranteed to be the same object.
                    if mut_title == wt_title:
                        continue
                    if path_1_fastafile_open is not None:
                        path_1_fastafile_open.write('>' + mut_title + '\n' +
                                                    mut_seq + '\n')
                    if path_fastafilepermut_dir is not None:
                        path_fastafilepermut = os.path.join(
                            path_fastafilepermut_dir,
                            mut_title + Str.FSTAEXT.value)
                        with open(path_fastafilepermut,
                                  'w') as path_fastafilepermut_open:
                            path_fastafilepermut_open.write('>' + mut_title +
                                                            '\n' + mut_seq +
                                                            '\n')
                    if path_seqscsv_open is not None:
                        path_seqscsv_open.write(mut_title + ':' + mut_seq +
                                                ',')
                    if path_seqstxt_open is not None:
                        path_seqstxt_open.write(mut_title + ':' + mut_seq +
                                                '\n')
            finally:
                for opened_file in opened_files:
                    opened_file.close()
示例#13
0
    def map_seq_to_swsprt_acc_id_and_write_files(
            path_input_fastafiles,
            path_output: str,
            write_idmaps_for_mysqldb: bool,
            write_csv=True,
            write_xml=True,
            write_json=False):
        """
        Maps the specified protein sequences in FASTA format to sequences in the SwissProt database, to find 100%
        identity hits. The results including the SwissProt Accession Id are written to file.

        On the cluster nothing is run in-process: one job per input path is written and submitted, each running
        run_blastp_zeus.py, and the returned list is empty. Otherwise each fasta file is run through Blastp,
        except files whose sequence is all A's and files for which a raw Blastp xml result already exists.
        :param path_input_fastafiles: Absolute path, or list of absolute paths, of the input fastafile(s).
        :param path_output: Absolute path of root directory for blastp output files (..../output_data/).
        :param write_idmaps_for_mysqldb: True to write id-map files mapping sequence ids to SwissProt accession
        numbers.
        :param write_csv: True to write csvfiles.
        :param write_xml: True to write xmlfiles.
        :param write_json: True to write jsonfiles.
        :return: List of dictionary data structure representations of each parsed & filtered Blastp run result.
        """
        if isinstance(path_input_fastafiles, str):
            path_input_fastafiles = [path_input_fastafiles]
        blastp_dict_list = []
        if GUM.using_cluster():
            # Running Blastp through Biopython did not work on the cluster (cause unknown), so the work is
            # handed to run_blastp_zeus.py through the job queue instead.
            path_script = os.path.join(Paths.SRC, 'run_blastp_zeus.py')
            for path_input in path_input_fastafiles:
                # The input has been normalised to a list above, so each path is passed to its own job;
                # concatenating the list itself into the command string would raise TypeError.
                python_script_w_paths = ' '.join([
                    path_script, path_input, path_output,
                    Paths.CONFIG_BLST_JOBQ, Paths.OUTPUT_BLASTP,
                    str(write_idmaps_for_mysqldb),
                    str(write_csv),
                    str(write_xml),
                    str(write_json)
                ])
                Cluster.write_job_q_bash(
                    jobname='IdProtJobs',
                    path_job_q_dir=Paths.CONFIG_BLST_JOBQ,
                    python_script_with_paths=python_script_w_paths)
                Cluster.run_job_q(path_job_q_dir=Paths.CONFIG_BLST_JOBQ)
            return blastp_dict_list

        for path_fastafile in path_input_fastafiles:
            with open(path_fastafile) as f:
                fasta_str = f.read()
            fastafilename = path_fastafile.split('/')[-1].split('.')[0]
            if IdProt._has_all_A_sequence(path_fastafile):
                print(
                    'This sequence has all As, BLAST would think it is a nucleotide sequence and fail. So it is '
                    'not being run: ' + path_fastafile)
                continue
            path_output_blastp_fastafilename = IdProt._build_dir_tree_with_intermed_dir(
                path_root=path_output,
                intermed_dir=Paths.DIR_BLASTP.value,
                fastadir=fastafilename)
            path_existing_xml = os.path.join(path_output_blastp_fastafilename,
                                             fastafilename + Str.XMLEXT.value)
            if os.path.exists(path_existing_xml):
                # A raw result for this file is already present; do not query Blastp again.
                continue
            blastp_result = Biopy.run_blastp(fasta_str)
            path_raw_blstp_xml = IdProt._write_raw_blast_xml(
                path_output, fastafilename, blastp_result)
            blastp_dict = Biopy.parse_filter_blastp_xml_to_dict(
                path_raw_blstp_xml, fastafilename, path_fastafile)
            blastp_dict_list.append(blastp_dict)
            if write_idmaps_for_mysqldb:
                IdProt._write_idmaps_for_mysqldb(path_output,
                                                 blastp_dict,
                                                 write_csv=write_csv,
                                                 write_xml=write_xml,
                                                 write_json=write_json)
        return blastp_dict_list
示例#14
0
__maintainer__ = "Shahin Zibaee"
__email__ = "*****@*****.**"
__status__ = "Development"

# Both inputs are taken positionally from the command line (see the description below).
path_fastafile = sys.argv[1]
path_dst = sys.argv[2]
"""
To run agadir on file that has multiple fasta sequences. Iterate through the list, write the sequence to an
individual text file passing this to agadir.compute(). This newly-written individual text file is then deleted,
crucial to prevent too much memory taken up by each file when running large numbers of sequences.
:sys.argv[1] path_fastafile: Abs path to fasta file.
:sys.argv[2] path_dst: Abs path to output root dir.
"""

print('run_agadir_on_multifastas_zeus.py ###################################')
# path_dst is rebound from the output root given on the command line to the path returned by the helper
# for this fasta file.
path_dst = GUM.make_path_agadir_3dots_filename_mutants_dirs(
    path_dst, path_fastafile, add_filename_subdir=True)
with open(path_fastafile) as f:
    is_first_line = True
    fasta_str = ''
    mutantfastafilename = ''
    mutantfastafile = ''
    for line in f.readlines():
        if '>' in line:
            if not is_first_line:
                path_dst_mutant_filename = GUM.os_makedirs(
                    path_dst, mutantfastafilename)
                path_dst_mutant_file = os.path.join(path_dst_mutant_filename,
                                                    mutantfastafile)
                with open(path_dst_mutant_file, 'w') as g:
                    g.write(fasta_str)
                agadir = Agadir(Cond.INCELL_MAML.value)
示例#15
0
import sys
import os
from src.tools.GeneralUtilityMethods import GUM
from src.enums.Paths import Paths
from src.enums.Str import Str

__author__ = "Shahin Zibaee"
__copyright__ = "Copyright 2018, The Switch lab, KU Leuven"
__license__ = "GPL"
__version__ = "1.0.0"
__maintainer__ = "Shahin Zibaee"
__email__ = "*****@*****.**"
__status__ = "Development"

# The input fasta path is taken from the first command-line argument.
path_fastafile = sys.argv[1]
# Destination directory as returned by the GUM helper for Paths.OUTPUT and the
# input file path.
path_dst = GUM.make_path_agadir_3dots_filename_mutants_dirs(
    Paths.OUTPUT, path_fastafile)

with open(path_fastafile) as f:
    fasta_str = ''  # text buffered for the record currently being read
    is_first_line = True  # stays True until the first header line is seen
    mutantfastafile = ''  # output filename for the current record
    for line in f.readlines():
        # Any line containing '>' is treated as the header of a new record.
        if '>' in line:
            # Before starting the new record, write out the previous one.
            # Skipped on the first header because nothing is buffered yet.
            if not is_first_line:
                path_dst_mutant_file = os.path.join(path_dst, mutantfastafile)
                with open(path_dst_mutant_file, 'w') as temp_fastafile:
                    temp_fastafile.write(fasta_str)
            # Restart the buffer with this header line.
            fasta_str = line
            is_first_line = False
            # Filename = text after the last '>' up to the first newline,
            # followed by Str.FSTAEXT.value.
            mutantfastafile = line.split('>')[-1].split(
                '\n')[0] + Str.FSTAEXT.value
            # NOTE(review): this excerpt shows no branch that appends
            # non-header lines to fasta_str and no write for the last record
            # after the loop - confirm both exist in the full script.
示例#16
0
# Warn (as a RuntimeWarning attributed to "KickOff", line 68) when there are
# no pdb files to work on; execution continues either way.
if not path_pdbfiles:
    warnings.warn_explicit(
        "No pdb files to process. (No problem if that is what you expected).",
        RuntimeWarning,
        "KickOff",
        68)
"""
5. Select specific mutants if you are only interested in these.
MAKE SURE TO SET THIS LIST TO EMPTY if the subsequent actions below should not be restricted to these mutants.
"""
# Example: specific_fxmutants = ['CA498L']
specific_fxmutants = []
# Check every requested mutant against every pdb file and stop at the first
# combination the GUM validator rejects.
for path_pdbfile in path_pdbfiles:
    for specific_fxmutant in specific_fxmutants:
        if GUM.is_valid_fxmutant_for_pdb(path_pdbfile, specific_fxmutant):
            continue
        raise ValueError(
            'The specified mutant ' + specific_fxmutant +
            ' is not valid for this pdb: ' + os.path.basename(path_pdbfile))
"""
6. Get the fasta files you want to run mutate_fasta or agadir on.
"""
path_fastafiles = []
# path_input_fastas_dir = Paths.INPUT_MUTS_MULTIFASTAS_29611_1000 + '/1...250/'
# path_fastafiles = sorted(glob.glob(path_input_fastas_dir + '/**/*.fasta', recursive=True))
# path_fastafiles = sorted(glob.glob(path_input_fastas_dir + '/*.fasta'))
if not path_fastafiles:
    warnings.warn_explicit(
        message=
        "No fasta files to process. (No problem if that is what you expected).",
        category=RuntimeWarning,