Example #1
    def add_features(self,
                     include_indep_genes=False,
                     include_joint_genes=True,
                     custom_pgen_model=None):
        """Generates a list of feature_lsts for a length dependent L pos model.

        Parameters
        ----------
        include_indep_genes : bool
            If True, independent V and J gene usage features are generated.

        include_joint_genes : bool
            If True, joint V/J gene usage features are generated.

        custom_pgen_model : string
            Path to folder of a custom OLGA model. If None, the default model
            for self.chain_type is used.

        """

        features = []
        L_features = [['l' + str(L)]
                      for L in range(self.min_L, self.max_L + 1)]
        features += L_features
        for L in range(self.min_L, self.max_L + 1):
            for i in range(L):
                for aa in self.amino_acids:
                    features.append(['l' + str(L), 'a' + aa + str(i)])

        if include_indep_genes or include_joint_genes:
            import olga.load_model as olga_load_model
            if custom_pgen_model is None:
                main_folder = os.path.join(os.path.dirname(__file__),
                                           'default_models', self.chain_type)
            else:
                main_folder = custom_pgen_model
            params_file_name = os.path.join(main_folder, 'model_params.txt')
            V_anchor_pos_file = os.path.join(main_folder,
                                             'V_gene_CDR3_anchors.csv')
            J_anchor_pos_file = os.path.join(main_folder,
                                             'J_gene_CDR3_anchors.csv')

            if self.vj: genomic_data = olga_load_model.GenomicDataVJ()
            else: genomic_data = olga_load_model.GenomicDataVDJ()
            genomic_data.load_igor_genomic_data(params_file_name,
                                                V_anchor_pos_file,
                                                J_anchor_pos_file)

            if include_indep_genes:
                features += [[v] for v in set([
                    gene_to_num_str(genV[0], 'V') for genV in genomic_data.genV
                ])]
                features += [[j] for j in set([
                    gene_to_num_str(genJ[0], 'J') for genJ in genomic_data.genJ
                ])]
            if include_joint_genes:
                features += [[v, j] for v in set([
                    gene_to_num_str(genV[0], 'V') for genV in genomic_data.genV
                ]) for j in set([
                    gene_to_num_str(genJ[0], 'J') for genJ in genomic_data.genJ
                ])]

        self.update_model(add_features=features)
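
A minimal standalone sketch of the same joint V/J feature construction, using only the OLGA calls shown above. The human_T_beta chain type and the use of OLGA's packaged default_models folder are assumptions for illustration; any IGoR-formatted model folder would do.

import os
import olga.load_model as olga_load_model

# Assumption: use the default human_T_beta model shipped with the olga package.
model_dir = os.path.join(os.path.dirname(olga_load_model.__file__),
                         'default_models', 'human_T_beta')
genomic_data = olga_load_model.GenomicDataVDJ()
genomic_data.load_igor_genomic_data(
    os.path.join(model_dir, 'model_params.txt'),
    os.path.join(model_dir, 'V_gene_CDR3_anchors.csv'),
    os.path.join(model_dir, 'J_gene_CDR3_anchors.csv'))

# genV/genJ entries start with the allele name, e.g. 'TRBV5-1*01'.
v_genes = sorted({genV[0].split('*')[0] for genV in genomic_data.genV})
j_genes = sorted({genJ[0].split('*')[0] for genJ in genomic_data.genJ})
joint_vj_features = [[v, j] for v in v_genes for j in j_genes]
print(len(v_genes), len(j_genes), len(joint_vj_features))
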
Example #2
    def __init__(self,
                 sonia_model=None,
                 include_genes=True,
                 processes=None,
                 custom_olga_model=None):

        if type(sonia_model) == str or sonia_model is None:
            print('ERROR: you need to pass a Sonia object')
            return

        self.sonia_model = sonia_model
        self.include_genes = include_genes

        # only count usable cpus
        # (mp.cpu_count() returns total number of cpus even if not all are available e.g. when running on cluster)
        if processes is None: self.processes = len(os.sched_getaffinity(0))
        else: self.processes = processes

        # define olga model
        if custom_olga_model is not None:
            self.pgen_model = custom_olga_model
        else:
            try:
                if self.sonia_model.custom_pgen_model is None:
                    main_folder = os.path.join(os.path.dirname(__file__),
                                               'default_models',
                                               self.sonia_model.chain_type)
                else:
                    main_folder = self.sonia_model.custom_pgen_model
            except:
                main_folder = os.path.join(os.path.dirname(__file__),
                                           'default_models',
                                           self.sonia_model.chain_type)

            params_file_name = os.path.join(main_folder, 'model_params.txt')
            marginals_file_name = os.path.join(main_folder,
                                               'model_marginals.txt')
            V_anchor_pos_file = os.path.join(main_folder,
                                             'V_gene_CDR3_anchors.csv')
            J_anchor_pos_file = os.path.join(main_folder,
                                             'J_gene_CDR3_anchors.csv')

            if self.sonia_model.vj:
                self.genomic_data = olga_load_model.GenomicDataVJ()
                self.genomic_data.load_igor_genomic_data(
                    params_file_name, V_anchor_pos_file, J_anchor_pos_file)
                self.generative_model = olga_load_model.GenerativeModelVJ()
                self.generative_model.load_and_process_igor_model(
                    marginals_file_name)
                self.pgen_model = pgen.GenerationProbabilityVJ(
                    self.generative_model, self.genomic_data)
            else:
                self.genomic_data = olga_load_model.GenomicDataVDJ()
                self.genomic_data.load_igor_genomic_data(
                    params_file_name, V_anchor_pos_file, J_anchor_pos_file)
                self.generative_model = olga_load_model.GenerativeModelVDJ()
                self.generative_model.load_and_process_igor_model(
                    marginals_file_name)
                self.pgen_model = pgen.GenerationProbabilityVDJ(
                    self.generative_model, self.genomic_data)
def generate_simulated_beta_seqs(
        params_file_name='tcrdist/default_models/human_T_beta/model_params.txt',
        marginals_file_name='tcrdist/default_models/human_T_beta/model_marginals.txt',
        V_anchor_pos_file='tcrdist/default_models/human_T_beta/V_gene_CDR3_anchors.csv',
        J_anchor_pos_file='tcrdist/default_models/human_T_beta/J_gene_CDR3_anchors.csv',
        output_cols=['cdr3_b_aa', "v_b_gene", 'j_b_gene'],
        n=100000):
    #Load data
    genomic_data = load_model.GenomicDataVDJ()
    genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file,
                                        J_anchor_pos_file)
    #Load model
    generative_model = load_model.GenerativeModelVDJ()
    generative_model.load_and_process_igor_model(marginals_file_name)
    seq_gen_model = seq_gen.SequenceGenerationVDJ(generative_model,
                                                  genomic_data)

    #Generate some random sequences

    vs = [x[0] for x in genomic_data.genV]
    js = [x[0] for x in genomic_data.genJ]
    vs = {i: k for i, k in enumerate(vs)}
    js = {i: k for i, k in enumerate(js)}

    sim_cdr3 = [seq_gen_model.gen_rnd_prod_CDR3()[1:4] for x in range(n)]
    sim_cdr3_long = [(i, vs[v], js[j]) for i, v, j in sim_cdr3]

    df = pd.DataFrame(sim_cdr3_long, columns=output_cols)
    return df
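
A short usage sketch for the function above; it assumes the tcrdist default_models files named in the default arguments are present relative to the working directory.

# Usage sketch (assumption: the default tcrdist model paths above exist on disk).
df_sim = generate_simulated_beta_seqs(n=1000)
print(df_sim.head())
print(df_sim['v_b_gene'].value_counts().head())
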
Example #4
def compute_pgen(index, seq):
    index_ = int(index)
    main_folder = os.path.join(local_directory, 'default_models',
                               options_of[index_])
    params_file_name = os.path.join(main_folder, 'model_params.txt')
    marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    if options_of[index_] in vj_chains:
        genomic_data = olga_load_model.GenomicDataVJ()
        genomic_data.load_igor_genomic_data(params_file_name,
                                            V_anchor_pos_file,
                                            J_anchor_pos_file)
        generative_model = olga_load_model.GenerativeModelVJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        pgen_model = pgen.GenerationProbabilityVJ(generative_model,
                                                  genomic_data)
    else:
        genomic_data = olga_load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name,
                                            V_anchor_pos_file,
                                            J_anchor_pos_file)
        generative_model = olga_load_model.GenerativeModelVDJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        pgen_model = pgen.GenerationProbabilityVDJ(generative_model,
                                                   genomic_data)

    return pgen_model.compute_aa_CDR3_pgen(seq[0], seq[1],
                                           seq[2]) / norms[index_][0]
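
For reference, the underlying OLGA call without the module-level globals (local_directory, options_of, norms): a usage sketch assuming pgen_model was built as in the VDJ branch above; the CDR3 and gene names are illustrative values only.

# Usage sketch (assumption: pgen_model built as in the VDJ branch above).
cdr3_aa = 'CASSLAPGATNEKLFF'  # illustrative CDR3 amino-acid sequence
print(pgen_model.compute_aa_CDR3_pgen(cdr3_aa))                        # unrestricted
print(pgen_model.compute_aa_CDR3_pgen(cdr3_aa, 'TRBV5-1', 'TRBJ1-4'))  # conditioned on V/J
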
Example #5
    def __init__(self,sonia_model=None,include_genes=True,processes=None,custom_olga_model=None):

        if type(sonia_model)==str or sonia_model is None:
            print('ERROR: you need to pass a Sonia object')
            return

        self.sonia_model=sonia_model
        self.include_genes=include_genes
        if processes is None: self.processes = mp.cpu_count()
        else: self.processes = processes

        # define olga model
        if custom_olga_model is not None:
            self.pgen_model = custom_olga_model
        else:
            main_folder=os.path.join(os.path.dirname(__file__), 'default_models', self.sonia_model.chain_type)

            params_file_name = os.path.join(main_folder,'model_params.txt')
            marginals_file_name = os.path.join(main_folder,'model_marginals.txt')
            V_anchor_pos_file = os.path.join(main_folder,'V_gene_CDR3_anchors.csv')
            J_anchor_pos_file = os.path.join(main_folder,'J_gene_CDR3_anchors.csv')

            if self.sonia_model.vj:
                genomic_data = olga_load_model.GenomicDataVJ()
                genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
                generative_model = olga_load_model.GenerativeModelVJ()
                generative_model.load_and_process_igor_model(marginals_file_name)
                self.pgen_model = pgen.GenerationProbabilityVJ(generative_model, genomic_data)
            else:
                genomic_data = olga_load_model.GenomicDataVDJ()
                genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
                generative_model = olga_load_model.GenerativeModelVDJ()
                generative_model.load_and_process_igor_model(marginals_file_name)
                self.pgen_model = pgen.GenerationProbabilityVDJ(generative_model, genomic_data)
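
This version falls back to mp.cpu_count(), while Example #2 counts only usable CPUs with os.sched_getaffinity; a small sketch of a portable way to combine the two (sched_getaffinity is not available on every platform):

import multiprocessing as mp
import os

def usable_cpu_count():
    # Prefer the affinity mask (respects cluster/cgroup limits) when available.
    try:
        return len(os.sched_getaffinity(0))
    except AttributeError:  # platforms without sched_getaffinity, e.g. macOS/Windows
        return mp.cpu_count()

print(usable_cpu_count())
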
Example #6
def sample_olga(num_gen_seqs=1, chain_index=0, ppost=False, seed=None):
    if seed is not None: np.random.seed(seed)
    else: np.random.seed()

    num_gen_seqs = np.min([num_gen_seqs, 1000])
    chain_type = options_of[chain_index]
    main_folder = os.path.join(local_directory, 'default_models', chain_type)
    params_file_name = os.path.join(main_folder, 'model_params.txt')
    marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    if options_of[chain_index] in vj_chains:
        genomic_data = olga_load_model.GenomicDataVJ()
        genomic_data.load_igor_genomic_data(params_file_name,
                                            V_anchor_pos_file,
                                            J_anchor_pos_file)
        generative_model = olga_load_model.GenerativeModelVJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        sg_model = seq_gen.SequenceGenerationVJ(generative_model, genomic_data)
    else:
        genomic_data = olga_load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name,
                                            V_anchor_pos_file,
                                            J_anchor_pos_file)
        generative_model = olga_load_model.GenerativeModelVDJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        sg_model = seq_gen.SequenceGenerationVDJ(generative_model,
                                                 genomic_data)

    if not bool(ppost):
        return [
            [
                seq[0], seq[1], genomic_data.genV[seq[2]][0].split('*')[0],
                genomic_data.genJ[seq[3]][0].split('*')[0]
            ] for seq in
            [sg_model.gen_rnd_prod_CDR3() for _ in range(int(num_gen_seqs))]
        ]
    else:
        qm = MinimalSonia(qfiles[chain_index], norms[chain_index][1])
        seqs_post = [['a', 'b', 'c', 'd']]  # initialize
        while len(seqs_post) < num_gen_seqs:
            seqs = [[
                seq[0], seq[1], genomic_data.genV[seq[2]][0].split('*')[0],
                genomic_data.genJ[seq[3]][0].split('*')[0]
            ] for seq in [
                sg_model.gen_rnd_prod_CDR3()
                for _ in range(int(11 * num_gen_seqs))
            ]]
            Qs = qm.compute_sel_factor(list(np.array(seqs)[:, 1:]))
            random_samples = np.random.uniform(
                size=len(Qs))  # sample from uniform distribution
            #do rejection
            rejection_selection = random_samples < np.clip(Qs, 0, 10) / 10.
            print(
                np.sum(rejection_selection) / float(len(rejection_selection)))
            seqs_post = np.concatenate(
                [seqs_post, np.array(seqs)[rejection_selection]])
        return seqs_post[1:num_gen_seqs + 1]
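
The rejection step above can be isolated into a self-contained sketch; the Q values here are synthetic stand-ins for the selection factors returned by MinimalSonia.compute_sel_factor.

import numpy as np

Qs = np.random.lognormal(mean=0.0, sigma=1.0, size=10000)  # synthetic selection factors
rejection_bound = 10.0  # same clip bound as above

u = np.random.uniform(size=len(Qs))
accepted = u < np.clip(Qs, 0, rejection_bound) / rejection_bound
print('acceptance rate:', accepted.mean())
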
Example #7
    def __init__(self,
                 sonia_model=None,
                 custom_olga_model=None,
                 custom_genomic_data=None):

        if type(sonia_model) == str or sonia_model is None:
            print('ERROR: you need to pass a Sonia object')
            return

        self.sonia_model = sonia_model  # sonia model passed as an argument

        # define olga sequence_generation model
        if custom_olga_model is not None:
            if custom_genomic_data is None:
                print('ERROR: you need to pass also the custom_genomic_data')
                return
            self.genomic_data = custom_genomic_data
            self.seq_gen_model = custom_olga_model
        else:

            main_folder = os.path.join(
                os.path.dirname(olga_load_model.__file__), 'default_models',
                self.sonia_model.chain_type)

            params_file_name = os.path.join(main_folder, 'model_params.txt')
            marginals_file_name = os.path.join(main_folder,
                                               'model_marginals.txt')
            V_anchor_pos_file = os.path.join(main_folder,
                                             'V_gene_CDR3_anchors.csv')
            J_anchor_pos_file = os.path.join(main_folder,
                                             'J_gene_CDR3_anchors.csv')

            if self.sonia_model.chain_type != 'human_T_alpha':
                self.genomic_data = olga_load_model.GenomicDataVDJ()
                self.genomic_data.load_igor_genomic_data(
                    params_file_name, V_anchor_pos_file, J_anchor_pos_file)
                generative_model = olga_load_model.GenerativeModelVDJ()
                generative_model.load_and_process_igor_model(
                    marginals_file_name)
                self.seq_gen_model = seq_gen.SequenceGenerationVDJ(
                    generative_model, self.genomic_data)

            else:
                self.genomic_data = olga_load_model.GenomicDataVJ()
                self.genomic_data.load_igor_genomic_data(
                    params_file_name, V_anchor_pos_file, J_anchor_pos_file)
                generative_model = olga_load_model.GenerativeModelVJ()
                generative_model.load_and_process_igor_model(
                    marginals_file_name)

                self.seq_gen_model = seq_gen.SequenceGenerationVJ(
                    generative_model, self.genomic_data)

        # you need Z for rejection selection and generate sequences ppost --> compute only once
        self.energies_gen = self.sonia_model.compute_energy(
            self.sonia_model.gen_seq_features)
        self.Z = np.sum(np.exp(-self.energies_gen)) / len(self.energies_gen)
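
The normalization Z computed in the last two lines is simply the sample mean of exp(-energy) over generated sequences; a tiny numpy sketch with synthetic energies:

import numpy as np

energies_gen = np.random.normal(size=100000)             # stand-in for Sonia energies
Z = np.sum(np.exp(-energies_gen)) / len(energies_gen)    # estimator used above
assert np.isclose(Z, np.mean(np.exp(-energies_gen)))     # equivalent formulation
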
Example #8
    def add_features(self, custom_pgen_model=None):
        """Generates a list of feature_lsts for L/R pos model.

        Parameters
        ----------
        custom_pgen_model : string
            Path to folder of a custom OLGA model. If None, the default model
            for self.chain_type is used.

        """
        features = []
        L_features = [['l' + str(L)] for L in range(1, self.max_L + 1)]

        import olga.load_model as olga_load_model
        if custom_pgen_model is None:
            main_folder = os.path.join(os.path.dirname(__file__),
                                       'default_models', self.chain_type)
        else:
            main_folder = custom_pgen_model
        params_file_name = os.path.join(main_folder, 'model_params.txt')
        V_anchor_pos_file = os.path.join(main_folder,
                                         'V_gene_CDR3_anchors.csv')
        J_anchor_pos_file = os.path.join(main_folder,
                                         'J_gene_CDR3_anchors.csv')

        if self.vj: genomic_data = olga_load_model.GenomicDataVJ()
        else: genomic_data = olga_load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name,
                                            V_anchor_pos_file,
                                            J_anchor_pos_file)

        if self.joint_vjl:
            features += [[v, j, 'l' + str(l)] for v in set([
                'v' + genV[0].split('*')[0].split('V')[-1]
                for genV in genomic_data.genV
            ]) for j in set([
                'j' + genJ[0].split('*')[0].split('J')[-1]
                for genJ in genomic_data.genJ
            ]) for l in range(1, self.max_L + 1)]
        elif self.include_indep_genes:
            features += L_features
            features += [[v] for v in set(
                [gene_to_num_str(genV[0], 'V') for genV in genomic_data.genV])]
            features += [[j] for j in set(
                [gene_to_num_str(genJ[0], 'J') for genJ in genomic_data.genJ])]
        elif self.include_joint_genes:
            features += L_features
            features += [[v, j] for v in set([
                gene_to_num_str(genV[0], 'V') for genV in genomic_data.genV
            ]) for j in set(
                [gene_to_num_str(genJ[0], 'J') for genJ in genomic_data.genJ])]

        self.update_model(add_features=features)
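
The joint V/J/L branch builds its feature names by stripping the allele suffix and the locus prefix from IGoR gene names; a quick sketch of that string manipulation on illustrative allele names:

gen_v_name = 'TRBV5-1*01'   # illustrative allele name as stored in genomic_data.genV
gen_j_name = 'TRBJ1-4*01'

v_feature = 'v' + gen_v_name.split('*')[0].split('V')[-1]   # -> 'v5-1'
j_feature = 'j' + gen_j_name.split('*')[0].split('J')[-1]   # -> 'j1-4'
l_feature = 'l' + str(15)
print([v_feature, j_feature, l_feature])                    # ['v5-1', 'j1-4', 'l15']
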
Example #9
	def define_olga_models(self,olga_model=None):
		"""Defines Olga pgen and seqgen models and keeps them as attributes.

		Parameters
		----------
		olga_model: string
			Path to a folder specifying a custom IGoR formatted model to be
			used as a generative model. Folder must contain 'model_params.txt',
			'model_marginals.txt', 'V_gene_CDR3_anchors.csv' and 'J_gene_CDR3_anchors.csv'.


		Attributes set
		--------------
		genomic_data: object
			genomic data associated with the olga model.

		pgen_model: object
			olga model for evaluation of pgen.

		seq_gen_model: object
			olga model for generation of seqs.

		"""


		#Load generative model
		if olga_model is not None:
			try:
				# relative path
				pathdir= os.getcwd()
				main_folder = os.path.join(pathdir,olga_model)
				os.path.isfile(os.path.join(main_folder,'model_params.txt'))
			except:
				# absolute path
				main_folder=olga_model
		else:
			main_folder=os.path.join(os.path.dirname(olga_load_model.__file__), 'default_models', self.chain_type)

		params_file_name = os.path.join(main_folder,'model_params.txt')
		marginals_file_name = os.path.join(main_folder,'model_marginals.txt')
		V_anchor_pos_file = os.path.join(main_folder,'V_gene_CDR3_anchors.csv')
		J_anchor_pos_file = os.path.join(main_folder,'J_gene_CDR3_anchors.csv')

		genomic_data = olga_load_model.GenomicDataVDJ()
		genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
		self.genomic_data=genomic_data
		generative_model = olga_load_model.GenerativeModelVDJ()
		generative_model.load_and_process_igor_model(marginals_file_name)        

		self.pgen_model = pgen.GenerationProbabilityVDJ(generative_model, genomic_data)
		self.pgen_model.V_mask_mapping=self.complement_V_mask(self.pgen_model)

		self.seq_gen_model = seq_gen.SequenceGenerationVDJ(generative_model, genomic_data)
    def add_features(self,
                     include_indep_genes=False,
                     include_joint_genes=True,
                     custom_pgen_model=None):
        """Generates a list of feature_lsts for a length dependent L pos model.
        
        
        Parameters
        ----------
        min_L : int
            Minimum length CDR3 sequence
        max_L : int
            Maximum length CDR3 sequence
        include_genes : bool
            If true, features for gene selection are also generated. Currently
            joint V/J pairs used.
                
        """

        import olga.load_model as olga_load_model
        features = []

        if custom_pgen_model is None:
            main_folder = os.path.join(
                os.path.dirname(olga_load_model.__file__), 'default_models',
                self.chain_type)
        else:
            main_folder = custom_pgen_model

        params_file_name = os.path.join(main_folder, 'model_params.txt')
        V_anchor_pos_file = os.path.join(main_folder,
                                         'V_gene_CDR3_anchors.csv')
        J_anchor_pos_file = os.path.join(main_folder,
                                         'J_gene_CDR3_anchors.csv')

        genomic_data = olga_load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name,
                                            V_anchor_pos_file,
                                            J_anchor_pos_file)

        features += [[v, j, 'l' + str(l)] for v in set([
            'v' + genV[0].split('*')[0].split('V')[-1]
            for genV in genomic_data.genV
        ]) for j in set([
            'j' + genJ[0].split('*')[0].split('J')[-1]
            for genJ in genomic_data.genJ
        ]) for l in range(self.min_L, self.max_L + 1)]

        self.update_model(add_features=features)
Example #11
    def define_olga_models(self, olga_model=None):
        """
		Defines Olga pgen and seqgen models and keeps them as attributes.

		"""
        import olga.load_model as load_model
        import olga.generation_probability as pgen
        import olga.sequence_generation as seq_gen

        #Load generative model
        if olga_model is not None:
            try:
                # relative path
                pathdir = os.getcwd()
                main_folder = os.path.join(pathdir, olga_model)
                os.path.isfile(os.path.join(main_folder, 'model_params.txt'))
            except:
                # absolute path
                main_folder = olga_model
        else:
            main_folder = os.path.join(os.path.dirname(load_model.__file__),
                                       'default_models', self.chain_type)

        params_file_name = os.path.join(main_folder, 'model_params.txt')
        marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
        V_anchor_pos_file = os.path.join(main_folder,
                                         'V_gene_CDR3_anchors.csv')
        J_anchor_pos_file = os.path.join(main_folder,
                                         'J_gene_CDR3_anchors.csv')

        genomic_data = load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name,
                                            V_anchor_pos_file,
                                            J_anchor_pos_file)
        self.genomic_data = genomic_data
        generative_model = load_model.GenerativeModelVDJ()
        generative_model.load_and_process_igor_model(marginals_file_name)

        self.pgen_model = pgen.GenerationProbabilityVDJ(
            generative_model, genomic_data)
        self.pgen_model.V_mask_mapping = self.complement_V_mask(
            self.pgen_model)

        self.seq_gen_model = seq_gen.SequenceGenerationVDJ(
            generative_model, genomic_data)
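
With pgen_model, seq_gen_model and genomic_data set as attributes, drawing a productive CDR3 and decoding its gene indices follows the same pattern used elsewhere in these examples; a usage sketch, assuming obj is an instance on which define_olga_models has already been called:

# Usage sketch (assumption: obj.define_olga_models() has populated the attributes).
nt_seq, aa_seq, v_index, j_index = obj.seq_gen_model.gen_rnd_prod_CDR3()
v_gene = obj.genomic_data.genV[v_index][0].split('*')[0]
j_gene = obj.genomic_data.genJ[j_index][0].split('*')[0]
print(aa_seq, v_gene, j_gene, obj.pgen_model.compute_aa_CDR3_pgen(aa_seq))
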
Example #12
    def _load_params(self, model_params):
        """Private function for loading in the genomic parameter data for the IGoR model.

        Parameters
        ----------
        model_params : str
            A file path location for the IGoR parameters model file.

        Returns
        -------
        GenomicDataVJ or GenomicDataVDJ OLGA object
            The genomic data object class for a VJ or VDJ model.

        Raises
        ------
        TypeError
            When the model input data cannot be loaded in as either a VJ or VDJ model.
        OSError
            When OLGA produces a system error with the input data.

        """
        # Try to load the genomic data model for VDJ.
        try:
            genomic_data = None
            if self.type == 'VDJ':
                genomic_data = olga_load_model.GenomicDataVDJ()
                genomic_data.genD = olga_load_model.read_igor_D_gene_parameters(
                    model_params)
            elif self.type == 'VJ':
                genomic_data = olga_load_model.GenomicDataVJ()
            else:
                raise TypeError(
                    "Model genomic data could not be loaded as 'VDJ' or 'VJ' type"
                )

            # Load the V and J gene data (shared by both model types) and return.
            genomic_data.genV = olga_load_model.read_igor_V_gene_parameters(
                model_params)
            genomic_data.genJ = olga_load_model.read_igor_J_gene_parameters(
                model_params)
            return genomic_data

        except Exception as err:
            raise OSError(err)
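
olga.load_model also exposes the raw readers used above as module-level functions; a minimal sketch that loads only the gene parameter blocks from an IGoR model_params.txt (the path is an assumption):

import olga.load_model as olga_load_model

model_params = '/path/to/model_params.txt'  # assumption: an IGoR-formatted params file
genV = olga_load_model.read_igor_V_gene_parameters(model_params)
genJ = olga_load_model.read_igor_J_gene_parameters(model_params)
genD = olga_load_model.read_igor_D_gene_parameters(model_params)  # VDJ models only
print(len(genV), len(genD), len(genJ))
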
Example #13
def return_genes(index):
    main_folder = os.path.join(local_directory, 'default_models',
                               options_of[index])
    params_file_name = os.path.join(main_folder, 'model_params.txt')
    marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

    if options_of[index] in vj_chains:
        genomic_data = olga_load_model.GenomicDataVJ()
        genomic_data.load_igor_genomic_data(params_file_name,
                                            V_anchor_pos_file,
                                            J_anchor_pos_file)
    else:
        genomic_data = olga_load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name,
                                            V_anchor_pos_file,
                                            J_anchor_pos_file)

    #select out genes
    gene_v = np.unique([value[0].split('*')[0] for value in genomic_data.genV])
    gene_j = np.unique([value[0].split('*')[0] for value in genomic_data.genJ])
    gene_v = list(gene_v)
    gene_j = list(gene_j)
    #select out bad genes
    if options_of[index] == 'human_T_beta': return gene_v, gene_j
    elif options_of[index] == 'human_T_alpha':
        bad_vs = ['TRAV8-4', 'TRAV3', 'TRAV26-2']
        bad_js = ['TRAJ9', 'TRAJ58']
        for v in bad_vs:
            gene_v.remove(v)
        for j in bad_js:
            gene_j.remove(j)
        return gene_v, gene_j
    elif options_of[index] == 'human_B_heavy':
        bad_vs = ['IGHV1-8', 'IGHV3-9', 'IGHV4-31', 'IGHV4-30-4']
        for v in bad_vs:
            gene_v.remove(v)
        return gene_v, gene_j
    else:
        return gene_v, gene_j
Example #14
def compute_all_pgens(seqs, model=None, processes=None, include_genes=True):
    '''
	Compute Pgen of sequences using OLGA
	'''
    #Load OLGA for seq pgen estimation
    if model is None:
        import olga.load_model as load_model
        import olga.generation_probability as pgen

        main_folder = os.path.join(os.path.dirname(load_model.__file__),
                                   'default_models', chain_type)
        params_file_name = os.path.join(main_folder, 'model_params.txt')
        marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
        V_anchor_pos_file = os.path.join(main_folder,
                                         'V_gene_CDR3_anchors.csv')
        J_anchor_pos_file = os.path.join(main_folder,
                                         'J_gene_CDR3_anchors.csv')

        genomic_data = load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name,
                                            V_anchor_pos_file,
                                            J_anchor_pos_file)
        generative_model = load_model.GenerativeModelVDJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        model_pgen = pgen.GenerationProbabilityVDJ(generative_model,
                                                   genomic_data)

    # every process needs to access this vector, for sure there is a smarter way to implement this.
    final_models = [model for i in range(len(seqs))]

    pool = mp.Pool(processes=processes)
    if include_genes:
        f = pool.map(compute_pgen_expand, zip(seqs, final_models))
        pool.close()
        return f
    else:
        f = pool.map(compute_pgen_expand_novj, zip(seqs, final_models))
        pool.close()
        return f
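
compute_all_pgens fans the work out with multiprocessing.Pool; a self-contained sketch of the same map pattern with a picklable, top-level worker (square stands in for compute_pgen_expand, which is not shown here):

import multiprocessing as mp

def square(x):  # stand-in worker; must be defined at top level so it can be pickled
    return x * x

if __name__ == '__main__':
    with mp.Pool(processes=4) as pool:
        results = pool.map(square, range(10))
    print(results)
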
Example #15
def main():
    """ Evaluate sequences."""
    parser = OptionParser(conflict_handler="resolve")
    
    #specify model
    parser.add_option('--humanTRA', '--human_T_alpha', action='store_true', dest='humanTRA', default=False, help='use default human TRA model (T cell alpha chain)')
    parser.add_option('--humanTRB', '--human_T_beta', action='store_true', dest='humanTRB', default=False, help='use default human TRB model (T cell beta chain)')
    parser.add_option('--mouseTRB', '--mouse_T_beta', action='store_true', dest='mouseTRB', default=False, help='use default mouse TRB model (T cell beta chain)')
    parser.add_option('--humanIGH', '--human_B_heavy', action='store_true', dest='humanIGH', default=False, help='use default human IGH model (B cell heavy chain)')
    parser.add_option('--humanIGK', '--human_B_kappa', action='store_true', dest='humanIGK', default=False, help='use default human IGK model (B cell light kappa chain)')
    parser.add_option('--humanIGL', '--human_B_lambda', action='store_true', dest='humanIGL', default=False, help='use default human IGL model (B cell light lambda chain)')
    parser.add_option('--mouseTRA', '--mouse_T_alpha', action='store_true', dest='mouseTRA', default=False, help='use default mouse TRA model (T cell alpha chain)')

    parser.add_option('--set_custom_model_VDJ', dest='vdj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VDJ generative model')
    parser.add_option('--set_custom_model_VJ', dest='vj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VJ generative model')
    parser.add_option('--sonia_model', type='string', default = 'leftright', dest='model_type' ,help=' specify model type: leftright or lengthpos, default is leftright')
    parser.add_option('--ppost', '--Ppost', action='store_true', dest='ppost', default=False, help='compute Ppost, also computes pgen and Q')
    parser.add_option('--pgen', '--Pgen', action='store_true', dest='pgen', default=False, help='compute pgen')
    parser.add_option('--Q', '--selection_factor', action='store_true', dest='Q', default=False, help='compute Q')
    parser.add_option('--recompute_productive_norm', '--compute_norm', action='store_true', dest='recompute_productive_norm', default=False, help='recompute productive normalization')
    parser.add_option('--skip_off','--skip_empty_off', action='store_false', dest = 'skip_empty', default=True, help='stop skipping empty or blank sequences/lines (if for example you want to keep line index fidelity between the infile and outfile).')

    parser.add_option('-s','--chunk_size', type='int',metavar='N', dest='chunck_size', default = mp.cpu_count()*int(5e2), help='Number of sequences to evaluate at each iteration')

    #vj genes
    parser.add_option('--v_in', '--v_mask_index', type='int', metavar='INDEX', dest='V_mask_index', default=None, help='specifies V_masks are found in column INDEX in the input file. Default is None (do not condition on J usage).')
    parser.add_option('--j_in', '--j_mask_index', type='int', metavar='INDEX', dest='J_mask_index', default=None, help='specifies J_masks are found in column INDEX in the input file. Default is None (do not condition on J usage).')
    parser.add_option('--v_mask', type='string', dest='V_mask', help='specify V usage to condition as arguments.')
    parser.add_option('--j_mask', type='string', dest='J_mask', help='specify J usage to condition as arguments.')

    # input output
    parser.add_option('-i', '--infile', dest = 'infile_name',metavar='PATH/TO/FILE', help='read in CDR3 sequences (and optionally V/J masks) from PATH/TO/FILE')
    parser.add_option('-o', '--outfile', dest = 'outfile_name', metavar='PATH/TO/FILE', help='write CDR3 sequences and pgens to PATH/TO/FILE')
    parser.add_option('--seq_in', '--seq_index', type='int', metavar='INDEX', dest='seq_in_index', default = 0, help='specifies sequences to be read in are in column INDEX. Default is index 0 (the first column).')
    parser.add_option('-m', '--max_number_of_seqs', type='int',metavar='N', dest='max_number_of_seqs', help='evaluate for at most N sequences.')
    parser.add_option('--lines_to_skip', type='int',metavar='N', dest='lines_to_skip', default = 0, help='skip the first N lines of the file. Default is 0.')

    
    #delimiters
    parser.add_option('-d', '--delimiter', type='choice', dest='delimiter',  choices=['tab', 'space', ',', ';', ':'], help="declare infile delimiter. Default is tab for .tsv input files, comma for .csv files, and any whitespace for all others. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('--raw_delimiter', type='str', dest='delimiter', help="declare infile delimiter as a raw string.")
    parser.add_option('--delimiter_out', type='choice', dest='delimiter_out',  choices=['tab', 'space', ',', ';', ':'], help="declare outfile delimiter. Default is tab for .tsv output files, comma for .csv files, and the infile delimiter for all others. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('--raw_delimiter_out', type='str', dest='delimiter_out', help="declare for the delimiter outfile as a raw string.")
    parser.add_option('--gene_mask_delimiter', type='choice', dest='gene_mask_delimiter',  choices=['tab', 'space', ',', ';', ':'], help="declare gene mask delimiter. Default comma unless infile delimiter is comma, then default is a semicolon. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('--raw_gene_mask_delimiter', type='str', dest='gene_mask_delimiter', help="declare delimiter of gene masks as a raw string.")
    parser.add_option('--comment_delimiter', type='str', dest='comment_delimiter', help="character or string to indicate comment or header lines to skip.")

    (options, args) = parser.parse_args()

    #Check that the model is specified properly
    main_folder = os.path.dirname(__file__)

    default_models = {}
    default_models['humanTRA'] = [os.path.join(main_folder, 'default_models', 'human_T_alpha'),  'VJ']
    default_models['humanTRB'] = [os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ']
    default_models['mouseTRB'] = [os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ']
    default_models['humanIGH'] = [os.path.join(main_folder, 'default_models', 'human_B_heavy'), 'VDJ']
    default_models['humanIGK'] = [os.path.join(main_folder, 'default_models', 'human_B_kappa'), 'VJ']
    default_models['humanIGL'] = [os.path.join(main_folder, 'default_models', 'human_B_lambda'),  'VJ']
    default_models['mouseTRA'] = [os.path.join(main_folder, 'default_models', 'mouse_T_alpha'), 'VJ']

    num_models_specified = sum([1 for x in list(default_models.keys()) + ['vj_model_folder', 'vdj_model_folder'] if getattr(options, x)])
    recompute_productive_norm=False
    if num_models_specified == 1: #exactly one model specified
        try:
            d_model = [x for x in default_models.keys() if getattr(options, x)][0]
            model_folder = default_models[d_model][0]
            recomb_type = default_models[d_model][1]
        except IndexError:
            if options.vdj_model_folder: #custom VDJ model specified
                recompute_productive_norm=True
                model_folder = options.vdj_model_folder
                recomb_type = 'VDJ'
            elif options.vj_model_folder: #custom VJ model specified
                recompute_productive_norm=True
                model_folder = options.vj_model_folder
                recomb_type = 'VJ'
    elif num_models_specified == 0:
        print('Need to indicate generative model.')
        print('Exiting...')
        return -1
    elif num_models_specified > 1:
        print('Only specify one model')
        print('Exiting...')
        return -1

    #Generative model specification -- note we'll probably change this syntax to
    #allow for arbitrary model file specification
    params_file_name = os.path.join(model_folder,'model_params.txt')
    marginals_file_name = os.path.join(model_folder,'model_marginals.txt')
    V_anchor_pos_file = os.path.join(model_folder,'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(model_folder,'J_gene_CDR3_anchors.csv')

    for x in [params_file_name, marginals_file_name, V_anchor_pos_file, J_anchor_pos_file]:
            if not os.path.isfile(x):
                print('Cannot find: ' + x)
                print('Please check the files (and naming conventions) in the model folder ' + model_folder)
                print('Exiting...')
                return -1

    #Load up model based on recomb_type
    #VDJ recomb case --- used for TCRB and IGH
    if recomb_type == 'VDJ':
        genomic_data = olga_load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
        generative_model = olga_load_model.GenerativeModelVDJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        pgen_model = generation_probability.GenerationProbabilityVDJ(generative_model, genomic_data)
    #VJ recomb case --- used for TCRA and light chain
    elif recomb_type == 'VJ':
        genomic_data = olga_load_model.GenomicDataVJ()
        genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
        generative_model = olga_load_model.GenerativeModelVJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        pgen_model = generation_probability.GenerationProbabilityVJ(generative_model, genomic_data)

    if options.infile_name is not None:
        infile_name = options.infile_name

        if not os.path.isfile(infile_name):
            print('Cannot find input file: ' + infile_name)
            print('Exiting...')
            return -1

    if options.outfile_name is not None:
        outfile_name = options.outfile_name
#        if os.path.isfile(outfile_name):
#            if not input(outfile_name + ' already exists. Overwrite (y/n)? ').strip().lower() in ['y', 'yes']:
#                print('Exiting...')
#                return -1

    #Parse delimiter
    delimiter = options.delimiter
    if delimiter is None: #Default case
        if options.infile_name is None:
            delimiter = '\t'
        elif infile_name.endswith('.tsv'): #parse TAB separated value file
            delimiter = '\t'
        elif infile_name.endswith('.csv'): #parse COMMA separated value file
            delimiter = ','
    else:
        try:
            delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter]
        except KeyError:
            pass #Other string passed as the delimiter.

    #Parse delimiter_out
    delimiter_out = options.delimiter_out
    if delimiter_out is None: #Default case
        if delimiter is None:
            delimiter_out = '\t'
        else:
            delimiter_out = delimiter
        if options.outfile_name is None:
            pass
        elif outfile_name.endswith('.tsv'): #output TAB separated value file
            delimiter_out = '\t'
        elif outfile_name.endswith('.csv'): #output COMMA separated value file
            delimiter_out = ','
    else:
        try:
            delimiter_out = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter_out]
        except KeyError:
            pass #Other string passed as the delimiter.

    #Parse gene_delimiter
    gene_mask_delimiter = options.gene_mask_delimiter
    if gene_mask_delimiter is None: #Default case
        gene_mask_delimiter = ','
        if delimiter == ',':
            gene_mask_delimiter = ';'
    else:
        try:
            gene_mask_delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[gene_mask_delimiter]
        except KeyError:
            pass #Other string passed as the delimiter.

    #More options
    seq_in_index = options.seq_in_index #where in the line the sequence is after line.split(delimiter)
    lines_to_skip = options.lines_to_skip #one method of skipping header
    comment_delimiter = options.comment_delimiter #another method of skipping header
    max_number_of_seqs = options.max_number_of_seqs
    V_mask_index = options.V_mask_index #Default is not conditioning on V identity
    J_mask_index = options.J_mask_index #Default is not conditioning on J identity
    skip_empty = options.skip_empty

    #print(V_mask_index,J_mask_index,seq_in_index,gene_mask_delimiter,delimiter)
    
    # choose sonia model type
    sonia_model=SoniaLeftposRightpos(feature_file=os.path.join(model_folder,'features.tsv'),log_file=os.path.join(model_folder,'log.txt'),vj=recomb_type == 'VJ',custom_pgen_model=model_folder)
    if options.recompute_productive_norm: 
        print('Recompute productive normalization.')
        sonia_model.norm_productive=pgen_model.compute_regex_CDR3_template_pgen('CX{0,}')

    # load Evaluate model class
    ev=EvaluateModel(sonia_model, custom_olga_model=pgen_model,
                     include_genes=False if ((V_mask_index is None) and (J_mask_index is None)) else True)

    if options.infile_name is None: #No infile specified -- args should be the input seq
        print_warnings = True
        if len(args)>1 : 
            print('ERROR: can process only one sequence at a time. Submit through a file instead.')
            return -1
        seq=args[0]

        #Format V and J masks -- uniform for all argument input sequences
 
        try:
            V_mask = options.V_mask.split(',')
            unrecognized_v_genes = [v for v in V_mask if gene_to_num_str(v, 'V') not in pgen_model.V_mask_mapping.keys()]
            V_mask = [v for v in V_mask if gene_to_num_str(v, 'V') in pgen_model.V_mask_mapping.keys()]
            if len(unrecognized_v_genes) > 0:
                print('These V genes/alleles are not recognized: ' + ', '.join(unrecognized_v_genes))
            if len(V_mask) == 0:
                print('No recognized V genes/alleles in the provided V_mask. Continuing without conditioning on V usage.')
                V_mask = None
        except AttributeError:
            V_mask = options.V_mask #Default is None, i.e. not conditioning on V identity

        try:
            J_mask = options.J_mask.split(',')
            unrecognized_j_genes = [j for j in J_mask if gene_to_num_str(j, 'J') not in pgen_model.J_mask_mapping.keys()]
            J_mask = [j for j in J_mask if gene_to_num_str(j, 'J') in pgen_model.J_mask_mapping.keys()]
            if len(unrecognized_j_genes) > 0:
                print('These J genes/alleles are not recognized: ' + ', '.join(unrecognized_j_genes))
            if len(J_mask) == 0:
                print('No recognized J genes/alleles in the provided J_mask. Continuing without conditioning on J usage.')
                J_mask = None
        except AttributeError:
            J_mask = options.J_mask #Default is None, i.e. not conditioning on J identity

        print('')

        if options.ppost:
            if options.V_mask is None: V_mask=['']
            if options.J_mask is None: J_mask=['']

            v,j=V_mask[0],J_mask[0]
            Q,pgen,ppost=ev.evaluate_seqs([[seq,v,j]])
            print('Ppost of ' + seq + ' '+v+ ' '+j+ ': ' + str(ppost[0]))
            print('Pgen of ' + seq + ' '+v+ ' '+j+ ': ' + str(pgen[0]))
            print('Q of ' + seq + ' '+v+ ' '+j+ ': ' + str(Q[0]))
            print('')
        elif options.Q:
            if options.V_mask is None: V_mask=['']
            if options.J_mask is None: J_mask=['']
            v,j=V_mask[0],J_mask[0]
            Q=ev.evaluate_selection_factors([[seq,v,j]])
            print('Q of ' + seq + ' '+v+ ' '+j+ ': ' + str(Q[0]))
        elif options.pgen:
            pgen=pgen_model.compute_aa_CDR3_pgen(seq,V_mask,J_mask)
            if J_mask is None: J_mask= ''
            if V_mask is None: V_mask= ''
            print('Pgen of ' + seq + ' '+','.join(V_mask)+ ' '+','.join(J_mask)+ ': ' + str(pgen))

        else:
            print('Specify an option: --ppost, --pgen or --Q')


    else:
        print('Load file')

        seqs = []
        V_usage_masks = []
        J_usage_masks = []

        infile = open(infile_name, 'r')

        for i, line in enumerate(infile):
            if comment_delimiter is not None: #skip comment/header lines when a comment delimiter is given
                if line.startswith(comment_delimiter): #allow comments
                    continue
            if i < lines_to_skip:
                continue

            if delimiter is None: #Default delimiter is any whitespace
                split_line = line.split('\n')[0].split()
            else:
                split_line = line.split('\n')[0].split(delimiter)
            #Find the seq
            try:
                seq = split_line[seq_in_index].strip()
                if len(seq.strip()) == 0:
                    if skip_empty:
                        continue
                    else:
                        seqs.append(seq) #keep the blank seq as a placeholder
                        #seq_types.append('aaseq')
                else:
                    seqs.append(seq)
                    #seq_types.append(determine_seq_type(seq, aa_alphabet))
            except IndexError: #no index match for seq
                if skip_empty and len(line.strip()) == 0:
                    continue
                print('seq_in_index is out of range')
                print('Exiting...')
                infile.close()
                return -1

            #Find and format V_usage_mask
            if V_mask_index is None:
                V_usage_masks.append(['']) #default mask
            else:
                try:
                    V_usage_mask = split_line[V_mask_index].strip().split(gene_mask_delimiter)
                    #check that all V gene/allele names are recognized
                    if all([gene_to_num_str(v, 'V') in pgen_model.V_mask_mapping for v in V_usage_mask]):
                        V_usage_masks.append(V_usage_mask)
                    else:
                        print(str(V_usage_mask) + " is not a usable V_usage_mask composed exclusively of recognized V gene/allele names")
                        print('Unrecognized V gene/allele names: ' + ', '.join([v for v in V_usage_mask if gene_to_num_str(v, 'V') not in pgen_model.V_mask_mapping.keys()]))
                        print('Exiting...')
                        infile.close()
                        return -1
                except IndexError: #no index match for V_mask_index
                    print('V_mask_index is out of range')
                    print('Exiting...')
                    infile.close()
                    return -1

            #Find and format J_usage_mask
            if J_mask_index is None:
                J_usage_masks.append(['']) #default mask
            else:
                try:
                    J_usage_mask = split_line[J_mask_index].strip().split(gene_mask_delimiter)
                    #check that all J gene/allele names are recognized
                    if all([gene_to_num_str(j, 'J') in pgen_model.J_mask_mapping for j in J_usage_mask]):
                        J_usage_masks.append(J_usage_mask)
                    else:
                        print(str(J_usage_mask) + " is not a usable J_usage_mask composed exclusively of recognized J gene/allele names")
                        print('Unrecognized J gene/allele names: ' + ', '.join([j for j in J_usage_mask if gene_to_num_str(j, 'J') not in pgen_model.J_mask_mapping.keys()]))
                        print('Exiting...')
                        infile.close()
                        return -1
                except IndexError: #no index match for J_mask_index
                    print('J_mask_index is out of range')
                    print('Exiting...')
                    infile.close()
                    return -1

            if max_number_of_seqs is not None:
                if len(seqs) >= max_number_of_seqs:
                    break

        # combine sequences.
        zipped=[[seqs[i],V_usage_masks[i][0],J_usage_masks[i][0]] for i in range(len(seqs))]

        print('Evaluate')

        if options.outfile_name is not None: #OUTFILE SPECIFIED
            with open(options.outfile_name,'w') as file:
                if options.ppost:file.write('Q'+delimiter_out+'Pgen'+delimiter_out+'Ppost\n')
                elif options.Q:file.write('Q\n')
                elif options.pgen:file.write('Pgen\n')
                else:
                    print('Specify one option: --ppost, --pgen or --Q')
                    return -1
                for t in tqdm(chunks(zipped,options.chunck_size)):
                    if options.ppost:
                        Q,pgen,ppost=ev.evaluate_seqs(t)
                        for i in range(len(Q)):file.write(str(Q[i])+delimiter_out+str(pgen[i])+delimiter_out+str(ppost[i])+'\n')
                    elif options.Q:
                        Q=ev.evaluate_selection_factors(t)
                        for i in range(len(Q)):file.write(str(Q[i])+'\n')
                    elif options.pgen:
                        pgens=ev.compute_all_pgens(t)
                        for i in range(len(pgens)):file.write(str(pgens[i])+'\n')

        else: #print to stdout
            for t in chunks(zipped,options.chunck_size):
                if options.ppost:
                    Q,pgen,ppost=ev.evaluate_seqs(t)
                    print ('Q, Pgen, Ppost')
                    for i in range(len(Q)):print(Q[i],pgen[i],ppost[i])
                elif options.Q:
                    Q=ev.evaluate_selection_factors(t)
                    print ('Q')
                    print(Q)
                elif options.pgen:
                    pgens=ev.compute_all_pgens(t)
                    print ('Pgen')
                    print(pgens)
                else:
                    print('Specify one option: --ppost, --pgen or --Q')
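
The evaluation loop iterates over chunks(zipped, options.chunck_size), a helper that is not shown in this example; a minimal sketch of what such a chunking generator could look like (an assumption about the helper, not its actual definition):

def chunks(items, chunk_size):
    # Yield successive chunk_size-long slices of items.
    for start in range(0, len(items), chunk_size):
        yield items[start:start + chunk_size]
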
Example #16
    def __init__(self,
                 sonia_model=None,
                 custom_olga_model=None,
                 custom_genomic_data=None):

        if type(sonia_model) == str or sonia_model is None:
            print('ERROR: you need to pass a Sonia object')
            return

        self.sonia_model = sonia_model  # sonia model passed as an argument

        # define olga sequence_generation model
        if custom_olga_model is not None:
            if type(custom_olga_model) == str:
                print(
                    'ERROR: you need to pass a olga object for the seq_gen model'
                )
                return

            if custom_genomic_data is None:
                print('ERROR: you need to pass also the custom_genomic_data')
                return
            if type(custom_genomic_data) == str:
                print(
                    'ERROR: you need to pass a olga object for the genomic_data'
                )
                return
            self.genomic_data = custom_genomic_data
            self.seq_gen_model = custom_olga_model
        else:
            try:
                if self.sonia_model.custom_pgen_model is None:
                    main_folder = os.path.join(os.path.dirname(__file__),
                                               'default_models',
                                               self.sonia_model.chain_type)
                else:
                    main_folder = self.sonia_model.custom_pgen_model
            except:
                main_folder = os.path.join(os.path.dirname(__file__),
                                           'default_models',
                                           self.sonia_model.chain_type)

            params_file_name = os.path.join(main_folder, 'model_params.txt')
            marginals_file_name = os.path.join(main_folder,
                                               'model_marginals.txt')
            V_anchor_pos_file = os.path.join(main_folder,
                                             'V_gene_CDR3_anchors.csv')
            J_anchor_pos_file = os.path.join(main_folder,
                                             'J_gene_CDR3_anchors.csv')

            if not self.sonia_model.vj:
                self.genomic_data = olga_load_model.GenomicDataVDJ()
                self.genomic_data.load_igor_genomic_data(
                    params_file_name, V_anchor_pos_file, J_anchor_pos_file)
                self.generative_model = olga_load_model.GenerativeModelVDJ()
                self.generative_model.load_and_process_igor_model(
                    marginals_file_name)
                self.seq_gen_model = seq_gen.SequenceGenerationVDJ(
                    self.generative_model, self.genomic_data)
            else:
                self.genomic_data = olga_load_model.GenomicDataVJ()
                self.genomic_data.load_igor_genomic_data(
                    params_file_name, V_anchor_pos_file, J_anchor_pos_file)
                self.generative_model = olga_load_model.GenerativeModelVJ()
                self.generative_model.load_and_process_igor_model(
                    marginals_file_name)
                self.seq_gen_model = seq_gen.SequenceGenerationVJ(
                    self.generative_model, self.genomic_data)
Example #17
def main():
    """ Generate sequences."""

    parser = OptionParser(conflict_handler="resolve")

    #specify model
    parser.add_option('--humanTRA', '--human_T_alpha', action='store_true', dest='humanTRA', default=False, help='use default human TRA model (T cell alpha chain)')
    parser.add_option('--humanTRB', '--human_T_beta', action='store_true', dest='humanTRB', default=False, help='use default human TRB model (T cell beta chain)')
    parser.add_option('--mouseTRB', '--mouse_T_beta', action='store_true', dest='mouseTRB', default=False, help='use default mouse TRB model (T cell beta chain)')
    parser.add_option('--humanIGH', '--human_B_heavy', action='store_true', dest='humanIGH', default=False, help='use default human IGH model (B cell heavy chain)')
    parser.add_option('--humanIGK', '--human_B_kappa', action='store_true', dest='humanIGK', default=False, help='use default human IGK model (B cell light kappa chain)')
    parser.add_option('--humanIGL', '--human_B_lambda', action='store_true', dest='humanIGL', default=False, help='use default human IGL model (B cell light lambda chain)')
    parser.add_option('--mouseTRA', '--mouse_T_alpha', action='store_true', dest='mouseTRA', default=False, help='use default mouse TRA model (T cell alpha chain)')
    parser.add_option('--set_custom_model_VDJ', dest='vdj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VDJ generative model')
    parser.add_option('--set_custom_model_VJ', dest='vj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VJ generative model')
    parser.add_option('--sonia_model', type='string', default = 'leftright', dest='model_type' ,help=' specify model type: leftright or lengthpos, default is leftright')
    parser.add_option('--post', '--ppost', action='store_true', dest='ppost', default=False, help='sample from post selected repertoire')
    parser.add_option('--pre', '--pgen', action='store_true', dest='pgen', default=False, help='sample from pre selected repertoire ')
    parser.add_option('--delimiter_out','-d', type='choice', dest='delimiter_out',  choices=['tab', 'space', ',', ';', ':'], help="declare outfile delimiter. Default is tab for .tsv output files, comma for .csv files, and the infile delimiter for all others. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('-s','--chunk_size', type='int',metavar='N', dest='chunck_size', default = int(1e3), help='Number of sequences to generate at each iteration')
    parser.add_option('-r','--rejection_bound', type='int',metavar='N', dest='rejection_bound', default = 10, help='limit above which sequences are always accepted.')

    # input output
    parser.add_option('-o', '--outfile', dest = 'outfile_name', metavar='PATH/TO/FILE', help='write CDR3 sequences to PATH/TO/FILE')
    parser.add_option('-n', '--N', type='int',metavar='N', dest='num_seqs_to_generate',default=1, help='Number of sequences to generate.')

    (options, args) = parser.parse_args()

    #Check that the model is specified properly
    main_folder = os.path.dirname(__file__)

    default_models = {}
    default_models['humanTRA'] = [os.path.join(main_folder, 'default_models', 'human_T_alpha'),  'VJ']
    default_models['humanTRB'] = [os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ']
    default_models['mouseTRB'] = [os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ']
    default_models['humanIGH'] = [os.path.join(main_folder, 'default_models', 'human_B_heavy'), 'VDJ']
    default_models['humanIGK'] = [os.path.join(main_folder, 'default_models', 'human_B_kappa'), 'VJ']
    default_models['humanIGL'] = [os.path.join(main_folder, 'default_models', 'human_B_lambda'),  'VJ']
    default_models['mouseTRA'] = [os.path.join(main_folder, 'default_models', 'mouse_T_alpha'), 'VJ']

    num_models_specified = sum([1 for x in list(default_models.keys()) + ['vj_model_folder', 'vdj_model_folder'] if getattr(options, x)])

    if num_models_specified == 1: #exactly one model specified
        try:
            d_model = [x for x in default_models.keys() if getattr(options, x)][0]
            model_folder = default_models[d_model][0]
            recomb_type = default_models[d_model][1]
        except IndexError:
            if options.vdj_model_folder: #custom VDJ model specified
                model_folder = options.vdj_model_folder
                recomb_type = 'VDJ'
            elif options.vj_model_folder: #custom VJ model specified
                model_folder = options.vj_model_folder
                recomb_type = 'VJ'
    elif num_models_specified == 0:
        print('Need to indicate generative model.')
        print('Exiting...')
        return -1
    elif num_models_specified > 1:
        print('Only specify one model')
        print('Exiting...')
        return -1
    
    #Parse delimiter_out
    delimiter_out = options.delimiter_out
    if delimiter_out is None: #Default case
        delimiter_out = '\t'    
        if options.outfile_name is None:
            pass
        elif options.outfile_name.endswith('.tsv'): #output TAB separated value file
            delimiter_out = '\t'
        elif options.outfile_name.endswith('.csv'): #output COMMA separated value file
            delimiter_out = ','
    else:
        try:
            delimiter_out = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter_out]
        except KeyError:
            pass #Other string passed as the delimiter.
    #Generative model specification -- note we'll probably change this syntax to
    #allow for arbitrary model file specification
    params_file_name = os.path.join(model_folder,'model_params.txt')
    marginals_file_name = os.path.join(model_folder,'model_marginals.txt')
    V_anchor_pos_file = os.path.join(model_folder,'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(model_folder,'J_gene_CDR3_anchors.csv')

    for x in [params_file_name, marginals_file_name, V_anchor_pos_file, J_anchor_pos_file]:
            if not os.path.isfile(x):
                print('Cannot find: ' + x)
                print('Please check the files (and naming conventions) in the model folder ' + model_folder)
                print('Exiting...')
                return -1

    #Load up model based on recomb_type
    #VDJ recomb case --- used for TCRB and IGH
    if recomb_type == 'VDJ':
        genomic_data = olga_load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
        generative_model = olga_load_model.GenerativeModelVDJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        seqgen_model = sequence_generation.SequenceGenerationVDJ(generative_model, genomic_data)
    #VJ recomb case --- used for TCRA and light chain
    elif recomb_type == 'VJ':
        genomic_data = olga_load_model.GenomicDataVJ()
        genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
        generative_model = olga_load_model.GenerativeModelVJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        seqgen_model = sequence_generation.SequenceGenerationVJ(generative_model, genomic_data)

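    # --pre samples straight from the OLGA generative model, so a default (untrained) Sonia
    # object suffices; --post additionally needs the trained selection model stored in model_folder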
    if options.pgen:sonia_model=SoniaLeftposRightpos()
    else:sonia_model=SoniaLeftposRightpos(feature_file=os.path.join(model_folder,'features.tsv'),log_file=os.path.join(model_folder,'log.txt'),vj=recomb_type == 'VJ')
    
    # load sequence generation class
    seq_gen=SequenceGeneration(sonia_model,custom_olga_model=seqgen_model,custom_genomic_data=genomic_data)

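    # chuncks() splits the requested number of sequences into batches of chunck_size,
    # so generation and writing proceed in bounded-size pieces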
    if options.outfile_name is not None: #OUTFILE SPECIFIED
        with open(options.outfile_name,'w') as file:
            to_generate=chuncks(options.num_seqs_to_generate,options.chunck_size)
            for t in tqdm(to_generate):
                if options.pgen:
                    seqs=seq_gen.generate_sequences_pre(num_seqs=t,nucleotide=True)
                elif options.ppost:
                    seqs=seq_gen.generate_sequences_post(num_seqs=t,nucleotide=True,upper_bound=options.rejection_bound)
                else: 
                    print('ERROR: specify either --pre or --post')
                    return -1
                for seq in seqs: file.write(seq[0]+delimiter_out+seq[1]+delimiter_out+seq[2]+delimiter_out+seq[3]+'\n')
       # np.savetxt(options.outfile_name,seqs,fmt='%s')

    else: #print to stdout
        to_generate=chuncks(options.num_seqs_to_generate,options.chunck_size)
        for t in to_generate:
            if options.pgen:
                seqs=seq_gen.generate_sequences_pre(num_seqs=t,nucleotide=True)
            elif options.ppost:
                seqs=seq_gen.generate_sequences_post(num_seqs=t,nucleotide=True,upper_bound=options.rejection_bound)
            else:
                print('ERROR: specify either --pre or --post')
                return -1
            for seq in seqs:
                print(seq[0],seq[1],seq[2],seq[3])
Exemplo n.º 18
    def add_generated_seqs(self,
                           num_gen_seqs=0,
                           reset_gen_seqs=True,
                           custom_model_folder=None):
        """Generates MonteCarlo sequences for gen_seqs using OLGA.

		Only generates seqs from a V(D)J model. Requires the OLGA package
		(pip install olga).

		Parameters
		----------
		num_gen_seqs : int or float
			Number of MonteCarlo sequences to generate and add to the specified
			sequence pool.
		custom_model_folder : str
			Path to a folder specifying a custom IGoR formatted model to be
			used as a generative model. Folder must contain 'model_params.txt'
			and 'model_marginals.txt'

		Attributes set
		--------------
		gen_seqs : list
			MonteCarlo sequences drawn from a VDJ recomb model
		gen_seq_features : list
			Features gen_seqs have been projected onto.

		"""

        #Load generative model
        if custom_model_folder is None:
            main_folder = os.path.join(
                os.path.dirname(olga_load_model.__file__), 'default_models',
                self.chain_type)
        else:
            main_folder = custom_model_folder

        params_file_name = os.path.join(main_folder, 'model_params.txt')
        marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
        V_anchor_pos_file = os.path.join(main_folder,
                                         'V_gene_CDR3_anchors.csv')
        J_anchor_pos_file = os.path.join(main_folder,
                                         'J_gene_CDR3_anchors.csv')

        if not os.path.isfile(params_file_name) or not os.path.isfile(
                marginals_file_name):
            print('Cannot find specified custom generative model files: ' + '\n' + params_file_name + '\n' + marginals_file_name)
            print('Exiting sequence generation...')
            return None
        if not os.path.isfile(V_anchor_pos_file):
            V_anchor_pos_file = os.path.join(
                os.path.dirname(olga_load_model.__file__), 'default_models',
                self.chain_type, 'V_gene_CDR3_anchors.csv')
        if not os.path.isfile(J_anchor_pos_file):
            J_anchor_pos_file = os.path.join(
                os.path.dirname(olga_load_model.__file__), 'default_models',
                self.chain_type, 'J_gene_CDR3_anchors.csv')

        if self.chain_type.endswith('TRA'):
            genomic_data = olga_load_model.GenomicDataVJ()
            genomic_data.load_igor_genomic_data(params_file_name,
                                                V_anchor_pos_file,
                                                J_anchor_pos_file)
            generative_model = olga_load_model.GenerativeModelVJ()
            generative_model.load_and_process_igor_model(marginals_file_name)
            sg_model = seq_gen.SequenceGenerationVJ(generative_model,
                                                    genomic_data)
        else:
            genomic_data = olga_load_model.GenomicDataVDJ()
            genomic_data.load_igor_genomic_data(params_file_name,
                                                V_anchor_pos_file,
                                                J_anchor_pos_file)
            generative_model = olga_load_model.GenerativeModelVDJ()
            generative_model.load_and_process_igor_model(marginals_file_name)
            sg_model = seq_gen.SequenceGenerationVDJ(generative_model,
                                                     genomic_data)

        #Generate sequences
        seqs = [
            [
                seq[1], genomic_data.genV[seq[2]][0].split('*')[0],
                genomic_data.genJ[seq[3]][0].split('*')[0]
            ] for seq in
            [sg_model.gen_rnd_prod_CDR3() for _ in range(int(num_gen_seqs))]
        ]

        if reset_gen_seqs:  #reset gen_seqs if needed
            self.gen_seqs = []
        #Add to specified pool(s)
        self.update_model(add_gen_seqs=seqs)
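
A hedged usage sketch for the method above (the object name qm and the sequence count are illustrative):

# hypothetical sketch -- 'qm' stands for an already-initialized model exposing add_generated_seqs
qm.add_generated_seqs(num_gen_seqs=int(2e5))   # draw 200,000 OLGA sequences into gen_seqs
print(len(qm.gen_seqs))                        # gen_seqs now holds the generated [CDR3, V, J] entries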
Exemplo n.º 19
    def add_generated_seqs(self, num_gen_seqs = 0, reset_gen_seqs = True, custom_model_folder = None, add_error=False,custom_error=None):
        """Generates MonteCarlo sequences for gen_seqs using OLGA.

        Only generates seqs from a V(D)J model. Requires the OLGA package
        (pip install olga).

        Parameters
        ----------
        num_gen_seqs : int or float
            Number of MonteCarlo sequences to generate and add to the specified
            sequence pool.
        custom_model_folder : str
            Path to a folder specifying a custom IGoR formatted model to be
            used as a generative model. Folder must contain 'model_params.txt'
            and 'model_marginals.txt'
        add_error: bool
            simulate sequencing error; default is False.
        custom_error: float
            set a custom error rate for the simulated sequencing error.
            Default is the rate inferred by IGoR.

        Attributes set
        --------------
        gen_seqs : list
            MonteCarlo sequences drawn from a VDJ recomb model
        gen_seq_features : list
            Features gen_seqs have been projected onto.

        """
        from sonia.utils import add_random_error
        from olga.utils import nt2aa

        #Load generative model
        if custom_model_folder is None:
            try:
                if self.custom_pgen_model is None: main_folder = os.path.join(os.path.dirname(__file__), 'default_models', self.chain_type)
                else: main_folder=self.custom_pgen_model
            except AttributeError:
                main_folder = os.path.join(os.path.dirname(__file__), 'default_models', self.chain_type)
        else:
            main_folder = custom_model_folder

        params_file_name = os.path.join(main_folder,'model_params.txt')
        marginals_file_name = os.path.join(main_folder,'model_marginals.txt')
        V_anchor_pos_file = os.path.join(main_folder,'V_gene_CDR3_anchors.csv')
        J_anchor_pos_file = os.path.join(main_folder,'J_gene_CDR3_anchors.csv')

        if not os.path.isfile(params_file_name) or not os.path.isfile(marginals_file_name):
            print('Cannot find specified custom generative model files: ' + '\n' + params_file_name + '\n' + marginals_file_name)
            print('Exiting sequence generation...')
            return None
        if not os.path.isfile(V_anchor_pos_file):
            V_anchor_pos_file = os.path.join(os.path.dirname(olga_load_model.__file__), 'default_models', self.chain_type, 'V_gene_CDR3_anchors.csv')
        if not os.path.isfile(J_anchor_pos_file):
            J_anchor_pos_file = os.path.join(os.path.dirname(olga_load_model.__file__), 'default_models', self.chain_type, 'J_gene_CDR3_anchors.csv')

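        # the sequencing error rate is read from the last non-empty line of model_params.txt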
        with open(params_file_name,'r') as file:
            sep=0
            error_rate=''
            lines=file.read().splitlines()
            while len(error_rate)<1:
                error_rate=lines[-1+sep]
                sep-=1

        if custom_error is None: self.error_rate=float(error_rate)
        else: self.error_rate=custom_error

        if self.vj:
            genomic_data = olga_load_model.GenomicDataVJ()
            genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
            generative_model = olga_load_model.GenerativeModelVJ()
            generative_model.load_and_process_igor_model(marginals_file_name)
            sg_model = seq_gen.SequenceGenerationVJ(generative_model, genomic_data)
        else:
            genomic_data = olga_load_model.GenomicDataVDJ()
            genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
            generative_model = olga_load_model.GenerativeModelVDJ()
            generative_model.load_and_process_igor_model(marginals_file_name)
            sg_model = seq_gen.SequenceGenerationVDJ(generative_model, genomic_data)

        #Generate sequences
        print('Generate sequences.')
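        # with add_error, point errors are injected into the nucleotide CDR3 at self.error_rate and
        # the result is translated with nt2aa; otherwise the amino-acid CDR3 from OLGA is used directly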
        if add_error: seqs = [[nt2aa(add_random_error(seq[0],self.error_rate)), genomic_data.genV[seq[2]][0].split('*')[0], genomic_data.genJ[seq[3]][0].split('*')[0]] for seq in [sg_model.gen_rnd_prod_CDR3(conserved_J_residues='ABCEDFGHIJKLMNOPQRSTUVWXYZ') for _ in tqdm(range(int(num_gen_seqs)))]]
        else: seqs = [[seq[1], genomic_data.genV[seq[2]][0].split('*')[0], genomic_data.genJ[seq[3]][0].split('*')[0]] for seq in [sg_model.gen_rnd_prod_CDR3(conserved_J_residues='ABCEDFGHIJKLMNOPQRSTUVWXYZ') for _ in tqdm(range(int(num_gen_seqs)))]]
        if reset_gen_seqs: #reset gen_seqs if needed
            self.gen_seqs = []
        #Add to specified pool(s)
        self.update_model(add_gen_seqs = seqs)
    def __init__(self,
                 sonia_model=None,
                 include_genes=True,
                 processes=None,
                 custom_olga_model=None):

        if type(sonia_model) == str or sonia_model is None:
            print('ERROR: you need to pass a Sonia object')
            return

        self.sonia_model = sonia_model
        self.include_genes = include_genes

        if processes is None: self.processes = mp.cpu_count()
        else: self.processes = processes

        # you need Z for everything, better to compute it once at the beginning
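        # Z is estimated as the empirical mean of exp(-energy) over (at most) 1e6 generated sequences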
        self.energies_gen = self.sonia_model.compute_energy(
            self.sonia_model.gen_seq_features[:int(1e6)])
        self.Z = np.sum(np.exp(-self.energies_gen)) / len(self.energies_gen)

        # define olga model

        if custom_olga_model is not None:
            self.pgen_model = custom_olga_model
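            # 'X{0,}' is a wildcard template matching any amino-acid CDR3, so this Pgen is the total
            # probability of a productive rearrangement, used here as a normalization constant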
            self.norm = self.pgen_model.compute_regex_CDR3_template_pgen(
                'X{0,}')

        else:
            main_folder = os.path.join(
                os.path.dirname(olga_load_model.__file__), 'default_models',
                self.sonia_model.chain_type)

            params_file_name = os.path.join(main_folder, 'model_params.txt')
            marginals_file_name = os.path.join(main_folder,
                                               'model_marginals.txt')
            V_anchor_pos_file = os.path.join(main_folder,
                                             'V_gene_CDR3_anchors.csv')
            J_anchor_pos_file = os.path.join(main_folder,
                                             'J_gene_CDR3_anchors.csv')

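            # only human_T_alpha is treated as a VJ model here; every other supported chain_type
            # falls back to the VDJ machinery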
            if self.sonia_model.chain_type != 'human_T_alpha':
                genomic_data = olga_load_model.GenomicDataVDJ()
                genomic_data.load_igor_genomic_data(params_file_name,
                                                    V_anchor_pos_file,
                                                    J_anchor_pos_file)
                generative_model = olga_load_model.GenerativeModelVDJ()
                generative_model.load_and_process_igor_model(
                    marginals_file_name)
                self.pgen_model = pgen.GenerationProbabilityVDJ(
                    generative_model, genomic_data)
            else:
                genomic_data = olga_load_model.GenomicDataVJ()
                genomic_data.load_igor_genomic_data(params_file_name,
                                                    V_anchor_pos_file,
                                                    J_anchor_pos_file)
                generative_model = olga_load_model.GenerativeModelVJ()
                generative_model.load_and_process_igor_model(
                    marginals_file_name)
                self.pgen_model = pgen.GenerationProbabilityVJ(
                    generative_model, genomic_data)
            self.norm = self.pgen_model.compute_regex_CDR3_template_pgen(
                'X{0,}')
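
If this __init__ belongs to the EvaluateModel class used in the later example, a hedged usage sketch would be:

# hypothetical sketch -- 'qm' is a trained Sonia model and 'seqs' a list of [CDR3, V_gene, J_gene] entries
ev = EvaluateModel(sonia_model=qm)
energies, pgens, pposts = ev.evaluate_seqs(seqs)   # selection energies (-log Q), Pgen and Ppost per sequence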
Exemplo n.º 21
def main():
    """Compute Pgens from a file and output to another file."""

    parser = OptionParser(conflict_handler="resolve")

    parser.add_option('--humanTRA', '--human_T_alpha', action='store_true', dest='humanTRA', default=False, help='use default human TRA model (T cell alpha chain)')
    parser.add_option('--humanTRB', '--human_T_beta', action='store_true', dest='humanTRB', default=False, help='use default human TRB model (T cell beta chain)')
    parser.add_option('--mouseTRB', '--mouse_T_beta', action='store_true', dest='mouseTRB', default=False, help='use default mouse TRB model (T cell beta chain)')
    parser.add_option('--humanIGH', '--human_B_heavy', action='store_true', dest='humanIGH', default=False, help='use default human IGH model (B cell heavy chain)')
    parser.add_option('--set_custom_model_VDJ', dest='vdj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VDJ generative model')
    parser.add_option('--set_custom_model_VJ', dest='vj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VJ generative model')

    parser.add_option('-i', '--infile', dest = 'infile_name',metavar='PATH/TO/FILE', help='read in CDR3 sequences (and optionally V/J masks) from PATH/TO/FILE')
    parser.add_option('-o', '--outfile', dest = 'outfile_name', metavar='PATH/TO/FILE', help='write CDR3 sequences and pgens to PATH/TO/FILE')
    parser.add_option('--seq_in', '--seq_index', type='int', metavar='INDEX', dest='seq_in_index', default = 0, help='specifies sequences to be read in are in column INDEX. Default is index 0 (the first column).')

    parser.add_option('--v_in', '--v_mask_index', type='int', metavar='INDEX', dest='V_mask_index', help='specifies V_masks are found in column INDEX in the input file. Default is no V mask.')
    parser.add_option('--j_in', '--j_mask_index', type='int', metavar='INDEX', dest='J_mask_index', help='specifies J_masks are found in column INDEX in the input file. Default is no J mask.')

    parser.add_option('--v_mask', type='string', dest='V_mask', help='specify V usage to condition Pgen on for seqs read in as arguments.')
    parser.add_option('--j_mask', type='string', dest='J_mask', help='specify J usage to condition Pgen on for seqs read in as arguments.')

    parser.add_option('-m', '--max_number_of_seqs', type='int',metavar='N', dest='max_number_of_seqs', help='compute Pgens for at most N sequences.')
    parser.add_option('--lines_to_skip', type='int',metavar='N', dest='lines_to_skip', default = 0, help='skip the first N lines of the file. Default is 0.')
    parser.add_option('-a', '--alphabet_filename', dest='alphabet_filename', metavar='PATH/TO/FILE', help="specify PATH/TO/FILE defining a custom 'amino acid' alphabet. Default is no custom alphabet.")
    parser.add_option('--seq_type_out', type='choice',metavar='SEQ_TYPE', dest='seq_type_out',  choices=['all', 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'], help="if read in sequences are ntseqs, declare what type of sequence to compute pgen for. Default is all. Choices: 'all', 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'")
    parser.add_option('--skip_off','--skip_empty_off', action='store_false', dest = 'skip_empty', default=True, help='stop skipping empty or blank sequences/lines (if for example you want to keep line index fidelity between the infile and outfile).')

    parser.add_option('--display_off', action='store_false', dest='display_seqs', default=True, help='turn the sequence display off (only applies in write-to-file mode). Default is on.')
    parser.add_option('--num_lines_for_display', type='int', metavar='N', default = 50, dest='num_lines_for_display', help='N lines of the output file are displayed when sequence display is on. Also used to determine the number of sequences to average over for speed and time estimates.')
    parser.add_option('--time_updates_off', action='store_false', dest='time_updates', default=True, help='turn time updates off (only applies when sequence display is disabled).')
    parser.add_option('--seqs_per_time_update', type='float', metavar='N', default = 100, dest='seqs_per_time_update', help='specify the number of sequences between time updates. Default is 100.')

    parser.add_option('-d', '--delimiter', type='choice', dest='delimiter',  choices=['tab', 'space', ',', ';', ':'], help="declare infile delimiter. Default is tab for .tsv input files, comma for .csv files, and any whitespace for all others. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('--raw_delimiter', type='str', dest='delimiter', help="declare infile delimiter as a raw string.")
    parser.add_option('--delimiter_out', type='choice', dest='delimiter_out',  choices=['tab', 'space', ',', ';', ':'], help="declare outfile delimiter. Default is tab for .tsv output files, comma for .csv files, and the infile delimiter for all others. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('--raw_delimiter_out', type='str', dest='delimiter_out', help="declare for the delimiter outfile as a raw string.")
    parser.add_option('--gene_mask_delimiter', type='choice', dest='gene_mask_delimiter',  choices=['tab', 'space', ',', ';', ':'], help="declare gene mask delimiter. Default comma unless infile delimiter is comma, then default is a semicolon. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('--raw_gene_mask_delimiter', type='str', dest='gene_mask_delimiter', help="declare delimiter of gene masks as a raw string.")
    parser.add_option('--comment_delimiter', type='str', dest='comment_delimiter', help="character or string to indicate comment or header lines to skip.")


    (options, args) = parser.parse_args()

    #Check that the model is specified properly
    main_folder = os.path.dirname(__file__)

    default_models = {}
    default_models['humanTRA'] = [os.path.join(main_folder, 'default_models', 'human_T_alpha'),  'VJ']
    default_models['humanTRB'] = [os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ']
    default_models['mouseTRB'] = [os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ']
    default_models['humanIGH'] = [os.path.join(main_folder, 'default_models', 'human_B_heavy'), 'VDJ']

    num_models_specified = sum([1 for x in list(default_models.keys()) + ['vj_model_folder', 'vdj_model_folder'] if getattr(options, x)])

    if num_models_specified == 1: #exactly one model specified
        try:
            d_model = [x for x in default_models.keys() if getattr(options, x)][0]
            model_folder = default_models[d_model][0]
            recomb_type = default_models[d_model][1]
        except IndexError:
            if options.vdj_model_folder: #custom VDJ model specified
                model_folder = options.vdj_model_folder
                recomb_type = 'VDJ'
            elif options.vj_model_folder: #custom VJ model specified
                model_folder = options.vj_model_folder
                recomb_type = 'VJ'
    elif num_models_specified == 0:
        print('Need to indicate generative model.')
        print('Exiting...')
        return -1
    elif num_models_specified > 1:
        print('Only specify one model')
        print('Exiting...')
        return -1

    #Check that all model and genomic files exist in the indicated model folder
    if not os.path.isdir(model_folder):
        print('Check pathing... cannot find the model folder: ' + model_folder)
        print('Exiting...')
        return -1

    params_file_name = os.path.join(model_folder,'model_params.txt')
    marginals_file_name = os.path.join(model_folder,'model_marginals.txt')
    V_anchor_pos_file = os.path.join(model_folder,'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(model_folder,'J_gene_CDR3_anchors.csv')

    for x in [params_file_name, marginals_file_name, V_anchor_pos_file, J_anchor_pos_file]:
        if not os.path.isfile(x):
            print('Cannot find: ' + x)
            print('Please check the files (and naming conventions) in the model folder ' + model_folder)
            print('Exiting...')
            return -1

    alphabet_filename = options.alphabet_filename #used if a custom alphabet is to be specified
    if alphabet_filename is not None:
        if not os.path.isfile(alphabet_filename):
            print('Cannot find custom alphabet file: ' + alphabet_filename)
            print('Exiting...')
            return -1

    #Load up model based on recomb_type
    #VDJ recomb case --- used for TCRB and IGH
    if recomb_type == 'VDJ':
        genomic_data = load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
        generative_model = load_model.GenerativeModelVDJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        pgen_model = generation_probability.GenerationProbabilityVDJ(generative_model, genomic_data, alphabet_filename)
    #VJ recomb case --- used for TCRA and light chain
    elif recomb_type == 'VJ':
        genomic_data = load_model.GenomicDataVJ()
        genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
        generative_model = load_model.GenerativeModelVJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        pgen_model = generation_probability.GenerationProbabilityVJ(generative_model, genomic_data, alphabet_filename)

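    # the working 'amino acid' alphabet (including any custom symbols) is given by the keys of the codon dictionary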
    aa_alphabet = ''.join(pgen_model.codons_dict.keys())

    if options.infile_name is not None:
        infile_name = options.infile_name

        if not os.path.isfile(infile_name):
            print('Cannot find input file: ' + infile_name)
            print('Exiting...')
            return -1

    if options.outfile_name is not None:
        outfile_name = options.outfile_name
        if os.path.isfile(outfile_name):
            if not input(outfile_name + ' already exists. Overwrite (y/n)? ').strip().lower() in ['y', 'yes']:
                print('Exiting...')
                return -1

    #Parse delimiter
    delimiter = options.delimiter
    if delimiter is None: #Default case
        if options.infile_name is None:
            delimiter = '\t'
        elif infile_name.endswith('.tsv'): #parse TAB separated value file
            delimiter = '\t'
        elif infile_name.endswith('.csv'): #parse COMMA separated value file
            delimiter = ','
    else:
        try:
            delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter]
        except KeyError:
            pass #Other string passed as the delimiter.

    #Parse delimiter_out
    delimiter_out = options.delimiter_out
    if delimiter_out is None: #Default case
        if delimiter is None:
            delimiter_out = '\t'
        else:
            delimiter_out = delimiter
        if options.outfile_name is None:
            pass
        elif outfile_name.endswith('.tsv'): #output TAB separated value file
            delimiter_out = '\t'
        elif outfile_name.endswith('.csv'): #output COMMA separated value file
            delimiter_out = ','
    else:
        try:
            delimiter_out = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter_out]
        except KeyError:
            pass #Other string passed as the delimiter.

    #Parse gene_delimiter
    gene_mask_delimiter = options.gene_mask_delimiter
    if gene_mask_delimiter is None: #Default case
        gene_mask_delimiter = ','
        if delimiter == ',':
            gene_mask_delimiter = ';'
    else:
        try:
            gene_mask_delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[gene_mask_delimiter]
        except KeyError:
            pass #Other string passed as the delimiter.


    #More options
    time_updates = options.time_updates
    display_seqs = options.display_seqs
    num_lines_for_display = options.num_lines_for_display
    seq_in_index = options.seq_in_index #where in the line the sequence is after line.split(delimiter)
    lines_to_skip = options.lines_to_skip #one method of skipping header
    comment_delimiter = options.comment_delimiter #another method of skipping header
    seqs_per_time_update = options.seqs_per_time_update
    max_number_of_seqs = options.max_number_of_seqs
    V_mask_index = options.V_mask_index #Default is not conditioning on V identity
    J_mask_index = options.J_mask_index #Default is not conditioning on J identity
    skip_empty = options.skip_empty

    seq_type_out = options.seq_type_out #type of pgens to be computed. Can be ntseq, aaseq, or both
    if seq_type_out is not None:
        seq_type_out = {'all': None, 'ntseq': 'ntseq', 'nucleotide': 'ntseq', 'aaseq': 'aaseq', 'amino_acid': 'aaseq'}[seq_type_out]

    if options.infile_name is None: #No infile specified -- args should be the input seqs
        print_warnings = True
        seqs = args
        seq_types = [determine_seq_type(seq, aa_alphabet) for seq in seqs]
        unrecognized_seqs = [seq for i, seq in enumerate(seqs) if seq_types[i] is None]
        if len(unrecognized_seqs) > 0 and print_warnings:
            print('The following sequences/arguments were not recognized: ' + ', '.join(unrecognized_seqs))
        seqs = [seq for i, seq in enumerate(seqs) if seq_types[i] is not None]
        seq_types = [seq_type for seq_type in seq_types if seq_type is not None]


        #Format V and J masks -- uniform for all argument input sequences
        try:
            V_mask = options.V_mask.split(',')
            unrecognized_v_genes = [v for v in V_mask if v not in pgen_model.V_mask_mapping.keys()]
            V_mask = [v for v in V_mask if v in pgen_model.V_mask_mapping.keys()]
            if len(unrecognized_v_genes) > 0:
                print('These V genes/alleles are not recognized: ' + ', '.join(unrecognized_v_genes))
            if len(V_mask) == 0:
                print('No recognized V genes/alleles in the provided V_mask. Continuing without conditioning on V usage.')
                V_mask = None
        except AttributeError:
            V_mask = options.V_mask #Default is None, i.e. not conditioning on V identity

        try:
            J_mask = options.J_mask.split(',')
            unrecognized_j_genes = [j for j in J_mask if j not in pgen_model.J_mask_mapping.keys()]
            J_mask = [j for j in J_mask if j in pgen_model.J_mask_mapping.keys()]
            if len(unrecognized_j_genes) > 0:
                print('These J genes/alleles are not recognized: ' + ', '.join(unrecognized_j_genes))
            if len(J_mask) == 0:
                print('No recognized J genes/alleles in the provided J_mask. Continuing without conditioning on J usage.')
                J_mask = None
        except AttributeError:
            J_mask = options.J_mask #Default is None, i.e. not conditioning on J identity

        print('')
        start_time = time.time()
        for seq, seq_type in zip(seqs, seq_types):
            if seq_type == 'aaseq':
                c_pgen = pgen_model.compute_aa_CDR3_pgen(seq, V_mask, J_mask, print_warnings)
                print('Pgen of the amino acid sequence ' + seq + ': ' + str(c_pgen))
                print('')
            elif seq_type == 'regex':
                c_pgen = pgen_model.compute_regex_CDR3_template_pgen(seq, V_mask, J_mask, print_warnings)
                print('Pgen of the regular expression sequence ' + seq + ': ' + str(c_pgen))
                print('')
            elif seq_type == 'ntseq':
                if seq_type_out is None or seq_type_out == 'ntseq':
                    c_pgen_nt = pgen_model.compute_nt_CDR3_pgen(seq, V_mask, J_mask, print_warnings)
                    print('Pgen of the nucleotide sequence ' + seq + ': ' + str(c_pgen_nt))
                if seq_type_out is None or seq_type_out == 'aaseq':
                    c_pgen_aa = pgen_model.compute_aa_CDR3_pgen(nt2aa(seq), V_mask, J_mask, print_warnings)
                    print('Pgen of the amino acid sequence nt2aa(' + seq + ') = ' + nt2aa(seq) + ': ' + str(c_pgen_aa))
                print('')

        c_time = time.time() - start_time
        if c_time > 86400: #more than a day
            c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60)
        elif c_time > 3600: #more than an hr
            c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60)
        elif c_time > 60: #more than a min
            c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60)
        else:
            c_time_str = '%.2f seconds.'%(c_time)

        print('Completed pgen computation in: ' + c_time_str)

    else: #Read sequences in from file
        print_warnings = False #Most cases of reading in from file should have warnings disabled
        seqs = []
        seq_types = []
        V_usage_masks = []
        J_usage_masks = []

        infile = open(infile_name, 'r')

        for i, line in enumerate(infile):
            if comment_delimiter is not None: #Default case -- no comments/header delimiter
                if line.startswith(comment_delimiter): #allow comments
                    continue
            if i < lines_to_skip:
                continue

            if delimiter is None: #Default delimiter is any whitespace
                split_line = line.split()
            else:
                split_line = line.split(delimiter)

            #Find the seq
            try:
                seq = split_line[seq_in_index].strip()
                if len(seq.strip()) == 0:
                    if skip_empty:
                        continue
                    else:
                        seqs.append(seq) #keep the blank seq as a placeholder
                        seq_types.append('aaseq')
                else:
                    seqs.append(seq)
                    seq_types.append(determine_seq_type(seq, aa_alphabet))
            except IndexError: #no index match for seq
                if skip_empty and len(line.strip()) == 0:
                    continue
                print('seq_in_index is out of range')
                print('Exiting...')
                infile.close()
                return -1

            #Find and format V_usage_mask
            if V_mask_index is None:
                V_usage_masks.append(None) #default mask
            else:
                try:
                    V_usage_mask = split_line[V_mask_index].strip().split(gene_mask_delimiter)
                    #check that all V gene/allele names are recognized
                    if all([v in pgen_model.V_mask_mapping for v in V_usage_mask]):
                        V_usage_masks.append(V_usage_mask)
                    else:
                        print(str(V_usage_mask) + " is not a usable V_usage_mask composed exclusively of recognized V gene/allele names")
                        print('Unrecognized V gene/allele names: ' + ', '.join([v for v in V_usage_mask if v not in pgen_model.V_mask_mapping.keys()]))
                        print('Exiting...')
                        infile.close()
                        return -1
                except IndexError: #no index match for V_mask_index
                    print('V_mask_index is out of range')
                    print('Exiting...')
                    infile.close()
                    return -1

            #Find and format J_usage_mask
            if J_mask_index is None:
                J_usage_masks.append(None) #default mask
            else:
                try:
                    J_usage_mask = split_line[J_mask_index].strip().split(gene_mask_delimiter)
                    #check that all J gene/allele names are recognized
                    if all([j in pgen_model.J_mask_mapping for j in J_usage_mask]):
                        J_usage_masks.append(J_usage_mask)
                    else:
                        print(str(J_usage_mask) + " is not a usable J_usage_mask composed exclusively of recognized J gene/allele names")
                        print('Unrecognized J gene/allele names: ' + ', '.join([j for j in J_usage_mask if j not in pgen_model.J_mask_mapping.keys()]))
                        print('Exiting...')
                        infile.close()
                        return -1
                except IndexError: #no index match for J_mask_index
                    print('J_mask_index is out of range')
                    print('Exiting...')
                    infile.close()
                    return -1

            if max_number_of_seqs is not None:
                if len(seqs) >= max_number_of_seqs:
                    break


        unrecognized_seqs = [seq for i, seq in enumerate(seqs) if seq_types[i] is None]
        if len(unrecognized_seqs) > 0 and len(unrecognized_seqs) < len(seqs):
            if print_warnings or options.outfile_name is not None:
                print('Some strings read in were not parsed as sequences -- they will be omitted.')
                print('Examples of improperly read strings: ')
                for unrecognized_seq in unrecognized_seqs[:10]:
                    print(unrecognized_seq)
            seqs = [seq for i, seq in enumerate(seqs) if seq_types[i] is not None]
            V_usage_masks = [V_usage_mask for i, V_usage_mask in enumerate(V_usage_masks) if seq_types[i] is not None]
            seq_types = [seq_type for seq_type in seq_types if seq_type is not None]
        elif len(unrecognized_seqs) > 0 and len(unrecognized_seqs) == len(seqs):
            print('None of the read in strings were parsed as sequences. Check input file.')
            print('Examples of improperly read strings:')
            for unrecognized_seq in unrecognized_seqs[:10]:
                print(unrecognized_seq)
            print('Exiting...')
            return -1

        infile.close()


        if options.outfile_name is not None: #OUTFILE SPECIFIED, allow printed info/display

            print('Successfully read in and formatted ' + str(len(seqs)) + ' sequences and any V or J usages.')
            if display_seqs:
                sys.stdout.write('\r'+'Continuing to Pgen computation in 3... ')
                sys.stdout.flush()
                time.sleep(0.4)
                sys.stdout.write('\r'+'Continuing to Pgen computation in 2... ')
                sys.stdout.flush()
                time.sleep(0.4)
                sys.stdout.write('\r'+'Continuing to Pgen computation in 1... ')
                sys.stdout.flush()
                time.sleep(0.4)
            else:
                print('Continuing to Pgen computation.')
                print_warnings = True #Display is off, can print warnings

            if display_seqs:
                lines_for_display = []
                times_for_speed_calc = [time.time()]

            outfile = open(outfile_name, 'w')
            start_time = time.time()
            for i, seq in enumerate(seqs):
                if seq_types[i] == 'aaseq':
                    #Compute Pgen and print out
                    c_pgen_line = seq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(seq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                if seq_types[i] == 'regex':
                    #Compute Pgen and print out
                    c_pgen_line = seq + delimiter_out + str(pgen_model.compute_regex_CDR3_template_pgen(seq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                elif seq_types[i] == 'ntseq':
                    ntseq = seq
                    if len(ntseq) % 3 == 0: #inframe sequence
                        aaseq = nt2aa(ntseq)
                        #Compute Pgen and print out based on recomb_type and seq_type_out
                        if seq_type_out is None:
                            c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + delimiter_out + aaseq + delimiter_out +  str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                        elif seq_type_out == 'ntseq':
                            c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                        elif seq_type_out == 'aaseq':
                            c_pgen_line = aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                    else: #out of frame sequence -- Pgens are 0 and use 'out_of_frame' for aaseq
                        if seq_type_out is None:
                            c_pgen_line = ntseq + delimiter_out + '0' + delimiter_out + 'out_of_frame' + delimiter_out + '0'
                        elif seq_type_out == 'ntseq':
                            c_pgen_line = ntseq + delimiter_out + '0'
                        elif seq_type_out == 'aaseq':
                            c_pgen_line = 'out_of_frame' + delimiter_out + '0'

                outfile.write(c_pgen_line + '\n')

                #Print time update
                if display_seqs:
                    cc_time = time.time()
                    c_time = cc_time - start_time
                    times_for_speed_calc = [cc_time] + times_for_speed_calc[:num_lines_for_display]
                    c_avg_speed = (len(times_for_speed_calc)-1)/float(times_for_speed_calc[0] - times_for_speed_calc[-1])

                    #eta = ((len(seqs) - (i+1))/float(i+1))*c_time

                    eta = (len(seqs) - (i+1))/c_avg_speed

                    lines_for_display = [c_pgen_line] + lines_for_display[:num_lines_for_display]


                    c_time_str = '%s hours, %s minutes, and %s seconds.'%(repr(int(c_time)//3600).rjust(3), repr((int(c_time)//60)%60).rjust(2), repr(int(c_time)%60).rjust(2))
                    eta_str = '%s hours, %s minutes, and %s seconds.'%(repr(int(eta)//3600).rjust(3), repr((int(eta)//60)%60).rjust(2), repr(int(eta)%60).rjust(2))
                    time_str = 'Time to compute Pgen on %s seqs: %s \nEst. time for remaining %s seqs: %s'%(repr(i+1).rjust(9), c_time_str, repr(len(seqs) - (i + 1)).rjust(9), eta_str)
                    speed_str = 'Current Pgen computation speed: %s seqs/min'%(repr(round((len(times_for_speed_calc)-1)*60/float(times_for_speed_calc[0] - times_for_speed_calc[-1]), 2)).rjust(8))
                    display_str = '\n'.join(lines_for_display[::-1]) + '\n' + '-'*80 + '\n' + time_str + '\n' + speed_str + '\n' + '-'*80
                    print('\033[2J' + display_str)
                elif (i+1)%seqs_per_time_update == 0 and time_updates:
                    c_time = time.time() - start_time
                    eta = ((len(seqs) - (i+1))/float(i+1))*c_time
                    if c_time > 86400: #more than a day
                        c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60)
                    elif c_time > 3600: #more than an hr
                        c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60)
                    elif c_time > 60: #more than a min
                        c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60)
                    else:
                        c_time_str = '%.2f seconds.'%(c_time)

                    if eta > 86400: #more than a day
                        eta_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(eta)/86400, (int(eta)/3600)%24, (int(eta)/60)%60, eta%60)
                    elif eta > 3600: #more than an hr
                        eta_str = '%d hours, %d minutes, and %.2f seconds.'%((int(eta)/3600)%24, (int(eta)/60)%60, eta%60)
                    elif eta > 60: #more than a min
                        eta_str = '%d minutes and %.2f seconds.'%((int(eta)/60)%60, eta%60)
                    else:
                        eta_str = '%.2f seconds.'%(eta)

                    print('Pgen computed for %d sequences in: %s Estimated time remaining: %s'%(i+1, c_time_str, eta_str))

            c_time = time.time() - start_time
            if c_time > 86400: #more than a day
                c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.'%(int(c_time)/86400, (int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60)
            elif c_time > 3600: #more than an hr
                c_time_str = '%d hours, %d minutes, and %.2f seconds.'%((int(c_time)/3600)%24, (int(c_time)/60)%60, c_time%60)
            elif c_time > 60: #more than a min
                c_time_str = '%d minutes and %.2f seconds.'%((int(c_time)/60)%60, c_time%60)
            else:
                c_time_str = '%.2f seconds.'%(c_time)
            print('Completed Pgen computation for %d sequences in: %s'%(len(seqs), c_time_str))

            outfile.close()

        else: #NO OUTFILE -- print directly to stdout
            start_time = time.time()
            for i, seq in enumerate(seqs):
                if seq_types[i] == 'aaseq':
                    #Compute Pgen and print out
                    c_pgen_line = seq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(seq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                if seq_types[i] == 'regex':
                    #Compute Pgen and print out
                    c_pgen_line = seq + delimiter_out + str(pgen_model.compute_regex_CDR3_template_pgen(seq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                elif seq_types[i] == 'ntseq':
                    ntseq = seq
                    if len(ntseq) % 3 == 0: #inframe sequence
                        aaseq = nt2aa(ntseq)
                        #Compute Pgen and print out based on recomb_type and seq_type_out
                        if seq_type_out is None:
                            c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings)) + delimiter_out + aaseq + delimiter_out +  str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                        elif seq_type_out == 'ntseq':
                            c_pgen_line = ntseq + delimiter_out + str(pgen_model.compute_nt_CDR3_pgen(ntseq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                        elif seq_type_out == 'aaseq':
                            c_pgen_line = aaseq + delimiter_out + str(pgen_model.compute_aa_CDR3_pgen(aaseq, V_usage_masks[i], J_usage_masks[i], print_warnings))
                    else: #out of frame sequence -- Pgens are 0 and use 'out_of_frame' for aaseq
                        if seq_type_out is None:
                            c_pgen_line = ntseq + delimiter_out + '0' + delimiter_out + 'out_of_frame' + delimiter_out + '0'
                        elif seq_type_out == 'ntseq':
                            c_pgen_line = ntseq + delimiter_out + '0'
                        elif seq_type_out == 'aaseq':
                            c_pgen_line = 'out_of_frame' + delimiter_out + '0'

                print(c_pgen_line)
Exemplo n.º 22
alpha_marginals_file_name = util.path_to_olga + 'default_models/human_T_alpha/model_marginals.txt'
alpha_V_anchor_pos_file = util.path_to_olga + 'default_models/human_T_alpha/V_gene_CDR3_anchors.csv'
alpha_J_anchor_pos_file = util.path_to_olga + 'default_models/human_T_alpha/J_gene_CDR3_anchors.csv'

mus_beta_params_file_name = util.path_to_olga + 'default_models/mouse_T_beta/model_params.txt'
mus_beta_marginals_file_name = util.path_to_olga + 'default_models/mouse_T_beta/model_marginals.txt'
mus_beta_V_anchor_pos_file = util.path_to_olga + 'default_models/mouse_T_beta/V_gene_CDR3_anchors.csv'
mus_beta_J_anchor_pos_file = util.path_to_olga + 'default_models/mouse_T_beta/J_gene_CDR3_anchors.csv'

humanIg_params_file_name = util.path_to_olga + 'default_models/human_B_heavy/model_params.txt'
humanIg_marginals_file_name = util.path_to_olga + 'default_models/human_B_heavy/model_marginals.txt'
humanIg_V_anchor_pos_file = util.path_to_olga + 'default_models/human_B_heavy/V_gene_CDR3_anchors.csv'
humanIg_J_anchor_pos_file = util.path_to_olga + 'default_models/human_B_heavy/J_gene_CDR3_anchors.csv'

#Load models
beta_genomic_data = load_model.GenomicDataVDJ()
beta_genomic_data.load_igor_genomic_data(beta_params_file_name,
                                         beta_V_anchor_pos_file,
                                         beta_J_anchor_pos_file)
beta_generative_model = load_model.GenerativeModelVDJ()
beta_generative_model.load_and_process_igor_model(beta_marginals_file_name)

#alpha_genomic_data = load_model.GenomicDataVDJ()
#alpha_genomic_data.load_igor_genomic_data(alpha_params_file_name, alpha_V_anchor_pos_file, alpha_J_anchor_pos_file)
#alpha_generative_model = load_model.GenerativeModelVDJ()
#alpha_generative_model.load_and_process_igor_model(alpha_marginals_file_name)

mus_beta_genomic_data = load_model.GenomicDataVDJ()
mus_beta_genomic_data.load_igor_genomic_data(mus_beta_params_file_name,
                                             mus_beta_V_anchor_pos_file,
                                             mus_beta_J_anchor_pos_file)
                    custom_pgen_model='universal_model')
qm0 = SoniaLeftposRightpos(
    load_dir='selection_models/emerson_frequency_leftright_1M',
    custom_pgen_model='universal_model')
qm1 = SoniaVJL(load_dir='selection_models/emerson_frequency_vjl_1M',
               custom_pgen_model='universal_model')

# load Evaluate model

main_folder = 'universal_model'
params_file_name = os.path.join(main_folder, 'model_params.txt')
marginals_file_name = os.path.join(main_folder, 'model_marginals.txt')
V_anchor_pos_file = os.path.join(main_folder, 'V_gene_CDR3_anchors.csv')
J_anchor_pos_file = os.path.join(main_folder, 'J_gene_CDR3_anchors.csv')

genomic_data = olga_load_model.GenomicDataVDJ()
genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file,
                                    J_anchor_pos_file)
generative_model = olga_load_model.GenerativeModelVDJ()
generative_model.load_and_process_igor_model(marginals_file_name)
pgen_model = generation_probability.GenerationProbabilityVDJ(
    generative_model, genomic_data)

ev = EvaluateModel(sonia_model=qm, custom_olga_model=pgen_model)
ev0 = EvaluateModel(sonia_model=qm0, custom_olga_model=pgen_model)
ev1 = EvaluateModel(sonia_model=qm1, custom_olga_model=pgen_model)

#evaluate ppost/pgen
energy, pgen, ppost = ev.evaluate_seqs(to_evalutate)
_, _, ppost_left = ev0.evaluate_seqs(to_evalutate)
_, _, ppost_vjl = ev1.evaluate_seqs(to_evalutate)
Exemplo n.º 24
def main():
    """ Evaluate sequences."""
    parser = OptionParser(conflict_handler="resolve")
    
    #specify model
    parser.add_option('--humanTRA', '--human_T_alpha', action='store_true', dest='humanTRA', default=False, help='use default human TRA model (T cell alpha chain)')
    parser.add_option('--humanTRB', '--human_T_beta', action='store_true', dest='humanTRB', default=False, help='use default human TRB model (T cell beta chain)')
    parser.add_option('--mouseTRB', '--mouse_T_beta', action='store_true', dest='mouseTRB', default=False, help='use default mouse TRB model (T cell beta chain)')
    parser.add_option('--humanIGH', '--human_B_heavy', action='store_true', dest='humanIGH', default=False, help='use default human IGH model (B cell heavy chain)')
    parser.add_option('--humanIGK', '--human_B_kappa', action='store_true', dest='humanIGK', default=False, help='use default human IGK model (B cell light kappa chain)')
    parser.add_option('--humanIGL', '--human_B_lambda', action='store_true', dest='humanIGL', default=False, help='use default human IGL model (B cell light lambda chain)')
    parser.add_option('--mouseTRA', '--mouse_T_alpha', action='store_true', dest='mouseTRA', default=False, help='use default mouse TRA model (T cell alpha chain)')

    parser.add_option('--set_custom_model_VDJ', dest='vdj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VDJ generative model')
    parser.add_option('--set_custom_model_VJ', dest='vj_model_folder', metavar='PATH/TO/FOLDER/', help='specify PATH/TO/FOLDER/ for a custom VJ generative model')
    parser.add_option('--sonia_model', type='string', default = 'leftright', dest='model_type' ,help='specify model type: leftright or lengthpos, default is leftright')
    parser.add_option('--epochs', type='int', default = 30, dest='epochs' ,help='number of epochs for inference, default is 30')
    parser.add_option('--batch_size', type='int', default = 5000, dest='batch_size' ,help='size of batch for the stochastic gradient descent')
    parser.add_option('--validation_split', type='float', default = 0.2, dest='validation_split' ,help='fraction of sequences used for validation.')
    parser.add_option('--independent_genes', '--include_indep_genes', action='store_true', dest='independent_genes', default=False, help='Independent gene selection factors q_v*q_j. Default is joint q_vj.')
    parser.add_option('--min_energy_clip', type='float', default=-5, dest='min_energy_clip',  help='Set numerical lower bound to the values of -logQ, default is -5.')
    parser.add_option('--max_energy_clip', type='float', default=10, dest='max_energy_clip', help='Set numerical upper bound to the values of -logQ, default is 10.')

    #location of seqs
    parser.add_option('--seq_in', '--seq_index', type='int', metavar='INDEX', dest='seq_in_index', default = 0, help='specifies sequences to be read in are in column INDEX. Default is index 0 (the first column).')
    parser.add_option('--v_in', '--v_mask_index', type='int', metavar='INDEX', dest='V_mask_index', default=1, help='specifies V_masks are found in column INDEX in the input file. Default is 1.')
    parser.add_option('--j_in', '--j_mask_index', type='int', metavar='INDEX', dest='J_mask_index', default=2, help='specifies J_masks are found in column INDEX in the input file. Default is 2.')

    # input output
    parser.add_option('-i', '--infile', dest = 'infile_name',metavar='PATH/TO/FILE', help='read in CDR3 sequences (and optionally V/J masks) from PATH/TO/FILE')
    parser.add_option('-o', '--outfile', dest = 'outfile_name', metavar='PATH/TO/FILE', help='write CDR3 sequences and pgens to PATH/TO/FILE')
    parser.add_option('-m', '--max_number_of_seqs', type='int',metavar='N', dest='max_number_of_seqs', help='evaluate for at most N sequences.')
    parser.add_option('-n', '--n_gen_seqs', type='int',metavar='N', dest='n_gen_seqs',default=0, help='sample n sequences from gen distribution.')
    parser.add_option('-g', '--infile_gen', dest = 'infile_gen',metavar='PATH/TO/FILE', help='read generated CDR3 sequences (and optionally V/J masks) from PATH/TO/FILE')
    parser.add_option('--lines_to_skip', type='int',metavar='N', dest='lines_to_skip', default = 0, help='skip the first N lines of the file. Default is 0.')
    parser.add_option('--no_report', '--no_plot_report', action='store_false', dest='plot_report', default=True, help='Do not produce report plots of the inferred model.')
    
    #delimeters
    parser.add_option('-d', '--delimiter', type='choice', dest='delimiter',  choices=['tab', 'space', ',', ';', ':'], help="declare infile delimiter. Default is tab for .tsv input files, comma for .csv files, and any whitespace for all others. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('--raw_delimiter', type='str', dest='delimiter', help="declare infile delimiter as a raw string.")
    parser.add_option('--delimiter_out', type='choice', dest='delimiter_out',  choices=['tab', 'space', ',', ';', ':'], help="declare outfile delimiter. Default is tab for .tsv output files, comma for .csv files, and the infile delimiter for all others. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('--raw_delimiter_out', type='str', dest='delimiter_out', help="declare outfile delimiter as a raw string.")
    parser.add_option('--gene_mask_delimiter', type='choice', dest='gene_mask_delimiter',  choices=['tab', 'space', ',', ';', ':'], help="declare gene mask delimiter. Default is comma, unless the infile delimiter is a comma, in which case the default is a semicolon. Choices: 'tab', 'space', ',', ';', ':'")
    parser.add_option('--raw_gene_mask_delimiter', type='str', dest='gene_mask_delimiter', help="declare delimiter of gene masks as a raw string.")
    parser.add_option('--comment_delimiter', type='str', dest='comment_delimiter', help="character or string to indicate comment or header lines to skip.")
    parser.add_option('--seed', type='int',metavar='N', dest='seed', default = None, help='set seed for inference')

    (options, args) = parser.parse_args()

    #set seed
    if options.seed is not None: 
        import tensorflow as tf
        np.random.seed(options.seed)
        tf.random.set_seed(options.seed)

    #Check that the model is specified properly
    
    main_folder = os.path.dirname(__file__)
    
    default_models = {}
    default_models['humanTRA'] = [os.path.join(main_folder, 'default_models', 'human_T_alpha'),  'VJ']
    default_models['humanTRB'] = [os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ']
    default_models['mouseTRB'] = [os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ']
    default_models['humanIGH'] = [os.path.join(main_folder, 'default_models', 'human_B_heavy'), 'VDJ']
    default_models['humanIGK'] = [os.path.join(main_folder, 'default_models', 'human_B_kappa'), 'VJ']
    default_models['humanIGL'] = [os.path.join(main_folder, 'default_models', 'human_B_lambda'),  'VJ']
    default_models['mouseTRA'] = [os.path.join(main_folder, 'default_models', 'mouse_T_alpha'), 'VJ']

    if options.independent_genes:
        independent_genes=True
        joint_genes=False
    else:
        independent_genes=False
        joint_genes=True

    num_models_specified = sum([1 for x in list(default_models.keys()) + ['vj_model_folder', 'vdj_model_folder'] if getattr(options, x)])
    recompute_productive_norm=False
    if num_models_specified == 1: #exactly one model specified
        try:
            d_model = [x for x in default_models.keys() if getattr(options, x)][0]
            model_folder = default_models[d_model][0]
            recomb_type = default_models[d_model][1]
        except IndexError:
            if options.vdj_model_folder: #custom VDJ model specified
                recompute_productive_norm=True
                model_folder = options.vdj_model_folder
                recomb_type = 'VDJ'
            elif options.vj_model_folder: #custom VJ model specified
                recompute_productive_norm=True
                model_folder = options.vj_model_folder
                recomb_type = 'VJ'
    elif num_models_specified == 0:
        print('Need to indicate generative model.')
        print('Exiting...')
        return -1
    elif num_models_specified > 1:
        print('Only specify one model')
        print('Exiting...')
        return -1
    
    if options.max_energy_clip <= options.min_energy_clip:
        print('max_energy_clip must be strictly greater than min_energy_clip.')
        print('Exiting...')
        return -1
    else:
        max_energy_clip = options.max_energy_clip
        min_energy_clip = options.min_energy_clip

    #Generative model specification -- note we'll probably change this syntax to
    #allow for arbitrary model file specification
    params_file_name = os.path.join(model_folder,'model_params.txt')
    marginals_file_name = os.path.join(model_folder,'model_marginals.txt')
    V_anchor_pos_file = os.path.join(model_folder,'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(model_folder,'J_gene_CDR3_anchors.csv')

    for x in [params_file_name, marginals_file_name, V_anchor_pos_file, J_anchor_pos_file]:
        if not os.path.isfile(x):
            print('Cannot find: ' + x)
            print('Please check the files (and naming conventions) in the model folder ' + model_folder)
            print('Exiting...')
            return -1

    #Load up model based on recomb_type
    #VDJ recomb case --- used for TCRB and IGH
    if recomb_type == 'VDJ':
        genomic_data = olga_load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
        generative_model = olga_load_model.GenerativeModelVDJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        pgen_model = generation_probability.GenerationProbabilityVDJ(generative_model, genomic_data)
    #VJ recomb case --- used for TCRA and light chain
    elif recomb_type == 'VJ':
        genomic_data = olga_load_model.GenomicDataVJ()
        genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file, J_anchor_pos_file)
        generative_model = olga_load_model.GenerativeModelVJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        pgen_model = generation_probability.GenerationProbabilityVJ(generative_model, genomic_data)

    if options.infile_name is not None:
        infile_name = options.infile_name

        if not os.path.isfile(infile_name):
            print('Cannot find input file: ' + infile_name)
            print('Exiting...')
            return -1

    if options.outfile_name is not None:
        outfile_name = options.outfile_name
        if os.path.isfile(outfile_name):
            if not input(outfile_name + ' already exists. Overwrite (y/n)? ').strip().lower() in ['y', 'yes']:
                print('Exiting...')
                return -1

    #Parse delimiter
    delimiter = options.delimiter
    if delimiter is None: #Default case
        if options.infile_name is None:
            delimiter = '\t'
        elif infile_name.endswith('.tsv'): #parse TAB separated value file
            delimiter = '\t'
        elif infile_name.endswith('.csv'): #parse COMMA separated value file
            delimiter = ','
    else:
        try:
            delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter]
        except KeyError:
            pass #Other string passed as the delimiter.

    #Parse delimiter_out
    delimiter_out = options.delimiter_out
    if delimiter_out is None: #Default case
        if delimiter is None:
            delimiter_out = '\t'
        else:
            delimiter_out = delimiter
        if options.outfile_name is None:
            pass
        elif outfile_name.endswith('.tsv'): #output TAB separated value file
            delimiter_out = '\t'
        elif outfile_name.endswith('.csv'): #output COMMA separated value file
            delimiter_out = ','
    else:
        try:
            delimiter_out = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[delimiter_out]
        except KeyError:
            pass #Other string passed as the delimiter.

    #Parse gene_delimiter
    gene_mask_delimiter = options.gene_mask_delimiter
    if gene_mask_delimiter is None: #Default case
        gene_mask_delimiter = ','
        if delimiter == ',':
            gene_mask_delimiter = ';'
    else:
        try:
            gene_mask_delimiter = {'tab': '\t', 'space': ' ', ',': ',', ';': ';', ':': ':'}[gene_mask_delimiter]
        except KeyError:
            pass #Other string passed as the delimiter.

    #More options
    seq_in_index = options.seq_in_index #where in the line the sequence is after line.split(delimiter)
    lines_to_skip = options.lines_to_skip #one method of skipping header
    comment_delimiter = options.comment_delimiter #another method of skipping header
    max_number_of_seqs = options.max_number_of_seqs
    V_mask_index = options.V_mask_index #V gene mask column index (default is column 1)
    J_mask_index = options.J_mask_index #J gene mask column index (default is column 2)
    skip_empty=True # skip empty lines
    if options.infile_name is None: #no infile specified -- an input file is required
        print('ERROR: specify input file.')
        return -1
    else:
        seqs = []
        V_usage_masks = []
        J_usage_masks = []
        print('Reading input file.')
        infile = open(infile_name, 'r')

        for i, line in enumerate(tqdm(infile)):
            if comment_delimiter is not None: #a comment delimiter was given -- skip comment/header lines
                if line.startswith(comment_delimiter): #allow comments
                    continue
            if i < lines_to_skip:
                continue

            if delimiter is None: #Default delimiter is any whitespace
                split_line = line.split('\n')[0].split()
            else:
                split_line = line.split('\n')[0].split(delimiter)
            #Find the seq
            try:
                seq = split_line[seq_in_index].strip()
                if len(seq.strip()) == 0:
                    if skip_empty:
                        continue
                    else:
                        seqs.append(seq) #keep the blank seq as a placeholder
                        #seq_types.append('aaseq')
                else:
                    seqs.append(seq)
                    #seq_types.append(determine_seq_type(seq, aa_alphabet))
            except IndexError: #no index match for seq
                if skip_empty and len(line.strip()) == 0:
                    continue
                print('seq_in_index is out of range')
                print('Exiting...')
                infile.close()
                return -1

            #Find and format V_usage_mask
            if V_mask_index is None:
                V_usage_masks.append(None) #default mask
            else:
                try:
                    V_usage_mask = split_line[V_mask_index].strip().split(gene_mask_delimiter)
                    #check that all V gene/allele names are recognized
                    if all([gene_to_num_str(v, 'V') in pgen_model.V_mask_mapping for v in V_usage_mask]):
                        V_usage_masks.append(V_usage_mask)
                    else:
                        print(str(V_usage_mask) + " is not a usable V_usage_mask composed exclusively of recognized V gene/allele names")
                        print('Unrecognized V gene/allele names: ' + ', '.join([v for v in V_usage_mask if gene_to_num_str(v, 'V') not in pgen_model.V_mask_mapping.keys()]))
                        print('Continuing but inference might be biased...')
                        V_usage_masks.append(V_usage_mask)
                        #infile.close()
                        #return -1
                except IndexError: #no index match for V_mask_index
                    print('V_mask_index is out of range, check the delimiter.')
                    print('Exiting...')
                    infile.close()
                    return -1

            #Find and format J_usage_mask
            if J_mask_index is None:
                J_usage_masks.append(None) #default mask
            else:
                try:
                    J_usage_mask = split_line[J_mask_index].strip().split(gene_mask_delimiter)
                    #check that all J gene/allele names are recognized
                    if all([gene_to_num_str(j, 'J') in pgen_model.J_mask_mapping for j in J_usage_mask]):
                        J_usage_masks.append(J_usage_mask)
                    else:
                        print(str(J_usage_mask) + " is not a usable J_usage_mask composed exclusively of recognized J gene/allele names")
                        print('Unrecognized J gene/allele names: ' + ', '.join([j for j in J_usage_mask if gene_to_num_str(j, 'J') not in pgen_model.J_mask_mapping.keys()]))
                        print('Continuing but inference might be biased...')
                        J_usage_masks.append(J_usage_mask)

                        #infile.close()
                        #return -1
                except IndexError: #no index match for J_mask_index
                    print('J_mask_index is out of range, check the delimiter.')
                    print('Exiting...')
                    infile.close()
                    return -1

            if max_number_of_seqs is not None:
                if len(seqs) >= max_number_of_seqs:
                    break

        data_seqs=[[seqs[i],V_usage_masks[i][0],J_usage_masks[i][0]] for i in range(len(seqs))]
        #define number of gen_seqs:
        gen_seqs=[]
        n_gen_seqs=options.n_gen_seqs
        generate_sequences=False
        if options.infile_gen is None:
            generate_sequences=True
            if n_gen_seqs == 0: n_gen_seqs=np.max([int(3e5),3*len(data_seqs)])
        else:
            seqs = []
            V_usage_masks = []
            J_usage_masks = []
            print('Reading file of generated sequences.')
            infile = open(options.infile_gen, 'r')

            for i, line in enumerate(tqdm(infile)):
                if comment_delimiter is not None: #a comment delimiter was given -- skip comment/header lines
                    if line.startswith(comment_delimiter): #allow comments
                        continue
                if i < lines_to_skip:
                    continue

                if delimiter is None: #Default delimiter is any whitespace
                    split_line = line.split('\n')[0].split()
                else:
                    split_line = line.split('\n')[0].split(delimiter)
                #Find the seq
                try:
                    seq = split_line[seq_in_index].strip()
                    if len(seq.strip()) == 0:
                        if skip_empty:
                            continue
                        else:
                            seqs.append(seq) #keep the blank seq as a placeholder
                            #seq_types.append('aaseq')
                    else:
                        seqs.append(seq)
                        #seq_types.append(determine_seq_type(seq, aa_alphabet))
                except IndexError: #no index match for seq
                    if skip_empty and len(line.strip()) == 0:
                        continue
                    print('seq_in_index is out of range')
                    print('Exiting...')
                    infile.close()
                    return -1

                #Find and format V_usage_mask
                if V_mask_index is None:
                    V_usage_masks.append(None) #default mask
                else:
                    try:
                        V_usage_mask = split_line[V_mask_index].strip().split(gene_mask_delimiter)
                        #check that all V gene/allele names are recognized
                        if all([gene_to_num_str(v, 'V') in pgen_model.V_mask_mapping for v in V_usage_mask]):
                            V_usage_masks.append(V_usage_mask)
                        else:
                            print(str(V_usage_mask) + " is not a usable V_usage_mask composed exclusively of recognized V gene/allele names")
                            print('Unrecognized V gene/allele names: ' + ', '.join([v for v in V_usage_mask if gene_to_num_str(v, 'V') not in pgen_model.V_mask_mapping.keys()]))
                            print('Continuing but inference might be biased...')
                            V_usage_masks.append(V_usage_mask)
                            #infile.close()
                            #return -1
                    except IndexError: #no index match for V_mask_index
                        print('V_mask_index is out of range, check the delimiter.')
                        print('Exiting...')
                        infile.close()
                        return -1

                #Find and format J_usage_mask
                if J_mask_index is None:
                    J_usage_masks.append(None) #default mask
                else:
                    try:
                        J_usage_mask = split_line[J_mask_index].strip().split(gene_mask_delimiter)
                        #check that all J gene/allele names are recognized
                        if all([gene_to_num_str(j, 'J') in pgen_model.J_mask_mapping for j in J_usage_mask]):
                            J_usage_masks.append(J_usage_mask)
                        else:
                            print(str(J_usage_mask) + " is not a usable J_usage_mask composed exclusively of recognized J gene/allele names")
                            print('Unrecognized J gene/allele names: ' + ', '.join([j for j in J_usage_mask if gene_to_num_str(j, 'J') not in pgen_model.J_mask_mapping.keys()]))
                            print('Continuing but inference might be biased...')
                            J_usage_masks.append(J_usage_mask)

                            #infile.close()
                            #return -1
                    except IndexError: #no index match for J_mask_index
                        print('J_mask_index is out of range, check the delimiter.')
                        print('Exiting...')
                        infile.close()
                        return -1

            gen_seqs=[[seqs[i],V_usage_masks[i][0],J_usage_masks[i][0]] for i in range(len(seqs))]
        # combine sequences.
        print('Initialise Model.')

        # choose sonia model type
        if options.model_type=='leftright': 
            sonia_model=SoniaLeftposRightpos(data_seqs=data_seqs,
                                             gen_seqs=gen_seqs,
                                             custom_pgen_model=model_folder,
                                             vj=recomb_type == 'VJ',
                                             include_joint_genes=joint_genes,
                                             include_indep_genes=independent_genes,
                                             min_energy_clip=min_energy_clip,
                                             max_energy_clip=max_energy_clip
                                            )
        elif options.model_type=='lengthpos':
            sonia_model=SoniaLengthPos(data_seqs=data_seqs,
                                       gen_seqs=gen_seqs,
                                       custom_pgen_model=model_folder,
                                       vj=recomb_type == 'VJ',
                                       include_joint_genes=joint_genes,
                                       include_indep_genes=independent_genes,
                                       min_energy_clip=min_energy_clip,
                                       max_energy_clip=max_energy_clip
                                      )
        else:
            print('ERROR: model type must be either leftright or lengthpos.')
            print('Exiting...')
            return -1

        if generate_sequences: sonia_model.add_generated_seqs(n_gen_seqs,custom_model_folder=model_folder) 

        if recompute_productive_norm: sonia_model.norm_productive=pgen_model.compute_regex_CDR3_template_pgen('CX{0,}')
        
        print('Model initialised. Starting inference.')
        sonia_model.infer_selection(epochs=options.epochs,verbose=1,batch_size=options.batch_size,validation_split=options.validation_split)
        print('Saving model.')
        if options.outfile_name is not None: #OUTFILE SPECIFIED
            sonia_model.save_model(options.outfile_name)
            if options.plot_report:
                from sonia.plotting import Plotter
                pl=Plotter(sonia_model)
                pl.plot_model_learning(os.path.join(options.outfile_name, 'model_learning.png'))
                pl.plot_vjl(os.path.join(options.outfile_name, 'marginals.png'))
                pl.plot_logQ(os.path.join(options.outfile_name, 'log_Q.png'))
                pl.plot_ratioQ(os.path.join(options.outfile_name, 'Q_ratio.png'))

        else: #no outfile specified -- save to the default folder 'sonia_model'
            sonia_model.save_model('sonia_model')
            if options.plot_report:
                from sonia.plotting import Plotter
                pl=Plotter(sonia_model)
                pl.plot_model_learning(os.path.join('sonia_model', 'model_learning.png'))
                pl.plot_vjl(os.path.join('sonia_model', 'marginals.png'))
                pl.plot_logQ(os.path.join('sonia_model', 'log_Q.png'))
                pl.plot_ratioQ(os.path.join('sonia_model', 'Q_ratio.png'))
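#A minimal programmatic sketch of the same inference step (hedged: assumes data_seqs
#and gen_seqs are already lists of [CDR3_aa, V_gene, J_gene] entries and model_folder
#points to an OLGA model folder, as prepared above):
#
#    qm = SoniaLeftposRightpos(data_seqs=data_seqs, gen_seqs=gen_seqs, custom_pgen_model=model_folder)
#    qm.infer_selection(epochs=30, batch_size=5000, validation_split=0.2)
#    qm.save_model('sonia_model')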
Exemplo n.º 25
0
def main():
    """ Generate sequences."""

    parser = OptionParser(conflict_handler="resolve")

    parser.add_option('--humanTRA',
                      '--human_T_alpha',
                      action='store_true',
                      dest='humanTRA',
                      default=False,
                      help='use default human TRA model (T cell alpha chain)')
    parser.add_option('--humanTRB',
                      '--human_T_beta',
                      action='store_true',
                      dest='humanTRB',
                      default=False,
                      help='use default human TRB model (T cell beta chain)')
    parser.add_option('--mouseTRB',
                      '--mouse_T_beta',
                      action='store_true',
                      dest='mouseTRB',
                      default=False,
                      help='use default mouse TRB model (T cell beta chain)')
    parser.add_option('--humanIGH',
                      '--human_B_heavy',
                      action='store_true',
                      dest='humanIGH',
                      default=False,
                      help='use default human IGH model (B cell heavy chain)')
    parser.add_option(
        '--VDJ_model_folder',
        dest='vdj_model_folder',
        metavar='PATH/TO/FOLDER/',
        help='specify PATH/TO/FOLDER/ for a custom VDJ generative model')
    parser.add_option(
        '--VJ_model_folder',
        dest='vj_model_folder',
        metavar='PATH/TO/FOLDER/',
        help='specify PATH/TO/FOLDER/ for a custom VJ generative model')
    parser.add_option('-o',
                      '--outfile',
                      dest='outfile_name',
                      metavar='PATH/TO/FILE',
                      help='write CDR3 sequences to PATH/TO/FILE')

    parser.add_option('-n',
                      '--num_seqs',
                      type='float',
                      metavar='N',
                      default=0,
                      dest='num_seqs_to_generate',
                      help='specify the number of sequences to generate.')
    parser.add_option(
        '--seed',
        type='int',
        dest='seed',
        help=
        'set seed for pseudorandom number generator. Default is to not set a seed.'
    )
    parser.add_option(
        '--seqs_per_time_update',
        type='float',
        default=100000,
        dest='seqs_per_time_update',
        help=
        'specify the number of sequences between time updates. Default is 1e5')
    parser.add_option('--conserved_J_residues',
                      type='string',
                      default='FVW',
                      dest='conserved_J_residues',
                      help="specify conserved J residues. Default is 'FVW'.")
    parser.add_option('--time_updates_off',
                      action='store_false',
                      dest='time_updates',
                      default=True,
                      help='turn time updates off.')
    parser.add_option(
        '--seq_type',
        type='choice',
        default='all',
        dest='seq_type',
        choices=['all', 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'],
        help=
        "declare sequence type for output sequences. Choices: 'all' [default], 'ntseq', 'nucleotide', 'aaseq', 'amino_acid'"
    )
    parser.add_option('--record_genes_off',
                      action='store_false',
                      dest="record_genes",
                      default=True,
                      help='turn off recording V and J gene info.')
    parser.add_option(
        '-d',
        '--delimiter',
        type='choice',
        dest='delimiter',
        choices=['tab', 'space', ',', ';', ':'],
        help=
        "declare delimiter choice. Default is tab for .tsv output files, comma for .csv files, and tab for all others. Choices: 'tab', 'space', ',', ';', ':'"
    )
    parser.add_option('--raw_delimiter',
                      type='str',
                      dest='delimiter',
                      help="declare delimiter choice as a raw string.")

    (options, args) = parser.parse_args()

    main_folder = os.path.dirname(__file__)

    default_models = {}
    default_models['humanTRA'] = [
        os.path.join(main_folder, 'default_models', 'human_T_alpha'), 'VJ'
    ]
    default_models['humanTRB'] = [
        os.path.join(main_folder, 'default_models', 'human_T_beta'), 'VDJ'
    ]
    default_models['mouseTRB'] = [
        os.path.join(main_folder, 'default_models', 'mouse_T_beta'), 'VDJ'
    ]
    default_models['humanIGH'] = [
        os.path.join(main_folder, 'default_models', 'human_B_heavy'), 'VDJ'
    ]

    num_models_specified = sum([
        1 for x in list(default_models.keys()) +
        ['vj_model_folder', 'vdj_model_folder'] if getattr(options, x)
    ])

    if num_models_specified == 1:  #exactly one model specified
        try:
            d_model = [
                x for x in list(default_models.keys()) if getattr(options, x)
            ][0]
            model_folder = default_models[d_model][0]
            recomb_type = default_models[d_model][1]
        except IndexError:
            if options.vdj_model_folder:  #custom VDJ model specified
                model_folder = options.vdj_model_folder
                recomb_type = 'VDJ'
            elif options.vj_model_folder:  #custom VJ model specified
                model_folder = options.vj_model_folder
                recomb_type = 'VJ'
    elif num_models_specified == 0:
        print('Need to indicate generative model.')
        print('Exiting...')
        return -1
    elif num_models_specified > 1:
        print('Only specify one model')
        print('Exiting...')
        return -1

    #Check that all model and genomic files exist in the indicated model folder
    if not os.path.isdir(model_folder):
        print('Check pathing... cannot find the model folder: ' + model_folder)
        print('Exiting...')
        return -1

    params_file_name = os.path.join(model_folder, 'model_params.txt')
    marginals_file_name = os.path.join(model_folder, 'model_marginals.txt')
    V_anchor_pos_file = os.path.join(model_folder, 'V_gene_CDR3_anchors.csv')
    J_anchor_pos_file = os.path.join(model_folder, 'J_gene_CDR3_anchors.csv')

    for x in [
            params_file_name, marginals_file_name, V_anchor_pos_file,
            J_anchor_pos_file
    ]:
        if not os.path.isfile(x):
            print('Cannot find: ' + x)
            print(
                'Please check the files (and naming conventions) in the model folder '
                + model_folder)
            print('Exiting...')
            return -1

    if options.outfile_name is not None:
        outfile_name = options.outfile_name
        if os.path.isfile(outfile_name):
            if not input(outfile_name + ' already exists. Overwrite (y/n)? '
                         ).strip().lower() in ['y', 'yes']:
                print('Exiting...')
                return -1

    #Parse arguments

    num_seqs_to_generate = int(options.num_seqs_to_generate)

    if num_seqs_to_generate <= 0:
        print('Need to specify num_seqs (number of sequences to generate).')
        print('Exiting...')
        return -1

    #Parse default delimiter
    delimiter = options.delimiter
    if delimiter is None:
        delimiter = '\t'
        if options.outfile_name is not None:
            if outfile_name.endswith('.tsv'):
                delimiter = '\t'
            elif outfile_name.endswith('.csv'):
                delimiter = ','
    else:
        try:
            delimiter = {
                'tab': '\t',
                'space': ' ',
                ',': ',',
                ';': ';',
                ':': ':'
            }[delimiter]
        except KeyError:
            pass  #Other raw string.

    #Optional flags
    seq_type = {
        'all': 'all',
        'ntseq': 'ntseq',
        'nucleotide': 'ntseq',
        'aaseq': 'aaseq',
        'amino_acid': 'aaseq'
    }[options.seq_type]
    record_genes = options.record_genes
    seqs_per_time_update = int(options.seqs_per_time_update)
    time_updates = options.time_updates
    conserved_J_residues = options.conserved_J_residues

    if options.seed is not None:
        np.random.seed(options.seed)

    #VDJ recomb case --- used for TCRB and IGH
    if recomb_type == 'VDJ':
        genomic_data = load_model.GenomicDataVDJ()
        genomic_data.load_igor_genomic_data(params_file_name,
                                            V_anchor_pos_file,
                                            J_anchor_pos_file)
        generative_model = load_model.GenerativeModelVDJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        seq_gen = sequence_generation.SequenceGenerationVDJ(
            generative_model, genomic_data)
    #VJ recomb case --- used for TCRA and light chain
    elif recomb_type == 'VJ':
        genomic_data = load_model.GenomicDataVJ()
        genomic_data.load_igor_genomic_data(params_file_name,
                                            V_anchor_pos_file,
                                            J_anchor_pos_file)
        generative_model = load_model.GenerativeModelVJ()
        generative_model.load_and_process_igor_model(marginals_file_name)
        seq_gen = sequence_generation.SequenceGenerationVJ(
            generative_model, genomic_data)

    V_gene_names = [V[0].split('*')[0] for V in genomic_data.genV]
    J_gene_names = [J[0].split('*')[0] for J in genomic_data.genJ]

    if options.outfile_name is not None:
        outfile = open(outfile_name, 'w')

        print('Starting sequence generation... ')
        start_time = time.time()
        for i in range(num_seqs_to_generate):
            ntseq, aaseq, V_in, J_in = seq_gen.gen_rnd_prod_CDR3(
                conserved_J_residues)
            if seq_type == 'all':  #default, include both ntseq and aaseq
                current_line_out = ntseq + delimiter + aaseq
            elif seq_type == 'ntseq':  #only record ntseq
                current_line_out = ntseq
            elif seq_type == 'aaseq':  #only record aaseq
                current_line_out = aaseq

            if record_genes:
                current_line_out += delimiter + V_gene_names[
                    V_in] + delimiter + J_gene_names[J_in]
            outfile.write(current_line_out + '\n')

            if (i + 1) % seqs_per_time_update == 0 and time_updates:
                c_time = time.time() - start_time
                eta = ((num_seqs_to_generate -
                        (i + 1)) / float(i + 1)) * c_time
                if c_time > 86400:  #more than a day
                    c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.' % (
                        int(c_time) / 86400, (int(c_time) / 3600) % 24,
                        (int(c_time) / 60) % 60, c_time % 60)
                elif c_time > 3600:  #more than an hr
                    c_time_str = '%d hours, %d minutes, and %.2f seconds.' % (
                        (int(c_time) / 3600) % 24,
                        (int(c_time) / 60) % 60, c_time % 60)
                elif c_time > 60:  #more than a min
                    c_time_str = '%d minutes and %.2f seconds.' % (
                        (int(c_time) / 60) % 60, c_time % 60)
                else:
                    c_time_str = '%.2f seconds.' % (c_time)

                if eta > 86400:  #more than a day
                    eta_str = '%d days, %d hours, %d minutes, and %.2f seconds.' % (
                        int(eta) / 86400, (int(eta) / 3600) % 24,
                        (int(eta) / 60) % 60, eta % 60)
                elif eta > 3600:  #more than an hr
                    eta_str = '%d hours, %d minutes, and %.2f seconds.' % (
                        (int(eta) / 3600) % 24, (int(eta) / 60) % 60, eta % 60)
                elif eta > 60:  #more than a min
                    eta_str = '%d minutes and %.2f seconds.' % (
                        (int(eta) / 60) % 60, eta % 60)
                else:
                    eta_str = '%.2f seconds.' % (eta)

                print(
                    '%d sequences generated in %s Estimated time remaining: %s'
                    % (i + 1, c_time_str, eta_str))

        c_time = time.time() - start_time
        if c_time > 86400:  #more than a day
            c_time_str = '%d days, %d hours, %d minutes, and %.2f seconds.' % (
                int(c_time) / 86400, (int(c_time) / 3600) % 24,
                (int(c_time) / 60) % 60, c_time % 60)
        elif c_time > 3600:  #more than an hr
            c_time_str = '%d hours, %d minutes, and %.2f seconds.' % (
                (int(c_time) / 3600) % 24,
                (int(c_time) / 60) % 60, c_time % 60)
        elif c_time > 60:  #more than a min
            c_time_str = '%d minutes and %.2f seconds.' % (
                (int(c_time) / 60) % 60, c_time % 60)
        else:
            c_time_str = '%.2f seconds.' % (c_time)
        print('Completed generating all %d sequences in %s' %
              (num_seqs_to_generate, c_time_str))
        outfile.close()

    else:  #print to stdout
        for i in range(num_seqs_to_generate):
            ntseq, aaseq, V_in, J_in = seq_gen.gen_rnd_prod_CDR3(
                conserved_J_residues)
            if seq_type == 'all':  #default, include both ntseq and aaseq
                current_line_out = ntseq + delimiter + aaseq
            elif seq_type == 'ntseq':  #only record ntseq
                current_line_out = ntseq
            elif seq_type == 'aaseq':  #only record aaseq
                current_line_out = aaseq

            if record_genes:
                current_line_out += delimiter + V_gene_names[
                    V_in] + delimiter + J_gene_names[J_in]
            print(current_line_out)
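#The elapsed-time/ETA formatting above is repeated three times; a hedged refactoring
#sketch (hypothetical helper name format_time) that mirrors the same logic:
def format_time(seconds):
    """Format a duration in seconds as days/hours/minutes/seconds."""
    secs = int(seconds)
    if seconds > 86400:  #more than a day
        return '%d days, %d hours, %d minutes, and %.2f seconds.' % (
            secs // 86400, (secs // 3600) % 24, (secs // 60) % 60, seconds % 60)
    elif seconds > 3600:  #more than an hour
        return '%d hours, %d minutes, and %.2f seconds.' % (
            (secs // 3600) % 24, (secs // 60) % 60, seconds % 60)
    elif seconds > 60:  #more than a minute
        return '%d minutes and %.2f seconds.' % ((secs // 60) % 60, seconds % 60)
    return '%.2f seconds.' % seconds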
Exemplo n.º 26
0
import olga.load_model as load_model
import olga.generation_probability as pgen
import olga.sequence_generation as seq_gen

#%%

path = '/home/heli/ENV/lib/python2.7/site-packages/olga/'

#Define the files for loading in generative model/data
params_file_name = path + 'default_models/human_T_beta/model_params.txt'
marginals_file_name = path + 'default_models/human_T_beta/model_marginals.txt'
V_anchor_pos_file = path + 'default_models/human_T_beta/V_gene_CDR3_anchors.csv'
J_anchor_pos_file = path + 'default_models/human_T_beta/J_gene_CDR3_anchors.csv'

#Load data
genomic_data = load_model.GenomicDataVDJ()
genomic_data.load_igor_genomic_data(params_file_name, V_anchor_pos_file,
                                    J_anchor_pos_file)
#Load model
generative_model = load_model.GenerativeModelVDJ()
generative_model.load_and_process_igor_model(marginals_file_name)

#Process model/data for pgen computation by instantiating GenerationProbabilityVDJ
pgen_model = pgen.GenerationProbabilityVDJ(generative_model, genomic_data)

#example
#calculating pgen with restriction to V, J gene usage
pgen_model.compute_aa_CDR3_pgen('CAWSVAPDRGGYTF', 'TRBV30*01', 'TRBJ1-2*01')
#calculating pgen without restriction to V, J gene usage
pgen_model.compute_aa_CDR3_pgen('CAWSVAPDRGGYTF')
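
#The sequence_generation module imported above as seq_gen is not otherwise used here;
#as a hedged sketch (same API calls as in the generation example earlier in this file),
#the loaded model can also be used to draw a productive CDR3:
seq_gen_model = seq_gen.SequenceGenerationVDJ(generative_model, genomic_data)
ntseq, aaseq, V_in, J_in = seq_gen_model.gen_rnd_prod_CDR3('FVW')  #conserved J residues, as in the CLI default
print(ntseq, aaseq, genomic_data.genV[V_in][0], genomic_data.genJ[J_in][0])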