def compute_protein_coverage(experiment1: Experiment, experiment2: Experiment,
                             progress_bar: bool = True) -> Dict[str, Dict[str, np.ndarray]]:
    """Collect the mapped coverage of every protein observed in both experiments.

    Args:
        experiment1 (Experiment): The first experiment, holding the proteins and peptides
            derived from the first HLA-set.
        experiment2 (Experiment): The second experiment, holding the proteins and peptides
            derived from the second HLA-set.
        progress_bar (bool): Kept for backward compatibility. Both branches of the previous
            implementation were identical, so the flag has no effect on the result.
            Defaults to True.

    Returns:
        Dict[str, Dict[str, np.ndarray]]: A nested dict keyed by protein identifier. Each value
        is a dict with (up to) two entries, keyed by the '_'-joined allele names of each
        experiment's HLA-set, holding whatever ``get_mapped_protein`` returns for that protein
        in the respective experiment. Only proteins present in both experiments are included.

    Note:
        If both experiments yield the same joined HLA-set label, the two entries share one key
        and the value of ``experiment2`` overwrites the one of ``experiment1``.
    """
    # set() keeps the intersection working whether get_proteins returns a set or a list.
    shared_proteins = set(experiment1.get_proteins()).intersection(experiment2.get_proteins())
    # The labels do not depend on the protein, so build them once instead of per iteration.
    label_one: str = '_'.join(experiment1.get_hla_set().get_names())
    label_two: str = '_'.join(experiment2.get_hla_set().get_names())
    return {
        protein: {
            label_one: experiment1.get_mapped_protein(protein),
            label_two: experiment2.get_mapped_protein(protein),
        }
        for protein in shared_proteins
    }
def get_binnary_protein_overlap(exp1: Experiment, exp2: Experiment) -> Proteins:
    """Find the proteins that are present in both experiments.

    :param exp1: the first experiment of the pair.
    :type exp1: Experiment
    :param exp2: the second experiment of the pair.
    :type exp2: Experiment
    :return: a list with the proteins returned by ``get_proteins`` of both experiments; \
    the order is whatever iterating over the intersection yields.
    :rtype: Proteins
    """
    shared_proteins = exp1.get_proteins().intersection(exp2.get_proteins())
    return [*shared_proteins]
def get_binnary_peptide_overlap(exp1: Experiment, exp2: Experiment) -> Peptides:
    """Find the peptides that are present in both experiments.

    :param exp1: the first experiment of the pair.
    :type exp1: Experiment
    :param exp2: the second experiment of the pair.
    :type exp2: Experiment
    :return: a list with the peptides returned by ``get_peptides`` of both experiments; \
    the order is whatever iterating over the intersection yields.
    :rtype: Peptides
    """
    shared_peptides = exp1.get_peptides().intersection(exp2.get_peptides())
    return [*shared_peptides]
def load_data(self, exp: Experiment, num_proteins: int = -1) -> None:
    """Load the gene ids of an experiment into the engine so GOEA can be conducted.

    The proteins are taken from the first column of ``exp.get_peptides_per_protein()``,
    mapped to Entrez gene ids and stored as a list of ints in ``self._gene_ids``.

    Args:
        exp (Experiment): The experiment to extract the uniprot ids from.
        num_proteins (int, optional): The number of proteins to include. Defaults to -1,
            which means all proteins are used; any other value keeps the first
            ``num_proteins`` rows of the peptides-per-protein table.
            NOTE(review): presumably that table is sorted by peptide count, so the first
            10 rows are the 10 proteins with the most peptides -- confirm against
            ``Experiment.get_peptides_per_protein``.

    Raises:
        ValueError: if the engine still holds gene ids from a previous call.
    """
    if self._gene_ids is not None:
        # The ids are stored as ints, so they must be stringified before joining;
        # joining them directly raised a TypeError instead of this ValueError.
        loaded_preview = ','.join(str(gene_id) for gene_id in self._gene_ids[:10])
        raise ValueError(
            f"There some data still in the engine, the first 10 genes are: {loaded_preview} "
            "clean your engine from previous data using the function, clean_engine and try again."
        )
    print(
        f"Getting the number of peptide per protein ..., started at: {time.ctime()}"
    )
    peptides_per_protein = exp.get_peptides_per_protein()
    list_proteins = peptides_per_protein.iloc[:, 0].to_list()
    if num_proteins != -1:
        list_proteins = list_proteins[:num_proteins]
    print(
        f"Map uniprot to Entrez gene ids ..., starting at: {time.ctime()}")
    # The second column of the mapping table holds the Entrez gene ids.
    mapped_ids = map_from_uniprot_to_Entrez_Gene(list_proteins).iloc[:, 1].to_list()
    self._gene_ids = [int(gene_id) for gene_id in mapped_ids]
    print(f"{len(self._gene_ids)} Genes have been correctly loaded")
def simulate_random_experiment(alleles: List[str],
                               path2fasta: str,
                               tissue_name: str = 'TEST_TISSUE',
                               num_pep: int = 10,
                               num_prot: int = 5,
                               proband_name: str = None) -> Experiment:
    """Simulate a random experiment object.

    :param alleles: a list of allele names used to build the HLA set.
    :type alleles: List[str]
    :param path2fasta: the path to the fasta file, used both to simulate the identification \
    table and to build the sequence database.
    :type path2fasta: str
    :param tissue_name: the name of the tissue, defaults to 'TEST_TISSUE'
    :type tissue_name: str, optional
    :param num_pep: the number of peptides in the table, defaults to 10
    :type num_pep: int, optional
    :param num_prot: the number of proteins in the table, defaults to 5
    :type num_prot: int, optional
    :param proband_name: the name of the proband, defaults to None, in which case a random \
    name is generated with ``generate_random_name(12)``.
    :type proband_name: str, optional
    :return: a simulated experimental object
    :rtype: Experiment
    """
    if proband_name is None:
        proband_name = generate_random_name(12)
    proband: Proband = Proband(name=proband_name)
    hla_set: HLASet = HLASet(alleles)
    # num_prot used to be ignored because num_pep was passed twice.
    # NOTE(review): assumes the helper's third positional parameter is the protein
    # count -- confirm against its signature.
    ident_table: pd.DataFrame = simulate_an_experimental_ident_table_from_fasta(
        path2fasta, num_pep, num_prot)
    # to be upgraded to the new version of the Tissue class
    tissue: Tissue = Tissue(tissue_name,
                            simulate_an_expression_table(num_transcripts=1000),
                            simulate_an_expression_table(num_transcripts=100))
    database: SeqDB = SeqDB(path2fasta)
    return Experiment(proband=proband,
                      hla_set=hla_set,
                      tissue=tissue,
                      database=database,
                      ident_table=ident_table)
def compute_jaccard_index(exp1: Experiment, exp2: Experiment, level: str = 'peptide') -> float:
    """Compute the Jaccard index between two samples.

    Args:
        exp1 (Experiment): The first experimental instance.
        exp2 (Experiment): The second experimental instance.
        level (str): The level at which the overlap is computed, either 'peptide' or
            'protein'. Defaults to 'peptide'.

    Raises:
        ValueError: if ``level`` is neither 'peptide' nor 'protein'.

    Returns:
        float: The size of the intersection divided by the size of the union of the two
        collections at the requested level. When both collections are empty the union is
        empty as well and 0.0 is returned instead of dividing by zero.
    """
    # Fetch each collection once; the level is validated before any getter is called.
    if level == 'peptide':
        first, second = exp1.get_peptides(), exp2.get_peptides()
    elif level == 'protein':
        first, second = exp1.get_proteins(), exp2.get_proteins()
    else:
        raise ValueError(f"Level: {level} is not supported, currently only level, peptide and protein are supported")
    union_size = len(first.union(second))
    if union_size == 0:
        # Nothing was observed in either sample, so there is no overlap to report.
        return 0.0
    return len(first.intersection(second)) / union_size
def __init__(
        self,
        filepath: str,
        path2fasta: str,
        fileformat: str = 'idXML',
        tissue_name: str = 'total PMBC',
        proband_name: str = 'Default Proband',
        hla_set: List[str] = ['DRB1*15:01', 'DRB1*15:01']) -> cExperimet:
    """Construct an experimental dataset from user-defined parameters.

    The wrapper initializes the proband, HLA set, sequence database, expression and
    location databases, tissue and identification table, and combines them into an
    Experiment instance, providing an easy-to-use interface for working with
    immunopeptidomics data.

    Args:
        filepath (str): the path to the input file, for example an idXML file or an
            identification table.
        path2fasta (str): the path to the fasta database.
        fileformat (str, optional): the input format, one of 'idXML', 'pepXML', 'mzTab'
            or 'csv'. Defaults to 'idXML'.
        tissue_name (str, optional): the name of the tissue. NOTE: currently not used, the
            tissue is always created with the name 'small intestine'.
            Defaults to 'total PMBC'.
        proband_name (str, optional): the name of the proband from whom the data was
            obtained. Defaults to 'Default Proband'.
        hla_set (List[str], optional): a list of HLA alleles. NOTE: currently not used, the
            HLA set is always created from the placeholder allele 'HLA-DRB1*15:01'. The list
            default is never mutated here. Defaults to ['DRB1*15:01','DRB1*15:01'].

    Raises:
        ValueError: if ``filepath`` or ``path2fasta`` does not exist, if ``fileformat`` is not
            supported, or if parsing the input file fails.
        RuntimeError: if creating the HLA set, the tissue or the experiment fails.
        IOError: if loading the fasta database fails.

    Returns:
        cExperimet: annotation kept from the original signature; the constructor itself
        returns nothing. The wrapped Experiment is stored on ``self._exp``.
    """
    supported_formats = ('idXML', 'pepXML', 'csv', 'mzTab')
    ## Checking that the input is correct
    if not os.path.exists(filepath):
        raise ValueError(
            f"The provided path for the identification file : {filepath} does not exist!!"
        )
    if not os.path.exists(path2fasta):
        raise ValueError(
            f"The path to the proivded fasta file: {path2fasta}, does not exists!!!"
        )
    if fileformat not in supported_formats:
        # The message is built from the tuple that is actually checked, so it can no
        # longer advertise a value ('IdTable') that the check rejects.
        raise ValueError(
            f"Unknow input format, the provided format: {fileformat} is not supported, currently supported values are: {', '.join(supported_formats)}"
        )
    # define the data
    self._proband = Proband(name=proband_name)  # the name of the proband
    try:
        # NOTE(review): placeholder allele; the hla_set argument is not used here.
        self._hLASet = HLASet(hlas=['HLA-DRB1*15:01'])
    except Exception as exp:
        raise RuntimeError(
            f"The following error was Encountered while creating an HLASet: \n{str(exp)}\n"
        ) from exp
    try:
        self._seqBD = SeqDB(path2fasta=path2fasta)
    except Exception as exp:
        raise IOError(
            f"While loading the fasta database the following error was Encountered : \n{str(exp)}\n"
        ) from exp
    # Both databases are created with their default arguments; per the original author's
    # note these are the Human Protein Atlas tables (normal tissue data and subcellular
    # location data, https://www.proteinatlas.org/about/download).
    self._expresson_profile = GeneExpressionDB()
    self._protein_locations = CellularLocationDB()
    try:
        # NOTE(review): the tissue name is hard-coded; the tissue_name argument is not used.
        self._tissue = Tissue(name='small intestine',
                              main_exp_value=self._expresson_profile,
                              main_location=self._protein_locations)
    except Exception as exp:
        raise RuntimeError(
            f"While creating a tissue instance, the following error was Encountered: \n{str(exp)}\n"
        ) from exp
    try:
        if fileformat == 'idXML':
            input_table = parse_xml_based_format_to_identification_table(
                path2XML_file=filepath,
                path2fastaDB=path2fasta,
                is_idXML=True)
        elif fileformat == 'pepXML':
            input_table = parse_xml_based_format_to_identification_table(
                path2XML_file=filepath,
                path2fastaDB=path2fasta,
                is_idXML=False)
        elif fileformat == 'mzTab':
            input_table = parse_mzTab_to_identification_table(
                path2mzTab=filepath, path2fastaDB=path2fasta)
        else:
            # the only remaining supported format is 'csv'
            input_table = parse_text_table(filepath, path2fasta)
    except Exception as exp:
        raise ValueError(
            f"Loading the input table has caused to the following error: \n{str(exp)}\n"
        ) from exp
    # constructing the experiments
    try:
        self._exp = Experiment(self._proband, self._hLASet, self._tissue,
                               self._seqBD, input_table)
    except Exception as exp:
        raise RuntimeError(
            f"Generating an Experiment instance has caused to the following error: \n{str(exp)}\n"
        ) from exp
    self._cashed_results = dict()
def _collect_protein_expression(exp: Experiment, proteins: List[str],
                                prot2Ense: pd.DataFrame) -> List[float]:
    """Return one expression value per protein for a single experiment, -1 marking unmapped proteins."""
    profile = exp.get_tissue().get_expression_profile()
    expression_values: List[float] = []
    for prot in proteins:
        matches: pd.DataFrame = prot2Ense.loc[prot2Ense.iloc[:, 0] == prot]
        if matches.shape[0] == 1:  # exactly one gene matches the protein
            gene_id: str = matches['Gene-ID'].tolist()[0]
            try:
                expression_values.append(
                    profile.get_gene_id_expression(gene_id=gene_id))
            except KeyError:
                expression_values.append(-1)
            continue
        # zero or several genes: average the genes whose expression could be found
        found: List[float] = []
        for gene in matches.iloc[:, 1].tolist():
            try:
                value = profile.get_gene_id_expression(gene_id=gene)
            except KeyError:
                continue
            # a stored value of -1 is treated like a missing gene, as before
            if value != -1:
                found.append(value)
        expression_values.append(np.mean(found) if found else -1)
    return expression_values


def compute_expression_correlation(exp1: Experiment, exp2: Experiment) -> float:
    """compute the correlation in gene expression between two experiments.

    The union of the proteins of both experiments is mapped to gene ids, the expression
    of these genes is looked up in the expression profile of each experiment's tissue,
    proteins without an expression value in either experiment are dropped, and the
    Pearson correlation of the remaining pairs is computed with SciPy.

    :param exp1: The first experimental object
    :type exp1: Experiment
    :param exp2: The second experimental object
    :type exp2: Experiment
    :raises ValueError: if fewer than two proteins have an expression value in both experiments.
    :return: the correlation in gene expression of the proteins inferred in the provided pair of experiments
    :rtype: float
    """
    # build the union of the proteins of both experiments
    protein_exp1: Set[str] = set(exp1.get_proteins())
    protein_exp2: Set[str] = set(exp2.get_proteins())
    unique_proteins = list(protein_exp1.union(protein_exp2))
    # map the proteins to gene ids; the first column is compared against the protein ids
    prot2Ense: pd.DataFrame = map_from_uniprot_gene(unique_proteins)
    # one value per protein and experiment, in the same protein order
    paired_expression: pd.DataFrame = pd.DataFrame({
        'exp1': _collect_protein_expression(exp1, unique_proteins, prot2Ense),
        'exp2': _collect_protein_expression(exp2, unique_proteins, prot2Ense)
    })
    # keep only the proteins that were mapped in both experiments
    mapped_in_both = (paired_expression['exp1'] != -1) & (paired_expression['exp2'] != -1)
    paired_expression = paired_expression.loc[mapped_in_both]
    if paired_expression.shape[0] < 2:
        raise ValueError(
            "At least two proteins with an expression value in both experiments are needed to compute a correlation"
        )
    return pearsonr(paired_expression['exp1'], paired_expression['exp2'])[0]
def create_experiment(table_format, tissue_name, hla_alleles, n_clicks):
    """Build the global experiment from the uploaded files and return a text summary.

    Args:
        table_format: the format of the uploaded identification file; 'pepXML', 'idXML'
            and 'mzTab' use their dedicated parsers, any other value is parsed as a
            comma-separated text table.
        tissue_name: the name handed to the Tissue instance.
        hla_alleles: a ';'-separated string of allele names; surrounding whitespace and
            empty entries are dropped.
        n_clicks: the click counter of the triggering control; nothing happens unless it
            is a positive number.

    Returns:
        None when ``n_clicks`` is None or not positive, an error message (str) when any
        step fails, otherwise ``str(experiment)`` of the newly created experiment, which is
        also stored in the module-level ``experiment`` variable.
    """
    global experiment
    # Nothing to do until the control has been clicked; None is guarded explicitly
    # because comparing it with 0 raises a TypeError.
    if n_clicks is None or n_clicks <= 0:
        return None
    if PEPTIDE_TABLE_PATH is None:
        return "ERROR: The peptide identification file has not been uploaded"
    if FASTA_DATABASE_PATH is None:
        return "ERROR: The sequence database identification has not been uploaded"
    # try to load the peptide table
    try:
        if table_format == 'pepXML':
            table_pep: pd.DataFrame = inFunc.parse_xml_based_format_to_identification_table(
                path2XML_file=PEPTIDE_TABLE_PATH,
                path2fastaDB=FASTA_DATABASE_PATH,
                is_idXML=False)
        elif table_format == 'idXML':
            table_pep: pd.DataFrame = inFunc.parse_xml_based_format_to_identification_table(
                path2XML_file=PEPTIDE_TABLE_PATH,
                path2fastaDB=FASTA_DATABASE_PATH,
                is_idXML=True)
        elif table_format == 'mzTab':
            table_pep: pd.DataFrame = inFunc.parse_mzTab_to_identification_table(
                path2mzTab=PEPTIDE_TABLE_PATH,
                path2fastaDB=FASTA_DATABASE_PATH,
            )
        else:
            table_pep: pd.DataFrame = inFunc.parse_text_table(
                path2file=PEPTIDE_TABLE_PATH,
                path2fastaDB=FASTA_DATABASE_PATH,
                sep=',')
    except Exception as exp:
        return f'ERROR:: While parsing the identification table, the following error was encountered: {exp} '
    # create a proband
    proband: Proband = Proband(name='UI_PROBAND')
    # create the sequence database
    try:
        seqs: SeqDB = SeqDB(path2fasta=FASTA_DATABASE_PATH)
    except Exception as exp:
        return f'ERROR:: While creating the sequence database: the following error was encountered; {exp}'
    # create the OrgDB
    try:
        org_db: OrganismDB = OrganismDB(FASTA_DATABASE_PATH)
    except Exception as exp:
        # this message used to be a copy of the sequence-database one
        return f'ERROR:: While creating the organism database: the following error was encountered; {exp}'
    # create the expression profile, falling back to the online table when none was uploaded
    if GENE_EXPRESSION_TABLE is None:
        try:
            expresson_profile: GeneExpressionDB = GeneExpressionDB(
                path2data='https://www.proteinatlas.org/download/rna_tissue_consensus.tsv.zip',
                sep='\t')
        except Exception as exp:
            return f'While Downloading the online table the following error was encountered: {exp}'
    else:
        try:
            expresson_profile: GeneExpressionDB = GeneExpressionDB(
                path2data=GENE_EXPRESSION_TABLE, sep='\t')
        except Exception as exp:
            return f'While parsing the expression table, the following error was encountered: {exp}'
    # create the location table, falling back to the online table when none was uploaded
    if PROTEIN_LOC_TABLE is None:
        try:
            protein_locations: CellularLocationDB = CellularLocationDB(
                path2data='https://www.proteinatlas.org/download/subcellular_location.tsv.zip',
                sep='\t')
        except Exception as exp:
            return f'While Downloading the protein sub-cellular location table, the following error was encountered: {exp}'
    else:
        try:
            protein_locations: CellularLocationDB = CellularLocationDB(
                path2data=PROTEIN_LOC_TABLE, sep='\t')
        except Exception as exp:
            return f'While parsing the location table, the following error was encountered: {exp}'
    # create the tissue instance; failures are reported like every other step
    try:
        tissue: Tissue = Tissue(name=tissue_name,
                                main_exp_value=expresson_profile,
                                main_location=protein_locations)
    except Exception as exp:
        return f'ERROR:: While creating the tissue instance, the following error was encountered: {exp}'
    # create the hla_set, ignoring blank entries such as the one after a trailing ';'
    try:
        allele_names = [name.strip() for name in hla_alleles.split(';') if name.strip()]
        hlas: HLASet = HLASet(hlas=allele_names)
    except Exception as exp:
        return f'ERROR:: While creating the HLA set, the following error was encountered: {exp}'
    # create the experiment object
    try:
        experiment = Experiment(proband=proband,
                                hla_set=hlas,
                                tissue=tissue,
                                database=seqs,
                                ident_table=table_pep)
    except Exception as exp:
        return f'while creating an experimental object,the following error was encounter {exp}'
    # annotate the experiment
    try:
        experiment.annotate_proteins(org_db)
    except Exception as exp:
        return f'ERROR:: While annotating the proteins of the experiment, the following error was encountered: {exp}'
    # return the experiment output
    return str(experiment)