def get_geneid_to_localization(file_name, mapping_file, only_extracellular=False, filter_uncertain=False):
    """
    Map gene ids to the GO localization terms listed in a tab-separated file.

    The first line of file_name is treated as a header and skipped. On every
    other line the gene symbol is taken from the second column, the
    reliability from the third column and the ";"-separated GO terms from the
    last column. Each term is expected to look like
    "Plasma membrane (GO:0005886)"; only the id inside the parentheses is kept.

    Relevant extracellular term: Plasma membrane (GO:0005886)
    Some other terms: Vesicles (GO:0043231);Nuclear bodies (GO:0016604);
        Nucleoplasm (GO:0005654);Cytosol (GO:0005829);Nucleoli (GO:0005730);
        Mitochondria (GO:0005739);Cell Junctions (GO:0030054);
        Endoplasmic reticulum (GO:0005783);Centrosome (GO:0005813);
        Nucleoli fibrillar center (GO:0001650);Golgi apparatus (GO:0005794);
        Nuclear speckles (GO:0016607)
    Reliability types: Uncertain | Approved | Supported | Enhanced

    Args:
        file_name: path of the tab-separated localization file.
        mapping_file: path handed to parse_ncbi.get_geneid_symbol_mapping to
            obtain the symbol -> gene id lookup.
        only_extracellular: if True, keep only the GO:0005886 term.
        filter_uncertain: if True, skip lines whose reliability is "Uncertain".

    Returns:
        dict mapping gene id -> set of (GO id, reliability) tuples. Genes whose
        symbol is missing from the mapping are left out; their count is
        printed as "Unmatched: N".
    """
    # Only the symbol -> gene id direction is used here
    _, name_to_geneid = parse_ncbi.get_geneid_symbol_mapping(mapping_file)
    geneid_to_localization = {}
    ids_unmatched = set()
    # "with" guarantees the file is closed even if a malformed line raises
    with open(file_name) as f:
        next(f, None)  # skip the header line (no error on an empty file)
        for line in f:
            # Strip both "\n" and "\r\n" so the closing ")" really is the last
            # character of the last column
            line = line.rstrip("\r\n")
            if not line:
                # Blank lines (e.g. at the end of the file) carry no record
                continue
            words = line.split("\t")
            # Evidence and reliability are for experimental characterization
            gene, reliability, go_terms = words[1], words[2], words[-1]
            if filter_uncertain and reliability == "Uncertain":
                continue
            if gene not in name_to_geneid:
                ids_unmatched.add(gene)
                continue
            geneid = name_to_geneid[gene]
            for go in go_terms.split(";"):
                idx = go.find("(")
                if idx == -1:
                    # No "(GO:...)" part: there is no id to extract, and
                    # slicing with idx == -1 would just chop the last character
                    continue
                go = go[idx + 1:-1]
                if only_extracellular and go != "GO:0005886":
                    continue
                geneid_to_localization.setdefault(geneid, set()).add((go, reliability))
    # Single formatted argument: same output under Python 2 and Python 3
    print("Unmatched: %d" % len(ids_unmatched))
    return geneid_to_localization
def get_geneid_symbol_mapping(mapping_file):
    """
    Thin wrapper around parse_ncbi.get_geneid_symbol_mapping.

    mapping_file is expected to be the id mapping file, i.e.
    %(data_dir)s/ncbi/geneid_to_symbol.txt

    Returns the (geneid_to_name, name_to_geneid) pair produced by parse_ncbi.
    """
    mapping = parse_ncbi.get_geneid_symbol_mapping(mapping_file)
    # Unpack explicitly so anything other than a pair is rejected here
    id_to_symbol, symbol_to_id = mapping
    return id_to_symbol, symbol_to_id
def create_network_file(file_name, mapping_file, out_file, tissue=None):
    """
    Write an edge list of gene id pairs built from a tab-separated file.

    The columns "evidence type", "symbol1" and "symbol2" (plus the column
    named by tissue, when given) are read through TsvReader. Rows are grouped
    by evidence type; the "pred" group is skipped entirely. A pair is kept
    only when both symbols are present in the symbol -> gene id mapping, and
    each distinct (geneid1, geneid2) pair is written once as
    "<geneid1> 1 <geneid2>".

    Args:
        file_name: path of the tab-separated interaction file.
        mapping_file: path handed to parse_ncbi.get_geneid_symbol_mapping.
        out_file: path of the edge list to write (overwritten).
        tissue: optional column name; rows whose value in that column is "-"
            are skipped (presumably "not detected in this tissue" - confirm
            against the data source).

    Returns:
        None
    """
    # Only the symbol -> gene id direction is used here
    _, name_to_geneid = parse_ncbi.get_geneid_symbol_mapping(mapping_file)
    parser = TsvReader.TsvReader(file_name, delim="\t")
    fields_to_include = ["evidence type", "symbol1", "symbol2"]
    if tissue is not None:
        fields_to_include += [tissue]
    _, id_to_values = parser.read(fields_to_include=fields_to_include)
    edges = set()
    # items() works on both Python 2 and Python 3 (iteritems() is 2-only)
    for evidence, values in id_to_values.items():
        if evidence == "pred":
            continue
        for row in values:
            # Rows carry a third value only when a tissue column was requested
            if tissue is not None:
                gene1, gene2, abundance = row
                #if "P" not in set(abundance):
                if abundance == "-":
                    continue
            else:
                gene1, gene2 = row
            if gene1 in name_to_geneid and gene2 in name_to_geneid:
                edges.add((name_to_geneid[gene1], name_to_geneid[gene2]))
    # "with" closes (and flushes) the output even if a write fails
    with open(out_file, 'w') as f:
        for u, v in edges:
            f.write("%s 1 %s\n" % (u, v))
    return
def create_network_file(file_name, mapping_file, out_file, tissue=None):
    """
    Convert a tab-separated interaction file into a gene id edge list.

    Reads the "evidence type", "symbol1" and "symbol2" columns (and the
    column named by tissue, if one is given) with TsvReader. Rows grouped
    under the evidence type "pred" are ignored. Symbol pairs are translated
    with the symbol -> gene id mapping; pairs with an unmapped symbol are
    dropped. Every distinct pair is written to out_file as
    "<geneid1> 1 <geneid2>", one per line.

    Args:
        file_name: path of the tab-separated interaction file.
        mapping_file: path handed to parse_ncbi.get_geneid_symbol_mapping.
        out_file: path of the edge list to write (overwritten).
        tissue: optional column name; a row is skipped when its value in
            that column is "-" (presumably meaning no expression in the
            tissue - confirm against the data source).

    Returns:
        None
    """
    # The gene id -> symbol direction is not needed
    _, name_to_geneid = parse_ncbi.get_geneid_symbol_mapping(mapping_file)
    parser = TsvReader.TsvReader(file_name, delim="\t")
    fields_to_include = ["evidence type", "symbol1", "symbol2"]
    if tissue is not None:
        fields_to_include += [tissue]
    _, id_to_values = parser.read(fields_to_include=fields_to_include)
    edges = set()
    # items() instead of the Python-2-only iteritems()
    for evidence, values in id_to_values.items():
        if evidence == "pred":
            continue
        for row in values:
            if tissue is not None:
                # With a tissue column each row has exactly three values
                gene1, gene2, abundance = row
                #if "P" not in set(abundance):
                if abundance == "-":
                    continue
            else:
                gene1, gene2 = row
            if gene1 in name_to_geneid and gene2 in name_to_geneid:
                edges.add((name_to_geneid[gene1], name_to_geneid[gene2]))
    # Context manager so the file is closed even when writing raises
    with open(out_file, 'w') as f:
        for u, v in edges:
            f.write("%s 1 %s\n" % (u, v))
    return
def main():
    """
    Run get_gsdc_info on the GDSC target and response files.

    Loads the symbol -> gene id mapping from the NCBI geneid_to_symbol.txt
    file and passes it, together with the gdsc_en_output_w5.csv (target) and
    gdsc_manova_output_w5.csv (response) paths, to get_gsdc_info. All paths
    are hard-coded relative to the GDSC data directory.
    """
    gdsc_dir = "/home/emre/arastirma/data/drug/sensitivity/gdsc/"
    # NOTE: the compound concentration step (get_compounds on
    # gdsc_compounds_conc_w5.csv) is currently disabled.
    symbol_file = gdsc_dir + "../../../proteome/ncbi/geneid_to_symbol.txt"
    mapping = parse_ncbi.get_geneid_symbol_mapping(symbol_file)
    # Only the symbol -> gene id direction is passed on
    id_to_symbols, symbol_to_id = mapping
    target_path = gdsc_dir + "gdsc_en_output_w5.csv"
    response_path = gdsc_dir + "gdsc_manova_output_w5.csv"
    get_gsdc_info(target_path, response_path, symbol_to_id)
    return
def main():
    """
    Print the localization entries of a few example genes.

    First loads the localizations from hpa_subcellular_location.tsv and
    prints the entries for gene ids 8648 (NCOA1) and 207 (AKT1), then loads
    the predicted membrane / secreted classes and prints the entry for gene
    id 5925 (RB1). All paths are hard-coded.
    """
    ncbi_mapping_path = "/home/emre/data/ncbi/geneid_to_symbol.txt"
    # The mapping is loaded here but not used further in this function
    id_to_symbol, symbol_to_id = parse_ncbi.get_geneid_symbol_mapping(ncbi_mapping_path)

    subcellular_path = "/home/emre/data/tissue/hpa_subcellular_location.tsv"
    # Pass only_extracellular=True to restrict to the plasma membrane term
    observed = get_geneid_to_localization(subcellular_path, ncbi_mapping_path)
    for example_id in ("8648", "207"):  # NCOA1, AKT1
        print(observed[example_id])

    membrane_path = "/home/emre/data/tissue/hpa_predicted_membrane.tsv"
    secreted_path = "/home/emre/data/tissue/hpa_predicted_secreted.tsv"
    predicted = get_geneid_to_predicted_localization(
        membrane_path, secreted_path, ncbi_mapping_path)
    print(predicted["5925"])  # RB1
    return
def _add_predicted_class(file_name, wanted, name_to_geneid, geneid_to_localization, ids_unmatched):
    """Add (wanted, reliability) for every mapped gene in file_name whose protein class list contains wanted."""
    # "with" guarantees the file is closed even if a malformed line raises
    with open(file_name) as f:
        next(f, None)  # skip the header line (no error on an empty file)
        for line in f:
            line = line.rstrip("\r\n")  # tolerate Windows line endings
            if not line:
                # Blank lines (e.g. at the end of the file) carry no record
                continue
            words = line.split("\t")
            # Evidence and reliability are for experimental characterization
            gene, protein_class, reliability = words[0], words[6], words[11]
            if gene not in name_to_geneid:
                ids_unmatched.add(gene)
                continue
            geneid = name_to_geneid[gene]
            for localization in protein_class.split(", "):
                # "Predicted membrane proteins" -> "membrane", etc.
                localization = localization.replace("Predicted ", "").replace(" proteins", "").replace(" genes", "")
                if localization == wanted:
                    geneid_to_localization.setdefault(geneid, set()).add((localization, reliability))


def get_geneid_to_predicted_localization(file_name_membrane, file_name_secreted, mapping_file):
    """
    Map gene ids to their predicted "membrane" / "secreted" protein classes.

    Relevant protein classes: Predicted secreted proteins | Predicted membrane proteins
    Note that evidence and reliability are for experimental characterization

    Both input files are tab-separated with one header line. The gene symbol
    is read from the first column, the ", "-separated protein classes from
    the seventh column and the reliability from the twelfth column. Only the
    "membrane" class is taken from file_name_membrane and only the "secreted"
    class from file_name_secreted. No filtering on reliability is applied.

    Args:
        file_name_membrane: path of the predicted membrane proteins file.
        file_name_secreted: path of the predicted secreted proteins file.
        mapping_file: path handed to parse_ncbi.get_geneid_symbol_mapping to
            obtain the symbol -> gene id lookup.

    Returns:
        dict mapping gene id -> set of (localization, reliability) tuples,
        where localization is "membrane" or "secreted". Genes whose symbol is
        missing from the mapping are left out; the number of distinct
        unmatched symbols over both files is printed as "Unmatched: N".
    """
    # Only the symbol -> gene id direction is used here
    _, name_to_geneid = parse_ncbi.get_geneid_symbol_mapping(mapping_file)
    geneid_to_localization = {}
    ids_unmatched = set()
    _add_predicted_class(file_name_membrane, "membrane", name_to_geneid,
                         geneid_to_localization, ids_unmatched)
    _add_predicted_class(file_name_secreted, "secreted", name_to_geneid,
                         geneid_to_localization, ids_unmatched)
    # Single formatted argument: same output under Python 2 and Python 3
    print("Unmatched: %d" % len(ids_unmatched))
    return geneid_to_localization
def get_geneid_symbol_mapping(mapping_file):
    """
    Load the gene id / symbol lookups for mapping_file via parse_ncbi.

    Returns the (geneid_to_names, name_to_geneid) pair exactly as produced by
    parse_ncbi.get_geneid_symbol_mapping.
    """
    id_to_symbols, symbol_to_id = parse_ncbi.get_geneid_symbol_mapping(mapping_file)
    return (id_to_symbols, symbol_to_id)
def get_geneid_symbol_mapping(mapping_file):
    """
    Delegate to parse_ncbi.get_geneid_symbol_mapping for mapping_file.

    The id mapping file is expected at
    %(data_dir)s/ncbi/geneid_to_symbol.txt

    Returns the pair (geneid_to_name, name_to_geneid).
    """
    load_mapping = parse_ncbi.get_geneid_symbol_mapping
    # Unpacking keeps the "exactly two results" check of the original
    forward, reverse = load_mapping(mapping_file)
    return forward, reverse
def get_geneid_symbol_mapping(mapping_file):
    """
    Return the (geneid_to_names, name_to_geneid) pair for mapping_file.

    Both lookups come straight from parse_ncbi.get_geneid_symbol_mapping.
    """
    lookups = parse_ncbi.get_geneid_symbol_mapping(mapping_file)
    # Unpack so that a result which is not a pair fails here, as before
    by_geneid, by_name = lookups
    return by_geneid, by_name