示例#1
0
def get_geneid_to_localization(file_name, mapping_file, only_extracellular=False, filter_uncertain=False):
    """
    Map gene ids to the localization terms listed in a tab-separated file.

    The first line of file_name is skipped as a header. On every other line
    the second column is read as the gene symbol, the third column as the
    reliability and the last column as a ";"-separated list of terms written
    like "Plasma membrane (GO:0005886)".

    Relevant extracellular term: Plasma membrane (GO:0005886)
    Some other terms: Vesicles (GO:0043231);Nuclear bodies (GO:0016604);Nucleoplasm (GO:0005654);Cytosol (GO:0005829);Nucleoli (GO:0005730);Mitochondria (GO:0005739);Cell Junctions (GO:0030054);Endoplasmic reticulum (GO:0005783);Centrosome (GO:0005813);Nucleoli fibrillar center (GO:0001650);Golgi apparatus (GO:0005794);Nuclear speckles (GO:0016607)
    Reliability types: Uncertain | Approved | Supported | Enhanced

    Args:
        file_name: path of the tab-separated localization file.
        mapping_file: path handed to parse_ncbi.get_geneid_symbol_mapping.
        only_extracellular: when true, keep only the term GO:0005886.
        filter_uncertain: when true, skip rows whose reliability is "Uncertain".

    Returns:
        dict from gene id to a set of (GO id, reliability) tuples. Genes whose
        symbol is not in the mapping are left out; their count is printed.
    """
    _, name_to_geneid = parse_ncbi.get_geneid_symbol_mapping(mapping_file)
    geneid_to_localization = {}
    ids_unmatched = set()
    # The context manager closes the file even if a malformed row raises.
    with open(file_name) as f:
        f.readline()  # skip the header line
        for line in f:
            words = line.strip("\n").split("\t")
            # Evidence and reliability are for experimental characterization
            gene, reliability, go_terms = words[1], words[2], words[-1]
            if filter_uncertain and reliability == "Uncertain":
                continue
            if gene not in name_to_geneid:
                ids_unmatched.add(gene)
                continue
            geneid = name_to_geneid[gene]
            for go in go_terms.split(";"):
                # Keep the text between the first "(" and the final character,
                # i.e. "GO:0005886" out of "Plasma membrane (GO:0005886)".
                go = go[go.find("(") + 1:-1]
                if only_extracellular and go != "GO:0005886":
                    continue
                geneid_to_localization.setdefault(geneid, set()).add((go, reliability))
    # Single pre-formatted argument prints the same text on Python 2 and 3.
    print("Unmatched: %d" % len(ids_unmatched))
    return geneid_to_localization
示例#2
0
def get_geneid_symbol_mapping(mapping_file):
    """
    Return the two mappings produced by parse_ncbi for mapping_file.

    mapping_file is expected to be the id mapping file, e.g.
    id_mapping_file = %(data_dir)s/ncbi/geneid_to_symbol.txt

    The result is the tuple (geneid_to_name, name_to_geneid).
    """
    mappings = parse_ncbi.get_geneid_symbol_mapping(mapping_file)
    # Unpack into exactly two values so the returned object is always a 2-tuple.
    id_to_symbol, symbol_to_id = mappings
    return (id_to_symbol, symbol_to_id)
示例#3
0
def create_network_file(file_name, mapping_file, out_file, tissue=None):
    """
    Write the gene-id edges of an interaction table to out_file.

    The "evidence type", "symbol1" and "symbol2" columns (plus the tissue
    column when tissue is given) are read from file_name through TsvReader.
    Entries keyed by "pred" are skipped. When tissue is given, rows whose
    tissue value is "-" are skipped as well. A row becomes an edge only when
    both symbols are present in the symbol-to-geneid mapping.

    Args:
        file_name: path of the tab-separated interaction file.
        mapping_file: path handed to parse_ncbi.get_geneid_symbol_mapping.
        out_file: path of the file to write, one "<id1> 1 <id2>" line per edge.
        tissue: optional name of a column to read alongside the symbols.

    Returns:
        None
    """
    _, name_to_geneid = parse_ncbi.get_geneid_symbol_mapping(
        mapping_file)
    parser = TsvReader.TsvReader(file_name, delim="\t")
    fields_to_include = ["evidence type", "symbol1", "symbol2"]
    if tissue is not None:
        fields_to_include += [tissue]
    _, id_to_values = parser.read(
        fields_to_include=fields_to_include)
    edges = set()
    # NOTE(review): .items() assumes id_to_values is a plain dict - confirm
    # against TsvReader.read; it runs on both Python 2 and Python 3.
    for evidence, values in id_to_values.items():
        if evidence == "pred":
            continue
        for row in values:
            if tissue is not None:
                gene1, gene2, abundance = row
                #if "P" not in set(abundance):
                if abundance == "-":
                    continue
            else:
                gene1, gene2 = row
            if gene1 in name_to_geneid and gene2 in name_to_geneid:
                edges.add((name_to_geneid[gene1], name_to_geneid[gene2]))
    # The context manager closes the output file even if a write fails.
    with open(out_file, 'w') as f:
        for u, v in edges:
            f.write("%s 1 %s\n" % (u, v))
    return
示例#4
0
def create_network_file(file_name, mapping_file, out_file, tissue=None):
    """
    Convert a symbol-based interaction table into a gene-id edge list file.

    TsvReader reads the "evidence type", "symbol1" and "symbol2" columns of
    file_name, and additionally the column named by tissue when one is given.
    Entries keyed by "pred" are ignored; with a tissue, rows whose tissue
    value is "-" are ignored too. Only pairs where both symbols are found in
    the symbol-to-geneid mapping are kept.

    Args:
        file_name: path of the tab-separated interaction file.
        mapping_file: path handed to parse_ncbi.get_geneid_symbol_mapping.
        out_file: destination path; each edge is written as "<id1> 1 <id2>".
        tissue: optional column name used to filter rows.

    Returns:
        None
    """
    _, name_to_geneid = parse_ncbi.get_geneid_symbol_mapping(mapping_file)
    parser = TsvReader.TsvReader(file_name, delim="\t")
    fields_to_include = ["evidence type", "symbol1", "symbol2"]
    use_tissue = tissue is not None
    if use_tissue:
        fields_to_include += [tissue]
    _, id_to_values = parser.read(fields_to_include=fields_to_include)
    edges = set()
    # NOTE(review): .items() assumes id_to_values is a plain dict - confirm
    # against TsvReader.read; it runs on both Python 2 and Python 3.
    for evidence, values in id_to_values.items():
        if evidence == "pred":
            continue
        for row in values:
            if use_tissue:
                gene1, gene2, abundance = row
                #if "P" not in set(abundance):
                if abundance == "-":
                    continue
            else:
                gene1, gene2 = row
            if gene1 not in name_to_geneid or gene2 not in name_to_geneid:
                continue
            edges.add((name_to_geneid[gene1], name_to_geneid[gene2]))
    # Using "with" guarantees the output file is closed if a write raises.
    with open(out_file, 'w') as f:
        for u, v in edges:
            f.write("%s 1 %s\n" % (u, v))
    return
示例#5
0
def main():
    """
    Load the gene id / symbol mapping and pass the GDSC target and response
    files, together with the symbol-to-geneid mapping, to get_gsdc_info.
    """
    gdsc_dir = "/home/emre/arastirma/data/drug/sensitivity/gdsc/"
    # Disabled compound concentration lookup, kept for reference:
    #file_name = base_dir + "gdsc_compounds_conc_w5.csv"
    #compound_to_concentrations = get_compounds(file_name)
    #print len(compound_to_concentrations), compound_to_concentrations["GNF-2"]
    symbol_file = gdsc_dir + "../../../proteome/ncbi/geneid_to_symbol.txt"
    mappings = parse_ncbi.get_geneid_symbol_mapping(symbol_file)
    geneid_to_names, name_to_geneid = mappings
    target_path = gdsc_dir + "gdsc_en_output_w5.csv"
    response_path = gdsc_dir + "gdsc_manova_output_w5.csv"
    get_gsdc_info(target_path, response_path, name_to_geneid)
示例#6
0
def main():
    """
    Print the localization entries of a few example genes.

    First the entries read from the subcellular location file are printed for
    gene ids 8648 and 207, then the entries read from the predicted membrane
    and predicted secreted files are printed for gene id 5925.

    Raises:
        KeyError: if one of the example gene ids is absent from the result.
    """
    mapping_file = "/home/emre/data/ncbi/geneid_to_symbol.txt"
    # The mapping is not parsed here: both functions called below parse
    # mapping_file themselves, and this function never used the result.
    file_name = "/home/emre/data/tissue/hpa_subcellular_location.tsv"
    geneid_to_localization = get_geneid_to_localization(file_name, mapping_file) # , only_extracellular=True
    # print(x) with a single argument behaves the same on Python 2 and 3.
    print(geneid_to_localization["8648"]) # NCOA1
    print(geneid_to_localization["207"]) # AKT1
    file_name_membrane = "/home/emre/data/tissue/hpa_predicted_membrane.tsv"
    file_name_secreted = "/home/emre/data/tissue/hpa_predicted_secreted.tsv"
    geneid_to_localization = get_geneid_to_predicted_localization(file_name_membrane, file_name_secreted, mapping_file)
    print(geneid_to_localization["5925"]) # RB1
    return
示例#7
0
def _add_predicted_localizations(file_name, target, name_to_geneid, geneid_to_localization, ids_unmatched):
    """
    Add (target, reliability) to geneid_to_localization for every row of
    file_name whose protein class list contains target after normalization.

    Gene symbols missing from name_to_geneid are collected in ids_unmatched.
    Both geneid_to_localization and ids_unmatched are modified in place.
    """
    # The context manager closes the file even if a malformed row raises.
    with open(file_name) as f:
        f.readline()  # skip the header line
        for line in f:
            words = line.strip("\n").split("\t")
            # Evidence and reliability are for experimental characterization
            gene, protein_class, reliability = words[0], words[6], words[11]
            if gene not in name_to_geneid:
                ids_unmatched.add(gene)
                continue
            geneid = name_to_geneid[gene]
            for localization in protein_class.split(", "):
                # "Predicted membrane proteins" -> "membrane"
                localization = localization.replace("Predicted ", "").replace(" proteins", "").replace(" genes", "")
                if localization == target:
                    geneid_to_localization.setdefault(geneid, set()).add((localization, reliability))


def get_geneid_to_predicted_localization(file_name_membrane, file_name_secreted, mapping_file):
    """
    Map gene ids to their predicted "membrane" / "secreted" localizations.

    Relevant protein classes: Predicted secreted proteins | Predicted membrane proteins
    Note that evidence and reliability are for experimental characterization

    Both input files are tab-separated with one header line. The first column
    is read as the gene symbol, the seventh as a ", "-separated list of
    protein classes and the twelfth as the reliability.

    Args:
        file_name_membrane: file scanned for the "membrane" class.
        file_name_secreted: file scanned for the "secreted" class.
        mapping_file: path handed to parse_ncbi.get_geneid_symbol_mapping.

    Returns:
        dict from gene id to a set of (localization, reliability) tuples.
        Genes whose symbol is not in the mapping are left out; the number of
        distinct unmatched symbols across both files is printed.
    """
    _, name_to_geneid = parse_ncbi.get_geneid_symbol_mapping(mapping_file)
    geneid_to_localization = {}
    ids_unmatched = set()
    #if reliability not in ("Approved", "Supported", "Enhanced"):
    #    continue
    _add_predicted_localizations(file_name_membrane, "membrane", name_to_geneid, geneid_to_localization, ids_unmatched)
    _add_predicted_localizations(file_name_secreted, "secreted", name_to_geneid, geneid_to_localization, ids_unmatched)
    # Single pre-formatted argument prints the same text on Python 2 and 3.
    print("Unmatched: %d" % len(ids_unmatched))
    return geneid_to_localization
示例#8
0
def get_geneid_symbol_mapping(mapping_file):
    """
    Return the (geneid_to_names, name_to_geneid) pair that
    parse_ncbi.get_geneid_symbol_mapping produces for mapping_file.
    """
    parsed = parse_ncbi.get_geneid_symbol_mapping(mapping_file)
    # Unpack into exactly two values so the returned object is always a 2-tuple.
    id_to_symbols, symbol_to_id = parsed
    return (id_to_symbols, symbol_to_id)
示例#9
0
def get_geneid_symbol_mapping(mapping_file):
    """
    Delegate to parse_ncbi.get_geneid_symbol_mapping and return its two
    mappings as the tuple (geneid_to_name, name_to_geneid).

    Typical mapping file:
    id_mapping_file = %(data_dir)s/ncbi/geneid_to_symbol.txt
    """
    result = parse_ncbi.get_geneid_symbol_mapping(mapping_file)
    # Unpack into exactly two values so the returned object is always a 2-tuple.
    by_geneid, by_name = result
    return (by_geneid, by_name)
示例#10
0
def get_geneid_symbol_mapping(mapping_file):
    """
    Load the gene id / symbol mappings for mapping_file through parse_ncbi
    and return them as the tuple (geneid_to_names, name_to_geneid).
    """
    loaded = parse_ncbi.get_geneid_symbol_mapping(mapping_file)
    # Unpack into exactly two values so the returned object is always a 2-tuple.
    ids_to_names, names_to_ids = loaded
    return (ids_to_names, names_to_ids)