def get_geneids_from_affy(affy_id_list, affy_file=None):
    """
    Returns a dictionary mapping affy probe ids to the tuple
    (genebank, unigene, symbol), parsed from a csv file from affymetrix
    with the appropriate information.

    @param affy_id_list: A list of strings like '1000_at'...
        NOTE(review): this argument is currently never consulted; every
        probe found in the csv file is returned, not only the ones listed
        here. Confirm against callers before adding any filtering.
    @param affy_file: If None, then the function get_affy_key_file() will be
        called to get the full file name and path to the csv file, else
        specify the filename/path.
    @return: A dictionary { affy_id : (genebank, unigene, symbol) } where
        affy_id is field 0 of a data row and the tuple holds fields 8, 10
        and 14 of that row, as produced by ifr.smart_split.
    @note: Lines starting with '#' and blank lines are ignored. The first
        remaining line is treated as the column headers and skipped.
    """
    if affy_file is None:
        affy_file = get_affy_key_file()

    # Keep only real content: '#' lines are header/comment lines, and blank
    # lines (for example a trailing newline at the end of the file) carry no
    # record and would otherwise fail when indexed below.
    with open(affy_file, "r") as f:
        lines = [ln for ln in f if ln.strip() and not ln.startswith("#")]

    records = lines[1:]  # lines[0] is the column headers
    total = len(records)  # progress total must match the rows actually looped over

    affy_dict = {}
    for i, ln in enumerate(records):
        ifr.print_progress(i, total)
        tmp = ifr.smart_split(ln, sep=",")
        # field 0: probe id; fields 8, 10, 14: genebank, unigene, symbol
        affy_dict[tmp[0]] = (tmp[8], tmp[10], tmp[14])
    return affy_dict
def gen_affy_to_geneId_dict(affy_file_subdir="HG_U95A.na33.annot", affy_fn="HG_U95A.na33.annot.csv"):
    """
    Converts a list of Affymetrix probe set ids into a genelist with names
    suitable for querying gather or kegg. Generates a dictionary with
    entries { affy_id : gene_id_list }. Most times, gene_id_list will have
    only a single entry, but several probes have multiple Gene IDs given.

    @param affy_file_subdir: The subdirectory of the ifr.DATA_DIR that has
        the HG_U95A.na33.annot.csv file.
    @param affy_fn: The csv file in the subdirectory with the data. The
        parameter is provided in case the file was renamed from the original
        name of "HG_U95A.na33.annot.csv"
    @return: A dictionary { affy_id : gene_id_list }. gene_id_list is empty
        when the gene symbol field is "---", and otherwise holds the distinct
        symbols in the order they appear in the file.
    @note: Relies on a data file called HG_U95A.na33.annot.csv that must be
        present in the HG_U95A.na33.annot subdirectory of the linked Data
        directory. It would be most efficient to use this function once and
        save the resulting dictionary in a pickle file for later use instead
        of having to re-parse the data.
    @note: Lines starting with '#' and blank lines are ignored. The first
        remaining line is treated as the column headers and skipped.
    """
    affy_file = os.path.join(ifr.DATA_DIR, affy_file_subdir, affy_fn)

    # Keep only real content: '#' lines are header/comment lines, and blank
    # lines (for example a trailing newline at the end of the file) carry no
    # record and would otherwise fail when indexed below.
    with open(affy_file, "r") as f:
        lines = [ln for ln in f if ln.strip() and not ln.startswith("#")]

    records = lines[1:]  # lines[0] is the column headers
    total = len(records)  # progress total must match the rows actually looped over

    affy_dict = {}
    for i, ln in enumerate(records):
        ifr.print_progress(i, total)
        tmp = ifr.smart_split(ln, sep=",")
        key = tmp[0]
        val = tmp[14]  # gene symbol field
        if val == "---":
            # this affy id has no gene symbol
            genelist = []
        elif "///" in val:
            # more than one gene id for this probe, separated by '///'
            parts = (x.strip() for x in val.split("///"))
            genelist = [p for p in parts if p]
        else:
            genelist = [val]
        # Remove duplicates while keeping file order, so the result is the
        # same from run to run (a set would yield an arbitrary order).
        affy_dict[key] = list(dict.fromkeys(genelist))
    return affy_dict