def compute_annotations_per_iteration(self, depth=5, saveas=None, bf_thresh=0.0,
                                   num_iterations=10, homologs=False):
     '''
     Computes the GO annotation data at a given level (depth) for each of the
     first num_iterations gene lists in self.removed. No plotting is done here;
     the results are stored on this object (and optionally pickled) for later use.
     @param depth: The GO ontology depth to use.
     @param saveas: Provide a filename to save the GO annotation results. This will
     be a python pickle file. Specify None to avoid saving the results to a file.
     @param bf_thresh: The bayes factor threshold. An annotation must have at least this
     bayes factor before being counted in the results.
     @param num_iterations: The number of iterations to process. If self.removed holds
     fewer entries than this, all of them are processed.
     @param homologs: Select whether to include gene homologs in the analysis.
     @return: None. Side effects: self.annotation_dat is set to a list of tuples
     (iteration_index, data, genelist), where data is whatever
     self._gen_bar_chart_data returned for that gene list, and self.depth is set
     to the depth argument.
     '''
     iters = self.removed[:num_iterations]

     res = []

     # A parenthesised single-argument print emits the same text whether it is
     # parsed as a print statement or as a call to the print function.
     print("Starting analysis...")

     for i, genelist in enumerate(iters):
         ifr.print_progress(i, len(iters))
         bcdat = self._gen_bar_chart_data(genelist, depth, bf_thresh=bf_thresh, homologs=homologs)
         res.append((i, bcdat, genelist))
     print("")  # blank line after the progress output

     if saveas is not None:
         # The context manager guarantees the pickle file is flushed and closed,
         # even if dumping fails part way through.
         with open(saveas, "wb") as f:
             cPickle.dump(res, f)
         print("Results saved to file: %s" % saveas)

     self.annotation_dat = res
     self.depth = depth  # for convenience, remember the depth used to generate the analysis
def get_geneids_from_affy(affy_id_list, affy_file=None):
    """
    Parses an affymetrix annotation csv file and returns a dictionary
    mapping each affy probe id found in the file to the tuple
    (genebank, unigene, symbol).

    Lines starting with '#' and blank lines are ignored. The first remaining
    line is treated as the column headers and skipped. For every other line,
    column 0 is used as the probe id and columns 8, 10 and 14 as genebank,
    unigene and symbol respectively.

    @param affy_id_list: A list of strings like '1000_at'...
    NOTE: this argument is currently not consulted; the returned dictionary
    contains every probe id present in the file.
    @param affy_file: If none, then the function get_affy_key_file()
    will be called to get the full file name and path to the csv file,
    else specify the filename/path.
    @return: dict { affy_id : (genebank, unigene, symbol) }
    """
    if affy_file is None:
        affy_file = get_affy_key_file()

    lines = []
    with open(affy_file, "r") as f:
        for tmpline in f:
            # Omit comment lines. Blank lines are dropped as well: they cannot
            # supply the columns indexed below and must not be mistaken for
            # the header row.
            if tmpline.startswith("#") or not tmpline.strip():
                continue
            lines.append(tmpline)

    data_rows = lines[1:]  # lines[0] is the column headers
    num_rows = len(data_rows)  # progress total counts data rows only

    affy_dict = {}
    for i, ln in enumerate(data_rows):
        ifr.print_progress(i, num_rows)
        tmp = ifr.smart_split(ln, sep=",")
        key = tmp[0]
        genebank = tmp[8]
        unigene = tmp[10]
        symbol = tmp[14]
        affy_dict[key] = (genebank, unigene, symbol)

    return affy_dict
def all_IFR_pairs_classification(ifr, numIter=20):
    '''
    Computes the classification accuracy of each of the first numIter IFR
    iterations on its own, and of every pair (i, j), i < j, of those
    iterations combined.

    @param ifr: Object providing load_flu_mat(), load_H1N1_mat() and
    get_removed_features(). The first two elements of each load_* result are
    passed to pathway_classification as (D, L) and (D2, L2); presumably these
    are the training and test data/labels -- confirm against
    pathway_classification.
    @param numIter: The number of leading IFR iterations to evaluate. The
    result of get_removed_features() is indexed from 0 to numIter-1.
    @return: A tuple (res, test_acc_list). test_acc_list[x] is the first
    element of the pathway_classification result for iteration x alone. res is
    a list of tuples
    (combined_accuracy, "IFR i + IFR j", accuracy_i, accuracy_j, max(accuracy_i, accuracy_j))
    sorted in descending order.
    '''
    acc = {}  # the classifier accuracy combining iteration i with j
    (D, L, _, _) = ifr.load_flu_mat()
    (D2, L2, _, _) = ifr.load_H1N1_mat()

    removed = ifr.get_removed_features()

    # compute L2 SVM test accuracies for all iterations up to numIter
    print("Computing L2 SVM test accuracies for each IFR iteration.")
    test_acc_list = []
    for x in range(numIter):
        glx = removed[x]
        tmp = pathway_classification(glx, (D, L), (D2, L2))
        test_acc_list.append(tmp[0])

    print("Computing L2 SVM test accuracies for all pairs of IFR iterations.")
    cur = 0
    # Number of unordered pairs, n*(n-1)/2. Multiply before dividing: the
    # product of two consecutive integers is always even, so floor division is
    # exact, whereas halving numIter first loses a half for odd numIter.
    total = numIter * (numIter - 1) // 2
    for i in range(numIter):
        for j in range((i + 1), numIter):
            ifr.print_progress(cur, total)
            cur += 1

            # combine the features removed at the two iterations
            gl = removed[i] + removed[j]

            a1 = test_acc_list[i]
            a2 = test_acc_list[j]
            max_acc = max(a1, a2)

            # compute combined accuracy
            rc = pathway_classification(gl, (D, L), (D2, L2))

            # store the results
            acc[(i, j)] = (rc[0], "IFR %d + IFR %d" % (i, j), a1, a2, max_acc)

    # tuples compare element-wise, so this orders by combined accuracy first
    res = sorted(acc.values(), reverse=True)

    return res, test_acc_list
def gen_affy_to_geneId_dict(affy_file_subdir="HG_U95A.na33.annot", affy_fn="HG_U95A.na33.annot.csv"):
    """
    Parses the affymetrix annotation csv file and generates a dictionary
    with entries { affy_id : gene_id_list }, giving names suitable for
    querying gather or kegg. Most times, gene_id_list will have only a single
    entry, but several probes have multiple Gene IDs given. A probe whose
    gene symbol field is "---" maps to an empty list.

    Lines starting with '#' and blank lines are ignored. The first remaining
    line is treated as the column headers and skipped. Column 0 of each data
    line is the affy id and column 14 the gene symbol field; multiple symbols
    in that field are separated by "///".

    @param affy_file_subdir: The subdirectory of the ifr.DATA_DIR that has
    the HG_U95A.na33.annot.csv file.
    @param affy_fn: The csv file in the subdirectory with the data. The parameter is
    provided in case the file was renamed from the original name of "HG_U95A.na33.annot.csv"
    @return: dict { affy_id : list of unique gene ids, in order of first appearance }
    @note: Relies on a data file called HG_U95A.na33.annot.csv that must be present
    in the HG_U95A.na33.annot subdirectory of the linked Data directory. It would
    be most efficient to use this function once and save the resulting dictionary
    in a pickle file for later use instead of having to re-parse the data.
    """
    affy_dict = {}

    affy_file = os.path.join(ifr.DATA_DIR, affy_file_subdir, affy_fn)
    lines = []
    with open(affy_file, "r") as f:
        for tmpline in f:
            # Omit comment lines. Blank lines are dropped as well: they cannot
            # supply the columns indexed below and must not be mistaken for
            # the header row.
            if tmpline.startswith("#") or not tmpline.strip():
                continue
            lines.append(tmpline)

    data_rows = lines[1:]  # lines[0] is the column headers
    num_rows = len(data_rows)  # progress total counts data rows only

    for i, ln in enumerate(data_rows):
        ifr.print_progress(i, num_rows)
        tmp = ifr.smart_split(ln, sep=",")
        key = tmp[0]
        val = tmp[14]
        if val == "---":
            # this affy id has no gene symbol
            genelist = []
        elif "///" in val:
            # there are more than one GeneIds for this probe
            genelist = [x.strip() for x in val.split("///") if x.strip() != ""]
        else:
            genelist = [val]

        # Remove duplicates while keeping first-occurrence order, so the
        # resulting lists are deterministic (a set would reorder them
        # arbitrarily). The lists are tiny, so a linear scan is fine.
        unique_genes = []
        for gene in genelist:
            if gene not in unique_genes:
                unique_genes.append(gene)
        affy_dict[key] = unique_genes

    return affy_dict