Example #1
def replacePearsonPvalueWithZscore():
    all_sample_data={}
    for tissue in tissue_comparison_scores:
        for (r,p,sample) in tissue_comparison_scores[tissue]:
            all_sample_data[sample] = [] ### initialize an empty value list for each sample
        break

    for tissue in tissue_comparison_scores:
        for (r,p,sample) in tissue_comparison_scores[tissue]:
            all_sample_data[sample].append(r)

    sample_stats={}
    all_dataset_rho_values=[]
    ### Get average and standard deviation for all sample rho's
    for sample in all_sample_data:
        all_dataset_rho_values+=all_sample_data[sample]
        avg=statistics.avg(all_sample_data[sample])
        stdev=statistics.stdev(all_sample_data[sample])
        sample_stats[sample]=avg,stdev
    
    global_rho_avg = statistics.avg(all_dataset_rho_values)
    global_rho_stdev = statistics.stdev(all_dataset_rho_values)
    
    ### Replace the p-value for each rho
    for tissue in tissue_comparison_scores:
        scores = []
        for (r,p,sample) in tissue_comparison_scores[tissue]:
            #u,s=sample_stats[sample]
            #z = (r-u)/s
            z = (r-global_rho_avg)/global_rho_stdev ### Instead of doing this for the sample background, do it relative to all analyzed samples
            scores.append([r,z,sample])
        tissue_comparison_scores[tissue] = scores
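A minimal, self-contained sketch of the same transformation, using the standard-library statistics module in place of the project-local statistics.avg/stdev helpers these snippets rely on; the input data below is invented:

import statistics

tissue_comparison_scores = {
    'heart': [(0.91, 0.01, 's1'), (0.85, 0.02, 's2')],
    'liver': [(0.40, 0.30, 's1'), (0.55, 0.20, 's2')],
}

all_rhos = [r for scores in tissue_comparison_scores.values()
            for (r, p, sample) in scores]
mu = statistics.mean(all_rhos)   # global average rho
sd = statistics.stdev(all_rhos)  # global standard deviation

for tissue, scores in tissue_comparison_scores.items():
    tissue_comparison_scores[tissue] = [
        [r, (r - mu) / sd, sample] for (r, p, sample) in scores]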
Example #2
def reorderArraysOnly(filtered_exp_db,filetype,counts): 
    ###expr_group_list gives the final level order sorted, followed by the original index order as a tuple
    for probeset in filtered_exp_db:
        grouped_ordered_array_list = {}; group_list = []
        for x in expr_group_list:
            y = x[1]; group = x[2]  ### this is the new first index
            ### for example y = 5, therefore the filtered_exp_db[probeset][5] entry is now the first
            try:
                try: new_item = filtered_exp_db[probeset][y]
                except TypeError: print y,x,expr_group_list; kill
            except IndexError: print probeset,y,x,expr_group_list,'\n',filtered_exp_db[probeset];kill
            ###Used for comparison analysis
            try: grouped_ordered_array_list[group].append(new_item)
            except KeyError: grouped_ordered_array_list[group] = [new_item]
            
        ### For the exon-level expression data, export the group pair data for all pairwise comparisons to different comp files
        ###*******Include a database with the raw values saved for permuteAltAnalyze*******
        for info in comp_group_list:
            group1 = int(info[0]); group2 = int(info[1]); comp = str(info[0]),str(info[1])
            g1_data = grouped_ordered_array_list[group1]
            g2_data = grouped_ordered_array_list[group2]
            #print probeset, group1, group2, g1_data, g2_data, info;kill
            data = comparision_export_db[comp]
            values = [probeset]+g2_data+g1_data; values = string.join(values,'\t')+'\n' ###groups are reversed, since the labels are as well
            #raw_data_comps[probeset,comp] = temp_raw
            data.write(values)
            
        ### Export all values grouped from the array
        for group in grouped_ordered_array_list: group_list.append(group)
        group_list.sort(); combined_value_list=[]; avg_values=[]
        for group in group_list:
            g_data = grouped_ordered_array_list[group]
            if exp_analysis_type == 'expression':
                try: avg_gdata = statistics.avg(g_data); avg_values.append(avg_gdata)
                except Exception:
                    print g_data
                    print avg_values
                    kill
            combined_value_list+=g_data
        
        if exp_data_format == 'non-log' and counts == 'no':
            try: combined_value_list = logTransform(combined_value_list)
            except Exception:
                print probeset, combined_value_list,comp_group_list,expr_group_list
                print filtered_exp_db[probeset]; kill

        if filetype == 'expression':
            ### Export the expression values for all samples grouped (if meeting the above thresholds)
            values = string.join([probeset]+combined_value_list,'\t')+'\n'
            fulldataset_export_object.write(values) ### Don't need this for dabg data

        if exp_analysis_type == 'expression':
            avg_values.sort() ### Sort to get the lowest dabg and largest average expression
            if filetype == 'dabg':
                if avg_values[0]<=dabg_p_threshold: dabg_summary[probeset]=[] ### store probeset if the minimum p<user-threshold
            else:
                #if 'ENSMUSG00000018263:' in probeset: print probeset,[avg_values[-1],expression_threshold]
                if avg_values[-1]>=expression_threshold:
                    expression_summary[probeset]=[] ### store probeset if the largest average expression >= user threshold
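The reordering idiom at the heart of this function: each entry of expr_group_list carries the original column index, so values are pulled out of a row in the new order and bucketed by group. A sketch with illustrative data (the names and tuple shapes here are assumptions, not the actual AltAnalyze structures):

from collections import defaultdict

row = [5.1, 7.2, 6.0, 8.3]  # one probeset's expression values
expr_group_list = [(0, 2, 1), (1, 0, 1), (2, 3, 2), (3, 1, 2)]  # (sort key, original index, group)

grouped = defaultdict(list)
for _, original_index, group in expr_group_list:
    grouped[group].append(row[original_index])

print(dict(grouped))  # {1: [6.0, 5.1], 2: [8.3, 7.2]}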
Example #3
def plotFeatureBoxPlots(qc_db,dataset_name,feature_type):
    pylab.figure()    
    pylab.xlabel('Biological Sample Names')
    pylab.ylabel('Read Counts - Log2')
    pylab.title('Expression BoxPlots for %ss - %s' % (feature_type,dataset_name))
    #pylab.subplots_adjust(left=0.085, right=0.95, top=0.2, bottom=0.35)
    pylab.subplots_adjust(left=0.075, right=0.95, top=0.9, bottom=0.35)
    
    #axes = getAxes(scores) ### adds buffer space to the end of each axis and creates room for a legend
    #pylab.axis(axes)

    boxplots=[]
    samples=[]
    
    sample_sorted_list=[]
    
    for sample_name in qc_db:
        try: qc = qc_db[sample_name][feature_type]
        except Exception:
            print 'No junction data found for at least one sample:',sample_name; forceExit
        sample_sorted_list.append([statistics.avg(qc),statistics.stdev(qc),sample_name])
    sample_sorted_list.sort()
    sample_sorted_list.reverse()
    
    filename = 'QC-%s-BoxPlot-%s.pdf' % (dataset_name,feature_type)
    export_obj = export.ExportFile(root_dir + filename[:-4]+'.txt')
    export_obj.write('SampleID\tAverage Expression\n')
    
    firstEntry=True
    for (mean,stdev,sample_name) in sample_sorted_list:
        ls=[]; x_ls=[]; y_ls=[]
        qc = qc_db[sample_name][feature_type]
        boxplots.append(qc)
        samples.append(sample_name)
        export_obj.write(sample_name+'\t'+str(mean)+'\n')
        if firstEntry:
            threshold=mean-2*stdev
            firstEntry=False
        else:
            if mean<threshold:
                print sample_name,'expression is considered very low (2 standard deviations away from the max).'
    pylab.boxplot(boxplots, notch=0, whis=1.5, positions=None, widths=None, patch_artist=False)
    #pylab.boxplot(boxplots, notch=0, sym='+', vert=1, whis=1.5, positions=None, widths=None, patch_artist=False)
    xtickNames = pylab.setp(pylab.gca(), xticklabels=samples)
    pylab.setp(xtickNames, rotation=90, fontsize=10)
    export_obj.close()

    #print 'Exporting:',filename
    pylab.savefig(root_dir + filename)
    filename = filename[:-3]+'png'
    pylab.savefig(root_dir + filename) #,dpi=200
    graphic_link.append(['QC - BoxPlot-'+feature_type+' Expression',root_dir+filename])
    try:
        import gc
        pylab.clf()
        pylab.close()
        gc.collect()
    except Exception:
        pass
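The outlier check above sets its threshold from the highest-expressing sample: two standard deviations below that sample's mean. A minimal sketch of just that logic, with invented numbers:

# per-sample (mean, stdev, name) of log2 counts, sorted high to low
sample_sorted_list = [(10.0, 1.2, 's1'), (9.5, 1.0, 's2'), (4.0, 0.8, 's3')]

top_mean, top_stdev, _ = sample_sorted_list[0]
threshold = top_mean - 2 * top_stdev  # 2 SDs below the top sample
for mean, stdev, name in sample_sorted_list[1:]:
    if mean < threshold:
        print(name, 'expression is considered very low')  # flags s3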
Example #4
def studAvg():
    studentName = input('Student Name:')
    try:
        average = avg(studentdict[studentName])
        print(studentName,'has an average grade of',average)
    except KeyError:
        print('Student',studentName,'not found')   
    print(studentdict)
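The function depends on avg and studentdict being defined elsewhere in the program. A runnable sketch with those external names filled in (the sample data is invented):

from statistics import mean as avg

studentdict = {'Ada': [90, 95, 88], 'Alan': [75, 80]}

def studAvg():
    studentName = input('Student Name:')
    try:
        average = avg(studentdict[studentName])
        print(studentName, 'has an average grade of', average)
    except KeyError:
        print('Student', studentName, 'not found')

studAvg()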
Example #5
def combine_profiles(profile_list):
    profile_group_sizes={}
    for db in profile_list:
        for key in db: profile_group_sizes[key] = len(db[key])
        break

    new_profile_db={}
    for key in profile_group_sizes:
        x = profile_group_sizes[key] ###number of elements in list for key
        new_val_list=[]; i = 0
        while i<x:
            temp_val_list=[]
            for db in profile_list:
                if key in db: val = db[key][i]; temp_val_list.append(val)
            i+=1; val_avg = statistics.avg(temp_val_list); new_val_list.append(val_avg)
        new_profile_db[key] = new_val_list
    return new_profile_db
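Example input and output, assuming the project-local statistics.avg behaves like an arithmetic mean; values are averaged position by position across the profiles that contain each key:

profile_list = [
    {'geneA': [1.0, 2.0], 'geneB': [4.0, 4.0]},
    {'geneA': [3.0, 4.0]},
]
# combine_profiles(profile_list) would return:
# {'geneA': [2.0, 3.0], 'geneB': [4.0, 4.0]}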
Example #6
def parse_input_data(filename,data_type):
    fn=filepath(filename); first_line = 1; array_group_name_db = {}; z=0; array_group_db = {}; output_file = []
    #print "Reading",filename
    secondary_data_type = export.getParentDir(filename) ### e.g., expression or counts
    
    for line in open(fn,'rU').xreadlines():
      data = cleanUpLine(line); t = string.split(data,'\t'); probeset = t[0]; z+=1
      if first_line == 1:
          first_line = 0 #makes this value null for the next loop of actual array data
          ###Below occurs if the data is raw as opposed to precomputed
          if data_type == 'export':
              if array_type == 'exon': folder = 'ExonArray'+'/'+species + '/'
              elif array_type == 'gene': folder = 'GeneArray'+'/'+species + '/'
              elif array_type == 'junction': folder = 'JunctionArray'+'/'+species + '/'
              elif array_type == 'RNASeq': folder = 'RNASeq'+'/'+species + '/'
              else: folder = array_type + '/'
              parent_path = root_dir+'AltExpression/'+folder
              if array_type == 'RNASeq':
                  output_file =  altanalzye_input[0:-4] + '.ExpCutoff-' + str(original_exp_threshold) +'_'+ filter_method+'.txt'
              else:
                  output_file = altanalzye_input[0:-4] + '.p' + str(int(100*p)) +'_'+ filter_method+'.txt'
              output_file_dir = parent_path+output_file
              print "...Exporting",output_file_dir
              export_data = export.createExportFile(output_file_dir,root_dir+'AltExpression/'+folder)
              fn=filepath(output_file_dir); export_data = open(fn,'w');
              export_data.write(line)
          if ':' in t[1]:
              array_group_list = []; x=0 ###gives us an original index value for each entry in the group
              for entry in t[1:]:
                  array_group,array_name = string.split(entry,':')
                  try:
                      array_group_db[array_group].append(x)
                      array_group_name_db[array_group].append(array_name)
                  except KeyError:
                      array_group_db[array_group] = [x]
                      array_group_name_db[array_group] = [array_name]
                      ### below only occurs with a new group addition
                      array_group_list.append(array_group) #use this to generate comparisons in the below linked function
                  x += 1
          #print '##### array_group_list',array_group_list
      elif len(probeset)>0 and data_type != 'export':
          ###Use the index values from above to assign each expression value to a new database
          temp_group_array={}; array_index_list = []  ###Use this list for permutation analysis
          for group in array_group_db:
              #array_index_list.append(array_group_db[group])
              group_values = []
              for array_index in array_group_db[group]:
                  try: exp_val = float(t[array_index+1])
                  except IndexError: print t, z,'\n',array_index,'\n',group, probeset;kill
                  group_values.append(exp_val)
              avg_stat = statistics.avg(group_values)

              if data_type == 'expression':
                  ###If non-log array data
                  if exp_data_format == 'non-log':
                      ### This works better for RNASeq as opposed to log transforming and then filtering which is more stringent and different than the filtering in ExonArray().
                      if array_type == 'RNASeq':
                        if normalization_method == 'RPKM' and secondary_data_type == 'expression':
                            if ':I' in probeset: k=1 ### Don't require an RPKM threshold for intron IDs (these will likely never meet this unless small or fully retained and highly expressed)
                            elif ':' not in probeset:
                                if avg_stat>=gene_rpkm_threshold: k=1
                                else: k=0
                            elif avg_stat>=exon_rpkm_threshold: k=1
                            elif '-' in probeset: k=1 ### Don't consider RPKM for junctions, just counts
                            else: k=0
                            #if 'ENSMUSG00000045991:E2.2' in probeset: print [probeset, normalization_method, secondary_data_type, gene_rpkm_threshold, avg_stat, k]
                        else: ### Otherwise, we are looking at count data
                            if '-' in probeset: ### junction meeting minimum read-count number
                                if avg_stat>=junction_exp_threshold: k=1 ### junction_exp_threshold is the same as nonlog_exp_threshold
                                else: k=0
                            elif ':' not in probeset:
                                if avg_stat>=gene_exp_threshold: k=1
                                else: k=0
                            else: ### exon or intron meeting minimum read-count number
                                if avg_stat>=exon_exp_threshold: k=1
                                else: k=0
                            #if 'ENSMUSG00000045991:E2.2' in probeset: print [probeset, normalization_method, secondary_data_type, exon_exp_threshold, junction_exp_threshold, avg_stat, k]
                      else:
                        if avg_stat>=nonlog_exp_threshold: k=1
                        else: k=0
                  elif avg_stat>=log_expression_threshold: k=1
                  else: k=0
                  if normalization_method == 'RPKM' and secondary_data_type == 'expression': ### Treat as dabg p-value
                      try: pvalue_status_db[probeset].append(k)
                      except KeyError: pvalue_status_db[probeset] = [k]
                  else:
                      try: expression_status_db[probeset].append(k)
                      except KeyError: expression_status_db[probeset] = [k]
                  #if probeset == '3209315': print [group],k,len(group_values),array_group_list
              if data_type == 'p-value':
                  if avg_stat<=p: k=1
                  else: k=0
                  #if 'G7216513_a_at' in probeset: print k, avg_stat
                  try: pvalue_status_db[probeset].append(k)
                  except KeyError: pvalue_status_db[probeset] = [k]
      elif data_type == 'export':
          if exp_data_format == 'non-log':
              ### This code was added in version 1.16 in conjunction with a switch from logstatus to
              ### non-log in AltAnalyze to prevent "Process AltAnalyze Filtered" associated errors
              exp_values = t[1:]; exp_values_log2=[]
              for exp_val in exp_values:
                  exp_values_log2.append(str(math.log(float(exp_val),2))) ### exp_val+=1 was removed in 2.0.5
              line = string.join([probeset]+exp_values_log2,'\t')+'\n'
          try: null = export_db[probeset]; export_data.write(line)
          except KeyError: null = [] ### occurs if not a probeset to include in the filtered results export file
    if data_type == 'export': export_data.close()
    return output_file
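The filtering core of this parser reduces each group's average statistic to a binary flag against the relevant threshold and accumulates the flags per probeset. A standalone sketch of that pattern (the threshold value and names here are invented, not the module's globals):

import statistics

log_expression_threshold = 7.0
expression_status_db = {}

def flag_group(probeset, group_values):
    avg_stat = statistics.mean(group_values)
    k = 1 if avg_stat >= log_expression_threshold else 0
    expression_status_db.setdefault(probeset, []).append(k)

flag_group('ps1', [7.5, 8.0])  # expressed in this group
flag_group('ps1', [5.0, 6.0])  # not expressed in this group
print(expression_status_db)    # {'ps1': [1, 0]}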
Example #7
def performGroupNormalization(filename,export_dir,platform):
    expressionDataFormat,increment,convertNonLogToLog = ExpressionBuilder.checkExpressionFileFormat(filename)
    groups_dir = string.replace(export_dir,'exp.','batch.')
    fn=unique.filepath(filename); row_number=0; exp_db={}; relative_headers_exported = False
    group_db = importGroups(groups_dir)
    export_data = export.ExportFile(export_dir)
    for line in open(fn,'rU').xreadlines():
        data = ExpressionBuilder.cleanUpLine(line)
        t = string.split(data,'\t')
        if data[0]=='#' and row_number==0: row_number = 0
        elif row_number==0:
            sample_list = t[1:]
            new_sample_list = []
            for group in group_db:
                group_samples = group_db[group]
                try:
                    sample_index_list = map(lambda x: sample_list.index(x), group_samples)
                    group_db[group] = sample_index_list
                    new_sample_list+=group_samples
                except Exception:
                    missing=[]
                    for x in group_samples:
                        if x not in sample_list: missing.append(x)
                    print 'missing:',missing
                    print t
                    print sample_list
                    print filename, groups_dir
                    print 'Unknown Error!!! Skipping cluster input file build (check column and row formats for conflicts)'; forceExit
            title = string.join([t[0]]+new_sample_list,'\t')+'\n' ### output the new sample order (group file order)
            export_data.write(title)
            row_number=1
        else:
            gene = t[0]
            if expressionDataFormat == 'non-log' and (convertNonLogToLog or platform == 'RNASeq'):
                ### Convert to log2 RPKM values - or counts
    
                try: all_values = map(lambda x: math.log(float(x)+increment,2), t[1:])
                except Exception:
                    all_values = ExpressionBuilder.logTransformWithNAs(t[1:],increment)
            else:
                try: all_values = map(float,t[1:])
                except Exception:
                    all_values = ExpressionBuilder.logTransformWithNAs(t[1:],increment)
            row_number+=1 ### Keep track of the first gene as to write out column headers for the relative outputs
            gene_log_folds = []

            for group in group_db:
                sample_index_list = group_db[group]
                ### Calculate log-fold values relative to the mean of all sample expression values
                try: values = map(lambda x: all_values[x], sample_index_list) ### simple and fast way to reorganize the samples
                except Exception:
                    print len(values), sample_index_list;kill
                try: avg = statistics.avg(values)
                except Exception:
                    values2=[]
                    for v in values:
                        try: values2.append(float(v))
                        except Exception: pass
                    values = values2
                    try: avg = statistics.avg(values)
                    except Exception:
                        if len(values)>0: avg = values[0]
                        else: avg = 0
                try: log_folds = map(lambda x: (x-avg), values)
                except Exception: 
                    log_folds=[]
                    for x in values:
                        try: log_folds.append(x-avg)
                        except Exception: log_folds.append('')
                gene_log_folds+=log_folds                            
            gene_log_folds = map(lambda x: str(x),gene_log_folds)
            export_data.write(string.join([gene]+gene_log_folds,'\t')+'\n')
    export_data.close()
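The core computation: within each group, every sample's log2 value is re-expressed relative to the group mean. A minimal sketch with the standard-library statistics module:

import statistics

group_values = [8.0, 9.0, 10.0]      # log2 expression for one group
avg = statistics.mean(group_values)  # 9.0
log_folds = [v - avg for v in group_values]
print(log_folds)  # [-1.0, 0.0, 1.0]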
Example #8
def importTableEntries(filename,filter_db,ensembl_exon_db,gene_db,root_dir,transpose,display,showIntrons,analysisType='plot'):
    import collections
    average_samples = True
    if showIntrons == 'yes': include_introns = True
    else: include_introns = False
    uid_db={} ### probeset or AltAnalyze RNA-Seq ID keyed
    uid_list={} ### ordered from first to last exon region
    uid_gene_db={} ### Lets us look at multiple genes
    try:
        import UI
        biotypes = UI.getBiotypes(filename)
    except Exception: biotypes={}
    for gene in ensembl_exon_db:
        uid_list[gene]=[]
        for (index,ed,id) in ensembl_exon_db[gene]:
            proceed = False
            if 'exp.' in filename:
                if include_introns:
                    proceed = True
                elif 'E' in ed.ExonID():
                    proceed = True
            else: ### Include introns for splicing index view
                if include_introns == True: proceed = True
                elif 'E' in ed.ExonID(): proceed = True
            if proceed:
                uid_db[id] = ed
                uid_list[gene].append(id)
            uid_gene_db[id]=gene

    if '_vs_' in filename: ### If only two groups, this is what will be output to the RawSplice folder - need to have this alternate way of getting the expression file location
        rootdir = string.split(filename, 'AltResults')[0]
        exp_dir = getValidExpFile(rootdir+'ExpressionInput')
        alt_groups_dir = string.split(exp_dir, 'ExpressionInput')[0]+'ExpressionInput/groups.'+findFilename(exp_dir)
        alt_groups_dir = string.replace(alt_groups_dir,'exp.','')
        
    start_time = time.time()
    fn = filepath(filename)
    matrix_gene_db={}
    stdev_gene_matrix_db={}
    row_header_gene={}
    ids={}
    x=0
    
    if 'heatmap' in analysisType:
        average_samples = False
        
    if '/' in filename:
        dataset_name = string.split(filename,'/')[-1][:-4]
    else:
        dataset_name = string.split(filename,'\\')[-1][:-4]
    for line in open(fn,'rU').xreadlines():         
        data = line.strip()
        t = string.split(data,'\t')
        if data[0]=='#': x=0
        elif x==0:
            if platform == 'RNASeq':
                removeExtension=True
            else:
                removeExtension=False
            group_db, column_header, sample_name_db = assignGroupColors(t[1:],'',removeExtension=removeExtension)
            x=1
            altresults = False
            if average_samples:
                if 'AltResults' in filename:
                    altresults=True
                    groups_dir = string.split(filename, 'AltResults')[0]+'ExpressionInput/groups.'+findFilename(filename)
                    if verifyFile(groups_dir)==False:
                        groups_dir = alt_groups_dir
                    new_column_header = reformatAltHeaders(t[3:])
                    start = 3
                else:
                    if 'exp.' in filename:
                        groups_dir = string.replace(filename,'exp.','groups.')
                    else:
                        groups_dir = string.replace(filename,'counts.','groups.')
                    new_column_header = column_header
                    start = 1 ### starting index with numeric values
                groups_dir = string.replace(groups_dir,'stats.','groups.')
                groups_dir = string.replace(groups_dir,'-steady-state.txt','.txt') ### groups is for the non-steady-state file
                
                try: group_index_db=collections.OrderedDict()
                except Exception:
                    import ordereddict
                    group_index_db = ordereddict.OrderedDict()
                ### use comps in the future to visualize group comparison changes
                sample_list,group_sample_db,group_db,group_name_sample_db,comp_groups,comps_name_db = ExpressionBuilder.simpleGroupImport(groups_dir)
                for item in sample_list:
                    group_name = group_db[item]
                    proceed=False
                    try: sample_index = new_column_header.index(item); proceed=True
                    except Exception:
                        try:
                            item = string.replace(item,'.bed','')
                            item = string.replace(item,'.CEL','') ### Probe-level analyses as RNA-Seq
                            item = string.replace(item,'.cel','')
                            item = string.replace(item,'.txt','')
                            item = string.replace(item,'.TXT','')
                            item = string.replace(item,'.TAB','')
                            item = string.replace(item,'.tab','')
                            sample_index = new_column_header.index(item)
                            proceed=True
                        except Exception:
                            pass
                            #print [item]
                            #print column_header
                            #print Error
                    if proceed:
                        try: group_index_db[group_name].append(sample_index)
                        except Exception:
                            try: group_index_db[group_name] = [sample_index] ### dictionary of group to input file sample indexes
                            except Exception: pass ### Occurs when analyzing splicing-index for two groups when more than two groups exist (error from 5 lines up)
                groups = map(str, group_index_db) ### store group names
                new_sample_list = map(lambda item: group_db[item], sample_list) ### lookup index of each sample in the ordered group sample list
                column_header = groups
            else:
                if 'AltResults' in filename: start = 3
                else: start = 1 ### starting index with numeric values
                column_header = t[start-1:]
            row_number=1   
        else:
            if ' ' not in t and '' not in t: ### Occurs for rows with missing data
                uid = t[start-1]
                if ';' in uid:
                    uid = string.split(uid,';')[0]
                ids[uid]=None
                ens_geneID = string.split(uid,':')[0]
                #if ens_geneID in gene_db: print uid
                if uid in filter_db or ('heatmap' in analysisType and ens_geneID in gene_db):
                    try:
                        if len(biotypes)==1 and 'junction' in biotypes:
                            gene = ens_geneID
                        else:
                            gene = uid_gene_db[uid]
                        try: row_header_gene[gene].append(uid)
                        except Exception: row_header_gene[gene] = [uid]
                        if average_samples == False:
                            values = map(float,t[start:])
                            try: matrix_gene_db[gene].append(values)
                            except Exception: matrix_gene_db[gene]=[values]
                        else:
                            if platform == 'RNASeq' and altresults==False:
                                ### Convert to log2 RPKM values - or counts
                                values = map(lambda x: math.log(float(x),2), t[start:])
                            else:
                                values = map(float,t[start:])
                                
                            if 'AltResults' in filename: ### If splicing scores, normalize these to the mean values
                                mean = statistics.avg(values)
                                values = map(lambda x: x-mean, values)
                            avg_ls=[]; std_ls = []
                            for group_name in group_index_db:
                                group_values = map(lambda x: values[x], group_index_db[group_name]) ### simple and fast way to reorganize the samples
                                avg = statistics.avg(group_values)
                                try: st_err = statistics.stdev(group_values)/math.sqrt(len(group_values))
                                except Exception:
                                    ### Occurs if no replicates in the dataset
                                    st_err = 0
                                avg_ls.append(avg)
                                std_ls.append(st_err)
                            try: matrix_gene_db[gene].append(avg_ls)
                            except Exception: matrix_gene_db[gene]=[avg_ls]
                            try: stdev_gene_matrix_db[gene].append(std_ls)
                            except Exception: stdev_gene_matrix_db[gene]=[std_ls]
                    except Exception:
                        #print traceback.format_exc()
                        pass
            x+=1

    global colors
    original_column_header = list(column_header)
    if len(uid_list)==0:
        print 'No genes found in the exon expression database'; forceNoExonExpError
    successfully_output_genes=0
    display_count=0 ### Only display a certain number of genes
    
    for last_gene in uid_list: pass
    for gene in uid_list:
        fig = pylab.figure() ### Create this here - resulting in a single figure for memory purposes
        new_header = []
        new_matrix = []
        new_stdev = []
        annotation_list=[]
        gene_symbol = gene_db[gene]
        try: matrix = matrix_gene_db[gene]
        except Exception:
            print gene_symbol, 'not in alternative expression database'
            continue ### go the next gene - no alt.expression for this gene
        row_header = row_header_gene[gene]

        try: stdev_matrix = stdev_gene_matrix_db[gene]
        except Exception: pass
        for uid in uid_list[gene]:
            #print row_header;sys.exit()
            try:
                i = row_header.index(uid) ### If the ID is in the filtered annotated exon list (not just core)
                new_header.append(uid)
                try: new_matrix.append(matrix[i])
                except Exception: print uid, i,len(matrix);sys.exit()
                ed = uid_db[uid]
                annotation_list.append(ed)
                try: new_stdev.append(stdev_matrix[i])
                except Exception: pass
            except Exception: pass

        if len(new_matrix)>0:
            matrix = new_matrix
        if len(new_header)>0:
            row_header = new_header
        if 'heatmap' in analysisType:
            export_dir = root_dir + gene_symbol + '-heatmap.txt'
            export_obj = export.ExportFile(export_dir)
            export_obj.write(string.join(column_header,'\t')+'\n')
            ki=0
            if len(annotation_list)>0:
                for ed in annotation_list:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x,2), matrix[ki])
                    else: values = matrix[ki]
                    export_obj.write(string.join([ed.ExonID()] + map(str,values),'\t')+'\n')
                    ki+=1
                row_metric = 'euclidean'; row_method = None
            else:
                ### Just junctions analyzed here... no sorted junctions yet
                ki=0
                for uid in row_header_gene[gene]:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x,2), matrix[ki])
                    else: values = matrix[ki]
                    export_obj.write(string.join([uid] + map(str,values),'\t')+'\n')
                    ki+=1
                row_metric = 'euclidean'; row_method = 'average'
            export_obj.close()
            import clustering
            
            column_metric = 'euclidean'; column_method = 'hopach'
            color_gradient = 'red_black_sky'; transpose = False; graphic_links=[]
            if ki>100: transpose = True
            if gene == last_gene: display = True
            else: display = False
            graphic_links = clustering.runHCexplicit(export_dir, graphic_links, row_method, row_metric, column_method, column_metric, color_gradient, transpose, display=display, Normalize=True, compressAxis = False, contrast = 2.5)
            successfully_output_genes+=1
        else:
            stdev_matrix = new_stdev
            time_diff = str(round(time.time()-start_time,1))
            #print '%d rows and %d columns imported for %s in %s seconds...' % (len(matrix),len(column_header),dataset_name,time_diff)
            if transpose == True:
                matrix = map(numpy.array, zip(*matrix)) ### converts the transposed rows to numpy arrays
                column_header, row_header = row_header, original_column_header
                stdev_matrix = map(numpy.array, zip(*stdev_matrix))
            matrix = numpy.array(matrix)

            stdev_matrix = numpy.array(stdev_matrix)
            try:
                if len(uid_list)>10:
                    #if display_count==5: display=False
                    display=False
                if display_count==0:
                    ### store a consistent color palette to use
                    colors=[]
                    """
                    k=0
                    while k < len(row_header):
                        colors.append(tuple(rand(3)))
                        k+=1"""
                    #http://stackoverflow.com/questions/3016283/create-a-color-generator-from-given-colormap-in-matplotlib
                    cm = pylab.cm.get_cmap('gist_rainbow') #gist_ncar
                    for i in range(len(row_header)):
                        colors.append(cm(1.*i/len(row_header)))  # color will now be an RGBA tuple
        
                plotExonExpression(fig,matrix,stdev_matrix,row_header,column_header,dataset_name,annotation_list,gene_symbol,root_dir,display=display)
                successfully_output_genes+=1
                display_count+=1
            except Exception:
                print traceback.format_exc();sys.exit()
                print gene_symbol, 'failed'
        try: pylab.close()
        except Exception: pass
        if successfully_output_genes>0:
            #try: print 'Gene graphs exported to ExonPlots...'
            #except Exception: pass
            pass
        else:
            print '\nWARNING!!!! No genes with associated alternative exon evidence found\n'; forceNoExonExpError
        try:
            import gc
            fig.clf()
            pylab.close()
            gc.collect()
        except Exception:
            pass
Example #9
def reorder(data,data_headers,array_order,comp_group_list,probeset_db,include_raw_data,array_type,norm,fl,logvalues=True,blanksPresent=False):
    ###array_order gives the final level order sorted, followed by the original index order as a tuple                   
    expbuilder_value_db = {}; group_name_db = {}; summary_filtering_stats = {}; pval_summary_db= {}
    replicates = 'yes'
    
    stat_result_names = ['avg-','log_fold-','fold-','rawp-','adjp-']
    group_summary_result_names = ['avg-']
    
    ### Define expression variables
    try: probability_statistic = fl.ProbabilityStatistic()
    except Exception: probability_statistic = 'unpaired t-test'
    try: gene_exp_threshold = math.log(fl.GeneExpThreshold(),2)
    except Exception: gene_exp_threshold = 0
    try: gene_rpkm_threshold = float(fl.RPKMThreshold())
    except Exception: gene_rpkm_threshold = 0
    try: FDR_statistic = fl.FDRStatistic()
    except Exception: FDR_statistic = 'Benjamini-Hochberg'
    calculateAsNonLog=True
    if blanksPresent:
        calculateAsNonLog=False
    
    ### Begin processing sample expression values according to the organized groups
    for row_id in data:
        try: gene = probeset_db[row_id][0]
        except TypeError: gene = '' #not needed if not altsplice data
        data_headers2 = {} #reset each time
        grouped_ordered_array_list = {}
        for x in array_order:
            y = x[1]  #this is the new first index
            group = x[2]
            group_name = x[3]
            group_name_db[group] = group_name
            #for example y = 5, therefore the data[row_id][5] entry is now the first
            try:
                try: new_item = data[row_id][y]
                except IndexError: print row_id,data[row_id],len(data[row_id]),y,len(array_order),array_order;kill
                if logvalues==False and calculateAsNonLog and array_type == 'RNASeq':
                    new_item = math.pow(2,new_item)
            except TypeError: new_item = ''  #this is for a spacer added in the above function
            try: grouped_ordered_array_list[group].append(new_item)
            except KeyError: grouped_ordered_array_list[group] = [new_item]
            try: data_headers2[group].append(data_headers[y])
            except KeyError: data_headers2[group]= [data_headers[y]]
        #perform statistics on each group comparison - comp_group_list: [(1,2),(3,4)]
        stat_results = {}
        group_summary_results = {}
        for comp in comp_group_list:
            group1 = int(comp[0])
            group2 = int(comp[1])
            group1_name = group_name_db[group1]
            group2_name = group_name_db[group2]
            groups_name = group1_name + "_vs_" + group2_name
            data_list1 = grouped_ordered_array_list[group1] 
            data_list2 = grouped_ordered_array_list[group2] #baseline expression
            if blanksPresent: ### Allows for empty cells
                data_list1 = filterBlanks(data_list1)
                data_list2 = filterBlanks(data_list2)
            try: avg1 = statistics.avg(data_list1)
            except Exception: avg1 = ''
            try: avg2 = statistics.avg(data_list2)
            except Exception: avg2=''
            try:
                if (logvalues == False and array_type != 'RNASeq') or (logvalues==False and calculateAsNonLog):
                    fold = avg1/avg2
                    log_fold = math.log(fold,2)
                    if fold<1: fold = -1.0/fold
                else:
                    log_fold = avg1 - avg2
                    fold = statistics.log_fold_conversion(log_fold) 
            except Exception:
                log_fold=''; fold=''
            try:
                #t,df,tails = statistics.ttest(data_list1,data_list2,2,3) #unpaired student ttest, calls p_value function
                #t = abs(t); df = round(df); p = str(statistics.t_probability(t,df))
                p = statistics.runComparisonStatistic(data_list1,data_list2,probability_statistic)
            except Exception: p = 1; sg = 1; N1=0; N2=0
            comp = group1,group2
            if array_type == 'RNASeq': ### Also non-log but treated differently
                if 'RPKM' == norm: adj = 0
                else: adj = 1
                if calculateAsNonLog == False:
                    try: avg1 = math.pow(2,avg1)-adj; avg2 = math.pow(2,avg2)-adj
                    except Exception: avg1=''; avg2=''
                if 'RPKM' == norm:
                    if avg1 < gene_rpkm_threshold and avg2 < gene_rpkm_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
                else:
                    if avg1 < gene_exp_threshold and avg2 < gene_exp_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
                    #if row_id=='ENSG00000085514':
                    #if fold=='Insufficient Expression':
                    #print [norm, avg1, avg2, fold, comp, gene_exp_threshold, gene_rpkm_threshold, row_id]
                    #5.96999111075 7.72930768675 Insufficient Expression (3, 1) 1.0 ENSG00000085514
            if gene_rpkm_threshold!=0 and calculateAsNonLog: ### Any other data
                a1 = nonLogAvg(data_list1)
                a2 = nonLogAvg(data_list2)
                #print [a1,a2,gene_rpkm_threshold]
                if a1<gene_rpkm_threshold and a2<gene_rpkm_threshold:
                    log_fold = 'Insufficient Expression'
                    fold = 'Insufficient Expression'
                #print log_fold;kill
            try:
                gs = statistics.GroupStats(log_fold,fold,p)
                stat_results[comp] = groups_name,gs,group2_name
                if probability_statistic == 'moderated t-test':
                    gs.setAdditionalStats(data_list1,data_list2) ### Assuming equal variance
                if probability_statistic == 'moderated Welch-test':
                    gs.setAdditionalWelchStats(data_list1,data_list2) ### Assuming unequal variance
            except Exception:
                null=[]; replicates = 'no' ### Occurs when not enough replicates
                #print comp, len(stat_results); kill_program
            group_summary_results[group1] = group1_name,[avg1]
            group_summary_results[group2] = group2_name,[avg2]

        ### Replaces the below method to get the largest possible comparison fold and ftest p-value
        grouped_exp_data = []; avg_exp_data = []
        for group in grouped_ordered_array_list:
            data_list = grouped_ordered_array_list[group]
            if blanksPresent: ### Allows for empty cells
                data_list = filterBlanks(data_list)
            if len(data_list)>0: grouped_exp_data.append(data_list)
            try: avg = statistics.avg(data_list); avg_exp_data.append(avg)
            except Exception:
                avg = ''
                #print row_id, group, data_list;kill
        try: avg_exp_data.sort(); max_fold = avg_exp_data[-1]-avg_exp_data[0]
        except Exception: max_fold = 'NA'
        try: ftestp = statistics.OneWayANOVA(grouped_exp_data)
        except Exception: ftestp = 1
        gs = statistics.GroupStats(max_fold,0,ftestp)
        summary_filtering_stats[row_id] = gs
        
        stat_result_list = []
        for entry in stat_results:
            data_tuple = entry,stat_results[entry]
            stat_result_list.append(data_tuple)
        stat_result_list.sort()
        
        grouped_ordered_array_list2 = []
        for group in grouped_ordered_array_list:
            data_tuple = group,grouped_ordered_array_list[group]
            grouped_ordered_array_list2.append(data_tuple)
        grouped_ordered_array_list2.sort() #now the list is sorted by group number
        
        ###for each rowid, add in the reordered data, and new statistics for each group and for each comparison
        for entry in grouped_ordered_array_list2:
            group_number = entry[0]
            original_data_values = entry[1]
            if include_raw_data == 'yes': ###optionally exclude the raw values
                for value in original_data_values:
                    if array_type == 'RNASeq':
                        if norm == 'RPKM': adj = 0
                        else: adj = 1
                        if calculateAsNonLog == False:
                            value = math.pow(2,value)-adj
                    try: expbuilder_value_db[row_id].append(value)
                    except KeyError: expbuilder_value_db[row_id] = [value]
            if group_number in group_summary_results:
                group_summary_data = group_summary_results[group_number][1] #the group name is listed as the first entry
                for value in group_summary_data:
                    try: expbuilder_value_db[row_id].append(value)
                    except KeyError: expbuilder_value_db[row_id] = [value]
            for info in stat_result_list:
                if info[0][0] == group_number: #comp,(groups_name,[avg1,log_fold,fold,ttest])
                    comp = info[0]; gs = info[1][1]
                    expbuilder_value_db[row_id].append(gs.LogFold())
                    expbuilder_value_db[row_id].append(gs.Fold())
                    expbuilder_value_db[row_id].append(gs.Pval())
                    ### Create a placeholder and store the position of the adjusted p-value to be calculated
                    expbuilder_value_db[row_id].append('') 
                    gs.SetAdjPIndex(len(expbuilder_value_db[row_id])-1)
                    gs.SetPvalIndex(len(expbuilder_value_db[row_id])-2)
                    pval_summary_db[(row_id,comp)] = gs

    ###do the same for the headers, but at the dataset level (redundant processes)
    array_fold_headers = []; data_headers3 = []
    try:
        for group in data_headers2:
            data_tuple = group,data_headers2[group]  #e.g. 1, ['X030910_25_hl.CEL', 'X030910_29R_hl.CEL', 'X030910_45_hl.CEL'])
            data_headers3.append(data_tuple)
        data_headers3.sort()
    except UnboundLocalError:
        print data_headers,'\n',array_order,'\n',comp_group_list,'\n'; kill_program
    
    for entry in data_headers3:
        x = 0 #indicates the times through a loop
        y = 0 #indicates the times through a loop
        group_number = entry[0]
        original_data_values = entry[1]
        if include_raw_data == 'yes': ###optionally exclude the raw values
            for value in original_data_values:
                array_fold_headers.append(value)
        if group_number in group_summary_results:
            group_name = group_summary_results[group_number][0]
            group_summary_data = group_summary_results[group_number][1]
            for value in group_summary_data:
                combined_name = group_summary_result_names[x] + group_name  #group_summary_result_names = ['avg-']
                array_fold_headers.append(combined_name)
                x += 1 #increment the loop index

        for info in stat_result_list:
            if info[0][0] == group_number:  #comp,(groups_name,[avg1,log_fold,fold,ttest],group2_name)
                groups_name = info[1][0]
                only_add_these = stat_result_names[1:]
                for value in only_add_these:
                    new_name = value + groups_name
                    array_fold_headers.append(new_name)

    ###For the raw_data only export we need the headers for the different groups (data_headers2) and group names (group_name_db)       
    raw_data_comp_headers = {}
    for comp in comp_group_list:
        temp_raw = []
        group1 = int(comp[0]);group2 = int(comp[1])
        comp = str(comp[0]),str(comp[1])
        g1_headers = data_headers2[group1]
        g2_headers = data_headers2[group2]
        g1_name = group_name_db[group1]
        g2_name = group_name_db[group2]
        for header in g2_headers: temp_raw.append(g2_name+':'+header)
        for header in g1_headers: temp_raw.append(g1_name+':'+header)
        raw_data_comp_headers[comp] = temp_raw

    ###Calculate adjusted ftest p-values using BH95 sorted method
    statistics.adjustPermuteStats(summary_filtering_stats)
    
    ### Calculate adjusted p-values for all p-values using BH95 sorted method
    round=0
    for info in comp_group_list:
        compid = int(info[0]),int(info[1]); pval_db={}
        for (rowid,comp) in pval_summary_db:
            if comp == compid:
                gs = pval_summary_db[(rowid,comp)]
                pval_db[rowid] = gs

        if 'moderated' in probability_statistic and replicates == 'yes':
            ### Moderates the original reported test p-value prior to adjusting
            try: statistics.moderateTestStats(pval_db,probability_statistic)
            except Exception:
                if round == 0:
                    if replicates == 'yes':
                        print 'Moderated test failed due to issue with mpmath or out-of-range values\n   ... using unmoderated unpaired test instead!'
                null=[] ### Occurs when not enough replicates
            round+=1
            
        if FDR_statistic == 'Benjamini-Hochberg':
            statistics.adjustPermuteStats(pval_db)
        else:
            ### Calculate a qvalue (https://github.com/nfusi/qvalue)
            import numpy; import qvalue; pvals = []; keys = []
            for key in pval_db: pvals.append(pval_db[key].Pval()); keys.append(key)
            pvals = numpy.array(pvals)
            pvals = qvalue.estimate(pvals)
            for i in range(len(pvals)): pval_db[keys[i]].SetAdjP(pvals[i])
            
        for rowid in pval_db:
            gs = pval_db[rowid]
            expbuilder_value_db[rowid][gs.AdjIndex()] = gs.AdjP() ### set the place holder to the calculated value
            if 'moderated' in probability_statistic:
                expbuilder_value_db[rowid][gs.RawIndex()] = gs.Pval() ### Replace the non-moderated with a moderated p-value
                
    pval_summary_db=[]            
    ###Finished re-ordering lists and adding statistics to expbuilder_value_db
    return expbuilder_value_db, array_fold_headers, summary_filtering_stats, raw_data_comp_headers
Example #10
def nonLogAvg(data_list):
    return statistics.avg(map(lambda x: math.pow(2,x)-1,data_list))
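What this computes: the inputs are assumed to be log2(x+1)-transformed, so each value is back-transformed with 2**v - 1 before averaging. A standard-library equivalent:

import math
import statistics

def non_log_avg(data_list):
    return statistics.mean(math.pow(2, x) - 1 for x in data_list)

print(non_log_avg([1.0, 2.0]))  # mean of [1.0, 3.0] -> 2.0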
Example #11
def statisticallyFilterFile(input_file,output_file,threshold):
    if 'exp.' in input_file:
        counts_file = string.replace(input_file,'exp.','geneCount.')
    else:
        counts_file = input_file[:-4]+'-geneCount.txt'
    sample_expressed_genes={}
    header=True
    junction_max=[]
    count_sum_array=[]
    for line in open(input_file,'rU').xreadlines():
        data = cleanUpLine(line)
        if '.csv' in input_file:
            t = string.split(data,',')
        else:
            t = string.split(data,'\t')
        if header:
            samples = t[1:]
            header=False
            count_sum_array=[0]*len(samples)
        else:
            try: values = map(float,t[1:])
            except Exception:
                if 'NA' in t[1:]:
                    tn = [0 if x=='NA' else x for x in t[1:]] ### Replace NAs
                    values = map(float,tn)
                else:
                    tn = [0 if x=='' else x for x in t[1:]] ### Replace empty strings
                    values = map(float,tn)       
                
            binarized_values = []
            for v in values:
                if v>threshold: binarized_values.append(1)
                else: binarized_values.append(0)
            count_sum_array = [sum(value) for value in zip(*[count_sum_array,binarized_values])]
            
    index=0
    distribution=[]
    count_sum_array_db={}
    samples_to_retain =[]
    samples_to_exclude = []
    for sample in samples:
        count_sum_array_db[sample] = count_sum_array[index]
        distribution.append(count_sum_array[index])
        index+=1
    import statistics
    distribution.sort()
    avg = int(statistics.avg(distribution))
    stdev = int(statistics.stdev(distribution))
    min_exp = int(min(distribution))
    cutoff = avg - (stdev*2)
    dev = 2
    print 'The average number of genes expressed above %s is %s, (SD is %s, min is %s)' % (threshold,avg,stdev,min_exp)
    if cutoff<0:
        if (stdev-avg)>0:
            cutoff = avg - (stdev/2); dev = 0.5
        else:
            cutoff = avg - stdev; dev = 1
    if min_exp>cutoff:
        cutoff = avg - stdev; dev = 1
    import export
    eo = export.ExportFile(counts_file)
    eo.write('Sample\tGenes Expressed(threshold:'+str(threshold)+')\n')
    for sample in samples: ### keep the original order
        if count_sum_array_db[sample]>cutoff:
            samples_to_retain.append(sample)
        else:
            samples_to_exclude.append(sample)
        eo.write(sample+'\t'+str(count_sum_array_db[sample])+'\n')
        
    eo.close()
    print len(samples_to_exclude), 'samples removed (# expressed genes more than %s SD below the mean) (%s)' % (dev,string.join(samples_to_exclude,', '))
    print 'Exporting the filtered expression file to:'
    print output_file
    filterFile(input_file,output_file,samples_to_retain)
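The adaptive cutoff logic above drops samples whose expressed-gene count falls more than N standard deviations below the mean, relaxing N from 2 when the cutoff would be degenerate. Isolated as a sketch (the counts are invented):

import statistics

def sample_cutoff(genes_expressed_per_sample):
    avg = int(statistics.mean(genes_expressed_per_sample))
    stdev = int(statistics.stdev(genes_expressed_per_sample))
    min_exp = min(genes_expressed_per_sample)
    cutoff, dev = avg - 2 * stdev, 2
    if cutoff < 0:
        if stdev - avg > 0:
            cutoff, dev = avg - stdev // 2, 0.5
        else:
            cutoff, dev = avg - stdev, 1
    if min_exp > cutoff:
        cutoff, dev = avg - stdev, 1
    return cutoff, dev

print(sample_cutoff([12000, 11500, 11800, 4000]))  # only the last sample falls below the returned cutoff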
Example #12
def generateConstitutiveExpression(exp_dbase,constitutive_gene_db,probeset_gene_db,pre_filtered_db,array_names,filename):
    """Generate Steady-State expression values for each gene for analysis in the main module of this package"""
    steady_state_db={}; k=0; l=0
    remove_nonexpressed_genes = 'no' ### By default set to 'no'

    ###1st Pass: Identify probesets for steady-state calculation
    for gene in probeset_gene_db:
        if avg_all_probes_for_steady_state == 'yes': average_all_probesets[gene] = probeset_gene_db[gene] ### These are all exon aligning (not intron) probesets
        else:
            if gene not in constitutive_gene_db: average_all_probesets[gene] = probeset_gene_db[gene]
            else:
                constitutive_probeset_list = constitutive_gene_db[gene]
                constitutive_filtered=[] ###Added this extra code to eliminate constitutive probesets not in exp_dbase (gene level filters are more efficient when dealing with this many probesets)
                for probeset in constitutive_probeset_list:
                    if probeset in probeset_gene_db[gene]: constitutive_filtered.append(probeset)
                if len(constitutive_filtered)>0: average_all_probesets[gene] = constitutive_filtered
                else: average_all_probesets[gene] = probeset_gene_db[gene]

    ###2nd Pass: Remove probesets that have no detected expression (keep all if none are expressed)
    if excludeLowExpressionExons:
        non_expressed_genes={} ### keep track of these for internal QC
        for gene in average_all_probesets:
            gene_probe_list=[]; x = 0
            for probeset in average_all_probesets[gene]:
                if probeset in pre_filtered_db: gene_probe_list.append(probeset); x += 1
            ###If no constitutive and there are probes with detected expression: replace entry
            if x >0: average_all_probesets[gene] = gene_probe_list
            elif remove_nonexpressed_genes == 'yes': non_expressed_genes[gene]=[]   

    if remove_nonexpressed_genes == 'yes':
        for gene in non_expressed_genes: del average_all_probesets[gene]
    ###3rd Pass: Make sure the probesets are present in the input set (this is not typical unless a user is loading a pre-filtered probeset expression dataset)
    for gene in average_all_probesets:
        v=0
        for probeset in average_all_probesets[gene]:
            try: null = exp_dbase[probeset]; v+=1
            except KeyError: null =[] ###occurs if the expression probeset list is missing some of these probesets
            if v==0: ###Therefore, no probesets were found that were previously predicted to be best constitutive
                try: average_all_probesets[gene] = probeset_gene_db[gene] ###expand the average_all_probesets to include any exon linked to the gene
                except KeyError: print gene, probeset, len(probeset_gene_db), len(average_all_probesets);kill
    
    for probeset in exp_dbase:
        array_count = len(exp_dbase[probeset]); break

    try: null = array_count
    except Exception:
        print 'WARNING...CRITICAL ERROR. Make sure the correct array type is selected and that all input expression files are indeed present (array_count ERROR).'; forceError
        
    ###Calculate avg expression for each array for each probeset (using constitutive values)
    gene_count_db={}
    for gene in average_all_probesets:
        x = 0 ###For each array, average all probeset expression values
        gene_sum=0
        probeset_list = average_all_probesets[gene]#; k+= len(average_all_probesets[gene])
        if array_type != 'RNASeq': ### Just retain the list of probesets for RNA-seq
            while x < array_count:
                exp_list=[] ### average all exp values for constitutive probesets for each array
                for probeset in probeset_list:
                    try:
                        exp_val = exp_dbase[probeset][x]
                        exp_list.append(exp_val)
                    except KeyError: null =[] ###occurs if the expression probeset list is missing some of these probesets
                try:
                    if len(exp_list)==0:                
                        for probeset in probeset_list:
                            try:
                                exp_val = exp_dbase[probeset][x]
                                exp_list.append(exp_val)
                            except KeyError: null =[] ###occurs if the expression probeset list is missing some of these probesets
                    avg_const_exp=statistics.avg(exp_list)
                    ### Add one average expression value per array in this loop
                    try: steady_state_db[gene].append(avg_const_exp)
                    except KeyError: steady_state_db[gene] = [avg_const_exp]
                except ZeroDivisionError: null=[] ### Occurs when processing a truncated dataset (for testing usually) - no values for the gene should be included
                x += 1

    l = len(probeset_gene_db) - len(steady_state_db)
    steady_state_export = filename[0:-4]+'-steady-state.txt'
    steady_state_export = string.replace(steady_state_export,'counts.','exp.')
    fn=filepath(steady_state_export); data = open(fn,'w'); title = 'Gene_ID'
    
    if array_type == 'RNASeq':
        import RNASeq
        steady_state_db, pre_filtered_db = RNASeq.calculateGeneLevelStatistics(steady_state_export,species,average_all_probesets,normalize_feature_exp,array_names,UserOptions,excludeLowExp=excludeLowExpressionExons)
        ### This "pre_filtered_db" replaces the above since the RNASeq module performs the exon and junction-level filtering, not ExonArray (RPKM and count based)
        ### Use pre_filtered_db to exclude non-expressed features for multi-group alternative exon analysis
        removeNonExpressedProbesets(pre_filtered_db,full_dataset_export_dir)
        reload(RNASeq)
    
    for array in array_names: title = title +'\t'+ array
    data.write(title+'\n')
    for gene in steady_state_db:
        ss_vals = gene
        for exp_val in steady_state_db[gene]:
            ss_vals = ss_vals +'\t'+ str(exp_val)
        data.write(ss_vals+'\n')
    data.close()
    exp_dbase={}; steady_state_db={}; pre_filtered_db={}
    #print k, "probesets were not found in the expression file, that could be used for the constitutive expression calculation"
    #print l, "genes were also not included that did not have such expression data"
    print "Steady-state data exported to",steady_state_export
Example #13
# As you import something, you can assign it a custom name using `as`
print('Let\'s alias something as we import it!')
print('importing e...')
from math import e
print('importing e as wahoo...')
from math import e as wahoo
print('T/F: e and wahoo are equal:', e == wahoo)
print()

# To summarize, the following three approaches all achieve the exact same thing

print('approach 1')
import statistics
avg = statistics.mean
print('The average of [1,2,3] is:', avg([1, 2, 3]))
# or...
# print('The average of [1,2,3] is:', statistics.mean([1, 2, 3]))
print()

print('approach 2')
from statistics import mean
avg = mean
print('The average of [1,2,3] is:', avg([1, 2, 3]))
# or...
# print('The average of [1,2,3] is:', mean([1, 2, 3]))
print()

print('approach 3')
from statistics import mean as avg
print('The average of [1,2,3] is:', avg([1, 2, 3]))
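
# A closely related fourth approach (an added note, not part of the original
# three): alias the module itself rather than one of its names.
print('approach 4 (module alias)')
import statistics as stats
print('The average of [1,2,3] is:', stats.mean([1, 2, 3]))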
Example #14
def reorder(data,
            data_headers,
            array_order,
            comp_group_list,
            probeset_db,
            include_raw_data,
            array_type,
            norm,
            fl,
            logvalues=True):
    ###array_order gives the final level order sorted, followed by the original index order as a tuple
    expbuilder_value_db = {}
    group_name_db = {}
    summary_filtering_stats = {}
    pval_summary_db = {}
    replicates = 'yes'

    stat_result_names = ['avg-', 'log_fold-', 'fold-', 'rawp-', 'adjp-']
    group_summary_result_names = ['avg-']

    ### Define expression variables
    try:
        probability_statistic = fl.ProbabilityStatistic()
    except Exception:
        probability_statistic = 'unpaired t-test'
    try:
        gene_exp_threshold = fl.GeneExpThreshold()
    except Exception:
        gene_exp_threshold = 0
    try:
        gene_rpkm_threshold = fl.RPKMThreshold()
    except Exception:
        gene_rpkm_threshold = 0
    calculateAsNonLog = True

    ### Begin processing sample expression values according to the organized groups
    for row_id in data:
        try:
            gene = probeset_db[row_id][0]
        except TypeError:
            gene = ''  #not needed if not altsplice data
        data_headers2 = {}  #reset each time
        grouped_ordered_array_list = {}
        for x in array_order:
            y = x[1]  #this is the new first index
            group = x[2]
            group_name = x[3]
            group_name_db[group] = group_name
            #for example y = 5, therefore the data[row_id][5] entry is now the first
            try:
                try:
                    new_item = data[row_id][y]
                except IndexError:
                    print row_id, data[row_id], len(data[row_id]), y, len(array_order), array_order
                    kill
                if logvalues == False and calculateAsNonLog and array_type == 'RNASeq':
                    new_item = math.pow(2, new_item)
            except TypeError:
                new_item = ''  #this is for a spacer added in the above function
            try:
                grouped_ordered_array_list[group].append(new_item)
            except KeyError:
                grouped_ordered_array_list[group] = [new_item]
            try:
                data_headers2[group].append(data_headers[y])
            except KeyError:
                data_headers2[group] = [data_headers[y]]
        #perform statistics on each group comparison - comp_group_list: [(1,2),(3,4)]
        stat_results = {}
        group_summary_results = {}
        for comp in comp_group_list:
            group1 = int(comp[0])
            group2 = int(comp[1])
            group1_name = group_name_db[group1]
            group2_name = group_name_db[group2]
            groups_name = group1_name + "_vs_" + group2_name
            data_list1 = grouped_ordered_array_list[group1]
            data_list2 = grouped_ordered_array_list[group2]  #baseline expression
            avg1 = statistics.avg(data_list1)
            try:
                avg2 = statistics.avg(data_list2)
            except ValueError:
                print data_list2, row_id
                forceError
            if (logvalues == False and array_type != 'RNASeq') or (logvalues == False and calculateAsNonLog):
                fold = avg1 / avg2
                log_fold = math.log(fold, 2)
                if fold < 1: fold = -1.0 / fold
            else:
                log_fold = avg1 - avg2
                fold = statistics.log_fold_conversion(log_fold)
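                # Worked example (added note): with log2 values avg1=6.0, avg2=4.0,
                # log_fold = 2.0 and the fold would be 4.0; a log_fold of -1.0 maps
                # to -2.0 under the signed fold convention of the non-log branch
                # above (assuming log_fold_conversion mirrors that convention)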
            try:
                #t,df,tails = statistics.ttest(data_list1,data_list2,2,3) #unpaired student ttest, calls p_value function
                #t = abs(t); df = round(df); p = str(statistics.t_probability(t,df))
                p = statistics.runComparisonStatistic(data_list1, data_list2, probability_statistic)
            except Exception:
                p = 1
                sg = 1
                N1 = 0
                N2 = 0
            comp = group1, group2
            if array_type == 'RNASeq':  ### Also non-log but treated differently
                if norm == 'RPKM': adj = 0
                else: adj = 1
                if calculateAsNonLog == False:
                    avg1 = math.pow(2, avg1) - adj
                    avg2 = math.pow(2, avg2) - adj
                if norm == 'RPKM':
                    if avg1 < gene_rpkm_threshold and avg2 < gene_rpkm_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
                else:
                    if avg1 < gene_exp_threshold and avg2 < gene_exp_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
            try:
                gs = statistics.GroupStats(log_fold, fold, p)
                stat_results[comp] = groups_name, gs, group2_name
                if probability_statistic == 'moderated t-test':
                    gs.setAdditionalStats(data_list1, data_list2)  ### Assuming equal variance
                if probability_statistic == 'moderated Welch-test':
                    gs.setAdditionalWelchStats(data_list1, data_list2)  ### Assuming unequal variance
            except Exception:
                null = []
                replicates = 'no'  ### Occurs when not enough replicates
                #print comp, len(stat_results); kill_program
            group_summary_results[group1] = group1_name, [avg1]
            group_summary_results[group2] = group2_name, [avg2]

        ### Get the largest possible comparison fold and f-test (one-way ANOVA) p-value across all groups
        grouped_exp_data = []
        avg_exp_data = []
        for group in grouped_ordered_array_list:
            data_list = grouped_ordered_array_list[group]
            grouped_exp_data.append(data_list)
            try:
                avg = statistics.avg(data_list)
                avg_exp_data.append(avg)
            except Exception:
                print row_id, group, data_list
                kill
        try:
            avg_exp_data.sort()
            max_fold = avg_exp_data[-1] - avg_exp_data[0]
        except Exception:
            max_fold = 'NA'
        try:
            ftestp = statistics.OneWayANOVA(grouped_exp_data)
        except Exception:
            ftestp = 1
        gs = statistics.GroupStats(max_fold, 0, ftestp)
        summary_filtering_stats[row_id] = gs

        stat_result_list = []
        for entry in stat_results:
            data_tuple = entry, stat_results[entry]
            stat_result_list.append(data_tuple)
        stat_result_list.sort()

        grouped_ordered_array_list2 = []
        for group in grouped_ordered_array_list:
            data_tuple = group, grouped_ordered_array_list[group]
            grouped_ordered_array_list2.append(data_tuple)
        grouped_ordered_array_list2.sort()  #now the list is sorted by group number

        ###for each rowid, add in the reordered data, and new statistics for each group and for each comparison
        for entry in grouped_ordered_array_list2:
            group_number = entry[0]
            original_data_values = entry[1]
            if include_raw_data == 'yes':  ###optionally exclude the raw values
                for value in original_data_values:
                    if array_type == 'RNASeq':
                        if norm == 'RPKM': adj = 0
                        else: adj = 1
                        if calculateAsNonLog == False:
                            value = math.pow(2, value) - adj
                    try:
                        expbuilder_value_db[row_id].append(value)
                    except KeyError:
                        expbuilder_value_db[row_id] = [value]
            if group_number in group_summary_results:
                group_summary_data = group_summary_results[group_number][1]  #the group name is listed as the first entry
                for value in group_summary_data:
                    try:
                        expbuilder_value_db[row_id].append(value)
                    except KeyError:
                        expbuilder_value_db[row_id] = [value]
            for info in stat_result_list:
                if info[0][0] == group_number:  #comp,(groups_name,[avg1,log_fold,fold,ttest])
                    comp = info[0]
                    gs = info[1][1]
                    expbuilder_value_db[row_id].append(gs.LogFold())
                    expbuilder_value_db[row_id].append(gs.Fold())
                    expbuilder_value_db[row_id].append(gs.Pval())
                    ### Create a placeholder and store the position of the adjusted p-value to be calculated
                    expbuilder_value_db[row_id].append('')
                    gs.SetAdjPIndex(len(expbuilder_value_db[row_id]) - 1)
                    gs.SetPvalIndex(len(expbuilder_value_db[row_id]) - 2)
                    pval_summary_db[(row_id, comp)] = gs

    ###do the same for the headers, but at the dataset level (redundant processes)
    array_fold_headers = []
    data_headers3 = []
    try:
        for group in data_headers2:
            data_tuple = group, data_headers2[group]  #e.g. 1, ['X030910_25_hl.CEL', 'X030910_29R_hl.CEL', 'X030910_45_hl.CEL']
            data_headers3.append(data_tuple)
        data_headers3.sort()
    except UnboundLocalError:
        print data_headers, '\n', array_order, '\n', comp_group_list, '\n'
        kill_program

    for entry in data_headers3:
        x = 0  #index into group_summary_result_names below
        y = 0  #unused loop counter
        group_number = entry[0]
        original_data_values = entry[1]
        if include_raw_data == 'yes':  ###optionally exclude the raw values
            for value in original_data_values:
                array_fold_headers.append(value)
        if group_number in group_summary_results:
            group_name = group_summary_results[group_number][0]
            group_summary_data = group_summary_results[group_number][1]
            for value in group_summary_data:
                combined_name = group_summary_result_names[x] + group_name  #group_summary_result_names = ['avg-']
                array_fold_headers.append(combined_name)
                x += 1  #increment the loop index

        for info in stat_result_list:
            if info[0][0] == group_number:  #comp,(groups_name,[avg1,log_fold,fold,ttest],group2_name)
                groups_name = info[1][0]
                only_add_these = stat_result_names[1:]
                for value in only_add_these:
                    new_name = value + groups_name
                    array_fold_headers.append(new_name)

    ###For the raw_data only export we need the headers for the different groups (data_headers2) and group names (group_name_db)
    raw_data_comp_headers = {}
    for comp in comp_group_list:
        temp_raw = []
        group1 = int(comp[0])
        group2 = int(comp[1])
        comp = str(comp[0]), str(comp[1])
        g1_headers = data_headers2[group1]
        g2_headers = data_headers2[group2]
        g1_name = group_name_db[group1]
        g2_name = group_name_db[group2]
        for header in g2_headers:
            temp_raw.append(g2_name + ':' + header)
        for header in g1_headers:
            temp_raw.append(g1_name + ':' + header)
        raw_data_comp_headers[comp] = temp_raw

    ###Calculate adjusted ftest p-values using BH95 sorted method
    statistics.adjustPermuteStats(summary_filtering_stats)
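    ### ('BH95' refers to the Benjamini-Hochberg (1995) step-up FDR procedure; see the sketch after this function)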

    ### Calculate adjusted p-values for all p-values using BH95 sorted method
    adjustment_round = 0  ### counts passes through the comparison loop below
    for info in comp_group_list:
        compid = int(info[0]), int(info[1])
        pval_db = {}
        for (rowid, comp) in pval_summary_db:
            if comp == compid:
                gs = pval_summary_db[(rowid, comp)]
                pval_db[rowid] = gs
        if 'moderated' in probability_statistic and replicates == 'yes':
            ### Moderates the original reported test p-value prior to adjusting
            try:
                statistics.moderateTestStats(pval_db, probability_statistic)
            except Exception:
                if adjustment_round == 0:
                    print 'Moderated test failed due to an issue with mpmath or out-of-range values\n   ... using the unmoderated unpaired test instead!'
                null = []  ### Occurs when not enough replicates
            adjustment_round += 1
        statistics.adjustPermuteStats(pval_db)
        for rowid in pval_db:
            gs = pval_db[rowid]
            expbuilder_value_db[rowid][gs.AdjIndex()] = gs.AdjP()  ### set the placeholder to the calculated value
            if 'moderated' in probability_statistic:
                expbuilder_value_db[rowid][gs.RawIndex()] = gs.Pval()  ### replace the non-moderated p-value with the moderated one

    pval_summary_db = []
    ###Finished re-ordering lists and adding statistics to expbuilder_value_db
    return expbuilder_value_db, array_fold_headers, summary_filtering_stats, raw_data_comp_headers
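
# statistics.adjustPermuteStats is not shown in this example. Assuming it
# implements the BH95 (Benjamini-Hochberg, 1995) step-up adjustment named in
# the comments above, a minimal standalone sketch over a plain list of
# p-values (hypothetical helper, not AltAnalyze's API) would be:
def benjamini_hochberg(pvalues):
    n = len(pvalues)
    order = sorted(range(n), key=lambda i: pvalues[i])  # indices sorted by ascending p-value
    adjusted = [0.0] * n
    running_min = 1.0
    for rank in range(n, 0, -1):  # walk from the largest rank down
        i = order[rank - 1]
        running_min = min(running_min, pvalues[i] * n / float(rank))
        adjusted[i] = running_min  # keeps adjusted p-values monotone
    return adjusted
# e.g. benjamini_hochberg([0.01, 0.04, 0.03]) -> [0.03, 0.04, 0.04]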
Example #15
def nonLogAvg(data_list):
    return statistics.avg(map(lambda x: math.pow(2, x) - 1, data_list))
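
# Quick sanity check (assuming 'statistics' here is AltAnalyze's own module,
# whose avg() is an arithmetic mean; the standard-library statistics module
# has no avg function): math.pow(2, 1) - 1 = 1.0 and math.pow(2, 2) - 1 = 3.0,
# so nonLogAvg([1.0, 2.0]) returns 2.0.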