Example #1
def replacePearsonPvalueWithZscore():
    all_sample_data={}
    for tissue in tissue_comparison_scores:
        for (r,p,sample) in tissue_comparison_scores[tissue]:
            all_sample_data[sample] = [] ### initialize an empty rho list for each sample
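        ### assumption: every tissue lists the same samples, so one tissue is enough to enumerate them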
        break

    for tissue in tissue_comparison_scores:
        for (r,p,sample) in tissue_comparison_scores[tissue]:
            all_sample_data[sample].append(r)

    sample_stats={}
    all_dataset_rho_values=[]
    ### Get the average and standard deviation of each sample's rho values
    for sample in all_sample_data:
        all_dataset_rho_values+=all_sample_data[sample]
        avg=statistics.avg(all_sample_data[sample])
        stdev=statistics.stdev(all_sample_data[sample])
        sample_stats[sample]=avg,stdev
    
    global_rho_avg = statistics.avg(all_dataset_rho_values)
    global_rho_stdev = statistics.stdev(all_dataset_rho_values)
    
    ### Replace the p-value for each rho
    for tissue in tissue_comparison_scores:
        scores = []
        for (r,p,sample) in tissue_comparison_scores[tissue]:
            #u,s=sample_stats[sample]
            #z = (r-u)/s
            z = (r-global_rho_avg)/global_rho_stdev ### Instead of doing this for the sample background, do it relative to all analyzed samples
            scores.append([r,z,sample])
        tissue_comparison_scores[tissue] = scores
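
A minimal usage sketch for the function above, under stated assumptions: `statistics` here is a hypothetical stand-in for AltAnalyze's own stats module (`avg` an arithmetic mean, `stdev` a sample standard deviation), and `tissue_comparison_scores` is assumed to be a module-level dict of `tissue -> [(rho, p-value, sample), ...]`, as the loops imply. All sample values are illustrative.

import math

class statistics:  ### hypothetical stand-in for AltAnalyze's own statistics module
    @staticmethod
    def avg(values):
        values = list(values)  ### accept lists or iterators
        return sum(values) / float(len(values))
    @staticmethod
    def stdev(values):
        mean = sum(values) / float(len(values))
        return math.sqrt(sum((v - mean) ** 2 for v in values) / float(len(values) - 1))

### Illustrative input: tissue -> [(Pearson rho, p-value, sample name), ...]
tissue_comparison_scores = {
    'Liver': [(0.91, 0.001, 'S1'), (0.40, 0.300, 'S2')],
    'Heart': [(0.85, 0.004, 'S1'), (0.55, 0.100, 'S2')],
}

replacePearsonPvalueWithZscore()
print(tissue_comparison_scores['Liver'])  ### each entry is now [rho, z-score, sample]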
Example #2
def combine_profiles(profile_list):
    profile_group_sizes = {}
    for db in profile_list:
        for key in db:
            profile_group_sizes[key] = len(db[key])
        break

    new_profile_db = {}
    for key in profile_group_sizes:
        x = profile_group_sizes[key]  ###number of elements in list for key
        new_val_list = []
        i = 0
        while i < x:
            temp_val_list = []
            for db in profile_list:
                if key in db:
                    val = db[key][i]
                    temp_val_list.append(val)
            i += 1
            val_avg = statistics.avg(temp_val_list)
            new_val_list.append(val_avg)
        new_profile_db[key] = new_val_list
    return new_profile_db
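
Given the same arithmetic-mean `statistics.avg` stand-in from the sketch above, `combine_profiles` element-wise averages the value lists of several dictionaries. Note that it sizes each list from the first dictionary only and silently skips keys absent from later ones, so the inputs are expected to share keys and list lengths. A hypothetical call:

db1 = {'gene1': [1.0, 2.0], 'gene2': [4.0, 8.0]}
db2 = {'gene1': [3.0, 4.0], 'gene2': [6.0, 2.0]}

combined = combine_profiles([db1, db2])
print(combined)  ### {'gene1': [2.0, 3.0], 'gene2': [5.0, 5.0]}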
Example #3
def statisticallyFilterFile(input_file, output_file, threshold):
    if 'exp.' in input_file:
        counts_file = string.replace(input_file, 'exp.', 'geneCount.')
    else:
        counts_file = input_file[:-4] + '-geneCount.txt'
    sample_expressed_genes = {}
    header = True
    junction_max = []
    count_sum_array = []
    for line in open(input_file, 'rU').xreadlines():
        data = cleanUpLine(line)
        if '.csv' in input_file:
            t = string.split(data, ',')
        else:
            t = string.split(data, '\t')
        if header:
            header_len = len(t)
            full_header = t
            samples = t[1:]
            header = False
            count_sum_array = [0] * len(samples)
        else:
            if len(t) == (header_len + 1):
                ### Correct header with a missing UID column
                samples = full_header
                count_sum_array = [0] * len(samples)
                print 'fixing bad header'
            try:
                values = map(float, t[1:])
            except Exception:
                if 'NA' in t[1:]:
                    tn = [0 if x == 'NA' else x
                          for x in t[1:]]  ### Replace NAs
                    values = map(float, tn)
                else:
                    tn = [0 if x == '' else x for x in t[1:]]  ### Replace NAs
                    values = map(float, tn)

            binarized_values = []
            for v in values:
                if v > threshold: binarized_values.append(1)
                else: binarized_values.append(0)
            count_sum_array = [
                sum(value)
                for value in zip(*[count_sum_array, binarized_values])
            ]

    index = 0
    distribution = []
    count_sum_array_db = {}
    samples_to_retain = []
    samples_to_exclude = []
    for sample in samples:
        count_sum_array_db[sample] = count_sum_array[index]
        distribution.append(count_sum_array[index])
        index += 1
    from stats_scripts import statistics
    distribution.sort()
    avg = int(statistics.avg(distribution))
    stdev = int(statistics.stdev(distribution))
    min_exp = int(min(distribution))
    cutoff = avg - (stdev * 2)
    dev = 2
    print 'The average number of genes expressed above %s is %s (SD is %s, min is %s)' % (
        threshold, avg, stdev, min_exp)
    if cutoff < 0:
        if (stdev - avg) > 0:
            cutoff = avg - (stdev / 2)
            dev = 0.5
            print cutoff, 'genes expressed selected as a default cutoff to include cells (0.5-stdev away)'
        else:
            cutoff = avg - stdev
            dev = 1
            print cutoff, 'genes expressed selected as a default cutoff to include cells (1-stdev away)'
    if min_exp > cutoff:
        cutoff = avg - stdev
        dev = 1

    print 'Using a default cutoff of >=500 genes expressed per cell'
    cutoff = 499
    import export
    eo = export.ExportFile(counts_file)
    eo.write('Sample\tGenes Expressed(threshold:' + str(threshold) + ')\n')
    for sample in samples:  ### keep the original order
        if count_sum_array_db[sample] > cutoff:
            samples_to_retain.append(sample)
        else:
            samples_to_exclude.append(sample)
        eo.write(sample + '\t' + str(count_sum_array_db[sample]) + '\n')

    if len(samples_to_retain) < 4:  ### Don't remove any if too few samples
        samples_to_retain += samples_to_exclude
    else:
        print len(
            samples_to_exclude
        ), 'samples removed (< 500 genes expressed)'  # (%s)' % (dev,string.join(samples_to_exclude,', '))
    eo.close()
    print 'Exporting the filtered expression file to:'
    print output_file
    filterFile(input_file, output_file, samples_to_retain)
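
The cutoff selection above, isolated with made-up numbers so the fallback chain is visible: 2 SD below the mean, then 0.5 SD or 1 SD when that goes negative, and finally the hard-coded >=500 override that currently supersedes everything.

avg, stdev, min_exp = 1200, 800, 150        ### illustrative genes-per-cell stats

cutoff = avg - (stdev * 2)                  ### first choice: 2 SD below the mean
if cutoff < 0:
    if (stdev - avg) > 0:
        cutoff = avg - (stdev / 2)          ### highly dispersed data: 0.5 SD
    else:
        cutoff = avg - stdev                ### otherwise: 1 SD
if min_exp > cutoff:
    cutoff = avg - stdev                    ### every cell already passes: tighten to 1 SD
cutoff = 499                                ### the hard-coded override in the function above
print(cutoff)                               ### 499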
Example #4
def filterFile(input_file,
               output_file,
               filter_names,
               force=False,
               calculateCentroids=False,
               comparisons=[],
               log2=False,
               convertPSIUID=False,
               partialMatch=False):
    if calculateCentroids:
        filter_names, group_index_db = filter_names

    export_object = open(output_file, 'w')
    firstLine = True
    row_count = 0
    for line in open(input_file, 'rU').xreadlines():
        data = cleanUpLine(line)
        if '.csv' in input_file:
            values = string.split(data, ',')
        else:
            values = string.split(data, '\t')
        row_count += 1
        if firstLine:
            uid_index = 0
            if data[0] != '#':
                if force == True:
                    values2 = []
                    for x in values:
                        if ':' in x:
                            x = string.split(x, ':')[1]
                            values2.append(x)
                        else:
                            values2.append(x)
                    filter_names2 = []
                    for f in filter_names:
                        if f in values: filter_names2.append(f)
                    if len(filter_names2) < 2:
                        filter_names2 = []
                        for f in filter_names:
                            if f in values2: filter_names2.append(f)
                        filter_names = filter_names2
                    else:
                        filter_names = filter_names2
                if force == 'include':
                    values = ['UID'] + filter_names
                    pass
                try:
                    sample_index_list = map(lambda x: values.index(x),
                                            filter_names)
                except:
                    ### If ":" in header name
                    if ':' in line:
                        values2 = []
                        for x in values:
                            if ':' in x:
                                x = string.split(x, ':')[1]
                            values2.append(x)
                        values = values2
                        sample_index_list = map(lambda x: values.index(x),
                                                filter_names)
                    elif '.' in line:
                        values2 = []
                        for x in values:
                            if '.' in x:
                                x = string.split(x, '.')[0]
                            values2.append(x)
                        values = values2
                        sample_index_list = map(lambda x: values.index(x),
                                                filter_names)
                    elif '.$' in line:
                        filter_names2 = []
                        for f in filter_names:  ### if the name in the filter is a string within the input data-file
                            for f1 in values:
                                if f in f1:
                                    filter_names2.append(
                                        f1)  ### change to the reference name
                                    break
                        print len(filter_names2), len(values), len(
                            filter_names)
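                        ### 'kill' (like 'forceExit', 'forceNoExonExpError', and 'kill_program' in later examples) is deliberately undefined: referencing it raises a NameError that halts execution here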
                        kill
                        filter_names = filter_names2
                        #filter_names = map(lambda x: string.split(x,'.')[0], filter_names)
                        #values = map(lambda x: string.split(x,'.')[0], values)
                        sample_index_list = map(lambda x: values.index(x),
                                                filter_names)
                    elif partialMatch:
                        filter_names_updated = []
                        for x in filter_names:
                            if x not in values:
                                for y in values:
                                    if x in y:
                                        filter_names_updated.append(y)
                        filter_names = filter_names_updated
                        sample_index_list = map(lambda x: values.index(x),
                                                filter_names)
                    else:
                        temp_count = 1
                        for x in filter_names:
                            if x not in values:
                                temp_count += 1
                                if temp_count == 500: print 'too many to print'
                                elif temp_count > 500:
                                    pass
                                else:
                                    print x,
                        print temp_count, 'are missing'
                        kill

                firstLine = False
                header = values
            if 'PSI_EventAnnotation' in input_file:
                uid_index = values.index('UID')
            if log2:
                try:
                    values = map(lambda x: math.log(x + 1, 2), values)
                except:
                    pass
            if calculateCentroids:
                if len(comparisons) > 0:
                    export_object.write(
                        string.join(['UID'] + map(lambda x: x[0] + '-fold',
                                                  comparisons), '\t') +
                        '\n')  ### Use the numerator group name
                else:
                    clusters = map(str, group_index_db)
                    export_object.write(
                        string.join([values[uid_index]] + clusters, '\t') +
                        '\n')
                continue  ### skip the below code

        if force == 'include':
            if row_count > 1:
                values += ['0']

        try:
            filtered_values = map(
                lambda x: values[x], sample_index_list
            )  ### simple and fast way to reorganize the samples
        except Exception:
            """
            print traceback.format_exc()
            print len(values), len(sample_index_list)
            print input_file, len(filter_names)
            for i in filter_names:
                if i not in header:
                    print i, 'not found'
            sys.exit()
            """
            ### For PSI files with missing values at the end of each line, often
            if len(header) != len(values):
                diff = len(header) - len(values)
                values += diff * ['']
            filtered_values = map(
                lambda x: values[x], sample_index_list
            )  ### simple and fast way to reorganize the samples
            #print values[0]; print sample_index_list; print values; print len(values); print len(prior_values);kill
        prior_values = values
        ######################## Begin Centroid Calculation ########################
        if calculateCentroids:
            mean_matrix = []
            means = {}
            for cluster in group_index_db:
                #### group_index_db[cluster] is all of the indices for samples in a noted group; cluster is the actual cluster name (not number)
                raw_values = map(lambda x: filtered_values[x],
                                 group_index_db[cluster])
                raw_values2 = []
                for vx in raw_values:
                    if vx != '' and vx != 'NA':
                        raw_values2.append(float(vx))

                if len(raw_values2) > 2:
                    mean = statistics.avg(raw_values2)
                else:
                    mean = ""
                #mean = map(lambda x: filtered_values[uid][x], group_index_db[cluster]) ### Only one value
                means[cluster] = mean
                mean_matrix.append(str(mean))
            filtered_values = mean_matrix
            if len(comparisons) > 0:
                fold_matrix = []
                for (group2, group1) in comparisons:
                    try:
                        fold = means[group2] - means[group1]
                    except:
                        ### Indicates a missing value - exclude
                        fold = 0
                    fold_matrix.append(str(fold))
                filtered_values = fold_matrix
        ########################  End Centroid Calculation  ########################
        new_uid = values[uid_index]
        if convertPSIUID:
            new_uid = string.replace(new_uid, ':', '__')
            if '|' in new_uid:
                new_uid = string.split(new_uid, '|')[0]
            new_uids = string.split(new_uid, '__')
            if len(new_uids) > 2:
                if 'ENS' in new_uids[1]:
                    new_uid = string.join([new_uids[0]] + new_uids[2:], ' ')
        export_object.write(
            string.join([new_uid] + filtered_values, '\t') + '\n')
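
The central device in `filterFile` is index-based column selection: resolve each retained sample name to its column index once, then pull exactly those indexes from every data row. A stripped-down sketch with hypothetical sample names:

header = ['UID', 'S3', 'S1', 'S2']
filter_names = ['S1', 'S2']

sample_index_list = [header.index(x) for x in filter_names]     ### [2, 3]

row = ['gene1', '7.1', '5.0', '6.2']
filtered_values = [row[i] for i in sample_index_list]           ### reorder/select columns
print([row[0]] + filtered_values)                               ### ['gene1', '5.0', '6.2']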
Example #5
def plotFeatureBoxPlots(qc_db, dataset_name, feature_type):
    pylab.figure()
    pylab.xlabel('Biological Sample Names')
    pylab.ylabel('Read Counts - Log2')
    pylab.title('Expression BoxPlots for %ss - %s' %
                (feature_type, dataset_name))
    #pylab.subplots_adjust(left=0.085, right=0.95, top=0.2, bottom=0.35)
    pylab.subplots_adjust(left=0.075, right=0.95, top=0.9, bottom=0.35)

    #axes = getAxes(scores) ### adds buffer space to the end of each axis and creates room for a legend
    #pylab.axis(axes)

    boxplots = []
    samples = []

    sample_sorted_list = []

    for sample_name in qc_db:
        try:
            qc = qc_db[sample_name][feature_type]
        except Exception:
            print 'No junction data found for at least one sample:', sample_name
            forceExit
        sample_sorted_list.append(
            [statistics.avg(qc),
             statistics.stdev(qc), sample_name])
    sample_sorted_list.sort()
    sample_sorted_list.reverse()

    filename = 'QC-%s-BoxPlot-%s.pdf' % (dataset_name, feature_type)
    export_obj = export.ExportFile(root_dir + filename[:-4] + '.txt')
    export_obj.write('SampleID\tAverage Expression\n')

    firstEntry = True
    for (mean, stdev, sample_name) in sample_sorted_list:
        ls = []
        x_ls = []
        y_ls = []
        qc = qc_db[sample_name][feature_type]
        boxplots.append(qc)
        samples.append(sample_name)
        export_obj.write(sample_name + '\t' + str(mean) + '\n')
        if firstEntry:
            threshold = mean - 2 * stdev
            firstEntry = False
        else:
            if mean < threshold:
                print sample_name, 'expression is considered very low (2 standard deviations away from the max).'
    pylab.boxplot(boxplots,
                  notch=0,
                  whis=1.5,
                  positions=None,
                  widths=None,
                  patch_artist=False)
    #pylab.boxplot(boxplots, notch=0, sym='+', vert=1, whis=1.5, positions=None, widths=None, patch_artist=False)
    xtickNames = pylab.setp(pylab.gca(), xticklabels=samples)
    pylab.setp(xtickNames, rotation=90, fontsize=10)
    export_obj.close()

    #print 'Exporting:',filename
    pylab.savefig(root_dir + filename)
    filename = filename[:-3] + 'png'
    pylab.savefig(root_dir + filename)  #,dpi=200
    graphic_link.append(
        ['QC - BoxPlot-' + feature_type + ' Expression', root_dir + filename])
    try:
        import gc
        pylab.gcf().clf()  ### clear the current figure
        pylab.close()
        gc.collect()
    except Exception:
        pass
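
The low-expression flag in the loop above keys off the highest-expressing sample: the threshold is set 2 SD below the first (largest-mean) entry of the sorted list, and every later sample is compared against it. In isolation, with illustrative numbers:

### (mean, stdev, sample) tuples already sorted from highest to lowest mean
sample_sorted_list = [(9.0, 0.5, 'S1'), (8.5, 0.4, 'S2'), (6.0, 0.3, 'S3')]

threshold = None
for (mean, stdev, sample_name) in sample_sorted_list:
    if threshold is None:
        threshold = mean - 2 * stdev        ### 2 SD below the top sample's mean
    elif mean < threshold:
        print(sample_name + ' expression is considered very low')   ### flags S3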
Example #6
def importTableEntries(filename,
                       filter_db,
                       ensembl_exon_db,
                       gene_db,
                       root_dir,
                       transpose,
                       display,
                       showIntrons,
                       analysisType='plot'):
    import collections
    average_samples = True
    if showIntrons == 'yes': include_introns = True
    else: include_introns = False
    uid_db = {}  ### probeset or AltAnalyze RNA-Seq ID keyed
    uid_list = {}  ### ordered from first to last exon region
    uid_gene_db = {}  ### Lets us look at multiple genes
    try:
        import UI
        biotypes = UI.getBiotypes(filename)
    except Exception:
        biotypes = {}
    for gene in ensembl_exon_db:
        uid_list[gene] = []
        for (index, ed, id) in ensembl_exon_db[gene]:
            proceed = False
            if 'exp.' in filename:
                if include_introns:
                    proceed = True
                elif 'E' in ed.ExonID():
                    proceed = True
            else:  ### Include introns for splicing index view
                if include_introns == True: proceed = True
                elif 'E' in ed.ExonID(): proceed = True
            if proceed:
                uid_db[id] = ed
                uid_list[gene].append(id)
            uid_gene_db[id] = gene

    if '_vs_' in filename:  ### If only two groups, this is what will be output to the RawSplice folder - need to have this alternate way of getting the expression file location
        rootdir = string.split(filename, 'AltResults')[0]
        exp_dir = getValidExpFile(rootdir + 'ExpressionInput')
        alt_groups_dir = string.split(
            exp_dir, 'ExpressionInput'
        )[0] + 'ExpressionInput/groups.' + findFilename(exp_dir)
        alt_groups_dir = string.replace(alt_groups_dir, 'exp.', '')

    start_time = time.time()
    fn = filepath(filename)
    matrix_gene_db = {}
    stdev_gene_matrix_db = {}
    row_header_gene = {}
    ids = {}
    x = 0

    if 'heatmap' in analysisType:
        average_samples = False

    if '/' in filename:
        dataset_name = string.split(filename, '/')[-1][:-4]
    else:
        dataset_name = string.split(filename, '\\')[-1][:-4]
    for line in open(fn, 'rU').xreadlines():
        data = line.strip()
        t = string.split(data, '\t')
        if data[0] == '#': x = 0
        elif x == 0:
            if platform == 'RNASeq':
                removeExtension = True
            else:
                removeExtension = False
            group_db, column_header, sample_name_db = assignGroupColors(
                t[1:], '', removeExtension=removeExtension)
            x = 1
            altresults = False
            if average_samples:
                if 'AltResults' in filename:
                    altresults = True
                    groups_dir = string.split(
                        filename, 'AltResults'
                    )[0] + 'ExpressionInput/groups.' + findFilename(filename)
                    if verifyFile(groups_dir) == False:
                        groups_dir = alt_groups_dir
                    new_column_header = reformatAltHeaders(t[3:])
                    start = 3
                else:
                    if 'exp.' in filename:
                        groups_dir = string.replace(filename, 'exp.',
                                                    'groups.')
                    else:
                        groups_dir = string.replace(filename, 'counts.',
                                                    'groups.')
                    new_column_header = column_header
                    start = 1  ### starting index with numeric values
                groups_dir = string.replace(groups_dir, 'stats.', 'groups.')
                groups_dir = string.replace(
                    groups_dir, '-steady-state.txt',
                    '.txt')  ### groups is for the non-steady-state file

                try:
                    group_index_db = collections.OrderedDict()
                except Exception:
                    import ordereddict
                    group_index_db = ordereddict.OrderedDict()
                ### use comps in the future to visualize group comparison changes
                sample_list, group_sample_db, group_db, group_name_sample_db, comp_groups, comps_name_db = ExpressionBuilder.simpleGroupImport(
                    groups_dir)
                for item in sample_list:
                    group_name = group_db[item]
                    proceed = False
                    try:
                        sample_index = new_column_header.index(item)
                        proceed = True
                    except Exception:
                        try:
                            item = string.replace(item, '.bed', '')
                            item = string.replace(
                                item, '.CEL',
                                '')  ### Probe-level analyses as RNA-Seq
                            item = string.replace(item, '.cel', '')
                            item = string.replace(item, '.txt', '')
                            item = string.replace(item, '.TXT', '')
                            item = string.replace(item, '.TAB', '')
                            item = string.replace(item, '.tab', '')
                            sample_index = new_column_header.index(item)
                            proceed = True
                        except Exception:
                            pass
                            #print [item]
                            #print column_header
                            #print Error
                    if proceed:
                        try:
                            group_index_db[group_name].append(sample_index)
                        except Exception:
                            try:
                                group_index_db[group_name] = [
                                    sample_index
                                ]  ### dictionary of group to input file sample indexes
                            except Exception:
                                pass  ### Occurs when analyzing splicing-index for two groups when more than two groups exist (error from 5 lines up)
                groups = map(str, group_index_db)  ### store group names
                new_sample_list = map(
                    lambda item: group_db[item], sample_list
                )  ### lookup index of each sample in the ordered group sample list
                column_header = groups
            else:
                if 'AltResults' in filename: start = 3
                else: start = 1  ### starting index with numeric values
                column_header = t[start - 1:]
            row_number = 1
        else:
            if ' ' not in t and '' not in t:  ### skip rows with missing data
                uid = t[start - 1]
                if ';' in uid:
                    uid = string.split(uid, ';')[0]
                ids[uid] = None
                ens_geneID = string.split(uid, ':')[0]
                #if ens_geneID in gene_db: print uid
                if uid in filter_db or ('heatmap' in analysisType
                                        and ens_geneID in gene_db):
                    try:
                        if len(biotypes) == 1 and 'junction' in biotypes:
                            gene = ens_geneID
                        else:
                            gene = uid_gene_db[uid]
                        try:
                            row_header_gene[gene].append(uid)
                        except Exception:
                            row_header_gene[gene] = [uid]
                        if average_samples == False:
                            values = map(float, t[start:])
                            try:
                                matrix_gene_db[gene].append(values)
                            except Exception:
                                matrix_gene_db[gene] = [values]
                        else:
                            if platform == 'RNASeq' and altresults == False:
                                ### Convert to log2 RPKM values - or counts
                                values = map(lambda x: math.log(float(x), 2),
                                             t[start:])
                            else:
                                values = map(float, t[start:])

                            if 'AltResults' in filename:  ### If splicing scores, normalize these to the mean values
                                mean = statistics.avg(values)
                                values = map(lambda x: x - mean, values)
                            avg_ls = []
                            std_ls = []
                            for group_name in group_index_db:
                                group_values = map(
                                    lambda x: values[x],
                                    group_index_db[group_name]
                                )  ### simple and fast way to reorganize the samples
                                avg = statistics.avg(group_values)
                                try:
                                    st_err = statistics.stdev(
                                        group_values) / math.sqrt(
                                            len(group_values))
                                except Exception:
                                    ### Occurs if no replicates in the dataset
                                    st_err = 0
                                avg_ls.append(avg)
                                std_ls.append(st_err)
                            try:
                                matrix_gene_db[gene].append(avg_ls)
                            except Exception:
                                matrix_gene_db[gene] = [avg_ls]
                            try:
                                stdev_gene_matrix_db[gene].append(std_ls)
                            except Exception:
                                stdev_gene_matrix_db[gene] = [std_ls]
                    except Exception:
                        #print traceback.format_exc()
                        pass
            x += 1

    global colors
    original_column_header = list(column_header)
    if len(uid_list) == 0:
        print 'No genes found in the exon expression database'
        forceNoExonExpError
    successfully_output_genes = 0
    display_count = 0  ### Only display a certain number of genes

    for last_gene in uid_list:
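        ### iterate to the end so last_gene holds the final key; only that gene's plot is displayed interactively (see below)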
        pass
    for gene in uid_list:
        fig = pylab.figure(
        )  ### Create this here - resulting in a single figure for memory purposes
        new_header = []
        new_matrix = []
        new_stdev = []
        annotation_list = []
        gene_symbol = gene_db[gene]
        try:
            matrix = matrix_gene_db[gene]
        except Exception:
            #print gene_symbol, 'not in alternative expression database'
            continue  ### go the next gene - no alt.expression for this gene
        row_header = row_header_gene[gene]

        try:
            stdev_matrix = stdev_gene_matrix_db[gene]
        except Exception:
            pass
        for uid in uid_list[gene]:
            #print row_header;sys.exit()
            try:
                i = row_header.index(
                    uid
                )  ### If the ID is in the filtered annotated exon list (not just core)
                new_header.append(uid)
                try:
                    new_matrix.append(matrix[i])
                except Exception:
                    print uid, i, len(matrix)
                    sys.exit()
                ed = uid_db[uid]
                annotation_list.append(ed)
                try:
                    new_stdev.append(stdev_matrix[i])
                except Exception:
                    pass
            except Exception:
                pass

        if len(new_matrix) > 0:
            matrix = new_matrix
        if len(new_header) > 0:
            row_header = new_header
        if 'heatmap' in analysisType:
            export_dir = root_dir + gene_symbol + '-heatmap.txt'
            export_obj = export.ExportFile(export_dir)
            export_obj.write(string.join(column_header, '\t') + '\n')
            ki = 0
            if len(annotation_list) > 0:
                for ed in annotation_list:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x, 2), matrix[ki])
                    else:
                        values = matrix[ki]
                    export_obj.write(
                        string.join([ed.ExonID()] + map(str, values), '\t') +
                        '\n')
                    ki += 1
                row_metric = 'euclidean'
                row_method = None
            else:
                ### Just junctions analyzed here... no sorted junctions yet
                ki = 0
                for uid in row_header_gene[gene]:
                    if 'AltResults' not in filename and platform == 'RNASeq':
                        values = map(lambda x: math.log(x, 2), matrix[ki])
                    else:
                        values = matrix[ki]
                    export_obj.write(
                        string.join([uid] + map(str, values), '\t') + '\n')
                    ki += 1
                row_metric = 'euclidean'
                row_method = 'average'
            export_obj.close()
            from visualization_scripts import clustering

            column_metric = 'euclidean'
            column_method = 'hopach'
            color_gradient = 'red_black_sky'
            transpose = False
            graphic_links = []
            if ki > 100: transpose = True
            if gene == last_gene: display = True
            else: display = False
            graphic_links = clustering.runHCexplicit(export_dir,
                                                     graphic_links,
                                                     row_method,
                                                     row_metric,
                                                     column_method,
                                                     column_metric,
                                                     color_gradient,
                                                     transpose,
                                                     display=display,
                                                     Normalize=True,
                                                     compressAxis=False,
                                                     contrast=2.5)
            successfully_output_genes += 1
        else:
            stdev_matrix = new_stdev
            time_diff = str(round(time.time() - start_time, 1))
            #print '%d rows and %d columns imported for %s in %s seconds...' % (len(matrix),len(column_header),dataset_name,time_diff)
            if transpose == True:
                matrix = map(numpy.array,
                             zip(*matrix))  ### converts the transposed rows to arrays
                column_header, row_header = row_header, original_column_header
                stdev_matrix = map(numpy.array, zip(*stdev_matrix))
            matrix = numpy.array(matrix)

            stdev_matrix = numpy.array(stdev_matrix)
            try:
                if len(uid_list) > 10:
                    #if display_count==5: display=False
                    display = False
                if display_count == 0:
                    ### store a consistent color palete to use
                    colors = []
                    """
                    k=0
                    while k < len(row_header):
                        colors.append(tuple(rand(3)))
                        k+=1"""
                    #http://stackoverflow.com/questions/3016283/create-a-color-generator-from-given-colormap-in-matplotlib
                    cm = pylab.cm.get_cmap('gist_rainbow')  #gist_ncar
                    for i in range(len(row_header)):
                        colors.append(cm(1. * i / len(row_header))
                                      )  # color will now be an RGBA tuple

                plotExonExpression(fig,
                                   matrix,
                                   stdev_matrix,
                                   row_header,
                                   column_header,
                                   dataset_name,
                                   annotation_list,
                                   gene_symbol,
                                   root_dir,
                                   display=display)
                successfully_output_genes += 1
                display_count += 1
            except Exception:
                print traceback.format_exc()
                print gene_symbol, 'failed'
                sys.exit()
        try:
            pylab.close()
        except Exception:
            pass
        if successfully_output_genes > 0:
            #try: print 'Gene graphs exported to ExonPlots...'
            #except Exception: pass
            pass
        else:
            print '\nWARNING!!!! No genes with associated alternative exon evidence found\n'
            forceNoExonExpError
        try:
            import gc
            fig.clf()
            pylab.close()
            gc.collect()
        except Exception:
            pass
Example #7
def parse_input_data(filename, data_type):
    fn = filepath(filename)
    first_line = 1
    array_group_name_db = {}
    z = 0
    array_group_db = {}
    output_file = []
    #print "Reading",filename
    secondary_data_type = export.getParentDir(
        filename)  ### e.g., expression or counts

    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        probeset = t[0]
        z += 1
        if first_line == 1:
            first_line = 0  #clear the flag so subsequent lines are treated as data rows
            ###Below occurs if the data is raw as opposed to precomputed
            if data_type == 'export':
                if array_type == 'exon':
                    folder = 'ExonArray' + '/' + species + '/'
                elif array_type == 'gene':
                    folder = 'GeneArray' + '/' + species + '/'
                elif array_type == 'junction':
                    folder = 'JunctionArray' + '/' + species + '/'
                elif array_type == 'RNASeq':
                    folder = 'RNASeq' + '/' + species + '/'
                else:
                    folder = array_type + '/'
                parent_path = root_dir + 'AltExpression/' + folder
                if array_type == 'RNASeq':
                    output_file = altanalzye_input[0:-4] + '.ExpCutoff-' + str(
                        original_exp_threshold) + '_' + filter_method + '.txt'
                else:
                    output_file = altanalzye_input[0:-4] + '.p' + str(
                        int(100 * p)) + '_' + filter_method + '.txt'
                output_file_dir = parent_path + output_file
                print "...Exporting", output_file_dir
                export_data = export.createExportFile(
                    output_file_dir, root_dir + 'AltExpression/' + folder)
                fn = filepath(output_file_dir)
                export_data = open(fn, 'w')
                export_data.write(line)
            if ':' in t[1]:
                array_group_list = []
                x = 0  ###gives us an original index value for each entry in the group
                for entry in t[1:]:
                    array_group, array_name = string.split(entry, ':')
                    try:
                        array_group_db[array_group].append(x)
                        array_group_name_db[array_group].append(array_name)
                    except KeyError:
                        array_group_db[array_group] = [x]
                        array_group_name_db[array_group] = [array_name]
                        ### below only occurs with a new group addition
                        array_group_list.append(
                            array_group
                        )  #use this to generate comparisons in the below linked function
                    x += 1
            #print '##### array_group_list',array_group_list
        elif len(probeset) > 0 and data_type != 'export':
            ###Use the index values from above to assign each expression value to a new database
            temp_group_array = {}
            array_index_list = []  ###Use this list for permutation analysis
            for group in array_group_db:
                #array_index_list.append(array_group_db[group])
                group_values = []
                for array_index in array_group_db[group]:
                    try:
                        exp_val = float(t[array_index + 1])
                    except IndexError:
                        print t, z, '\n', array_index, '\n', group, probeset
                        kill
                    group_values.append(exp_val)
                avg_stat = statistics.avg(group_values)

                if data_type == 'expression':
                    ###If non-log array data
                    if exp_data_format == 'non-log':
                        ### This works better for RNASeq as opposed to log transforming and then filtering which is more stringent and different than the filtering in ExonArray().
                        if array_type == 'RNASeq':
                            if normalization_method == 'RPKM' and secondary_data_type == 'expression':
                                if ':I' in probeset:
                                    k = 1  ### Don't require an RPKM threshold for intron IDs (these will likely never meet this unless small or fully retained and highly expressed)
                                elif ':' not in probeset:
                                    if avg_stat >= gene_rpkm_threshold: k = 1
                                    else: k = 0
                                elif avg_stat >= exon_rpkm_threshold: k = 1
                                elif '-' in probeset:
                                    k = 1  ### Don't consider RPKM for junctions, just counts
                                else:
                                    k = 0
                                #if 'ENSMUSG00000045991:E2.2' in probeset: print [probeset, normalization_method, secondary_data_type, gene_rpkm_threshold, avg_stat, k]
                            else:  ### Otherwise, we are looking at count data
                                if '-' in probeset:  ### junction meeting minimum read-count number
                                    if avg_stat >= junction_exp_threshold:
                                        k = 1  ### junction_exp_threshold is the same as nonlog_exp_threshold
                                    else:
                                        k = 0
                                elif ':' not in probeset:
                                    if avg_stat >= gene_exp_threshold: k = 1
                                    else: k = 0
                                else:  ### exon or intron meeting minimum read-count number
                                    if avg_stat >= exon_exp_threshold: k = 1
                                    else: k = 0
                                #if 'ENSMUSG00000045991:E2.2' in probeset: print [probeset, normalization_method, secondary_data_type, exon_exp_threshold, junction_exp_threshold, avg_stat, k]
                        else:
                            if avg_stat >= nonlog_exp_threshold: k = 1
                            else: k = 0
                    elif avg_stat >= log_expression_threshold: k = 1
                    else: k = 0
                    if normalization_method == 'RPKM' and secondary_data_type == 'expression':  ### Treat as dabp p-value
                        try:
                            pvalue_status_db[probeset].append(k)
                        except KeyError:
                            pvalue_status_db[probeset] = [k]
                    else:
                        try:
                            expression_status_db[probeset].append(k)
                        except KeyError:
                            expression_status_db[probeset] = [k]
                    #if probeset == '3209315': print [group],k,len(group_values),array_group_list
                if data_type == 'p-value':
                    if avg_stat <= p: k = 1
                    else: k = 0
                    #if 'G7216513_a_at' in probeset: print k, avg_stat
                    try:
                        pvalue_status_db[probeset].append(k)
                    except KeyError:
                        pvalue_status_db[probeset] = [k]
        elif data_type == 'export':
            if exp_data_format == 'non-log':
                ### This code was added in version 1.16 in conjunction with a switch from logstatus to
                ### non-log in AltAnalyze to prevent "Process AltAnalyze Filtered" associated errors
                exp_values = t[1:]
                exp_values_log2 = []
                for exp_val in exp_values:
                    exp_values_log2.append(str(
                        math.log(float(exp_val),
                                 2)))  ### exp_val+=1 was removed in 2.0.5
                line = string.join([probeset] + exp_values_log2, '\t') + '\n'
            try:
                null = export_db[probeset]
                export_data.write(line)
            except KeyError:
                null = [
                ]  ### occurs if not a probeset to include in the filtered results export file
    if data_type == 'export': export_data.close()
    return output_file
Example #8
def performGroupNormalization(filename, export_dir, platform):
    expressionDataFormat, increment, convertNonLogToLog = ExpressionBuilder.checkExpressionFileFormat(
        filename)
    groups_dir = string.replace(export_dir, 'exp.', 'batch.')
    fn = unique.filepath(filename)
    row_number = 0
    exp_db = {}
    relative_headers_exported = False
    group_db = importGroups(groups_dir)
    export_data = export.ExportFile(export_dir)
    for line in open(fn, 'rU').xreadlines():
        data = ExpressionBuilder.cleanUpLine(line)
        t = string.split(data, '\t')
        if data[0] == '#' and row_number == 0: row_number = 0
        elif row_number == 0:
            sample_list = t[1:]
            new_sample_list = []
            for group in group_db:
                group_samples = group_db[group]
                try:
                    sample_index_list = map(lambda x: sample_list.index(x),
                                            group_samples)
                    group_db[group] = sample_index_list
                    new_sample_list += group_samples
                except Exception:
                    missing = []
                    for x in sample_list:
                        if x not in t[1:]: missing.append(x)
                    print 'missing:', missing
                    print t
                    print sample_list
                    print filename, groups_dir
                    print 'Unknown Error!!! Skipping cluster input file build (check column and row formats for conflicts)'
                    forceExit
            title = string.join(
                [t[0]] + new_sample_list, '\t'
            ) + '\n'  ### output the new sample order (group file order)
            export_data.write(title)
            row_number = 1
        else:
            gene = t[0]
            if expressionDataFormat == 'non-log' and (convertNonLogToLog
                                                      or platform == 'RNASeq'):
                ### Convert to log2 RPKM values - or counts

                try:
                    all_values = map(
                        lambda x: math.log(float(x) + increment, 2), t[1:])
                except Exception:
                    all_values = ExpressionBuilder.logTransformWithNAs(
                        t[1:], increment)
            else:
                try:
                    all_values = map(float, t[1:])
                except Exception:
                    all_values = ExpressionBuilder.logTransformWithNAs(
                        t[1:], increment)
            row_number += 1  ### Keep track of the first gene as to write out column headers for the relative outputs
            gene_log_folds = []

            for group in group_db:
                sample_index_list = group_db[group]
                ### Calculate log-fold values relative to the mean of all sample expression values
                try:
                    values = map(
                        lambda x: all_values[x], sample_index_list
                    )  ### simple and fast way to reorganize the samples
                except Exception:
                    print len(values), sample_index_list
                    kill
                try:
                    avg = statistics.avg(values)
                except Exception:
                    values2 = []
                    for v in values:
                        try:
                            values2.append(float(v))
                        except Exception:
                            pass
                    values = values2
                    try:
                        avg = statistics.avg(values)
                    except Exception:
                        if len(values) > 0: avg = values[0]
                        else: avg = 0
                try:
                    log_folds = map(lambda x: (x - avg), values)
                except Exception:
                    log_folds = []
                    for x in values:
                        try:
                            log_folds.append(x - avg)
                        except Exception:
                            log_folds.append('')
                gene_log_folds += log_folds
            gene_log_folds = map(lambda x: str(x), gene_log_folds)
            export_data.write(
                string.join([gene] + gene_log_folds, '\t') + '\n')
    export_data.close()
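
Per gene, the normalization above reduces to centering each group's log2 values on that group's mean. A self-contained sketch of that arithmetic, assuming clean numeric input with no NAs and illustrative group indexes:

group_db = {'batch1': [0, 1], 'batch2': [2, 3]}     ### group -> column indexes (illustrative)
all_values = [5.0, 7.0, 10.0, 12.0]                 ### one gene's log2 values

gene_log_folds = []
for group in group_db:
    values = [all_values[i] for i in group_db[group]]
    avg = sum(values) / float(len(values))
    gene_log_folds += [v - avg for v in values]     ### log-fold relative to the group mean
print(gene_log_folds)                               ### [-1.0, 1.0, -1.0, 1.0]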
Example #9
def nonLogAvg(data_list):
    return statistics.avg(map(lambda x: math.pow(2, x) - 1, data_list))
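
`nonLogAvg` maps log2-space values back to linear scale (undoing a log2(x+1)-style transform) before averaging. With the arithmetic-mean `statistics.avg` stand-in assumed earlier, a quick check:

print(nonLogAvg([1.0, 3.0]))   ### avg of 2**1 - 1 = 1 and 2**3 - 1 = 7 -> 4.0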
Example #10
def reorder(data,
            data_headers,
            array_order,
            comp_group_list,
            probeset_db,
            include_raw_data,
            array_type,
            norm,
            fl,
            logvalues=True,
            blanksPresent=False):
    ###array_order gives the final level order sorted, followed by the original index order as a tuple
    expbuilder_value_db = {}
    group_name_db = {}
    summary_filtering_stats = {}
    pval_summary_db = {}
    replicates = 'yes'

    stat_result_names = ['avg-', 'log_fold-', 'fold-', 'rawp-', 'adjp-']
    group_summary_result_names = ['avg-']

    ### Define expression variables
    try:
        probability_statistic = fl.ProbabilityStatistic()
    except Exception:
        probability_statistic = 'unpaired t-test'
    try:
        gene_exp_threshold = math.log(fl.GeneExpThreshold(), 2)
    except Exception:
        gene_exp_threshold = 0
    try:
        gene_rpkm_threshold = float(fl.RPKMThreshold())
    except Exception:
        gene_rpkm_threshold = 0
    try:
        FDR_statistic = fl.FDRStatistic()
    except Exception:
        FDR_statistic = 'Benjamini-Hochberg'
    calculateAsNonLog = True
    if blanksPresent:
        calculateAsNonLog = False

    ### Begin processing sample expression values according to the organized groups
    for row_id in data:
        try:
            gene = probeset_db[row_id][0]
        except TypeError:
            gene = ''  #not needed if not altsplice data
        data_headers2 = {}  #reset each time
        grouped_ordered_array_list = {}
        for x in array_order:
            y = x[1]  #this is the new first index
            group = x[2]
            group_name = x[3]
            group_name_db[group] = group_name
            #for example y = 5, therefore the data[row_id][5] entry is now the first
            try:
                try:
                    new_item = data[row_id][y]
                except IndexError:
                    print row_id, data[row_id], len(
                        data[row_id]), y, len(array_order), array_order
                    kill
                if logvalues == False and calculateAsNonLog and array_type == 'RNASeq':
                    new_item = math.pow(2, new_item)
            except TypeError:
                new_item = ''  #this is for a spacer added in the above function
            try:
                grouped_ordered_array_list[group].append(new_item)
            except KeyError:
                grouped_ordered_array_list[group] = [new_item]
            try:
                data_headers2[group].append(data_headers[y])
            except KeyError:
                data_headers2[group] = [data_headers[y]]
        #perform statistics on each group comparison - comp_group_list: [(1,2),(3,4)]
        stat_results = {}
        group_summary_results = {}
        for comp in comp_group_list:
            group1 = int(comp[0])
            group2 = int(comp[1])
            group1_name = group_name_db[group1]
            group2_name = group_name_db[group2]
            groups_name = group1_name + "_vs_" + group2_name
            data_list1 = grouped_ordered_array_list[group1]
            data_list2 = grouped_ordered_array_list[
                group2]  #baseline expression
            if blanksPresent:  ### Allows for empty cells
                data_list1 = filterBlanks(data_list1)
                data_list2 = filterBlanks(data_list2)
            try:
                avg1 = statistics.avg(data_list1)
            except Exception:
                avg1 = ''
            try:
                avg2 = statistics.avg(data_list2)
            except Exception:
                avg2 = ''
            try:
                if (logvalues == False
                        and array_type != 'RNASeq') or (logvalues == False
                                                        and calculateAsNonLog):
                    fold = avg1 / avg2
                    log_fold = math.log(fold, 2)
                    if fold < 1: fold = -1.0 / fold
                else:
                    log_fold = avg1 - avg2
                    fold = statistics.log_fold_conversion(log_fold)
            except Exception:
                log_fold = ''
                fold = ''
            try:
                #t,df,tails = statistics.ttest(data_list1,data_list2,2,3) #unpaired student ttest, calls p_value function
                #t = abs(t); df = round(df); p = str(statistics.t_probability(t,df))
                p = statistics.runComparisonStatistic(data_list1, data_list2,
                                                      probability_statistic)
            except Exception:
                p = 1
                sg = 1
                N1 = 0
                N2 = 0
            comp = group1, group2
            if array_type == 'RNASeq':  ### Also non-log but treated differently
                if 'RPKM' == norm: adj = 0
                else: adj = 1
                if calculateAsNonLog == False:
                    try:
                        avg1 = math.pow(2, avg1) - adj
                        avg2 = math.pow(2, avg2) - adj
                    except Exception:
                        avg1 = ''
                        avg2 = ''
                if 'RPKM' == norm:
                    if avg1 < gene_rpkm_threshold and avg2 < gene_rpkm_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
                else:
                    if avg1 < gene_exp_threshold and avg2 < gene_exp_threshold:
                        log_fold = 'Insufficient Expression'
                        fold = 'Insufficient Expression'
                    #if row_id=='ENSG00000085514':
                    #if fold=='Insufficient Expression':
                    #print [norm, avg1, avg2, fold, comp, gene_exp_threshold, gene_rpkm_threshold, row_id]
                    #5.96999111075 7.72930768675 Insufficient Expression (3, 1) 1.0 ENSG00000085514
            if gene_rpkm_threshold != 0 and calculateAsNonLog:  ### Any other data
                a1 = nonLogAvg(data_list1)
                a2 = nonLogAvg(data_list2)
                #print [a1,a2,gene_rpkm_threshold]
                if a1 < gene_rpkm_threshold and a2 < gene_rpkm_threshold:
                    log_fold = 'Insufficient Expression'
                    fold = 'Insufficient Expression'
                #print log_fold;kill
            try:
                gs = statistics.GroupStats(log_fold, fold, p)
                stat_results[comp] = groups_name, gs, group2_name
                if probability_statistic == 'moderated t-test':
                    gs.setAdditionalStats(
                        data_list1, data_list2)  ### Assuming equal variance
                if probability_statistic == 'moderated Welch-test':
                    gs.setAdditionalWelchStats(
                        data_list1, data_list2)  ### Assuming unequal variance
            except Exception:
                null = []
                replicates = 'no'  ### Occurs when not enough replicates
                #print comp, len(stat_results); kill_program
            group_summary_results[group1] = group1_name, [avg1]
            group_summary_results[group2] = group2_name, [avg2]

        ### Replaces the below method to get the largest possible comparison fold and ftest p-value
        grouped_exp_data = []
        avg_exp_data = []
        for group in grouped_ordered_array_list:
            data_list = grouped_ordered_array_list[group]
            if blanksPresent:  ### Allows for empty cells
                data_list = filterBlanks(data_list)
            if len(data_list) > 0: grouped_exp_data.append(data_list)
            try:
                avg = statistics.avg(data_list)
                avg_exp_data.append(avg)
            except Exception:
                avg = ''
                #print row_id, group, data_list;kill
        try:
            avg_exp_data.sort()
            max_fold = avg_exp_data[-1] - avg_exp_data[0]
        except Exception:
            max_fold = 'NA'
        try:
            ftestp = statistics.OneWayANOVA(grouped_exp_data)
        except Exception:
            ftestp = 1
        gs = statistics.GroupStats(max_fold, 0, ftestp)
        summary_filtering_stats[row_id] = gs

        stat_result_list = []
        for entry in stat_results:
            data_tuple = entry, stat_results[entry]
            stat_result_list.append(data_tuple)
        stat_result_list.sort()

        grouped_ordered_array_list2 = []
        for group in grouped_ordered_array_list:
            data_tuple = group, grouped_ordered_array_list[group]
            grouped_ordered_array_list2.append(data_tuple)
        grouped_ordered_array_list2.sort(
        )  #now the list is sorted by group number

        ###for each rowid, add in the reordered data, and new statistics for each group and for each comparison
        for entry in grouped_ordered_array_list2:
            group_number = entry[0]
            original_data_values = entry[1]
            if include_raw_data == 'yes':  ###optionally exclude the raw values
                for value in original_data_values:
                    if array_type == 'RNASeq':
                        if norm == 'RPKM': adj = 0
                        else: adj = 1
                        if calculateAsNonLog == False:
                            value = math.pow(2, value) - adj
                    try:
                        expbuilder_value_db[row_id].append(value)
                    except KeyError:
                        expbuilder_value_db[row_id] = [value]
            if group_number in group_summary_results:
                group_summary_data = group_summary_results[group_number][1]  # the group name is listed as the first entry
                for value in group_summary_data:
                    try:
                        expbuilder_value_db[row_id].append(value)
                    except KeyError:
                        expbuilder_value_db[row_id] = [value]
            for info in stat_result_list:
                if info[0][0] == group_number:  # comp,(groups_name,[avg1,log_fold,fold,ttest])
                    comp = info[0]
                    gs = info[1][1]
                    expbuilder_value_db[row_id].append(gs.LogFold())
                    expbuilder_value_db[row_id].append(gs.Fold())
                    expbuilder_value_db[row_id].append(gs.Pval())
                    ### Create a placeholder and store the position of the adjusted p-value to be calculated
                    expbuilder_value_db[row_id].append('')
                    gs.SetAdjPIndex(len(expbuilder_value_db[row_id]) - 1)
                    gs.SetPvalIndex(len(expbuilder_value_db[row_id]) - 2)
                    pval_summary_db[(row_id, comp)] = gs
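                    ### The positions stored via SetAdjPIndex/SetPvalIndex are
                    ### read back below through gs.AdjIndex() and gs.RawIndex()
                    ### once the adjusted p-values have been computed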

    ###do the same for the headers, but at the dataset level (redundant processes)
    array_fold_headers = []
    data_headers3 = []
    try:
        for group in data_headers2:
            data_tuple = group, data_headers2[group]  # e.g. 1, ['X030910_25_hl.CEL', 'X030910_29R_hl.CEL', 'X030910_45_hl.CEL']
            data_headers3.append(data_tuple)
        data_headers3.sort()
    except UnboundLocalError:
        print data_headers, '\n', array_order, '\n', comp_group_list, '\n'
        kill_program  ### undefined name - intentionally raises an error to halt after the diagnostic print

    for entry in data_headers3:
        x = 0  #indicates the times through a loop
        y = 0  #indicates the times through a loop
        group_number = entry[0]
        original_data_values = entry[1]
        if include_raw_data == 'yes':  ###optionally exclude the raw values
            for value in original_data_values:
                array_fold_headers.append(value)
        if group_number in group_summary_results:
            group_name = group_summary_results[group_number][0]
            group_summary_data = group_summary_results[group_number][1]
            for value in group_summary_data:
                combined_name = group_summary_result_names[x] + group_name  # group_summary_result_names = ['avg-']
                array_fold_headers.append(combined_name)
                x += 1  #increment the loop index

        for info in stat_result_list:
            if info[0][0] == group_number:  # comp,(groups_name,[avg1,log_fold,fold,ttest],group2_name)
                groups_name = info[1][0]
                only_add_these = stat_result_names[1:]
                for value in only_add_these:
                    new_name = value + groups_name
                    array_fold_headers.append(new_name)

    ###For the raw_data only export we need the headers for the different groups (data_headers2) and group names (group_name_db)
    raw_data_comp_headers = {}
    for comp in comp_group_list:
        temp_raw = []
        group1 = int(comp[0])
        group2 = int(comp[1])
        comp = str(comp[0]), str(comp[1])
        g1_headers = data_headers2[group1]
        g2_headers = data_headers2[group2]
        g1_name = group_name_db[group1]
        g2_name = group_name_db[group2]
        for header in g2_headers:
            temp_raw.append(g2_name + ':' + header)
        for header in g1_headers:
            temp_raw.append(g1_name + ':' + header)
        raw_data_comp_headers[comp] = temp_raw

    ###Calculate adjusted ftest p-values using BH95 sorted method
    statistics.adjustPermuteStats(summary_filtering_stats)

    ### Calculate adjusted p-values for all p-values using BH95 sorted method
    round = 0
    for info in comp_group_list:
        compid = int(info[0]), int(info[1])
        pval_db = {}
        for (rowid, comp) in pval_summary_db:
            if comp == compid:
                gs = pval_summary_db[(rowid, comp)]
                pval_db[rowid] = gs

        if 'moderated' in probability_statistic and replicates == 'yes':
            ### Moderates the original reported test p-value prior to adjusting
            try:
                statistics.moderateTestStats(pval_db, probability_statistic)
            except Exception:
                if round == 0:
                    if replicates == 'yes':
                        print 'Moderated test failed due to issue with mpmath or out-of-range values\n   ... using unmoderated unpaired test instead!'
                null = []  ### Occurs when not enough replicates
            round += 1

        if FDR_statistic == 'Benjamini-Hochberg':
            statistics.adjustPermuteStats(pval_db)
        else:
            ### Calculate a qvalue (https://github.com/nfusi/qvalue)
            import numpy
            from stats_scripts import qvalue
            pvals = []
            keys = []
            for key in pval_db:
                pvals.append(pval_db[key].Pval())
                keys.append(key)
            pvals = numpy.array(pvals)
            pvals = qvalue.estimate(pvals)
            for i in range(len(pvals)):
                pval_db[keys[i]].SetAdjP(pvals[i])
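            ### Note: qvalue.estimate is assumed to return one q-value per input
            ### p-value in the same order, keeping keys[i] aligned with pvals[i]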

        for rowid in pval_db:
            gs = pval_db[rowid]
            expbuilder_value_db[rowid][gs.AdjIndex()] = gs.AdjP()  ### set the placeholder to the calculated value
            if 'moderated' in probability_statistic:
                expbuilder_value_db[rowid][gs.RawIndex()] = gs.Pval()  ### Replace the non-moderated with a moderated p-value

    pval_summary_db = []
    ###Finished re-ordering lists and adding statistics to expbuilder_value_db
    return expbuilder_value_db, array_fold_headers, summary_filtering_stats, raw_data_comp_headers
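
The adjusted p-values above come from statistics.adjustPermuteStats, whose source is not shown in these examples. Below is a minimal standalone sketch of the Benjamini-Hochberg (BH95) step-up procedure it is assumed to implement; the helper name benjamini_hochberg is hypothetical, not part of the module:

def benjamini_hochberg(pvals):
    ### Return BH-adjusted p-values in the original input order (a sketch,
    ### assuming the standard BH95 step-up rule: adj_p = p * n / rank,
    ### made monotone by taking a running minimum from the largest p down)
    n = len(pvals)
    order = sorted(range(n), key=lambda i: pvals[i])  ### indices sorted by p-value
    adjusted = [0.0]*n
    running_min = 1.0
    for rank_from_end in range(n-1, -1, -1):  ### walk from the largest p-value down
        i = order[rank_from_end]
        running_min = min(running_min, pvals[i]*n/(rank_from_end+1.0))
        adjusted[i] = running_min
    return adjusted

#print benjamini_hochberg([0.01, 0.04, 0.03, 0.20])  ### -> [0.04, 0.0533..., 0.0533..., 0.20]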
Example #11
0
def filterFile(input_file,output_file,filter_names,force=False,calculateCentroids=False,comparisons=[]):
    if calculateCentroids:
        filter_names,group_index_db=filter_names
        
    export_object = open(output_file,'w')
    firstLine = True
    for line in open(input_file,'rU').xreadlines():
        data = cleanUpLine(line)
        if '.csv' in input_file:
            values = string.split(data,',')
        else:
            values = string.split(data,'\t')
        if firstLine:
            uid_index = 0
            if data[0]!='#':
                if force:
                    filter_names2=[]
                    for f in filter_names:
                        if f in values: filter_names2.append(f)
                    filter_names = filter_names2
                try:
                    sample_index_list = map(lambda x: values.index(x), filter_names)
                except:
                    ### If ":" in header name
                    if ':' in line:
                        values2=[]
                        for x in values:
                            if ':' in x:
                                x=string.split(x,':')[1]
                            values2.append(x)
                        values = values2
                        sample_index_list = map(lambda x: values.index(x), filter_names)
                    elif '.$' in line:
                        filter_names2=[]
                        for f in filter_names: ### if the name in the filter is a string within the input data-file
                            for f1 in values:
                                if f in f1:
                                    filter_names2.append(f1) ### change to the reference name
                                    break
                        #print len(filter_names2), len(values), len(filter_names);kill  ### leftover debug halt, disabled so the renamed filters below are used
                        filter_names = filter_names2
                        #filter_names = map(lambda x: string.split(x,'.')[0], filter_names)
                        #values = map(lambda x: string.split(x,'.')[0], values)
                        sample_index_list = map(lambda x: values.index(x), filter_names)              
                    else:
                        temp_count=1
                        for x in filter_names:
                            if x not in values:
                                temp_count+=1
                                if temp_count>500: print 'too many to print';kill
                                print x,
                        print 'are missing';kill
                        
                firstLine = False
                header = values
            if 'PSI_EventAnnotation' in input_file:
                uid_index = values.index('UID')
            if calculateCentroids:
                if len(comparisons)>0:
                    export_object.write(string.join(['UID']+map(lambda x: x[0]+'-fold',comparisons),'\t')+'\n') ### Use the numerator group name                  
                else:
                    clusters = map(str,group_index_db)
                    export_object.write(string.join([values[uid_index]]+clusters,'\t')+'\n')
                continue ### skip the below code
        try: filtered_values = map(lambda x: values[x], sample_index_list) ### simple and fast way to reorganize the samples
        except Exception:
            ### For PSI files with missing values at the end of each line, pad the row and retry
            if len(header) != len(values):
                diff = len(header)-len(values)
                values+=diff*['']
            try:
                filtered_values = map(lambda x: values[x], sample_index_list)
            except Exception:
                print traceback.format_exc()
                print len(values), len(sample_index_list)
                print input_file, len(filter_names)
                for i in filter_names:
                    if i not in header:
                        print i, 'not found'
                sys.exit()
            #print values[0]; print sample_index_list; print values; print len(values); print len(prior_values);kill
        prior_values=values
        ######################## Begin Centroid Calculation ########################
        if calculateCentroids:
            mean_matrix=[]
            means={}
            for cluster in group_index_db:
                #### group_index_db[cluster] is all of the indeces for samples in a noted group, cluster is the actual cluster name (not number)
                try: mean=statistics.avg(map(lambda x: float(filtered_values[x]), group_index_db[cluster]))
                except:
                    continue
                #mean = map(lambda x: filtered_values[uid][x], group_index_db[cluster]) ### Only one value
                means[cluster]=mean
                mean_matrix.append(str(mean))
            filtered_values = mean_matrix
            if len(comparisons)>0:
                fold_matrix=[]
                for (group2, group1) in comparisons:
                    fold = means[group2]-means[group1]
                    fold_matrix.append(str(fold))
                filtered_values = fold_matrix
        ########################  End Centroid Calculation  ######################## 
        export_object.write(string.join([values[uid_index]]+filtered_values,'\t')+'\n')
    export_object.close()
    print 'Filtered columns printed to:',output_file
    return output_file
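
A hypothetical invocation of filterFile with centroid calculation; the file paths, sample names, and group assignments below are illustrative assumptions, not values taken from the examples above:

### Sample names as they appear in the input file header
samples = ['S1', 'S2', 'S3', 'S4']
### Indices refer to positions within the filtered sample order above
group_index_db = {'clusterA': [0, 1], 'clusterB': [2, 3]}
### Each comparison is (numerator, denominator); exports a 'clusterB-fold' column
comparisons = [('clusterB', 'clusterA')]
filterFile('exp.dataset.txt', 'exp.dataset-centroids.txt',
           (samples, group_index_db), calculateCentroids=True,
           comparisons=comparisons)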