Exemplo n.º 1
0
    print ('Correlation with {column_name}'.format(column_name = column_name))
    print (corr_matrix[column_name].sort_values(ascending=False))

def save_fig(fig_id, tight_layout=True):
    '''
    Save the current `plt` figure as a PNG file under ./images.

    The file name is `fig_id`, an underscore and the current `time.time()`
    value, followed by ".png". The ./images directory is created first if it
    does not exist yet, because the save would otherwise fail on a fresh
    checkout.

    fig_id       -- base name of the figure (without extension)
    tight_layout -- when true, call `plt.tight_layout()` before saving
    '''
    images_dir = os.path.join('.', "images")
    # Make sure the target directory exists before writing into it.
    if not os.path.isdir(images_dir):
        os.makedirs(images_dir)

    fig_id += '_' + str(time.time())
    path = os.path.join(images_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

if __name__ == '__main__':
    # Load the housing table and print a quick overview of it.
    housing = load_data(HOUSING_PATH, 'housing.csv')
    for overview in (housing.head, housing.describe):
        print(overview())

    # Scatter plot of longitude against latitude; 'population' and
    # 'median_house_value' are forwarded to the plotting helper as well.
    scatter_plot_by_column(
        housing, 'longitude', 'latitude', 'population', 'median_house_value'
    )

    # Scatter matrix restricted to a handful of attributes.
    attributes = [
        "median_house_value",
        "median_income",
        "total_rooms",
        "housing_median_age",
    ]
    scatter_matrix_for_attributes(housing, attributes)

    # Example of feature mapping: derived ratio columns that may correlate
    # better with the target value than the raw counts.
    # Each entry is (new column, numerator column, denominator column).
    derived_ratios = (
        ("rooms_per_household", "total_rooms", "households"),
        ("bedrooms_per_room", "total_bedrooms", "total_rooms"),
        ("population_per_household", "population", "households"),
    )
    for new_column, numerator, denominator in derived_ratios:
        housing[new_column] = housing[numerator] / housing[denominator]

    # Show how every column (including the new ones) correlates with the target.
    show_correlation_with_column(housing, 'median_house_value')
Exemplo n.º 2
0
def unsupervised(Arguments):
    '''
    Pairwise MOCA calculations that are executed if the Phenotype argument is False (the default). Similar to
    so-called 'supervised' pairwise mode, except that no performance metrics are calculated (sens, spec, PPV, NPV, etc.).
    In unsupervised mode, you can compare all inter-datatype pairs for two datatypes, or all intra-datatype pairs for
    a single datatype.

    For every candidate pair a contingency table is built and tested with `fisher`; pairs whose adjusted
    value (from `p_adjust`) is not below `Arguments.FDR` are discarded. The surviving results are pickled to
    the "MOCA.results" path under either a default name or `Arguments.Filename`.

    Reads from Arguments: Data, NA, CorrectionMethod, FDR, Filename, FeatureMin (and passes the whole object
    to `load_data` and `make_report`).

    Returns None. Exits the interpreter if more than two datatypes are supplied.
    '''

    if len(Arguments.Data) > 2:
        # Parenthesised single-argument prints behave the same as a Python 2
        # statement and as a Python 3 function call.
        print("Unsupervised pairwise calculations can consider no more that two datatypes at a time.")
        print("If you provide only one datatype, all intra-datatype pairs will be considered. If you")
        print("provide two datatypes, all inter-datatype comparisons will be made. Please change the")
        print("'Data = ' field. Exiting...")
        # Same effect as the site-provided exit(), without depending on the site module.
        raise SystemExit

    Data = load_data(Arguments)

    # Flatten the per-datatype lists; Variates[i] is used as the variate of Features[i].
    # NOTE(review): this assumes both .values() calls iterate the datatypes in the same order - confirm.
    Features = list(chain(*Data.Transformed.Features.values()))
    Variates = list(chain(*Data.Transformed.Variates.values()))
    if len(Arguments.Data) == 1:
        Features1 = Features
        Features2 = Features
    elif len(Arguments.Data) == 2:
        Features1 = Data.Transformed.Features[Arguments.Data[0]]
        Features2 = Data.Transformed.Features[Arguments.Data[1]]

    # Position of the first occurrence of each feature (what list.index would return),
    # built once instead of scanning the list for every pair.
    FeatureIndex = {}
    for Index, Feature in enumerate(Features):
        FeatureIndex.setdefault(Feature, Index)

    PValues = {}
    Interactions = {}
    SampleCounts = {}
    CaseCounts = {} #just the positive class here
    Performances = {}
    EffectSizes  = {}
    # Features already used as the first member of a pair; skipping them as the second
    # member avoids self-pairs and mirrored duplicates.
    Tested = set()
    for Feature1 in Features1:
        Tested.add(Feature1)
        Variate1 = Variates[FeatureIndex[Feature1]]
        for Feature2 in Features2:
            if Feature2 in Tested:
                continue
            Pair = (Feature1, Feature2)
            a,b,c,d = contingency_table(Variate1, Variates[FeatureIndex[Feature2]], NA=Arguments.NA)
            PValue = fisher(a,b,c,d)
            PValues[Pair] = PValue.two_tail
            Interactions[Pair] = interaction(PValue)
            SampleCounts[Pair] = a + b + c + d
            CaseCounts[Pair] = a + c
            #A placeholder solely to make pairwise post-processing generalizable
            Performances[Pair] = "NA"
            EffectSizes[Pair] = "NA"

    # FDRs is looked up by the raw p-value of each pair.
    FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
    # Iterate over a snapshot because failing pairs are removed from PValues inside the loop.
    for Pair, PValue in list(PValues.items()):
        if FDRs[PValue] < Arguments.FDR:
            continue
        for Table in (PValues, Interactions, SampleCounts, CaseCounts, Performances, EffectSizes):
            Table.pop(Pair, None)

    Results = {}
    Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments)
    Results["PValues"] = PValues
    Results["Interactions"] = Interactions
    Results["FDRs"] = FDRs
    Results["SampleCounts"] = SampleCounts
    Results["CaseCounts"] = CaseCounts
    Results["Performances"] = Performances
    Results["EffectSizes"] = EffectSizes

    if Arguments.Filename.lower() == "default":
        Pickle = "_".join(["Pairwise", "_".join(sorted(Arguments.Data)), str(Arguments.FeatureMin),
                           Arguments.CorrectionMethod])
    else:
        Pickle = Arguments.Filename

    # Use a context manager so the output file is flushed and closed even if dump raises.
    with open(get_path("MOCA.results") + "/" + Pickle, "wb") as PickleFile:
        cPickle.dump(Results, PickleFile, -1)

    return
Exemplo n.º 3
0
def pairwise_continuous(Arguments):
    '''
    Pairwise calculations on the untransformed features: for every candidate pair, compute
    `correlation_pvalue` and `correlation` of the two variates.

    Candidate pairs:
      * if `Arguments.Phenotype` is set, features for which `Arguments.Phenotype in Feature` holds are
        paired with all features for which it does not;
      * otherwise all intra-datatype pairs (one datatype) or all inter-datatype pairs (two datatypes).

    Pairs whose adjusted value (from `p_adjust`) is not below `Arguments.FDR` are discarded. If any pair
    survives, the results are pickled to the "MOCA.results" path under either a default name or
    `Arguments.Filename`; if none survives, nothing is written.

    Reads from Arguments: Data, Phenotype, CorrectionMethod, FDR, Filename (and passes the whole object
    to `load_data` and `make_report`).

    Returns None. Exits the interpreter if more than two datatypes are supplied.
    '''

    if len(Arguments.Data) > 2:
        # Parenthesised single-argument prints behave the same as a Python 2
        # statement and as a Python 3 function call.
        print("Unsupervised pairwise calculations can consider no more that two datatypes at a time.")
        print("If you provide only one datatype, all intra-datatype pairs will be considered. If you")
        print("provide two datatypes, all inter-datatype comparisons will be made. Please change the")
        print("'Data = ' field. Exiting...")
        # Same effect as the site-provided exit(), without depending on the site module.
        raise SystemExit

    Data = load_data(Arguments)

    # Flatten the per-datatype lists; Variates[i] is used as the variate of Features[i].
    # NOTE(review): this assumes both .values() calls iterate the datatypes in the same order - confirm.
    Features = list(chain(*Data.Features.values()))
    Variates = list(chain(*Data.Variates.values()))

    if Arguments.Phenotype:
        Features1 = [Feature for Feature in Features if Arguments.Phenotype in Feature]
        Features2 = [Feature for Feature in Features if Arguments.Phenotype not in Feature]
    elif len(Arguments.Data) == 1:
        Features1 = Features
        Features2 = Features
    elif len(Arguments.Data) == 2:
        Features1 = Data.Features[Arguments.Data[0]]
        Features2 = Data.Features[Arguments.Data[1]]

    # Position of the first occurrence of each feature (what list.index would return),
    # built once instead of scanning the list four times per pair.
    FeatureIndex = {}
    for Index, Feature in enumerate(Features):
        FeatureIndex.setdefault(Feature, Index)

    PValues = {}
    Correlations = {}
    # Features already used as the first member of a pair; skipping them as the second
    # member avoids self-pairs and mirrored duplicates.
    Tested = set()
    for Feature1 in Features1:
        Tested.add(Feature1)
        Variate1 = Variates[FeatureIndex[Feature1]]
        for Feature2 in Features2:
            if Feature2 in Tested:
                continue
            Pair = (Feature1, Feature2)
            Variate2 = Variates[FeatureIndex[Feature2]]
            PValues[Pair] = correlation_pvalue(Variate1, Variate2)
            Correlations[Pair] = correlation(Variate1, Variate2)

    # FDRs is looked up by the raw p-value of each pair.
    FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
    # Iterate over a snapshot because failing pairs are removed from PValues inside the loop.
    for Pair, PValue in list(PValues.items()):
        if FDRs[PValue] < Arguments.FDR:
            continue
        PValues.pop(Pair, None)
        Correlations.pop(Pair, None)

    # Only write a results file when at least one pair passed the threshold.
    if PValues:
        Results = {}
        Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments, Supervised=Arguments.Phenotype)
        Results["PValues"] = PValues
        Results["Correlations"] = Correlations
        Results["FDRs"] = FDRs

        if Arguments.Filename.lower() == "default":
            Pickle = "_".join(["_".join(sorted(Arguments.Data)), Arguments.CorrectionMethod])
        else:
            Pickle = Arguments.Filename

        # Use a context manager so the output file is flushed and closed even if dump raises.
        with open(get_path("MOCA.results") + "/" + Pickle, "wb") as PickleFile:
            cPickle.dump(Results, PickleFile, -1)

    return
Exemplo n.º 4
0
def unsupervised(Arguments):
    """
    Pairwise MOCA calculations that are executed if the Phenotype argument is False (the default). Similar to
    so-called 'supervised' pairwise mode, except that no performance metrics are calculated (sens, spec, PPV, NPV, etc.).
    In unsupervised mode, you can compare all inter-datatype pairs for two datatypes, or all intra-datatype pairs for
    a single datatype.

    For every candidate pair a contingency table is built and tested with `fisher`; pairs whose adjusted
    value (from `p_adjust`) is not below `Arguments.FDR` are discarded. The surviving results are pickled to
    the "MOCA.results" path under either a default name or `Arguments.Filename`.

    Reads from Arguments: Data, NA, CorrectionMethod, FDR, Filename, FeatureMin (and passes the whole object
    to `load_data` and `make_report`).

    Returns None. Exits the interpreter if more than two datatypes are supplied.
    """

    if len(Arguments.Data) > 2:
        # Parenthesised single-argument prints behave the same as a Python 2
        # statement and as a Python 3 function call.
        print("Unsupervised pairwise calculations can consider no more that two datatypes at a time.")
        print("If you provide only one datatype, all intra-datatype pairs will be considered. If you")
        print("provide two datatypes, all inter-datatype comparisons will be made. Please change the")
        print("'Data = ' field. Exiting...")
        # Same effect as the site-provided exit(), without depending on the site module.
        raise SystemExit

    Data = load_data(Arguments)

    # Flatten the per-datatype lists; Variates[i] is used as the variate of Features[i].
    # NOTE(review): this assumes both .values() calls iterate the datatypes in the same order - confirm.
    Features = list(chain(*Data.Transformed.Features.values()))
    Variates = list(chain(*Data.Transformed.Variates.values()))
    if len(Arguments.Data) == 1:
        Features1 = Features
        Features2 = Features
    elif len(Arguments.Data) == 2:
        Features1 = Data.Transformed.Features[Arguments.Data[0]]
        Features2 = Data.Transformed.Features[Arguments.Data[1]]

    # Position of the first occurrence of each feature (what list.index would return),
    # built once instead of scanning the list for every pair.
    FeatureIndex = {}
    for Index, Feature in enumerate(Features):
        FeatureIndex.setdefault(Feature, Index)

    PValues = {}
    Interactions = {}
    SampleCounts = {}
    CaseCounts = {}  # just the positive class here
    Performances = {}
    EffectSizes = {}
    # Features already used as the first member of a pair; skipping them as the second
    # member avoids self-pairs and mirrored duplicates.
    Tested = set()
    for Feature1 in Features1:
        Tested.add(Feature1)
        Variate1 = Variates[FeatureIndex[Feature1]]
        for Feature2 in Features2:
            if Feature2 in Tested:
                continue
            Pair = (Feature1, Feature2)
            a, b, c, d = contingency_table(Variate1, Variates[FeatureIndex[Feature2]], NA=Arguments.NA)
            PValue = fisher(a, b, c, d)
            PValues[Pair] = PValue.two_tail
            Interactions[Pair] = interaction(PValue)
            SampleCounts[Pair] = a + b + c + d
            CaseCounts[Pair] = a + c
            # A placeholder solely to make pairwise post-processing generalizable
            Performances[Pair] = "NA"
            EffectSizes[Pair] = "NA"

    # FDRs is looked up by the raw p-value of each pair.
    FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
    # Iterate over a snapshot because failing pairs are removed from PValues inside the loop.
    for Pair, PValue in list(PValues.items()):
        if FDRs[PValue] < Arguments.FDR:
            continue
        for Table in (PValues, Interactions, SampleCounts, CaseCounts, Performances, EffectSizes):
            Table.pop(Pair, None)

    Results = {}
    Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments)
    Results["PValues"] = PValues
    Results["Interactions"] = Interactions
    Results["FDRs"] = FDRs
    Results["SampleCounts"] = SampleCounts
    Results["CaseCounts"] = CaseCounts
    Results["Performances"] = Performances
    Results["EffectSizes"] = EffectSizes

    if Arguments.Filename.lower() == "default":
        Pickle = "_".join(
            ["Pairwise", "_".join(sorted(Arguments.Data)), str(Arguments.FeatureMin), Arguments.CorrectionMethod]
        )
    else:
        Pickle = Arguments.Filename

    # Use a context manager so the output file is flushed and closed even if dump raises.
    with open(get_path("MOCA.results") + "/" + Pickle, "wb") as PickleFile:
        cPickle.dump(Results, PickleFile, -1)

    return
Exemplo n.º 5
0
def pairwise_continuous(Arguments):
    """
    Pairwise calculations on the untransformed features: for every candidate pair, compute
    `correlation_pvalue` and `correlation` of the two variates.

    Candidate pairs:
      * if `Arguments.Phenotype` is set, features for which `Arguments.Phenotype in Feature` holds are
        paired with all features for which it does not;
      * otherwise all intra-datatype pairs (one datatype) or all inter-datatype pairs (two datatypes).

    Pairs whose adjusted value (from `p_adjust`) is not below `Arguments.FDR` are discarded. If any pair
    survives, the results are pickled to the "MOCA.results" path under either a default name or
    `Arguments.Filename`; if none survives, nothing is written.

    Reads from Arguments: Data, Phenotype, CorrectionMethod, FDR, Filename (and passes the whole object
    to `load_data` and `make_report`).

    Returns None. Exits the interpreter if more than two datatypes are supplied.
    """

    if len(Arguments.Data) > 2:
        # Parenthesised single-argument prints behave the same as a Python 2
        # statement and as a Python 3 function call.
        print("Unsupervised pairwise calculations can consider no more that two datatypes at a time.")
        print("If you provide only one datatype, all intra-datatype pairs will be considered. If you")
        print("provide two datatypes, all inter-datatype comparisons will be made. Please change the")
        print("'Data = ' field. Exiting...")
        # Same effect as the site-provided exit(), without depending on the site module.
        raise SystemExit

    Data = load_data(Arguments)

    # Flatten the per-datatype lists; Variates[i] is used as the variate of Features[i].
    # NOTE(review): this assumes both .values() calls iterate the datatypes in the same order - confirm.
    Features = list(chain(*Data.Features.values()))
    Variates = list(chain(*Data.Variates.values()))

    if Arguments.Phenotype:
        Features1 = [Feature for Feature in Features if Arguments.Phenotype in Feature]
        Features2 = [Feature for Feature in Features if Arguments.Phenotype not in Feature]
    elif len(Arguments.Data) == 1:
        Features1 = Features
        Features2 = Features
    elif len(Arguments.Data) == 2:
        Features1 = Data.Features[Arguments.Data[0]]
        Features2 = Data.Features[Arguments.Data[1]]

    # Position of the first occurrence of each feature (what list.index would return),
    # built once instead of scanning the list four times per pair.
    FeatureIndex = {}
    for Index, Feature in enumerate(Features):
        FeatureIndex.setdefault(Feature, Index)

    PValues = {}
    Correlations = {}
    # Features already used as the first member of a pair; skipping them as the second
    # member avoids self-pairs and mirrored duplicates.
    Tested = set()
    for Feature1 in Features1:
        Tested.add(Feature1)
        Variate1 = Variates[FeatureIndex[Feature1]]
        for Feature2 in Features2:
            if Feature2 in Tested:
                continue
            Pair = (Feature1, Feature2)
            Variate2 = Variates[FeatureIndex[Feature2]]
            PValues[Pair] = correlation_pvalue(Variate1, Variate2)
            Correlations[Pair] = correlation(Variate1, Variate2)

    # FDRs is looked up by the raw p-value of each pair.
    FDRs = p_adjust(PValues, Arguments.CorrectionMethod)
    # Iterate over a snapshot because failing pairs are removed from PValues inside the loop.
    for Pair, PValue in list(PValues.items()):
        if FDRs[PValue] < Arguments.FDR:
            continue
        PValues.pop(Pair, None)
        Correlations.pop(Pair, None)

    # Only write a results file when at least one pair passed the threshold.
    if PValues:
        Results = {}
        Results["Report"] = make_report(Data.Labels, PValues.keys(), Arguments, Supervised=Arguments.Phenotype)
        Results["PValues"] = PValues
        Results["Correlations"] = Correlations
        Results["FDRs"] = FDRs

        if Arguments.Filename.lower() == "default":
            Pickle = "_".join(["_".join(sorted(Arguments.Data)), Arguments.CorrectionMethod])
        else:
            Pickle = Arguments.Filename

        # Use a context manager so the output file is flushed and closed even if dump raises.
        with open(get_path("MOCA.results") + "/" + Pickle, "wb") as PickleFile:
            cPickle.dump(Results, PickleFile, -1)

    return
Exemplo n.º 6
0
                true_positive += 1
            else:
                false_positive += 1
        elif y[i] == 0:
            if prediction == 0:
                true_negative += 1
            else:
                false_negative += 1

    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)

    return (2 * precision * recall) / (precision + recall)


# Load the data set. Each row is sliced below as 24 feature values (d[:24])
# followed by the label at index 24.
data = load_data()

# Train on the first 80% of the rows and hold out the remaining 20% for
# testing. The split index is computed once so the two partitions can never
# overlap (slicing the test set from the 20% mark would re-use most of the
# training rows and make the test score meaningless).
split_index = int(len(data) * .8)
train_rows = data[:split_index]
test_rows = data[split_index:]

train_X = [d[:24] for d in train_rows]
train_y = [d[24] for d in train_rows]

test_X = [d[:24] for d in test_rows]
test_y = [d[24] for d in test_rows]

# Linear-kernel SVC: fit on the training rows, report F-measure on both sets.
clf = SVC(kernel='linear')
clf.fit(train_X, train_y)

print("SVC linear training set f-measure: ", f_measure(clf, train_X, train_y))
print("SVC linear test set f-measure: ", f_measure(clf, test_X, test_y))

# RBF-kernel SVC fitted on the same training rows.
clf = SVC(kernel='rbf')
clf.fit(train_X, train_y)