def get_data_Kfold(mode):
    '''
    Load the gray-scale image data for K-fold training or for testing.

    :param str mode: ``"train"`` or ``"test"``.
    :returns: for ``"train"`` a tuple ``(data_df, keys, kf)`` where
        ``data_df`` is the frame built by ``f.make_data_df`` with its index
        reset and the columns ``pngname`` / ``input`` / ``label``, ``keys``
        is a numpy array of the training-data keys and ``kf`` is a 5-fold
        ``KFold`` over ``len(keys)`` samples; for ``"test"`` the test gray
        data exactly as returned by ``i_p.load_data()``.
    :raises SystemExit: with exit status 1 when ``mode`` is neither value.
    '''
    if mode == "train":
        _, _, _, train_gray_data, _, _, labels = i_p.load_data()

        data_df = f.make_data_df(train_gray_data, labels)
        data_df = data_df.reset_index()
        data_df.columns = ["pngname", "input", "label"]

        # list(...) keeps this a 1-d array of keys even when keys() returns
        # a view object instead of a list.
        keys = np.asarray(list(train_gray_data.keys()))
        kf = cross_validation.KFold(n=len(keys), n_folds=5)

        return data_df, keys, kf

    if mode == "test":
        _, _, _, _, test_gray_data, _, _ = i_p.load_data()
        return test_gray_data

    print("mode error!")
    print("set \"train\" or \"test\"")
    # quit() is a helper injected by the site module for interactive use and
    # exits with status 0; raise SystemExit directly with a failure status.
    raise SystemExit(1)
def reprediction():
    '''
    Predict the test images again with a previously pickled classifier and
    write the result as a submission CSV.

    The classifier is loaded from
    ``../tmp/fit_instance/GB22015_10_04_07_30_36.pickle`` relative to this
    file.  For every test image the mean of the ``f.transformer_middle``
    features selects the prediction path: at 0.2 or above the classifier
    predicts from ``convert_testdata(test_img)``, otherwise the
    ``f.transformer_gray`` features are used as the output directly.  In both
    cases every value is passed through ``zero_one`` and the result is
    reshaped to the image shape.

    The CSV has the columns ``id`` (``<pngname>_<row>_<column>``, row and
    column 1-based) and ``value`` and is written below ``SUBMISSION_DIR``
    with a timestamped file name.  With no test images a header-only CSV is
    written.
    '''
    _, _, _, _, test_gray_data, _, _ = i_p.load_data()
    test_keys = test_gray_data.keys()

    test_df = f.make_test_df(test_gray_data)
    test_df = test_df.reset_index()
    test_df.columns = ["pngname", "input"]

    clf_dir = os.path.abspath(os.path.dirname(__file__)) +\
        "/../tmp/fit_instance/"
    savefile = clf_dir + "GB22015_10_04_07_30_36.pickle"
    # NOTE(review): pickle.load runs code embedded in the file; only load
    # pickles produced by this project.  The text mode "r" mirrors the "w"
    # mode the classifier is dumped with in prediction(); change both
    # together if binary pickles are wanted.
    with open(savefile, "r") as fi:
        clf = pickle.load(fi)

    # Loop-invariant: build the element-wise wrapper once.
    zo = np.vectorize(zero_one)

    frames = []
    for imgname in test_keys:
        test_img = test_df[(test_df["pngname"] == imgname)]["input"].as_matrix()[0]
        shape = test_img.shape

        single_img = {imgname: test_img}
        X_middle = convert_testdata(single_img, f.transformer_middle)
        middle_ratio = X_middle.mean()
        if middle_ratio >= 0.2:
            X_test = convert_testdata(single_img)
            output = np.asarray(clf.predict(X_test))
        else:
            X_test = convert_testdata(single_img, f.transformer_gray)
            output = np.asarray(X_test)
        output = zo(output).reshape(shape)

        # One [id, value] row per pixel, ids are 1-based.
        rows = [
            [imgname + "_" + str(row + 1) + "_" + str(column + 1), value]
            for row, line in enumerate(output)
            for column, value in enumerate(line)
        ]
        frames.append(pd.DataFrame(rows, columns=["id", "value"]))

    # Concatenate once instead of re-copying the growing frame per image;
    # fall back to an empty frame so zero test images no longer fail with a
    # NameError.
    if frames:
        predict_df = pd.concat(frames)
    else:
        predict_df = pd.DataFrame(columns=["id", "value"])

    now = datetime.datetime.now()
    submission_path = SUBMISSION_DIR + "/submission_repredict" + now.strftime("%Y_%m_%d_%H_%M_%S") + ".csv"
    predict_df.to_csv(submission_path, header=True, index=False)
def get_data():
    '''
    Build the training feature matrix and the flattened targets.

    The training gray data and labels from ``i_p.load_data()`` are turned
    into a frame by ``f.make_data_df``; ``X`` is the output of a
    ``FeatureUnion`` over ``f.feature_transformer_rule`` fitted on that
    frame and ``y`` is the concatenation of every flattened ``label`` entry.

    :rtype: tuple
    :returns: ``(X, y)``
    '''
    def _flatten(label):
        # Each label entry is flattened before concatenation.
        return label.flatten()

    _, _, _, train_gray_data, _, _, labels = i_p.load_data()
    frame = f.make_data_df(train_gray_data, labels)

    union = FeatureUnion(transformer_list=f.feature_transformer_rule)
    X = union.fit_transform(frame)

    flat_labels = frame["label"].apply(_flatten)
    y = np.concatenate(flat_labels)

    return (X, y)
def make_checkdata(mode="df"):
    '''
    Build a small data set for quick operation checks.

    Only the first two keys of the training data and the first two keys of
    the test data (in the order ``keys()`` yields them) are used.

    :param str mode: ``"df"`` to get data frames, ``"feature"`` to get
        feature matrices.
    :returns: for ``"df"`` the tuple
        ``(train_df, train_keys, test_df, test_keys)`` where the frames have
        their index reset and the columns ``pngname`` / ``input``
        (/ ``label`` for train); for ``"feature"`` the tuple
        ``(X_train, y_train, X_test)`` where the features come from a
        ``FeatureUnion`` over ``f.feature_transformer_rule`` followed by a
        ``StandardScaler``.
    :raises ValueError: if ``mode`` is neither ``"df"`` nor ``"feature"``.
    '''
    # Reject a bad mode before doing any loading instead of silently
    # returning None after all the work.
    if mode not in ("df", "feature"):
        raise ValueError(
            "mode must be \"df\" or \"feature\", got %r" % (mode,))

    _, _, _, train_gray_data, test_gray_data, _, labels = i_p.load_data()

    # list(...) so the slice also works when keys() returns a view.
    train_keys = list(train_gray_data.keys())[:2]
    test_keys = list(test_gray_data.keys())[:2]

    train_inputs = dict((key, train_gray_data[key]) for key in train_keys)
    train_labels = dict((key, labels[key]) for key in train_keys)
    test_inputs = dict((key, test_gray_data[key]) for key in test_keys)

    train_df = f.make_data_df(train_inputs, train_labels)
    test_df = f.make_test_df(test_inputs)

    if mode == "df":
        train_df = train_df.reset_index()
        test_df = test_df.reset_index()

        train_df.columns = ["pngname", "input", "label"]
        test_df.columns = ["pngname", "input"]

        return train_df, train_keys, test_df, test_keys

    # mode == "feature": the estimators are only needed on this path.
    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    scaler = preprocessing.StandardScaler()

    X_train = fu.fit_transform(train_df)
    X_train = scaler.fit_transform(X_train)
    y_train = np.concatenate(train_df["label"].apply(lambda x: x.flatten()))

    # The scaler must reuse the statistics learned on the training features;
    # re-fitting it on the test features would scale train and test
    # differently.
    # NOTE(review): feature extraction still goes through fit_transform as
    # before; switch to fu.transform if the transformers keep fitted state.
    X_test = fu.fit_transform(test_df)
    X_test = scaler.transform(X_test)

    return X_train, y_train, X_test
def dump_train():
    '''
    Dump the down-sampled training features together with their target
    to ``../tmp/train_dump/train_dump.csv`` (relative to this file).

    Column names are the part after ``"__"`` of each name reported by the
    ``FeatureUnion`` over ``f.feature_transformer_rule``, followed by a
    final ``target`` column.  The features and targets go through
    ``cl.downsampling_data(..., 0.2)`` before being written.
    '''
    _, _, _, train_gray_data, test_gray_data, _, labels = i_p.load_data()

    train_frame = f.make_data_df(train_gray_data, labels)
    test_frame = f.make_test_df(test_gray_data)

    train_frame = train_frame.reset_index()
    test_frame = test_frame.reset_index()

    train_frame.columns = ["pngname", "input", "label"]
    test_frame.columns = ["pngname", "input"]

    union = FeatureUnion(transformer_list=f.feature_transformer_rule)

    # "<transformer>__<feature>" -> "<feature>", plus the target column.
    column_names = []
    for full_name in union.get_feature_names():
        column_names.append(full_name.split("__")[1])
    column_names.append("target")

    features = union.fit_transform(train_frame)
    targets = np.concatenate(
        train_frame["label"].apply(lambda label: label.flatten()))
    features, targets = cl.downsampling_data(features, targets, 0.2)

    table = pd.DataFrame(np.c_[features, targets], columns=column_names)

    here = os.path.abspath(os.path.dirname(__file__))
    dump_path = here + "/../tmp/train_dump"
    table.to_csv(dump_path + "/train_dump.csv", index=False)
def prediction(clf_name):
    '''
    Fit the selected classifier on the training images, pickle it, predict
    every test image and write a submission CSV.

    :param clf_name: key into the module-level ``clf_dict``; the entry's
        ``"clf"`` item is the estimator that gets fitted.  For ``"SGDB"``
        the estimator is trained incrementally with ``partial_fit``, one
        training image at a time; any other estimator is fitted once on the
        down-sampled features of all training images.

    The fitted estimator is pickled to
    ``../tmp/fit_instance/<clf_name><timestamp>.pickle`` relative to this
    file.  The CSV has the columns ``id`` (``<pngname>_<row>_<column>``,
    row and column 1-based) and ``value`` and is written below
    ``SUBMISSION_DIR`` with a timestamped file name.  With no test images a
    header-only CSV is written.
    '''
    clf = clf_dict[clf_name]["clf"]
    print("****************classifier****************")
    print(clf)

    _, _, _, train_gray_data, test_gray_data, _, labels = i_p.load_data()
    train_keys = train_gray_data.keys()
    test_keys = test_gray_data.keys()

    train_df = f.make_data_df(train_gray_data, labels)
    test_df = f.make_test_df(test_gray_data)

    train_df = train_df.reset_index()
    test_df = test_df.reset_index()

    train_df.columns = ["pngname", "input", "label"]
    test_df.columns = ["pngname", "input"]

    # For a quick operation check, train_df / train_keys can be replaced by
    # the small subset from pre.make_checkdata(mode="df").
    if clf_name == "SGDB":
        # Incremental training: one image per partial_fit call.
        for train_key in train_keys:
            train_X, train_y = classify.set_traindata(train_df, train_key)
            clf.partial_fit(train_X, train_y)
    else:
        fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
        train_X = fu.fit_transform(train_df)
        train_y = np.concatenate(train_df["label"].apply(lambda x: x.flatten()))
        train_X, train_y = classify.downsampling_data(train_X, train_y, 0.2)

        clf.fit(train_X, train_y)

    clf_dir = os.path.abspath(os.path.dirname(__file__)) +\
        "/../tmp/fit_instance/"
    now = datetime.datetime.now()
    savefile = clf_dir + clf_name + now.strftime("%Y_%m_%d_%H_%M_%S") + ".pickle"
    # The with-block closes the file even if pickling fails.
    # NOTE(review): text mode "w" is kept because reprediction() reads these
    # pickles with mode "r"; change both together if binary pickles are
    # wanted.
    with open(savefile, "w") as fi:
        pickle.dump(clf, fi)

    # Loop-invariant: build the element-wise wrapper once.
    zo = np.vectorize(zero_one)

    frames = []
    for imgname in test_keys:
        test_img = test_df[(test_df["pngname"] == imgname)]["input"].as_matrix()[0]
        shape = test_img.shape

        X_test = convert_testdata({imgname: test_img})
        output = np.asarray(clf.predict(X_test))
        output = zo(output).reshape(shape)

        # One [id, value] row per pixel, ids are 1-based.
        rows = [
            [imgname + "_" + str(row + 1) + "_" + str(column + 1), value]
            for row, line in enumerate(output)
            for column, value in enumerate(line)
        ]
        frames.append(pd.DataFrame(rows, columns=["id", "value"]))

    # Concatenate once instead of re-copying the growing frame per image;
    # fall back to an empty frame so zero test images no longer fail with a
    # NameError.
    if frames:
        predict_df = pd.concat(frames)
    else:
        predict_df = pd.DataFrame(columns=["id", "value"])

    now = datetime.datetime.now()
    submission_path = SUBMISSION_DIR + "/submission_" + now.strftime("%Y_%m_%d_%H_%M_%S") + ".csv"
    predict_df.to_csv(submission_path, header=True, index=False)
    ('solbel_hol', SobelFilter_hol()),
    ('solbel_ver', SobelFilter_ver()),
    ('raprasian', RapFilter()),
    ('gaussian', GauFilter()),
    ('coordinateX', RelativeCoordinateX()),
    ('coordinateY', RelativeCoordinateY()),
]

# Transformer list with a single step: GrayParamBinary, registered under
# the step name 'average'.
transformer_gray = [('average', GrayParamBinary())]

# Transformer list with a single step: RowMiddleRatio, registered under
# the step name 'average'.
transformer_middle = [('average', RowMiddleRatio())]

if __name__ == '__main__':
    # Manual smoke check: compute the GrayParamBinary feature for the test
    # images and print the matrix, its shape, its mean and its NaN count.
    _, _, _, _, test_gray_data, _, _ = i_p.load_data()
    data_df = make_test_df(test_gray_data)

    transformer_list = [('average', GrayParamBinary())]
    fu = FeatureUnion(transformer_list=transformer_list)
    feature = fu.fit_transform(data_df)

    # Single-argument parenthesised prints emit the same text as the
    # statement form.
    print(feature)
    print(feature.shape)
    print(feature.mean())
    print(np.isnan(feature).sum())