Example #1
def do_surveys():
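    # Plots NASA-TLX workload results and post-experiment survey ratings per
    # experiment: box + swarm plots of the totals and bar/strip plots of the
    # individual components and questions. Assumes the module defines the
    # `figure` context manager, `fig_size`, the `cmap_complement` palette,
    # and the `tlx`/`surveys` DataFrames.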
    with figure("tlx_results", figsize=fig_size(0.44, 1)):
        sns.factorplot(x="experiment", y="tlx", data=tlx, kind="box")
        sns.swarmplot(x="experiment", y=r"tlx",
                      data=tlx, palette=cmap_complement, split=True)
        plt.ylim(0, plt.ylim()[1])
        plt.ylabel("NASA-TLX weighted score")

    with figure("tlx_components", figsize=fig_size(0.44, 1)):
        components = ["mental", "physical", "temporal", "performance",
                      "effort", "frustration"]
        molten = pd.melt(tlx, id_vars=["user", "experiment", "order"],
                         value_vars=components,
                         var_name="component", value_name="score")
        g = sns.barplot(x=r"component", y="score", hue="experiment",
                        data=molten)

        plt.gca().set_xticklabels(
                ["MD", "PD", "TD", "P", "E", "F"])

        plt.xlabel("NASA-TLX component")
        plt.ylabel("score")

    with figure("survey_results", fig_size(0.44, 1)):
        sns.factorplot(x="experiment", y="total", data=surveys, kind="box")
        sns.swarmplot(x="experiment", y=r"total", data=surveys, palette=cmap_complement, split=True)
        plt.ylim(0, plt.ylim()[1])
        plt.ylabel("survey score")

    with figure("survey_components", figsize=fig_size(0.9, 0.5)):
        molten = pd.melt(surveys, id_vars=["user", "experiment", "order"],
                         value_vars=["orientation_understanding",
                                     "orientation_control",
                                     "position_understanding",
                                     "position_control",
                                     "spacial_understanding",
                                     "spacial_control"],
                         var_name="question", value_name="rating")
        g = sns.barplot(x=r"rating", y=r"question", hue="experiment",
                        data=molten)
        sns.stripplot(x="rating", y=r"question", data=molten, hue="experiment",
                      split=True, palette=cmap_complement, jitter=0.6, size=3)

        plt.gca().set_yticklabels(
                ["angle aware", "angle control",
                 "position aware", "position control",
                 "rel. pos. aware", "rel. pos. control"])

        handles, labels = g.get_legend_handles_labels()
        plt.legend(handles[2:], labels[2:])
        plt.xlabel("rating")
        plt.title("Survey results")
Example #2
def do_durations():
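    # Task duration per experiment (box + swarm plot) and duration across
    # runs by experiment. Assumes `figure`, `fig_size`, `cmap_complement`,
    # and the `analyses` DataFrame are defined at module level.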
    with figure("duration", figsize=fig_size(0.44, 1)):
        sns.factorplot(x="experiment", y="duration", data=analyses, kind="box")
        sns.swarmplot(x="experiment", y="duration", split=True, data=analyses,
                      palette=cmap_complement)
        plt.ylim(0, plt.ylim()[1])
        plt.ylabel("duration (s)")

    with figure("duration_runs", figsize=fig_size(0.44, 1)):
        sns.factorplot(x="order", y="duration", hue="experiment", data=analyses,
                       capsize=0.2)
        plt.ylim(0, plt.ylim()[1])
        plt.ylabel("duration (s)")
        plt.xlabel("run")
Example #3
 def _update_plot(self, axis, view):
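     # Dispatch on self.plot_type to the matching seaborn plotting function,
     # passing the element's data and the processed style options. The grid
     # types (pairplot/pairgrid/facetgrid) build a seaborn grid, apply any
     # 'map*' style entries as plotting callables, then adopt the resulting
     # figure.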
     style = self._process_style(self.style[self.cyclic_index])
     if self.plot_type == 'factorplot':
         opts = dict(style, **({'hue': view.x2} if view.x2 else {}))
         sns.factorplot(x=view.x, y=view.y, data=view.data, **opts)
     elif self.plot_type == 'regplot':
         sns.regplot(x=view.x, y=view.y, data=view.data, ax=axis, **style)
     elif self.plot_type == 'boxplot':
         style.pop('return_type', None)
         style.pop('figsize', None)
         sns.boxplot(view.data[view.y], view.data[view.x], ax=axis, **style)
     elif self.plot_type == 'violinplot':
         if view.x:
             sns.violinplot(view.data[view.y],
                            view.data[view.x],
                            ax=axis,
                            **style)
         else:
             sns.violinplot(view.data, ax=axis, **style)
     elif self.plot_type == 'interact':
         sns.interactplot(view.x,
                          view.x2,
                          view.y,
                          data=view.data,
                          ax=axis,
                          **style)
     elif self.plot_type == 'corrplot':
         sns.corrplot(view.data, ax=axis, **style)
     elif self.plot_type == 'lmplot':
         sns.lmplot(x=view.x, y=view.y, data=view.data, ax=axis, **style)
     elif self.plot_type in ['pairplot', 'pairgrid', 'facetgrid']:
         style_keys = list(style.keys())
         map_opts = [(k, style.pop(k)) for k in style_keys if 'map' in k]
         if self.plot_type == 'pairplot':
             g = sns.pairplot(view.data, **style)
         elif self.plot_type == 'pairgrid':
             g = sns.PairGrid(view.data, **style)
         elif self.plot_type == 'facetgrid':
             g = sns.FacetGrid(view.data, **style)
         for opt, args in map_opts:
             plot_fn = getattr(sns, args[0]) if hasattr(
                 sns, args[0]) else getattr(plt, args[0])
             getattr(g, opt)(plot_fn, *args[1:])
         plt.close(self.handles['fig'])
         self.handles['fig'] = plt.gcf()
     else:
         super(SNSFramePlot, self)._update_plot(axis, view)
Example #4
 def _update_plot(self, axis, view):
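     # Same dispatch as in the previous example; this variant only closes the
     # previous figure when self._close_figures is set before adopting the
     # new one.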
     style = self._process_style(self.style[self.cyclic_index])
     if self.plot_type == 'factorplot':
         opts = dict(style, **({'hue': view.x2} if view.x2 else {}))
         sns.factorplot(x=view.x, y=view.y, data=view.data, **opts)
     elif self.plot_type == 'regplot':
         sns.regplot(x=view.x, y=view.y, data=view.data,
                     ax=axis, **style)
     elif self.plot_type == 'boxplot':
         style.pop('return_type', None)
         style.pop('figsize', None)
         sns.boxplot(view.data[view.y], view.data[view.x], ax=axis,
                     **style)
     elif self.plot_type == 'violinplot':
         if view.x:
             sns.violinplot(view.data[view.y], view.data[view.x], ax=axis,
                            **style)
         else:
             sns.violinplot(view.data, ax=axis, **style)
     elif self.plot_type == 'interact':
         sns.interactplot(view.x, view.x2, view.y,
                          data=view.data, ax=axis, **style)
     elif self.plot_type == 'corrplot':
         sns.corrplot(view.data, ax=axis, **style)
     elif self.plot_type == 'lmplot':
         sns.lmplot(x=view.x, y=view.y, data=view.data,
                    ax=axis, **style)
     elif self.plot_type in ['pairplot', 'pairgrid', 'facetgrid']:
         style_keys = list(style.keys())
         map_opts = [(k, style.pop(k)) for k in style_keys if 'map' in k]
         if self.plot_type == 'pairplot':
             g = sns.pairplot(view.data, **style)
         elif self.plot_type == 'pairgrid':
             g = sns.PairGrid(view.data, **style)
         elif self.plot_type == 'facetgrid':
             g = sns.FacetGrid(view.data, **style)
         for opt, args in map_opts:
             plot_fn = getattr(sns, args[0]) if hasattr(sns, args[0]) else getattr(plt, args[0])
             getattr(g, opt)(plot_fn, *args[1:])
         if self._close_figures:
             plt.close(self.handles['fig'])
         self.handles['fig'] = plt.gcf()
     else:
         super(SNSFramePlot, self)._update_plot(axis, view)
Example #5
def do_errors():
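    # RMS tracking error and distance from the target, overall and per run,
    # with separate panels for the combined, x, and y errors. Uses the same
    # module-level `figure`, `fig_size`, and `analyses` objects as the other
    # plotting helpers.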
    with figure("rms", figsize=fig_size(0.9, 0.4)):
        molten = pd.melt(analyses,
                         id_vars=["user", "experiment", "order", "group"],
                         value_vars=["rms", "rms_x", "rms_y"])
        g = sns.factorplot(x="experiment", y="value", col="variable",
                           data=molten, kind="box")
        g.fig.axes[0].set_title("RMS Error")
        g.fig.axes[1].set_title("RMS Error in $x$")
        g.fig.axes[2].set_title("RMS Error in $y$")
        g.fig.axes[0].set_ylabel("error (m)")

    with figure("rms_runs", figsize=fig_size(0.9, 0.4)):
        molten = pd.melt(analyses,
                         id_vars=["user","experiment", "order", "group"],
                         value_vars=["rms", "rms_x", "rms_y"]),
        g = sns.factorplot(x="order", y="value", hue="experiment",
                           col="variable", data=molten, capsize=0.2)
        g.fig.axes[0].set_title("RMS Error")
        g.fig.axes[1].set_title("RMS Error in $x$")
        g.fig.axes[2].set_title("RMS Error in $y$")
        g.fig.axes[0].set_ylabel("error (m)")
        g.fig.axes[0].set_xlabel("run")
        g.fig.axes[1].set_xlabel("run")
        g.fig.axes[2].set_xlabel("run")

    with figure("distance", figsize=fig_size(0.9, 0.4)):
        molten = pd.melt(analyses,
                         id_vars=["user", "experiment", "order", "group"],
                         value_vars=[r"dist_err", r"x_err", r"y_err"])
        g = sns.factorplot(x="experiment", y="value", col="variable",
                           data=molten, kind="box")
        g.fig.axes[0].set_title("Distance from target")
        g.fig.axes[1].set_title("Distance from target in $x$")
        g.fig.axes[2].set_title("Distance from target in $y$")
        g.fig.axes[0].set_ylabel("distance (m)")
        g.axes[0][0].axhline(0, color="black", linewidth=1, zorder=-1)
        g.axes[0][1].axhline(0, color="black", linewidth=1, zorder=-1)
        g.axes[0][2].axhline(0, color="black", linewidth=1, zorder=-1)
Example #6
def extracted_features_method_classifier_polynomial_features(crossval=True, xpname='cross_validation_classifier',
                                                             oncleaned_data=False,
                                                             learningmethod='xgboost'):
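    # Cross-validates (or, with crossval=False, builds a test submission for)
    # classifiers trained on pre-extracted question-pair features expanded
    # with polynomial terms. Assumes the project's helpers (`load_dataset`,
    # `oversample`, `unison_shuffled_copies`, `plot_training`) and the usual
    # imports (numpy, pandas, seaborn, scipy.sparse as ssp, xgboost,
    # scikit-learn, Keras) are available; note that `sns.plt` only exists in
    # old seaborn releases.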
    pol = PolynomialFeatures()
    print('Loading Training Dataset')
    col = [u'euclidean_distance',
           u'fuzz_token_sort_ratio',
           u'fuzz_partial_token_set_ratio', u'canberra_distance', u'skew_q1vec',
           u'kur_q1vec', u'norm_wmd_train', u'wmd_train',
           u'tfidf_word_match_train', u'fuzz_token_set_ratio',
           u'braycurtis_distance', u'fuzz_partial_ratio', u'minkowski_distance',
           u'fuzz_qratio', u'fuzz_wratio', u'cosine_distance',
           u'fuzz_partial_token_sort_ratio', u'jaccard_distance',
           u'word_match_train',
           # u'skew_q2vec', u'kur_q2vec'
           ]
    # col = [
    #     u'euclidean_distance',
    #        u'fuzz_token_sort_ratio',
    #        u'fuzz_partial_token_set_ratio', u'canberra_distance', u'skew_q1vec',
    #        u'kur_q1vec', u'norm_wmd_train', u'wmd_train',
    #        u'tfidf_word_match_train', u'fuzz_token_set_ratio',
    #        u'braycurtis_distance', u'fuzz_partial_ratio', u'minkowski_distance',
    #        u'fuzz_qratio', u'fuzz_wratio', u'cosine_distance',
    #        u'fuzz_partial_token_sort_ratio', u'jaccard_distance',
    #        u'word_match_train', u'skew_q2vec', u'kur_q2vec']
    # df = load_dataset('train', clean=oncleaned_data)
    df = pd.read_csv('/home/nacim/DATASET_KAGGLE/quora/train.csv')
    Xtrain = load_dataset('train_extracted_features').replace('', 0).replace(np.nan, 0).replace(np.inf, 0).replace(
        -np.inf, 0).astype(np.float32)

    y = df['is_duplicate'].values
    Xtrain = np.array(Xtrain[col])
    # Xtrain = poly.fit_transform(Xtrain)
    Xtrain = pol.fit_transform(Xtrain)


    del df

    if crossval:
        sss = StratifiedShuffleSplit(y=y, n_iter=5, test_size=0.2, )

        result = pd.DataFrame()
        for train, test in sss:
            xtrain, xtest, ytrain, ytest = Xtrain[train], Xtrain[test], y[train], y[test]

            xtrain, ytrain = oversample(ssp.csr_matrix(xtrain), ytrain, p=0.165)
            # xtest = ssp.csr_matrix(xtest)

            # dump_svmlight_file(xtrain,ytrain,path='/')
            s = pd.Series()
            for learningmethod in [
                # 'svm_linear',
                # 'svm_rbf',
                'xgboost',
                'rf',
            ]:
                print(learningmethod)
                if learningmethod == 'rf':
                    estimator = RandomForestClassifier(n_jobs=6, n_estimators=100)
                    estimator.fit(xtrain, ytrain)
                    ypred = estimator.predict_proba(xtest)
                    loss = log_loss(ytest, ypred)
                    print(loss)
                    exit()
                    s[learningmethod] = loss
                elif learningmethod == 'svm_rbf':
                    estimator = SVC(kernel='rbf')
                elif learningmethod == 'svm_linear':
                    estimator = LinearSVC()
                elif learningmethod == 'xgboost':
                    estimator = XGBClassifier(nthread=4, n_estimators=350, max_depth=4)
                    gpu_params = {
                        'objective': 'binary:logistic',
                        'eval_metric': 'logloss',
                        'eta': 0.02,
                        'max_depth': 4,
                        'min_child_weight': 1,
                        'subsample': 0.8,
                        'colsample_bytree': 0.8,
                        # 'updater': 'grow_gpu',
                        'n_estimators': 300,
                        'scale_pos_weight': 1
                    }
                    D_training = xgboost.DMatrix(xtrain, label=ytrain)
                    D_validation = xgboost.DMatrix(xtest, label=ytest)
                    watchlist = [(D_training, 'training'), (D_validation, 'validation')]

                    bst = xgboost.train(gpu_params, D_training, 50000, watchlist, early_stopping_rounds=10000)
                    ypred = bst.predict(D_validation)
                    print(ypred)
                    exit()

                    #

            result = result.append(s.T, ignore_index=True)
        result.to_csv('{0}_clean{1}.csv'.format(xpname, oncleaned_data))
        final = pd.DataFrame()
        for c in result.columns:
            tmp = pd.DataFrame()
            tmp['logloss'] = result[c]
            tmp['classifier'] = c
            final = final.append(tmp, ignore_index=True)
        sns.factorplot(x='classifier', y='logloss', data=final)
        sns.plt.savefig('{0}_clean{1}.pdf'.format(xpname, oncleaned_data), bbox_inches='tight')

    else:
        print('Test submission')
        if learningmethod == 'rf':
            estimator = RandomForestClassifier(n_jobs=5, n_estimators=150)
        elif learningmethod == 'xgboost':
            estimator = XGBClassifier(nthread=4, n_estimators=400, max_depth=5)
        # print 'fitting a %s classifier' % learningmethod
        # estimator.fit(Xtrain, y)

        if learningmethod == 'DL':
            model = Sequential()
            model.add(Dense(1024, input_dim=Xtrain.shape[1],
                            kernel_initializer='normal', activation='sigmoid',
                            bias_initializer='random_normal'))
            # model.add(Dropout(0.3))
            # model.add(Dense(512, kernel_initializer='normal', activation='relu'))
            model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
            # sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)

            model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])

            estimator = model
            y, Xtrain, _ = unison_shuffled_copies(y, Xtrain, np.zeros(len(y)))
            hist = estimator.fit(Xtrain, y, batch_size=1024, epochs=100, validation_split=0.2, shuffle=True,
                                 class_weight={0: 0.79264156344230219, 1: 1.3542873987525375},
                                 verbose=1,
                                 )
            plot_training(hist, 'training_NN_extracted_features___relu')

        result = pd.DataFrame()
        for i in range(1, 21):
            print('read Test %s' % i)
            df = load_dataset('test_part%s_extracted_features' % i, clean=oncleaned_data)
            df = df.replace('', 0).replace(np.nan, 0).replace(np.inf, 0).replace(-np.inf, 0).astype(np.float32)
            dtmp = load_dataset('test_part%s' % i, clean=oncleaned_data)
            test_ids = dtmp['test_id']
            del dtmp
            Xtest = np.array(df[col])
            Xtest = pol.transform(Xtest)
            del df
            ypred = estimator.predict_proba(Xtest)
            # print ypred
            # exit()
            resulttmp = pd.DataFrame()
            resulttmp['test_id'] = test_ids
            if learningmethod == 'DL':
                resulttmp['is_duplicate'] = ypred.ravel()
            else:
                resulttmp['is_duplicate'] = ypred[:, 1]

            result = result.append(resulttmp, ignore_index=True)
        result.to_csv(xpname + '_clean_____relu{0}_{1}.csv'.format(oncleaned_data, learningmethod), index=False)
Example #7
def from_question_representation_method_classifier(crossval=True, xpname='cross_validation_classifier',
                                                   learningmethod='xgboost', merge_method='concat'):
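    # Same cross-validation / submission workflow, but on merged question
    # representations produced by `load_questions_and_merge` instead of the
    # hand-extracted feature columns.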
    df = load_dataset('train', clean=False)
    y = df['is_duplicate'].values
    del df
    dataset = 'train'
    Xtrain = load_questions_and_merge(dataset, method=merge_method)

    if crossval:
        sss = StratifiedShuffleSplit(y=y, n_iter=1, test_size=0.2, )

        result = pd.DataFrame()
        for train, test in sss:

            xtrain, xtest, ytrain, ytest = Xtrain[train], Xtrain[test], y[train], y[test]
            s = pd.Series()
            for learningmethod in [
                'rf',
                'xgboost']:
                print(learningmethod)
                if learningmethod == 'rf':
                    estimator = RandomForestClassifier(n_jobs=6, n_estimators=150)
                elif learningmethod == 'svm_rbf':
                    estimator = SVC(kernel='rbf')
                elif learningmethod == 'svm_linear':
                    estimator = LinearSVC()
                elif learningmethod == 'xgboost':
                    estimator = XGBClassifier(nthread=6, n_estimators=300, max_depth=4)

                estimator.fit(xtrain, ytrain)

                ypred = estimator.predict_proba(xtest)
                loss = log_loss(ytest, ypred)
                print(loss)

                s[learningmethod] = loss
            result = result.append(s.T, ignore_index=True)
        result.to_csv('{0}_merge{1}.csv'.format(xpname, merge_method))
        final = pd.DataFrame()
        for c in result.columns:
            tmp = pd.DataFrame()
            tmp['logloss'] = result[c]
            tmp['classifier'] = c
            final = final.append(tmp, ignore_index=True)
        sns.factorplot(x='classifier', y='logloss', data=final)
        sns.plt.savefig('{0}_merge{1}.pdf'.format(xpname, merge_method), bbox_inches='tight')

    else:
        print('Test submission')
        if learningmethod == 'rf':
            estimator = RandomForestClassifier(n_jobs=6, n_estimators=150)
        elif learningmethod == 'svm_rbf':
            estimator = SVC(kernel='rbf')
        elif learningmethod == 'svm_linear':
            estimator = SVR(kernel='linear')
        elif learningmethod == 'xgboost':
            estimator = XGBClassifier(nthread=6, n_estimators=300, max_depth=4)
        print('fitting a %s classifier' % learningmethod)
        estimator.fit(Xtrain, y)

        result = pd.DataFrame()
        for i in range(1, 21):
            print('read Test %s' % i)
            Xtest = load_questions_and_merge('test_part%s' % (i), method=merge_method)
            dtmp = load_dataset('test_part%s' % i, clean=False)
            test_ids = dtmp['test_id']
            del dtmp
            ypred = estimator.predict_proba(Xtest)
            resulttmp = pd.DataFrame()
            resulttmp['test_id'] = test_ids
            resulttmp['is_duplicate'] = ypred[:, 1]
            result = result.append(resulttmp, ignore_index=True)
        result.to_csv(xpname + '_merge{0}_{1}.csv'.format(merge_method, learningmethod), index=False)
Example #8
def extracted_features_method_classifier(crossval=True, xpname='cross_validation_classifier', oncleaned_data=False,
                                         learningmethod='xgboost'):
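    # Variant that trains directly on the full extracted-feature matrix,
    # without the polynomial expansion or column selection used above.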
    print('Loading Training Dataset')
    df = load_dataset('train', clean=oncleaned_data)
    Xtrain = load_dataset('train_extracted_features').replace('', 0).replace(np.nan, 0).replace(np.inf, 0).replace(
        -np.inf, 0).astype(np.float32)

    Xtrain = np.array(Xtrain)
    y = df['is_duplicate'].values
    del df

    if crossval:
        sss = StratifiedShuffleSplit(y=y, n_iter=5, test_size=0.2, )

        result = pd.DataFrame()
        for train, test in sss:

            xtrain, xtest, ytrain, ytest = Xtrain[train], Xtrain[test], y[train], y[test]
            s = pd.Series()
            for learningmethod in [
                # 'svm_linear',
                # 'svm_rbf',
                'xgboost',
                'rf',
            ]:
                print(learningmethod)
                if learningmethod == 'rf':
                    estimator = RandomForestClassifier(n_jobs=4, n_estimators=100)
                    estimator.fit(xtrain, ytrain)
                    ypred = estimator.predict_proba(xtest)
                    loss = log_loss(ytest, ypred)
                    print(loss)
                    s[learningmethod] = loss
                elif learningmethod == 'svm_rbf':
                    estimator = SVC(kernel='rbf')
                elif learningmethod == 'svm_linear':
                    estimator = LinearSVC()
                elif learningmethod == 'xgboost':
                    estimator = XGBClassifier(nthread=4, n_estimators=350, max_depth=4)
                    gpu_params = {
                        'objective': 'binary:logistic',
                        'eval_metric': 'logloss',
                        'eta': 0.01,
                        'max_depth': 9,
                        'min_child_weight': 1,
                        # 'updater': 'grow_gpu',
                        'n_estimators': 1000,
                        'scale_pos_weight': 1
                    }
                    D_training = xgboost.DMatrix(xtrain, label=ytrain)
                    D_validation = xgboost.DMatrix(xtest, label=ytest)
                    watchlist = [(D_training, 'training'), (D_validation, 'validation')]

                    bst = xgboost.train(gpu_params, D_training, 50000, watchlist, early_stopping_rounds=10000,
                                        verbose_eval=50)
                    ypred = bst.predict(D_validation)
                    print(ypred)
                    exit()

                    #

            result = result.append(s.T, ignore_index=True)
        result.to_csv('{0}_clean{1}.csv'.format(xpname, oncleaned_data))
        final = pd.DataFrame()
        for c in result.columns:
            tmp = pd.DataFrame()
            tmp['logloss'] = result[c]
            tmp['classifier'] = c
            final = final.append(tmp, ignore_index=True)
        sns.factorplot(x='classifier', y='logloss', data=final)
        sns.plt.savefig('{0}_clean{1}.pdf'.format(xpname, oncleaned_data), bbox_inches='tight')

    else:
        print('Test submission')
        if learningmethod == 'rf':
            estimator = RandomForestClassifier(n_jobs=5, n_estimators=150)
        elif learningmethod == 'svm_rbf':
            estimator = SVC(kernel='rbf')
        elif learningmethod == 'svm_linear':
            estimator = SVR(kernel='linear')
        elif learningmethod == 'xgboost':
            estimator = XGBClassifier(nthread=4, n_estimators=300, max_depth=4)
        print('fitting a %s classifier' % learningmethod)
        estimator.fit(Xtrain, y)

        result = pd.DataFrame()
        for i in range(1, 21):
            print('read Test %s' % i)
            df = load_dataset('test_part%s_extracted_features' % i, clean=oncleaned_data)
            df = df.replace('', 0).replace(np.nan, 0).replace(np.inf, 0).replace(-np.inf, 0).astype(np.float32)
            dtmp = load_dataset('test_part%s' % i, clean=oncleaned_data)
            test_ids = dtmp['test_id']
            del dtmp
            Xtest = np.array(df)
            del df
            ypred = estimator.predict_proba(Xtest)
            resulttmp = pd.DataFrame()
            resulttmp['test_id'] = test_ids
            resulttmp['is_duplicate'] = ypred[:, 1]

            result = result.append(resulttmp, ignore_index=True)
        result.to_csv(xpname + '_clean{0}_{1}.csv'.format(oncleaned_data, learningmethod), index=False)
Example #9
def naive_method_classifier(crossval=True, xpname='', learningmethod='rf', oncleaned_data=False, maxfeature=100):
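    # Naive baseline: TF-IDF vectors for question1 and question2, with the
    # absolute difference of the two vectors used as the feature matrix.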
    print('Loading Training Dataset')
    df = load_dataset('train', clean=oncleaned_data)
    first = df[['question1']]
    second = df[['question2']]
    first.columns = ["question"]
    second.columns = ["question"]
    dfq = pd.concat([first, second], axis=0, ignore_index=True).fillna('')
    tfidf = TfidfVectorizer(max_features=maxfeature, stop_words='english').fit_transform(dfq['question'].values)

    N = len(df)
    X_tfidf = (np.abs(tfidf[:N] - tfidf[N:])).toarray()
    y = df['is_duplicate'].values
    if crossval:
        sss = StratifiedShuffleSplit(y=y, n_iter=5, test_size=0.2, )

        result = pd.DataFrame()
        for train, test in sss:

            xtrain, xtest, ytrain, ytest = X_tfidf[train], X_tfidf[test], y[train], y[test]
            s = pd.Series()
            for learningmethod in [
                # 'svm_linear',
                'rf',
                'xgboost']:
                print(learningmethod)
                if learningmethod == 'rf':
                    estimator = RandomForestClassifier(n_jobs=4)
                elif learningmethod == 'svm_rbf':
                    estimator = SVC(kernel='rbf')
                elif learningmethod == 'svm_linear':
                    estimator = SVR(kernel='linear')
                elif learningmethod == 'xgboost':
                    estimator = XGBClassifier(nthread=4, n_estimators=350, max_depth=4)

                estimator.fit(xtrain, ytrain)

                ypred = estimator.predict_proba(xtest)
                loss = log_loss(ytest, ypred)
                print(loss)

                s[learningmethod] = loss
            result = result.append(s.T, ignore_index=True)
        result.to_csv('{0}_clean{1}.csv'.format(xpname, oncleaned_data))
        final = pd.DataFrame()
        for c in result.columns:
            tmp = pd.DataFrame()
            tmp['logloss'] = result[c]
            tmp['classifier'] = c
            final = final.append(tmp, ignore_index=True)
        sns.factorplot(x='classifier', y='logloss', data=final)
        sns.plt.savefig('{0}_clean{1}.pdf'.format(xpname, oncleaned_data), bbox_inches='tight')

    else:
        print('Test submission')

        print('Load  Test Dataset')
        df = load_dataset('test', clean=oncleaned_data)

        first = df[['question1']]
        second = df[['question2']]
        first.columns = ["question"]
        second.columns = ["question"]
        dfq = pd.concat([first, second], axis=0, ignore_index=True).fillna('')
        tfidf = TfidfVectorizer(max_features=maxfeature, stop_words='english').fit_transform(dfq['question'].values)

        N = len(df)
        Xtest = (np.abs(tfidf[:N] - tfidf[N:])).toarray()

        if learningmethod == 'rf':
            estimator = RandomForestClassifier(n_jobs=4)
        elif learningmethod == 'svm_rbf':
            estimator = SVC(kernel='rbf')
        elif learningmethod == 'svm_linear':
            estimator = SVR(kernel='linear')
        elif learningmethod == 'xgboost':
            estimator = XGBClassifier(nthread=4, n_estimators=300, max_depth=4)
        print('fitting a %s classifier' % learningmethod)
        estimator.fit(X_tfidf, y)

        ypred = estimator.predict_proba(Xtest)
        result = pd.DataFrame()
        result['test_id'] = df['test_id']
        result['is_duplicate'] = ypred[:, 1]
        result.to_csv(xpname + '_clean{0}_{1}.csv'.format(oncleaned_data, learningmethod), index=False)
Example #10
def do_movement():
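    # Movement statistics per experiment and per run: path length and
    # movement broken down by axis and direction (left/right,
    # forwards/backwards), plus a dedicated backwards-movement plot.
    # Uses the same module-level plotting helpers as above.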
    with figure("movement", figsize=fig_size(0.9, 0.4)):
        molten = pd.melt(analyses,
                         id_vars=["user", "experiment", "order", "group"],
                         value_vars=["path_length", "move_x", "move_y"])
        g = sns.factorplot(x="experiment", y="value", col="variable",
                           data=molten, kind="box")
        g.fig.axes[0].set_title("Path length")
        g.fig.axes[1].set_title("Movement in $x$")
        g.fig.axes[2].set_title("Movement in $y$")
        g.fig.axes[0].set_ylabel("distance (m)")
        plt.ylim(0, plt.ylim()[1])

    with figure("movement_x"):
        molten = pd.melt(analyses,
                         id_vars=["user", "experiment", "order", "group"],
                         value_vars=["move_l", "move_r", "move_x"])
        g = sns.factorplot(x="experiment", y="value", col="variable",
                           data=molten, kind="box")
        g.fig.axes[0].set_title("Movement left")
        g.fig.axes[1].set_title("Movement right")
        g.fig.axes[2].set_title("Movement in $x$")
        g.fig.axes[0].set_ylabel("distance (m)")
        plt.ylim(0, plt.ylim()[1])

    with figure("movement_y"):
        molten = pd.melt(analyses,
                         id_vars=["user", "experiment", "order", "group"],
                         value_vars=["move_b", "move_f", "move_y"])
        g = sns.factorplot(x="experiment", y="value", col="variable",
                           data=molten, kind="box")
        g.fig.axes[0].set_title("Movement backwards")
        g.fig.axes[1].set_title("Movement forwards")
        g.fig.axes[2].set_title("Movement in $y$")
        g.fig.axes[0].set_ylabel("distance (m)")
        plt.ylim(0, plt.ylim()[1])

    with figure("movement_back"):
        sns.factorplot(x="experiment", y="move_b", data=analyses, kind="box")
        sns.swarmplot(x="experiment", y="move_b", split=True, data=analyses,
                      palette=cmap_complement)
        plt.ylabel("distance (m)")
        plt.title("Movement backwards")

    with figure("movement_runs", figsize=fig_size(0.9, 0.4)):
        molten = pd.melt(analyses,
                         id_vars=["user", "experiment", "order", "group"],
                         value_vars=["path_length", "move_x", "move_y"])
        g = sns.factorplot(x="order", y="value", col="variable",
                           data=molten, hue="experiment", capsize=0.2)
        g.fig.axes[0].set_title("Path length")
        g.fig.axes[1].set_title("Movement in $x$")
        g.fig.axes[2].set_title("Movement in $y$")
        g.fig.axes[0].set_ylabel("distance (m)")
        g.fig.axes[0].set_xlabel("run")
        g.fig.axes[1].set_xlabel("run")
        g.fig.axes[2].set_xlabel("run")
        plt.ylim(0, plt.ylim()[1])

    with figure("movement_x_runs"):
        molten = pd.melt(analyses,
                         id_vars=["user", "experiment", "order", "group"],
                         value_vars=["move_l", "move_r", "move_x"])
        g = sns.factorplot(x="order", y="value", col="variable",
                           data=molten, hue="experiment")
        g.fig.axes[0].set_title("Movement left")
        g.fig.axes[1].set_title("Movement right")
        g.fig.axes[2].set_title("Movement in $x$")
        g.fig.axes[0].set_ylabel("distance (m)")
        g.fig.axes[0].set_xlabel("run")
        g.fig.axes[1].set_xlabel("run")
        g.fig.axes[2].set_xlabel("run")
        plt.ylim(0, plt.ylim()[1])

    with figure("movement_y_runs"):
        molten = pd.melt(analyses,
                         id_vars=["user", "experiment", "order", "group"],
                         value_vars=["move_b", "move_f", "move_y"])
        g = sns.factorplot(x="order", y="value", col="variable",
                           data=molten, hue="experiment")
        g.fig.axes[0].set_title("Movement backwards")
        g.fig.axes[1].set_title("Movement forwards")
        g.fig.axes[2].set_title("Movement in $y$")
        g.fig.axes[0].set_ylabel("distance (m)")
        g.fig.axes[0].set_xlabel("run")
        g.fig.axes[1].set_xlabel("run")
        g.fig.axes[2].set_xlabel("run")
        plt.ylim(0, plt.ylim()[1])