Пример #1
0
def main(args):
    """Plot kNN accuracy against k, with or without a PCA projection.

    Reads ``data/imdb_small.csv``, optionally keeps only its first
    ``args.elem`` rows, and splits it with ``get_instances``.

    When ``args.alpha_Start`` is positive, ``evaluate_knn`` is run once per
    alpha in ``np.arange(args.alpha_Start, args.alpha_Stop,
    args.alpha_Step)`` on data projected with ``PCA(alpha)``, and one curve
    per alpha is drawn.  Otherwise ``evaluate_knn`` is run once on the
    untransformed data and a single curve labelled ``sin pca`` is drawn.

    The figure is saved under ``results/`` with a timestamped name and then
    displayed.

    Args:
        args: Object exposing the attributes ``elem``, ``reps``,
            ``k_Start``, ``k_Stop``, ``k_Step``, ``alpha_Start``,
            ``alpha_Stop`` and ``alpha_Step``.
    """
    import os  # local import: only needed to create the output directory

    # Train and test datasets.
    df = pd.read_csv("data/imdb_small.csv")

    # Trim the dataset for small, quick experiments.
    if args.elem > 0:
        df = df[:args.elem]

    X_train, y_train, X_test, y_test = get_instances(df)

    # PCA is enabled by a positive starting alpha.  The decision is taken
    # once, up front, so the loop variable used below can never change which
    # plotting branch runs afterwards.
    use_pca = args.alpha_Start > 0

    k_range = np.arange(args.k_Start, args.k_Stop, args.k_Step)
    alpha_range = np.arange(args.alpha_Start, args.alpha_Stop,
                            args.alpha_Step)

    # One evaluate_knn result per evaluated configuration; each result is
    # plotted against k_range below.
    results = []

    if use_pca:
        for alpha in alpha_range:
            pca = PCA(alpha)
            pca.fit(X_train.toarray())
            X_train_aux = pca.transform(X_train)
            X_test_aux = pca.transform(X_test)

            results.append(
                evaluate_knn(k_range, X_train_aux, y_train, X_test_aux,
                             y_test, args.reps))
    else:
        results.append(
            evaluate_knn(k_range, X_train, y_train, X_test, y_test,
                         args.reps))

    if use_pca:
        for alpha_value, accs in zip(alpha_range, results):
            plt.plot(k_range, accs, label='alpha = {0}'.format(alpha_value))
    else:
        plt.plot(k_range, results[0], label='sin pca')

    plt.xlabel('k')
    plt.ylabel('accuaracy')
    plt.legend()

    # savefig does not create missing directories; make sure the target
    # exists so a long experiment is not lost at the very last step.
    os.makedirs('results', exist_ok=True)
    plt.savefig('results/k_vs_accuaracy-{}'.format(
        time.strftime("%Y%m%d-%H%M%S")))
    plt.show()
    '''
Пример #2
0
def run_test(df,
             TRAIN_SIZE=6225,
             TEST_SIZE=500,
             BINARIO=False,
             NEGACIONES=False,
             NORMA_PESADA=False,
             IDF=False,
             STOP_WORDS=False):
    """Run one experiment on PCA-reduced features and score every column.

    The data is split with ``get_instances(df, TRAIN_SIZE, TEST_SIZE)`` and
    vectorised with ``vectorizar``.  The training matrix is densified,
    projected with ``PCA(1500)`` and then cut down by ``tomar_porcentaje``;
    the test matrix is projected to the same number of columns.  A
    ``KNNClassifier(1)`` is fitted and queried through ``testearK`` (or
    ``testearK_weighted`` when ``NORMA_PESADA`` is true).

    Args:
        df: Dataset handed to ``get_instances``.
        TRAIN_SIZE: Forwarded to ``get_instances``.
        TEST_SIZE: Forwarded to ``get_instances``.
        BINARIO: Forwarded to ``vectorizar``.
        NEGACIONES: Forwarded to ``vectorizar``.
        NORMA_PESADA: Selects ``testearK_weighted`` instead of ``testearK``.
        IDF: Forwarded to ``vectorizar``.
        STOP_WORDS: Forwarded to ``vectorizar``.

    Returns:
        A list with ``accuracy_score(y_test, column)`` for each column of
        the matrix returned by the classifier.
    """
    print("--------------------------------------------------------")
    print("Test empezado con:")
    configuracion = (
        ("Train size:", TRAIN_SIZE),
        ("Test size:", TEST_SIZE),
        ("Negaciones:", NEGACIONES),
        ("Binario:", BINARIO),
        ("Norma pesada:", NORMA_PESADA),
        ("Stop words:", STOP_WORDS),
        ("IDF:", IDF),
    )
    for etiqueta, valor in configuracion:
        print(etiqueta, valor)

    particion = get_instances(df, TRAIN_SIZE, TEST_SIZE)
    text_train, label_train, text_test, label_test = particion
    X_train, y_train, X_test, y_test = vectorizar(
        text_train, label_train, text_test, label_test,
        BINARIO, IDF, NEGACIONES, STOP_WORDS)

    X_train = X_train.todense()
    # Sum of per-row standard deviations of the dense training matrix,
    # measured before the projection.
    var_total = np.std(X_train, axis=1).sum()

    n_componentes = 1500
    pca = PCA(n_componentes)
    pca.fit(X_train)
    X_train = pca.transform(X_train, n_componentes)
    # NOTE: the 0.03 threshold was marked "to be modified" by the author.
    X_train = tomar_porcentaje(X_train, 0.03, var_total)

    alpha = X_train.shape[1]
    X_test = pca.transform(X_test, alpha)
    print("ALPHA =", alpha)

    clf = KNNClassifier(1)
    clf.fit(X_train, y_train)

    if NORMA_PESADA:
        y_train_norm = y_train - y_train.mean()
        ystd = np.std(y_train)

        covarianzas = np.zeros(alpha)
        correlaciones = np.zeros(alpha)
        for j in range(alpha):
            columna = X_train[:, j]
            covarianzas[j] = ((columna - columna.mean()) *
                              y_train_norm).sum()
            correlaciones[j] = covarianzas[j] / (np.std(columna) * ystd)
        # NOTE(review): correlaciones is computed but never used; the raw
        # covariances are what gets passed as weights -- confirm intended.
        mat = clf.testearK_weighted(X_test, covarianzas)
    else:
        mat = clf.testearK(X_test)

    # One accuracy per column of the returned matrix.
    return [accuracy_score(y_test, mat[:, j]) for j in range(len(mat[0]))]
Пример #3
0
def main(args):
    """Plot kNN accuracy against the PCA alpha, one curve per k.

    Reads ``data/imdb_small.csv``, optionally keeps only its first
    ``args.elem`` rows, and splits it with ``get_instances``.  For every
    alpha in ``np.arange(args.alpha_Start, args.alpha_Stop,
    args.alpha_Step)`` the data is projected with ``PCA(alpha)``; for every
    k in ``np.arange(args.k_Start, args.k_Stop, args.k_Step)`` a
    ``KNNClassifier(k)`` is fitted on the projected training data and scored
    on the projected test data.

    The figure is saved under ``results/`` with a timestamped name and then
    displayed.

    Args:
        args: Object exposing the attributes ``elem``, ``k_Start``,
            ``k_Stop``, ``k_Step``, ``alpha_Start``, ``alpha_Stop`` and
            ``alpha_Step``.
    """
    import os  # local import: only needed to create the output directory

    # Train and test datasets.
    df = pd.read_csv("data/imdb_small.csv")

    # Trim the dataset for small, quick experiments.
    if args.elem > 0:
        df = df[:args.elem]

    X_train, y_train, X_test, y_test = get_instances(df)

    k_range = np.arange(args.k_Start, args.k_Stop, args.k_Step)
    alpha_range = np.arange(args.alpha_Start, args.alpha_Stop,
                            args.alpha_Step)

    # results[i] collects, for k_range[i], one accuracy per alpha.
    results = [[] for _ in k_range]

    for alpha in alpha_range:
        pca = PCA(alpha)
        pca.fit(X_train.toarray())
        X_train_aux = pca.transform(X_train)
        X_test_aux = pca.transform(X_test)

        for accs, k in zip(results, k_range):
            clf = KNNClassifier(k)
            # Train and evaluate on the PCA projections.  Using the
            # untransformed matrices here would make every alpha produce
            # the same accuracy and the sweep meaningless.
            clf.fit(X_train_aux, y_train)
            y_pred = clf.predict(X_test_aux)
            accs.append(accuracy_score(y_test, y_pred))

    for k, accs in zip(k_range, results):
        plt.plot(alpha_range, accs, label='k = {0}'.format(k))

    plt.xlabel('alpha')
    plt.ylabel('accuaracy')
    plt.legend()

    # savefig does not create missing directories; make sure the target
    # exists so a long experiment is not lost at the very last step.
    os.makedirs('results', exist_ok=True)
    plt.savefig('results/k_vs_accuaracy-{}'.format(
        time.strftime("%Y%m%d-%H%M%S")))
    plt.show()
    '''
Пример #4
0
def main(args):
    """Plot kNN accuracy against the third argument of ``get_instances``.

    Reads ``data/imdb_small.csv`` and keeps its first 6000 rows.  For every
    value in ``np.arange(args._from, args.to, args.step)`` the data is split
    with ``get_instances(df, 0.9, value)``, projected with ``PCA(50)`` and
    classified with ``KNNClassifier(500)``; the resulting accuracy is
    recorded.  The x axis is labelled ``min_df``, so the swept value is
    presumably a minimum document frequency -- confirm against
    ``get_instances``.

    The figure is saved under ``results/`` with a timestamped name and then
    displayed.

    Args:
        args: Object exposing the attributes ``_from``, ``to`` and ``step``.
    """
    import os  # local import: only needed to create the output directory

    # Train and test datasets.
    df = pd.read_csv("data/imdb_small.csv")

    x_poda_frec = np.arange(args._from, args.to, args.step)
    y_poda_frec = []

    df = df[:6000]

    # Fixed for the whole sweep.  Defined before the loop so they are still
    # available for the plot label when the sweep range is empty.
    k = 500
    alpha = 50

    for min_df in x_poda_frec:
        X_train, y_train, X_test, y_test = get_instances(df, 0.9, min_df)

        # PCA
        pca = PCA(alpha)
        pca.fit(X_train.toarray())
        X_train_aux = pca.transform(X_train)
        X_test_aux = pca.transform(X_test)

        # kNN on the PCA projections; fitting on the untransformed matrices
        # would silently discard the projection computed just above.
        clf = KNNClassifier(k)
        clf.fit(X_train_aux, y_train)
        y_pred = clf.predict(X_test_aux)

        y_poda_frec.append(accuracy_score(y_test, y_pred))

    # Start a fresh figure for this plot.
    plt.subplots()

    plt.plot(x_poda_frec, y_poda_frec,
             label='k = {}, alpha = {}'.format(k, alpha))
    plt.xlabel('min_df')
    plt.ylabel('accuaracy')
    plt.legend()

    # savefig does not create missing directories; make sure the target
    # exists so a long experiment is not lost at the very last step.
    os.makedirs('results', exist_ok=True)
    plt.savefig('results/min_df_accuaracy-{}'.format(
        time.strftime("%Y%m%d-%H%M%S")))
    plt.show()
Пример #5
0
        7: Criterion.eigenvalues
        #9: Criterion.eigenvalues,
        #11: Criterion.all
    }
    paths = {}
    for exp in exponentes:
        paths[exp] = []
        print("exponente: {}".format(exp))
        for rep in range(1):
            print("rep: {}".format(rep))

            out_path = "data/{}_{}_{}.out".format(out_name, exp, rep)
            paths[exp].append(out_path)

            eps = 10**(-exp)
            pca = PCA(alpha, eps)

            print("Entrenando PCA")
            t = time.clock()
            pca.fit(X_train_orig)

            print("Transformando datos")
            X_train = pca.transform(X_train_orig)
            X_test = pca.transform(X_test_orig)

            total_time = time.clock() - t
            print("time: {}".format(total_time))
            """
            Entrenamos KNN
            """
Пример #6
0
def run_test(df,
             TRAIN_SIZE=6225,
             TEST_SIZE=500,
             ALPHA=None,
             K=None,
             BINARIO=False,
             NEGACIONES=False,
             NORMA_PESADA=False,
             IDF=False,
             STOP_WORDS=False):
    """Run one kNN experiment and return its accuracy.

    The data is split with ``get_instances(df, TRAIN_SIZE, TEST_SIZE)`` and
    vectorised with ``vectorizar``.  When ``ALPHA`` is given, train and test
    matrices are projected with ``PCA(ALPHA)``.  A ``KNNClassifier(K)`` is
    fitted and used to predict the test labels, either with ``predict`` or,
    when ``NORMA_PESADA`` is true, with ``predict_weighted`` using the
    absolute correlation between each feature column and the training
    labels as weights.

    Args:
        df: Dataset handed to ``get_instances``.
        TRAIN_SIZE: Forwarded to ``get_instances``.
        TEST_SIZE: Forwarded to ``get_instances``.
        ALPHA: Passed to ``PCA`` and ``pca.transform``; ``None`` skips PCA.
        K: Passed to ``KNNClassifier``.
        BINARIO: Forwarded to ``vectorizar``.
        NEGACIONES: Forwarded to ``vectorizar``.
        NORMA_PESADA: Selects the correlation-weighted prediction.
        IDF: Forwarded to ``vectorizar``.
        STOP_WORDS: Forwarded to ``vectorizar``.

    Returns:
        The value of ``accuracy_score(y_test, y_pred)``.
    """
    print("--------------------------------------------------------")
    print("Test empezado con:")
    print("Train size:", TRAIN_SIZE)
    print("Test size:", TEST_SIZE)
    print("Alpha:", ALPHA)
    print("K:", K)
    print("Negaciones:", NEGACIONES)
    print("Binario:", BINARIO)
    print("Norma pesada:", NORMA_PESADA)
    print("Stop words:", STOP_WORDS)
    print("IDF:", IDF)
    text_train, label_train, text_test, label_test = get_instances(
        df, TRAIN_SIZE, TEST_SIZE)

    print("Vectorizando...")
    X_train, y_train, X_test, y_test = vectorizar(text_train, label_train,
                                                  text_test, label_test,
                                                  BINARIO, IDF, NEGACIONES,
                                                  STOP_WORDS)
    if ALPHA is not None:
        print("Obteniendo componentes principales...")
        pca = PCA(ALPHA)
        pca.fit(X_train.todense())
        X_train = pca.transform(X_train, ALPHA)
        X_test = pca.transform(X_test, ALPHA)

    clf = KNNClassifier(K)

    clf.fit(X_train, y_train)
    print("Prediciendo...")
    if not NORMA_PESADA:
        y_pred = clf.predict(X_test)
    else:
        # Densify at most once, based on what X_train actually is rather
        # than on the truthiness of ALPHA (which disagreed with the
        # ``is not None`` check above for ALPHA == 0).
        densa = X_train.todense() if hasattr(X_train, "todense") else X_train
        # Work on plain ndarrays: with numpy.matrix operands ``*`` is a
        # matrix product, not the element-wise product a covariance needs.
        features = np.asarray(densa)
        etiquetas = np.asarray(y_train).ravel()

        y_train_norm = etiquetas - etiquetas.mean()
        ystd = etiquetas.std()

        # Per-column sum of (x - mean(x)) * (y - mean(y)), then scaled by
        # std(x) * std(y), for all columns at once.
        centradas = features - features.mean(axis=0)
        covarianzas = y_train_norm.dot(centradas)
        correlaciones = covarianzas / (features.std(axis=0) * ystd)

        y_pred = clf.predict_weighted(X_test, np.abs(correlaciones))
    print("Test finalizado")
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy: {}".format(acc))
    return acc
Пример #7
0
    if len(sys.argv) != 3:
        print("Uso: python classify archivo_de_test archivo_salida")
        exit()

    test_path = sys.argv[1]
    out_path = sys.argv[2]

    df = pd.read_csv("data/imdb_small.csv")
    df_test = pd.read_csv(test_path)

    print("Vectorizando datos...")
    X_train, y_train, X_test, ids_test = get_instances(df, df_test)
    #Comentar esto si nuestra mejor configuración no usa PCA

    alpha = 450
    pca = PCA(alpha)
    print("Entrenando PCA")
    pca.fit(X_train.toarray())
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)
    """
    Entrenamos KNN
    """
    clf = KNNClassifier(2000)

    clf.fit(X_train, y_train)
    """
    Testeamos
    """
    print("Prediciendo etiquetas...")
    y_pred = clf.predict(X_test).reshape(-1)
    TRAIN_SIZE = 6225
    NEGACIONES = True
    BINARIO = True
    NORMA_PESADA = True
    STOP_WORDS = True
    IDF = True

    df_train = pd.read_csv("../data/imdb_small.csv")

    text_train = get_instances(df_train, TRAIN_SIZE)
    X_train = vectorizar(text_train, BINARIO, IDF, NEGACIONES,
                         STOP_WORDS).todense()

    if caso == "0":
        var_total = np.std(X_train, axis=1).sum()
        pca = PCA(alpha)
        pca.fit(X_train)
        X_train = pca.transform(X_train, alpha)
        var_con_pca = np.std(X_train, axis=1).sum()
    else:
        var_total = np.std(X_train, axis=1).sum()
        pca = PCA(X_train.shape[1])
        pca.fit(X_train)
        X_train = pca.transform(X_train, X_train.shape[1])
        var_parcial = np.std(X_train[:, 0])
        for i in range(1, X_train.shape[1]):
            if var_parcial / var_total > P:
                print(i)
                break
            var_parcial = var_parcial + np.std(X_train[:, i])
Пример #9
0
    test_path = sys.argv[1]
    out_path = sys.argv[2]

    df_train = pd.read_csv("../data/imdb_small.csv")
    df_test = pd.read_csv(test_path)

    print("Vectorizando datos...")
    text_train, label_train, text_test, ids_test = get_instances(
        df_train, df_test, TRAIN_SIZE)
    X_train, y_train, X_test = vectorizar(text_train, label_train, text_test,
                                          BINARIO, IDF, NEGACIONES, STOP_WORDS)

    if ALPHA != None:
        print("Obteniendo componentes principales...")
        pca = PCA(ALPHA)
        pca.fit(X_train.todense())
        X_train = pca.transform(X_train, ALPHA)
        X_test = pca.transform(X_test, ALPHA)

    clf = KNNClassifier(K)
    clf.fit(X_train, y_train)
    print("Prediciendo...")
    print(X_test.shape)
    if not NORMA_PESADA:
        y_pred = clf.predict(X_test)
    else:
        y_train_norm = y_train - y_train.mean()
        ystd = np.std(y_train)
        covarianzas = np.zeros(X_train.shape[1])
        correlaciones = np.zeros(X_train.shape[1])
    print("IDF:", IDF)

    df = pd.read_csv("../data/imdb_small.csv")
    df['label'] = (df['label'] == 'pos').astype('int')

    text_train, label_train, text_test, label_test = get_instances(
        df, TRAIN_SIZE, TEST_SIZE)

    print("Vectorizando...")
    X_train, y_train, X_test, y_test = vectorizar(text_train, label_train,
                                                  text_test, label_test,
                                                  BINARIO, IDF, NEGACIONES,
                                                  STOP_WORDS)

    print("Obteniendo componentes principales...")
    pca = PCA(1601)
    pca.fit(X_train.toarray())

    ALPHA = 800

    saltoALPHA = 100
    mAcc = []
    while ALPHA < 1601:
        print("ALPHA = ", ALPHA)

        X_train = pca.transform(X_train, ALPHA)
        X_test = pca.transform(X_test, ALPHA)
        clf = KNNClassifier(1)
        clf.fit(X_train, y_train)

        mat = []