Example #1
0
def main(args):
    # train and test datasets
    df = pd.read_csv("data/imdb_small.csv")

    # trim the dataset for quick small-scale runs
    if args.elem > 0:
        df = df[:args.elem]

    X_train, y_train, X_test, y_test = get_instances(df)

    alpha = args.alpha_Start

    # range of k values
    k_range = np.arange(args.k_Start, args.k_Stop, args.k_Step)

    # range of alpha values
    alpha_range = np.arange(alpha, args.alpha_Stop, args.alpha_Step)

    # accuracy results for each k
    results = [[] for _ in k_range]
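    # For each alpha, project the data onto the first alpha principal
    # components and evaluate kNN accuracy for every k in k_range.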

    for alpha in alpha_range:
        pca = PCA(alpha)
        pca.fit(X_train.toarray())
        X_train_aux = pca.transform(X_train)
        X_test_aux = pca.transform(X_test)

        # kNN
        for i in range(len(k_range)):
            k = k_range[i]

            clf = KNNClassifier(k)
            clf.fit(X_train_aux, y_train)
            y_pred = clf.predict(X_test_aux)
            t = accuracy_score(y_test, y_pred)

            results[i].append(t)

    for i, result in enumerate(results):
        plt.plot(alpha_range, result, label='k = {0}'.format(k_range[i]))

    plt.xlabel('alpha')
    plt.ylabel('accuracy')
    plt.legend()
    plt.savefig('results/k_vs_accuracy-{}'.format(
        time.strftime("%Y%m%d-%H%M%S")))
    plt.show()
Example #2
0
def evaluate_knn(k_range, X_train, y_train, X_test, y_test, reps):
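    # For each k in k_range, fit and evaluate kNN `reps` times and return the
    # mean test accuracy per k.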
    accs = []

    # kNN
    for k in k_range:
        t = 0

        for i in range(reps):
            clf = KNNClassifier(k)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            t += accuracy_score(y_test, y_pred)

        accs.append(t / reps)

    return accs
Example #3
0
def run_test(df,
             TRAIN_SIZE=6225,
             TEST_SIZE=500,
             BINARIO=False,
             NEGACIONES=False,
             NORMA_PESADA=False,
             IDF=False,
             STOP_WORDS=False):
    print("--------------------------------------------------------")
    print("Test empezado con:")
    print("Train size:", TRAIN_SIZE)
    print("Test size:", TEST_SIZE)
    print("Negaciones:", NEGACIONES)
    print("Binario:", BINARIO)
    print("Norma pesada:", NORMA_PESADA)
    print("Stop words:", STOP_WORDS)
    print("IDF:", IDF)

    text_train, label_train, text_test, label_test = get_instances(
        df, TRAIN_SIZE, TEST_SIZE)
    X_train, y_train, X_test, y_test = vectorizar(text_train, label_train,
                                                  text_test, label_test,
                                                  BINARIO, IDF, NEGACIONES,
                                                  STOP_WORDS)

    X_train = X_train.todense()
    var_total = np.std(X_train, axis=1).sum()

    pca = PCA(1500)
    pca.fit(X_train)
    X_train = pca.transform(X_train, 1500)
    X_train = tomar_porcentaje(X_train, 0.03, var_total)  # TODO: adjust
    X_test = pca.transform(X_test, X_train.shape[1])
    print("ALPHA =", X_train.shape[1])
    clf = KNNClassifier(1)
    clf.fit(X_train, y_train)

    mat = []
    if not NORMA_PESADA:
        mat = clf.testearK(X_test)
    else:
        y_train_norm = y_train - y_train.mean()
        ystd = np.std(y_train)
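        # Weight each feature by its covariance with the training labels;
        # these weights feed the weighted-norm kNN variant (testearK_weighted).
        # Note: correlaciones is computed here but only covarianzas is passed on.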

        covarianzas = np.zeros(X_train.shape[1])
        correlaciones = np.zeros(X_train.shape[1])
        for i in range(X_train.shape[1]):
            covarianzas[i] = ((X_train[:, i] - X_train[:, i].mean()) *
                              y_train_norm).sum()
            correlaciones[i] = covarianzas[i] / (np.std(X_train[:, i]) * ystd)
        mat = clf.testearK_weighted(X_test, covarianzas)

    vAcc = []
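    # accuracy of each column of predictions in mat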
    for i in range(len(mat[0])):
        a = mat[:, i]
        acc = accuracy_score(y_test, a)
        vAcc.append(acc)

    return vAcc
Example #4
0
def main(args):
    # train and test datasets
    df = pd.read_csv("data/imdb_small.csv")

    x_poda_frec = np.arange(args._from, args.to, args.step)
    y_poda_frec = []

    df = df[:6000]
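    # sweep the min_df pruning threshold and record kNN accuracy for each value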

    for i in x_poda_frec:
        X_train, y_train, X_test, y_test = get_instances(df, 0.9, i)

        k = 500
        alpha = 50

        # PCA
        pca = PCA(alpha)
        pca.fit(X_train.toarray())
        X_train_aux = pca.transform(X_train)
        X_test_aux = pca.transform(X_test)

        # kNN
        clf = KNNClassifier(k)
        clf.fit(X_train_aux, y_train)
        y_pred = clf.predict(X_test_aux)
        acc = accuracy_score(y_test, y_pred)

        y_poda_frec.append(acc)

    fig, ax1 = plt.subplots()

    plt.plot(x_poda_frec, y_poda_frec, label='k = {}, alpha = {}'.format(k, alpha))
    plt.xlabel('min_df')
    plt.ylabel('accuracy')
    plt.legend()
    plt.savefig('results/min_df_accuracy-{}'.format(
        time.strftime("%Y%m%d-%H%M%S")))
    plt.show()
Example #5
0
    if len(sys.argv) != 3:
        print("Usage: python classify test_file output_file")
        exit()

    test_path = sys.argv[1]
    out_path = sys.argv[2]

    df = pd.read_csv("data/imdb_small.csv")
    df_test = pd.read_csv(test_path)

    print("Vectorizando datos...")
    X_train, y_train, X_test, ids_test = get_instances(df, df_test)
    """
    Entrenamos KNN
    """
    clf = KNNClassifier(1120)

    clf.fit(X_train, y_train)
    """
    Testeamos
    """
    print("Prediciendo etiquetas...")
    y_pred = clf.predict(X_test).reshape(-1)

    labels = ['pos' if val == 1 else 'neg' for val in y_pred]

    df_out = pd.DataFrame({"id": ids_test, "label": labels})

    df_out.to_csv(out_path, index=False)

    print("Salida guardada en {}".format(out_path))
Example #6
0
def run_test(df,
             TRAIN_SIZE=6225,
             TEST_SIZE=500,
             ALPHA=None,
             K=None,
             BINARIO=False,
             NEGACIONES=False,
             NORMA_PESADA=False,
             IDF=False,
             STOP_WORDS=False):
    print("--------------------------------------------------------")
    print("Test empezado con:")
    print("Train size:", TRAIN_SIZE)
    print("Test size:", TEST_SIZE)
    print("Alpha:", ALPHA)
    print("K:", K)
    print("Negaciones:", NEGACIONES)
    print("Binario:", BINARIO)
    print("Norma pesada:", NORMA_PESADA)
    print("Stop words:", STOP_WORDS)
    print("IDF:", IDF)
    text_train, label_train, text_test, label_test = get_instances(
        df, TRAIN_SIZE, TEST_SIZE)

    print("Vectorizando...")
    X_train, y_train, X_test, y_test = vectorizar(text_train, label_train,
                                                  text_test, label_test,
                                                  BINARIO, IDF, NEGACIONES,
                                                  STOP_WORDS)
    if ALPHA is not None:
        print("Computing principal components...")
        pca = PCA(ALPHA)
        pca.fit(X_train.todense())
        X_train = pca.transform(X_train, ALPHA)
        X_test = pca.transform(X_test, ALPHA)

    clf = KNNClassifier(K)

    clf.fit(X_train, y_train)
    print("Prediciendo...")
    if not NORMA_PESADA:
        y_pred = clf.predict(X_test)
    else:
        y_train_norm = y_train - y_train.mean()
        ystd = np.std(y_train)
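        # Weight each feature by the absolute value of its correlation with the
        # training labels and pass the weights to predict_weighted (the
        # weighted-norm variant). When PCA was skipped, X_train is still sparse
        # and is densified column by column below.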

        covarianzas = np.zeros(X_train.shape[1])
        correlaciones = np.zeros(X_train.shape[1])
        for i in range(X_train.shape[1]):
            if ALPHA:
                covarianzas[i] = ((X_train[:, i] - X_train[:, i].mean()) *
                                  y_train_norm).sum()
                correlaciones[i] = covarianzas[i] / (np.std(X_train[:, i]) *
                                                     ystd)
            else:
                covarianzas[i] = ((
                    (X_train.todense())[:, i] - X_train[:, i].mean()) *
                                  y_train_norm).sum()
                correlaciones[i] = covarianzas[i] / (np.std(
                    (X_train.todense())[:, i]) * ystd)
        y_pred = clf.predict_weighted(X_test, np.abs(correlaciones))
    print("Test finalizado")
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy: {}".format(acc))
    return acc
Example #7
0
    df_test = pd.read_csv(test_path)

    print("Vectorizando datos...")
    text_train, label_train, text_test, ids_test = get_instances(
        df_train, df_test, TRAIN_SIZE)
    X_train, y_train, X_test = vectorizar(text_train, label_train, text_test,
                                          BINARIO, IDF, NEGACIONES, STOP_WORDS)

    if ALPHA is not None:
        print("Computing principal components...")
        pca = PCA(ALPHA)
        pca.fit(X_train.todense())
        X_train = pca.transform(X_train, ALPHA)
        X_test = pca.transform(X_test, ALPHA)

    clf = KNNClassifier(K)
    clf.fit(X_train, y_train)
    print("Prediciendo...")
    print(X_test.shape)
    if not NORMA_PESADA:
        y_pred = clf.predict(X_test)
    else:
        y_train_norm = y_train - y_train.mean()
        ystd = np.std(y_train)
        covarianzas = np.zeros(X_train.shape[1])
        correlaciones = np.zeros(X_train.shape[1])
        for i in range(X_train.shape[1]):
            if ALPHA:
                covarianzas[i] = ((X_train[:, i] - X_train[:, i].mean()) *
                                  y_train_norm).sum()
                correlaciones[i] = covarianzas[i] / (np.std(X_train[:, i]) *
Example #8
0
# range of alpha values
alpha_range = np.arange(50, 150, 30)

y_alpha = []
y_time = []

for alpha in alpha_range:
  t0 = time.time()
  # PCA
  pca = PCA(alpha)
  pca.fit(X_train.toarray())
  X_train_aux = pca.transform(X_train)
  X_test_aux = pca.transform(X_test)

  # kNN
  clf = KNNClassifier(k)
  clf.fit(X_train_aux, y_train)
  y_pred = clf.predict(X_test_aux)
  acc = accuracy_score(y_test, y_pred)

  y_alpha.append(acc)

fig, ax1 = plt.subplots()

color = 'tab:red'
ax1.set_xlabel('Alpha')
ax1.set_ylabel('Accuracy', color=color)
ax1.plot(alpha_range, y_alpha, color=color)
ax1.tick_params(axis='y', labelcolor=color)
plt.xticks(rotation=90)
ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
Example #9
0
                                                  STOP_WORDS)

    print("Obteniendo componentes principales...")
    pca = PCA(1601)
    pca.fit(X_train.toarray())

    ALPHA = 800

    saltoALPHA = 100
    mAcc = []
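    # Sweep the number of principal components from 800 up to 1600 in steps of
    # saltoALPHA.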
    while ALPHA < 1601:
        print("ALPHA = ", ALPHA)

        X_train = pca.transform(X_train, ALPHA)
        X_test = pca.transform(X_test, ALPHA)
        clf = KNNClassifier(1)
        clf.fit(X_train, y_train)

        mat = []
        if not NORMA_PESADA:
            mat = clf.testearK(X_test)
        else:
            y_train_norm = y_train - y_train.mean()
            ystd = np.std(y_train)
            print(X_train.shape)

            covarianzas = np.zeros(ALPHA)
            correlaciones = np.zeros(ALPHA)
            for i in range(ALPHA):
                covarianzas[i] = ((X_train[:, i] - X_train[:, i].mean()) *
                                  y_train_norm).sum()
Example #10
0
logging.info('Vectorizing the data')
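# Bag-of-words features: ignore terms present in more than 90% or fewer than
# 1% of the documents, and cap the vocabulary at 5000 terms.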
vectorizer = CountVectorizer(max_df=0.90, min_df=0.01, max_features=5000)
vectorizer.fit(text_train)
X_train, y_train = vectorizer.transform(text_train), (
    label_train == 'pos').values
X_test, y_test = vectorizer.transform(text_test), (label_test == 'pos').values
#  X_train = X_train.todense()
#  X_test = X_test.todense()

# ------------------------------------------------------------------------------
# Training
# ------------------------------------------------------------------------------

logging.info(f'Training the classifier (K={K}).')
time_start = process_time()
clf = KNNClassifier(K)
clf.fit(X_train, y_train)
time_finish = process_time()
logging.info(f'Classifier trained in {time_finish - time_start:.4f}s')

# ------------------------------------------------------------------------------
# Measuring prediction time
# ------------------------------------------------------------------------------

if N_test == 0:
    N_test = X_test.shape[0]

logging.info(f'Timing prediction for {N_test} test elements.')
time_start = process_time()
clf.predict(X_test[:N_test])
time_finish = process_time()
    df = pd.read_csv("../data/imdb_small.csv")
    df['label'] = (df['label'] == 'pos').astype('int')

    TOTAL_TRAIN = 6000
    text_train, label_train, text_test, label_test = get_instances(
        df, TOTAL_TRAIN, TEST_SIZE)

    print("Vectorizing...")
    X_train, y_train, X_test, y_test = vectorizar(text_train, label_train,
                                                  text_test, label_test,
                                                  BINARIO, IDF, NEGACIONES,
                                                  STOP_WORDS)

    TRAIN = 1000
    saltoTRAIN = 1000
    mAcc = []
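    # Grow the training set in steps of saltoTRAIN; for each size, score every
    # column of predictions returned by testearK against y_test.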
    while TRAIN <= 6000:
        print(TRAIN)
        clf = KNNClassifier(1)
        clf.fit(X_train[:TRAIN], y_train[:TRAIN])
        mat = clf.testearK(X_test)
        vAcc = []
        for i in range(len(mat[0])):
            a = mat[:, i]
            acc = accuracy_score(y_test, a)
            vAcc.append(acc)

        mAcc.append(vAcc)

        TRAIN += saltoTRAIN

    with open("resultados_exp5.pkl", "wb") as fout:
        pickle.dump(mAcc, fout)