Example No. 1
def task22Submission_LR(parameters, write=False, output_dir=None):
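    """Logistic-regression baseline for SemEval task 2.2: train on the dev split,
    predict sense ids for the test split, and return the prediction frame together
    with the per-row maximum class probability as a confidence score."""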
    print(parameters)

    dataset = 'test'
    df_out = task_to_df(22, dataset)
    txtfile = 'semeval_data/{}/task-2.2.txt'.format(dataset)
    method, _ = parameters
    train_df = task_to_df(22, 'dev')
    train_y = train_df['gold_sense_id']
    train_X = vectorize(method, task=22, dataset='dev')
    test_X = vectorize(method, task=22, dataset='test')

    clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000)
    clf.fit(train_X, train_y)
    T = clf.predict_proba(test_X)
    confidence = np.amax(T, axis=1)

    y_pred = clf.predict(test_X)
    df_out['predict_sense_id'] = y_pred
    # ---------------------------------------
    if write:
        # csvfile = '{}/task22_{}_#{}.csv'.format(output_dir, method, ('LogisticRegression'))
        # df_to_csv(test_df, csvfile)
        # print(method + ' csv file is written')
        task22_toTXT(txtfile, df_out, output_dir)
        print('Task-22 txt file is written')
    # ---------------------------------------
    return df_out, confidence
Example No. 2
def run(sub_task=1):
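    """Find the document most similar to the query. sub_task selects the setup:
    1 = Euclidean distance on count vectors, 2 = cosine distance,
    3 = cosine + Spanish stop words, 4 = + Spanish tokenizer, 5 = TF-IDF.
    Values outside 1-5 are not supported."""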
    documents = [
        "Éste texto no tiene nada que ver con los demás",
        "La plata fue entregada en camiones color plata",
        "El cargamento de oro llegó en un camión. El cargamento de oro llegó en un camión. El cargamento de oro llegó en un camión",
        "Cargamentos de oro dañados por el fuego",
        "El cargamento de oro llegó en un camión"
    ]
    
    query = ["oro plata camión"]
    if sub_task <= 2:
        text_vectorizer, text_vector = vectorizer.vectorize(documents)
        query_vector = text_vectorizer.transform(query)
        
        if sub_task == 1:
            distances = np.array([np.linalg.norm(text_vector[i].toarray() - query_vector.toarray()) for i in range(text_vector.shape[0])])
        elif sub_task == 2:
            distances = np.array([cosine_distance(text_vector[i].toarray()[0], query_vector.toarray()[0]) for i in range(text_vector.shape[0])])
    elif sub_task >= 3:
        if sub_task == 3:
            text_vectorizer, text_vector = vectorizer.vectorize(documents, stop_words=stopwords.spanish)
        elif sub_task == 4:
            text_vectorizer, text_vector = vectorizer.vectorize(documents, stop_words=stopwords.spanish, tokenizer=SpanishTokenizer())
        elif sub_task == 5:
            text_vectorizer, text_vector = vectorizer.tf_idf_vectorize(documents, stop_words=stopwords.spanish, tokenizer=SpanishTokenizer())
            
        query_vector = text_vectorizer.transform(query)
        
        distances = np.array([cosine_distance(text_vector[i].toarray()[0], query_vector.toarray()[0]) for i in range(text_vector.shape[0])])
    
    min_distance = np.argmin(distances)
    
    print("Documento mas parecido: {0}.\nDistancia: {1}\nTexto del documento:\n{2}".format(min_distance, np.amin(distances), documents[min_distance]))
Example No. 3
def get_data(mypath, out_path, overwrite=False):
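    """Load a cached bag-of-words matrix from out_path, or build it from the
    pos/neg review folders under mypath and cache it. Returns (x, y) with
    y = 1 for positive reviews."""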
    pos_reviews = sorted(os.listdir(mypath + "/pos"),
                         key=lambda x: int(x.split("_")[0]))
    neg_reviews = sorted(os.listdir(mypath + "/neg"),
                         key=lambda x: int(x.split("_")[0]))
    pos_length = len(pos_reviews)  ## 12500 for our purposes
    neg_length = len(neg_reviews)

    #check if npy file already exists, otherwise gather data
    if os.path.exists(out_path) and not overwrite:
        x = np.load(out_path)
        # `np.nan in row` never detects NaN (NaN != NaN), so check the array directly
        assert not np.isnan(x).any()
        assert not np.isinf(x).any()
        assert x.shape == (pos_length + neg_length, BAG_SIZE)
    else:
        x = np.zeros((pos_length + neg_length, BAG_SIZE), dtype=dtype)
        for i in tqdm(range(pos_length), desc='+', ncols=80):
            x[i] = vectorize(mypath + "/pos/" + pos_reviews[i], word_bag)
        for j in tqdm(range(neg_length), desc='-', ncols=80):
            x[len(pos_reviews) + j] = vectorize(
                mypath + "/neg/" + neg_reviews[j], word_bag)

        np.save(out_path, np.nan_to_num(x))

    y = np.zeros(pos_length + neg_length, dtype=dtype)
    y[:pos_length] = 1

    return x, y
Example No. 4
def test_vectorize(self):
    data = ["This is first sentence", "This is the second sentence"]
    vocab_size = 20000
    seq_len = 5
    tokenizer, vectors = vec.vectorize(vocab_size, seq_len, data)
    assert tokenizer is not None
    assert (vectors == [[0, 1, 2, 4, 3], [1, 2, 5, 6, 3]]).all()
Example No. 5
def task1Submission(parameters, write=False, output_dir=None):
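    """Cluster the task-1 test vectors with agglomerative clustering (optionally
    L2-normalized) and, if requested, write the submission txt file.
    Returns the prediction frame."""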
    print('\nTask-1 submission----------\n')
    print(parameters)

    dataset = 'test'
    df_out = task_to_df(1, dataset)
    txtfile = 'semeval_data/{}/task-1.txt'.format(dataset)

    method, norm, affinity, linkage, n_cluster = parameters

    clusterizer = AgglomerativeClustering(n_clusters=n_cluster,
                                          affinity=affinity,
                                          linkage=linkage)
    vecs = vectorize(method, task=1, dataset='test')

    if norm:
        method = method + '-norm'
        vecs = sklearn.preprocessing.normalize(vecs, norm='l2', axis=1)

    df_out['predict_sense_id'] = clusterizer.fit_predict(vecs)

    if write:
        # csvfile = '{}/task1_{}_#{}#{}#{}.csv'.format(output_dir, method, affinity, linkage, n_cluster)
        # df_to_csv(df_out, csvfile)
        # print(method + ' csv file is written')
        task1_toTXT(txtfile, df_out, output_dir)
        print('Task-1 txt file is written')

    return df_out
Example No. 6
def get_cloud(users, clusters_number=7, cluster_size=10):
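    """Stem and stop-word-filter every user's messages, vectorize them, and
    return the resulting clusters for the word cloud."""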
    stemmer = create_stemmer()
    stop_words = create_stop_words()
    vectorizer = create_vectorizer()
    processed_messages = []
    for user in users:
        processed_messages.extend(
            process_messages(user['messages'], stemmer, stop_words))
    vectors = vectorize(processed_messages, vectorizer)
    return get_clusters(vectors, vectorizer, clusters_number, cluster_size)
Example No. 7
def vectorize_data(data_path, save_path):
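    """Vectorize the `text` column of the CSV at data_path, persist the fitted
    tokenizer, and save the vectors joined with the remaining columns to save_path."""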
    data = pd.read_csv(data_path, encoding="utf-8")
    tokenizer, vectors = vectorize(
        MAX_VOCAB_SIZE, MAX_SEQUENCE_LENGTH, data["text"].values
    )
    if tokenizer is not None and vectors is not None:
        dump(tokenizer, os.path.join(OBJECTS_DIR, "tokenizer.joblib"))
        vec_df = pd.DataFrame(vectors)
        vec_df = pd.concat([vec_df, data.drop(columns=["text"])], axis=1)
        vec_df.to_csv(save_path, index=False, encoding="utf-8")
    else:
        logger.error("Error in vectorizing data!!!")
Example No. 8
def vectorize_data(data_path, processing_type):
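    """Vectorize the `text` column, persist the tokenizer, then do a stratified
    train/validation split on the `threat` column and save both splits.
    Returns True on success, False if vectorization failed."""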
    data = pd.read_csv(data_path, encoding="utf-8")
    tokenizer, vectors = vectorize(MAX_VOCAB_SIZE, MAX_SEQUENCE_LENGTH,
                                   data["text"].values)
    if tokenizer is not None and vectors is not None:
        dump(tokenizer, os.path.join(OBJECTS_DIR, "tokenizer.joblib"))
        vec_df = pd.DataFrame(vectors)
        vec_df = pd.concat([vec_df, data.drop(columns=["text"])], axis=1)
        train_vec_data, val_vec_data = stratified_split(vec_df,
                                                        split_col="threat")
        save_csv_data(
            train_vec_data,
            os.path.join(TRAIN_DATA_DIR_WI,
                         "train_vectors_{}.csv".format(processing_type)),
        )
        save_csv_data(
            val_vec_data,
            os.path.join(VAL_DATA_DIR_WI,
                         "val_vectors_{}.csv".format(processing_type)),
        )
    else:
        logger.error("Error in vectorizing data!!!")
        return False
    return True
Example No. 9
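# Build (or load from cache) a bag-of-words matrix for 12,500 positive and
# 12,500 negative training reviews, plus the matching label vector.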
mypath = "../task1/train"
pos_reviews = list(map(lambda x: "{0}.txt".format(x + 1), range(12500)))
neg_reviews = list(map(lambda x: "{0}.txt".format(x + 1), range(12500)))

#check if npy file already exists, otherwise gather data
if (os.path.exists("x_train_small.npy")):
    x = np.load("x_train_small.npy")
    for row in x:
        assert np.nan not in row
        assert np.inf not in row
    # print(np.where(x >= np.finfo(np.float64).max))
    # exit(0)
else:
    x = np.zeros((len(pos_reviews) + len(neg_reviews), BAG_SIZE), dtype=dtype)
    for i in tqdm(range(len(pos_reviews)), desc='+', ncols=80):
        x[i] = vectorize(mypath + "/positive" + "/" + pos_reviews[i], word_bag)
    for j in tqdm(range(len(neg_reviews)), desc='-', ncols=80):
        x[len(pos_reviews) + j] = vectorize(
            mypath + "/negative" + "/" + neg_reviews[j], word_bag)
    # clean NaNs and cache the freshly built matrix
    x = np.nan_to_num(x)
    np.save("x_train_small.npy", x)

y = np.zeros(len(pos_reviews) + len(neg_reviews), dtype=dtype)
y[:len(pos_reviews)] = 1
np.save("x_train_small.npy", np.nan_to_num(x))

# X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
# clf.fit(x, y)
# labs = clf.predict(X_test)
# print("accuracy of MLP: %i", (sum(np.equal(labs, y_test))/len(y_test)))
Example No. 10
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import BaggingRegressor
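
# Feature engineering for a quote/pricing dataset: expand quote_date into
# year/month, add a sqrt-log-transformed quantity, and encode the supplier
# column with the project's vectorize() helper.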


mat = pd.read_csv('data.csv', parse_dates=['quote_date'])

#mat.drop('tube_assembly_id',inplace=True,axis=1)
mat['year'] = mat['quote_date'].apply(lambda row: row.year)
mat['month'] = mat['quote_date'].apply(lambda row: row.month)
mat.drop('quote_date', inplace=True, axis=1)
mat['quantity_tr'] = np.sqrt(np.log(mat['quantity']))



cols = ['supplier']
vectorSpec = vectorize(mat, cols, 0.001)
mat = pd.concat([mat, vectorSpec], axis=1)
mat.drop(cols, axis=1, inplace=True)

colRemoved = ['cost', 'id']
cols = mat.drop(colRemoved, axis=1).columns
mat[cols] = mat[cols].fillna(0)
#mat[cols]=np.nan_to_num(np.array(mat[cols]))

"""
for c in cols:
    try:
        mat[c]=pd.cut(mat[c],bins=128,labels=False)
    except:
        print c
        
Example No. 11
# Tail of a commented-out block in the source file (left disabled):
# mat = mat.merge(bomaggf, how='left', left_on='tube_assembly_id', right_on='tube_assembly_id')

# SPECS
specs = pd.read_csv('competition_data/specs.csv')

specs = specs.set_index('tube_assembly_id')
specs = specs.stack()
specs = specs.reset_index()
specs.drop('level_1', inplace=True, axis=1)
specs.columns = ['tube_assembly_id', 'specs']

nspecs = specs.groupby('tube_assembly_id').count().reset_index()

cols = ['specs']
vectorSpec = vectorize(specs, cols, 0.001)
specs = pd.concat([specs, vectorSpec], axis=1)
specs.drop(cols, axis=1, inplace=True)

specs = specs.groupby('tube_assembly_id').sum()
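# Approximate an RBF kernel feature map over the per-assembly spec indicators
# and keep n_comp components as new 'specs_pca*' columns.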

n_comp = 3
kern = scale(np.array(specs, dtype=float))  # the np.float alias was removed in NumPy 1.20+
kpca = RBFSampler(n_components=n_comp, gamma=0.3)
kern = kpca.fit_transform(kern)
specs.drop(specs.columns, axis=1, inplace=True)
cols = ['specs_pca' + str(i) for i in range(n_comp)]
for c in cols:
    specs[c] = np.zeros(len(specs))

specs[cols] = kern
Example No. 12
def prediction(vectorized_data):
    # NOTE: the snippet begins mid-function; the signature is inferred from the
    # call `prediction(vectorized_data)` at the bottom of this example.
    target_column = vectorized_data['Number of Votes']
    predictor_columns = vectorized_data.drop('Number of Votes', axis=1)
    vector_columns = vectorized_data[vector_headers]
    
    
    # reindex returns a new frame, so assign it back for the shuffle to take effect
    vectorized_data = vectorized_data.reindex(np.random.permutation(vectorized_data.index))
    NUM_ROWS = vectorized_data.shape[0]
    NUM_TEST = int(NUM_ROWS*.15)
    
    # keep all columns here; the feature columns are selected via vector_headers
    # when fitting and predicting below
    train_data = vectorized_data[NUM_TEST:]
    train_target = train_data['Number of Votes']

    test_data = vectorized_data[:NUM_TEST]
    test_target = test_data['Number of Votes']
    
    #(train_data, test_data, train_target, test_target) =  ms.train_test_split(predictor_columns, target_column, test_size = 0.15)    
    
    classifier = RandomForestClassifier(n_estimators=10)
    classifier = classifier.fit(train_data[vector_headers], train_target)
    results = classifier.predict(test_data[vector_headers])
    
    output = pd.DataFrame(data={"Candidate":test_data['Candidate'], "County":test_data['County'], "Estimated Votes":results, "Actual Votes":test_target})    
    return output
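
# Pipeline entry point: clean the raw tweets, vectorize them, and print the predictions.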

cleaned_tweets = clean()
vectorized_data = vectorize(cleaned_tweets)
results = prediction(vectorized_data)
print(results)
Example No. 13
#imports
import pandas as pd
from vectorizer import fit_vectorizer, vectorize, fit_scaler, scale

#load data
train = pd.read_csv('../../data/train.csv', names=['label', 'title', 'review'])

#only run this part once!
#fit_vectorizer(train.review)
fit_scaler('bin/x_train_vect.sav')

#may take a while
x_train_filename = 'bin/x_train_scaled.sav'
vectorize(train.review, x_train_filename)

#should go faster
test = pd.read_csv('../../data/test.csv', names=['label', 'title', 'review'])
x_test_filename = 'bin/x_test_scaled.sav'
vectorize(test.review, x_test_filename)