import click
from termcolor import colored  # plain colour names; the original mixed termcolor with colorama's Fore constants

# `data` and `model` below are the project's preprocessing and modelling
# modules, imported at the top of the original file (not shown here).


def dialogue():
    """Interactive CLI that walks the user through the available recommenders."""
    click.echo('Welcome to AutoRec! Let me help you with the product recommendations.')
    name = click.prompt(colored('Enter your name', 'magenta'))
    csv = click.prompt(colored('Enter the path for your sample data', 'magenta'))
    click.clear()

    data_subset = data.get_data(csv)
    click.echo('Data being preprocessed')

    # Run each recommender only when the user answers 'Y'. The original code
    # passed the raw 'Y'/'N' answer to model.all_models() instead of a model name.
    if click.prompt(colored('Do you want to recommend items for users to purchase? [Y/N]', 'magenta'),
                    type=click.Choice(['Y', 'N'])) == 'Y':
        click.clear()
        results_user = model.all_models(csv, 'user_user')
        click.echo('Data being modeled')

    if click.prompt(colored('Do you want to recommend similar items for users to purchase? [Y/N]', 'magenta'),
                    type=click.Choice(['Y', 'N'])) == 'Y':
        click.clear()
        results_item = model.all_models(csv, 'item_item')

    if click.prompt(colored('Recommendations using matrix factorization? [Y/N]', 'magenta'),
                    type=click.Choice(['Y', 'N'])) == 'Y':
        results_matrix = model.all_models(csv, 'matrix')
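# --- A minimal sketch of how dialogue() might be wired up as a console entry
# point. The @click.command wrapper and the __main__ guard are assumptions; the
# original file only shows the dialogue() function itself.
@click.command()
def cli():
    dialogue()


if __name__ == '__main__':
    cli()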
import os
import sys

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from data_preprocessing import get_data
from model import CNN

BATCH = 128

if __name__ == "__main__":
    print("\nLoading data...", flush=True)
    # sys.argv[1]: data directory, sys.argv[2]: path to the trained weights.
    data_set = get_data(os.path.join(sys.argv[1], "validation"), range(3430))
    data_loader = DataLoader(data_set, batch_size=BATCH, shuffle=False)

    print("\nLoading model...", flush=True)
    model = CNN().cuda()
    model.load_state_dict(torch.load(sys.argv[2]))

    print("\nStart predicting...", flush=True)
    model.eval()
    loss = nn.CrossEntropyLoss()  # softmax + cross entropy
    y = []
    pred_y = []
    with torch.no_grad():
        for i, data in enumerate(data_loader):
            pred_y += np.argmax(model(data[0].cuda()).cpu().data.numpy(), axis=1).tolist()
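    # --- The confusion_matrix / seaborn imports above are unused in the fragment
    # shown, which suggests an evaluation plot follows. A minimal sketch, assuming
    # the loop above also collects the ground-truth labels, e.g.
    #     y += data[1].numpy().tolist()
    cm = confusion_matrix(y, pred_y)
    sns.heatmap(cm, cmap="Blues")
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.savefig("confusion_matrix.png")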
    obj.backward()
    optimizer.step()

    # Record x after the gradient-ascent iterations; detach it from the graph
    # (gradients discarded) and squeeze out the dimensions equal to 1.
    filter_visualization = x.detach().cpu().squeeze()

    # The hook must be removed, otherwise it keeps firing on every forward pass
    # through that conv layer.
    hook_handle.remove()

    return filter_activations, filter_visualization


if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print("\nLoading data...", flush=True)
    train_set = get_data(os.path.join(sys.argv[1], "training"), IMG_ID)
    x = []
    y = []
    for i in range(len(IMG_ID)):
        x_tmp, y_tmp = train_set.__getitem__(i)
        x.append(x_tmp)
        y.append(y_tmp)

    print("\nLoading model...", flush=True)
    model = CNN().to(device)
    model.load_state_dict(torch.load(sys.argv[2]))

    print("\nStart explaining...", flush=True)
    for cf in [[0, 25], [0, 55], [4, 64], [4, 120], [8, 45], [8, 85]]:  # presumably [conv layer id, filter id]
        print("\n{}".format(cf[1]))
        filter_activations, filter_visualization = filter_explaination(
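# --- A self-contained sketch of what the front half of filter_explaination()
# presumably does, inferred from the tail shown above: a forward hook captures
# one conv layer's activations, then gradient ascent on the input produces an
# image that maximally excites the chosen filter. The `model.cnn[cnnid]`
# indexing, iteration count and learning rate are assumptions, not the
# author's exact settings.
def filter_explaination_sketch(x, model, cnnid, filterid, iteration=100, lr=1.0):
    model.eval()
    layer_activations = None

    def hook(module, inputs, outputs):
        nonlocal layer_activations
        layer_activations = outputs

    # Capture the activations of the chosen conv layer during forward passes.
    hook_handle = model.cnn[cnnid].register_forward_hook(hook)

    # 1) Activations of the chosen filter for the given images.
    model(x.cuda())
    filter_activations = layer_activations[:, filterid, :, :].detach().cpu()

    # 2) Gradient ascent on the input image.
    x = x.cuda()
    x.requires_grad_()
    optimizer = torch.optim.Adam([x], lr=lr)
    for _ in range(iteration):
        optimizer.zero_grad()
        model(x)
        obj = -layer_activations[:, filterid, :, :].sum()  # maximise the activation
        obj.backward()
        optimizer.step()

    filter_visualization = x.detach().cpu().squeeze()
    hook_handle.remove()
    return filter_activations, filter_visualization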
def all_models(_file_path, modelname):
    data = ds.get_data(_file_path)
    data_surprise = data[['customer_id', 'product_id', 'star_rating']]. \
        rename(columns={'customer_id': 'userID',
                        'product_id': 'itemID',
                        'star_rating': 'rating'})
    reader = Reader(rating_scale=(1.0, 5.0))
    df_loaded = Dataset.load_from_df(data_surprise, reader)
    results_list = []

    # dataset-level features
    reviews = data.shape[0]
    n_users = data.customer_id.nunique()
    n_products = data.product_id.nunique()
    mean_rating = data.star_rating.mean()
    rating_std = data.star_rating.std()
    sparsity = reviews * 100 / (n_users * n_products)

    # Cross-validate the selected algorithm on RMSE / MAE.
    results = model_selection.cross_validate(
        select_model(df_loaded, model_selection=modelname), df_loaded,
        measures=['RMSE', 'MAE'], cv=5, verbose=False)

    # Single hold-out split for the ranking metrics and the recommendations.
    trainset, testset = train_test_split(df_loaded, test_size=.25)
    algo = select_model(df_loaded, model_selection=modelname)
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Top-30 recommendations per user.
    top_n = rec.get_top_n(predictions, n=30)
    print('Recommendations for the user')
    dfo = pd.DataFrame(columns=['UserId', 'ItemId'])
    for i, (uid, user_ratings) in enumerate(top_n.items()):
        dfo.loc[i] = [uid, user_ratings]

    precisions, recalls = metrics.precision_recall_at_k(predictions, k=5, threshold=4)
    merge = dfo.merge(data, left_on='UserId', right_on='customer_id')
    print(merge[['UserId', 'product_title']])

    # Only one split is used here, so precision/recall are reported as-is
    # (the original divided by 5 as if averaging over five folds).
    map_k, mar_k = precisions, recalls

    # Collect results and metadata for this algorithm.
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series(map_k, index=['map_k']))
    tmp = tmp.append(pd.Series(mar_k, index=['mar_k']))
    tmp = tmp.append(pd.Series([str(_file_path)], index=['data']))
    tmp = tmp.append(pd.Series([str(modelname)], index=['Algorithm']))
    tmp = tmp.append(pd.Series(reviews, index=['reviews']))
    tmp = tmp.append(pd.Series(n_users, index=['n_users']))
    tmp = tmp.append(pd.Series(n_products, index=['n_products']))
    tmp = tmp.append(pd.Series(mean_rating, index=['mean_rating']))
    tmp = tmp.append(pd.Series(rating_std, index=['std_rating']))
    tmp = tmp.append(pd.Series(sparsity, index=['sparsity']))
    results_list.append(tmp)

    results_df = pd.DataFrame(results_list)
    return results_df
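# --- select_model() is referenced above but not shown. A minimal sketch of what
# it presumably does, assuming the model names map to standard Surprise
# algorithms (KNNBasic for the neighbourhood models, SVD for matrix
# factorisation); the similarity options are illustrative, not the author's.
from surprise import KNNBasic, SVD


def select_model_sketch(df_loaded, model_selection='user_user'):
    # df_loaded is accepted only to mirror the call sites above.
    if model_selection == 'user_user':
        return KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
    if model_selection == 'item_item':
        return KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
    # 'matrix' / 'matrix_fact'
    return SVD()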
def all_models(_file_path, _save_path):
    data = ds.get_data(_file_path, 'data/data_subset.csv', 0.99)
    data_surprise = data[['customer_id', 'product_id', 'star_rating']]. \
        rename(columns={'customer_id': 'userID',
                        'product_id': 'itemID',
                        'star_rating': 'rating'})
    reader = Reader(rating_scale=(1.0, 5.0))
    df_loaded = Dataset.load_from_df(data_surprise, reader)
    results_list = []

    # features
    reviews = data.shape[0]
    n_users = data.customer_id.nunique()
    n_products = data.product_id.nunique()
    mean_rating = data.star_rating.mean()
    rating_std = data.star_rating.std()
    sparsity = reviews * 100 / (n_users * n_products)

    for model in ['user_user', 'item_item', 'matrix_fact']:
        # Perform cross validation
        results = model_selection.cross_validate(
            select_model(df_loaded, model_selection=model), df_loaded,
            measures=['RMSE', 'MAE'], cv=5, verbose=False)

        # Precision and recall are added over all the splits, then averaged below
        kf = KFold(n_splits=5)
        map_k, mar_k = 0, 0
        algo = select_model(df_loaded, model_selection=model)
        for trainset, testset in kf.split(df_loaded):
            algo.fit(trainset)
            predictions = algo.test(testset)
            precisions, recalls = metrics.precision_recall_at_k(predictions, k=5, threshold=4)
            map_k += precisions
            mar_k += recalls

        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = tmp.append(pd.Series(map_k / 5, index=['map_k']))
        tmp = tmp.append(pd.Series(mar_k / 5, index=['mar_k']))
        tmp = tmp.append(pd.Series([str(_file_path)], index=['data']))
        tmp = tmp.append(pd.Series([str(model)], index=['Algorithm']))
        # features
        tmp = tmp.append(pd.Series(reviews, index=['reviews']))
        tmp = tmp.append(pd.Series(n_users, index=['n_users']))
        tmp = tmp.append(pd.Series(n_products, index=['n_products']))
        tmp = tmp.append(pd.Series(mean_rating, index=['mean_rating']))
        tmp = tmp.append(pd.Series(rating_std, index=['std_rating']))
        tmp = tmp.append(pd.Series(sparsity, index=['sparsity']))
        results_list.append(tmp)

    print(results_list)
    results_df = pd.DataFrame(results_list)

    # saving the results file to folder
    if _save_path:
        results_df.to_csv(_save_path, mode='a', index=False)
    return results_df
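# --- A minimal usage sketch. 'sample_us.tsv' is the dataset name used elsewhere
# in the project; 'results/benchmark.csv' is an illustrative output path, not a
# file shipped with the repository.
if __name__ == '__main__':
    results_df = all_models('sample_us.tsv', 'results/benchmark.csv')
    print(results_df[['Algorithm', 'test_rmse', 'test_mae', 'map_k', 'mar_k']])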
import os
import sys

# Rotate sys.path so the script's own directory is searched last
# (keeps locally named modules from shadowing installed packages).
sys.path = sys.path[1:] + [sys.path[0]]

import numpy as np
import torch
import shap
import matplotlib.pyplot as plt

from data_preprocessing import get_data
from model import CNN

TOTAL_ID = range(9866)
B_SIZE = 32                         # background batch for the explainer
IMG_ID = [7741, 7096, 7290, 1658]   # images to explain

if __name__ == "__main__":
    index = np.random.permutation(TOTAL_ID)

    print("\nLoading data...", flush=True)
    train_set = get_data(os.path.join(sys.argv[1], "training"),
                         index[:B_SIZE].tolist() + IMG_ID)
    x = []
    y = []
    for i in range(B_SIZE + len(IMG_ID)):
        x_tmp, y_tmp = train_set.__getitem__(i)
        x.append(x_tmp)
        y.append(y_tmp)
    x = torch.stack(x).cuda()

    print("\nLoading model...", flush=True)
    model = CNN().cuda()
    model.load_state_dict(torch.load(sys.argv[2]))

    print("\nComputing...", flush=True)
    # Random images form the background distribution, the listed images are explained.
    e = shap.DeepExplainer(model, x[:B_SIZE])
    shap_values = e.shap_values(x[B_SIZE:])
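    # --- A minimal sketch of visualising the attributions; shap.image_plot()
    # expects channel-last arrays, so both the SHAP values (one array per class)
    # and the explained images are transposed from NCHW to NHWC first.
    shap_numpy = [np.transpose(s, (0, 2, 3, 1)) for s in shap_values]
    test_numpy = np.transpose(x[B_SIZE:].cpu().numpy(), (0, 2, 3, 1))
    shap.image_plot(shap_numpy, test_numpy, show=False)
    plt.savefig("shap_explanations.png")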
import os
import sys

import numpy as np
import torch
from skimage.segmentation import slic

from data_preprocessing import get_data
from model import CNN

# IMG_ID (the indices of the images to explain) is defined at the top of the
# original file; it is not part of this fragment.


def predict(inputs):
    # Convert numpy (b, h, w, c) images to the PyTorch (b, c, h, w) layout.
    model.eval()
    inputs = torch.FloatTensor(inputs).permute(0, 3, 1, 2)
    output = model(inputs.cuda())
    return output.detach().cpu().numpy()


def segmentation(inputs):
    # Superpixel segmentation used to perturb the image region by region.
    return slic(inputs, n_segments=100, compactness=1, sigma=1)


if __name__ == "__main__":
    print("\nLoading data...", flush=True)
    train_set = get_data(os.path.join(sys.argv[1], "validation"), IMG_ID)
    x = []
    y = []
    for i in range(len(IMG_ID)):
        x_tmp, y_tmp = train_set.__getitem__(i)
        x.append(x_tmp)
        y.append(y_tmp)
    x = torch.stack(x).cuda()
    y = torch.stack(y)

    print("\nLoading model...", flush=True)
    model = CNN().cuda()
    model.load_state_dict(torch.load(sys.argv[2]))
    model.eval()
    pred_y = np.argmax(model(x).cpu().data.numpy(), axis=1).tolist()
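    # --- A minimal sketch of feeding predict() and segmentation() to LIME,
    # assuming the lime package is available and the dataset yields channel-first
    # tensors scaled to [0, 1] (hence the permute back to HWC for LIME).
    import matplotlib.pyplot as plt
    from lime import lime_image
    from skimage.segmentation import mark_boundaries

    explainer = lime_image.LimeImageExplainer()
    for idx, image in enumerate(x):
        np_image = image.cpu().permute(1, 2, 0).numpy().astype(np.double)
        explanation = explainer.explain_instance(
            np_image, classifier_fn=predict, segmentation_fn=segmentation,
            top_labels=5, num_samples=1000)
        lime_img, mask = explanation.get_image_and_mask(
            pred_y[idx], positive_only=False, num_features=11, hide_rest=False)
        plt.imsave("lime_{}.png".format(idx), mark_boundaries(lime_img, mask))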
    return algo


import os
import io

# --- Streamlit front end ---
st.title('Welcome to RecServe!')
st.header('Let me help you with the product recommendations')

option1 = st.selectbox('Select the dataset file', ['sample_us.tsv'])
data = ds.get_data(option1)
st.write('The data is loaded')
st.write(data)

data_surprise = data[['customer_id', 'product_id', 'star_rating']]. \
    rename(columns={'customer_id': 'userID',
                    'product_id': 'itemID',
                    'star_rating': 'rating'})
reader = Reader(rating_scale=(1.0, 5.0))
df_loaded = Dataset.load_from_df(data_surprise, reader)
results_list = []
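# --- A minimal sketch of how the app might surface recommendations from here,
# reusing select_model() and rec.get_top_n() from the modelling module. The
# algorithm labels and widget layout are assumptions, not the shipped UI.
from surprise.model_selection import train_test_split

algo_name = st.selectbox('Choose a recommender',
                         ['user_user', 'item_item', 'matrix_fact'])
if st.button('Recommend'):
    trainset, testset = train_test_split(df_loaded, test_size=.25)
    algo = select_model(df_loaded, model_selection=algo_name)
    algo.fit(trainset)
    top_n = rec.get_top_n(algo.test(testset), n=10)
    # Show the first few users and their recommended item ids.
    st.write({uid: [iid for iid, _ in items]
              for uid, items in list(top_n.items())[:20]})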
def run(experiment):
    save_path = "checkpoints/" + experiment.name
    log_path = "tensorboard/train/" + experiment.name

    # create or clean directory
    for path in [save_path, log_path]:
        if not os.path.exists(path):
            os.makedirs(path)
        else:
            shutil.rmtree(path)
            os.makedirs(path)
    save_path += "/dev"

    # log git commit hash
    repo = git.Repo(search_parent_directories=True)
    sha = repo.head.object.hexsha
    file = open(log_path + "/git_commit_" + sha, 'w')
    file.close()

    (epochs, input_batch_size, rnn_size, num_layers, encoding_embedding_size,
     decoding_embedding_size, learning_rate, keep_probability, num_samples,
     reward) = map(experiment.hyperparams.get,
                   ('epochs', 'input_batch_size', 'rnn_size', 'num_layers',
                    'encoding_embedding_size', 'decoding_embedding_size',
                    'learning_rate', 'keep_probability', 'num_samples', 'reward'))

    ### prepare data ###
    (train_source_int_text, train_target_int_text), \
        (valid_source_int_text, valid_target_int_text), \
        (source_vocab_to_int, target_vocab_to_int), \
        (source_int_to_vocab, target_int_to_vocab) = data_preprocessing.get_data(
            experiment.data["dataset"], experiment.data["folder"],
            experiment.data["train_source_file"], experiment.data["train_target_file"],
            experiment.data["dev_source_file"], experiment.data["dev_target_file"],
            experiment.tokenization)

    max_source_sentence_length = max([len(sentence) for sentence in train_source_int_text])

    train_source = train_source_int_text
    train_target = train_target_int_text
    valid_source = valid_source_int_text
    valid_target = valid_target_int_text

    # shuffle
    rnd = random.Random(1234)
    train_combined = list(zip(train_source, train_target))
    rnd.shuffle(train_combined)
    train_source, train_target = zip(*train_combined)
    valid_combined = list(zip(valid_source, valid_target))
    rnd.shuffle(valid_combined)
    valid_source, valid_target = zip(*valid_combined)

    # set reward function
    if reward == "levenshtein":
        reward_func = lambda ref_hyp: -textdistance.levenshtein(ref_hyp[0], ref_hyp[1])
    elif reward == "jaro-winkler":
        reward_func = lambda ref_hyp: textdistance.JaroWinkler()(ref_hyp[0], ref_hyp[1])
    elif reward == "hamming":
        reward_func = lambda ref_hyp: -textdistance.hamming(ref_hyp[0], ref_hyp[1])

    if experiment.train_method == 'MLE':
        graph_batch_size = input_batch_size
    elif experiment.train_method == 'reinforce' or experiment.train_method == 'reinforce_test':
        graph_batch_size = num_samples

    ### prepare model ###
    tf.reset_default_graph()  # maybe need?
    with tf.variable_scope(tf.get_variable_scope(), reuse=False):
        model = rnn_model.RNN(graph_batch_size, max_source_sentence_length,
                              source_vocab_to_int, target_vocab_to_int,
                              encoding_embedding_size, decoding_embedding_size,
                              rnn_size, num_layers)

    eval_batch_size = 128
    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
        eval_model = rnn_model.RNN(eval_batch_size, max_source_sentence_length,
                                   source_vocab_to_int, target_vocab_to_int,
                                   encoding_embedding_size, decoding_embedding_size,
                                   rnn_size, num_layers, False)

    early_stopping = True

    ### train model ###
    if experiment.train_method == 'reinforce_test':
        train.reinforce_test(model, experiment.start_checkpoint, source_vocab_to_int,
                             learning_rate, keep_probability, graph_batch_size,
                             target_int_to_vocab, source_int_to_vocab,
                             valid_source, valid_target)
    else:
        train.train(experiment.name, experiment.train_method, model, epochs,
                    input_batch_size, train_source, train_target, valid_source,
                    valid_target, learning_rate, keep_probability, save_path,
                    experiment.start_checkpoint, target_int_to_vocab,
                    source_int_to_vocab, source_vocab_to_int, log_path,
                    graph_batch_size, experiment.max_hours, eval_model,
                    eval_batch_size, reward_func, early_stopping)
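# --- The reward functions in run() follow the "higher is better" convention, so
# the edit-distance metrics are negated while Jaro-Winkler (already a
# similarity) is used directly. A small, self-contained illustration using
# textdistance (the strings are examples, not project data):
import textdistance

ref, hyp = "kitten", "sitting"
print(-textdistance.levenshtein(ref, hyp))          # -3, three edits apart
print(textdistance.JaroWinkler()(ref, hyp))         # ~0.746, a similarity in [0, 1]
print(-textdistance.hamming("karolin", "kathrin"))  # -3, for same-length strings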
import sys
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from data_preprocessing import get_data
from A1 import A1
from A2 import A2
from B1 import B1
from B2 import B2

# ======================================================================================================================
# Data preprocessing
(smile_X_train, smile_y_train, smile_X_test, smile_y_test,
 gender_X_train, gender_y_train, gender_X_test, gender_y_test,
 eye_X_train, eye_y_train, eye_X_test, eye_y_test,
 face_X_train, face_y_train, face_X_test, face_y_test) = get_data()

# ======================================================================================================================
# Task A1
model_A1 = A1.create_model()  # Build the model object.
acc_A1_train, model_A1_trained = A1.train_model(
    model_A1, gender_X_train, gender_y_train)  # Train on the training set (fine-tune on a validation split).
acc_A1_test = A1.test_model(model_A1_trained, gender_X_test, gender_y_test)  # Evaluate on the test set.
# Clean up memory/GPU etc. here if necessary.

# ======================================================================================================================
# Task A2
model_A2 = A2.create_model()
acc_A2_train, model_A2_trained = A2.train_model(model_A2, smile_X_train, smile_y_train)
acc_A2_test = A2.test_model(model_A2_trained, smile_X_test, smile_y_test)
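# ======================================================================================================================
# A sketch of how Tasks B1 and B2 presumably continue the same
# create/train/test pattern. Which task uses the face_* split and which uses
# the eye_* split is an assumption, as is the final summary print.
model_B1 = B1.create_model()
acc_B1_train, model_B1_trained = B1.train_model(model_B1, face_X_train, face_y_train)
acc_B1_test = B1.test_model(model_B1_trained, face_X_test, face_y_test)

model_B2 = B2.create_model()
acc_B2_train, model_B2_trained = B2.train_model(model_B2, eye_X_train, eye_y_train)
acc_B2_test = B2.test_model(model_B2_trained, eye_X_test, eye_y_test)

print('TA1:{},{};TA2:{},{};TB1:{},{};TB2:{},{};'.format(
    acc_A1_train, acc_A1_test, acc_A2_train, acc_A2_test,
    acc_B1_train, acc_B1_test, acc_B2_train, acc_B2_test))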