Example #1
def main():
    if len(sys.argv) != 3:
        print('usage: python postagger-test.py', file=sys.stderr)
        print('                <str: test prefix>', file=sys.stderr)
        print('                <str: model prefix>', file=sys.stderr)
        return

    test_prefix = sys.argv[1]
    model_prefix = sys.argv[2]

    print('loading data ...', file=sys.stderr)

    # load test data
    test_words = [w.lower() for w in utils.read_data(test_prefix + '.words')]
    test_pos = utils.read_data(test_prefix + '.pos')

    # load dictionary
    word_ids = Dictionary.load(model_prefix + '.wordid')
    pos_ids = Dictionary.load(model_prefix + '.posid')

    # make word/POS IDs
    test_wids = [word_ids[w] for w in test_words]
    test_pids = [pos_ids[w] for w in test_pos]

    # load and test tagger
    tagger = POSTagger.load(model_prefix)
    tagger.test(test_wids, test_pids)
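
These POS-tagger scripts only require utils.read_data to return a flat list of tokens (one per line). A minimal sketch of such a helper, under that assumption (the real utils module is not shown here), could be:

def read_data(filename):
    # Hypothetical helper: return one whitespace-stripped token per line of the file.
    with open(filename, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]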
Example #2
 def test_choose_bins(self):
     """ test of the choose_bins function """
     data_to_test = fct.read_data(file)
     bin_1, bin_n, bins = fct.choose_bins(data_to_test, 'height', 0.1)
     assert round(bin_1, 1) == 1.6
     assert round(bin_n, 1) == 1.9
     assert round(bins, 1) == 15.
Example #3
 def test_plot_histogram(self):
     """ test of the plot_histogram function """
     data_to_test = fct.read_data(file)
     data_F = fct.sort_data(data_to_test, 'F')
     bin_1, bin_n, bins = fct.choose_bins(data_F, 'age', 1.)
     spot1, width1 = fct.plot_histogram(data_F['age'].values, bin_1, bin_n,
                                        bins, 'left', 'women')
     assert round(spot1[-1] - spot1[0], 1) == 1.0
Example #4
 def test_doublehistogram(self):
     """ test of the doublehistogram function """
     data_to_test = fct.read_data(file)
     spot1, spot2, width1, width2 = fct.doublehistogram(
         data_to_test,
         list(data_to_test)[1], 1., 300, './figures/tests/')
     assert round(width1, 2) == round(width2, 2)
     assert round(spot1[0] + width1, 2) == round(spot2[0] - width1, 2)
Example #5
def main():
    if len(sys.argv) != 9:
        print('usage: python postagger-train.py', file=sys.stderr)
        print('                <str: train prefix>', file=sys.stderr)
        print('                <str: dev prefix>', file=sys.stderr)
        print('                <str: model prefix>', file=sys.stderr)
        print('                <int: word n-gram size>', file=sys.stderr)
        print('                <int: POS n-gram size>', file=sys.stderr)
        print('                <int: word window size>', file=sys.stderr)
        print('                <int: POS history size>', file=sys.stderr)
        print('                <int: max iteration>', file=sys.stderr)
        return

    train_prefix = sys.argv[1]
    dev_prefix = sys.argv[2]
    model_prefix = sys.argv[3]
    word_ngram_size = int(sys.argv[4])
    pos_ngram_size = int(sys.argv[5])
    word_window_size = int(sys.argv[6])
    pos_history_size = int(sys.argv[7])
    max_iteration = int(sys.argv[8])

    print('loading data ...', file=sys.stderr)

    # load train/dev data
    train_words = [w.lower() for w in utils.read_data(train_prefix + '.words')]
    train_pos = utils.read_data(train_prefix + '.pos')
    dev_words = [w.lower() for w in utils.read_data(dev_prefix + '.words')]
    dev_pos = utils.read_data(dev_prefix + '.pos')

    # make dictionary
    word_ids = Dictionary(train_words, frozen=True)
    pos_ids = Dictionary(train_pos, frozen=True)
    word_ids.save(model_prefix + '.wordid')
    pos_ids.save(model_prefix + '.posid')

    # make word/POS IDs
    train_wids = [word_ids[w] for w in train_words]
    train_pids = [pos_ids[w] for w in train_pos]
    dev_wids = [word_ids[w] for w in dev_words]
    dev_pids = [pos_ids[w] for w in dev_pos]

    # train
    tagger = POSTagger(word_ngram_size, pos_ngram_size, word_window_size, pos_history_size)
    tagger.train(len(pos_ids), train_wids, train_pids, dev_wids, dev_pids, max_iteration, model_prefix)
Example #6
def main():
    data_set_list = ['MNIST', 'lung_small', 'warpPIE10P', 'Yale', 'digits']
    n_clusters_list = [10, 7, 10, 15, 10]
    meth = ['MCFS-I']

    for i in range(len(data_set_list)):

        print('dataset: {}'.format(data_set_list[i]))
        data, label = utils.read_data(data_set_list[i])

        # test n_emb
        for n_emb in range(1, 31):

            print('n_emb: {}/{}'.format(n_emb, 30))
            with open("../Result/" + data_set_list[i] + "_nemb_test.txt",
                      'a') as f:
                line = str(n_emb) + '\n'
                f.write(line)

            for j in range(len(meth)):
                print('method: {}\ttime: {}'.format(
                    meth[j],
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

                for o in range(3):
                    print('step: {}/3\ttime: {}'.format(
                        str(o + 1),
                        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
                    start_time = datetime.now()
                    weight = MCFS.mcfs(X=data,
                                       n_selected_features=100,
                                       i=j,
                                       n_emb=n_emb,
                                       n_neighbors=0)
                    idx = MCFS.feature_ranking(weight)

                    selected_data = data[:, idx[0:100]]
                    end_time = datetime.now()

                    # perform k-means clustering based on the selected features, repeated 3 times
                    nmi_total = 0.0
                    for k in range(3):
                        nmi_total += MCFS.eval_cluster_prediction(
                            selected_data, label, n_clusters_list[i])

                    # output the average NMI
                    with open(
                            "../Result/" + data_set_list[i] + "_nemb_test.txt",
                            'a') as f:
                        line = meth[j] + ': ' + str(float(nmi_total) / 3) + '\tcost_time: ' + \
                               str((end_time-start_time).seconds) + 's\n'
                        f.write(line)

            with open("../Result/" + data_set_list[i] + "_nemb_test.txt",
                      'a') as f:
                line = '\n\n'
                f.write(line)
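
As a side note, the three open(..., 'a') blocks above all append to the same result file; a small helper such as the hypothetical append_line below would remove that repetition:

def append_line(path, line):
    # Hypothetical convenience wrapper for the repeated open(..., 'a') blocks above.
    with open(path, 'a') as f:
        f.write(line)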
Example #7
def main():
    logging.set_verbosity_info()
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', default='best_model_ckpt_0', type=str)
    parser.add_argument('--seed', default=202105, type=int)
    args = parser.parse_args()
    seed_random(args.seed)
    data_path = './user_data/duality_pair_pretrain_no_nsp.txt'
    vocab_path = './user_data/vocab.txt'
    model_path = './user_data/nezha-cn-base'
    output_path = './user_data/pretrained-nezha-base'

    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    data = read_data(data_path, tokenizer)

    train_dataset = TcDataset(data)

    model = NeZhaForMaskedLM.from_pretrained(model_path)
    model.resize_token_embeddings(tokenizer.vocab_size)

    data_collator = TcCollator(max_seq_len=30,
                               tokenizer=tokenizer,
                               mlm_probability=0.15)

    logging_path = os.path.join(output_path, 'log')
    model_save_path = os.path.join(output_path, args.model_path)
    tokenizer_and_config = os.path.join(output_path, 'tokenizer_and_config')
    build_path(model_save_path)
    build_path(logging_path)
    build_path(tokenizer_and_config)

    training_args = TrainingArguments(output_dir=output_path,
                                      overwrite_output_dir=True,
                                      learning_rate=6e-5,
                                      num_train_epochs=130,
                                      per_device_train_batch_size=128,
                                      logging_steps=5000,
                                      fp16=True,
                                      fp16_backend='amp',
                                      load_best_model_at_end=True,
                                      prediction_loss_only=True,
                                      logging_dir=logging_path,
                                      logging_first_step=True,
                                      dataloader_num_workers=4,
                                      seed=2021)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model(model_save_path)
    tokenizer.save_pretrained(tokenizer_and_config)
Example #8
def main(fixed_params_path, params_path, visualization, check_embedding, remove, edge_batch_size):
    params = read_data(params_path)
    params.pop('remove', None)
    params.pop('edge_batch_size', None)
    train_full_model(fixed_params_path=fixed_params_path,
                     visualization=visualization,
                     check_embedding=check_embedding,
                     remove=remove,
                     edge_batch_size=edge_batch_size,
                     **params)
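
The two pop calls matter because remove and edge_batch_size are also passed explicitly to train_full_model: without them, unpacking **params would raise a "got multiple values for keyword argument" TypeError. A toy illustration (hypothetical names, not part of the project):

def toy_train(remove, edge_batch_size=64, **kwargs):
    # Stand-in for train_full_model, used only to show the keyword conflict.
    return remove, edge_batch_size, kwargs

toy_params = {'remove': 0.5, 'edge_batch_size': 128, 'lr': 1e-3}
toy_params.pop('remove', None)           # without these two pops, the call below
toy_params.pop('edge_batch_size', None)  # would raise a TypeError
print(toy_train(remove=0.99, edge_batch_size=256, **toy_params))  # (0.99, 256, {'lr': 0.001})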
Example #9
def features_from(i):
    (df_fiscalite, df_resp_fis, df_ids_fis,
     df_democratie, df_resp_dem, df_ids_dem,
     df_ecologie, df_resp_eco, df_ids_eco,
     df_organisation, df_resp_org, df_ids_org) = (0,) * 12
    gmm, features = 0, 0
    print(df_fiscalite)
    if (i == 0):
        df_fiscalite = ut.read_data(
            'data/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json')
        df_resp_fis = get_open_reponses(df_fiscalite)
        df_ids_fis = get_ids_open_reponses(df_fiscalite)
    elif (i == 1):
        df_democratie = ut.read_data('data/DEMOCRATIE_ET_CITOYENNETE.json')
        df_resp_dem = get_open_reponses(df_democratie)
        df_ids_dem = get_ids_open_reponses(df_democratie)
    elif (i == 2):
        df_ecologie = ut.read_data('data/LA_TRANSITION_ECOLOGIQUE.json')
        df_resp_eco = get_open_reponses(df_ecologie)
        df_ids_eco = get_ids_open_reponses(df_ecologie)
    elif (i == 3):
        df_organisation = ut.read_data(
            'data/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json')
        df_resp_org = get_open_reponses(df_organisation)
        df_ids_org = get_ids_open_reponses(df_organisation)
    dfs = np.array([["fiscalite", df_fiscalite], ["democratie", df_democratie],
                    ["ecologie", df_ecologie],
                    ["organisation", df_organisation]])
    dfs_responses = np.array([["responses fiscalite", df_resp_fis],
                              ["responses democratie", df_resp_dem],
                              ["responses ecologie", df_resp_eco],
                              ["responses organisation", df_resp_org]])
    dfs_ids = np.array([df_ids_fis, df_ids_dem, df_ids_eco, df_ids_org])
    # read features
    features = np.loadtxt(dfs_responses[i, 0] + '_all_questions.tsv',
                          delimiter='\t')
    # Fit GMM
    gmm = GaussianMixture(n_components=10)
    gmm.fit(np.array(features))
    local_pool = multiprocessing.Pool(20, initializer)
    local_pool.map(fill_X, range(len(four_surveys_taken_auth_ids)))
    local_pool.close()
    local_pool.join()
    np.savetxt("X_" + str(i) + ".csv", X, delimiter=",")
Example #10
def main():
    params = get_cmd()

    aa2idx, idx2aa = dict(), dict()
    for i, aa in enumerate(ORDER_LIST):
        aa2idx[aa] = i
        idx2aa[i] = aa

    X = np.load(params.coords)["arr_0"]
    sequences, labels, _ = read_data(params.inputfile, get_labels=True)

    assert params.query >= 0 and params.query < len(sequences)
    assert params.target >= 0 and params.target < len(sequences)
    assert params.query != params.target
    if labels[params.query] == labels[params.target]:
        print("Warning, query and target are from the same family")

    seq_query = sequences[params.query]
    seq_target = sequences[params.target]

    encoded_query = X[params.query]
    encoded_target = X[params.target]

    decoder = keras.models.load_model(params.decoder)

    points = list()
    for v in np.linspace(0, 1, params.steps + 2):
        points.append(slerp(v, encoded_query, encoded_target))

    points = np.asarray(points)
    decoded_points = decoder.predict(points)
    decoded_seq = []
    for pred in decoded_points:
        wp = warm_prediction(pred.T, 0.5).T
        num_seq = [
            np.random.choice(np.arange(len(ORDER_LIST)), p=wp[j])
            for j in range(len(wp))
        ]
        decoded_seq.append("".join(idx2aa[i] for i in num_seq))

    with open(params.outputfile, "w") as outf:
        #outf.write(">query_original\n{}\n".format(seq_query))
        outf.write(">query\n{}\n".format(decoded_seq[0]))
        for i in range(1, len(decoded_seq) - 1):
            outf.write(">interpolated_{}\n{}\n".format(i, decoded_seq[i]))
        outf.write(">target\n{}\n".format(decoded_seq[-1]))
        #outf.write(">target_original\n{}\n".format(seq_target))

    sys.exit(0)
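
slerp is not defined in this snippet; a generic spherical linear interpolation between the two latent codes (a sketch of the usual formula, not necessarily the author's implementation) looks like this:

import numpy as np

def slerp(t, p0, p1):
    # Spherical linear interpolation between vectors p0 and p1 at fraction t in [0, 1].
    omega = np.arccos(np.clip(
        np.dot(p0 / np.linalg.norm(p0), p1 / np.linalg.norm(p1)), -1.0, 1.0))
    if np.isclose(np.sin(omega), 0.0):
        return (1.0 - t) * p0 + t * p1  # vectors are (anti)parallel: fall back to lerp
    return (np.sin((1.0 - t) * omega) * p0 + np.sin(t * omega) * p1) / np.sin(omega)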
Example #11
def main(params_path, user_ids, use_saved_graph, trained_model_path,
         use_saved_already_bought, graph_path, ctm_id_path, pdt_id_path,
         already_bought_path, k, remove):
    params = read_data(params_path)
    params.pop('k', None)
    params.pop('remove', None)

    inference_ondemand(
        user_ids=user_ids,  # List or 'all'
        use_saved_graph=use_saved_graph,
        trained_model_path=trained_model_path,
        use_saved_already_bought=use_saved_already_bought,
        graph_path=graph_path,
        ctm_id_path=ctm_id_path,
        pdt_id_path=pdt_id_path,
        already_bought_path=already_bought_path,
        k=k,
        remove=remove,
        **params,
    )
Example #12
def main():
    data_set_list = ['MNIST', 'lung_small', 'warpPIE10P', 'Yale', 'digits']
    n_clusters_list = [10, 7, 10, 15, 10]
    n_select_feature = [10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 64, 70, 80, 90, 100, 120, 140, 160, 180, 200, 220, 240, 260, 280, 300]
    num_select_feature_max = [200, 200, 200, 200, 64]
    meth = ['MCFS', 'MCFS-I', 'lap_score', 'NDFS']


    for i in range(len(data_set_list)):
        """
            i = 0: use data set MNIST
            i = 1: use data set lung_small
            i = 2: use data set warpPIE10P
            i = 3: use data set Yale
            i = 4: use data set digits
        """

        print('dataset: {}'.format(data_set_list[i]))
        data, label = utils.read_data(data_set_list[i])
        for num_sel_fea in n_select_feature:
            if num_sel_fea > num_select_feature_max[i]:
                break

            print('select feature: {}/{}'.format(num_sel_fea, num_select_feature_max[i]))
            with open("../Result/" + data_set_list[i] + ".txt", 'a') as f:
                line = str(num_sel_fea) + '\n'
                f.write(line)

            for j in range(len(meth)):
                """
                    j = 0: test MCFS
                    j = 1: test MCFS-I
                    j = 2: test lap_score
                    j = 3: test NDFS
                """
                print('method: {}\ttime: {}'.format(meth[j], time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

                cnt = 0
                for o in range(5):
                    print('step: {}/5\ttime: {}'.format(str(o+1), time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
                    start_time = datetime.now()
                    if j < 2:
                        weight = MCFS.mcfs(X=data, n_selected_features=num_sel_fea, i=j,
                                           n_emb=n_clusters_list[i], n_neighbors=5)
                        idx = MCFS.feature_ranking(weight)
                    elif j == 2:
                        kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn",
                                    "weight_mode": "heat_kernel", "k": 5, 't': 1}
                        W = construct_W(data, **kwargs_W)
                        score = lap_score.lap_score(data, W=W)
                        idx = lap_score.feature_ranking(score)
                    elif j == 3:
                        kwargs = {"metric": "euclidean", "neighborMode": "knn",
                                  "weightMode": "heatKernel", "k": 5, 't': 1}
                        W = construct_W(data, **kwargs)
                        Weight = NDFS.ndfs(data, W=W, n_clusters=20)
                        idx = feature_ranking(Weight)

                    selected_data = data[:, idx[0:num_sel_fea]]
                    end_time = datetime.now()
                    print((end_time-start_time).microseconds)
                    cnt += (end_time-start_time).microseconds

                    # perform k-means clustering based on the selected features and repeats 5 times
                    nmi_total = 0.0
                    for k in range(5):
                        nmi_total += MCFS.eval_cluster_prediction(selected_data, label, n_clusters_list[i])

                    # output the average NMI
                    with open("../Result/" + data_set_list[i] + ".txt", 'a') as f:
                        line = meth[j] + ': ' + str(float(nmi_total) / 5) + '\tcost_time: ' + \
                               str((end_time-start_time).microseconds) + 'us\n'
                        f.write(line)

            with open("../Result/" + 'MCFS-I' + ".txt", 'a') as f:
                line = '\n\n'
                f.write(line)

    print('Test is complete!')
    utils.send_message('Complete', 'Test is complete!')
Example #13
    def _read(self, file_path: str):
        lines = utils.read_data(file_path, self.percent_data)

        # Create instances
        for line in lines:
            yield self.text_to_instance(**line)
Example #14
 def test_sort_data(self):
     """ test of the sort_data function """
     data_to_test = fct.read_data(file)
     dataM = fct.sort_data(data_to_test, 'M')
     assert np.size(np.where(dataM['gender'].values == 'F')) == 0
Example #15
 def test_read_data(self):
     """ test of the read_data function """
     data_to_test = fct.read_data(file)
     assert np.size(data_to_test) == 12
     assert round(np.mean(data_to_test['age'].values), 2) == 33.25
     assert round(np.sum(data_to_test['height'].values), 2) == 6.79
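
The test methods in Examples #2, #3, #4, #14 and #15 all rely on a module-level file path and the fct alias; a sketch of the surrounding test-module setup (the class name and CSV location are assumptions, mirroring main_improved.py in Example #23 below) might be:

import numpy as np

import src.utils as fct

file = './data/data.csv'  # assumed location of the small test dataset


class TestUtils:
    """Groups the read_data / sort_data / choose_bins / plot_histogram / doublehistogram tests."""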
Example #16
import argparse

from src.utils import exam, exam_handler, read_data

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Supervised training')
    parser.add_argument('--data_path',
                        '-d',
                        type=str,
                        default='./data/vocabulary.json',
                        help='Path to vocabulary JSON')
    parser.add_argument('--source_lang',
                        '-s',
                        type=str,
                        default='es',
                        help='Source Language')
    parser.add_argument('--target_lang',
                        '-t',
                        type=str,
                        default='fr',
                        help='Target Language')
    parser.add_argument(
        '--mode',
        '-m',
        type=int,
        default=1,
        help='Test Mode: 1->multilingual 2->ES to FR 3->FR to ES')
    params = parser.parse_args()
    data = read_data(params.data_path)
    exam_handler(data, params.source_lang, params.target_lang, params.mode)
Example #17
import numpy as np

from src.utils import read_data, get_open_reponses, get_ids_open_reponses
from sklearn.mixture import GaussianMixture


def fill_X(auth_index):
    global gmm
    global ids_auth
    global features
    global four_surveys_taken_auth_ids
    auth = four_surveys_taken_auth_ids[auth_index]
    k = list(ids_auth).index(auth)
    return gmm.predict_proba(features[k].reshape(1, -1))[0]


n_compo = 10
df_organisation = read_data(
    'data/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json')
df_resp_org = get_open_reponses(df_organisation)
df_ids_org = get_ids_open_reponses(df_organisation)
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv",
                                         delimiter=",",
                                         dtype=str)
ids_auth = np.sort(list(set(df_resp_org['authorId'].values)))
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")
X = np.zeros((len(four_surveys_taken_auth_ids), n_compo))
# read features
features = np.loadtxt('responses organisation_all_questions.tsv',
                      delimiter='\t')
# Fit GMM
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)
# pool
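
The snippet above stops right after the '# pool' comment (Examples #27 and #28 below stop one line later); judging from features_from in Example #9, the pool is presumably used along these lines (a sketch continuing the script above; the output filename is an assumption):

import multiprocessing

local_pool = multiprocessing.Pool(10)
# On fork-based platforms each worker inherits the module-level globals
# (gmm, features, ids_auth, four_surveys_taken_auth_ids) used by fill_X.
probs = local_pool.map(fill_X, range(len(four_surveys_taken_auth_ids)))
local_pool.close()
local_pool.join()
X[:] = np.asarray(probs)
np.savetxt("X_organisation.csv", X, delimiter=",")  # assumed output name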
Example #18
import pandas as pd
from src.utils import read_data

enc_data = read_data('data/test.question', 'ENCSENTS')
dec_data = read_data('data/test.answer', 'DECSENTS')

# enc_data['ENCSLEN'] = enc_data.ENCSENTS.apply(lambda l: len(l.split()))
# dec_data['DECSLEN'] = dec_data.DECSENTS.apply(lambda l: len(l.split()))

dataset = pd.concat([enc_data, dec_data], axis=1)
dataset.dropna(axis=0, inplace=True)

dataset.to_csv('data/dataset.csv', index=False, encoding='utf-8', sep='\t')
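
As a quick sanity check (not part of the original snippet), the exported file can be read back with the same separator and encoding:

check = pd.read_csv('data/dataset.csv', sep='\t', encoding='utf-8')
print(check.columns.tolist())  # expected: ['ENCSENTS', 'DECSENTS']
print(len(check))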
Example #19
def inference_ondemand(
    user_ids,  # List or 'all'
    use_saved_graph: bool,
    trained_model_path: str,
    use_saved_already_bought: bool,
    graph_path=None,
    ctm_id_path=None,
    pdt_id_path=None,
    already_bought_path=None,
    k=10,
    remove=.99,
    **params,
):
    """
    Given a fully trained model, return recommendations specific to each user.

    Files needed to run
    -------------------
    Params used when training the model:
        Those params will indicate how to run inference on the model. Usually, they are outputted during training
        (and hyperparametrization).
    If using a saved already bought dict:
        The already bought dict: the dict includes all previous purchases of all user ids for which recommendations
                                 were requested. If not using a saved dict, it will be created using the graph.
                                 Using a saved already bought dict is not necessary, but might make the inference
                                 process faster.
    A) If using a saved graph:
        The saved graph: the graph that must include all user ids for which recommendations were requested. Usually,
                         it is outputted during training. It could also be created by another independent function.
        ID mapping: ctm_id and pdt_id mappings that associate real-world information (e.g. item and customer
        identifiers) with actual nodes in the graph. They are usually saved when generating a graph.
    B) If not using a saved graph:
        The graph will be generated on demand, using all the files in DataPaths of src.utils_data. All those files will
        be needed.

    Parameters
    ----------
    See click options below for details.

    Returns
    -------
    Recommendations for all user ids.

    """
    # Load & preprocess data
    ## Graph
    if use_saved_graph:
        graph = read_graph(graph_path)
        ctm_id_df = read_data(ctm_id_path)
        pdt_id_df = read_data(pdt_id_path)
    else:
        # Create graph
        data_paths = DataPaths()
        fixed_params = FixedParameters(
            num_epochs=0,
            start_epoch=0,  # Not used (only used in training)
            patience=0,
            edge_batch_size=0,  # Not used (only used in training)
            remove=remove,
            item_id_type=params['item_id_type'],
            duplicates=params['duplicates'])
        data = DataLoader(data_paths, fixed_params)
        ctm_id_df = data.ctm_id
        pdt_id_df = data.pdt_id

        graph = create_graph(data.graph_schema, )
        graph = assign_graph_features(
            graph,
            fixed_params,
            data,
            **params,
        )
    ## Preprocess: fetch right user ids
    if user_ids[0] == 'all':
        test_uids = np.arange(graph.num_nodes('user'))
    else:
        test_uids = fetch_uids(user_ids, ctm_id_df)
    ## Remove already bought
    if use_saved_already_bought:
        already_bought_dict = read_data(already_bought_path)
    else:
        bought_eids = graph.out_edges(u=test_uids, form='eid', etype='buys')
        already_bought_dict = create_already_bought(graph, bought_eids)

    # Load model
    dim_dict = {
        'user': graph.nodes['user'].data['features'].shape[1],
        'item': graph.nodes['item'].data['features'].shape[1],
        'out': params['out_dim'],
        'hidden': params['hidden_dim']
    }
    if 'sport' in graph.ntypes:
        dim_dict['sport'] = graph.nodes['sport'].data['features'].shape[1]
    trained_model = ConvModel(
        graph,
        params['n_layers'],
        dim_dict,
        params['norm'],
        params['dropout'],
        params['aggregator_type'],
        params['pred'],
        params['aggregator_hetero'],
        params['embedding_layer'],
    )
    trained_model.load_state_dict(
        torch.load(trained_model_path, map_location=device))
    if cuda:
        trained_model = trained_model.to(device)

    # Create dataloader
    all_iids = np.arange(graph.num_nodes('item'))
    test_node_ids = {'user': test_uids, 'item': all_iids}
    n_layers = params['n_layers']
    if params['embedding_layer']:
        n_layers = n_layers - 1
    sampler = dgl.dataloading.MultiLayerFullNeighborSampler(n_layers)
    nodeloader_test = dgl.dataloading.NodeDataLoader(graph,
                                                     test_node_ids,
                                                     sampler,
                                                     batch_size=128,
                                                     shuffle=True,
                                                     drop_last=False,
                                                     num_workers=num_workers)
    num_batches_test = math.ceil((len(test_uids) + len(all_iids)) / 128)

    # Fetch recs
    trained_model.eval()
    with torch.no_grad():
        embeddings = get_embeddings(
            graph,
            params['out_dim'],
            trained_model,
            nodeloader_test,
            num_batches_test,
            cuda,
            device,
            params['embedding_layer'],
        )
        recs = get_recs(graph,
                        embeddings,
                        trained_model,
                        params['out_dim'],
                        k,
                        test_uids,
                        already_bought_dict,
                        remove_already_bought=True,
                        cuda=cuda,
                        device=device,
                        pred=params['pred'],
                        use_popularity=params['use_popularity'],
                        weight_popularity=params['weight_popularity'])

        # Postprocess: user & item ids
        processed_recs = postprocess_recs(recs, pdt_id_df, ctm_id_df,
                                          params['item_id_type'],
                                          params['ctm_id_type'])
        print(processed_recs)
        return processed_recs
Example #20
"""
Created on Sun Apr 14 17:27:13 2019

@author: gabriel
"""
#%%
import src.utils as ut
import numpy as np
import pandas as pd
import string
from src.kmeans_embeddings import FeaturesExtractor
from src.utils import (read_data, get_open_reponses)
from sklearn.mixture import GaussianMixture

#%% extract data from json
df_fiscalite = ut.read_data('data/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json')
df_democratie = ut.read_data('data/DEMOCRATIE_ET_CITOYENNETE.json')
df_ecologie = ut.read_data('data/LA_TRANSITION_ECOLOGIQUE.json')
df_organisation = ut.read_data('data/ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json')

dfs = np.array([["fiscalite", df_fiscalite], ["democratie", df_democratie], ["ecologie", df_ecologie], ["organisation", df_organisation]])
#%%

questionId = '162'

df_responses = get_open_reponses(df_fiscalite)

responses = (df_responses[df_responses.questionId == questionId].formattedValue.values.tolist())

# Extract embeddings for sentences
s = FeaturesExtractor()
Example #21
#%%
import numpy as np
from src.kmeans_embeddings import FeaturesExtractor
from src.utils import read_data, get_open_reponses, get_ids_open_reponses

#%% extract data from json
df_fiscalite = read_data('data/LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json')
df_democratie = 0
df_ecologie = 0
df_organisation = 0

dfs = np.array([["fiscalite", df_fiscalite], ["democratie", df_democratie],
                ["ecologie", df_ecologie], ["organisation", df_organisation]])
#%%

#%% responses of each themes
df_resp_fis = get_open_reponses(df_fiscalite)
df_resp_dem = 0
df_resp_eco = 0
df_resp_org = 0

dfs_responses = np.array([["responses fiscalite", df_resp_fis],
                          ["responses democratie", df_resp_dem],
                          ["responses ecologie", df_resp_eco],
                          ["responses organisation", df_resp_org]])
#%%

#%% extract features
s = FeaturesExtractor()

Example #22
# -*- coding: utf-8 -*-


from src.kmeans_embeddings import FeaturesExtractor
from src.utils import (read_data, get_open_reponses)
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd


if __name__ == '__main__':
    df = read_data('data/DEMOCRATIE_ET_CITOYENNETE.json')
    df_responses = get_open_reponses(df)

    responses = (df_responses[df_responses.questionId == '107'].
                 formattedValue.values.tolist())

    # Extract embeddings for sentences
    s = FeaturesExtractor()
    features = [s.get_features(x) for x in responses]

    features_np = np.array(features)
    print(features_np)

    samples_id = np.random.choice(range(len(features)), 5000)

    features_np_samples = features_np[samples_id, :]
    np.savetxt('features_s.tsv', features_np_samples, delimiter='\t')
    responses_samples = [responses[i] for i in samples_id]
    with open('labels_s.tsv', 'w') as f:
        for resp in responses_samples:
            f.write(resp.replace('\n', ' ') + '\n')  # assumed loop body (one label per line); the original snippet ends mid-loop
Example #23
##################################################################
# Description: Main script plotting histograms of the age and height of a database of men and
#              women. Men and women are separated here to give two distinguishable distributions.
#              Histograms are stored in ./figures.
# Code name: ./src/main_improved.py
# Date of creation: 2019/03/12
# Date of last modification: 2019/03/15
# Contact information: Yann Chavaillaz, [email protected]
##################################################################

# Local packages required
import src.utils as fct

# Definition of paths
path_data = './data/'  # where to find the dataset
path_fig = './figures/'  # where to store the figures

# Definition and reading of the dataset
file = (path_data + 'data.csv')  # where data is stored
data = fct.read_data(file)

# Definition of parameters
resolution = 300  # resolution of the figure in dpi
step1 = 1.  # increment of histogram for the 1st variable
step2 = 0.1  # increment of histogram for the 2nd variable

# Plotting and saving histograms of age and size distribution among both genders
fct.doublehistogram(data, list(data)[1], step1, resolution, path_fig)  # age
fct.doublehistogram(data, list(data)[2], step2, resolution, path_fig)  # size
Example #24
from collections import Counter
from src.settings import *
from src.utils import read_data, parse_config
from src.functions import create_distance_matrix, create_vertices, teitz_bart_algorithm

if __name__ == '__main__':
    config = parse_config(CONFIG_FILE)
    count = int(config['DEFAULT']['count'])
    p = int(config['DEFAULT']['p'])
    data = read_data(DATA_FILE)
    vertices = create_vertices(data)
    distance_matrix = create_distance_matrix(data)
    medians = [
        tuple(teitz_bart_algorithm(distance_matrix, vertices, p))
        for i in range(count)
    ]
    counter = Counter(medians).items()
    for key, val in sorted(counter, key=lambda item: item[1], reverse=True):
        print(f"median {key}, amount {val}")
Example #25
n_samples = 100
# print(ttest_1samp(np.repeat(5, n_samples), 5))
# exit(1)
# for i in range(50):
x = np.random.randn(n_samples, 1)
y = x
# Z = np.random.randn(n_samples, 2)
# print(fcit.test(x, y, prop_test=0.2))
print(kernel_based_indepence(x, y, approximate=False))
y = np.random.randn(n_samples, 1)
print(kernel_based_indepence(x, y, approximate=False))
# print(kernel_based_conditional_independence(x, y, Z))
exit(1)

short_metrics_p, long_metrics_p = utils.read_data(shift=True)
short_metrics = short_metrics_p[:, :, 0]
long_metrics = long_metrics_p[:, :, 0]
metrics = np.hstack((short_metrics, long_metrics))
from itertools import combinations
from time import time

start = time()
values = []
for mx, my, mz in combinations(
        metrics.reshape((metrics.shape[1], metrics.shape[0], 1)), 3):
    values.append(
        kernel_based_conditional_independence(mx, my, mz, approximate=True))
print(time() - start)
plt.hist(values, bins='auto')
plt.show()
Example #26
def train_full_model(fixed_params_path,
                     visualization,
                     check_embedding,
                     remove,
                     edge_batch_size,
                     **params,):
    """
    Given the best hyperparameter combination, function to train the model on all available data.

    Files needed to run
    -------------------
    All the files in the TrainDataPaths:
        It includes all the interactions between user, sport and items, as well as features for user, sport and items.
    Fixed_params and params found in hyperparametrization:
        Those params will indicate how to train the model. Usually, they are found when running the hyperparametrization
        loop.

    Parameters
    ----------
    See click options below for details.


    Saves to files
    --------------
    trained_model with its fixed parameters and hyperparameters:
        The trained model and all its parameters are saved to the folder 'models'.
    graph and ID mapping:
        When doing inference, it might be useful to import an already built graph (and the mapping that associates
        node IDs with personal information such as CUSTOMER IDENTIFIER or ITEM IDENTIFIER). The graph and ID mapping
        are therefore saved to the folder 'models'.
    """
    # Load parameters
    fixed_params = read_data(fixed_params_path)
    class objectview(object):
        def __init__(self, d):
            self.__dict__ = d
    fixed_params = objectview(fixed_params)
    fixed_params.remove = remove
    fixed_params.subtrain_size = 0.01
    fixed_params.valid_size = 0.01
    fixed_params.edge_batch_size = edge_batch_size

    # Create full train set
    train_data_paths = TrainDataPaths()
    presplit_item_feat = read_data(train_data_paths.item_feat_path)
    full_interaction_data = read_data(train_data_paths.full_interaction_path)
    train_df, test_df = presplit_data(presplit_item_feat,
                                      full_interaction_data,
                                      num_min=3,
                                      remove_unk=True,
                                      sort=True,
                                      test_size_days=1,
                                      item_id_type='ITEM IDENTIFIER',
                                      ctm_id_type='CUSTOMER IDENTIFIER', )
    train_data_paths.train_path = train_df
    train_data_paths.test_path = test_df
    data = DataLoader(train_data_paths, fixed_params)

    # Initialize graph & features
    valid_graph = create_graph(
        data.graph_schema,
    )
    valid_graph = assign_graph_features(valid_graph,
                                        fixed_params,
                                        data,
                                        **params,
                                        )

    dim_dict = {'user': valid_graph.nodes['user'].data['features'].shape[1],
                'item': valid_graph.nodes['item'].data['features'].shape[1],
                'out': params['out_dim'],
                'hidden': params['hidden_dim']}

    all_sids = None
    if 'sport' in valid_graph.ntypes:
        dim_dict['sport'] = valid_graph.nodes['sport'].data['features'].shape[1]
        all_sids = np.arange(valid_graph.num_nodes('sport'))

    # Initialize model
    model = ConvModel(valid_graph,
                      params['n_layers'],
                      dim_dict,
                      params['norm'],
                      params['dropout'],
                      params['aggregator_type'],
                      params['pred'],
                      params['aggregator_hetero'],
                      params['embedding_layer'],
                      )
    if cuda:
        model = model.to(device)

    # Initialize dataloaders
    # get training and test ids
    (
        train_graph,
        train_eids_dict,
        valid_eids_dict,
        subtrain_uids,
        valid_uids,
        test_uids,
        all_iids,
        ground_truth_subtrain,
        ground_truth_valid,
        all_eids_dict
    ) = train_valid_split(
        valid_graph,
        data.ground_truth_test,
        fixed_params.etype,
        fixed_params.subtrain_size,
        fixed_params.valid_size,
        fixed_params.reverse_etype,
        fixed_params.train_on_clicks,
        fixed_params.remove_train_eids,
        params['clicks_sample'],
        params['purchases_sample'],
    )

    (
        edgeloader_train,
        edgeloader_valid,
        nodeloader_subtrain,
        nodeloader_valid,
        nodeloader_test
    ) = generate_dataloaders(valid_graph,
                             train_graph,
                             train_eids_dict,
                             valid_eids_dict,
                             subtrain_uids,
                             valid_uids,
                             test_uids,
                             all_iids,
                             fixed_params,
                             num_workers,
                             all_sids,
                             embedding_layer=params['embedding_layer'],
                             n_layers=params['n_layers'],
                             neg_sample_size=params['neg_sample_size'],
                             )

    train_eids_len = 0
    valid_eids_len = 0
    for etype in train_eids_dict.keys():
        train_eids_len += len(train_eids_dict[etype])
        valid_eids_len += len(valid_eids_dict[etype])
    num_batches_train = math.ceil(train_eids_len / fixed_params.edge_batch_size)
    num_batches_subtrain = math.ceil(
        (len(subtrain_uids) + len(all_iids)) / fixed_params.node_batch_size
    )
    num_batches_val_loss = math.ceil(valid_eids_len / fixed_params.edge_batch_size)
    num_batches_val_metrics = math.ceil(
        (len(valid_uids) + len(all_iids)) / fixed_params.node_batch_size
    )
    num_batches_test = math.ceil(
        (len(test_uids) + len(all_iids)) / fixed_params.node_batch_size
    )

    # Run model
    hp_sentence = params
    hp_sentence.update(vars(fixed_params))
    hp_sentence = f'{str(hp_sentence)[1: -1]} \n'
    save_txt(f'\n \n START - Hyperparameters \n{hp_sentence}', train_data_paths.result_filepath, "a")
    trained_model, viz, best_metrics = train_model(
        model,
        fixed_params.num_epochs,
        num_batches_train,
        num_batches_val_loss,
        edgeloader_train,
        edgeloader_valid,
        max_margin_loss,
        params['delta'],
        params['neg_sample_size'],
        params['use_recency'],
        cuda,
        device,
        fixed_params.optimizer,
        params['lr'],
        get_metrics=True,
        train_graph=train_graph,
        valid_graph=valid_graph,
        nodeloader_valid=nodeloader_valid,
        nodeloader_subtrain=nodeloader_subtrain,
        k=fixed_params.k,
        out_dim=params['out_dim'],
        num_batches_val_metrics=num_batches_val_metrics,
        num_batches_subtrain=num_batches_subtrain,
        bought_eids=train_eids_dict[('user', 'buys', 'item')],
        ground_truth_subtrain=ground_truth_subtrain,
        ground_truth_valid=ground_truth_valid,
        remove_already_bought=True,
        result_filepath=train_data_paths.result_filepath,
        start_epoch=fixed_params.start_epoch,
        patience=fixed_params.patience,
        pred=params['pred'],
        use_popularity=params['use_popularity'],
        weight_popularity=params['weight_popularity'],
        remove_false_negative=fixed_params.remove_false_negative,
        embedding_layer=params['embedding_layer'],
    )

    # Get viz & metrics
    if visualization:
        plot_train_loss(hp_sentence, viz)

    # Report performance on validation set
    sentence = ("BEST VALIDATION Precision "
                "{:.3f}% | Recall {:.3f}% | Coverage {:.2f}%"
                .format(best_metrics['precision'] * 100,
                        best_metrics['recall'] * 100,
                        best_metrics['coverage'] * 100))

    log.info(sentence)
    save_txt(sentence, train_data_paths.result_filepath, mode='a')

    # Report performance on test set
    log.debug('Test metrics start ...')
    trained_model.eval()
    with torch.no_grad():
        embeddings = get_embeddings(valid_graph,
                                    params['out_dim'],
                                    trained_model,
                                    nodeloader_test,
                                    num_batches_test,
                                    cuda,
                                    device,
                                    params['embedding_layer'],
                                    )

        for ground_truth in [data.ground_truth_purchase_test, data.ground_truth_test]:
            precision, recall, coverage = get_metrics_at_k(
                embeddings,
                valid_graph,
                trained_model,
                params['out_dim'],
                ground_truth,
                all_eids_dict[('user', 'buys', 'item')],
                fixed_params.k,
                True,  # Remove already bought
                cuda,
                device,
                params['pred'],
                params['use_popularity'],
                params['weight_popularity'],
            )

            sentence = ("TEST Precision "
                        "{:.3f}% | Recall {:.3f}% | Coverage {:.2f}%"
                        .format(precision * 100,
                                recall * 100,
                                coverage * 100))
            log.info(sentence)
            save_txt(sentence, train_data_paths.result_filepath, mode='a')

    if check_embedding:
        trained_model.eval()
        with torch.no_grad():
            log.debug('ANALYSIS OF RECOMMENDATIONS')
            if 'sport' in train_graph.ntypes:
                result_sport = explore_sports(embeddings,
                                              data.sport_feat_df,
                                              data.spt_id,
                                              fixed_params.num_choices)

                save_txt(result_sport, train_data_paths.result_filepath, mode='a')

            already_bought_dict = create_already_bought(valid_graph,
                                                        all_eids_dict[('user', 'buys', 'item')],
                                                        )
            already_clicked_dict = None
            if fixed_params.discern_clicks:
                already_clicked_dict = create_already_bought(valid_graph,
                                                             all_eids_dict[('user', 'clicks', 'item')],
                                                             etype='clicks',
                                                             )

            users, items = data.ground_truth_test
            ground_truth_dict = create_ground_truth(users, items)
            user_ids = np.unique(users).tolist()
            recs = get_recs(valid_graph,
                            embeddings,
                            trained_model,
                            params['out_dim'],
                            fixed_params.k,
                            user_ids,
                            already_bought_dict,
                            remove_already_bought=True,
                            pred=params['pred'],
                            use_popularity=params['use_popularity'],
                            weight_popularity=params['weight_popularity'])

            users, items = data.ground_truth_purchase_test
            ground_truth_purchase_dict = create_ground_truth(users, items)
            explore_recs(recs,
                         already_bought_dict,
                         already_clicked_dict,
                         ground_truth_dict,
                         ground_truth_purchase_dict,
                         data.item_feat_df,
                         fixed_params.num_choices,
                         data.pdt_id,
                         fixed_params.item_id_type,
                         train_data_paths.result_filepath)

            if fixed_params.item_id_type == 'SPECIFIC ITEM IDENTIFIER':
                coverage_metrics = check_coverage(data.user_item_train,
                                                  data.item_feat_df,
                                                  data.pdt_id,
                                                  recs)

                sentence = (
                    "COVERAGE \n|| All transactions : "
                    "Generic {:.1f}% | Junior {:.1f}% | Male {:.1f}% | Female {:.1f}% | Eco {:.1f}% "
                    "\n|| Recommendations : "
                    "Generic {:.1f}% | Junior {:.1f}% | Male {:.1f}% | Female {:.1f} | Eco {:.1f}%%"
                        .format(
                        coverage_metrics['generic_mean_whole'] * 100,
                        coverage_metrics['junior_mean_whole'] * 100,
                        coverage_metrics['male_mean_whole'] * 100,
                        coverage_metrics['female_mean_whole'] * 100,
                        coverage_metrics['eco_mean_whole'] * 100,
                        coverage_metrics['generic_mean_recs'] * 100,
                        coverage_metrics['junior_mean_recs'] * 100,
                        coverage_metrics['male_mean_recs'] * 100,
                        coverage_metrics['female_mean_recs'] * 100,
                        coverage_metrics['eco_mean_recs'] * 100,
                    )
                )
                log.info(sentence)
                save_txt(sentence, train_data_paths.result_filepath, mode='a')

        save_outputs(
            {
                'embeddings': embeddings,
                'already_bought': already_bought_dict,
                'already_clicked': already_clicked_dict,
                'ground_truth': ground_truth_dict,
                'recs': recs,
            },
            'outputs/'
        )

    # Save model
    date = str(datetime.datetime.now())[:-10].replace(' ', '')
    torch.save(trained_model.state_dict(), f'models/FULL_Recall_{recall * 100:.2f}_{date}.pth')
    # Save all necessary params
    save_outputs(
        {
            f'{date}_params': params,
            f'{date}_fixed_params': vars(fixed_params),
        },
        'models/'
    )
    print("Saved model & parameters to disk.")

    # Save graph & ID mapping
    save_graphs(f'models/{date}_graph.bin', [valid_graph])
    save_outputs(
        {
            f'{date}_ctm_id': data.ctm_id,
            f'{date}_pdt_id': data.pdt_id,
        },
        'models/'
    )
    print("Saved graph & ID mapping to disk.")
Example #27
import multiprocessing

import numpy as np

from src.utils import read_data, get_open_reponses, get_ids_open_reponses
from sklearn.mixture import GaussianMixture


def fill_X(auth_index):
    global gmm
    global ids_auth
    global features
    global four_surveys_taken_auth_ids
    auth = four_surveys_taken_auth_ids[auth_index]
    k = list(ids_auth).index(auth)
    return gmm.predict_proba(features[k].reshape(1, -1))[0]


n_compo = 10
df_ecologie = read_data('data/LA_TRANSITION_ECOLOGIQUE.json')
df_resp_eco = get_open_reponses(df_ecologie)
df_ids_eco = get_ids_open_reponses(df_ecologie)
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv",
                                         delimiter=",",
                                         dtype=str)
ids_auth = np.sort(list(set(df_resp_eco['authorId'].values)))
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")
X = np.zeros((len(four_surveys_taken_auth_ids), n_compo))
# read features
features = np.loadtxt('responses ecologie_all_questions.tsv', delimiter='\t')
# Fit GMM
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)
# pool
local_pool = multiprocessing.Pool(10)
Example #28
import multiprocessing

import numpy as np

from src.utils import read_data, get_open_reponses, get_ids_open_reponses
from sklearn.mixture import GaussianMixture


def fill_X(auth_index):
    global gmm
    global ids_auth
    global features
    global four_surveys_taken_auth_ids
    auth = four_surveys_taken_auth_ids[auth_index]
    k = list(ids_auth).index(auth)
    return gmm.predict_proba(features[k].reshape(1, -1))[0]


n_compo = 10
df_democratie = read_data('data/DEMOCRATIE_ET_CITOYENNETE.json')
df_resp_dem = get_open_reponses(df_democratie)
df_ids_dem = get_ids_open_reponses(df_democratie)
four_surveys_taken_auth_ids = np.loadtxt("four_surveys_taken_auth_ids.csv",
                                         delimiter=",",
                                         dtype=str)
ids_auth = np.sort(list(set(df_resp_dem['authorId'].values)))
np.savetxt("ids_auth_sorted.csv", ids_auth, delimiter=",", fmt="%s")
X = np.zeros((len(four_surveys_taken_auth_ids), n_compo))
# read features
features = np.loadtxt('responses democratie_all_questions.tsv', delimiter='\t')
# Fit GMM
gmm = GaussianMixture(n_components=n_compo)
gmm.fit(features)
# pool
local_pool = multiprocessing.Pool(10)
Example #29
def inference_fn(trained_model,
                 remove,
                 fixed_params,
                 overwrite_fixed_params=False,
                 days_of_purchases=710,
                 days_of_clicks=710,
                 lifespan_of_items=710,
                 **params):
    """
    Function to run inference inside the hyperparameter loop and calculate metrics.

    Parameters
    ----------
    trained_model:
        Model trained during training of hyperparameter loop.
    remove:
        Percentage of data removed. See src.utils_data for more details.
    fixed_params:
        All parameters used during training of hyperparameter loop. See src.utils_data for more details.
    overwrite_fixed_params:
        If True, the training parameters will be overwritten by the parameters below. Can be useful if you need to
        test the model on different parameters, e.g. ones that include older clicks or purchases.
    days_of_purchases, days_of_clicks, lifespan_of_items:
        All parameters that can overwrite the training parameters. Only useful if overwrite_fixed_params is True.
    params:
        All other parameters used during training.

    Returns
    -------
    recall:
        Recall on the test set. Useful for comparison with the recall computed on the hyperparametrization test set
        (since parameters like 'remove' and all overwritable parameters differ).

    Saves to file
    -------------
    Metrics computed on the test set.
    """
    # Import parameters
    if isinstance(fixed_params, str):
        path = fixed_params
        fixed_params = read_data(path)

        class objectview(object):
            def __init__(self, d):
                self.__dict__ = d

        fixed_params = objectview(fixed_params)

    if 'params' in params.keys():
        # if isinstance(params['params'], str):
        path = params['params']
        params = read_data(path)

    # Initialize data
    data_paths = DataPaths()
    fixed_params.remove = remove
    if overwrite_fixed_params:
        fixed_params.days_of_purchases = days_of_purchases
        fixed_params.days_of_clicks = days_of_clicks
        fixed_params.lifespan_of_items = lifespan_of_items
    data = DataLoader(data_paths, fixed_params)

    # Get graph
    valid_graph = create_graph(data.graph_schema, )
    valid_graph = assign_graph_features(
        valid_graph,
        fixed_params,
        data,
        **params,
    )

    dim_dict = {
        'user': valid_graph.nodes['user'].data['features'].shape[1],
        'item': valid_graph.nodes['item'].data['features'].shape[1],
        'out': params['out_dim'],
        'hidden': params['hidden_dim']
    }

    all_sids = None
    if 'sport' in valid_graph.ntypes:
        dim_dict['sport'] = valid_graph.nodes['sport'].data['features'].shape[
            1]
        all_sids = np.arange(valid_graph.num_nodes('sport'))

    # get training and test ids
    (train_graph, train_eids_dict, valid_eids_dict, subtrain_uids, valid_uids,
     test_uids, all_iids, ground_truth_subtrain, ground_truth_valid,
     all_eids_dict) = train_valid_split(
         valid_graph,
         data.ground_truth_test,
         fixed_params.etype,
         fixed_params.subtrain_size,
         fixed_params.valid_size,
         fixed_params.reverse_etype,
         fixed_params.train_on_clicks,
         fixed_params.remove_train_eids,
         params['clicks_sample'],
         params['purchases_sample'],
     )
    (edgeloader_train, edgeloader_valid, nodeloader_subtrain, nodeloader_valid,
     nodeloader_test) = generate_dataloaders(
         valid_graph,
         train_graph,
         train_eids_dict,
         valid_eids_dict,
         subtrain_uids,
         valid_uids,
         test_uids,
         all_iids,
         fixed_params,
         num_workers,
         all_sids,
         embedding_layer=params['embedding_layer'],
         n_layers=params['n_layers'],
         neg_sample_size=params['neg_sample_size'],
     )

    num_batches_test = math.ceil(
        (len(test_uids) + len(all_iids)) / fixed_params.node_batch_size)

    # Import model
    if isinstance(trained_model, str):
        path = trained_model
        trained_model = ConvModel(
            valid_graph,
            params['n_layers'],
            dim_dict,
            params['norm'],
            params['dropout'],
            params['aggregator_type'],
            fixed_params.pred,
            params['aggregator_hetero'],
            params['embedding_layer'],
        )
        trained_model.load_state_dict(torch.load(path, map_location=device))
    if cuda:
        trained_model = trained_model.to(device)

    trained_model.eval()
    with torch.no_grad():
        embeddings = get_embeddings(
            valid_graph,
            params['out_dim'],
            trained_model,
            nodeloader_test,
            num_batches_test,
            cuda,
            device,
            params['embedding_layer'],
        )

        for ground_truth in [
                data.ground_truth_purchase_test, data.ground_truth_test
        ]:
            precision, recall, coverage = get_metrics_at_k(
                embeddings,
                valid_graph,
                trained_model,
                params['out_dim'],
                ground_truth,
                all_eids_dict[('user', 'buys', 'item')],
                fixed_params.k,
                True,  # Remove already bought
                cuda,
                device,
                fixed_params.pred,
                params['use_popularity'],
                params['weight_popularity'],
            )

            sentence = ("TEST Precision "
                        "{:.3f}% | Recall {:.3f}% | Coverage {:.2f}%".format(
                            precision * 100, recall * 100, coverage * 100))

            print(sentence)
            save_txt(sentence, data_paths.result_filepath, mode='a')

    return recall
Example #30
def format_dfs(
    train_path,  # str (path) or pd.Dataframe directly (df)
    test_path,  # str (path) or pd.Dataframe directly (df)
    item_sport_path: str,
    user_sport_path: str,
    sport_sportg_path: str,
    item_feat_path: str,
    user_feat_path: str,
    sport_feat_path: str,
    sport_onehot_path: str,
    remove: float = 0.,
    ctm_id_type: str = 'CUSTOMER IDENTIFIER',
    item_id_type: str = 'SPECIFIC ITEM IDENTIFIER',
    days_of_purchases: int = 710,
    days_of_clicks: int = 710,
    lifespan_of_items: int = 710,
    report_model_coverage: bool = False,
):
    """
    Import all dfs from csv paths and preprocess interactions to sample interactions and remove old users and items.

    Parameters
    ----------
    train_path, test_path:
        Paths of the user-item interaction files (train set and test set). For flexibility, train_path and
        test_path can also be passed directly as dataframes instead of path strings. All files with users
        and items must include a column named with the specified ctm_id_type or item_id_type.
    item_sport_path, user_sport_path, sport_sportg_path:
        Paths of interaction files, between item and sport, user and sport, sport and sport group. All files with user
        and items must include a column named with the specified ctm_id_type or item_id_type.
    item_feat_path, user_feat_path, sport_feat_path:
        Paths of feature files, for item, user and sports. Item features include textual descriptions and junior, male,
        female and eco indicators. User features include male and female indicator. Sport features include only name of
        sport. All files with user and items must include a column named with the specified ctm_id_type or item_id_type.
    sport_onehot_path:
        Path for a csv matrix containing the sport_id and a one-hot vector, unique per sport.
    remove:
        Removes a proportion of users from the dataset randomly.
    ctm_id_type :
        Identifier for the customers.
    item_id_type :
        Identifier for the items. Can be SPECIFIC ITEM IDENTIFIER (e.g. item SKU)
        or GENERAL ITEM IDENTIFIER (e.g. item family identifier)
    days_of_purchases, days_of_clicks :
        Number of days of purchases (resp. clicks) to keep in the dataset.
        The intuition is that interactions from 12+ months ago might no longer be relevant. Max is 710 days.
        Users left with no remaining interactions will be fed recommendations from another model.
    lifespan_of_items :
        Number of days since its most recent interaction for an item to be considered by the
        model. Max is 710 days. Has no effect if it is greater than days_of_purchases.
    report_model_coverage : bool
        Computes how many users are included by these parameters (and would thus receive a recommendation by this GNN
        model).

    Returns
    -------
    user_item_train, user_item_test, item_sport_interaction, user_sport_interaction, sport_sportg_interaction:
        Dataframes of interactions.
    item_feat_df, user_feat_df, sport_feat_df, sport_onehot_df:
        Dataframes of features.
    """
    np.random.seed(11)

    # User, item and sport features
    item_feat_df = read_data(item_feat_path)
    user_feat_df = read_data(user_feat_path)
    sport_feat_df = read_data(sport_feat_path)
    sport_onehot_df = read_data(sport_onehot_path)

    # User-item interaction. We allow direct df instead of path: check which was passed.
    if isinstance(train_path, str):
        user_item_train = read_data(train_path)
    elif isinstance(train_path, pd.DataFrame):
        user_item_train = train_path
    else:
        raise TypeError(
            f'Type of {train_path} not recognized. Should be str or pd.DataFrame'
        )
    if isinstance(test_path, str):
        user_item_test = read_data(test_path)
    elif isinstance(test_path, pd.DataFrame):
        user_item_test = test_path
    else:
        raise TypeError(
            f'Type of {test_path} not recognized. Should be str or pd.DataFrame'
        )

    if days_of_purchases < 710:
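        # Keep every click, but drop purchases older than `days_of_purchases` days
        # (counted back from the most recent date in the train set).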
        most_recent_date = datetime.strptime(max(user_item_train.hit_date),
                                             '%Y-%m-%d')
        limit_date = datetime.strftime(
            (most_recent_date - timedelta(days=int(days_of_purchases))),
            format='%Y-%m-%d')
        user_item_train = user_item_train[(
            user_item_train.hit_date >= limit_date) |
                                          (user_item_train.buy == 0)]

    if days_of_clicks < 710:
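        # Keep every purchase, but drop clicks older than `days_of_clicks` days.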
        most_recent_date = datetime.strptime(max(user_item_train.hit_date),
                                             '%Y-%m-%d')
        limit_date = datetime.strftime(
            (most_recent_date - timedelta(days=int(days_of_clicks))),
            format='%Y-%m-%d')
        user_item_train = user_item_train[(
            user_item_train.hit_date >= limit_date) |
                                          (user_item_train.buy == 1)]

    if lifespan_of_items < days_of_purchases:
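        # Keep only items with at least one interaction within the last `lifespan_of_items` days.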
        most_recent_date = datetime.strptime(max(user_item_train.hit_date),
                                             '%Y-%m-%d')
        limit_date = datetime.strftime(
            (most_recent_date - timedelta(days=int(lifespan_of_items))),
            format='%Y-%m-%d')
        item_list = user_item_train[user_item_train.hit_date >= limit_date][
            'SPECIFIC ITEM IDENTIFIER'].unique()
        user_item_train = user_item_train[
            user_item_train['SPECIFIC ITEM IDENTIFIER'].isin(item_list)]

    if remove > 0:
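        # Randomly keep a (1 - remove) fraction of users, in both the train and test sets.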
        ctm_list = user_item_train[ctm_id_type].unique()
        np.random.shuffle(ctm_list)
        ctm_list = ctm_list[:int(len(ctm_list) * (1 - remove))]
        user_item_train = user_item_train[user_item_train[ctm_id_type].isin(
            ctm_list)]
        user_item_test = user_item_test[user_item_test[ctm_id_type].isin(
            ctm_list)]

    if remove == 0:
        # Make sure that if no observations were removed by days of clicks / purchases, no user is only in test set
        user_item_test = user_item_test[user_item_test[ctm_id_type].isin(
            user_item_train[ctm_id_type].unique())]

    if item_id_type == 'GENERAL ITEM IDENTIFIER':
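        # Attach the general (family-level) identifier to each specific item via a left merge.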
        user_item_train = user_item_train.merge(item_feat_df[[
            'SPECIFIC ITEM IDENTIFIER', 'GENERAL ITEM IDENTIFIER'
        ]].drop_duplicates(),
                                                how='left',
                                                on='SPECIFIC ITEM IDENTIFIER')
        user_item_test = user_item_test.merge(item_feat_df[[
            'SPECIFIC ITEM IDENTIFIER', 'GENERAL ITEM IDENTIFIER'
        ]].drop_duplicates(),
                                              how='left',
                                              on='SPECIFIC ITEM IDENTIFIER')
        assert user_item_train['GENERAL ITEM IDENTIFIER'].isna().sum() == 0
        assert user_item_test['GENERAL ITEM IDENTIFIER'].isna().sum() == 0

    # Item-sport interaction
    item_sport_interaction = read_data(item_sport_path)
    if lifespan_of_items < days_of_purchases:
        item_sport_interaction = item_sport_interaction[
            item_sport_interaction['SPECIFIC ITEM IDENTIFIER'].isin(item_list)]
    if item_id_type == 'GENERAL ITEM IDENTIFIER':
        item_sport_interaction = item_sport_interaction.merge(
            item_feat_df[[
                'SPECIFIC ITEM IDENTIFIER', 'GENERAL ITEM IDENTIFIER'
            ]],
            how='left',
            on='SPECIFIC ITEM IDENTIFIER')
    # Drop duplicate rows (relevant when item_id_type is not the specific item identifier)
    item_sport_interaction.drop_duplicates(inplace=True)

    # User-sport interaction
    user_sport_interaction = read_data(user_sport_path)
    if remove > 0:
        user_sport_interaction = user_sport_interaction[
            user_sport_interaction[ctm_id_type].isin(ctm_list)]

    # Sport-sportgroups interaction
    sport_sportg_interaction = read_data(sport_sportg_path)

    if report_model_coverage:
        train_users = user_item_train[ctm_id_type].unique().tolist()
        test_users = user_item_test[ctm_id_type].unique().tolist()
        sport_users = user_sport_interaction[ctm_id_type].unique().tolist()
        unseen_users = [uid for uid in test_users if uid not in train_users]
        print(f'There are {len(unseen_users)} users with no interactions')
        train_users.extend(sport_users)
        unseen_users = [uid for uid in test_users if uid not in train_users]
        print(f'and {len(unseen_users)} of them also have no associated sport')
        print(f'out of {len(test_users)}')

    return user_item_train, user_item_test, item_sport_interaction, user_sport_interaction, \
           sport_sportg_interaction, item_feat_df, user_feat_df, sport_feat_df, sport_onehot_df
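
A hypothetical call, assuming the csv files live under a local data/ folder; the paths and parameter values below are placeholders, not taken from the repository. The unpacking order follows the return statement above.

(user_item_train, user_item_test,
 item_sport_interaction, user_sport_interaction, sport_sportg_interaction,
 item_feat_df, user_feat_df, sport_feat_df, sport_onehot_df) = format_dfs(
    train_path='data/train_interactions.csv',
    test_path='data/test_interactions.csv',
    item_sport_path='data/item_sport.csv',
    user_sport_path='data/user_sport.csv',
    sport_sportg_path='data/sport_sportgroup.csv',
    item_feat_path='data/item_features.csv',
    user_feat_path='data/user_features.csv',
    sport_feat_path='data/sport_features.csv',
    sport_onehot_path='data/sport_onehot.csv',
    remove=0.5,                 # keep 50% of users for faster experimentation
    days_of_purchases=365,      # only purchases from the last year
    days_of_clicks=180,         # only clicks from the last 6 months
    lifespan_of_items=365,      # drop items without recent interactions
    report_model_coverage=True,
)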