示例#1
0
def main():
    """ Train and evaluate the IRGAN recommender on PubMed or EconBiz data """
    # Per-dataset settings: (tsv path, first test year, pruning min_count)
    dataset_config = {
        'pub': ('/data21/lgalke/datasets/citations_pmc.tsv', 2011, 50),
        'eco': ('/data21/lgalke/datasets/econbiz62k.tsv', 2012, 1),
    }

    print("Loading pre-trained embedding", W2V_PATH)
    vectors = KeyedVectors.load_word2vec_format(W2V_PATH, binary=W2V_IS_BINARY)

    # Condition the model on paper titles via pre-trained word embeddings
    conditions = ConditionList([
        ('title', PretrainedWordEmbeddingCondition(vectors, dim=0)),
    ])

    parser = argparse.ArgumentParser()
    parser.add_argument('data', type=str, choices=['pub', 'eco'])
    args = parser.parse_args()
    path, c_year, min_cnt = dataset_config[args.data]
    logfile = '/data22/ivagliano/test-irgan/' + args.data + '-decoder.log'
    bags = Bags.load_tabcomma_format(path)

    evaluate = Evaluation(bags,
                          year=c_year,
                          logfile=logfile).setup(min_count=min_cnt,
                                                 min_elements=2)
    # IRGAN needs the user/item counts of the pruned dataset up front
    n_users = evaluate.train_set.size()[0] + evaluate.test_set.size()[0]
    n_items = evaluate.train_set.size()[1]
    models = [IRGANRecommender(n_users, n_items, g_epochs=1, d_epochs=1,
                               n_epochs=1, conditions=conditions)]
    evaluate(models)
示例#2
0
def main(year, min_count=None, outfile=None):
    """ Main function for training and evaluating AAE methods on IREON data

    :param year: first year of the testing set
    :param min_count: pruning parameter forwarded to Evaluation.setup
    :param outfile: path of the log file (opened in append mode)
    """
    # In cleaning mode, only preprocess the raw dump and exit early
    if CLEAN:  # idiomatic truthiness test instead of '== True'
        print("Loading data from", DATA_PATH)
        papers = load(DATA_PATH)
        print("Cleaning data...")
        clean(CLEAN_DATA_PATH, papers)
        print("Clean data in {}".format(CLEAN_DATA_PATH))
        return

    print("Loading data from", CLEAN_DATA_PATH)
    papers = load(CLEAN_DATA_PATH)
    print("Unpacking IREON data...")
    bags_of_papers, ids, side_info = unpack_papers(papers)
    del papers  # free the raw records; only the Bags object is needed below
    bags = Bags(bags_of_papers, ids, side_info)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)

    evaluation = Evaluation(bags, year, logfile=outfile)
    evaluation.setup(min_count=min_count, min_elements=2)
    print("Loading pre-trained embedding", W2V_PATH)

    # First pass: models that only see the partial citation/label list
    with open(outfile, 'a') as fh:
        print("~ Partial List", "~" * 42, file=fh)
    evaluation(BASELINES + RECOMMENDERS)

    # Second pass: models additionally conditioned on titles
    with open(outfile, 'a') as fh:
        print("~ Partial List + Titles", "~" * 42, file=fh)
    evaluation(TITLE_ENHANCED)
示例#3
0
def main(outfile=None, min_count=None, aggregate=None):
    """ Main function for training and evaluating AAE methods on MDP data

    :param outfile: path of the log file (appended to by log())
    :param min_count: pruning parameter forwarded to prepare_evaluation
    :param aggregate: acts as an on/off flag; any non-None value selects the
        fixed metadata-field list ['artist_name', 'track_name', 'album_name']
    """
    print("Loading data from", DATA_PATH)
    playlists = playlists_from_slices(DATA_PATH, n_jobs=4)
    print("Unpacking json data...")
    if aggregate is not None:
        # NOTE(review): the passed-in value is discarded; any non-None
        # argument selects this fixed field set -- confirm this is intended
        aggregate = ['artist_name', 'track_name', 'album_name']
        print("Using aggregated metadata {}".format(aggregate))
    else:
        # BUGFIX: was '"Aggrgate={}".fomat(aggregate)', which raised
        # AttributeError ('fomat') whenever aggregate was None
        print("Aggregate={}".format(aggregate))
        print("Using title only")
    bags_of_tracks, pids, side_info = unpack_playlists(playlists, aggregate)
    del playlists  # raw slices no longer needed
    bags = Bags(bags_of_tracks, pids, side_info)
    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)
    train_set, dev_set, y_test = prepare_evaluation(bags,
                                                    n_items=N_ITEMS,
                                                    min_count=min_count)

    log("Train set:", logfile=outfile)
    log(train_set, logfile=outfile)

    log("Dev set:", logfile=outfile)
    log(dev_set, logfile=outfile)

    # THE GOLD (put into sparse matrix)
    y_test = lists2sparse(y_test, dev_set.size(1)).tocsr(copy=False)

    # the known items in the test set, just to not recompute
    x_test = lists2sparse(dev_set.data, dev_set.size(1)).tocsr(copy=False)

    for model in MODELS:
        log('=' * 78, logfile=outfile)
        log(model, logfile=outfile)

        # Training
        model.train(train_set)

        # Prediction
        y_pred = model.predict(dev_set)

        # Sanity-fix #1, make sparse stuff dense, expect array
        if sp.issparse(y_pred):
            y_pred = y_pred.toarray()
        else:
            y_pred = np.asarray(y_pred)

        # Sanity-fix, remove predictions for already present items
        y_pred = remove_non_missing(y_pred, x_test, copy=False)

        # Evaluate metrics
        results = evaluate(y_test, y_pred, METRICS, batch_size=1000)

        log("-" * 78, logfile=outfile)
        for metric, stats in zip(METRICS, results):
            log("* {}: {} ({})".format(metric, *stats), logfile=outfile)

        log('=' * 78, logfile=outfile)
示例#4
0
def main(year, min_count=None, outfile=None, drop=1):
    """ Main function for training and evaluating AAE methods on IREON data

    :param year: first year of the testing set
    :param min_count: pruning parameter forwarded to Evaluation.setup
    :param outfile: path of the log file (opened in append mode)
    :param drop: drop parameter forwarded to Evaluation.setup
    """
    # In cleaning mode, only preprocess the raw dump and exit early
    if CLEAN:  # idiomatic truthiness test instead of '== True'
        print("Loading data from", DATA_PATH)
        papers = load(DATA_PATH)
        print("Cleaning data...")
        clean(CLEAN_DATA_PATH, papers)
        print("Clean data in {}".format(CLEAN_DATA_PATH))
        return

    print("Loading data from", CLEAN_DATA_PATH)
    papers = load(CLEAN_DATA_PATH)
    print("Unpacking IREON data...")
    # bags_of_papers, ids, side_info = unpack_papers(papers)
    bags_of_papers, ids, side_info = unpack_papers_conditions(papers)
    del papers  # free the raw records
    bags = Bags(bags_of_papers, ids, side_info)
    # NOTE(review): 'args' is a module-level global here, not a parameter
    if args.compute_mi:
        from aaerec.utils import compute_mutual_info
        print("[MI] Dataset: IREON (fiv)")
        print("[MI] min Count:", min_count)
        tmp = bags.build_vocab(min_count=min_count, max_features=None)
        mi = compute_mutual_info(tmp,
                                 conditions=None,
                                 include_labels=True,
                                 normalize=True)
        with open('mi.csv', 'a') as mifile:
            print('IREON', min_count, mi, sep=',', file=mifile)
        print("=" * 78)
        exit(0)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)

    evaluation = Evaluation(bags, year, logfile=outfile)
    evaluation.setup(min_count=min_count, min_elements=2, drop=drop)

    # Use only partial citations/labels list (no additional metadata)
    with open(outfile, 'a') as fh:
        print("~ Partial List", "~" * 42, file=fh)
    evaluation(BASELINES + RECOMMENDERS)
    # Use additional metadata (as defined in CONDITIONS for all models but SVD, which uses only titles)
    with open(outfile, 'a') as fh:
        print("~ Conditioned Models", "~" * 42, file=fh)
    evaluation(CONDITIONED_MODELS)
示例#5
0
def main(year, dataset, min_count=None, outfile=None, drop=1):
    """ Main function for training and evaluating AAE methods on DBLP data """
    # ACM data lives in a single file, DBLP in a directory of json slices
    if dataset == "dblp":
        path = DATA_PATH + "dblp-ref/"
    else:
        path = DATA_PATH + "acm.txt"
    print("Loading data from", path)
    papers = papers_from_files(path, dataset, n_jobs=4)
    print("Unpacking {} data...".format(dataset))
    bags_of_papers, ids, side_info = unpack_papers(papers)
    del papers
    bags = Bags(bags_of_papers, ids, side_info)

    # Optional mutual-information computation; process exits afterwards
    if args.compute_mi:
        from aaerec.utils import compute_mutual_info
        print("[MI] Dataset:", dataset)
        print("[MI] min Count:", min_count)
        vocab_bags = bags.build_vocab(min_count=min_count, max_features=None)
        mi = compute_mutual_info(vocab_bags, conditions=None,
                                 include_labels=True, normalize=True)
        with open('mi.csv', 'a') as mifile:
            print(dataset, min_count, mi, sep=',', file=mifile)

        print("=" * 78)
        exit(0)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)

    evaluation = Evaluation(bags, year, logfile=outfile)
    evaluation.setup(min_count=min_count, min_elements=2, drop=drop)

    # To evaluate the baselines and the recommenders without metadata (or just the recommenders without metadata)
    # with open(outfile, 'a') as fh:
    #     print("~ Partial List", "~" * 42, file=fh)
    # evaluation(BASELINES + RECOMMENDERS)
    # evaluation(RECOMMENDERS, batch_size=1000)

    with open(outfile, 'a') as fh:
        print("~ Partial List + Titles + Author + Venue", "~" * 42, file=fh)
    # To evaluate SVD with titles
    # evaluation(TITLE_ENHANCED)
    evaluation(CONDITIONED_MODELS, batch_size=1000)
示例#6
0
def main():
    """ Evaluates the VAE Recommender """
    # Per-dataset settings: (tsv path, first test year, pruning min_count)
    CONFIG = {
        'pub': ('/data21/lgalke/datasets/citations_pmc.tsv', 2011, 50),
        'eco': ('/data21/lgalke/datasets/econbiz62k.tsv', 2012, 1)
    }

    parser = argparse.ArgumentParser()
    parser.add_argument('data', type=str, choices=['pub', 'eco'])
    args = parser.parse_args()
    path, c_year, min_cnt = CONFIG[args.data]
    logfile = '/data22/ivagliano/test-vae/' + args.data + '-hyperparams-opt.log'
    bags = Bags.load_tabcomma_format(path)

    evaluate = Evaluation(bags, year=c_year,
                          logfile=logfile).setup(min_count=min_cnt,
                                                 min_elements=2)
    print("Loading pre-trained embedding", W2V_PATH)
    vectors = KeyedVectors.load_word2vec_format(W2V_PATH, binary=W2V_IS_BINARY)

    # Fixed hyper-parameters shared by every model in the grid
    params = {
        #'n_epochs': 10,
        'batch_size': 100,
        'optimizer': 'adam',
        # 'normalize_inputs': True,
    }

    conditions = ConditionList([('title',
                                 PretrainedWordEmbeddingCondition(vectors))])

    # 100 hidden units, 200 epochs, bernoulli prior, normalized inputs -> 0.174
    # activations = ['ReLU','SELU']
    # lrs = [(0.001, 0.0005), (0.001, 0.001)]
    hcs = [(100, 50), (300, 100)]
    epochs = [50, 100, 200, 500]

    # dropouts = [(.2,.2), (.1,.1), (.1, .2), (.25, .25), (.3,.3)] # .2,.2 is best
    # priors = ['categorical'] # gauss is best
    # normal = [True, False]
    # bernoulli was good, letz see if categorical is better... No
    # Grid over (n_hidden, n_code) pairs and epoch counts
    models = []
    for n_hidden, n_code in hcs:
        for n_epochs in epochs:
            models.append(VAERecommender(conditions=conditions,
                                         **params,
                                         n_hidden=n_hidden,
                                         n_code=n_code,
                                         n_epochs=n_epochs))
    # models = [VAERecommender(conditions=CONDITIONS, **params)]
    evaluate(models)
示例#7
0
def main(outfile=None, min_count=None):
    """ Main function for training and evaluating AAE methods on Reuters data """
    print("Loading data from", DATA_PATH)
    bags = Bags.load_tabcomma_format(DATA_PATH, unique=True)
    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)
    train_set, dev_set, y_test = prepare_evaluation(bags,
                                                    min_count=min_count)

    # Log the split sizes for later inspection
    for header, split in (("Train set:", train_set), ("Dev set:", dev_set)):
        log(header, logfile=outfile)
        log(split, logfile=outfile)

    # THE GOLD (put into sparse matrix)
    y_test = lists2sparse(y_test, dev_set.size(1)).tocsr(copy=False)

    # the known items in the test set, just to not recompute
    x_test = lists2sparse(dev_set.data, dev_set.size(1)).tocsr(copy=False)

    for model in MODELS:
        log('=' * 78, logfile=outfile)
        log(model, logfile=outfile)

        model.train(train_set)           # training
        y_pred = model.predict(dev_set)  # prediction on the dev fold

        # Densify sparse predictions; otherwise coerce to ndarray
        y_pred = y_pred.toarray() if sp.issparse(y_pred) else np.asarray(y_pred)

        # Drop items the user already has, then score
        y_pred = remove_non_missing(y_pred, x_test, copy=False)
        results = evaluate(y_test, y_pred, METRICS)

        log("-" * 78, logfile=outfile)
        for metric, stats in zip(METRICS, results):
            log("* {}: {} ({})".format(metric, *stats), logfile=outfile)

        log('=' * 78, logfile=outfile)
示例#8
0
def main(year, dataset, min_count=None, outfile=None):
    """ Main function for training and evaluating AAE methods on DBLP data """
    # ACM comes as one text file, DBLP as a folder of json references
    if dataset == "dblp":
        path = DATA_PATH + "dblp-ref/"
    else:
        path = DATA_PATH + "acm.txt"
    print("Loading data from", path)
    papers = papers_from_files(path, dataset, n_jobs=4)
    print("Unpacking {} data...".format(dataset))
    bags_of_papers, ids, side_info = unpack_papers(papers)
    del papers
    bags = Bags(bags_of_papers, ids, side_info)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)

    evaluation = Evaluation(bags, year, logfile=outfile)
    evaluation.setup(min_count=min_count, min_elements=2)
    print("Loading pre-trained embedding", W2V_PATH)

    # First pass: models that only see the partial citation list
    with open(outfile, 'a') as fh:
        print("~ Partial List", "~" * 42, file=fh)
    evaluation(BASELINES + RECOMMENDERS)

    # Second pass: models additionally conditioned on titles
    with open(outfile, 'a') as fh:
        print("~ Partial List + Titles", "~" * 42, file=fh)
    evaluation(TITLE_ENHANCED)
示例#9
0
# Path to the pre-trained GoogleNews word2vec vectors (binary format)
W2V_PATH = "/data21/lgalke/vectors/GoogleNews-vectors-negative300.bin.gz"
W2V_IS_BINARY = True

# Command-line interface: dataset path, test-split year, pruning, log file
PARSER = argparse.ArgumentParser()
PARSER.add_argument('dataset', type=str, help='path to dataset')
PARSER.add_argument('year', type=int, help='First year of the testing set.')
PARSER.add_argument('-m',
                    '--min-count',
                    type=int,
                    help='Pruning parameter',
                    default=50)
PARSER.add_argument('-o', '--outfile', type=str, default=None)

ARGS = PARSER.parse_args()

# Load the dataset and set up the year-based train/test evaluation
DATASET = Bags.load_tabcomma_format(ARGS.dataset, unique=True)

EVAL = Evaluation(DATASET, ARGS.year, logfile=ARGS.outfile)
EVAL.setup(min_count=ARGS.min_count, min_elements=2)
print("Loading pre-trained embedding", W2V_PATH)
VECTORS = KeyedVectors.load_word2vec_format(W2V_PATH, binary=W2V_IS_BINARY)

# Baseline models that do not use any metadata
BASELINES = [
    # RandomBaseline(),
    # MostPopular(),
    Countbased(),
    SVDRecommender(1000, use_title=False),
]

ae_params = {
    'n_code': 50,
示例#10
0
def main(outfile=None, min_count=None, aggregate=None):
    """ Main function for training and evaluating AAE methods on MDP data

    :param outfile: path of the log file (appended to by log())
    :param min_count: pruning parameter forwarded to prepare_evaluation
    :param aggregate: NOTE(review) accepted but never used in this variant;
        unpack_playlists_for_models_concatenated is called without it -- confirm
    """
    print("Loading data from", DATA_PATH)
    playlists = playlists_from_slices(DATA_PATH, n_jobs=4)
    print("Unpacking json data...")
    bags_of_tracks, pids, side_info = unpack_playlists_for_models_concatenated(
        playlists)

    del playlists  # raw playlist slices no longer needed
    bags = Bags(data=bags_of_tracks, owners=pids, owner_attributes=side_info)
    # NOTE(review): 'args' is a module-level global here, not a parameter
    if args.compute_mi:
        from sklearn.metrics import mutual_info_score
        print("Computing MI")
        X = bags.build_vocab(min_count=args.min_count,
                             max_features=None).tocsr()
        # Item co-occurrence (contingency) matrix for the MI computation
        C = X.T @ X
        print("(Pairwise) mutual information:",
              mutual_info_score(None, None, contingency=C))
        # Exit in this case
        print("Bye.")
        exit(0)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)
    train_set, dev_set, y_test = prepare_evaluation(bags,
                                                    n_items=N_ITEMS,
                                                    min_count=min_count)

    log("Train set:", logfile=outfile)
    log(train_set, logfile=outfile)

    log("Dev set:", logfile=outfile)
    log(dev_set, logfile=outfile)

    # THE GOLD (put into sparse matrix)
    y_test = lists2sparse(y_test, dev_set.size(1)).tocsr(copy=False)

    # the known items in the test set, just to not recompute
    x_test = lists2sparse(dev_set.data, dev_set.size(1)).tocsr(copy=False)

    for model in MODELS:
        log('=' * 78, logfile=outfile)
        log(model, logfile=outfile)
        log(model.model_params, logfile=outfile)

        # Training
        model.train(train_set)
        print("training finished")

        # Prediction
        y_pred = model.predict(dev_set)
        print("prediction finished")

        print(" prediction sparse?:", sp.issparse(y_pred))
        # Sanity-fix #1, make sparse stuff dense, expect array
        if sp.issparse(y_pred):
            y_pred = y_pred.toarray()
        else:
            y_pred = np.asarray(y_pred)

        print("remove non-missing:")
        # Sanity-fix, remove predictions for already present items
        y_pred = remove_non_missing(y_pred, x_test, copy=False)

        print("evaluate:")
        # Evaluate metrics
        results = evaluate(y_test, y_pred, METRICS, batch_size=500)

        print("metrics: ")
        log("-" * 78, logfile=outfile)
        for metric, stats in zip(METRICS, results):
            log("* {}: {} ({})".format(metric, *stats), logfile=outfile)

        log('=' * 78, logfile=outfile)
示例#11
0
        elif dataset == "swp":
            text = "labels"
        else:
            text = "tracks"
        print("Generating {} distribution".format(text))
        citations = paper_by_n_citations(citations)

    print("Unpacking {} data...".format(dataset))
    if dataset == "acm" or dataset == "dblp":
        bags_of_papers, ids, side_info = unpack_papers(papers)
    elif dataset == "mpd":
        # not bags_of_papers but bags_of_tracks
        bags_of_papers, ids, side_info = unpack_playlists(papers)
    else:
        bags_of_papers, ids, side_info = unpack_papers_fiv(papers)
    bags = Bags(bags_of_papers, ids, side_info)

else:
    print("Loading {}".format(path))
    df = pd.read_csv(path, sep="\t", dtype=str, error_bad_lines=False)
    # replace nan with empty string
    df = df.replace(np.nan, "", regex=True)

    citations = generate_citations(df)
    print("Generating {} distribution".format("citations" if dataset ==
                                              "pubmed" else "occurrences"))
    citations = paper_by_n_citations(citations)

    set_cnts = set_count(df)

    print("Unpacking {} data...".format(dataset))
示例#12
0
import argparse  # BUGFIX: required by argparse.ArgumentParser() below

import numpy as np
from sklearn.metrics import mutual_info_score

from aaerec.datasets import Bags
from aaerec.condition import ConditionList, CountCondition
from aaerec.utils import compute_mutual_info

# Command-line interface: dataset path plus vocabulary pruning knobs
PARSER = argparse.ArgumentParser()
PARSER.add_argument('dataset', type=str,
                    help='path to dataset')
PARSER.add_argument('-m', '--min-count', type=int,
                    help='Pruning parameter', default=None)
PARSER.add_argument('-M', '--max-features', type=int,
                    help='Max features', default=None)
ARGS = PARSER.parse_args()


# MI_CONDITIONS = ConditionList([('title', CountCondition(max_features=100000))])
MI_CONDITIONS = None  # no metadata conditions: compute MI on labels only

print("Computing Mutual Info with args")
print(ARGS)

# With no metadata or just titles
BAGS = Bags.load_tabcomma_format(ARGS.dataset, unique=True)\
    .build_vocab(min_count=ARGS.min_count, max_features=ARGS.max_features)

# Append the result as one CSV row: dataset tag, pruning level, MI value
mi = compute_mutual_info(BAGS, MI_CONDITIONS, include_labels=True, normalize=True)
with open('mi.csv', 'a') as mifile:
    print('CITREC', ARGS.min_count, mi, sep=',', file=mifile)
示例#13
0
def main(outfile=None, min_count=None, drop=1):
    """ Main function for training and evaluating AAE methods on Reuters data

    :param outfile: path of the log file (appended to by log())
    :param min_count: pruning parameter forwarded to prepare_evaluation
    :param drop: drop parameter forwarded to prepare_evaluation
    """
    print("Loading data from", DATA_PATH)
    bags = Bags.load_tabcomma_format(DATA_PATH, unique=True)
    # NOTE(review): 'args' is a module-level global here, not a parameter
    if args.compute_mi:
        from aaerec.utils import compute_mutual_info
        print("[MI] Dataset: Reuters")
        print("[MI] min Count:", min_count)
        tmp = bags.build_vocab(min_count=min_count, max_features=None)
        mi = compute_mutual_info(tmp,
                                 conditions=None,
                                 include_labels=True,
                                 normalize=True)
        # Append one CSV row per run: dataset tag, pruning level, MI value
        with open('mi.csv', 'a') as mifile:
            print('Reuters', min_count, mi, sep=',', file=mifile)
        print("=" * 78)
        exit(0)
    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)
    train_set, dev_set, y_test = prepare_evaluation(bags,
                                                    min_count=min_count,
                                                    drop=drop)

    log("Train set:", logfile=outfile)
    log(train_set, logfile=outfile)

    log("Dev set:", logfile=outfile)
    log(dev_set, logfile=outfile)

    # THE GOLD (put into sparse matrix)
    y_test = lists2sparse(y_test, dev_set.size(1)).tocsr(copy=False)

    # the known items in the test set, just to not recompute
    x_test = lists2sparse(dev_set.data, dev_set.size(1)).tocsr(copy=False)

    for model in MODELS:
        log('=' * 78, logfile=outfile)
        log(model, logfile=outfile)

        # Training
        model.train(train_set)

        # Prediction
        y_pred = model.predict(dev_set)

        # Sanity-fix #1, make sparse stuff dense, expect array
        if sp.issparse(y_pred):
            y_pred = y_pred.toarray()
        else:
            y_pred = np.asarray(y_pred)

        # Sanity-fix, remove predictions for already present items
        y_pred = remove_non_missing(y_pred, x_test, copy=False)

        # Evaluate metrics
        results = evaluate(y_test, y_pred, METRICS)

        log("-" * 78, logfile=outfile)
        for metric, stats in zip(METRICS, results):
            log("* {}: {} ({})".format(metric, *stats), logfile=outfile)

        log('=' * 78, logfile=outfile)
示例#14
0
import numpy as np
from aaerec.datasets import Bags
# path = '../Data/Economics/econbiz62k.tsv'
path = '../Data/PMC/citations_pmc.tsv'
bags = Bags.load_tabcomma_format(path, unique=True)
bags = bags.build_vocab(apply=True)

# Sparse document-item matrix; each nonzero entry is one rating/citation
csr = bags.tocsr()
print("N ratings:", csr.sum())

# Marginal sums: documents per item (columns) and items per document (rows).
# NOTE(review): compute_stats below indexes the median as [0,0], which
# presumes these are 1xN np.matrix objects -- verify against Bags.tocsr()
column_sums = csr.sum(0).flatten()
row_sums = csr.sum(1).flatten()

print(column_sums.shape)
print(row_sums.shape)


# Template for the summary statistics printed below
FMT = "N={}, Min={}, Max={} Median={}, Mean={}, Std={}"

def compute_stats(A):
    """Return (N, min, max, median, mean, std) summary stats for a 1xN matrix."""
    n_cols = A.shape[1]
    # A is a 1xN np.matrix, so the axis-1 median is a 1x1 matrix
    med = np.median(A, axis=1)[0, 0]
    return n_cols, A.min(), A.max(), med, A.mean(), A.std()


# Row sums = items per document; column sums = documents per item
print("Items per document")
print(FMT.format(*compute_stats(row_sums)))
print("Documents per item")
print(FMT.format(*compute_stats(column_sums)))

示例#15
0
        else:
            text = "tracks"
        print("Generating {} distribution".format(text))
        citations = paper_by_n_citations(citations)

    print("Unpacking {} data...".format(dataset))
    if dataset == "acm" or dataset == "dblp":
        bags_of_papers, ids, side_info = unpack_papers(papers)
    elif dataset == "mpd":
        # not bags_of_papers but bags_of_tracks
        bags_of_papers, ids, side_info = unpack_playlists(papers)
    elif dataset == "swp":
        bags_of_papers, ids, side_info = unpack_papers_fiv(papers)
    else:
        bags_of_papers, ids, side_info = unpack_papers_econis(papers)
    bags = Bags(bags_of_papers, ids, side_info)

else:
    print("Loading {}".format(path))
    df = pd.read_csv(path, sep="\t", dtype=str, error_bad_lines=False)
    # replace nan with empty string
    df = df.replace(np.nan, "", regex=True)

    citations = generate_citations(df, dataset)
    print("Generating {} distribution".format("citations" if dataset ==
                                              "pubmed" else "occurrences"))
    citations = paper_by_n_citations(citations)

    set_cnts = set_count(df, dataset)

    print("Unpacking {} data...".format(dataset))
def main():
    """Build and train the model selected on the command line, then write a
    playlist-continuation submission file for the MPD challenge."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model',
        type=str,
        # BUGFIX: keyword was misspelled 'defaule', which made add_argument
        # raise a TypeError before any arguments were parsed
        default='aae',
        # All possible method should appear here
        choices=['cm', 'svd', 'ae', 'aae', 'mlp'],
        help="Specify the model to use [aae]")
    parser.add_argument('--epochs',
                        type=int,
                        default=20,
                        # BUGFIX: bracketed default in help said [50]
                        help="Specify the number of training epochs [20]")
    parser.add_argument('--hidden',
                        type=int,
                        default=200,
                        # BUGFIX: bracketed default in help said [100]
                        help="Number of hidden units [200]")
    parser.add_argument('--no-title',
                        action='store_false',
                        default=True,
                        dest='use_title',
                        help="Do not use the playlist titles")
    parser.add_argument('--max-items',
                        type=int,
                        default=75000,
                        help="Limit the max number of considered items")
    parser.add_argument(
        '--vocab-size',
        type=int,
        default=50000,
        help="Limit the max number of distinct condition words")
    parser.add_argument('-j',
                        '--jobs',
                        type=int,
                        default=4,
                        help="Number of jobs for data loading [4].")
    parser.add_argument('-o',
                        '--outfile',
                        default="submission.csv",
                        type=str,
                        help="Write submissions to this path")
    parser.add_argument('--use-embedding',
                        default=False,
                        action='store_true',
                        help="Use embedding (SGNS GoogleNews) [false]")
    parser.add_argument('--dont-aggregate',
                        action='store_false',
                        dest='aggregate',
                        default=True,
                        help="Aggregate track metadata as side info input")
    parser.add_argument('--debug',
                        action='store_true',
                        default=False,
                        help="Activate debug mode, run only on small sample")
    parser.add_argument(
        '-x',
        '--exclude',
        type=argparse.FileType('r'),
        default=None,
        help="Path to file with slice filenames to exclude for training")
    parser.add_argument(
        '--dev',
        type=str,
        default=None,
        help='Path to dev set, use in combination with (-x, --exclude)')
    parser.add_argument('--no-idf',
                        action='store_false',
                        default=True,
                        dest='use_idf',
                        help="Do **not** use idf re-weighting")
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help="Initial learning rate [0.001]")
    parser.add_argument('--code',
                        type=int,
                        default=100,
                        # BUGFIX: bracketed default in help said [50]
                        help="Code dimension [100]")
    args = parser.parse_args()

    # Either exclude and dev set, or no exclude and test set
    assert (args.dev is None) == (args.exclude is None)
    if args.dev is not None:
        print("Making submission for dev set:", args.dev)
        assert os.path.isfile(args.dev)

    # Dump args into submission file
    if os.path.exists(args.outfile) and \
            input("Path '{}' exists. Overwrite? [y/N]"
                  .format(args.outfile)) != 'y':
        exit(-1)

    with open(args.outfile, 'w') as out:
        print('#', args, file=out)

    if args.use_embedding:
        print("Loading embedding:", W2V_PATH)
        vectors = KeyedVectors.load_word2vec_format(W2V_PATH,
                                                    binary=W2V_IS_BINARY)
    else:
        vectors = None

    # Create the model as specified by command line args
    # Count-based never uses title
    # Decoding recommender always uses title

    tfidf_params = {'max_features': args.vocab_size, 'use_idf': args.use_idf}

    model = {
        'cm':
        Countbased(),
        'svd':
        SVDRecommender(use_title=args.use_title),
        'ae':
        AAERecommender(use_title=args.use_title,
                       adversarial=False,
                       n_hidden=args.hidden,
                       n_code=args.code,
                       n_epochs=args.epochs,
                       embedding=vectors,
                       lr=args.lr,
                       tfidf_params=tfidf_params),
        'aae':
        AAERecommender(
            use_title=args.use_title,
            adversarial=True,
            n_hidden=args.hidden,
            n_code=args.code,
            n_epochs=args.epochs,
            gen_lr=args.lr,
            reg_lr=args.lr,  # same gen and reg lrs
            embedding=vectors,
            tfidf_params=tfidf_params),
        'mlp':
        DecodingRecommender(n_epochs=args.epochs,
                            n_hidden=args.hidden,
                            embedding=vectors,
                            tfidf_params=tfidf_params)
    }[args.model]

    track_attrs = TRACK_INFO if args.aggregate else None

    if args.exclude is not None:
        # Dev set case, exclude dev set data
        exclude = [line.strip() for line in args.exclude]
    else:
        # Real submission case, do not exclude any training data
        exclude = None

    # = Training =
    print("Loading data from {} using {} jobs".format(DATA_PATH, args.jobs))
    playlists = playlists_from_slices(DATA_PATH,
                                      n_jobs=args.jobs,
                                      debug=args.debug,
                                      without=exclude)
    print("Unpacking playlists")
    train_set = Bags(*unpack_playlists(playlists, aggregate=track_attrs))

    print("Building vocabulary of {} most frequent items".format(
        args.max_items))
    vocab, __counts = train_set.build_vocab(max_features=args.max_items,
                                            apply=False)
    train_set = train_set.apply_vocab(vocab)
    print("Training set:", train_set, sep='\n')

    print("Training for {} epochs".format(args.epochs))
    try:
        model.train(train_set)
    except KeyboardInterrupt:
        print("Training interrupted by keyboard, pass.")

    # Not required anymore
    del train_set

    # = Predictions =
    if args.dev is not None:
        print("Loading and unpacking DEV set")
        data, index2playlist, side_info = unpack_playlists(
            load(args.dev), aggregate=track_attrs)
    else:
        print("Loading and unpacking test set")
        data, index2playlist, side_info = unpack_playlists(
            load(TEST_PATH), aggregate=track_attrs)
    test_set = Bags(data, index2playlist, side_info)
    # Apply same vocabulary as in training
    test_set = test_set.apply_vocab(vocab)
    print("Test set:", test_set, sep='\n')

    pred = model.predict(test_set)
    # Densify sparse predictions; otherwise coerce to ndarray
    if sp.issparse(pred):
        pred = pred.toarray()
    else:
        pred = np.asarray(pred)
    print("Scaling and removing non-missing items")
    pred = remove_non_missing(pred, test_set.tocsr(), copy=False)

    # Map vocabulary indices back to track ids for the submission file
    index2trackid = {v: k for k, v in vocab.items()}
    print("Making submission:", args.outfile)
    make_submission(pred, index2playlist, index2trackid, outfile=args.outfile)
    print("Success.")
    print("Make sure to verify the submission format via", VERIFY_SCRIPT)
示例#17
0
def main(year,
         dataset,
         min_count=None,
         outfile=None,
         drop=1,
         baselines=False,
         autoencoders=False,
         conditioned_autoencoders=False,
         all_metadata=True):
    """ Main function for training and evaluating AAE methods on DBLP data

    :param year: first year of the testing set
    :param dataset: 'dblp' or anything else for the ACM file
    :param min_count: pruning parameter forwarded to Evaluation.setup
    :param outfile: path of the log file (opened in append mode)
    :param drop: drop parameter forwarded to Evaluation.setup
    :param baselines: include count-based / SVD baselines
    :param autoencoders: include unconditioned autoencoder models
    :param conditioned_autoencoders: include metadata-conditioned models
    :param all_metadata: condition on title+venue+author instead of title only

    NOTE(review): VECTORS, AE_PARAMS and 'args' are module-level globals.
    """

    assert baselines or autoencoders or conditioned_autoencoders, "Please specify what to run"

    if all_metadata:
        # V2 - all metadata
        CONDITIONS = ConditionList([
            ('title', PretrainedWordEmbeddingCondition(VECTORS)),
            ('venue', PretrainedWordEmbeddingCondition(VECTORS)),
            (
                'author',
                CategoricalCondition(
                    embedding_dim=32,
                    reduce="sum",  # vocab_size=0.01,
                    sparse=False,
                    embedding_on_gpu=True))
        ])
    else:
        # V1 - only title metadata
        CONDITIONS = ConditionList([
            ('title', PretrainedWordEmbeddingCondition(VECTORS))
        ])
    #### CONDITIONS defined

    ALL_MODELS = []

    if baselines:
        # Models without metadata
        BASELINES = [
            # RandomBaseline(),
            # MostPopular(),
            Countbased(),
            SVDRecommender(1000, use_title=False)
        ]

        ALL_MODELS += BASELINES

        if not all_metadata:
            # SVD can use only titles not generic conditions
            ALL_MODELS += [SVDRecommender(1000, use_title=True)]

    if autoencoders:
        # Unconditioned autoencoder variants (no metadata)
        AUTOENCODERS = [
            AAERecommender(adversarial=False,
                           conditions=None,
                           lr=0.001,
                           **AE_PARAMS),
            AAERecommender(adversarial=True,
                           conditions=None,
                           gen_lr=0.001,
                           reg_lr=0.001,
                           **AE_PARAMS),
            VAERecommender(conditions=None, **AE_PARAMS),
            DAERecommender(conditions=None, **AE_PARAMS)
        ]
        ALL_MODELS += AUTOENCODERS

    if conditioned_autoencoders:
        # Model with metadata (metadata used as set in CONDITIONS above)
        CONDITIONED_AUTOENCODERS = [
            AAERecommender(adversarial=False,
                           conditions=CONDITIONS,
                           lr=0.001,
                           **AE_PARAMS),
            AAERecommender(adversarial=True,
                           conditions=CONDITIONS,
                           gen_lr=0.001,
                           reg_lr=0.001,
                           **AE_PARAMS),
            DecodingRecommender(CONDITIONS,
                                n_epochs=100,
                                batch_size=1000,
                                optimizer='adam',
                                n_hidden=100,
                                lr=0.001,
                                verbose=True),
            VAERecommender(conditions=CONDITIONS, **AE_PARAMS),
            DAERecommender(conditions=CONDITIONS, **AE_PARAMS)
        ]
        ALL_MODELS += CONDITIONED_AUTOENCODERS

    print("Finished preparing models:", *ALL_MODELS, sep='\n\t')

    # ACM data lives in a single file, DBLP in a directory of json slices
    path = DATA_PATH + ("dblp-ref/" if dataset == "dblp" else "acm.txt")
    print("Loading data from", path)
    papers = papers_from_files(path, dataset, n_jobs=4)
    print("Unpacking {} data...".format(dataset))
    bags_of_papers, ids, side_info = unpack_papers(papers)
    del papers
    bags = Bags(bags_of_papers, ids, side_info)
    # Optional mutual-information computation; process exits afterwards
    if args.compute_mi:
        from aaerec.utils import compute_mutual_info
        print("[MI] Dataset:", dataset)
        print("[MI] min Count:", min_count)
        tmp = bags.build_vocab(min_count=min_count, max_features=None)
        mi = compute_mutual_info(tmp,
                                 conditions=None,
                                 include_labels=True,
                                 normalize=True)
        with open('mi.csv', 'a') as mifile:
            print(dataset, min_count, mi, sep=',', file=mifile)

        print("=" * 78)
        exit(0)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)

    evaluation = Evaluation(bags, year, logfile=outfile)
    evaluation.setup(min_count=min_count, min_elements=2, drop=drop)
    with open(outfile, 'a') as fh:
        print("~ Partial List + Titles + Author + Venue", "~" * 42, file=fh)
    evaluation(ALL_MODELS, batch_size=1000)