Example #1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('exclude',
                        type=argparse.FileType('r'),
                        help="Path to exclude file, determines ground truth.")
    parser.add_argument('submission',
                        type=str,
                        help="Path to dev submission file")
    parser.add_argument('-v', '--verbose', default=0, type=int)

    args = parser.parse_args()

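    # The exclude file lists one slice filename per line; strip the trailing newlines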
    dev_slices = [line.strip() for line in args.exclude]
    if args.verbose:
        print("Loading ground truth from", dev_slices)

    ground_truth = playlists_from_slices(DATA_PATH,
                                         only=dev_slices,
                                         verbose=args.verbose)
    # Convert the raw JSON playlists into a dict mapping pid -> list of track URIs
    ground_truth = {
        p['pid']: [t['track_uri'] for t in p['tracks']]
        for p in ground_truth
    }

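    # load_submission presumably returns a dict mapping pid -> ranked list of predicted track URIs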
    predictions = load_submission(args.submission)

    # Verify that pids match
    pids = set(ground_truth.keys())
    pids_pred = set(predictions.keys())
    if not pids_pred:
        print(args.submission, 'is empty.')
        exit(1)
    if args.verbose:
        print(len(pids), "pids in ground truth")
        print(len(pids_pred), "pids in predictions")
        print(len(set.intersection(pids, pids_pred)), "pids in intersection")
    # Super strict: All pids in both are the same
    assert len(pids ^ pids_pred) == 0

    # Less strict: all predicted pids should be also in gold
    assert len(pids_pred - pids) == 0

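    # Aggregate evaluation metrics over the ground-truth pids (500 is presumably the per-playlist cutoff of the challenge format)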
    summary = aggregate_metrics(ground_truth, predictions, 500, pids)

    print(summary)

Example #2
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('slices',
                        type=argparse.FileType('r'),
                        help="Path to file with one slice filename per line")
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        default=None,
                        help="File to put output")
    args = parser.parse_args()

    if args.output is None:
        print("No output file specified, performing a dry run")
    elif os.path.exists(args.output) and \
            input("Path '{}' exists. Overwrite? [y/N]"
                  .format(args.output)) != 'y':
        exit(-1)

    # strip newlines from exclude names
    slices = [s.strip() for s in args.slices]
    print("Creating dev set from slices:", slices)

    playlists = playlists_from_slices(DATA_PATH, only=slices)

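    # corrupt_playlists presumably withholds tracks from each playlist to mimic the incomplete dev queries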
    dev_playlists = corrupt_playlists(playlists)

    dev_set = {
        'date': str(date.today()),
        'version': 'dev set created from: ' + str(slices),
        'playlists': dev_playlists
    }

    # In a dry run (no output file specified), skip writing the dev set
    if args.output is not None:
        with open(args.output, 'w') as fhandle:
            json.dump(dev_set, fhandle)
Example #3
    return p


path = set_path(dataset)

if dataset == "dblp" or dataset == "acm" or dataset == "swp" or dataset == "mpd":
    if dataset != "swp" and dataset != "mpd":
        print("Loading {} dataset".format(dataset.upper()))
        papers = papers_from_files(path, dataset, n_jobs=1)
    elif dataset == "swp":
        print("Loading SWP dataset")
        papers = load(path)
    else:
        print("Loading MPD dataset")
        # actually not papers but playlists
        papers = playlists_from_slices(path, n_jobs=4)

    years, citations, set_cnts = generate_years_citations_set_cnts(
        papers, dataset)

    if dataset != "mpd":
        # only papers from min_year
        years = from_to_key(years, min_year)
        years = collections.OrderedDict(sorted(years.items()))
        year_keys = list(years.keys())
        print("First year {}, last year {}".format(year_keys[0], year_keys[-1]))
        cnt = 0

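        # Walk through the years in ascending order, accumulating counts until 90% of the papers are covered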
        for key, value in years.items():
            cnt += value
            if cnt / len(papers) >= 0.9:
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model',
        type=str,
        default='aae',
        # All possible method should appear here
        choices=['cm', 'svd', 'ae', 'aae', 'mlp'],
        help="Specify the model to use [aae]")
    parser.add_argument('--epochs',
                        type=int,
                        default=20,
                        help="Specify the number of training epochs [20]")
    parser.add_argument('--hidden',
                        type=int,
                        default=200,
                        help="Number of hidden units [200]")
    parser.add_argument('--no-title',
                        action='store_false',
                        default=True,
                        dest='use_title',
                        help="Do not use the playlist titles")
    parser.add_argument('--max-items',
                        type=int,
                        default=75000,
                        help="Limit the max number of considered items")
    parser.add_argument(
        '--vocab-size',
        type=int,
        default=50000,
        help="Limit the max number of distinct condition words")
    parser.add_argument('-j',
                        '--jobs',
                        type=int,
                        default=4,
                        help="Number of jobs for data loading [4].")
    parser.add_argument('-o',
                        '--outfile',
                        default="submission.csv",
                        type=str,
                        help="Write submissions to this path")
    parser.add_argument('--use-embedding',
                        default=False,
                        action='store_true',
                        help="Use embedding (SGNS GoogleNews) [false]")
    parser.add_argument('--dont-aggregate',
                        action='store_false',
                        dest='aggregate',
                        default=True,
                        help="Do not aggregate track metadata as side info input")
    parser.add_argument('--debug',
                        action='store_true',
                        default=False,
                        help="Activate debug mode, run only on small sample")
    parser.add_argument(
        '-x',
        '--exclude',
        type=argparse.FileType('r'),
        default=None,
        help="Path to file with slice filenames to exclude for training")
    parser.add_argument(
        '--dev',
        type=str,
        default=None,
        help='Path to dev set, use in combination with (-x, --exclude)')
    parser.add_argument('--no-idf',
                        action='store_false',
                        default=True,
                        dest='use_idf',
                        help="Do **not** use idf re-weighting")
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help="Initial learning rate [0.001]")
    parser.add_argument('--code',
                        type=int,
                        default=100,
                        help="Code dimension [100]")
    args = parser.parse_args()

    # Either exclude and dev set, or no exclude and test set
    assert (args.dev is None) == (args.exclude is None)
    if args.dev is not None:
        print("Making submission for dev set:", args.dev)
        assert os.path.isfile(args.dev)

    # Dump args into submission file
    if os.path.exists(args.outfile) and \
            input("Path '{}' exists. Overwrite? [y/N]"
                  .format(args.outfile)) != 'y':
        exit(-1)

    with open(args.outfile, 'w') as out:
        print('#', args, file=out)

    if args.use_embedding:
        print("Loading embedding:", W2V_PATH)
        vectors = KeyedVectors.load_word2vec_format(W2V_PATH,
                                                    binary=W2V_IS_BINARY)
    else:
        vectors = None

    # Create the model as specified by command line args
    # Count-based never uses title
    # Decoding recommender always uses title

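    # TF-IDF settings for vectorizing the textual side information (titles and, if aggregated, track metadata)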
    tfidf_params = {'max_features': args.vocab_size, 'use_idf': args.use_idf}

    model = {
        'cm':
        Countbased(),
        'svd':
        SVDRecommender(use_title=args.use_title),
        'ae':
        AAERecommender(use_title=args.use_title,
                       adversarial=False,
                       n_hidden=args.hidden,
                       n_code=args.code,
                       n_epochs=args.epochs,
                       embedding=vectors,
                       lr=args.lr,
                       tfidf_params=tfidf_params),
        'aae':
        AAERecommender(
            use_title=args.use_title,
            adversarial=True,
            n_hidden=args.hidden,
            n_code=args.code,
            n_epochs=args.epochs,
            gen_lr=args.lr,
            reg_lr=args.lr,  # same gen and reg lrs
            embedding=vectors,
            tfidf_params=tfidf_params),
        'mlp':
        DecodingRecommender(n_epochs=args.epochs,
                            n_hidden=args.hidden,
                            embedding=vectors,
                            tfidf_params=tfidf_params)
    }[args.model]

    track_attrs = TRACK_INFO if args.aggregate else None

    if args.exclude is not None:
        # Dev set case, exclude dev set data
        exclude = [line.strip() for line in args.exclude]
    else:
        # Real submission case, do not exclude any training data
        exclude = None

    # = Training =
    print("Loading data from {} using {} jobs".format(DATA_PATH, args.jobs))
    playlists = playlists_from_slices(DATA_PATH,
                                      n_jobs=args.jobs,
                                      debug=args.debug,
                                      without=exclude)
    print("Unpacking playlists")
    train_set = Bags(*unpack_playlists(playlists, aggregate=track_attrs))

    print("Building vocabulary of {} most frequent items".format(
        args.max_items))
    vocab, __counts = train_set.build_vocab(max_features=args.max_items,
                                            apply=False)
    train_set = train_set.apply_vocab(vocab)
    print("Training set:", train_set, sep='\n')

    print("Training for {} epochs".format(args.epochs))
    try:
        model.train(train_set)
    except KeyboardInterrupt:
        print("Training interrupted by keyboard, pass.")

    # Not required anymore
    del train_set

    # = Predictions =
    if args.dev is not None:
        print("Loading and unpacking DEV set")
        data, index2playlist, side_info = unpack_playlists(
            load(args.dev), aggregate=track_attrs)
    else:
        print("Loading and unpacking test set")
        data, index2playlist, side_info = unpack_playlists(
            load(TEST_PATH), aggregate=track_attrs)
    test_set = Bags(data, index2playlist, side_info)
    # Apply same vocabulary as in training
    test_set = test_set.apply_vocab(vocab)
    print("Test set:", test_set, sep='\n')

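    # model.predict scores every vocabulary item for each test playlist (rows = playlists, columns = items)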
    pred = model.predict(test_set)
    if sp.issparse(pred):
        pred = pred.toarray()
    else:
        pred = np.asarray(pred)
    print("Scaling and removing non-missing items")
    pred = remove_non_missing(pred, test_set.tocsr(), copy=False)

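    # Invert the vocabulary (track URI -> column index) so prediction columns map back to track URIs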
    index2trackid = {v: k for k, v in vocab.items()}
    print("Making submission:", args.outfile)
    make_submission(pred, index2playlist, index2trackid, outfile=args.outfile)
    print("Success.")
    print("Make sure to verify the submission format via", VERIFY_SCRIPT)