def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('exclude', type=argparse.FileType('r'),
                        help="Path to exclude file, determines ground truth.")
    parser.add_argument('submission', type=str,
                        help="Path to dev submission file")
    parser.add_argument('-v', '--verbose', default=0, type=int)
    args = parser.parse_args()

    dev_slices = [line.strip() for line in args.exclude]
    if args.verbose:
        print("Loading ground truth from", dev_slices)
    ground_truth = playlists_from_slices(DATA_PATH, only=dev_slices,
                                         verbose=args.verbose)
    # Turn the raw playlist JSON into a dict mapping pid -> list of track uris
    ground_truth = {p['pid']: [t['track_uri'] for t in p['tracks']]
                    for p in ground_truth}
    predictions = load_submission(args.submission)

    # Verify that the pids match
    pids = set(ground_truth.keys())
    pids_pred = set(predictions.keys())
    if not pids_pred:
        print(args.submission, 'is empty.')
        exit(1)
    if args.verbose:
        print(len(pids), "pids in ground truth")
        print(len(pids_pred), "pids in predictions")
        print(len(set.intersection(pids, pids_pred)), "pids in intersection")

    # Super strict: the ground-truth and predicted pid sets are identical
    assert len(pids ^ pids_pred) == 0
    # Less strict: all predicted pids also occur in the gold standard
    assert len(pids_pred - pids) == 0

    summary = aggregate_metrics(ground_truth, predictions, 500, pids)
    print(summary)
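# Illustrative sketch (not part of the original script): how the comprehension
# above maps each playlist dict to its list of track URIs. The pid and track
# URIs below are made-up example values.
def _ground_truth_example():
    playlists = [{'pid': 1000,
                  'tracks': [{'track_uri': 'spotify:track:abc'},
                             {'track_uri': 'spotify:track:def'}]}]
    ground_truth = {p['pid']: [t['track_uri'] for t in p['tracks']]
                    for p in playlists}
    assert ground_truth == {1000: ['spotify:track:abc', 'spotify:track:def']}
    return ground_truth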
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('slices', type=argparse.FileType('r'),
                        help="Path to file with one slice filename per line")
    parser.add_argument('-o', '--output', type=str, default=None,
                        help="File to put output")
    args = parser.parse_args()

    if args.output is None:
        print("No output file specified, performing a dry run")
    elif os.path.exists(args.output) and \
            input("Path '{}' exists. Overwrite? [y/N]"
                  .format(args.output)) != 'y':
        exit(-1)

    # Strip newlines from the slice file names
    slices = [s.strip() for s in args.slices]
    print("Creating dev set from slices:", slices)
    playlists = playlists_from_slices(DATA_PATH, only=slices)
    dev_playlists = corrupt_playlists(playlists)
    dev_set = {
        'date': str(date.today()),
        'version': 'dev set created from: ' + str(slices),
        'playlists': dev_playlists
    }
    # On a dry run (no output path given), do not write anything
    if args.output is not None:
        with open(args.output, 'w') as fhandle:
            json.dump(dev_set, fhandle)
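# Hypothetical sketch (not part of the original script): the structure of the
# dev set file written above. The date and slice filename are made-up example
# values; 'playlists' holds whatever corrupt_playlists() returns.
def _dev_set_example():
    return {
        'date': '2018-06-30',
        'version': "dev set created from: ['mpd.slice.0-999.json']",
        'playlists': []
    }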
    return p


path = set_path(dataset)
if dataset in ("dblp", "acm", "swp", "mpd"):
    if dataset not in ("swp", "mpd"):
        print("Loading {} dataset".format(dataset.upper()))
        papers = papers_from_files(path, dataset, n_jobs=1)
    elif dataset == "swp":
        print("Loading SWP dataset")
        papers = load(path)
    else:
        print("Loading MPD dataset")
        # Actually not papers but playlists
        papers = playlists_from_slices(path, n_jobs=4)

    years, citations, set_cnts = generate_years_citations_set_cnts(papers,
                                                                   dataset)
    if dataset != "mpd":
        # Only keep papers from min_year onwards
        years = from_to_key(years, min_year)
    years = collections.OrderedDict(sorted(years.items()))
    keys = list(years.keys())
    print("First year {}, last year {}".format(keys[0], keys[-1]))
    cnt = 0
    for key, value in years.items():
        cnt += value
        if cnt / len(papers) >= 0.9:
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default='aae',
                        # All possible methods should appear here
                        choices=['cm', 'svd', 'ae', 'aae', 'mlp'],
                        help="Specify the model to use [aae]")
    parser.add_argument('--epochs', type=int, default=20,
                        help="Specify the number of training epochs [20]")
    parser.add_argument('--hidden', type=int, default=200,
                        help="Number of hidden units [200]")
    parser.add_argument('--no-title', action='store_false', default=True,
                        dest='use_title',
                        help="Do not use the playlist titles")
    parser.add_argument('--max-items', type=int, default=75000,
                        help="Limit the max number of considered items")
    parser.add_argument('--vocab-size', type=int, default=50000,
                        help="Limit the max number of distinct condition words")
    parser.add_argument('-j', '--jobs', type=int, default=4,
                        help="Number of jobs for data loading [4].")
    parser.add_argument('-o', '--outfile', default="submission.csv", type=str,
                        help="Write submissions to this path")
    parser.add_argument('--use-embedding', default=False, action='store_true',
                        help="Use embedding (SGNS GoogleNews) [false]")
    parser.add_argument('--dont-aggregate', action='store_false',
                        dest='aggregate', default=True,
                        help="Do not aggregate track metadata as side info input")
    parser.add_argument('--debug', action='store_true', default=False,
                        help="Activate debug mode, run only on a small sample")
    parser.add_argument('-x', '--exclude', type=argparse.FileType('r'),
                        default=None,
                        help="Path to file with slice filenames to exclude for training")
    parser.add_argument('--dev', type=str, default=None,
                        help="Path to dev set, use in combination with (-x, --exclude)")
    parser.add_argument('--no-idf', action='store_false', default=True,
                        dest='use_idf',
                        help="Do **not** use idf re-weighting")
    parser.add_argument('--lr', type=float, default=0.001,
                        help="Initial learning rate [0.001]")
    parser.add_argument('--code', type=int, default=100,
                        help="Code dimension [100]")
    args = parser.parse_args()

    # Either exclude and dev set, or neither (real submission on the test set)
    assert (args.dev is None) == (args.exclude is None)
    if args.dev is not None:
        print("Making submission for dev set:", args.dev)
        assert os.path.isfile(args.dev)

    # Dump args into the submission file
    if os.path.exists(args.outfile) and \
            input("Path '{}' exists. Overwrite? [y/N]"
                  .format(args.outfile)) != 'y':
        exit(-1)
    with open(args.outfile, 'w') as out:
        print('#', args, file=out)

    if args.use_embedding:
        print("Loading embedding:", W2V_PATH)
        vectors = KeyedVectors.load_word2vec_format(W2V_PATH,
                                                    binary=W2V_IS_BINARY)
    else:
        vectors = None

    # Create the model as specified by command line args.
    # The count-based model never uses the title,
    # the decoding recommender always uses it.
    tfidf_params = {'max_features': args.vocab_size, 'use_idf': args.use_idf}
    model = {
        'cm': Countbased(),
        'svd': SVDRecommender(use_title=args.use_title),
        'ae': AAERecommender(use_title=args.use_title,
                             adversarial=False,
                             n_hidden=args.hidden,
                             n_code=args.code,
                             n_epochs=args.epochs,
                             embedding=vectors,
                             lr=args.lr,
                             tfidf_params=tfidf_params),
        'aae': AAERecommender(use_title=args.use_title,
                              adversarial=True,
                              n_hidden=args.hidden,
                              n_code=args.code,
                              n_epochs=args.epochs,
                              gen_lr=args.lr,
                              reg_lr=args.lr,  # same lr for generator and regularizer
                              embedding=vectors,
                              tfidf_params=tfidf_params),
        'mlp': DecodingRecommender(n_epochs=args.epochs,
                                   n_hidden=args.hidden,
                                   embedding=vectors,
                                   tfidf_params=tfidf_params)
    }[args.model]

    track_attrs = TRACK_INFO if args.aggregate else None

    if args.exclude is not None:
        # Dev set case: exclude the dev set slices from training
        exclude = [line.strip() for line in args.exclude]
    else:
        # Real submission case: do not exclude any training data
        exclude = None

    # = Training =
    print("Loading data from {} using {} jobs".format(DATA_PATH, args.jobs))
    playlists = playlists_from_slices(DATA_PATH, n_jobs=args.jobs,
                                      debug=args.debug, without=exclude)
    print("Unpacking playlists")
    train_set = Bags(*unpack_playlists(playlists, aggregate=track_attrs))
    print("Building vocabulary of {} most frequent items".format(args.max_items))
    vocab, __counts = train_set.build_vocab(max_features=args.max_items,
                                            apply=False)
    train_set = train_set.apply_vocab(vocab)
    print("Training set:", train_set, sep='\n')
    print("Training for {} epochs".format(args.epochs))
    try:
        model.train(train_set)
    except KeyboardInterrupt:
        print("Training interrupted by keyboard, pass.")
    # Not required anymore
    del train_set

    # = Predictions =
    if args.dev is not None:
        print("Loading and unpacking DEV set")
        data, index2playlist, side_info = unpack_playlists(
            load(args.dev), aggregate=track_attrs)
    else:
        print("Loading and unpacking test set")
        data, index2playlist, side_info = unpack_playlists(
            load(TEST_PATH), aggregate=track_attrs)
    test_set = Bags(data, index2playlist, side_info)
    # Apply the same vocabulary as in training
    test_set = test_set.apply_vocab(vocab)
    print("Test set:", test_set, sep='\n')

    pred = model.predict(test_set)
    if sp.issparse(pred):
        pred = pred.toarray()
    else:
        pred = np.asarray(pred)
    print("Scaling and removing non-missing items")
    pred = remove_non_missing(pred, test_set.tocsr(), copy=False)

    index2trackid = {v: k for k, v in vocab.items()}
    print("Making submission:", args.outfile)
    make_submission(pred, index2playlist, index2trackid, outfile=args.outfile)
    print("Success.")
    print("Make sure to verify the submission format via", VERIFY_SCRIPT)