def main():
    CONFIG = {
        'pub': ('/data21/lgalke/datasets/citations_pmc.tsv', 2011, 50),
        'eco': ('/data21/lgalke/datasets/econbiz62k.tsv', 2012, 1)
    }

    print("Loading pre-trained embedding", W2V_PATH)
    vectors = KeyedVectors.load_word2vec_format(W2V_PATH, binary=W2V_IS_BINARY)
    CONDITIONS = ConditionList([
        ('title', PretrainedWordEmbeddingCondition(vectors, dim=0))
    ])

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument('data', type=str, choices=['pub', 'eco'])
    args = PARSER.parse_args()
    DATA = CONFIG[args.data]

    logfile = '/data22/ivagliano/test-irgan/' + args.data + '-decoder.log'
    bags = Bags.load_tabcomma_format(DATA[0])
    c_year = DATA[1]

    evaluate = Evaluation(bags, year=c_year, logfile=logfile).setup(
        min_count=DATA[2], min_elements=2)
    user_num = evaluate.train_set.size()[0] + evaluate.test_set.size()[0]
    item_num = evaluate.train_set.size()[1]
    models = [IRGANRecommender(user_num, item_num, g_epochs=1, d_epochs=1,
                               n_epochs=1, conditions=CONDITIONS)]
    evaluate(models)
def main(year, min_count=None, outfile=None):
    """ Main function for training and evaluating AAE methods on IREON data """
    if CLEAN:
        print("Loading data from", DATA_PATH)
        papers = load(DATA_PATH)
        print("Cleaning data...")
        clean(CLEAN_DATA_PATH, papers)
        print("Clean data in {}".format(CLEAN_DATA_PATH))
        return

    print("Loading data from", CLEAN_DATA_PATH)
    papers = load(CLEAN_DATA_PATH)
    print("Unpacking IREON data...")
    bags_of_papers, ids, side_info = unpack_papers(papers)
    del papers
    bags = Bags(bags_of_papers, ids, side_info)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)

    evaluation = Evaluation(bags, year, logfile=outfile)
    evaluation.setup(min_count=min_count, min_elements=2)
    print("Loading pre-trained embedding", W2V_PATH)

    with open(outfile, 'a') as fh:
        print("~ Partial List", "~" * 42, file=fh)
    evaluation(BASELINES + RECOMMENDERS)

    with open(outfile, 'a') as fh:
        print("~ Partial List + Titles", "~" * 42, file=fh)
    evaluation(TITLE_ENHANCED)
def main(outfile=None, min_count=None, aggregate=None):
    """ Main function for training and evaluating AAE methods on MDP data """
    print("Loading data from", DATA_PATH)
    playlists = playlists_from_slices(DATA_PATH, n_jobs=4)
    print("Unpacking json data...")

    if aggregate is not None:
        # 'aggregate' acts as a flag here and is expanded to the full metadata list
        aggregate = ['artist_name', 'track_name', 'album_name']
        print("Using aggregated metadata {}".format(aggregate))
    else:
        print("Aggregate={}".format(aggregate))
        print("Using title only")

    bags_of_tracks, pids, side_info = unpack_playlists(playlists, aggregate)
    del playlists
    bags = Bags(bags_of_tracks, pids, side_info)
    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)

    train_set, dev_set, y_test = prepare_evaluation(bags, n_items=N_ITEMS,
                                                    min_count=min_count)

    log("Train set:", logfile=outfile)
    log(train_set, logfile=outfile)
    log("Dev set:", logfile=outfile)
    log(dev_set, logfile=outfile)

    # THE GOLD (put into sparse matrix)
    y_test = lists2sparse(y_test, dev_set.size(1)).tocsr(copy=False)
    # the known items in the test set, just to not recompute
    x_test = lists2sparse(dev_set.data, dev_set.size(1)).tocsr(copy=False)

    for model in MODELS:
        log('=' * 78, logfile=outfile)
        log(model, logfile=outfile)

        # Training
        model.train(train_set)

        # Prediction
        y_pred = model.predict(dev_set)

        # Sanity-fix #1: make sparse stuff dense, expect array
        if sp.issparse(y_pred):
            y_pred = y_pred.toarray()
        else:
            y_pred = np.asarray(y_pred)

        # Sanity-fix #2: remove predictions for already present items
        y_pred = remove_non_missing(y_pred, x_test, copy=False)

        # Evaluate metrics
        results = evaluate(y_test, y_pred, METRICS, batch_size=1000)

        log("-" * 78, logfile=outfile)
        for metric, stats in zip(METRICS, results):
            log("* {}: {} ({})".format(metric, *stats), logfile=outfile)

        log('=' * 78, logfile=outfile)
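# Minimal sketch of the "remove predictions for already present items" step
# above. This is NOT the repo's remove_non_missing implementation, only an
# illustration of the idea under that assumption: scores of items a user
# already has are forced to -inf so they can never be ranked into the
# recommendations.
import numpy as np
import scipy.sparse as sp


def mask_known_items(y_pred, x_known):
    """Set scores of already-known (user, item) pairs to -inf."""
    y_pred = np.array(y_pred, dtype=float, copy=True)
    rows, cols = x_known.nonzero()
    y_pred[rows, cols] = -np.inf
    return y_pred


# Two users, three items; user 0 already has item 1, user 1 has item 0
x_known = sp.csr_matrix(np.array([[0, 1, 0], [1, 0, 0]]))
scores = np.array([[0.2, 0.9, 0.4], [0.8, 0.1, 0.3]])
print(mask_known_items(scores, x_known))
# [[ 0.2 -inf  0.4]
#  [-inf  0.1  0.3]]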
def main(year, min_count=None, outfile=None, drop=1):
    """ Main function for training and evaluating AAE methods on IREON data """
    if CLEAN:
        print("Loading data from", DATA_PATH)
        papers = load(DATA_PATH)
        print("Cleaning data...")
        clean(CLEAN_DATA_PATH, papers)
        print("Clean data in {}".format(CLEAN_DATA_PATH))
        return

    print("Loading data from", CLEAN_DATA_PATH)
    papers = load(CLEAN_DATA_PATH)
    print("Unpacking IREON data...")
    # bags_of_papers, ids, side_info = unpack_papers(papers)
    bags_of_papers, ids, side_info = unpack_papers_conditions(papers)
    del papers
    bags = Bags(bags_of_papers, ids, side_info)

    if args.compute_mi:
        from aaerec.utils import compute_mutual_info
        print("[MI] Dataset: IREON (fiv)")
        print("[MI] min Count:", min_count)
        tmp = bags.build_vocab(min_count=min_count, max_features=None)
        mi = compute_mutual_info(tmp, conditions=None, include_labels=True,
                                 normalize=True)
        with open('mi.csv', 'a') as mifile:
            print('IREON', min_count, mi, sep=',', file=mifile)
        print("=" * 78)
        exit(0)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)

    evaluation = Evaluation(bags, year, logfile=outfile)
    evaluation.setup(min_count=min_count, min_elements=2, drop=drop)

    # Use only the partial citations/labels list (no additional metadata)
    with open(outfile, 'a') as fh:
        print("~ Partial List", "~" * 42, file=fh)
    evaluation(BASELINES + RECOMMENDERS)

    # Use additional metadata (as defined in CONDITIONS for all models but SVD,
    # which uses only titles)
    with open(outfile, 'a') as fh:
        print("~ Conditioned Models", "~" * 42, file=fh)
    evaluation(CONDITIONED_MODELS)
def main(year, dataset, min_count=None, outfile=None, drop=1):
    """ Main function for training and evaluating AAE methods on DBLP data """
    path = DATA_PATH + ("dblp-ref/" if dataset == "dblp" else "acm.txt")
    print("Loading data from", path)
    papers = papers_from_files(path, dataset, n_jobs=4)
    print("Unpacking {} data...".format(dataset))
    bags_of_papers, ids, side_info = unpack_papers(papers)
    del papers
    bags = Bags(bags_of_papers, ids, side_info)

    if args.compute_mi:
        from aaerec.utils import compute_mutual_info
        print("[MI] Dataset:", dataset)
        print("[MI] min Count:", min_count)
        tmp = bags.build_vocab(min_count=min_count, max_features=None)
        mi = compute_mutual_info(tmp, conditions=None, include_labels=True,
                                 normalize=True)
        with open('mi.csv', 'a') as mifile:
            print(dataset, min_count, mi, sep=',', file=mifile)
        print("=" * 78)
        exit(0)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)

    evaluation = Evaluation(bags, year, logfile=outfile)
    evaluation.setup(min_count=min_count, min_elements=2, drop=drop)

    # To evaluate the baselines and the recommenders without metadata
    # (or just the recommenders without metadata):
    # with open(outfile, 'a') as fh:
    #     print("~ Partial List", "~" * 42, file=fh)
    # evaluation(BASELINES + RECOMMENDERS)
    # evaluation(RECOMMENDERS, batch_size=1000)

    with open(outfile, 'a') as fh:
        print("~ Partial List + Titles + Author + Venue", "~" * 42, file=fh)
    # To evaluate SVD with titles:
    # evaluation(TITLE_ENHANCED)
    evaluation(CONDITIONED_MODELS, batch_size=1000)
def main(): """ Evaluates the VAE Recommender """ CONFIG = { 'pub': ('/data21/lgalke/datasets/citations_pmc.tsv', 2011, 50), 'eco': ('/data21/lgalke/datasets/econbiz62k.tsv', 2012, 1) } PARSER = argparse.ArgumentParser() PARSER.add_argument('data', type=str, choices=['pub', 'eco']) args = PARSER.parse_args() DATA = CONFIG[args.data] logfile = '/data22/ivagliano/test-vae/' + args.data + '-hyperparams-opt.log' bags = Bags.load_tabcomma_format(DATA[0]) c_year = DATA[1] evaluate = Evaluation(bags, year=c_year, logfile=logfile).setup(min_count=DATA[2], min_elements=2) print("Loading pre-trained embedding", W2V_PATH) vectors = KeyedVectors.load_word2vec_format(W2V_PATH, binary=W2V_IS_BINARY) params = { #'n_epochs': 10, 'batch_size': 100, 'optimizer': 'adam', # 'normalize_inputs': True, } CONDITIONS = ConditionList([('title', PretrainedWordEmbeddingCondition(vectors))]) # 100 hidden units, 200 epochs, bernoulli prior, normalized inputs -> 0.174 # activations = ['ReLU','SELU'] # lrs = [(0.001, 0.0005), (0.001, 0.001)] hcs = [(100, 50), (300, 100)] epochs = [50, 100, 200, 500] # dropouts = [(.2,.2), (.1,.1), (.1, .2), (.25, .25), (.3,.3)] # .2,.2 is best # priors = ['categorical'] # gauss is best # normal = [True, False] # bernoulli was good, letz see if categorical is better... No import itertools models = [ VAERecommender(conditions=CONDITIONS, **params, n_hidden=hc[0], n_code=hc[1], n_epochs=e) for hc, e in itertools.product(hcs, epochs) ] # models = [VAERecommender(conditions=CONDITIONS, **params)] evaluate(models)
def main(outfile=None, min_count=None):
    """ Main function for training and evaluating AAE methods on Reuters data """
    print("Loading data from", DATA_PATH)
    bags = Bags.load_tabcomma_format(DATA_PATH, unique=True)
    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)

    train_set, dev_set, y_test = prepare_evaluation(bags, min_count=min_count)

    log("Train set:", logfile=outfile)
    log(train_set, logfile=outfile)
    log("Dev set:", logfile=outfile)
    log(dev_set, logfile=outfile)

    # THE GOLD (put into sparse matrix)
    y_test = lists2sparse(y_test, dev_set.size(1)).tocsr(copy=False)
    # the known items in the test set, just to not recompute
    x_test = lists2sparse(dev_set.data, dev_set.size(1)).tocsr(copy=False)

    for model in MODELS:
        log('=' * 78, logfile=outfile)
        log(model, logfile=outfile)

        # Training
        model.train(train_set)

        # Prediction
        y_pred = model.predict(dev_set)

        # Sanity-fix #1: make sparse stuff dense, expect array
        if sp.issparse(y_pred):
            y_pred = y_pred.toarray()
        else:
            y_pred = np.asarray(y_pred)

        # Sanity-fix #2: remove predictions for already present items
        y_pred = remove_non_missing(y_pred, x_test, copy=False)

        # Evaluate metrics
        results = evaluate(y_test, y_pred, METRICS)

        log("-" * 78, logfile=outfile)
        for metric, stats in zip(METRICS, results):
            log("* {}: {} ({})".format(metric, *stats), logfile=outfile)

        log('=' * 78, logfile=outfile)
def main(year, dataset, min_count=None, outfile=None):
    """ Main function for training and evaluating AAE methods on DBLP data """
    path = DATA_PATH + ("dblp-ref/" if dataset == "dblp" else "acm.txt")
    print("Loading data from", path)
    papers = papers_from_files(path, dataset, n_jobs=4)
    print("Unpacking {} data...".format(dataset))
    bags_of_papers, ids, side_info = unpack_papers(papers)
    del papers
    bags = Bags(bags_of_papers, ids, side_info)
    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)

    evaluation = Evaluation(bags, year, logfile=outfile)
    evaluation.setup(min_count=min_count, min_elements=2)
    print("Loading pre-trained embedding", W2V_PATH)

    with open(outfile, 'a') as fh:
        print("~ Partial List", "~" * 42, file=fh)
    evaluation(BASELINES + RECOMMENDERS)

    with open(outfile, 'a') as fh:
        print("~ Partial List + Titles", "~" * 42, file=fh)
    evaluation(TITLE_ENHANCED)
W2V_PATH = "/data21/lgalke/vectors/GoogleNews-vectors-negative300.bin.gz"
W2V_IS_BINARY = True

PARSER = argparse.ArgumentParser()
PARSER.add_argument('dataset', type=str, help='path to dataset')
PARSER.add_argument('year', type=int, help='First year of the testing set.')
PARSER.add_argument('-m', '--min-count', type=int, help='Pruning parameter',
                    default=50)
PARSER.add_argument('-o', '--outfile', type=str, default=None)
ARGS = PARSER.parse_args()

DATASET = Bags.load_tabcomma_format(ARGS.dataset, unique=True)
EVAL = Evaluation(DATASET, ARGS.year, logfile=ARGS.outfile)
EVAL.setup(min_count=ARGS.min_count, min_elements=2)
print("Loading pre-trained embedding", W2V_PATH)
VECTORS = KeyedVectors.load_word2vec_format(W2V_PATH, binary=W2V_IS_BINARY)

BASELINES = [
    # RandomBaseline(),
    # MostPopular(),
    Countbased(),
    SVDRecommender(1000, use_title=False),
]

ae_params = {
    'n_code': 50,
def main(outfile=None, min_count=None, aggregate=None):
    """ Main function for training and evaluating AAE methods on MDP data """
    print("Loading data from", DATA_PATH)
    playlists = playlists_from_slices(DATA_PATH, n_jobs=4)
    print("Unpacking json data...")
    bags_of_tracks, pids, side_info = unpack_playlists_for_models_concatenated(
        playlists)
    del playlists
    bags = Bags(data=bags_of_tracks, owners=pids, owner_attributes=side_info)

    if args.compute_mi:
        from sklearn.metrics import mutual_info_score
        print("Computing MI")
        X = bags.build_vocab(min_count=args.min_count, max_features=None).tocsr()
        C = X.T @ X
        print("(Pairwise) mutual information:",
              mutual_info_score(None, None, contingency=C))
        # Exit in this case
        print("Bye.")
        exit(0)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)

    train_set, dev_set, y_test = prepare_evaluation(bags, n_items=N_ITEMS,
                                                    min_count=min_count)

    log("Train set:", logfile=outfile)
    log(train_set, logfile=outfile)
    log("Dev set:", logfile=outfile)
    log(dev_set, logfile=outfile)

    # THE GOLD (put into sparse matrix)
    y_test = lists2sparse(y_test, dev_set.size(1)).tocsr(copy=False)
    # the known items in the test set, just to not recompute
    x_test = lists2sparse(dev_set.data, dev_set.size(1)).tocsr(copy=False)

    for model in MODELS:
        log('=' * 78, logfile=outfile)
        log(model, logfile=outfile)
        log(model.model_params, logfile=outfile)

        # Training
        model.train(train_set)
        print("training finished")

        # Prediction
        y_pred = model.predict(dev_set)
        print("prediction finished")
        print("prediction sparse?:", sp.issparse(y_pred))

        # Sanity-fix #1: make sparse stuff dense, expect array
        if sp.issparse(y_pred):
            y_pred = y_pred.toarray()
        else:
            y_pred = np.asarray(y_pred)

        # Sanity-fix #2: remove predictions for already present items
        print("remove non-missing:")
        y_pred = remove_non_missing(y_pred, x_test, copy=False)

        # Evaluate metrics
        print("evaluate:")
        results = evaluate(y_test, y_pred, METRICS, batch_size=500)

        print("metrics:")
        log("-" * 78, logfile=outfile)
        for metric, stats in zip(METRICS, results):
            log("* {}: {} ({})".format(metric, *stats), logfile=outfile)

        log('=' * 78, logfile=outfile)
elif dataset == "swp": text = "labels" else: text = "tracks" print("Generating {} distribution".format(text)) citations = paper_by_n_citations(citations) print("Unpacking {} data...".format(dataset)) if dataset == "acm" or dataset == "dblp": bags_of_papers, ids, side_info = unpack_papers(papers) elif dataset == "mpd": # not bags_of_papers but bugs_of_tracks bags_of_papers, ids, side_info = unpack_playlists(papers) else: bags_of_papers, ids, side_info = unpack_papers_fiv(papers) bags = Bags(bags_of_papers, ids, side_info) else: print("Loading {}".format(path)) df = pd.read_csv(path, sep="\t", dtype=str, error_bad_lines=False) # replace nan with empty string df = df.replace(np.nan, "", regex=True) citations = generate_citations(df) print("Generating {} distribution".format("citations" if dataset == "pubmed" else "occurrences")) citations = paper_by_n_citations(citations) set_cnts = set_count(df) print("Unpacking {} data...".format(dataset))
import argparse

import numpy as np
from sklearn.metrics import mutual_info_score

from aaerec.datasets import Bags
from aaerec.condition import ConditionList, CountCondition
from aaerec.utils import compute_mutual_info

PARSER = argparse.ArgumentParser()
PARSER.add_argument('dataset', type=str, help='path to dataset')
PARSER.add_argument('-m', '--min-count', type=int, help='Pruning parameter',
                    default=None)
PARSER.add_argument('-M', '--max-features', type=int, help='Max features',
                    default=None)
ARGS = PARSER.parse_args()

# MI_CONDITIONS = ConditionList([('title', CountCondition(max_features=100000))])
MI_CONDITIONS = None

print("Computing Mutual Info with args")
print(ARGS)

# With no metadata or just titles
BAGS = Bags.load_tabcomma_format(ARGS.dataset, unique=True)\
           .build_vocab(min_count=ARGS.min_count, max_features=ARGS.max_features)

mi = compute_mutual_info(BAGS, MI_CONDITIONS, include_labels=True,
                         normalize=True)
with open('mi.csv', 'a') as mifile:
    print('CITREC', ARGS.min_count, mi, sep=',', file=mifile)
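# Standalone illustration of the contingency-based MI call used in this repo
# (the MPD script earlier uses the same pattern on an item co-occurrence
# matrix): scikit-learn's mutual_info_score accepts a precomputed table of
# joint counts and then ignores the label arguments. The toy table below is
# made up for the example.
import numpy as np
from sklearn.metrics import mutual_info_score

C = np.array([[10, 2],
              [3, 15]])  # joint counts of two binary variables
print("MI (nats):", mutual_info_score(None, None, contingency=C))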
def main(outfile=None, min_count=None, drop=1):
    """ Main function for training and evaluating AAE methods on Reuters data """
    print("Loading data from", DATA_PATH)
    bags = Bags.load_tabcomma_format(DATA_PATH, unique=True)

    if args.compute_mi:
        from aaerec.utils import compute_mutual_info
        print("[MI] Dataset: Reuters")
        print("[MI] min Count:", min_count)
        tmp = bags.build_vocab(min_count=min_count, max_features=None)
        mi = compute_mutual_info(tmp, conditions=None, include_labels=True,
                                 normalize=True)
        with open('mi.csv', 'a') as mifile:
            print('Reuters', min_count, mi, sep=',', file=mifile)
        print("=" * 78)
        exit(0)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)

    train_set, dev_set, y_test = prepare_evaluation(bags, min_count=min_count,
                                                    drop=drop)

    log("Train set:", logfile=outfile)
    log(train_set, logfile=outfile)
    log("Dev set:", logfile=outfile)
    log(dev_set, logfile=outfile)

    # THE GOLD (put into sparse matrix)
    y_test = lists2sparse(y_test, dev_set.size(1)).tocsr(copy=False)
    # the known items in the test set, just to not recompute
    x_test = lists2sparse(dev_set.data, dev_set.size(1)).tocsr(copy=False)

    for model in MODELS:
        log('=' * 78, logfile=outfile)
        log(model, logfile=outfile)

        # Training
        model.train(train_set)

        # Prediction
        y_pred = model.predict(dev_set)

        # Sanity-fix #1: make sparse stuff dense, expect array
        if sp.issparse(y_pred):
            y_pred = y_pred.toarray()
        else:
            y_pred = np.asarray(y_pred)

        # Sanity-fix #2: remove predictions for already present items
        y_pred = remove_non_missing(y_pred, x_test, copy=False)

        # Evaluate metrics
        results = evaluate(y_test, y_pred, METRICS)

        log("-" * 78, logfile=outfile)
        for metric, stats in zip(METRICS, results):
            log("* {}: {} ({})".format(metric, *stats), logfile=outfile)

        log('=' * 78, logfile=outfile)
import numpy as np

from aaerec.datasets import Bags

# path = '../Data/Economics/econbiz62k.tsv'
path = '../Data/PMC/citations_pmc.tsv'

bags = Bags.load_tabcomma_format(path, unique=True)
bags = bags.build_vocab(apply=True)
csr = bags.tocsr()

print("N ratings:", csr.sum())
column_sums = csr.sum(0).flatten()
row_sums = csr.sum(1).flatten()
print(column_sums.shape)
print(row_sums.shape)

FMT = "N={}, Min={}, Max={}, Median={}, Mean={}, Std={}"


def compute_stats(A):
    return (A.shape[1], A.min(), A.max(), np.median(A, axis=1)[0, 0],
            A.mean(), A.std())


print("Items per document")
print(FMT.format(*compute_stats(row_sums)))
print("Documents per item")
print(FMT.format(*compute_stats(column_sums)))
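# Quick self-check of compute_stats (illustrative, not part of the original
# script): scipy sparse sums return numpy matrix objects of shape (1, N),
# which is why the median above is indexed with [0, 0].
import scipy.sparse as sp

toy = sp.csr_matrix(np.array([[1, 0, 1],
                              [0, 1, 1]]))
toy_cols = toy.sum(0)  # matrix([[1, 2, 2]])
print(FMT.format(*compute_stats(toy_cols)))  # N=3, Min=1, Max=2, ...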
else: text = "tracks" print("Generating {} distribution".format(text)) citations = paper_by_n_citations(citations) print("Unpacking {} data...".format(dataset)) if dataset == "acm" or dataset == "dblp": bags_of_papers, ids, side_info = unpack_papers(papers) elif dataset == "mpd": # not bags_of_papers but bugs_of_tracks bags_of_papers, ids, side_info = unpack_playlists(papers) elif dataset == "swp": bags_of_papers, ids, side_info = unpack_papers_fiv(papers) else: bags_of_papers, ids, side_info = unpack_papers_econis(papers) bags = Bags(bags_of_papers, ids, side_info) else: print("Loading {}".format(path)) df = pd.read_csv(path, sep="\t", dtype=str, error_bad_lines=False) # replace nan with empty string df = df.replace(np.nan, "", regex=True) citations = generate_citations(df, dataset) print("Generating {} distribution".format("citations" if dataset == "pubmed" else "occurrences")) citations = paper_by_n_citations(citations) set_cnts = set_count(df, dataset) print("Unpacking {} data...".format(dataset))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default='aae',
                        # All possible methods should appear here
                        choices=['cm', 'svd', 'ae', 'aae', 'mlp'],
                        help="Specify the model to use [aae]")
    parser.add_argument('--epochs', type=int, default=20,
                        help="Specify the number of training epochs [20]")
    parser.add_argument('--hidden', type=int, default=200,
                        help="Number of hidden units [200]")
    parser.add_argument('--no-title', action='store_false', default=True,
                        dest='use_title',
                        help="Do not use the playlist titles")
    parser.add_argument('--max-items', type=int, default=75000,
                        help="Limit the max number of considered items")
    parser.add_argument('--vocab-size', type=int, default=50000,
                        help="Limit the max number of distinct condition words")
    parser.add_argument('-j', '--jobs', type=int, default=4,
                        help="Number of jobs for data loading [4].")
    parser.add_argument('-o', '--outfile', default="submission.csv",
                        type=str, help="Write submissions to this path")
    parser.add_argument('--use-embedding', default=False, action='store_true',
                        help="Use embedding (SGNS GoogleNews) [false]")
    parser.add_argument('--dont-aggregate', action='store_false',
                        dest='aggregate', default=True,
                        help="Aggregate track metadata as side info input")
    parser.add_argument('--debug', action='store_true', default=False,
                        help="Activate debug mode, run only on small sample")
    parser.add_argument('-x', '--exclude', type=argparse.FileType('r'),
                        default=None,
                        help="Path to file with slice filenames to exclude for training")
    parser.add_argument('--dev', type=str, default=None,
                        help='Path to dev set, use in combination with (-x, --exclude)')
    parser.add_argument('--no-idf', action='store_false', default=True,
                        dest='use_idf', help="Do **not** use idf re-weighting")
    parser.add_argument('--lr', type=float, default=0.001,
                        help="Initial learning rate [0.001]")
    parser.add_argument('--code', type=int, default=100,
                        help="Code dimension [100]")
    args = parser.parse_args()

    # Either exclude and dev set, or no exclude and test set
    assert (args.dev is None) == (args.exclude is None)
    if args.dev is not None:
        print("Making submission for dev set:", args.dev)
        assert os.path.isfile(args.dev)

    # Dump args into submission file
    if os.path.exists(args.outfile) and \
            input("Path '{}' exists. Overwrite? [y/N] "
                  .format(args.outfile)) != 'y':
        exit(-1)

    with open(args.outfile, 'w') as out:
        print('#', args, file=out)

    if args.use_embedding:
        print("Loading embedding:", W2V_PATH)
        vectors = KeyedVectors.load_word2vec_format(W2V_PATH,
                                                    binary=W2V_IS_BINARY)
    else:
        vectors = None

    # Create the model as specified by command line args
    # Count-based never uses title
    # Decoding recommender always uses title
    tfidf_params = {'max_features': args.vocab_size, 'use_idf': args.use_idf}
    model = {
        'cm': Countbased(),
        'svd': SVDRecommender(use_title=args.use_title),
        'ae': AAERecommender(use_title=args.use_title,
                             adversarial=False,
                             n_hidden=args.hidden,
                             n_code=args.code,
                             n_epochs=args.epochs,
                             embedding=vectors,
                             lr=args.lr,
                             tfidf_params=tfidf_params),
        'aae': AAERecommender(use_title=args.use_title,
                              adversarial=True,
                              n_hidden=args.hidden,
                              n_code=args.code,
                              n_epochs=args.epochs,
                              gen_lr=args.lr,
                              reg_lr=args.lr,  # same gen and reg lrs
                              embedding=vectors,
                              tfidf_params=tfidf_params),
        'mlp': DecodingRecommender(n_epochs=args.epochs,
                                   n_hidden=args.hidden,
                                   embedding=vectors,
                                   tfidf_params=tfidf_params)
    }[args.model]

    track_attrs = TRACK_INFO if args.aggregate else None

    if args.exclude is not None:
        # Dev set case, exclude dev set data
        exclude = [line.strip() for line in args.exclude]
    else:
        # Real submission case, do not exclude any training data
        exclude = None

    # = Training =
    print("Loading data from {} using {} jobs".format(DATA_PATH, args.jobs))
    playlists = playlists_from_slices(DATA_PATH, n_jobs=args.jobs,
                                      debug=args.debug, without=exclude)
    print("Unpacking playlists")
    train_set = Bags(*unpack_playlists(playlists, aggregate=track_attrs))

    print("Building vocabulary of {} most frequent items".format(
        args.max_items))
    vocab, __counts = train_set.build_vocab(max_features=args.max_items,
                                            apply=False)
    train_set = train_set.apply_vocab(vocab)
    print("Training set:", train_set, sep='\n')

    print("Training for {} epochs".format(args.epochs))
    try:
        model.train(train_set)
    except KeyboardInterrupt:
        print("Training interrupted by keyboard, pass.")

    # Not required anymore
    del train_set

    # = Predictions =
    if args.dev is not None:
        print("Loading and unpacking DEV set")
        data, index2playlist, side_info = unpack_playlists(
            load(args.dev), aggregate=track_attrs)
    else:
        print("Loading and unpacking test set")
        data, index2playlist, side_info = unpack_playlists(
            load(TEST_PATH), aggregate=track_attrs)
    test_set = Bags(data, index2playlist, side_info)

    # Apply same vocabulary as in training
    test_set = test_set.apply_vocab(vocab)
    print("Test set:", test_set, sep='\n')

    pred = model.predict(test_set)
    if sp.issparse(pred):
        pred = pred.toarray()
    else:
        pred = np.asarray(pred)

    print("Scaling and removing non-missing items")
    pred = remove_non_missing(pred, test_set.tocsr(), copy=False)

    index2trackid = {v: k for k, v in vocab.items()}
    print("Making submission:", args.outfile)
    make_submission(pred, index2playlist, index2trackid, outfile=args.outfile)
    print("Success.")
    print("Make sure to verify the submission format via", VERIFY_SCRIPT)
def main(year, dataset, min_count=None, outfile=None, drop=1,
         baselines=False, autoencoders=False, conditioned_autoencoders=False,
         all_metadata=True):
    """ Main function for training and evaluating AAE methods on DBLP data """
    assert baselines or autoencoders or conditioned_autoencoders, \
        "Please specify what to run"

    if all_metadata:
        # V2 - all metadata
        CONDITIONS = ConditionList([
            ('title', PretrainedWordEmbeddingCondition(VECTORS)),
            ('venue', PretrainedWordEmbeddingCondition(VECTORS)),
            ('author', CategoricalCondition(embedding_dim=32, reduce="sum",
                                            # vocab_size=0.01,
                                            sparse=False,
                                            embedding_on_gpu=True))
        ])
    else:
        # V1 - only title metadata
        CONDITIONS = ConditionList([
            ('title', PretrainedWordEmbeddingCondition(VECTORS))
        ])
    # CONDITIONS defined

    ALL_MODELS = []

    if baselines:
        # Models without metadata
        BASELINES = [
            # RandomBaseline(),
            # MostPopular(),
            Countbased(),
            SVDRecommender(1000, use_title=False)
        ]
        ALL_MODELS += BASELINES
        if not all_metadata:
            # SVD can use only titles, not generic conditions
            ALL_MODELS += [SVDRecommender(1000, use_title=True)]

    if autoencoders:
        AUTOENCODERS = [
            AAERecommender(adversarial=False, conditions=None, lr=0.001,
                           **AE_PARAMS),
            AAERecommender(adversarial=True, conditions=None, gen_lr=0.001,
                           reg_lr=0.001, **AE_PARAMS),
            VAERecommender(conditions=None, **AE_PARAMS),
            DAERecommender(conditions=None, **AE_PARAMS)
        ]
        ALL_MODELS += AUTOENCODERS

    if conditioned_autoencoders:
        # Models with metadata (metadata used as set in CONDITIONS above)
        CONDITIONED_AUTOENCODERS = [
            AAERecommender(adversarial=False, conditions=CONDITIONS, lr=0.001,
                           **AE_PARAMS),
            AAERecommender(adversarial=True, conditions=CONDITIONS,
                           gen_lr=0.001, reg_lr=0.001, **AE_PARAMS),
            DecodingRecommender(CONDITIONS, n_epochs=100, batch_size=1000,
                                optimizer='adam', n_hidden=100, lr=0.001,
                                verbose=True),
            VAERecommender(conditions=CONDITIONS, **AE_PARAMS),
            DAERecommender(conditions=CONDITIONS, **AE_PARAMS)
        ]
        ALL_MODELS += CONDITIONED_AUTOENCODERS

    print("Finished preparing models:", *ALL_MODELS, sep='\n\t')

    path = DATA_PATH + ("dblp-ref/" if dataset == "dblp" else "acm.txt")
    print("Loading data from", path)
    papers = papers_from_files(path, dataset, n_jobs=4)
    print("Unpacking {} data...".format(dataset))
    bags_of_papers, ids, side_info = unpack_papers(papers)
    del papers
    bags = Bags(bags_of_papers, ids, side_info)

    if args.compute_mi:
        from aaerec.utils import compute_mutual_info
        print("[MI] Dataset:", dataset)
        print("[MI] min Count:", min_count)
        tmp = bags.build_vocab(min_count=min_count, max_features=None)
        mi = compute_mutual_info(tmp, conditions=None, include_labels=True,
                                 normalize=True)
        with open('mi.csv', 'a') as mifile:
            print(dataset, min_count, mi, sep=',', file=mifile)
        print("=" * 78)
        exit(0)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)

    evaluation = Evaluation(bags, year, logfile=outfile)
    evaluation.setup(min_count=min_count, min_elements=2, drop=drop)

    with open(outfile, 'a') as fh:
        print("~ Partial List + Titles + Author + Venue", "~" * 42, file=fh)
    evaluation(ALL_MODELS, batch_size=1000)