Example #1
def main():
    """ Evaluates the IRGAN Recommender """
    CONFIG = {
        'pub': ('/data21/lgalke/datasets/citations_pmc.tsv', 2011, 50),
        'eco': ('/data21/lgalke/datasets/econbiz62k.tsv', 2012, 1)
    }

    print("Loading pre-trained embedding", W2V_PATH)
    vectors = KeyedVectors.load_word2vec_format(W2V_PATH, binary=W2V_IS_BINARY)

    CONDITIONS = ConditionList([
        ('title', PretrainedWordEmbeddingCondition(vectors, dim=0))
    ])

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument('data', type=str, choices=['pub', 'eco'])
    args = PARSER.parse_args()
    DATA = CONFIG[args.data]
    logfile = '/data22/ivagliano/test-irgan/' + args.data + '-decoder.log'
    bags = Bags.load_tabcomma_format(DATA[0])
    c_year = DATA[1]

    evaluate = Evaluation(bags,
                          year=c_year,
                          logfile=logfile).setup(min_count=DATA[2],
                                                 min_elements=2)
    user_num = evaluate.train_set.size()[0] + evaluate.test_set.size()[0]
    item_num = evaluate.train_set.size()[1]
    models = [IRGANRecommender(
        user_num, item_num,
        g_epochs=1, d_epochs=1, n_epochs=1,
        conditions=CONDITIONS)]
    evaluate(models)
Example #2
def test_full_pipeline():
    """ This test shows how to use condition (-list) in a complete pipeline """
    import gensim
    import torch
    # ConditionList, CategoricalCondition and PretrainedWordEmbeddingCondition
    # are assumed to come from the package under test (e.g. aaerec.condition)
    data = {
        'titles': [
            "the quick brown fox jumps over the lazy dog",
            "the cat sits on the mat", "if it fits, I sits"
        ],
        'authors': [
            "Iacopo",
            "Gunnar",
            "Lukas",
        ]
    }

    emb_dim = 10
    model = gensim.models.word2vec.Word2Vec(
        [s.split() for s in data['titles']],
        min_count=1,
        window=2,
        size=emb_dim)  # `size` is the pre-4.0 gensim keyword; gensim >= 4.0 uses vector_size
    cond1 = PretrainedWordEmbeddingCondition(model.wv, use_cuda=False)
    cond2 = CategoricalCondition(emb_dim, vocab_size=3, use_cuda=False)

    clist = ConditionList([('titles', cond1), ('authors', cond2)])

    # Apply fit_transform on all conditions, store results
    prepped_inputs = clist.fit_transform([data[k] for k in clist.keys()])

    # Let's assume the encoder produced these codes originally
    codes = torch.rand(3, 10)

    criterion = torch.nn.MSELoss()

    decoder = torch.nn.Linear(
        codes.size(1) + clist.size_increment(), codes.size(1))
    optimizer = torch.optim.Adam(decoder.parameters())

    losses = []
    for __epoch in range(10):
        for start in range(len(codes)):
            end = start + 1
            code_batch = codes[start:end]
            # Batch all condition inputs
            cinputs_batch = [inp[start:end] for inp in prepped_inputs]
            clist.zero_grad()
            optimizer.zero_grad()
            conditioned_code = clist.encode_impose(code_batch, cinputs_batch)
            # assert dim is predictable for decoder
            assert conditioned_code.size(1) - code_batch.size(1)\
                == clist.size_increment()
            out = decoder(conditioned_code)
            # Reconstruction loss
            loss = criterion(out, code_batch)
            loss.backward()
            losses.append(loss.item())
            clist.step()
            optimizer.step()
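
    # A minimal sanity check one might append at the end of this test (a sketch,
    # not part of the original snippet): the decoder sees the original code
    # concatenated with the imposed conditions, so the reconstruction loss
    # recorded in `losses` should end lower than it started after the ten epochs.
    assert losses[-1] < losses[0]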
Example #3
def test_word_emb_condition():
    import gensim
    import torch
    # PretrainedWordEmbeddingCondition is assumed to come from the package
    # under test (e.g. aaerec.condition)
    sentences = [
        "the quick brown fox jumps over the lazy dog",
        "the cat sits on the mat", "if it fits, I sits"
    ]
    emb_dim = 10
    model = gensim.models.word2vec.Word2Vec(
        [s.split() for s in sentences],
        min_count=1,
        window=2,
        size=emb_dim)  # `size` is the pre-4.0 gensim keyword; gensim >= 4.0 uses vector_size
    condition = PretrainedWordEmbeddingCondition(model.wv, use_cuda=False)
    sentences_trf = condition.fit_transform(sentences)

    code = torch.rand(len(sentences), 5)
    conditioned_code = condition.encode_impose(code, sentences_trf)

    assert conditioned_code.size(1) == \
        code.size(1) + condition.size_increment()
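
For symmetry, a similar minimal check could be written for CategoricalCondition. The following is a hypothetical sketch that reuses the constructor arguments shown in Example #2 (embedding dim 10, vocab_size=3, use_cuda=False); it is not a test from the original code base.

def test_categorical_condition():
    # Hypothetical counterpart to the word-embedding test above
    authors = ["Iacopo", "Gunnar", "Lukas"]
    condition = CategoricalCondition(10, vocab_size=3, use_cuda=False)
    authors_trf = condition.fit_transform(authors)

    code = torch.rand(len(authors), 5)
    conditioned_code = condition.encode_impose(code, authors_trf)

    # Imposing the condition should grow the code dimension by size_increment()
    assert conditioned_code.size(1) == \
        code.size(1) + condition.size_increment()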
Example #4
def main():
    """ Evaluates the VAE Recommender """
    CONFIG = {
        'pub': ('/data21/lgalke/datasets/citations_pmc.tsv', 2011, 50),
        'eco': ('/data21/lgalke/datasets/econbiz62k.tsv', 2012, 1)
    }

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument('data', type=str, choices=['pub', 'eco'])
    args = PARSER.parse_args()
    DATA = CONFIG[args.data]
    logfile = '/data22/ivagliano/test-vae/' + args.data + '-hyperparams-opt.log'
    bags = Bags.load_tabcomma_format(DATA[0])
    c_year = DATA[1]

    evaluate = Evaluation(bags, year=c_year,
                          logfile=logfile).setup(min_count=DATA[2],
                                                 min_elements=2)
    print("Loading pre-trained embedding", W2V_PATH)
    vectors = KeyedVectors.load_word2vec_format(W2V_PATH, binary=W2V_IS_BINARY)

    params = {
        #'n_epochs': 10,
        'batch_size': 100,
        'optimizer': 'adam',
        # 'normalize_inputs': True,
    }

    CONDITIONS = ConditionList([('title',
                                 PretrainedWordEmbeddingCondition(vectors))])

    # 100 hidden units, 200 epochs, bernoulli prior, normalized inputs -> 0.174
    # activations = ['ReLU','SELU']
    # lrs = [(0.001, 0.0005), (0.001, 0.001)]
    hcs = [(100, 50), (300, 100)]
    epochs = [50, 100, 200, 500]

    # dropouts = [(.2,.2), (.1,.1), (.1, .2), (.25, .25), (.3,.3)] # .2,.2 is best
    # priors = ['categorical'] # gauss is best
    # normal = [True, False]
    # bernoulli was good, let's see if categorical is better... No
    import itertools
    models = [
        VAERecommender(conditions=CONDITIONS,
                       **params,
                       n_hidden=hc[0],
                       n_code=hc[1],
                       n_epochs=e) for hc, e in itertools.product(hcs, epochs)
    ]
    # models = [VAERecommender(conditions=CONDITIONS, **params)]
    evaluate(models)
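
A quick sanity check on the grid above (a sketch, not in the original script, and it would belong just before evaluate(models)): itertools.product pairs each (n_hidden, n_code) tuple with each epoch count, so the list holds 2 × 4 = 8 VAERecommender configurations.

    # Sketch (assumption): the grid expands to every hidden/code/epoch combination
    assert len(models) == len(hcs) * len(epochs)  # 2 * 4 = 8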
Example #5
    # RandomBaseline(),
    # MostPopular(),
    Countbased(),
    SVDRecommender(1000, use_title=False),
]

RECOMMENDERS = [
    AAERecommender(adversarial=False, lr=0.001, **ae_params),
    AAERecommender(prior='gauss', gen_lr=0.001, reg_lr=0.001, **ae_params),
    VAERecommender(conditions=None, **vae_params),
    DAERecommender(conditions=None, **ae_params)
]

# Metadata to use
CONDITIONS = ConditionList([
    ('title', PretrainedWordEmbeddingCondition(VECTORS)),
    #    ('author', CategoricalCondition(embedding_dim=32, reduce="sum",
    #                                    sparse=True, embedding_on_gpu=True))
])

# Model with metadata (metadata used as set in CONDITIONS above)
CONDITIONED_MODELS = [
    # TODO SVD can use only titles not generic conditions
    SVDRecommender(1000, use_title=True),
    AAERecommender(adversarial=False,
                   conditions=CONDITIONS,
                   lr=0.001,
                   **ae_params),
    AAERecommender(adversarial=True,
                   conditions=CONDITIONS,
                   gen_lr=0.001,
Example #6
MIN_COUNT = 55
# Use command line arg '-m' instead

TRACK_INFO = ['artist_name', 'track_name', 'album_name']
# TODO: find the side info fields
PLAYLIST_INFO = ['name']

# TFIDF_PARAMS = { 'max_features': N_WORDS }

W2V_PATH = "/data21/lgalke/vectors/GoogleNews-vectors-negative300.bin.gz"
W2V_IS_BINARY = True
VECTORS = KeyedVectors.load_word2vec_format(W2V_PATH, binary=W2V_IS_BINARY)
DATA_PATH = "/data21/lgalke/datasets/MPD/data/"

CONDITIONS = ConditionList([
    ('name', PretrainedWordEmbeddingCondition(VECTORS)),
    (
        'artist_name',
        CategoricalCondition(
            embedding_dim=32,
            reduce="sum",  # vocab_size=0.01,
            sparse=True,
            embedding_on_gpu=True)),
    ('track_name', PretrainedWordEmbeddingCondition(VECTORS)),
    ('album_name', PretrainedWordEmbeddingCondition(VECTORS))
])

# These need to be implemented in evaluation.py
METRICS = ['mrr']

MODELS = [
Example #7
    'n_hidden': 100,
    'normalize_inputs': True,
}
vae_params = {
    'n_code': 50,
    # VAE results get worse with more epochs in preliminary optimization
    # (PubMed with threshold 50)
    'n_epochs': 50,
    'batch_size': 100,
    'n_hidden': 100,
    'normalize_inputs': True,
}

# Metadata to use
CONDITIONS = ConditionList([('title',
                             PretrainedWordEmbeddingCondition(VECTORS))])

# Models without/with metadata (Reuters has only titles)
MODELS = [
    # Use no metadata (only item sets)
    Countbased(),
    SVDRecommender(10, use_title=False),
    AAERecommender(adversarial=False, lr=0.001, **ae_params),
    AAERecommender(adversarial=True,
                   prior='gauss',
                   gen_lr=0.001,
                   reg_lr=0.001,
                   **ae_params),
    VAERecommender(conditions=None, **vae_params),
    DAERecommender(conditions=None, **ae_params),
    # Use title (as defined in CONDITIONS above)
Example #8
def main(year,
         dataset,
         min_count=None,
         outfile=None,
         drop=1,
         baselines=False,
         autoencoders=False,
         conditioned_autoencoders=False,
         all_metadata=True):
    """ Main function for training and evaluating AAE methods on DBLP data """

    assert baselines or autoencoders or conditioned_autoencoders, "Please specify what to run"

    if all_metadata:
        # V2 - all metadata
        CONDITIONS = ConditionList([
            ('title', PretrainedWordEmbeddingCondition(VECTORS)),
            ('venue', PretrainedWordEmbeddingCondition(VECTORS)),
            (
                'author',
                CategoricalCondition(
                    embedding_dim=32,
                    reduce="sum",  # vocab_size=0.01,
                    sparse=False,
                    embedding_on_gpu=True))
        ])
    else:
        # V1 - only title metadata
        CONDITIONS = ConditionList([
            ('title', PretrainedWordEmbeddingCondition(VECTORS))
        ])
    #### CONDITIONS defined

    ALL_MODELS = []

    if baselines:
        # Models without metadata
        BASELINES = [
            # RandomBaseline(),
            # MostPopular(),
            Countbased(),
            SVDRecommender(1000, use_title=False)
        ]

        ALL_MODELS += BASELINES

        if not all_metadata:
            # SVD can use only titles not generic conditions
            ALL_MODELS += [SVDRecommender(1000, use_title=True)]

    if autoencoders:
        AUTOENCODERS = [
            AAERecommender(adversarial=False,
                           conditions=None,
                           lr=0.001,
                           **AE_PARAMS),
            AAERecommender(adversarial=True,
                           conditions=None,
                           gen_lr=0.001,
                           reg_lr=0.001,
                           **AE_PARAMS),
            VAERecommender(conditions=None, **AE_PARAMS),
            DAERecommender(conditions=None, **AE_PARAMS)
        ]
        ALL_MODELS += AUTOENCODERS

    if conditioned_autoencoders:
        # Model with metadata (metadata used as set in CONDITIONS above)
        CONDITIONED_AUTOENCODERS = [
            AAERecommender(adversarial=False,
                           conditions=CONDITIONS,
                           lr=0.001,
                           **AE_PARAMS),
            AAERecommender(adversarial=True,
                           conditions=CONDITIONS,
                           gen_lr=0.001,
                           reg_lr=0.001,
                           **AE_PARAMS),
            DecodingRecommender(CONDITIONS,
                                n_epochs=100,
                                batch_size=1000,
                                optimizer='adam',
                                n_hidden=100,
                                lr=0.001,
                                verbose=True),
            VAERecommender(conditions=CONDITIONS, **AE_PARAMS),
            DAERecommender(conditions=CONDITIONS, **AE_PARAMS)
        ]
        ALL_MODELS += CONDITIONED_AUTOENCODERS

    print("Finished preparing models:", *ALL_MODELS, sep='\n\t')

    path = DATA_PATH + ("dblp-ref/" if dataset == "dblp" else "acm.txt")
    print("Loading data from", path)
    papers = papers_from_files(path, dataset, n_jobs=4)
    print("Unpacking {} data...".format(dataset))
    bags_of_papers, ids, side_info = unpack_papers(papers)
    del papers
    bags = Bags(bags_of_papers, ids, side_info)
    # `args` here refers to a module-level parsed command-line namespace,
    # not a parameter of main()
    if args.compute_mi:
        from aaerec.utils import compute_mutual_info
        print("[MI] Dataset:", dataset)
        print("[MI] min Count:", min_count)
        tmp = bags.build_vocab(min_count=min_count, max_features=None)
        mi = compute_mutual_info(tmp,
                                 conditions=None,
                                 include_labels=True,
                                 normalize=True)
        with open('mi.csv', 'a') as mifile:
            print(dataset, min_count, mi, sep=',', file=mifile)

        print("=" * 78)
        exit(0)

    log("Whole dataset:", logfile=outfile)
    log(bags, logfile=outfile)

    evaluation = Evaluation(bags, year, logfile=outfile)
    evaluation.setup(min_count=min_count, min_elements=2, drop=drop)
    with open(outfile, 'a') as fh:
        print("~ Partial List + Titles + Author + Venue", "~" * 42, file=fh)
    evaluation(ALL_MODELS, batch_size=1000)
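
For illustration, a hypothetical way to drive this function (the argument values below are assumptions chosen for the sketch, not taken from the original code; note that main() also reads a module-level `args` namespace in the compute_mi branch, plus globals such as VECTORS, AE_PARAMS and DATA_PATH, so it is normally invoked via the script's own argument parser):

# Hypothetical invocation; every value below is an assumption for illustration only
main(2017, "dblp",
     min_count=50,
     outfile="dblp-eval.log",
     drop=1,
     baselines=True,
     autoencoders=True,
     conditioned_autoencoders=True,
     all_metadata=True)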