Example #1
def prepare_dataloader(data_dict, args):
    g = data_dict['graph']
    user_ntype = data_dict['user_ntype']
    item_ntype = data_dict['item_ntype']
    textset = data_dict['textset']
    # Sampler
    batch_sampler = sampler_module.ItemToItemBatchSampler(
        g, user_ntype, item_ntype, args.batch_size)
    neighbor_sampler = sampler_module.NeighborSampler(
        g, user_ntype, item_ntype, args.random_walk_length,
        args.random_walk_restart_prob, args.num_random_walks,
        args.num_neighbors, args.num_layers)
    collator = sampler_module.PinSAGECollator(neighbor_sampler, g, item_ntype,
                                              textset)
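    # The item-to-item batch sampler yields (heads, tails, neg_tails) item-ID
    # triplets indefinitely, so the training loader below never exhausts.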
    dataloader = DataLoader(batch_sampler,
                            collate_fn=collator.collate_train,
                            num_workers=args.num_workers)

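    # Test loader: enumerates every item node ID once, in batches, so that
    # collate_test can build inference blocks for all items.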
    dataloader_test = DataLoader(torch.arange(g.number_of_nodes(item_ntype)),
                                 batch_size=args.batch_size,
                                 collate_fn=collator.collate_test,
                                 num_workers=args.num_workers)
    dataloader_it = iter(dataloader)

    return dataloader_it, dataloader_test, neighbor_sampler
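The snippets on this page are excerpted from DGL's PinSAGE example and omit their module-level imports. A minimal preamble that would let them resolve, assuming the example's companion files sampler.py and evaluation.py (and a model.py defining PinSAGEModel) sit next to the script; the module aliases are taken from the snippets themselves:

import pickle

import torch
import torch.nn as nn
import torch.distributed as dist   # only needed by the DDP variant (Example #3)
import torchtext
import tqdm
from torch.utils.data import DataLoader

import sampler as sampler_module   # sampler.py from the PinSAGE example
import evaluation                  # evaluation.py from the PinSAGE example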
Example #2
def train(dataset, args):
    g = dataset['train-graph']
    val_matrix = dataset['val-matrix'].tocsr()
    test_matrix = dataset['test-matrix'].tocsr()
    item_texts = dataset['item-texts']
    user_ntype = dataset['user-type']
    item_ntype = dataset['item-type']
    user_to_item_etype = dataset['user-to-item-type']
    timestamp = dataset['timestamp-edge-column']

    device = torch.device(args.device)

    # Prepare torchtext dataset and vocabulary
    fields = {}
    examples = []
    for key, texts in item_texts.items():
        fields[key] = torchtext.data.Field(include_lengths=True,
                                           lower=True,
                                           batch_first=True)
    for i in range(g.number_of_nodes(item_ntype)):
        example = torchtext.data.Example.fromlist(
            [item_texts[key][i] for key in item_texts.keys()],
            [(key, fields[key]) for key in item_texts.keys()])
        examples.append(example)
    textset = torchtext.data.Dataset(examples, fields)
    for key, field in fields.items():
        field.build_vocab(getattr(textset, key))
        #field.build_vocab(getattr(textset, key), vectors='fasttext.simple.300d')

    # Sampler
    batch_sampler = sampler_module.ItemToItemBatchSampler(
        g, user_ntype, item_ntype, args.batch_size)
    neighbor_sampler = sampler_module.NeighborSampler(
        g, user_ntype, item_ntype, args.random_walk_length,
        args.random_walk_restart_prob, args.num_random_walks,
        args.num_neighbors, args.num_layers)
    collator = sampler_module.PinSAGECollator(neighbor_sampler, g, item_ntype,
                                              textset)
    dataloader = DataLoader(batch_sampler,
                            collate_fn=collator.collate_train,
                            num_workers=args.num_workers)
    dataloader_test = DataLoader(torch.arange(g.number_of_nodes(item_ntype)),
                                 batch_size=args.batch_size,
                                 collate_fn=collator.collate_test,
                                 num_workers=args.num_workers)
    dataloader_it = iter(dataloader)

    # Model
    model = PinSAGEModel(g, item_ntype, textset, args.hidden_dims,
                         args.num_layers).to(device)
    item_emb = nn.Embedding(g.number_of_nodes(item_ntype),
                            args.hidden_dims,
                            sparse=True)
    # Optimizers: Adam for the model, SparseAdam for the sparse item embedding
    opt = torch.optim.Adam(model.parameters(), lr=args.lr)
    opt_emb = torch.optim.SparseAdam(item_emb.parameters(), lr=args.lr)

    # For each batch of head-tail-negative triplets...
    for epoch_id in range(args.num_epochs):
        model.train()
        for batch_id in tqdm.trange(args.batches_per_epoch):
            pos_graph, neg_graph, blocks = next(dataloader_it)
            # Copy to GPU
            for i in range(len(blocks)):
                blocks[i] = blocks[i].to(device)
            pos_graph = pos_graph.to(device)
            neg_graph = neg_graph.to(device)

            loss = model(pos_graph, neg_graph, blocks, item_emb).mean()
            opt.zero_grad()
            opt_emb.zero_grad()
            loss.backward()
            opt.step()
            opt_emb.step()

        # Evaluate
        model.eval()
        with torch.no_grad():
            item_batches = torch.arange(g.number_of_nodes(item_ntype)).split(
                args.batch_size)
            h_item_batches = []
            for blocks in tqdm.tqdm(dataloader_test):
                for i in range(len(blocks)):
                    blocks[i] = blocks[i].to(device)

                h_item_batches.append(model.get_repr(blocks, item_emb))
            h_item = torch.cat(h_item_batches, 0)

            print(
                evaluation.evaluate_nn(dataset, h_item, args.k,
                                       args.batch_size))
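A sketch of how this train function might be driven end to end. The flag names below mirror the args.* attributes the snippet reads; the defaults and the dataset_path argument are illustrative assumptions, not the original script's CLI:

import argparse
import pickle

parser = argparse.ArgumentParser()
parser.add_argument('dataset_path')
parser.add_argument('--batch-size', type=int, default=32)
parser.add_argument('--random-walk-length', type=int, default=2)
parser.add_argument('--random-walk-restart-prob', type=float, default=0.5)
parser.add_argument('--num-random-walks', type=int, default=10)
parser.add_argument('--num-neighbors', type=int, default=3)
parser.add_argument('--num-layers', type=int, default=2)
parser.add_argument('--hidden-dims', type=int, default=16)
parser.add_argument('--lr', type=float, default=3e-5)
parser.add_argument('--num-epochs', type=int, default=1)
parser.add_argument('--batches-per-epoch', type=int, default=5000)
parser.add_argument('--num-workers', type=int, default=0)
parser.add_argument('--device', default='cpu')
parser.add_argument('-k', type=int, default=10)
args = parser.parse_args()

with open(args.dataset_path, 'rb') as f:
    dataset = pickle.load(f)
train(dataset, args)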
Example #3
def train(gpu, args):
    # Load dataset
    with open(args.dataset_path, 'rb') as f:
        dataset = pickle.load(f)
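    # Global rank of this process: node index (args.nr) * GPUs per node
    # (args.gpus) + local GPU index.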
    rank = args.nr * args.gpus + gpu
    g = dataset['train-graph']
    val_matrix = dataset['val-matrix'].tocsr()
    test_matrix = dataset['test-matrix'].tocsr()
    item_texts = dataset['item-texts']
    user_ntype = dataset['user-type']
    item_ntype = dataset['item-type']
    user_to_item_etype = dataset['user-to-item-type']
    timestamp = dataset['timestamp-edge-column']
    dist.init_process_group(backend='nccl',
                            init_method='env://',
                            world_size=args.world_size,
                            rank=rank)

    device = torch.device(f'cuda:{gpu}' if torch.cuda.is_available() else 'cpu')

    print(device)

    # Assign user and movie IDs and use them as features (to learn an individual trainable
    # embedding for each entity)
    g.nodes[user_ntype].data['id'] = torch.arange(
        g.number_of_nodes(user_ntype))
    g.nodes[item_ntype].data['id'] = torch.arange(
        g.number_of_nodes(item_ntype))

    # Prepare torchtext dataset and vocabulary
    fields = {}
    examples = []
    for key, texts in item_texts.items():
        fields[key] = torchtext.data.Field(include_lengths=True,
                                           lower=True,
                                           batch_first=True)
    for i in range(g.number_of_nodes(item_ntype)):
        example = torchtext.data.Example.fromlist(
            [item_texts[key][i] for key in item_texts.keys()],
            [(key, fields[key]) for key in item_texts.keys()])
        examples.append(example)
    textset = torchtext.data.Dataset(examples, fields)
    for key, field in fields.items():
        field.build_vocab(getattr(textset, key))
        #field.build_vocab(getattr(textset, key), vectors='fasttext.simple.300d')

    # Sampler
    batch_sampler = sampler_module.ItemToItemBatchSampler(
        g, user_ntype, item_ntype, args.batch_size)
    neighbor_sampler = sampler_module.NeighborSampler(
        g, user_ntype, item_ntype, args.random_walk_length,
        args.random_walk_restart_prob, args.num_random_walks,
        args.num_neighbors, args.num_layers)
    collator = sampler_module.PinSAGECollator(neighbor_sampler, g, item_ntype,
                                              textset)
    dataloader = DataLoader(batch_sampler,
                            collate_fn=collator.collate_train,
                            num_workers=args.num_workers)
    dataloader_test = DataLoader(torch.arange(g.number_of_nodes(item_ntype)),
                                 batch_size=args.batch_size,
                                 collate_fn=collator.collate_test,
                                 num_workers=args.num_workers)
    dataloader_it = iter(dataloader)

    print(args.num_layers)
    # Model
    model = PinSAGEModel(g, item_ntype, textset, args.hidden_dims,
                         args.num_layers).to(device)
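    # Wrap the model for multi-GPU training; DDP all-reduces gradients
    # across ranks during backward().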
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    # Optimizer
    print(model)
    opt = torch.optim.Adam(model.parameters(), lr=args.lr)

    # For each batch of head-tail-negative triplets...
    for epoch_id in range(args.num_epochs):
        model.train()
        for batch_id in tqdm.trange(args.batches_per_epoch):
            # The batch sampler is infinite, so draw exactly one batch per
            # iteration via the iterator instead of looping over the loader
            # (the original inner `for ... in dataloader` never terminates).
            pos_graph, neg_graph, blocks = next(dataloader_it)
            # Copy to GPU
            for i in range(len(blocks)):
                blocks[i] = blocks[i].to(device)
            pos_graph = pos_graph.to(device)
            neg_graph = neg_graph.to(device)

            loss = model(pos_graph, neg_graph, blocks).mean()
            opt.zero_grad()
            loss.backward()
            opt.step()

        # Evaluate
        model.eval()
        with torch.no_grad():
            item_batches = torch.arange(g.number_of_nodes(item_ntype)).split(
                args.batch_size)
            h_item_batches = []
            for blocks in dataloader_test:
                for i in range(len(blocks)):
                    blocks[i] = blocks[i].to(device)

                # .module unwraps DDP; the wrapper does not expose custom methods.
                h_item_batches.append(model.module.get_repr(blocks))
            h_item = torch.cat(h_item_batches, 0)

            print(
                evaluation.evaluate_nn(dataset, h_item, args.k,
                                       args.batch_size))
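Since this variant derives a global rank from args.nr and args.gpus and initializes NCCL with init_method='env://', it is presumably launched with one process per GPU. A launcher following the standard torch.multiprocessing pattern could look like this; args.nodes, the address, and the port are assumptions, not code from the original script:

import os
import torch.multiprocessing as mp

def main(args):
    # args.nodes (machine count) is an assumed field; args.nr indexes this machine.
    args.world_size = args.gpus * args.nodes
    os.environ['MASTER_ADDR'] = '127.0.0.1'   # rendezvous for init_method='env://'
    os.environ['MASTER_PORT'] = '29500'
    # Spawn one training process per local GPU; each receives its gpu index.
    mp.spawn(train, nprocs=args.gpus, args=(args,))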
Example #4
def train(dataset, args):
    g = dataset['train-graph']
    val_matrix = dataset['val-matrix'].tocsr()
    test_matrix = dataset['test-matrix'].tocsr()
    item_texts = dataset['item-texts']
    user_ntype = dataset['user-type']
    item_ntype = dataset['item-type']
    user_to_item_etype = dataset['user-to-item-type']
    timestamp = dataset['timestamp-edge-column']

    device = torch.device(args.device)

    # Assign user and movie IDs and use them as features (to learn an individual trainable
    # embedding for each entity)
    g.nodes[user_ntype].data['id'] = torch.arange(g.num_nodes(user_ntype))
    g.nodes[item_ntype].data['id'] = torch.arange(g.num_nodes(item_ntype))

    # Prepare torchtext dataset and vocabulary
    textset = {}
    tokenizer = get_tokenizer(None)

    textlist = []
    batch_first = True

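    # Tokenize every text field of every item into one flat list of token
    # lists; the vocabulary below is built over all of them.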
    for i in range(g.num_nodes(item_ntype)):
        for key in item_texts.keys():
            tokens = tokenizer(item_texts[key][i].lower())
            textlist.append(tokens)
    for key in item_texts.keys():
        vocab2 = build_vocab_from_iterator(textlist,
                                           specials=["<unk>", "<pad>"])
        textset[key] = (textlist, vocab2, vocab2.get_stoi()['<pad>'],
                        batch_first)

    # Sampler
    batch_sampler = sampler_module.ItemToItemBatchSampler(
        g, user_ntype, item_ntype, args.batch_size)
    neighbor_sampler = sampler_module.NeighborSampler(
        g, user_ntype, item_ntype, args.random_walk_length,
        args.random_walk_restart_prob, args.num_random_walks,
        args.num_neighbors, args.num_layers)
    collator = sampler_module.PinSAGECollator(neighbor_sampler, g, item_ntype,
                                              textset)
    dataloader = DataLoader(batch_sampler,
                            collate_fn=collator.collate_train,
                            num_workers=args.num_workers)
    dataloader_test = DataLoader(torch.arange(g.num_nodes(item_ntype)),
                                 batch_size=args.batch_size,
                                 collate_fn=collator.collate_test,
                                 num_workers=args.num_workers)
    dataloader_it = iter(dataloader)

    # Model
    model = PinSAGEModel(g, item_ntype, textset, args.hidden_dims,
                         args.num_layers).to(device)
    # Optimizer
    opt = torch.optim.Adam(model.parameters(), lr=args.lr)

    # For each batch of head-tail-negative triplets...
    for epoch_id in range(args.num_epochs):
        model.train()
        for batch_id in tqdm.trange(args.batches_per_epoch):
            pos_graph, neg_graph, blocks = next(dataloader_it)
            # Copy to GPU
            for i in range(len(blocks)):
                blocks[i] = blocks[i].to(device)
            pos_graph = pos_graph.to(device)
            neg_graph = neg_graph.to(device)

            loss = model(pos_graph, neg_graph, blocks).mean()
            opt.zero_grad()
            loss.backward()
            opt.step()

        # Evaluate
        model.eval()
        with torch.no_grad():
            item_batches = torch.arange(g.num_nodes(item_ntype)).split(
                args.batch_size)
            h_item_batches = []
            for blocks in dataloader_test:
                for i in range(len(blocks)):
                    blocks[i] = blocks[i].to(device)

                h_item_batches.append(model.get_repr(blocks))
            h_item = torch.cat(h_item_batches, 0)

            print(
                evaluation.evaluate_nn(dataset, h_item, args.k,
                                       args.batch_size))
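Example #4 is the same training loop ported to the post-0.12 torchtext API: the legacy Field/Example/Dataset machinery of Examples #2 and #3 is replaced by a plain tokenizer plus build_vocab_from_iterator. It assumes the imports below, and the vocabulary behaves as in this small check (the sample string is illustrative):

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tok = get_tokenizer(None)          # None selects a simple whitespace split
vocab = build_vocab_from_iterator([tok('some movie title')],
                                  specials=['<unk>', '<pad>'])
print(vocab.get_stoi()['<pad>'])   # -> 1: specials are prepended, so '<pad>' is index 1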