def forward(self, graph: dgl.DGLHeteroGraph):
    """Run both MLP-GNN stages, post-processing the node feature ``'data'``.

    Pipeline: ``mlp_gnn1`` -> ``ln1`` -> ``inner_layers`` -> ``mlp_gnn2``
    -> ``ln2``. Every non-GNN step reads ``graph.ndata['data']`` and writes
    its result back under the same key.

    :param graph: graph carrying the node feature ``'data'``
    :return: the graph returned by ``mlp_gnn2`` with ``'data'`` passed
        through ``ln2``
    """
    graph = self.mlp_gnn1(graph)
    # ln1 then inner_layers, each applied in place on the node feature.
    for stage in (self.ln1, self.inner_layers):
        graph.ndata['data'] = stage(graph.ndata['data'])

    graph = self.mlp_gnn2(graph)
    graph.ndata['data'] = self.ln2(graph.ndata['data'])
    return graph
def forward(
    self,
    hg: dgl.DGLHeteroGraph,
    inputs: Dict[str, torch.Tensor],
) -> Dict[str, torch.Tensor]:
    """Apply the relational convolution followed by ``_apply_layers``.

    :param hg: graph or block to convolve over; a local copy is used so the
        caller's graph is left untouched
    :param inputs: input features keyed by node type
    :return: output features keyed by node type
    """
    hg = hg.local_var()

    # Per-relation keyword arguments forwarded to the wrapped convolution.
    relation_kwargs = {}
    if self._use_weight:
        stacked_weight = self.basis() if self._use_basis else self.weight
        # One slice along dim 0 per relation, in the order of _rel_names.
        for rel_index, rel_weight in enumerate(torch.split(stacked_weight, 1, dim=0)):
            relation_kwargs[self._rel_names[rel_index]] = {
                'weight': rel_weight.squeeze(dim=0),
            }

    # Destination-side inputs are only prepared when the self loop is enabled.
    dst_inputs = None
    if self._use_self_loop:
        dst_inputs = inputs
        if hg.is_block:
            # Keep only the leading rows, one per destination node of the block.
            dst_inputs = {
                ntype: feat[:hg.num_dst_nodes(ntype)]
                for ntype, feat in inputs.items()
            }

    conv_out = self._conv(hg, inputs, mod_kwargs=relation_kwargs)
    return {
        ntype: self._apply_layers(ntype, feat, dst_inputs)
        for ntype, feat in conv_out.items()
    }
def inference(
    self,
    hg: dgl.DGLHeteroGraph,
    batch_size: int,
    num_workers: int,
    embedding_layer: nn.Module,
    device: torch.device,
) -> Dict[str, torch.Tensor]:
    """Compute representations of every node of ``hg``, one layer at a time.

    Each layer is evaluated over the whole graph in mini-batches built from
    full one-hop neighbourhoods; its outputs become the next layer's inputs.

    :param hg: the graph to run inference on
    :param batch_size: number of output nodes per mini-batch
    :param num_workers: number of worker processes used by the dataloader
    :param embedding_layer: module producing the first layer's inputs; it is
        called as ``embedding_layer(in_nodes=..., device=...)``
    :param device: device on which blocks, node ids and outputs are placed
    :return: output features of the last layer, keyed by node type
    """
    # Neither the sampler nor the loader depends on the layer, so they are
    # built once and simply re-iterated for every layer.
    sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
    dataloader = dgl.dataloading.NodeDataLoader(
        hg,
        {ntype: hg.nodes(ntype) for ntype in hg.ntypes},
        sampler,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        num_workers=num_workers,
    )
    last_layer = self._num_layers - 1

    for i, layer in enumerate(self._layers):
        is_last = i >= last_layer
        # Hidden width for intermediate layers, output width for the last one.
        width = self._out_feats if is_last else self._hidden_feats
        y = {
            ntype: torch.zeros(hg.num_nodes(ntype), width, device=device)
            for ntype in hg.ntypes
        }

        for in_nodes, out_nodes, blocks in dataloader:
            in_nodes = {ntype: nid.to(device) for ntype, nid in in_nodes.items()}
            out_nodes = {ntype: nid.to(device) for ntype, nid in out_nodes.items()}
            block = blocks[0].to(device)

            if i == 0:
                h = embedding_layer(in_nodes=in_nodes, device=device)
            else:
                # ``x`` holds the previous layer's outputs for all nodes.
                h = {ntype: x[ntype][in_nodes[ntype]] for ntype in hg.ntypes}

            h = layer(block, h)
            if not is_last:
                h = self._apply_layers(i, h)

            for ntype in h:
                y[ntype][out_nodes[ntype]] = h[ntype]

        x = y

    return x
def __init__(self, g: dgl.DGLHeteroGraph, fanouts: List[int], negative_sampling: bool):
    """Store the graph and sampling configuration.

    :param g: graph to sample from
    :param fanouts: fanout values, stored as given
    :param negative_sampling: whether negative sampling is enabled; when
        truthy, per-node weights ``in_degree ** 0.75`` are precomputed
    """
    self._g = g
    self._fanouts = fanouts
    self._negative_sampling = negative_sampling
    # Truthiness instead of ``is True``: a truthy flag that is not the
    # ``True`` singleton (e.g. numpy.bool_ or 1) used to leave the weights
    # as None even though ``_negative_sampling`` was stored as enabled.
    if negative_sampling:
        self._negative_weights = g.in_degrees().float() ** 0.75
    else:
        self._negative_weights = None
def _propagate_user_to_item(self, block: DGLHeteroGraph) -> th.Tensor:
    """Aggregate user messages onto the destination item nodes.

    For every rating in ``self._rating_set`` the edge messages of that edge
    type are averaged per item into ``h_<rating>``. The item features and
    all per-rating aggregates (zeros when an aggregate is absent) are
    concatenated along dim 1 and passed through ``_item_aggregate_layer``
    and ``_agg_activation``.

    :param block: block whose destination ``"item"`` nodes hold ``"item_features"``
    :return: output of the activation applied to the aggregated item features
    """
    with block.local_scope():
        for etype in self._rating_set:
            message_key = f"m_{etype}"

            def compute_message(edges):
                # Invoked by apply_edges within this same iteration.
                return self._compute_message_user_to_item(edges, etype)

            block.apply_edges(compute_message, etype=etype)
            block.update_all(
                dgl_fn.copy_e("m", message_key),
                dgl_fn.mean(message_key, f"h_{etype}"),
                etype=etype,
            )

        item_features: th.Tensor = block.dstnodes["item"].data["item_features"]

        def rating_aggregate(rating):
            # Use the aggregated feature when present, zeros otherwise.
            key = f"h_{rating}"
            if key in block.dstnodes["item"].data:
                return block.dstnodes["item"].data[key]
            return th.zeros(
                item_features.shape[0],
                self._embedding_dim,
                dtype=item_features.dtype,
                device=item_features.device,
            )

        stacked = [item_features]
        stacked.extend(rating_aggregate(rating) for rating in self._rating_set)
        combined = th.cat(stacked, dim=1)
        return self._agg_activation(self._item_aggregate_layer(combined))
def _propagate_item_to_user(self, block: DGLHeteroGraph) -> th.Tensor:
    """Aggregate item messages onto the destination user nodes.

    Source items first receive an ``"item_id_embedding"`` looked up from
    their ``dgl.NID``. For every rating, the ``rev-<rating>`` edge messages
    are averaged per user into ``h_rev-<rating>``. The user features and all
    per-rating aggregates (zeros when an aggregate is absent) are
    concatenated along dim 1 and passed through ``_user_aggregate_layer``
    and ``_agg_activation``.

    :param block: block whose destination ``"user"`` nodes hold ``"user_features"``
    :return: output of the activation applied to the aggregated user features
    """
    with block.local_scope():
        source_items = block.srcnodes["item"]
        source_items.data["item_id_embedding"] = self._item_id_embedding_layer(
            source_items.data[dgl.NID])

        reverse_etypes = [f"rev-{rating}" for rating in self._rating_set]
        for etype in reverse_etypes:
            message_key = f"m_{etype}"

            def compute_message(edges):
                # Invoked by apply_edges within this same iteration.
                return self._compute_message_item_to_user(edges, etype)

            block.apply_edges(compute_message, etype=etype)
            block.update_all(
                dgl_fn.copy_e("m", message_key),
                dgl_fn.mean(message_key, f"h_{etype}"),
                etype=etype,
            )

        user_feature = block.dstnodes["user"].data["user_features"]

        def rating_aggregate(rating):
            # Use the aggregated feature when present, zeros otherwise.
            key = f"h_rev-{rating}"
            if key in block.dstnodes["user"].data:
                return block.dstnodes["user"].data[key]
            return th.zeros(
                user_feature.shape[0],
                self._embedding_dim,
                dtype=user_feature.dtype,
                device=user_feature.device,
            )

        stacked = [user_feature]
        stacked.extend(rating_aggregate(rating) for rating in self._rating_set)
        combined = th.cat(stacked, dim=1)
        return self._agg_activation(self._user_aggregate_layer(combined))
def forward(self, graph: dgl.DGLHeteroGraph, relation_target_node_features: dict, relation_embedding: dict):
    """Run one relation-aware layer: convolution, residual, relation crossing, norm.

    :param graph: dgl.DGLHeteroGraph
    :param relation_target_node_features: dict,
        {(srctype, etype, dsttype): target_node_features shape (N_nodes, input_dim)},
        the representation of the target nodes under each relation
    :param relation_embedding: embedding for each relation, dict, {etype: feature}
    :return: tuple ``(output_features_dict, relation_embedding_dict)`` where
        output_features_dict is {(srctype, etype, dsttype): target_node_features}
        after the relation crossing layer and relation_embedding_dict is
        {etype: relation_embedding} after the relation update
    """
    # In each relation the target type of nodes has its own representation.
    input_src = relation_target_node_features
    if graph.is_block:
        # Destination nodes are the leading rows of a block's features.
        input_dst = {
            relation: relation_target_node_features[relation][:graph.number_of_dst_nodes(relation[2])]
            for relation in relation_target_node_features
        }
    else:
        input_dst = relation_target_node_features

    # dict {(srctype, etype, dsttype): target_node_features}
    output_features = self.hetero_conv(graph, input_src, input_dst, relation_embedding,
                                       self.node_transformation_weight,
                                       self.relation_transformation_weight)

    # Residual connection for the target node, gated per destination type.
    if self.residual:
        for relation in output_features:
            dsttype = relation[2]
            # torch.sigmoid replaces the deprecated F.sigmoid.
            alpha = torch.sigmoid(self.residual_weight[dsttype])
            output_features[relation] = output_features[relation] * alpha + \
                self.res_fc[dsttype](input_dst[relation]) * (1 - alpha)

    # Stack the relation-specific features once per destination type instead
    # of once per relation:
    # (dsttype_node_relations_num, dst_nodes_num, n_heads * hidden_dim)
    features_by_dsttype = {}
    for relation in output_features:
        features_by_dsttype.setdefault(relation[2], []).append(output_features[relation])
    stacked_by_dsttype = {
        dsttype: torch.stack(features, dim=0)
        for dsttype, features in features_by_dsttype.items()
    }

    # Different relations crossing layer.
    output_features_dict = {}
    for srctype, etype, dsttype in output_features:
        output_features_dict[(srctype, etype, dsttype)] = self.relations_crossing_layer(
            stacked_by_dsttype[dsttype], self.relations_crossing_attention_weight[etype])

    # Layer norm for the output.
    if self.norm:
        for relation in output_features_dict:
            output_features_dict[relation] = self.layer_norm[relation[2]](output_features_dict[relation])

    # Relation embeddings after the relation update.
    relation_embedding_dict = {
        etype: self.relation_propagation_layer[etype](relation_embedding[etype])
        for etype in relation_embedding
    }

    return output_features_dict, relation_embedding_dict
def normalize_weight(g: dgl.DGLHeteroGraph, weight):
    """Divide every edge weight by the mean incoming weight of its destination.

    For each node the incoming weights are summed and divided by the node's
    in-degree; each edge weight is then divided by that value taken at the
    edge's destination node. The work happens on a local copy, so ``g``
    keeps its own features.

    :param g: graph whose edges carry ``weight``
    :param weight: edge weights, assigned to ``edata['weight']``
        (NOTE(review): presumably shaped ``(E, 1)`` so the division by the
        ``(N, 1)`` in-degree broadcasts row-wise; confirm against callers)
    :return: the normalized edge weights
    """
    graph = g.local_var()
    graph.edata['weight'] = weight

    # in_degrees() already covers all nodes; no explicit id range is needed.
    graph.ndata['in_deg'] = graph.in_degrees().float().unsqueeze(-1)

    # copy_e is the supported name of the deprecated copy_edge alias.
    graph.update_all(
        dgl.function.copy_e('weight', 'edge_w'),
        dgl.function.sum('edge_w', 'total'),
        lambda nodes: {'norm': nodes.data['total'] / nodes.data['in_deg']})
    graph.apply_edges(
        lambda edges: {'weight': edges.data['weight'] / edges.dst['norm']})
    return graph.edata['weight']
def prepare_mp(g: dgl.DGLHeteroGraph):
    """
    Explicitly materialize the CSR, CSC and COO representation of the given graph
    so that they could be shared via copy-on-write to sampler workers and GPU
    trainers.

    This is a workaround before full shared memory support on heterogeneous graphs.

    Relations are addressed by their canonical ``(srctype, etype, dsttype)``
    triple, so graphs that reuse one edge-type name across several relations
    are handled. Relations without edges are skipped because they have no
    edge 0 to query.
    """
    for canonical_etype in g.canonical_etypes:
        if g.number_of_edges(canonical_etype) == 0:
            continue
        # The query results are discarded; the calls are made only for their
        # effect on the graph's internal representations (see above).
        g.in_degrees([0], etype=canonical_etype)
        g.out_degrees([0], etype=canonical_etype)
        g.find_edges([0], etype=canonical_etype)
def forward(self, graph: dgl.DGLHeteroGraph, **kwargs):
    """Propagate ``'data'`` along every edge type and store the result on the graph.

    For each edge type, the module registered for it in ``self.edges_modules``
    transforms the source nodes' ``'data'``; every destination node then
    receives the mean of its incoming messages as its new ``'data'``.

    The previous version ran ``update_all`` inside ``graph.local_scope()``,
    which discards feature writes on exit, so the graph was returned with
    ``'data'`` unchanged. The update is now applied to ``graph`` itself.

    Edge types are processed sequentially in ``graph.etypes`` order; each one
    overwrites ``'data'`` on its destination node type.

    :param graph: graph carrying the node feature ``'data'``
    :param kwargs: accepted for interface compatibility, unused
    :return: the same graph with ``'data'`` updated in place
    """

    def message_func(edges):
        # Select the module by the edge-type name of the processed relation.
        et = edges.canonical_etype[1]
        return {'m': self.edges_modules[et](edges.src['data'])}

    def reduce_func(nodes):
        # Mean over the mailbox's message dimension.
        return {'data': torch.mean(nodes.mailbox['m'], dim=1)}

    for et in graph.etypes:
        graph.update_all(
            message_func=message_func,
            reduce_func=reduce_func,
            etype=et,
        )
    return graph
def forward(self, edge_subgraph: dgl.DGLHeteroGraph, nodes_representation: dict, etype: str):
    """Score the edges of type ``etype`` from projected node representations.

    :param edge_subgraph: sampled subgraph
    :param nodes_representation: input node features, dict keyed by node type
    :param etype: predict edge type, str
    :return: sigmoid of the dot product between the projected end-point
        representations of every ``etype`` edge
    """
    # Work on a local copy so the caller's subgraph keeps its features.
    local_graph = edge_subgraph.local_var()
    relation_graph = local_graph[etype]

    for ntype, representation in nodes_representation.items():
        relation_graph.nodes[ntype].data['h'] = self.projection_layer(representation)

    relation_graph.apply_edges(fn.u_dot_v('h', 'h', 'score'), etype=etype)
    scores = relation_graph.edata['score']
    return self.sigmoid(scores)
def forward(self, graph: dgl.DGLHeteroGraph, feat: tuple, dst_node_transformation_weight: nn.Parameter,
            src_node_transformation_weight: nn.Parameter, src_nodes_attention_weight: nn.Parameter):
    r"""Compute one relation-specific graph attention layer.

    Parameters
    ----------
    graph : specific relational DGLHeteroGraph
    feat : pair of torch.Tensor
        ``feat[0]`` holds the source features (N_src, input_src_dim) and
        ``feat[1]`` the destination features (N_dst, input_dst_dim).
    dst_node_transformation_weight : Parameter (input_dst_dim, n_heads * hidden_dim)
    src_node_transformation_weight : Parameter (input_src_dim, n_heads * hidden_dim)
    src_nodes_attention_weight : Parameter (n_heads, 2 * hidden_dim)
        The first ``hidden_dim`` columns score the destination side, the
        remaining ones the source side.

    Returns
    -------
    torch.Tensor, shape (N_dst, n_heads * hidden_dim)
        ReLU-activated destination features with the heads flattened.
    """
    graph = graph.local_var()
    num_heads, hidden_dim = self._num_heads, self._out_feats

    # Dropout is applied to the source features first, then the destination.
    dropped_src = self.dropout(feat[0])
    dropped_dst = self.dropout(feat[1])

    # Project both sides and split the result into heads:
    # (N, input_dim) x (input_dim, n_heads * hidden_dim) -> (N, n_heads, hidden_dim)
    projected_src = torch.matmul(dropped_src, src_node_transformation_weight)
    projected_src = projected_src.view(-1, num_heads, hidden_dim)
    projected_dst = torch.matmul(dropped_dst, dst_node_transformation_weight)
    projected_dst = projected_dst.view(-1, num_heads, hidden_dim)

    # a^T [Wh_i || Wh_j] is evaluated as a_l Wh_i + a_r Wh_j, so each side's
    # score is computed per node and only the sum is done per edge.
    attn_dst = src_nodes_attention_weight[:, :hidden_dim]
    attn_src = src_nodes_attention_weight[:, hidden_dim:]
    score_dst = (projected_dst * attn_dst).sum(dim=-1, keepdim=True)  # (N_dst, n_heads, 1)
    score_src = (projected_src * attn_src).sum(dim=-1, keepdim=True)  # (N_src, n_heads, 1)

    graph.srcdata['ft'] = projected_src
    graph.srcdata['e_src'] = score_src
    graph.dstdata['e_dst'] = score_dst

    # Edge attention logits, shape (edges_num, heads, 1).
    graph.apply_edges(fn.u_add_v('e_src', 'e_dst', 'e'))
    logits = self.leaky_relu(graph.edata.pop('e'))

    # Softmax over the incoming edges, then attention-weighted sum of sources.
    graph.edata['a'] = edge_softmax(graph, logits)
    graph.update_all(fn.u_mul_e('ft', 'a', 'msg'), fn.sum('msg', 'ft'))

    # (N_dst, n_heads, hidden_dim) -> (N_dst, n_heads * hidden_dim)
    flattened = graph.dstdata.pop('ft').reshape(-1, num_heads * hidden_dim)
    return F.relu(flattened)
def forward(self, graph: dgl.DGLHeteroGraph):
    """Embed the nodes, run the GNN and pool the nodes into a graph-level output.

    ``graph.ndata['data']`` is overwritten along the way: the caller's graph
    ends up holding the concatenation of the source embedding and the GNN
    output. The scoring and pooling features are kept in a local scope.

    :param graph: graph (or batch of graphs) carrying the node feature ``'data'``
    :return: ``out_linear`` applied to the weighted sum of transformed node features
    """
    # Embedding block; shape notes in brackets are the original author's.
    if self.embed is not None:
        graph.ndata['data'] = self.embed(graph.ndata['data'])
    source_embedding = graph.ndata['data']  # [B*N, 100]
    graph.ndata['data'] = self.embed_modules(source_embedding)  # [B*N, 64]

    # MLP-GNN, then skip-concatenate the source embedding.
    graph = self.gnn_modules(graph)
    graph.ndata['data'] = torch.cat(
        (source_embedding, graph.ndata['data']), dim=-1)  # [B*N, 164]

    # Readout: softmax-weighted sum of the transformed node features.
    with graph.local_scope():
        node_data = graph.ndata['data']
        graph.ndata['scoring_out'] = self.score_mlp(node_data)
        node_weights = dgl.softmax_nodes(graph, 'scoring_out')
        graph.ndata['node_embed'] = node_weights * self.transform_mlp(node_data)  # [B*N, 8]
        pooled = dgl.sum_nodes(graph, 'node_embed')

    return self.out_linear(pooled)
def forward(self, graph: dgl.DGLHeteroGraph, feat, weight):
    """Aggregate weighted neighbour messages and transform the result.

    Messages built by ``self.message_func`` are summed per destination node,
    added to the destination's own feature and divided by ``in_degree + 1``.
    The result goes through ``fc_neigh`` and, when configured, the
    activation and the normalization.

    :param graph: graph or block to aggregate over
    :param feat: pair ``(source features, destination features)``
    :param weight: edge weights, stored as ``edata['weight']``
    :return: transformed destination node features
    """
    graph = graph.local_var()

    # Dropout on the source features first, then on the destination ones.
    src_feat = self.feat_drop(feat[0])
    dst_feat = self.feat_drop(feat[1])
    graph.srcdata['h'] = src_feat
    graph.dstdata['h'] = dst_feat
    graph.edata['weight'] = weight

    graph.update_all(self.message_func, fn.sum('m', 'neigh'))

    # "+ 1" accounts for the node's own feature added to the neighbour sum.
    in_degrees = graph.in_degrees().to(dst_feat.device)
    denominator = in_degrees.unsqueeze(-1) + 1
    averaged = (graph.dstdata['neigh'] + graph.dstdata['h']) / denominator

    out = self.fc_neigh(averaged)
    # Optional post-processing: activation first, then normalization.
    for post_process in (self.activation, self.norm):
        if post_process is not None:
            out = post_process(out)
    return out
def import_features(
    g: dgl.DGLHeteroGraph,
    user_feat_df,
    item_feat_df,
    sport_onehot_df,
    ctm_id: pd.DataFrame,
    pdt_id: pd.DataFrame,
    spt_id: pd.DataFrame,
    user_item_train,
    get_popularity: bool,
    num_days_pop: int,
    item_id_type: str,
    ctm_id_type: str,
    spt_id_type: str,
):
    """
    Import features to a dict for all node types.

    For user and item, initializes feature arrays with only 0, then fills the
    values if they are available.

    Parameters
    ----------
    get_popularity, num_days_pop:
        The recommender system can be enhanced by giving score boost for items
        that were popular. If get_popularity, popularity of the items will be
        computed. Num_days_pop defines the number of days to include in the
        computation.
    item_id_type, ctm_id_type, spt_id_type:
        See utils_data for details.
    all other parameters:
        See other functions in this file for details.

    Returns
    -------
    features_dict:
        Dictionary with all the features imported here. Keys: 'user_feat',
        'item_feat', plus 'sport_feat' when the graph has a 'sport' node type
        and 'item_pop' when get_popularity is set.

    Raises
    ------
    KeyError
        If item_id_type is not one of the recognized identifiers.
    """
    features_dict = {}
    num_items = g.number_of_nodes('item')

    # User
    user_feat_df = user_feat_df.merge(ctm_id, how='inner', on=ctm_id_type)
    ids = user_feat_df.ctm_new_id.values.astype(int)
    feats = np.stack(
        (user_feat_df.is_male.values, user_feat_df.is_female.values),
        axis=1)
    user_feat = np.zeros((g.number_of_nodes('user'), 2))
    user_feat[ids] = feats
    features_dict['user_feat'] = torch.tensor(user_feat).float()

    # Item
    if item_id_type in ['SPECIFIC ITEM IDENTIFIER']:
        item_feat_df = item_feat_df.merge(pdt_id, how='left', on=item_id_type)
        # Only IDs that are in graph (rows left unmatched by the merge have
        # a NaN id and are dropped by the comparison too).
        item_feat_df = item_feat_df[item_feat_df.pdt_new_id < num_items]
        ids = item_feat_df.pdt_new_id.values.astype(int)
        feats = np.stack((
            item_feat_df.is_junior.values,
            item_feat_df.is_male.values,
            item_feat_df.is_female.values,
            item_feat_df.eco_design.values,
        ), axis=1)
        item_feat = np.zeros((num_items, feats.shape[1]))
        item_feat[ids] = feats
        item_feat = torch.tensor(item_feat).float()
    elif item_id_type in ['GENERAL ITEM IDENTIFIER']:
        item_feat = torch.zeros((num_items, 4))
    else:
        raise KeyError(f'Item ID {item_id_type} not recognized.')
    features_dict['item_feat'] = item_feat

    # Sport one-hot
    if 'sport' in g.ntypes:
        sport_onehot_df = sport_onehot_df.merge(spt_id, how='inner', on=spt_id_type)
        # Values need to be sorted by node id to align with g.nodes['sport']
        sport_onehot_df.sort_values(by='spt_new_id', inplace=True)
        feats = sport_onehot_df.drop(labels=[spt_id_type, 'spt_new_id'], axis=1).values
        assert feats.shape[0] == g.num_nodes('sport')
        features_dict['sport_feat'] = torch.tensor(feats).float()

    # Popularity
    if get_popularity:
        item_popularity = np.zeros((num_items, 1))
        pop_df = user_item_train.merge(pdt_id, how='left', on=item_id_type)

        # hit_date is compared as a '%Y-%m-%d' string, which orders like dates.
        most_recent_date = datetime.strptime(pop_df.hit_date.max(), '%Y-%m-%d')
        limit_date = (most_recent_date - timedelta(days=num_days_pop)).strftime('%Y-%m-%d')
        pop_df = pop_df[pop_df.hit_date >= limit_date]

        # Only IDs that are in graph, as in the item feature branch above;
        # an out-of-graph id would otherwise raise IndexError when written
        # into item_popularity.
        pop_df = pop_df[pop_df.pdt_new_id < num_items]

        # Share of the recent interactions captured by each item.
        purchases = pop_df.pdt_new_id.value_counts().sort_index()
        ids = purchases.index.values.astype(int)
        scores = (purchases / purchases.sum()).values
        item_popularity[ids] = np.expand_dims(scores, axis=1)
        features_dict['item_pop'] = torch.tensor(item_popularity).float()

    return features_dict
def forward(self, g: dgl.DGLHeteroGraph, h, etype='interact'):
    """Score the edges of type ``etype`` from user and item representations.

    :param g: graph with ``'user'`` and ``'item'`` node types
    :param h: node representations keyed by node type (``'user'``, ``'item'``)
    :param etype: edge type to score
    :return: the ``'score'`` edge feature produced by ``self.apply_edges``,
        multiplied by 5
    """
    with g.local_scope():
        # Dropout on the user representations first, then on the item ones.
        for ntype in ('user', 'item'):
            g.nodes[ntype].data['h'] = self.dropout(h[ntype])

        g.apply_edges(self.apply_edges, etype=etype)
        edge_scores = g.edges[etype].data['score']
        return edge_scores * 5
def forward(self, decode_graph: dgl.DGLHeteroGraph, node_representations: Tensor):
    """Compute one dot-product logit per edge of ``decode_graph``.

    :param decode_graph: graph whose edges are scored
    :param node_representations: node features, stored as ``ndata["h"]``
        inside a local scope
    :return: the ``"logits"`` edge feature, i.e. the dot product of the
        representations of each edge's end points
    """
    edge_dot_product = dgl.function.u_dot_v("h", "h", "logits")
    with decode_graph.local_scope():
        decode_graph.ndata["h"] = node_representations
        decode_graph.apply_edges(edge_dot_product)
        logits = decode_graph.edata["logits"]
    return logits
def inference(self, graph: dgl.DGLHeteroGraph, relation_target_node_features: dict,
              relation_embedding: dict = None, device: str = 'cuda:0',
              batch_size: int = 1280, num_workers: int = 4, fusion_batch_size: int = 2560):
    """
    mini-batch inference of final representation over all node types.
    Outer loop: iterate the layers, inner loop: iterate the batches.

    :param graph: The whole relational graphs
    :param relation_target_node_features: target node features under each relation,
        dict, {(srctype, etype, dsttype): features}
    :param relation_embedding: embedding for each relation, dict, {etype: feature} or None;
        when None, the model's own flattened relation embeddings are used
    :param device: device str
    :param batch_size: number of output nodes per mini-batch during layer-wise inference
    :param num_workers: number of dataloader worker processes
    :param fusion_batch_size: number of nodes fused at once in the final relation
        fusing step (chunked to avoid running out of memory)
    :return: tuple ``(relation_fusion_embedding_dict, relation_target_node_features)``:
        {ntype: tensor (nodes, n_heads * hidden_dim)} and
        {(srctype, etype, dsttype): tensor (nodes, n_heads * hidden_dim)} on ``device``
    """
    with torch.no_grad():
        if relation_embedding is None:
            relation_embedding = {etype: self.relation_embedding[etype].flatten()
                                  for etype in self.relation_embedding}

        # Full one-hop sampling for each type of nodes. The loader does not
        # depend on the layer, so it is built once. Shuffling is off: outputs
        # are written back by node id, so batch order does not affect them.
        sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1)
        dataloader = dgl.dataloading.NodeDataLoader(
            graph,
            {ntype: torch.arange(graph.number_of_nodes(ntype)) for ntype in graph.ntypes},
            sampler,
            batch_size=batch_size,
            shuffle=False,
            drop_last=False,
            num_workers=num_workers)
        feature_dim = self.hidden_dim * self.n_heads

        # iterate over each layer
        for layer_index, layer in enumerate(self.layers):
            # Features of all relation-specific target nodes, stored on cpu.
            y = {
                (stype, etype, dtype): torch.zeros(graph.number_of_nodes(dtype), feature_dim)
                for stype, etype, dtype in graph.canonical_etypes}

            tqdm_dataloader = tqdm(dataloader, ncols=120)
            for batch, (input_nodes, output_nodes, blocks) in enumerate(tqdm_dataloader):
                block = blocks[0].to(device)

                # For relational graphs that only contain a single type of
                # nodes, construct the input and output node dictionaries.
                if len(set(blocks[0].ntypes)) == 1:
                    input_nodes = {blocks[0].ntypes[0]: input_nodes}
                    output_nodes = {blocks[0].ntypes[0]: output_nodes}

                input_features = {
                    relation: relation_target_node_features[relation][input_nodes[relation[2]]].to(device)
                    for relation in relation_target_node_features}
                input_relation_features = relation_embedding

                if layer_index == 0:
                    # Target relation feature projection for the first layer.
                    for relation in input_features:
                        input_features[relation] = self.projection_layer[relation[2]](input_features[relation])

                h, input_relation_features = layer(block, input_features, input_relation_features)

                for relation in h.keys():
                    y[relation][output_nodes[relation[2]]] = h[relation].cpu()

                tqdm_dataloader.set_description(f'inference for the {batch}-th batch in model {layer_index}-th layer')

            # Update the features of all the nodes (after the graph convolution).
            relation_target_node_features = y
            # Relation embedding is updated after each layer.
            relation_embedding = input_relation_features

        for relation in relation_target_node_features:
            relation_target_node_features[relation] = relation_target_node_features[relation].to(device)

        # Lookup by edge-type name, built once for all destination types.
        features_by_etype = {etype: relation_target_node_features[(stype, etype, dtype)]
                             for stype, etype, dtype in relation_target_node_features}

        relation_fusion_embedding_dict = {}
        for dsttype in {dtype for _, _, dtype in relation_target_node_features}:
            etypes = [etype for _, etype, dtype in relation_target_node_features if dtype == dsttype]
            dst_node_features = [features_by_etype[etype] for etype in etypes]
            dst_relation_embeddings = [relation_embedding[etype] for etype in etypes]
            dst_node_feature_transformation_weight = [self.node_transformation_weight[etype]
                                                      for etype in etypes]
            dst_relation_embedding_transformation_weight = [self.relation_transformation_weight[etype]
                                                            for etype in etypes]

            # Use mini-batches to avoid running out of memory in inference.
            relation_fusion_embedding = []
            num_dst_nodes = dst_node_features[0].shape[0]
            for start in range(0, num_dst_nodes, fusion_batch_size):
                relation_fusion_embedding.append(self.relation_fusing(
                    [dst_node_feature[start: start + fusion_batch_size, :]
                     for dst_node_feature in dst_node_features],
                    dst_relation_embeddings,
                    dst_node_feature_transformation_weight,
                    dst_relation_embedding_transformation_weight))
            relation_fusion_embedding_dict[dsttype] = torch.cat(relation_fusion_embedding, dim=0)

        return relation_fusion_embedding_dict, relation_target_node_features
def train_valid_split(
    valid_graph: dgl.DGLHeteroGraph,
    ground_truth_test,
    etypes,
    subtrain_size,
    valid_size,
    reverse_etype,
    train_on_clicks,
    remove_train_eids,
    clicks_sample=1,
    purchases_sample=1,
):
    """
    Using the full graph, sample train_graph and eids of edges for train & validation, as well as nids for test.

    Process:
        - Validation
            - valid_eids are the most recent X edges of all eids of the graph (based on valid_size)
            - valid_uids and iid are the user ids and item ids associated with those edges (and together they form the
              ground_truth)
        - Training graph & eids
            - All edges and reverse edges of valid_eids are removed from the full graph.
            - train_eids are all remaining edges.
        - Sampling of training eids
            - It might be relevant to have numerous edges in the training graph to do message passing, but to
              optimize the model to give great scores only to recent interaction (to help with seasonality)
            - Thus, if purchases_sample or clicks_sample are < 1, only the most recent X edges are kept in the
              train_eids dict
            - An extra option is available to ensure that no information leakage appears: remove_train_eids. If true,
              all eids in train_eids dict (for every trained edge type) are removed from the graph, together with
              their reverse edges. (Otherwise, information leakage is still taken care of during EdgeDataLoader:
              sampled edges are removed from the computation blocks). Based on experience, it is best to set
              remove_train_eids as False.
        - Computing metrics on training set: subtrain nids
            - To compute metrics on the training set, we sample a "subtrain set". We need the ground_truth for the
              subtrain, as well as node ids for all user and items in the subtrain set.
        - Computing metrics on test set
            - We need node ids for all user and items in the test set (so we can fetch their embeddings during
              recommendations)

    Note: the global numpy random state is seeded with 11.

    Returns
    -------
    train_graph, train_eids_dict, valid_eids_dict, subtrain_uids, valid_uids, test_uids, all_iids,
    ground_truth_subtrain, ground_truth_valid, all_eids_dict
    """
    np.random.seed(11)

    buys_etype = ('user', 'buys', 'item')
    clicks_etype = ('user', 'clicks', 'item')

    def is_trained_on(etype):
        # Purchases are always optimized on; clicks only when requested.
        return etype == buys_etype or (etype == clicks_etype and train_on_clicks)

    def most_recent(eids, fraction):
        # Edge ids are treated as chronological: keep the trailing share.
        return eids[int(len(eids) * (1 - fraction)):]

    all_eids_dict = {}
    valid_eids_dict = {}
    train_eids_dict = {}

    # Validation edges and ground truth
    valid_uids_all = []
    valid_iids_all = []
    for etype in etypes:
        all_eids = np.arange(valid_graph.number_of_edges(etype))
        valid_eids = most_recent(all_eids, valid_size)
        valid_uids, valid_iids = valid_graph.find_edges(valid_eids, etype=etype)
        valid_uids_all.extend(valid_uids.tolist())
        valid_iids_all.extend(valid_iids.tolist())
        all_eids_dict[etype] = all_eids
        if is_trained_on(etype):
            valid_eids_dict[etype] = valid_eids
    ground_truth_valid = (np.array(valid_uids_all), np.array(valid_iids_all))
    valid_uids = np.array(np.unique(valid_uids_all))

    # Create partial graph
    train_graph = valid_graph.clone()
    for etype in etypes:
        if is_trained_on(etype):
            train_graph.remove_edges(valid_eids_dict[etype], etype=etype)
            train_graph.remove_edges(valid_eids_dict[etype], etype=reverse_etype[etype])
            train_eids_dict[etype] = np.arange(train_graph.number_of_edges(etype))

    # Optionally keep only the most recent edges to optimize on
    if purchases_sample != 1:
        train_eids_dict[buys_etype] = most_recent(train_eids_dict[buys_etype], purchases_sample)
        valid_eids_dict[buys_etype] = most_recent(valid_eids_dict[buys_etype], purchases_sample)
    if clicks_sample != 1 and clicks_etype in train_eids_dict:
        train_eids_dict[clicks_etype] = most_recent(train_eids_dict[clicks_etype], clicks_sample)
        valid_eids_dict[clicks_etype] = most_recent(valid_eids_dict[clicks_etype], clicks_sample)

    if remove_train_eids:
        # Iterate explicitly over every trained edge type. The previous code
        # reused the loop variable left over from the loop above, so only the
        # last entry of ``etypes`` was handled (KeyError if it was not trained on).
        for etype, train_eids in train_eids_dict.items():
            train_graph.remove_edges(train_eids, etype=etype)
            train_graph.remove_edges(train_eids, etype=reverse_etype[etype])

    # Generate inference nodes for subtrain & ground truth for subtrain
    ## Choose the subsample of training set. Users are drawn from the
    ## training edges of the first edge type in ``etypes``.
    first_uids, _ = valid_graph.find_edges(train_eids_dict[etypes[0]], etype=etypes[0])
    unique_train_uids = np.unique(first_uids)
    sampled_uids = np.random.choice(
        unique_train_uids,
        int(len(unique_train_uids) * subtrain_size),
        replace=False)

    ## Fetch uids and iids of subtrain sample for all etypes
    subtrain_uids_all = []
    subtrain_iids_all = []
    for etype, train_eids in train_eids_dict.items():
        train_uids, train_iids = valid_graph.find_edges(train_eids, etype=etype)
        train_uids = np.asarray(train_uids)
        train_iids = np.asarray(train_iids)
        # Vectorized membership test against the sampled users. The sampled
        # set lives in its own variable so it is no longer overwritten while
        # later edge types are still being filtered.
        in_subtrain = np.isin(train_uids, sampled_uids)
        subtrain_uids_all.extend(train_uids[in_subtrain].tolist())
        subtrain_iids_all.extend(train_iids[in_subtrain].tolist())
    ground_truth_subtrain = (np.array(subtrain_uids_all), np.array(subtrain_iids_all))
    subtrain_uids = np.array(np.unique(subtrain_uids_all))

    # Generate inference nodes for test
    test_uids, _ = ground_truth_test
    test_uids = np.unique(test_uids)
    all_iids = np.arange(valid_graph.num_nodes('item'))

    return train_graph, train_eids_dict, valid_eids_dict, subtrain_uids, valid_uids, test_uids, \
           all_iids, ground_truth_subtrain, ground_truth_valid, all_eids_dict