Example #1
def load_graph(info_dir, embed_dim=16):
    print "Loading adjacency info..."
    adj_lists = pickle.load(open(info_dir + "/adj_lists.pkl"))
    relations = pickle.load(open(info_dir + "/rels.pkl"))
    print "Loading feature data.."
    post_feats = np.load(info_dir + "/post_feats.npy")
    post_feats = np.concatenate([np.zeros((1,100)), post_feats])
    comment_feats = np.load(info_dir + "/comment_feats.npy")
    comment_feats = np.concatenate([np.zeros((1,100)), comment_feats])


    num_users = len(set([id for rel, adj in adj_lists.iteritems() for id in adj if rel[0] == "user"]))
    num_communities = len(set([id for rel, adj in adj_lists.iteritems() for id in adj if rel[0] == "community"]))

    feature_modules = {
        "comment" : nn.Embedding(comment_feats.shape[0], comment_feats.shape[1]), 
        "post" : nn.Embedding(comment_feats.shape[0], comment_feats.shape[1]), 
        "user" : nn.Embedding(num_users+1, embed_dim), 
        "community" : nn.Embedding(num_communities+1, embed_dim), 
        "type" : nn.Embedding(6, embed_dim)}
    feature_modules["comment"].weight = nn.Parameter(torch.FloatTensor(comment_feats), requires_grad=False)
    feature_modules["post"].weight = nn.Parameter(torch.FloatTensor(post_feats), requires_grad=False)
    for mode in ["user", "community", "type"]:
        feature_modules[mode].weight.data.normal_(0, 1./embed_dim)
    features = lambda nodes, mode : feature_modules[mode](
            torch.autograd.Variable(torch.LongTensor(nodes)+1))
    feature_dims = {mode : embed.weight.size()[1] for mode, embed in feature_modules.iteritems()}
    graph = Graph(features, feature_dims, relations, adj_lists)
    return graph, feature_modules
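A note on the +1 shift in the features lambda above: row 0 of each embedding matrix is reserved for a dummy node (hence the zero row prepended to post_feats and comment_feats), so a raw node id of -1 lands on that padding row. Below is a minimal, self-contained sketch of the same lookup pattern with toy sizes of my own choosing; it assumes a recent PyTorch where wrapping indices in torch.autograd.Variable is no longer required.

import torch
import torch.nn as nn

# toy feature matrix: 3 real nodes with 4-dim features (hypothetical sizes)
real_feats = torch.randn(3, 4)
# prepend a zero row so that node id -1 maps to row 0 after the +1 shift
weights = torch.cat([torch.zeros(1, 4), real_feats])

embed = nn.Embedding(weights.size(0), weights.size(1))
embed.weight = nn.Parameter(weights, requires_grad=False)  # frozen, like the post/comment features

def lookup(node_ids):
    # shift raw ids by +1 so that -1 -> 0 (dummy row), 0 -> 1, and so on
    return embed(torch.LongTensor(node_ids) + 1)

print(lookup([-1, 0, 2]).shape)  # torch.Size([3, 4]); the first row is the all-zero dummy vector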
Example #2
def load_graph(data_dir, embed_dim, graph_data_path = "/graph_data.pkl"):
    '''
    Given embed_dim, load graph data from file and construct a Graph() object

    Return:
        graph: a Graph() object
        feature_modules: a dict of embedding matrices keyed by node type; each matrix has shape [num_ent_by_type + 2, embed_dim]
        node_maps: a dict()
            key: type, 5 types: function, sideeffects, protein, disease, drug
            value: dict():
                key: global node id
                value: local node id for this type
    '''

    '''
    rels: a dict() of all triple templates
        key:    domain entity type
        value:  a list of tuples (range entity type, predicate)
    adj_lists: a dict about the edges in the KG
        key: triple template, e.g. ('drug', 'psoriatic_arthritis', 'drug')
        value: a defaultdict of all edge instances of this triple template
            key: the head entity id
            value: a set of tail entity ids
    node_maps: a dict() of node ids by type
        key: type, 5 types: function, sideeffects, protein, disease, drug
        value: a list of node ids
    '''
    rels, adj_lists, node_maps, rid2inverse = pickle.load(open(data_dir+graph_data_path, "rb"))
    node_maps = {m : {n : i for i, n in enumerate(id_list)} for m, id_list in node_maps.iteritems()}
    '''
    node_maps: a dict()
        key: type, 5 types: function, sideeffects, protein, disease, drug
        value: dict():
            key: global node id
            value: local node id for this type
    '''
    for m in node_maps:
        node_maps[m][-1] = -1
    feature_dims = {m : embed_dim for m in rels}
    if embed_dim > 0:
        # initialize the embedding matrix for each node type, shape [num_ent_by_type + 2, embed_dim]
        feature_modules = {m : torch.nn.Embedding(len(node_maps[m])+1, embed_dim) for m in rels}
        for mode in rels:
            # define embedding initialization method: normal dist
            feature_modules[mode].weight.data.normal_(0, 1./embed_dim)
        '''
        features(nodes, mode): an embedding lookup function
            nodes: a list of global node ids of type `mode`
            mode: node type
            return: embedding vectors, shape [num_node, embed_dim]
        '''
        features = lambda nodes, mode : feature_modules[mode](
                torch.autograd.Variable(torch.LongTensor([node_maps[mode][n] for n in nodes])+1))
    else:
        feature_modules = None
        features = None
    graph = Graph(features, feature_dims, rels, adj_lists, rid2inverse = rid2inverse)
    return graph, feature_modules, node_maps
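The docstrings above describe the pickled (rels, adj_lists, node_maps) structures but never show one. The toy instance below is purely illustrative (ids and relation names are invented, not taken from the actual knowledge graph) and walks through the same local-id remapping and -1 dummy entry that load_graph performs.

from collections import defaultdict

# invented stand-in for the pickled (rels, adj_lists, node_maps)
rels = {"drug": [("protein", "targets")],
        "protein": [("drug", "targets_inv")]}
adj_lists = {("drug", "targets", "protein"): defaultdict(set, {10: {20, 21}}),
             ("protein", "targets_inv", "drug"): defaultdict(set, {20: {10}, 21: {10}})}
node_maps = {"drug": [10], "protein": [20, 21]}

# same remap as in load_graph: global id -> local id per type, plus the -1 dummy
node_maps = {m: {n: i for i, n in enumerate(id_list)} for m, id_list in node_maps.items()}
for m in node_maps:
    node_maps[m][-1] = -1

print(node_maps)  # {'drug': {10: 0, -1: -1}, 'protein': {20: 0, 21: 1, -1: -1}}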
Example #3
def load_graph(data_dir, embed_dim):
    rels, adj_lists, node_maps = pickle.load(open(data_dir+"/graph_data.pkl", "rb"))
    node_maps = {m : {n : i for i, n in enumerate(id_list)} for m, id_list in node_maps.iteritems()}
    for m in node_maps:
        node_maps[m][-1] = -1
    feature_dims = {m : embed_dim for m in rels}
    feature_modules = {m : torch.nn.Embedding(len(node_maps[m])+1, embed_dim) for m in rels}
    for mode in rels:
        feature_modules[mode].weight.data.normal_(0, 1./embed_dim)
    features = lambda nodes, mode : feature_modules[mode](
            torch.autograd.Variable(torch.LongTensor([node_maps[mode][n] for n in nodes])+1))
    graph = Graph(features, feature_dims, rels, adj_lists)
    return graph, feature_modules, node_maps
Example #4
def train(feature_dim, lr, model, batch_size, max_batches, tol, max_path_len):
    feature_dim = 16
    relations, adj_lists, node_maps = pickle.load(
        open("/dfs/scratch0/netquery/cancer.pkl"))
    relations['disease'].remove(('disease', '0'))
    del adj_lists[('disease', '0', 'disease')]
    for rel1 in relations:
        for rel2 in relations[rel1]:
            print rel1, rel2, len(adj_lists[(rel1, rel2[1], rel2[0])])
    for mode in node_maps:
        node_maps[mode][-1] = len(node_maps[mode])
    feature_dims = {mode: feature_dim for mode in relations}
    feature_modules = {
        mode: nn.EmbeddingBag(len(node_maps[mode]), feature_dim)
        for mode in relations
    }
    for feature_module in feature_modules.values():
        feature_module.weight.data.normal_(0, 1. / np.sqrt(feature_dim))
    cuda = True
    if cuda:
        features = lambda nodes, mode, offset: feature_modules[mode].forward(
            Variable(
                torch.LongTensor([node_maps[mode][node]
                                  for node in nodes])).cuda(),
            Variable(torch.LongTensor(offset)).cuda())
    else:
        features = lambda nodes, mode, offset: feature_modules[mode].forward(
            Variable(
                torch.LongTensor([node_maps[mode][node] for node in nodes])),
            Variable(torch.LongTensor(offset)))

    graph = Graph(features, feature_dims, relations, adj_lists)
    edges = graph.get_all_edges_byrel()
    train_edges = {
        rel: edge_list[:int(0.9 * len(edge_list))]
        for rel, edge_list in edges.iteritems()
    }
    test_edges = {
        rel: edge_list[int(0.9 * len(edge_list)):]
        for rel, edge_list in edges.iteritems()
    }
    graph.remove_edges(
        [e for edge_list in test_edges.values() for e in edge_list])

    direct_enc = DirectEncoder(graph.features, feature_modules)
    dec = BilinearPathDecoder(graph.relations, feature_dims)
    enc_dec = PathEncoderDecoder(graph, direct_enc, dec)
    if cuda:
        enc_dec.cuda()
    optimizer = optim.SGD(enc_dec.parameters(), lr=0.5, momentum=0.000)

    start = time.time()
    print "{:d} training edges".format(
        sum([len(rel_edges) for rel_edges in train_edges.values()]))
    batch_size = 512
    num_batches = 20000
    tol = 0.0001
    losses = []
    ema_loss = None
    for i in range(num_batches):
        rel = graph.sample_relation()
        random.shuffle(train_edges[rel])
        edges = train_edges[rel][:batch_size]
        if len(edges) == 0:
            continue
        optimizer.zero_grad()
        enc_dec.graph.remove_edges(edges)
        loss = enc_dec.margin_loss([edge[0] for edge in edges],
                                   [edge[1] for edge in edges], [rel])
        enc_dec.graph.add_edges(edges)
        losses.append(loss.data[0])
        if ema_loss is None:
            ema_loss = loss.data[0]
        else:
            ema_loss = 0.99 * ema_loss + 0.01 * loss.data[0]
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print i, ema_loss
        if i > 2000 and i % 100 == 0:
            conv = np.mean(losses[i - 2000:i - 1000]) - np.mean(
                losses[i - 1000:i])
            print "conv", conv
            if conv < tol:
                break
    print "MRR:", evaluate_edge_auc(test_edges, graph, enc_dec)

    batch_size = 512
    num_batches = 100000
    ema_loss = None
    optimizer = optim.SGD(enc_dec.parameters(), lr=0.5, momentum=0.000)
    for i in range(num_batches):
        rels = graph.sample_metapath()
        nodes1, nodes2 = zip(
            *[graph.sample_path_with_rels(rels) for _ in range(batch_size)])

        optimizer.zero_grad()
        loss = enc_dec.margin_loss(nodes1, nodes2, rels)
        losses.append(loss.data[0])
        if ema_loss is None:
            ema_loss = loss.data[0]
        else:
            ema_loss = 0.99 * ema_loss + 0.01 * loss.data[0]
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print i, ema_loss
        if i % 5000 == 0:
            print "MRR:", evaluate_edge_auc(test_edges, graph, enc_dec)

    total = time.time() - start
    print "Time:", total
    print "Converged after:", i
    print "Per example:", total / batch_size / float(i)
    print "MRR:", evaluate_edge_auc(test_edges, graph, enc_dec)
Example #5
def train(feature_dim, lr, model, batch_size, max_batches, tol, cuda, results,
          decoder, opt, agg):

    # load the data
    # relations, adj_lists, node_maps = pickle.load(open("/dfs/scratch0/netquery/cancer.pkl"))
    relations, adj_lists, node_maps = pickle.load(open("cancer.pkl"))

    # delete this relation because it doesn't have enough data
    relations['disease'].remove(('disease', '0'))
    del adj_lists[('disease', '0', 'disease')]

    # add dummy node (messy hack for nw)
    for mode in node_maps:
        node_maps[mode][-1] = len(node_maps[mode])

    # set the feature dimensions to be equal for all modes
    feature_dims = {mode: feature_dim for mode in relations}
    # the feature modules for all nodes are embedding lookups.
    feature_modules = {
        mode: nn.Embedding(len(node_maps[mode]), feature_dim)
        for mode in relations
    }

    # need to define the feature function that maps nodes to features
    if cuda:
        features = lambda nodes, mode: feature_modules[mode].forward(
            Variable(
                torch.LongTensor([node_maps[mode][node]
                                  for node in nodes])).cuda())
    else:
        features = lambda nodes, mode: feature_modules[mode].forward(
            Variable(
                torch.LongTensor([node_maps[mode][node] for node in nodes])))

    # give reasonable initialization to features
    for feature_module in feature_modules.values():
        feature_module.weight.data.normal_(0, 1. / np.sqrt(feature_dim))

    # build the graph
    graph = Graph(features, feature_dims, relations, adj_lists)

    # get mapping from relations->list of edges for that relation
    edges = graph.get_all_edges_byrel()

    # separate into train and test sets
    train_edges = {
        rel: edge_list[:int(0.9 * len(edge_list))]
        for rel, edge_list in edges.iteritems()
    }
    test_edges = {
        rel: edge_list[int(0.9 * len(edge_list)):]
        for rel, edge_list in edges.iteritems()
    }
    graph.remove_edges(
        [e for edge_list in test_edges.values() for e in edge_list])

    # for simplicity the embedding and hidden dimensions are equal
    out_dims = {mode: feature_dim for mode in graph.relations}

    # define the encoder.
    # Either direct or based on single-step convolution
    if model == "direct":
        enc = DirectEncoder(graph.features, feature_modules)
        dec = get_decoder(graph, feature_dims, decoder)
        enc_dec = EdgeEncoderDecoder(graph, enc, dec)
    else:
        if agg == "mean":
            aggregator = MeanAggregator(graph.features)
        elif agg == "pool":
            aggregator = PoolAggregator(graph.features, graph.feature_dims)
        enc = Encoder(graph.features,
                      graph.feature_dims,
                      out_dims,
                      graph.relations,
                      graph.adj_lists,
                      concat=True,
                      feature_modules=feature_modules,
                      cuda=cuda,
                      aggregator=aggregator)
        dec = get_decoder(graph, enc.out_dims, decoder)
        enc_dec = EdgeEncoderDecoder(graph, enc, dec)
    if cuda:
        enc_dec.cuda()

    if opt == "sgd":
        optimizer = optim.SGD(enc_dec.parameters(), lr=lr, momentum=0.000)
    elif opt == "sgd-momentum":
        optimizer = optim.SGD(enc_dec.parameters(), lr=lr, momentum=0.9)
    elif opt == "adam":
        optimizer = optim.Adam(enc_dec.parameters(), lr=lr)

    # Main training loop
    start = time.time()
    ema_loss = None
    print "{:d} training edges".format(
        sum([len(rel_edges) for rel_edges in train_edges.values()]))
    losses = []
    for i in range(max_batches):
        rel = graph.sample_relation()
        random.shuffle(train_edges[rel])
        edges = train_edges[rel][:batch_size]
        if len(edges) == 0:
            continue
        optimizer.zero_grad()
        graph.remove_edges(edges)
        loss = enc_dec.margin_loss([edge[0] for edge in edges],
                                   [edge[1] for edge in edges], rel)
        graph.add_edges(edges)
        losses.append(loss.data[0])
        if ema_loss is None:
            ema_loss = loss.data[0]
        else:
            ema_loss = 0.99 * ema_loss + 0.01 * loss.data[0]
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print i, ema_loss
        if i > 2000 and i % 100 == 0:
            conv = np.mean(losses[i - 2000:i - 1000]) - np.mean(
                losses[i - 1000:i])
            print "conv", conv
            if conv < tol:
                break

    total = time.time() - start
    test_auc = evaluate_edge_auc(test_edges, graph, enc_dec)
    test_loss = evaluate_edge_margin(test_edges, graph, enc_dec)
    with open(results, "a") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            str(lr),
            str(batch_size),
            str(total),
            str(i),
            str(total / batch_size / float(i)),
            str(test_auc),
            str(test_loss),
            str(ema_loss),
            str(conv)
        ])
    print "Time:", total
    print "Converged after:", i
    print "Per example:", total / batch_size / float(i)
    print "AUC:", test_auc
    print "Loss:", test_loss
Example #6
def load_graph(info_dir, embed_dim=16, cuda=False):
    print "Loading adjacency info..."
    adj_lists = pickle.load(open(info_dir + "/adj_lists.pkl"))
    relations = pickle.load(open(info_dir + "/rels.pkl"))
    post_words = pickle.load(open(info_dir + "/post_words.pkl"))

    num_users = len(
        set([
            id for rel, adj in adj_lists.iteritems() for id in adj
            if rel[0] == "user"
        ]))
    num_communities = len(
        set([
            id for rel, adj in adj_lists.iteritems() for id in adj
            if rel[0] == "community"
        ]))
    num_words = len(set([w for words in post_words.values() for w in words]))
    post_words = {
        post: torch.LongTensor([w for w in post_words[post]])
        for post in post_words
    }

    feature_modules = {
        "post": nn.EmbeddingBag(num_words, embed_dim),
        "user": nn.Embedding(num_users + 1, embed_dim),
        "community": nn.Embedding(num_communities + 1, embed_dim),
    }
    for mode in feature_modules:
        feature_modules[mode].weight.data.normal_(0, 1. / embed_dim)
    if not cuda:

        def _feature_func(nodes, mode):
            if mode != "post":
                return feature_modules[mode](
                    torch.autograd.Variable(torch.LongTensor(nodes) + 1))
            else:
                offsets = np.concatenate(
                    ([0],
                     np.cumsum(
                         [post_words[post].size()[0] for post in nodes[:-1]])))
                return feature_modules[mode](torch.autograd.Variable(
                    torch.cat([post_words[post] for post in nodes])),
                                             torch.autograd.Variable(
                                                 torch.LongTensor(offsets)))
    else:

        def _feature_func(nodes, mode):
            if mode != "post":
                return feature_modules[mode](
                    torch.autograd.Variable(torch.LongTensor(nodes) +
                                            1).cuda())
            else:
                offsets = np.concatenate(
                    ([0],
                     np.cumsum(
                         [post_words[post].size()[0] for post in nodes[:-1]])))
                return feature_modules[mode](
                    torch.autograd.Variable(
                        torch.cat([post_words[post]
                                   for post in nodes])).cuda(),
                    torch.autograd.Variable(torch.LongTensor(offsets)).cuda())

    feature_dims = {
        mode: embed.weight.size()[1]
        for mode, embed in feature_modules.iteritems()
    }
    graph = Graph(_feature_func, feature_dims, relations, adj_lists)
    return graph, feature_modules
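The "post" features in this example come from an nn.EmbeddingBag over each post's word ids, so the feature function must build an offsets tensor: the word-id lists are concatenated, and each bag starts at the cumulative length of all preceding posts. The snippet below isolates that offsets computation with an invented toy vocabulary and word lists.

import numpy as np
import torch
import torch.nn as nn

# toy setup: vocabulary of 5 words, 3-dim embeddings, three posts of varying length
bag = nn.EmbeddingBag(5, 3)  # default mode="mean", as used for the "post" module above
post_words = {7: torch.LongTensor([0, 2]),
              8: torch.LongTensor([1]),
              9: torch.LongTensor([3, 4, 2])}

nodes = [7, 8, 9]
# offsets mark where each post's word ids begin in the concatenated index tensor
offsets = np.concatenate(([0], np.cumsum([post_words[p].size(0) for p in nodes[:-1]])))
out = bag(torch.cat([post_words[p] for p in nodes]), torch.LongTensor(offsets))
print(out.shape)  # torch.Size([3, 3]): one mean-pooled word vector per post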
Example #7
def train(feature_dim, lr_edge, lr_metapath, lr_int, model, batch_size,
          max_batches, tol, max_path_len, cuda, results, decoder, opt, agg):
    feature_dim = 16
    relations, adj_lists, node_maps = pickle.load(
        open("/dfs/scratch0/netquery/cancer.pkl"))
    # relations, adj_lists, node_maps = pickle.load(open("cancer.pkl"))
    relations['disease'].remove(('disease', '0'))
    del adj_lists[('disease', '0', 'disease')]
    for rel1 in relations:
        for rel2 in relations[rel1]:
            print rel1, rel2, len(adj_lists[(rel1, rel2[1], rel2[0])])
    for mode in node_maps:
        node_maps[mode][-1] = len(node_maps[mode])
    feature_dims = {mode: feature_dim for mode in relations}
    feature_modules = {
        mode: nn.Embedding(len(node_maps[mode]), feature_dim)
        for mode in relations
    }
    for feature_module in feature_modules.values():
        feature_module.weight.data.normal_(0, 1. / feature_dim)

    if cuda:
        features = lambda nodes, mode: feature_modules[mode].forward(
            Variable(
                torch.LongTensor([node_maps[mode][node]
                                  for node in nodes])).cuda())
    else:
        features = lambda nodes, mode: feature_modules[mode].forward(
            Variable(
                torch.LongTensor([node_maps[mode][node] for node in nodes])))

    graph = Graph(features, feature_dims, relations, adj_lists)

    #Create chains and intersections on entire graph
    cancer_chains, cancer_neg_chains = graph.create_chains_byrels()
    cancer_pos_ints, cancer_neg_ints = graph.create_intersections_byrels()

    #Create all edges, metapaths and intersections
    edges = {}
    metapaths = {}
    for rel in cancer_chains:
        if len(rel) == 1:
            edges[rel[0]] = [(node1, node2, rel[0])
                             for node1 in cancer_chains[rel]
                             for node2 in cancer_chains[rel][node1]]
        elif len(rel) in [2, 3]:
            metapaths[rel] = [(node1, entry[-1], rel)
                              for node1 in cancer_chains[rel]
                              for entry in cancer_chains[rel][node1]]

    for edge_list in edges.values():
        random.shuffle(edge_list)
    for metapath_list in metapaths.values():
        random.shuffle(metapath_list)

    pos_ints = {}
    for rel in cancer_pos_ints:
        if len(rel) == 3:
            pos_ints[rel] = [(node1, node2, node3, target)
                             for (node1, node2, node3) in cancer_pos_ints[rel]
                             for target in cancer_pos_ints[rel][(node1, node2,
                                                                 node3)]]
        else:
            pos_ints[rel] = [(node1, node2, target)
                             for (node1, node2) in cancer_pos_ints[rel]
                             for target in cancer_pos_ints[rel][(node1, node2)]
                             ]
    #Get test edges and remove them from the graph
    train_edges = {
        rel: edge_list[:int(0.9 * len(edge_list))]
        for rel, edge_list in edges.iteritems()
    }
    test_edges = {
        rel: edge_list[int(0.9 * len(edge_list)):]
        for rel, edge_list in edges.iteritems()
    }
    graph.remove_edges(
        [e for edge_list in test_edges.values() for e in edge_list])

    #Create TRAIN chains and metapaths from the train graph (test edges removed)
    train_cancer_chains, train_cancer_neg_chains = graph.create_chains_byrels()
    train_cancer_pos_ints, train_cancer_neg_ints = graph.create_intersections_byrels(
    )

    #Create TRAINING metapaths and intersections from the train graph
    train_metapaths = {}
    for rel in train_cancer_chains:
        if len(rel) in [2, 3]:
            train_metapaths[rel] = [
                (node1, entry[-1], rel) for node1 in train_cancer_chains[rel]
                for entry in train_cancer_chains[rel][node1]
            ]
    train_pos_ints = {}
    for rel in train_cancer_pos_ints:
        if len(rel) == 3:
            train_pos_ints[rel] = [
                (node1, node2, node3, target)
                for (node1, node2, node3) in train_cancer_pos_ints[rel]
                for target in train_cancer_pos_ints[rel][(node1, node2, node3)]
            ]
        else:
            train_pos_ints[rel] = [
                (node1, node2, target)
                for (node1, node2) in train_cancer_pos_ints[rel]
                for target in train_cancer_pos_ints[rel][(node1, node2)]
            ]

    #Create test metapaths and test intersections by removing training metapaths and intersections from the full sets, respectively
    test_metapaths = {
        rel: list(set(metapaths[rel]) - set(train_metapaths[rel]))
        for rel in metapaths
    }
    test_ints = {
        rel: list(set(pos_ints[rel]) - set(train_pos_ints[rel]))
        for rel in pos_ints
    }
    '''
    with open("train_edges.pkl","wb") as f:
    	pickle.dump(train_edges, f)
    with open("test_edges.pkl", "wb") as f:
    	pickle.dump(test_edges, f)
    with open("train_metapaths.pkl", "wb") as f:
    	pickle.dump(train_metapaths, f)
    with open("test_metapaths.pkl", "wb") as f:
    	pickle.dump(test_metapaths, f)
    with open("train_ints.pkl", "wb") as f:
   	pickle.dump(train_pos_ints, f)
    with open("test_ints.pkl", "wb") as f:
    	pickle.dump(test_ints, f)
    '''

    # for simplicity the embedding and hidden dimensions are equal
    out_dims = {mode: feature_dim for mode in graph.relations}

    if model == "direct":
        enc = DirectEncoder(graph.features, feature_modules)
        dec = get_decoder(graph, feature_dims, decoder)
    else:
        if agg == "mean":
            aggregator = FastMeanAggregator(graph.features)
        elif agg == "pool":
            aggregator = FastPoolAggregator(graph.features, graph.feature_dims)
        enc = Encoder(graph.features,
                      graph.feature_dims,
                      out_dims,
                      graph.relations,
                      graph.adj_lists,
                      concat=True,
                      feature_modules=feature_modules,
                      cuda=cuda,
                      aggregator=aggregator)
        dec = get_decoder(graph, enc.out_dims, decoder)

    inter_dec = MinIntersection(feature_dims.keys(), feature_dims,
                                feature_dims)
    combined_enc_dec = LogCombinedEncoderDecoder(graph, enc, dec, inter_dec)
    if cuda:
        combined_enc_dec.cuda()

    print "Checking eval functions"
    beg_int_auc = evaluate_intersect_auc(test_ints, cancer_neg_ints, graph,
                                         combined_enc_dec, True)
    #beg_int_loss = evaluate_intersect_margin(test_ints, cancer_neg_ints, graph, combined_enc_dec, False)
    beg_path_auc = evaluate_metapath_auc(test_metapaths,
                                         cancer_neg_chains,
                                         graph,
                                         combined_enc_dec,
                                         batch_size=batch_size)
    beg_edge_auc = evaluate_edge_auc(test_edges, cancer_neg_chains, graph,
                                     combined_enc_dec)
    #beg_edge_loss = evaluate_edge_margin(test_edges, cancer_neg_chains, graph, combined_enc_dec)
    #beg_path_loss = evaluate_metapath_margin(test_metapaths, cancer_neg_chains, graph, combined_enc_dec)
    print beg_edge_auc, beg_path_auc  #, beg_int_auc, beg_edge_loss, beg_path_loss, beg_int_loss

    losses = []
    ema_loss = None

    if opt == "sgd":
        optimizer = optim.SGD(combined_enc_dec.parameters(),
                              lr=lr_edge,
                              momentum=0.000)
    elif opt == "sgd-momentum":
        optimizer = optim.SGD(combined_enc_dec.parameters(),
                              lr=lr_edge,
                              momentum=0.9)
    elif opt == "adam":
        optimizer = optim.Adam(combined_enc_dec.parameters(), lr=lr_edge)

    conv = -1
    for i in range(max_batches):
        rel = graph.sample_relation()
        #print len(train_edges[rel])
        start = random.randint(0, max(0, len(train_edges[rel]) - batch_size))
        edges = train_edges[rel][start:start + batch_size]
        if len(edges) == 0:
            continue
        optimizer.zero_grad()
        #combined_enc_dec.graph.remove_edges(edges)
        neg_nodes = [
            random.choice(train_cancer_neg_chains[(rel, )][e[0]])
            for e in edges
        ]
        loss = combined_enc_dec.margin_loss([edge[0] for edge in edges],
                                            [edge[1] for edge in edges], [rel],
                                            "path", neg_nodes)
        #combined_enc_dec.graph.add_edges(edges)
        losses.append(loss.data[0])
        if ema_loss is None:
            ema_loss = loss.data[0]
        else:
            ema_loss = 0.99 * ema_loss + 0.01 * loss.data[0]
        loss.backward()
        torch.nn.utils.clip_grad_norm(combined_enc_dec.parameters(), 0.00001)
        optimizer.step()
        if i % 100 == 0:
            print i, ema_loss
        if i > 2000 and i % 100 == 0:
            conv = np.mean(losses[i - 2000:i - 1000]) - np.mean(
                losses[i - 1000:i])
            print "conv", conv
            if conv < tol:
                break
    print "After training on edges:"
    print combined_enc_dec.edge_dec.mats
    train1_edge_auc = evaluate_edge_auc(test_edges, cancer_neg_chains, graph,
                                        combined_enc_dec)
    #train1_edge_loss = evaluate_edge_margin(test_edges, cancer_neg_chains, graph, combined_enc_dec)
    #train1_path_loss = evaluate_metapath_margin(test_metapaths, cancer_neg_chains, graph, combined_enc_dec)
    #    train1_path_auc = evaluate_metapath_auc(test_metapaths, cancer_neg_chains, graph, combined_enc_dec, batch_size=batch_size)
    train1_int_auc = evaluate_intersect_auc(test_ints, cancer_neg_ints, graph,
                                            combined_enc_dec, True)
    #train1_int_loss = evaluate_intersect_margin(test_ints, cancer_neg_ints, graph, combined_enc_dec, False)
    print train1_edge_auc, train1_int_auc  #, train1_edge_loss, train1_path_loss, train1_int_loss
    """ 
    losses = []
    ema_loss = None
    if opt == "sgd":
        optimizer = optim.SGD(combined_enc_dec.parameters(), lr=lr_metapath, momentum=0.000)
    elif opt == "sgd-momentum":
        optimizer = optim.SGD(combined_enc_dec.parameters(), lr=lr_metapath, momentum=0.9)
    elif opt == "adam":
        optimizer = optim.Adam(combined_enc_dec.parameters(), lr=lr_metapath)

    conv = -1
    for i in range(max_batches):
        while True:
            rels = graph.sample_metapath()
            if len(train_metapaths[rels]) > 0:
                break
        start = random.randint(0, max(0,len(train_metapaths[rels])-batch_size))
        edges = train_metapaths[rels][start:start+batch_size]
        nodes1 = [edge[0] for edge in edges]
        nodes2 = [edge[1] for edge in edges]
        neg_nodes = [random.choice(train_cancer_neg_chains[rels][e[0]]) for e in edges]
        optimizer.zero_grad()
        loss = combined_enc_dec.margin_loss(nodes1, nodes2, rels, "path", neg_nodes)
        losses.append(loss.data[0])
        if ema_loss == None:
            ema_loss = loss.data[0]
        else:
            ema_loss = 0.99*ema_loss + 0.01*loss.data[0]
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print i, ema_loss
        if i > 2000 and i % 100 == 0:
            conv = np.mean(losses[i-2000:i-1000]) - np.mean(losses[i-1000:i]) 
            print "conv", conv
            if conv < tol:
                break
        if i % 5000 == 0:
            print "MRR:", evaluate_edge_auc(test_edges, cancer_neg_chains, graph, combined_enc_dec)

    print "After training on metapaths:"
    train2_edge_auc = evaluate_edge_auc(test_edges, cancer_neg_chains, graph, combined_enc_dec)
    #train2_edge_loss = evaluate_edge_margin(test_edges, cancer_neg_chains, graph, combined_enc_dec)
    #train2_path_loss = evaluate_metapath_margin(test_metapaths, cancer_neg_chains, graph, combined_enc_dec)
    train2_path_auc = evaluate_metapath_auc(test_metapaths, cancer_neg_chains, graph, combined_enc_dec, batch_size=batch_size)
    #train2_int_auc =  evaluate_intersect_auc(test_ints, cancer_neg_ints, graph, combined_enc_dec, False)
    #train2_int_loss = evaluate_intersect_margin(test_ints, cancer_neg_ints, graph, combined_enc_dec, False)
    print train2_edge_auc, train2_path_auc#, train2_int_auc, train2_edge_loss, train2_path_loss, train2_int_loss
    """

    losses = []
    ema_loss = None
    if opt == "sgd":
        optimizer = optim.SGD(combined_enc_dec.inter_dec.parameters(),
                              lr=lr_int,
                              momentum=0.000)
    elif opt == "sgd-momentum":
        optimizer = optim.SGD(combined_enc_dec.parameters(),
                              lr=lr_int,
                              momentum=0.9)
    elif opt == "adam":
        optimizer = optim.Adam(combined_enc_dec.parameters(), lr=lr_int)

    conv = -1
    for i in range(max_batches):
        while True:
            rels = graph.sample_intersection()
            random.shuffle(train_pos_ints[rels])
            samples = train_pos_ints[rels][:batch_size]
            if len(samples) > 0:
                break
        query_nodes1 = [edge[0] for edge in samples]
        query_nodes2 = [edge[1] for edge in samples]
        if len(rels) == 3:
            query_nodes3 = [edge[2] for edge in samples]
            target_nodes = [edge[3] for edge in samples]
            neg_nodes = [
                random.choice(train_cancer_neg_ints[rels][(query_nodes1[j],
                                                           query_nodes2[j],
                                                           query_nodes3[j])])
                for j in range(len(samples))
            ]
        else:
            query_nodes3 = []
            target_nodes = [edge[2] for edge in samples]
            neg_nodes = [
                random.choice(train_cancer_neg_ints[rels][(query_nodes1[j],
                                                           query_nodes2[j])])
                for j in range(len(samples))
            ]

        optimizer.zero_grad()
        loss = combined_enc_dec.margin_loss(query_nodes1, query_nodes2, rels,
                                            "intersect", neg_nodes,
                                            target_nodes, query_nodes3)
        losses.append(loss.data[0])
        if ema_loss is None:
            ema_loss = loss.data[0]
        else:
            ema_loss = 0.99 * ema_loss + 0.01 * loss.data[0]
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print i, ema_loss
        if i > 2000 and i % 100 == 0:
            conv = np.mean(losses[i - 2000:i - 1000]) - np.mean(
                losses[i - 1000:i])
            print "conv", conv
            if conv < tol:
                break
        if i % 5000 == 0:
            print "MRR:", evaluate_edge_auc(test_edges, cancer_neg_chains,
                                            graph, combined_enc_dec)
            print "Inersection AUC:", evaluate_intersect_auc(
                test_ints, cancer_neg_ints, graph, combined_enc_dec, True)

    print "After training on intersections:"
    train3_edge_auc = evaluate_edge_auc(test_edges, cancer_neg_chains, graph,
                                        combined_enc_dec)
    #    train3_edge_loss = evaluate_edge_margin(test_edges, cancer_neg_chains, graph, combined_enc_dec)
    #train3_path_loss = evaluate_metapath_margin(test_metapaths, cancer_neg_chains, graph, combined_enc_dec)
    #train3_path_auc = evaluate_metapath_auc(test_metapaths, cancer_neg_chains, graph, combined_enc_dec, batch_size=batch_size)
    train3_int_auc = evaluate_intersect_auc(test_ints, cancer_neg_ints, graph,
                                            combined_enc_dec, True)
    train3_int_auc_train = evaluate_intersect_auc(train_pos_ints,
                                                  train_cancer_neg_ints, graph,
                                                  combined_enc_dec, True)
    #    train3_int_loss = evaluate_intersect_margin(test_ints, cancer_neg_ints, graph, combined_enc_dec, True)
    print train3_edge_auc, train3_int_auc, train3_int_auc_train  #, train3_edge_loss, train3_path_loss, train3_int_loss

    with open(results, "a") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            str(lr_edge),
            str(lr_metapath),
            str(beg_edge_auc),
            str(beg_path_auc),
            str(train1_edge_auc),
            str(train1_int_auc),
            str(train3_edge_auc),
            str(train3_int_auc)
        ])
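The margin_loss method called throughout these training loops is not defined in any of the snippets. For orientation only, a generic max-margin ranking objective of the kind such encoder-decoder models typically optimize is sketched below; this is an assumption about the flavor of the loss, not the project's actual implementation or call signature.

import torch
import torch.nn.functional as F

def margin_ranking_loss(pos_scores, neg_scores, margin=1.0):
    # hypothetical stand-in: push positive (true edge/path/intersection) scores
    # above negative ones by at least `margin`; zero loss once the gap is met
    return F.relu(margin - pos_scores + neg_scores).mean()

pos = torch.FloatTensor([2.0, 0.5, 1.2])  # toy positive scores
neg = torch.FloatTensor([0.1, 0.9, 1.5])  # toy negative scores
print(margin_ranking_loss(pos, neg))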
Example #8
def train(feature_dim, lr, model, batch_size, max_batches, tol, max_path_len,
          cuda, results, decoder, opt, agg):
    feature_dim = 16
    # relations, adj_lists, node_maps = pickle.load(open("/dfs/scratch0/netquery/cancer.pkl"))
    relations, adj_lists, node_maps = pickle.load(open("cancer.pkl"))
    relations['disease'].remove(('disease', '0'))
    del adj_lists[('disease', '0', 'disease')]
    for rel1 in relations:
        for rel2 in relations[rel1]:
            print rel1, rel2, len(adj_lists[(rel1, rel2[1], rel2[0])])
    for mode in node_maps:
        node_maps[mode][-1] = len(node_maps[mode])
    feature_dims = {mode: feature_dim for mode in relations}
    feature_modules = {
        mode: nn.Embedding(len(node_maps[mode]), feature_dim)
        for mode in relations
    }
    for feature_module in feature_modules.values():
        feature_module.weight.data.normal_(0, 1. / np.sqrt(feature_dim))

    if cuda:
        features = lambda nodes, mode: feature_modules[mode].forward(
            Variable(
                torch.LongTensor([node_maps[mode][node]
                                  for node in nodes])).cuda())
    else:
        features = lambda nodes, mode: feature_modules[mode].forward(
            Variable(
                torch.LongTensor([node_maps[mode][node] for node in nodes])))

    graph = Graph(features, feature_dims, relations, adj_lists)

    # cancer_chains = graph.create_chains_byrels()
    # cancer_pos_ints, cancer_neg_ints = graph.create_intersections_byrels(cancer_chains)

    metapaths = graph.get_all_metapaths_byrel()
    train_metapaths = {
        rel: metapath_list[:int(0.9 * len(metapath_list))]
        for rel, metapath_list in metapaths.iteritems()
    }
    test_metapaths = {
        rel: metapath_list[int(0.9 * len(metapath_list)):]
        for rel, metapath_list in metapaths.iteritems()
    }

    edges = graph.get_all_edges_byrel()
    train_edges = {
        rel: edge_list[:int(0.9 * len(edge_list))]
        for rel, edge_list in edges.iteritems()
    }
    test_edges = {
        rel: edge_list[int(0.9 * len(edge_list)):]
        for rel, edge_list in edges.iteritems()
    }
    graph.remove_edges(
        [e for edge_list in test_edges.values() for e in edge_list])

    # for simplicity the embedding and hidden dimensions are equal
    out_dims = {mode: feature_dim for mode in graph.relations}

    if model == "direct":
        enc = DirectEncoder(graph.features, feature_modules)
        dec = get_decoder(graph, feature_dims, decoder)
    else:
        if agg == "mean":
            aggregator = FastMeanAggregator(graph.features)
        elif agg == "pool":
            aggregator = FastPoolAggregator(graph.features, graph.feature_dims)
        enc = Encoder(graph.features,
                      graph.feature_dims,
                      out_dims,
                      graph.relations,
                      graph.adj_lists,
                      concat=True,
                      feature_modules=feature_modules,
                      cuda=cuda,
                      aggregator=aggregator)
        dec = get_decoder(graph, enc.out_dims, decoder)

    enc_dec = MetapathEncoderDecoder(graph, enc, dec)
    if cuda:
        enc_dec.cuda()

    if opt == "sgd":
        optimizer = optim.SGD(enc_dec.parameters(), lr=lr, momentum=0.000)
    elif opt == "sgd-momentum":
        optimizer = optim.SGD(enc_dec.parameters(), lr=lr, momentum=0.9)
    elif opt == "adam":
        optimizer = optim.Adam(enc_dec.parameters(), lr=lr)

    start = time.time()
    print "{:d} training edges".format(
        sum([len(rel_edges) for rel_edges in train_edges.values()]))
    losses = []
    ema_loss = None

    conv = -1
    for i in range(max_batches):
        rel = graph.sample_relation()
        random.shuffle(train_edges[rel])
        edges = train_edges[rel][:batch_size]
        if len(edges) == 0:
            continue
        optimizer.zero_grad()
        enc_dec.graph.remove_edges(edges)
        loss = enc_dec.margin_loss([edge[0] for edge in edges],
                                   [edge[1] for edge in edges], [rel])
        enc_dec.graph.add_edges(edges)
        losses.append(loss.data[0])
        if ema_loss is None:
            ema_loss = loss.data[0]
        else:
            ema_loss = 0.99 * ema_loss + 0.01 * loss.data[0]
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print i, ema_loss
        if i > 2000 and i % 100 == 0:
            conv = np.mean(losses[i - 2000:i - 1000]) - np.mean(
                losses[i - 1000:i])
            print "conv", conv
            if conv < tol:
                break
    old_edge_auc = evaluate_edge_auc(test_edges, graph, enc_dec)
    print "MRR:", old_edge_auc
    old_edge_loss = evaluate_edge_margin(test_edges, graph, enc_dec)

    old_path_loss = evaluate_metapath_margin(test_metapaths, graph, enc_dec)
    old_path_auc = evaluate_metapath_auc(test_metapaths,
                                         graph,
                                         enc_dec,
                                         batch_size=batch_size)

    print "Metapath auc:", old_path_auc
    print "Metapath margin: ", old_path_loss

    ema_loss = None
    if opt == "sgd":
        optimizer = optim.SGD(enc_dec.parameters(), lr=lr, momentum=0.000)
    elif opt == "sgd-momentum":
        optimizer = optim.SGD(enc_dec.parameters(), lr=lr, momentum=0.9)
    elif opt == "adam":
        optimizer = optim.Adam(enc_dec.parameters(), lr=lr)
    for i in range(max_batches):
        while True:
            rels = graph.sample_metapath()
            random.shuffle(train_metapaths[rels])
            edges = train_metapaths[rels][:batch_size]
            if len(edges) > 0:
                break
        nodes1 = [edge[0] for edge in edges]
        nodes2 = [edge[1] for edge in edges]

        optimizer.zero_grad()
        loss = enc_dec.margin_loss(nodes1, nodes2, rels)
        losses.append(loss.data[0])
        if ema_loss is None:
            ema_loss = loss.data[0]
        else:
            ema_loss = 0.99 * ema_loss + 0.01 * loss.data[0]
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print i, ema_loss
        if i > 2000 and i % 100 == 0:
            conv = np.mean(losses[i - 2000:i - 1000]) - np.mean(
                losses[i - 1000:i])
            print "conv", conv
            if conv < tol:
                break
        if i % 5000 == 0:
            print "MRR:", evaluate_edge_auc(test_edges, graph, enc_dec)

    total = time.time() - start
    test_auc = evaluate_edge_auc(test_edges, graph, enc_dec)
    test_loss = evaluate_edge_margin(test_edges, graph, enc_dec)
    path_auc = evaluate_metapath_auc(test_metapaths,
                                     graph,
                                     enc_dec,
                                     batch_size=batch_size)
    path_loss = evaluate_metapath_margin(test_metapaths, graph, enc_dec)

    with open(results, "a") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            str(lr),
            str(batch_size),
            str(total),
            str(i),
            str(total / batch_size / float(i)),
            str(old_edge_auc),
            str(old_edge_loss),
            str(test_auc),
            str(test_loss),
            str(ema_loss),
            str(conv),
            str(old_path_loss),
            str(old_path_auc),
            str(path_loss),
            str(path_auc)
        ])

    print "Time:", total
    print "Converged after:", i
    print "Per example:", total / batch_size / float(i)
    print "MRR:", test_auc
    print "Loss:", test_loss
    print "Metapath auc:", path_auc
    print "Metapath margin: ", path_loss