Exemplo n.º 1
0
def _save_g(file_path, g, labels=None):
    """Forward to ``save_graphs`` with the given path, graph(s) and labels.

    Parameters
    ----------
    file_path
        Destination handed to ``save_graphs`` as its first argument.
    g
        Graph or graphs to save; passed through unchanged.
    labels : optional
        Passed through as the ``labels`` keyword argument. Defaults to ``None``.
    """
    save_graphs(file_path, g, labels=labels)
Exemplo n.º 2
0
def create_old_heterograph_files():
    """Write the ``data/hetero1.bin`` fixture next to this module.

    The file holds the graphs returned by ``create_heterographs`` for
    ``F.int64`` followed by those for ``F.int32``, together with a single
    ``"graph_label"`` entry built from ``F.ones(54)``.
    """
    module_dir = os.path.dirname(__file__)
    target = os.path.join(module_dir, "data/hetero1.bin")

    int64_graphs = create_heterographs(F.int64)
    int32_graphs = create_heterographs(F.int32)

    # NOTE(review): 54 presumably matches the total number of graphs saved;
    # confirm against create_heterographs.
    save_graphs(target, int64_graphs + int32_graphs,
                {"graph_label": F.ones(54)})
Exemplo n.º 3
0
    train_G = nx.from_scipy_sparse_matrix(
        full_adj[:n_training_samples][:, :n_training_samples])
    train_DGL = dgl.DGLGraph()
    train_DGL.from_networkx(train_G, edge_attrs=['weight'])
    # train_DGL.from_scipy_sparse_matrix(full_adj[:n_training_samples][:, :n_training_samples])
    assert (len(train_DGL) == train_features.shape[0])

    test_G = nx.from_scipy_sparse_matrix(
        full_adj[n_training_docs:][:, n_training_docs:])
    test_DGL = dgl.DGLGraph()
    test_DGL.from_networkx(test_G, edge_attrs=['weight'])
    # test_DGL.from_scipy_sparse_matrix(full_adj[n_training_docs:][:, n_training_docs:])
    assert (len(test_DGL) == test_features.shape[0])

    Gs = [train_DGL, test_DGL]
    save_graphs('graph.bin', Gs)
print(Gs[0])
print('load graph done')


class Model(nn.Module):
    def __init__(self, feature_dim, inter_dim, final_dim):
        super(Model, self).__init__()
        self.gcn1 = GraphConv(feature_dim, inter_dim)
        self.gcn2 = GraphConv(inter_dim, final_dim)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, graph, features):
        x = self.gcn1(graph, features)
        # x = self.dropout(x)
        x = F.relu(x)
Exemplo n.º 4
0
    def _pre_process(self,
                     smiles_to_graph,
                     node_featurizer,
                     edge_featurizer,
                     load,
                     log_every,
                     init_mask,
                     n_jobs=1):
        """Pre-process the dataset

        * Convert molecules from smiles format into DGLGraphs
          and featurize their atoms
        * Set missing labels to be 0 and use a binary masking
          matrix to mask them

        On return ``self.graphs``, ``self.labels``, ``self.mask``,
        ``self.valid_ids`` and ``self.smiles`` are set; ``self.smiles`` is
        restricted to the molecules listed in ``self.valid_ids``. When the
        dataset is processed from scratch the result is also written to
        ``self.cache_file_path`` with ``save_graphs``.

        Parameters
        ----------
        smiles_to_graph : callable, SMILES -> DGLGraph
            Function for converting a SMILES (str) into a DGLGraph.
        node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
            Featurization for nodes like atoms in a molecule, which can be used to update
            ndata for a DGLGraph.
        edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
            Featurization for edges like bonds in a molecule, which can be used to update
            edata for a DGLGraph.
        load : bool
            Whether to load the previously pre-processed dataset or pre-process from scratch.
            ``load`` should be False when we want to try different graph construction and
            featurization methods and need to preprocess from scratch. The cache is only
            used when ``self.cache_file_path`` exists.
        log_every : int
            Print a message every time ``log_every`` molecules are processed. It only
            takes effect on the sequential path, i.e. when :attr:`n_jobs` is not
            greater than 1.
        init_mask : bool
            Whether to initialize a binary mask indicating the existence of labels.
            When False, ``self.mask`` is set to None.
        n_jobs : int
            Degree of parallelism for pre processing. Default to 1.
        """
        if os.path.exists(self.cache_file_path) and load:
            # DGLGraphs have been constructed before, reload them
            print('Loading previously saved dgl graphs...')
            self.graphs, label_dict = load_graphs(self.cache_file_path)
            self.labels = label_dict['labels']
            # Always define ``self.mask`` so the cached path leaves the same
            # attributes behind as the from-scratch path below.
            self.mask = label_dict['mask'] if init_mask else None
            self.valid_ids = label_dict['valid_ids'].tolist()
        else:
            print('Processing dgl graphs from scratch...')
            if n_jobs > 1:
                self.graphs = pmap(smiles_to_graph,
                                   self.smiles,
                                   node_featurizer=node_featurizer,
                                   edge_featurizer=edge_featurizer,
                                   n_jobs=n_jobs)
            else:
                self.graphs = []
                for i, s in enumerate(self.smiles):
                    if (i + 1) % log_every == 0:
                        print('Processing molecule {:d}/{:d}'.format(
                            i + 1, len(self)))
                    self.graphs.append(
                        smiles_to_graph(s,
                                        node_featurizer=node_featurizer,
                                        edge_featurizer=edge_featurizer))

            # Keep only valid molecules: a None graph marks a SMILES that
            # could not be converted.
            self.valid_ids = [
                i for i, g in enumerate(self.graphs) if g is not None
            ]
            self.graphs = [self.graphs[i] for i in self.valid_ids]

            _label_values = self.df[self.task_names].values
            # np.nan_to_num will also turn inf into a very large number
            self.labels = F.zerocopy_from_numpy(
                np.nan_to_num(_label_values).astype(
                    np.float32))[self.valid_ids]

            # Entries are inserted in the order labels, mask, valid_ids so the
            # saved dictionary keeps the same key order as before.
            cached_labels = {'labels': self.labels}
            if init_mask:
                self.mask = F.zerocopy_from_numpy(
                    (~np.isnan(_label_values)).astype(
                        np.float32))[self.valid_ids]
                cached_labels['mask'] = self.mask
            else:
                self.mask = None
            # valid_ids is stored as a tensor so it can be cached alongside
            # the labels and restored with ``.tolist()`` on reload.
            cached_labels['valid_ids'] = torch.tensor(self.valid_ids)

            save_graphs(self.cache_file_path,
                        self.graphs,
                        labels=cached_labels)

        self.smiles = [self.smiles[i] for i in self.valid_ids]
Exemplo n.º 5
0
# Per-event columns of graph_df converted to tensors: the two endpoint ids
# (u, i), the label and the timestamp.
src = torch.tensor(graph_df.u.values)
dst = torch.tensor(graph_df.i.values)
label = torch.tensor(graph_df.label.values, dtype=torch.float32)
timestamp = torch.tensor(graph_df.ts.values, dtype=torch.float32)
# Row 0 of edge_features is dropped -- presumably a padding row; TODO confirm.
edge_feat = torch.tensor(edge_features[1:], dtype=torch.float32)

# Every event is inserted twice: all src->dst edges first, followed by the
# reversed dst->src copies, so the graph holds 2 * len(src) edges.
g = dgl.graph((torch.cat([src, dst]), torch.cat([dst, src])))
len_event = src.shape[0]

# Duplicate the per-event data so each reversed edge carries the same values
# as its forward counterpart.
g.edata['label'] = label.repeat(2).squeeze()
g.edata['timestamp'] = timestamp.repeat(2).squeeze()
g.edata['feat'] = edge_feat.repeat(2, 1).squeeze()

print(g)
# Persist the graph under ./data/, named after the args.data value.
save_graphs(f"./data/{args.data}.bin", g)

if args.new_node_count:
    origin_num_edges = g.num_edges() // 2
    train_eid = torch.arange(0, int(0.7 * origin_num_edges))
    un_train_eid = torch.arange(int(0.7 * origin_num_edges), origin_num_edges)

    train_g = dgl.graph(g.find_edges(train_eid))
    val_n_test_g = dgl.compact_graphs(dgl.graph(g.find_edges(un_train_eid)))

    print(
        f'total nodes: {g.num_nodes()}, training nodes: {train_g.num_nodes()}, val_n_test nodes: {val_n_test_g.num_nodes()}'
    )
    old_nodes = val_n_test_g.num_nodes() - g.num_nodes() + train_g.num_nodes()
    print(
        f'old nodes in val_n_test: {old_nodes} ({round((old_nodes)*100/val_n_test_g.num_nodes(),4)}%)'
Exemplo n.º 6
0
def gaussian_square(x0: float,
                    xn: float,
                    y0: float,
                    yn: float,
                    stop: float,
                    steps: int,
                    f: str = '0',
                    ud_top: str = '0',
                    ud_bottom: str = '0',
                    ud_left: str = '0',
                    ud_right: str = '0',
                    u0: str = '0',
                    cell_size: float = 5.,
                    tol: float = 1e-4,
                    dy: bool = False,
                    path: str = 'data/gaussian_square_static.bin'):
    '''Create Gaussian Equation Dataset

    2D Gaussian equation, dynamic process, rectangle domain. The boundary
    condition of each side can be customised, and can be either static or
    dynamic (re-evaluated at every time step).

    The solution is advanced ``steps`` times with a step of ``stop / steps``;
    after each step the current solution is converted with ``to_dgl`` and all
    resulting graphs are written to ``path`` with ``save_graphs``.

    Args:
        x0: <float> left boundary for x
        xn: <float> right boundary for x
        y0: <float> left boundary for y
        yn: <float> right boundary for y
        stop: <float> process stop time
        steps: <int> number of time step
        f: <str> right part of laplace function, in cpp argument format
        ud_top: <str> boundary condition on the top of rectangle
        ud_bottom: <str> boundary condition on the bottom of rectangle
        ud_left: <str> boundary condition on the left of rectangle
        ud_right: <str> boundary condition on the right of rectangle
        u0: <str> initialization condition function, in cpp argument format
        cell_size: <float> cell size for created mesh
        tol: <float> boundary bias, e.g. (x-tol, x+tol) is a boundary on x
        dy: <bool> if the boundary condition is dynamic
        path: <str> path for saving generated dgl graph, in .bin format
    '''
    if dy:
        # Dynamic boundary: rectangle takes an extra time argument, 0 at start.
        mesh, function_space, bc = rectangle(x0, xn, y0, yn, ud_top, ud_bottom,
                                             ud_left, ud_right, cell_size, 0,
                                             tol)
    else:
        mesh, function_space, bc = rectangle(x0, xn, y0, yn, ud_top, ud_bottom,
                                             ud_left, ud_right, cell_size, tol)
    dt = stop / steps
    u0 = Expression(u0, degree=2)
    f = Expression(f, degree=2)
    # un holds the solution of the previous time step, starting from u0.
    un = interpolate(u0, function_space)

    u = TrialFunction(function_space)
    v = TestFunction(function_space)
    # Implicit time step in weak form: the unknown u appears in the gradient
    # term, the previous solution un and the source f on the right-hand side.
    F = u * v * dx + dt * dot(grad(u), grad(v)) * dx - (un + dt * f) * v * dx
    a, L = lhs(F), rhs(F)

    u = Function(function_space)
    t = 0
    graphs = []
    for _ in range(steps):
        t += dt
        if dy:
            # Re-evaluate the boundary condition at the current time. The same
            # call shape is unpacked into three values above, so take only the
            # last returned element instead of assuming a two-element result.
            *_, bc = rectangle(x0, xn, y0, yn, ud_top, ud_bottom, ud_left,
                               ud_right, cell_size, t, tol)
        solve(a == L, u, bc)
        un.assign(u)
        graphs.append(to_dgl(function=u, mesh=mesh))

    save_graphs(path, graphs)
Exemplo n.º 7
0
    def pre_process(self):
        """Load the processed DGL graph, building and caching it if needed.

        If ``<root>/processed/dgl_data_processed`` does not exist, the raw
        files are downloaded when missing (after confirmation through
        ``decide_download``), the graph is read from ``<root>/raw`` and saved
        with ``save_graphs``. In every case ``self.graph`` is finally set from
        ``load_graphs`` on the cached file.

        Exits the process with status -1 if the download is declined.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

        if not osp.exists(pre_processed_file_path):
            ### check if the downloaded file exists
            # Exactly one raw file is relevant for each (binary, is_hetero)
            # combination.
            if self.binary:
                # npz format
                required_file = ('edge_index_dict.npz'
                                 if self.is_hetero else 'data.npz')
            else:
                # csv file
                required_file = ('triplet-type-list.csv.gz'
                                 if self.is_hetero else 'edge.csv.gz')

            if not osp.exists(osp.join(self.root, 'raw', required_file)):
                url = self.meta_info['url']
                if decide_download(url):
                    path = download_url(url, self.original_root)
                    extract_zip(path, self.original_root)
                    os.unlink(path)
                    # Best-effort removal of a stale dataset folder; a missing
                    # folder is fine, but do not swallow KeyboardInterrupt or
                    # SystemExit as a bare ``except`` would.
                    shutil.rmtree(self.root, ignore_errors=True)
                    shutil.move(
                        osp.join(self.original_root, self.download_name),
                        self.root)
                else:
                    print('Stop download.')
                    exit(-1)

            raw_dir = osp.join(self.root, 'raw')

            add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

            ### pre-process and save
            # The meta info stores the literal string 'None' when there are no
            # additional files, otherwise a comma-separated list of names.
            node_files_spec = self.meta_info['additional node files']
            additional_node_files = ([] if node_files_spec == 'None' else
                                     node_files_spec.split(','))

            edge_files_spec = self.meta_info['additional edge files']
            additional_edge_files = ([] if edge_files_spec == 'None' else
                                     edge_files_spec.split(','))

            read_graph = (read_heterograph_dgl
                          if self.is_hetero else read_graph_dgl)
            graph = read_graph(
                raw_dir,
                add_inverse_edge=add_inverse_edge,
                additional_node_files=additional_node_files,
                additional_edge_files=additional_edge_files,
                binary=self.binary)[0]

            print('Saving...')
            save_graphs(pre_processed_file_path, graph, {})

        # Always go through the cached file so both paths yield the same
        # representation.
        self.graph, _ = load_graphs(pre_processed_file_path)
Exemplo n.º 8
0
    def pre_process(self):
        """Load the processed graph, building and caching it if needed.

        The cache lives at ``<root>/processed/dgl_data_processed``. Homogeneous
        graphs are stored with ``save_graphs`` / ``load_graphs``; heterogeneous
        graphs are stored as a pickled one-element list. If the cache does not
        exist, the raw files are downloaded when missing (after confirmation
        through ``decide_download``) and the graph is read from ``<root>/raw``.
        In every case ``self.graph`` is finally set by reading the cache file.

        Exits the process with status -1 if the download is declined.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

        if not osp.exists(pre_processed_file_path):
            ### check if the downloaded file exists
            required_file = ("triplet-type-list.csv.gz"
                             if self.is_hetero else "edge.csv.gz")

            if not osp.exists(osp.join(self.root, "raw", required_file)):
                url = self.meta_info[self.name]["url"]
                if decide_download(url):
                    path = download_url(url, self.original_root)
                    extract_zip(path, self.original_root)
                    os.unlink(path)
                    # Best-effort removal of a stale dataset folder; a missing
                    # folder is fine, but do not swallow KeyboardInterrupt or
                    # SystemExit as a bare ``except`` would.
                    shutil.rmtree(self.root, ignore_errors=True)
                    shutil.move(
                        osp.join(self.original_root, self.download_name),
                        self.root)
                else:
                    print("Stop download.")
                    exit(-1)

            raw_dir = osp.join(self.root, "raw")

            dataset_meta = self.meta_info[self.name]
            add_inverse_edge = dataset_meta["add_inverse_edge"] == "True"

            ### pre-process and save
            # The meta info stores the literal string 'None' when there are no
            # additional files, otherwise a comma-separated list of names.
            node_files_spec = dataset_meta["additional node files"]
            additional_node_files = ([] if node_files_spec == 'None' else
                                     node_files_spec.split(','))

            edge_files_spec = dataset_meta["additional edge files"]
            additional_edge_files = ([] if edge_files_spec == 'None' else
                                     edge_files_spec.split(','))

            if self.is_hetero:
                graph = read_csv_heterograph_dgl(
                    raw_dir,
                    add_inverse_edge=add_inverse_edge,
                    additional_node_files=additional_node_files,
                    additional_edge_files=additional_edge_files)[0]

                # open() does not create missing parent directories.
                os.makedirs(processed_dir, exist_ok=True)
                with open(pre_processed_file_path, 'wb') as f:
                    pickle.dump([graph], f)
            else:
                graph = read_csv_graph_dgl(
                    raw_dir,
                    add_inverse_edge=add_inverse_edge,
                    additional_node_files=additional_node_files,
                    additional_edge_files=additional_edge_files)[0]

                print('Saving...')
                save_graphs(pre_processed_file_path, graph, {})

        # Always go through the cached file so both paths yield the same
        # representation.
        if self.is_hetero:
            # NOTE(security): unpickling executes code embedded in the file.
            # This is only acceptable because the cache is produced by this
            # method; never point ``root`` at an untrusted directory.
            with open(pre_processed_file_path, 'rb') as f:
                self.graph = pickle.load(f)
        else:
            self.graph, _ = load_graphs(pre_processed_file_path)
Exemplo n.º 9
0
        neg_score = score_func(neg_g, emb).reshape(-1, neg_sample_size)
        filter_bias = neg_g.edata['false_neg'].reshape(
            -1, neg_sample_size).to(device)
        pos_score = F.logsigmoid(pos_score)
        neg_score = F.logsigmoid(neg_score)
        neg_score -= filter_bias.float()
        pos_score = pos_score.unsqueeze(1)
        rankings = torch.sum(neg_score >= pos_score, dim=1) + 1
        return np.mean(1.0 / rankings.cpu().numpy())


# Index the ('cpu', 'cuda') tuple with the availability flag: 'cuda' is picked
# when torch.cuda.is_available() is True, 'cpu' otherwise.
device = torch.device(('cpu', 'cuda')[torch.cuda.is_available()])
g = load_ws()
g = g.to(device)
g.readonly()
# Persist the loaded graph to ./ws.bin.
save_graphs('./ws.bin', g)
features = g.ndata['features']
features = features.to(device)
# Input feature size is the second dimension of the node feature matrix.
in_feats = g.ndata['features'].shape[1]

#Model hyperparameters
# The hidden size is tied to the input feature size.
n_hidden = in_feats
n_layers = 1
dropout = 0.5
aggregator_type = 'gcn'

# create GraphSAGE model
gconv_model = GraphSAGEModel(in_feats, n_hidden, n_hidden, n_layers, F.relu,
                             dropout, aggregator_type)
# Shuffle all edge ids and keep the first 80% of them as the training edges.
eids = np.random.permutation(g.number_of_edges())
train_eids = eids[:int(len(eids) * 0.8)]
Exemplo n.º 10
0
    def pre_process(self):
        """Load the processed graph and node labels, building them if needed.

        If ``<root>/processed/dgl_data_processed`` does not exist, the raw
        files are downloaded when missing (after confirmation through
        ``decide_download``), the graph and node labels are read from
        ``<root>/raw`` and both are saved with ``save_graphs``. In every case
        ``self.graph`` and ``self.labels`` are finally set from ``load_graphs``
        on the cached file: ``self.labels`` is the whole label dictionary for
        heterogeneous graphs and its ``'labels'`` entry otherwise.

        Exits the process with status -1 if the download is declined.
        """
        processed_dir = osp.join(self.root, 'processed')
        pre_processed_file_path = osp.join(processed_dir, 'dgl_data_processed')

        if not osp.exists(pre_processed_file_path):
            ### check if the downloaded file exists
            # Exactly one raw file is relevant for each (binary, is_hetero)
            # combination.
            if self.binary:
                # npz format
                required_file = 'edge_index_dict.npz' if self.is_hetero else 'data.npz'
            else:
                # csv file
                required_file = 'triplet-type-list.csv.gz' if self.is_hetero else 'edge.csv.gz'

            if not osp.exists(osp.join(self.root, 'raw', required_file)):
                url = self.meta_info['url']
                if decide_download(url):
                    path = download_url(url, self.original_root)
                    extract_zip(path, self.original_root)
                    os.unlink(path)
                    # Best-effort removal of a stale dataset folder; a missing
                    # folder is fine, but do not swallow KeyboardInterrupt or
                    # SystemExit as a bare ``except`` would.
                    shutil.rmtree(self.root, ignore_errors=True)
                    shutil.move(osp.join(self.original_root, self.download_name), self.root)
                else:
                    print('Stop download.')
                    exit(-1)

            raw_dir = osp.join(self.root, 'raw')

            ### pre-process and save
            add_inverse_edge = self.meta_info['add_inverse_edge'] == 'True'

            # The meta info stores the literal string 'None' when there are no
            # additional files, otherwise a comma-separated list of names.
            node_files_spec = self.meta_info['additional node files']
            additional_node_files = [] if node_files_spec == 'None' else node_files_spec.split(',')

            edge_files_spec = self.meta_info['additional edge files']
            additional_edge_files = [] if edge_files_spec == 'None' else edge_files_spec.split(',')

            is_classification = 'classification' in self.task_type

            def to_label_tensor(node_label):
                # Classification labels become integer class ids unless they
                # contain NaN, which an integer tensor cannot represent; every
                # other case is stored as float32.
                if is_classification and not np.isnan(node_label).any():
                    return torch.from_numpy(node_label).to(torch.long)
                return torch.from_numpy(node_label).to(torch.float32)

            if self.is_hetero:
                graph = read_heterograph_dgl(
                    raw_dir,
                    add_inverse_edge=add_inverse_edge,
                    additional_node_files=additional_node_files,
                    additional_edge_files=additional_edge_files,
                    binary=self.binary)[0]

                if self.binary:
                    # Close the archive deterministically instead of relying
                    # on garbage collection to release the file handle.
                    with np.load(osp.join(raw_dir, 'node-label.npz')) as npz:
                        raw_labels = {key: npz[key] for key in npz.files}
                else:
                    raw_labels = read_node_label_hetero(raw_dir)

                # convert into torch tensor, one entry per node type
                label_dict = {
                    nodetype: to_label_tensor(node_label)
                    for nodetype, node_label in raw_labels.items()
                }

            else:
                graph = read_graph_dgl(
                    raw_dir,
                    add_inverse_edge=add_inverse_edge,
                    additional_node_files=additional_node_files,
                    additional_edge_files=additional_edge_files,
                    binary=self.binary)[0]

                ### adding prediction target
                if self.binary:
                    with np.load(osp.join(raw_dir, 'node-label.npz')) as npz:
                        node_label = npz['node_label']
                else:
                    node_label = pd.read_csv(osp.join(raw_dir, 'node-label.csv.gz'), compression='gzip', header=None).values

                label_dict = {'labels': to_label_tensor(node_label)}

            print('Saving...')
            save_graphs(pre_processed_file_path, graph, label_dict)

        # Always go through the cached file so both paths yield the same
        # representation.
        self.graph, label_dict = load_graphs(pre_processed_file_path)

        if self.is_hetero:
            self.labels = label_dict
        else:
            self.labels = label_dict['labels']
Exemplo n.º 11
0
def save_graph(file_path, g):
    """Forward to ``save_graphs`` with the given path and graph(s).

    Parameters
    ----------
    file_path
        Destination handed to ``save_graphs`` as its first argument.
    g
        Graph or graphs to save; passed through unchanged.
    """
    save_graphs(file_path, g)