def get_graph_kernel_dataset(dataset_ID, feat_norm='zscore'):
    """
    Load a graph-kernel dataset and build one node feature matrix per graph.

    Graphs are read with `read_graphs_txt` and converted with
    `nx_to_numpy(..., auto_pad=False)`. The node features of each graph are
    the concatenation, along the last axis and in this order, of:

    - node degrees (row sums of the adjacency matrix), normalized with
    `feat_norm`;
    - clustering coefficients, always normalized with 'zscore';
    - the 'attributes' node features, normalized with `feat_norm` (skipped
    when extracting them raises a KeyError);
    - the 'label' node features, normalized with 'ohe' (skipped when
    extracting them raises a KeyError).

    :param dataset_ID: dataset identifier, passed as-is to `read_graphs_txt`;
    :param feat_norm: normalization mode passed to `node_feat_norm` for the
    node attributes and the node degrees;
    :return:
    - A: the adjacency matrices returned by `nx_to_numpy`;
    - X: numpy array holding one node feature matrix per graph (an object
    array when the graphs have different numbers of nodes);
    - y: the one-hot encoded targets.
    """
    print('Loading data')
    nx_graphs, y = read_graphs_txt(dataset_ID)

    # Preprocessing
    y = _one_hot_targets(y)

    # Get node attributes
    try:
        A, X_attr, _ = nx_to_numpy(nx_graphs,
                                   nf_keys=['attributes'],
                                   auto_pad=False)
        X_attr = node_feat_norm(X_attr, feat_norm)
    except KeyError:
        print('Featureless nodes')
        # No node features requested: X_attr is expected to be None here,
        # which makes the concatenation step below skip it.
        A, X_attr, _ = nx_to_numpy(nx_graphs,
                                   auto_pad=False)

    # Get clustering coefficients (always zscore norm)
    clustering_coefficients = [
        np.array(list(nx.clustering(g).values()))[..., None] for g in nx_graphs
    ]
    clustering_coefficients = node_feat_norm(clustering_coefficients, 'zscore')

    # Get node degrees
    node_degrees = _graphs_to_array(
        [np.sum(adj, axis=-1, keepdims=True) for adj in A]
    )
    node_degrees = node_feat_norm(node_degrees, feat_norm)

    # Get node labels (always ohe norm)
    try:
        _, X_labs, _ = nx_to_numpy(nx_graphs,
                                   nf_keys=['label'],
                                   auto_pad=False)
        X_labs = node_feat_norm(X_labs, 'ohe')
    except KeyError:
        print('Label-less nodes')
        X_labs = None

    # Concatenate features
    Xs = [node_degrees, clustering_coefficients]
    if X_attr is not None:
        Xs.append(X_attr)
    if X_labs is not None:
        Xs.append(X_labs)
    X = [np.concatenate(x_, axis=-1) for x_ in zip(*Xs)]
    X = _graphs_to_array(X)

    return A, X, y


def _one_hot_targets(y):
    """One-hot encode the targets `y` into a dense array."""
    y = np.array(y)[..., None]
    try:
        # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`
        # (the old `sparse` keyword was removed in 1.4).
        encoder = OneHotEncoder(sparse_output=False, categories='auto')
    except TypeError:
        # Older scikit-learn only knows the `sparse` keyword.
        encoder = OneHotEncoder(sparse=False, categories='auto')
    return encoder.fit_transform(y)


def _graphs_to_array(items):
    """
    Build a numpy array from per-graph arrays that may differ in size.

    Same result as `np.array(items)` whenever that call succeeds. When the
    arrays cannot be stacked and NumPy refuses to build a ragged array
    (ValueError, NumPy >= 1.24), a 1-D object array with one entry per graph
    is returned, which is what older NumPy versions produced implicitly.
    """
    try:
        return np.array(items)
    except ValueError:
        out = np.empty(len(items), dtype=object)
        for i, item in enumerate(items):
            out[i] = item
        return out
示例#2
0
def generate_graph_matrices(graphs, auto_pad=False):
    """
    Convert graphs to adjacency, node and edge matrices with encoded labels.

    The graphs are converted with `nx_to_numpy`, reading the 'atomic_num'
    node feature and the 'bond_type' edge feature and adding self loops.
    The distinct values found across all graphs are then collected, and every
    node / edge matrix is encoded against them with `label_to_one_hot`.

    :param graphs: graphs to convert, passed as-is to `nx_to_numpy`;
    :param auto_pad: forwarded to `nx_to_numpy`;
    :return: the adjacency matrices from `nx_to_numpy`, a list with one
    encoded node matrix per graph, and a list with one encoded edge matrix
    per graph.
    """
    adjacency, node_labels, edge_labels = nx_to_numpy(
        graphs,
        nf_keys=['atomic_num'],
        ef_keys=['bond_type'],
        auto_pad=auto_pad,
        self_loops=True
    )

    # Node label values seen anywhere in the dataset
    node_values = []
    for labels in node_labels:
        node_values.extend(np.unique(labels))
    node_vocab = np.unique(node_values)

    node_one_hot = []
    for labels in node_labels:
        node_one_hot.append(label_to_one_hot(labels, node_vocab))

    # Edge label values seen anywhere in the dataset
    edge_values = []
    for labels in edge_labels:
        edge_values.extend(np.unique(labels))
    edge_vocab = np.unique(edge_values)

    edge_one_hot = []
    for labels in edge_labels:
        edge_one_hot.append(label_to_one_hot(labels, edge_vocab))

    return adjacency, node_one_hot, edge_one_hot
示例#3
0
def generate_graph_matrices(graphs, auto_pad=False):
    """
    Build the A, X, E matrices for a list of graphs.

    `nx_to_numpy` extracts the adjacency matrices together with the
    'atomic_num' node feature and the 'bond_type' edge feature (self loops
    enabled). Node and edge matrices are then encoded with
    `label_to_one_hot`, using the set of distinct values observed over the
    whole list of graphs.

    :param graphs: list of networkx graphs
    :param auto_pad: bool, forwarded to `nx_to_numpy`
    :return A, X, E: adjacency matrices, list of encoded node matrices, list
    of encoded edge matrices
    """

    def encode_all(matrices):
        # Collect every distinct value over all graphs, then encode each
        # graph against that shared vocabulary.
        vocabulary = np.unique([value
                                for matrix in matrices
                                for value in np.unique(matrix)])
        return [label_to_one_hot(matrix, vocabulary) for matrix in matrices]

    conversion_options = dict(nf_keys=['atomic_num'],
                              ef_keys=['bond_type'],
                              auto_pad=auto_pad,
                              self_loops=True)
    adjacency, node_feats, edge_feats = nx_to_numpy(graphs,
                                                    **conversion_options)

    encoded_nodes = encode_all(node_feats)
    encoded_edges = encode_all(edge_feats)
    return adjacency, encoded_nodes, encoded_edges
示例#4
0
文件: qm9.py 项目: yaniv256/spektral
def load_data(nf_keys=None,
              ef_keys=None,
              auto_pad=True,
              self_loops=False,
              amount=None,
              return_type='numpy'):
    """
    Loads the QM9 chemical data set of small molecules.

    Nodes represent heavy atoms (hydrogens are discarded), edges represent
    chemical bonds.

    The node features represent the chemical properties of each atom, and are
    loaded according to the `nf_keys` argument.
    See `spektral.datasets.qm9.NODE_FEATURES` for possible node features, and
    see [this link](http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx)
    for the meaning of each property. Usually, it is sufficient to load the
    atomic number.

    The edge features represent the type and stereoscopy of each chemical bond
    between two atoms.
    See `spektral.datasets.qm9.EDGE_FEATURES` for possible edge features, and
    see [this link](http://www.nonlinear.com/progenesis/sdf-studio/v0.9/faq/sdf-file-format-guidance.aspx)
    for the meaning of each property. Usually, it is sufficient to load the
    type of bond.

    If `DATA_PATH` does not exist, the dataset is downloaded first.

    :param nf_keys: list or str, node features to return (see `qm9.NODE_FEATURES`
    for available features); defaults to all of `NODE_FEATURES`;
    :param ef_keys: list or str, edge features to return (see `qm9.EDGE_FEATURES`
    for available features); defaults to all of `EDGE_FEATURES`;
    :param auto_pad: if `return_type='numpy'`, zero pad graph matrices to have
    the same number of nodes;
    :param self_loops: if `return_type='numpy'`, add self loops to adjacency
    matrices;
    :param amount: the amount of molecules to return (in ascending order by
    number of atoms).
    :param return_type: `'numpy'`, `'networkx'`, or `'sdf'`, data format to return;
    :return:
    - if `return_type='numpy'`, the adjacency matrix, node features,
    edge features, and a Pandas dataframe containing labels;
    - if `return_type='networkx'`, a list of graphs in Networkx format,
    and a dataframe containing labels;
    - if `return_type='sdf'`, a list of molecules in the internal SDF format and
    a dataframe containing labels.
    :raise ValueError: if `return_type` is not in `RETURN_TYPES`.
    """
    if return_type not in RETURN_TYPES:
        raise ValueError('Possible return_type: {}'.format(RETURN_TYPES))

    if not os.path.exists(DATA_PATH):
        _download_data()  # Try to download dataset

    print('Loading QM9 dataset.')
    sdf_file = os.path.join(DATA_PATH, 'qm9.sdf')
    data = load_sdf(sdf_file, amount=amount)  # Internal SDF format

    # Load labels
    labels_file = os.path.join(DATA_PATH, 'qm9.sdf.csv')
    labels = load_csv(labels_file)
    if amount is not None:
        labels = labels[:amount]

    # NOTE: return_type is compared by value (==). Identity (`is`) against a
    # string literal only works when the caller's string happens to be
    # interned, and is a SyntaxWarning on Python >= 3.8.
    if return_type == 'sdf':
        return data, labels

    # Convert to Networkx
    data = [sdf_to_nx(_) for _ in data]

    if return_type == 'numpy':
        # Accept a single key as a plain string; fall back to all features
        if nf_keys is None:
            nf_keys = NODE_FEATURES
        elif isinstance(nf_keys, str):
            nf_keys = [nf_keys]
        if ef_keys is None:
            ef_keys = EDGE_FEATURES
        elif isinstance(ef_keys, str):
            ef_keys = [ef_keys]

        adj, nf, ef = nx_to_numpy(data,
                                  auto_pad=auto_pad,
                                  self_loops=self_loops,
                                  nf_keys=nf_keys,
                                  ef_keys=ef_keys)
        return adj, nf, ef, labels
    elif return_type == 'networkx':
        return data, labels
    else:
        # Should not get here
        raise RuntimeError()
示例#5
0
def load_data(return_type='numpy',
              nf_keys=None,
              ef_keys=None,
              auto_pad=True,
              self_loops=False,
              amount=None):
    """
    Loads the QM9 molecules dataset.

    If `DATA_PATH` does not exist, the dataset is downloaded first. Unless
    `return_type='sdf'`, molecules are converted to Networkx with
    `sdf_to_nx(..., keep_hydrogen=True)`.

    :param return_type: 'networkx', 'numpy', or 'sdf', data format to return;
    :param nf_keys: list or str, node features to return (see `qm9.NODE_FEATURES`
    for available features); defaults to all of `NODE_FEATURES`;
    :param ef_keys: list or str, edge features to return (see `qm9.EDGE_FEATURES`
    for available features); defaults to all of `EDGE_FEATURES`;
    :param auto_pad: if `return_type='numpy'`, zero pad graph matrices to have
    the same number of nodes;
    :param self_loops: if `return_type='numpy'`, add self loops to adjacency
    matrices;
    :param amount: the amount of molecules to return (in order).
    :return: if `return_type='numpy'`, the adjacency matrix, node features,
    edge features, and a Pandas dataframe containing labels;
    if `return_type='networkx'`, a list of graphs in Networkx format,
    and a dataframe containing labels;
    if `return_type='sdf'`, a list of molecules in the internal SDF format and
    a dataframe containing labels.
    :raise ValueError: if `return_type` is not in `RETURN_TYPES`.
    """
    if return_type not in RETURN_TYPES:
        raise ValueError('Possible return_type: {}'.format(RETURN_TYPES))

    if not os.path.exists(DATA_PATH):
        _ = dataset_downloader()  # Try to download dataset

    print('Loading QM9 dataset.')
    sdf_file = os.path.join(DATA_PATH, 'qm9.sdf')
    data = load_sdf(sdf_file, amount=amount)  # Internal SDF format

    # Load labels
    labels_file = os.path.join(DATA_PATH, 'qm9.sdf.csv')
    labels = load_csv(labels_file)
    if amount is not None:
        labels = labels[:amount]

    # NOTE: return_type is compared by value (==). Identity (`is`) against a
    # string literal depends on string interning and is a SyntaxWarning on
    # Python >= 3.8; a valid but non-interned string would reach RuntimeError.
    if return_type == 'sdf':
        return data, labels

    # Convert to Networkx
    data = [sdf_to_nx(_, keep_hydrogen=True) for _ in data]

    if return_type == 'numpy':
        # Accept a single key as a plain string; fall back to all features
        if nf_keys is None:
            nf_keys = NODE_FEATURES
        elif isinstance(nf_keys, str):
            nf_keys = [nf_keys]
        if ef_keys is None:
            ef_keys = EDGE_FEATURES
        elif isinstance(ef_keys, str):
            ef_keys = [ef_keys]

        adj, nf, ef = nx_to_numpy(data,
                                  auto_pad=auto_pad,
                                  self_loops=self_loops,
                                  nf_keys=nf_keys,
                                  ef_keys=ef_keys)
        return adj, nf, ef, labels
    elif return_type == 'networkx':
        return data, labels
    else:
        # Should not get here
        raise RuntimeError()
示例#6
0
文件: tud.py 项目: kmader/spektral
def load_data(dataset_name, normalize_features=None, clean=False):
    """
    Loads one of the Benchmark Data Sets for Graph Kernels from TU Dortmund
    ([link](https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets)).
    The node features are computed by concatenating the following features for
    each node, in this order:

    - node degrees, normalized with z-score;
    - clustering coefficient, normalized with z-score;
    - node attributes, if available, normalized as specified in `normalize_features`;
    - node labels, if available, one-hot encoded.

    The dataset is downloaded first if it is not found under `DATA_PATH`.

    :param dataset_name: name of the dataset to load (see `spektral.datasets.tud.AVAILABLE_DATASETS`).
    :param normalize_features: `None`, `'zscore'` or `'ohe'`, how to normalize
    the node features (only applied to node attributes).
    :param clean: if True, `'_clean'` is appended to the dataset name, to
    return a version of the dataset with no isomorphic graphs.
    :return:
    - a list of adjacency matrices;
    - a numpy array with one node feature matrix per graph (an object array
    when the graphs have different numbers of nodes);
    - a numpy array containing the one-hot encoded targets.
    :raise ValueError: if `dataset_name` is not in `AVAILABLE_DATASETS`.
    """
    if dataset_name not in AVAILABLE_DATASETS:
        raise ValueError('Available datasets: {}'.format(AVAILABLE_DATASETS))

    if clean:
        dataset_name += '_clean'
    if not os.path.exists(DATA_PATH + dataset_name):
        _download_data(dataset_name)

    # Read data
    nx_graphs, y = _read_graphs(dataset_name)

    # Preprocessing
    y = _encode_targets(y)

    # Get node attributes
    try:
        A, X_attr, _ = nx_to_numpy(nx_graphs,
                                   nf_keys=['attributes'],
                                   auto_pad=False)
        X_attr = _normalize_node_features(X_attr, normalize_features)
    except KeyError:
        print('Featureless nodes')
        # No node features requested: X_attr is expected to be None here,
        # which makes the concatenation step below skip it.
        A, X_attr, _ = nx_to_numpy(nx_graphs, auto_pad=False)

    # Get clustering coefficients (always zscore norm)
    clustering_coefficients = [
        np.array(list(nx.clustering(g).values()))[..., None] for g in nx_graphs
    ]
    clustering_coefficients = _normalize_node_features(clustering_coefficients,
                                                       'zscore')

    # Get node degrees (always zscore norm)
    node_degrees = _as_graph_array(
        [np.sum(adj, axis=-1, keepdims=True) for adj in A]
    )
    node_degrees = _normalize_node_features(node_degrees, 'zscore')

    # Get node labels
    try:
        _, X_labs, _ = nx_to_numpy(nx_graphs,
                                   nf_keys=['label'],
                                   auto_pad=False)
        X_labs = _normalize_node_features(X_labs, 'ohe')
    except KeyError:
        print('Label-less nodes')
        X_labs = None

    # Concatenate features
    Xs = [node_degrees, clustering_coefficients]
    if X_attr is not None:
        Xs.append(X_attr)
    if X_labs is not None:
        Xs.append(X_labs)
    X = [np.concatenate(x_, axis=-1) for x_ in zip(*Xs)]
    X = _as_graph_array(X)

    return A, X, y


def _encode_targets(y):
    """One-hot encode the targets `y` into a dense array."""
    y = np.array(y)[..., None]
    try:
        # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`
        # (the old `sparse` keyword was removed in 1.4).
        encoder = OneHotEncoder(sparse_output=False, categories='auto')
    except TypeError:
        # Older scikit-learn only knows the `sparse` keyword.
        encoder = OneHotEncoder(sparse=False, categories='auto')
    return encoder.fit_transform(y)


def _as_graph_array(items):
    """
    Build a numpy array from per-graph arrays that may differ in size.

    Same result as `np.array(items)` whenever that call succeeds. When the
    arrays cannot be stacked and NumPy refuses to build a ragged array
    (ValueError, NumPy >= 1.24), a 1-D object array with one entry per graph
    is returned, which is what older NumPy versions produced implicitly.
    """
    try:
        return np.array(items)
    except ValueError:
        out = np.empty(len(items), dtype=object)
        for i, item in enumerate(items):
            out[i] = item
        return out