Example no. 1
0
def read_edgelist(f, sep="\t", header=None, **readcsvkwargs):
    """
    Build a csrgraph from an edgelist file.

    f : str
        Filename to read
    sep : str
        CSV-style separator. Eg. Use "," if comma separated
    header : int or None
        pandas read_csv parameter. Use if column names are present
    read_csv_kwargs : keyword arguments for pd.read_csv
        Pass these kwargs as you would normally to pd.read_csv.
    Returns : csrgraph
    """
    # Columns are positional: (src, dst) or (src, dst, weight)
    edges = pd.read_csv(f, sep=sep, header=header, **readcsvkwargs)
    ncols = len(edges.columns)
    if ncols == 2:
        edges.columns = ['src', 'dst']
    elif ncols == 3:
        edges.columns = ['src', 'dst', 'weight']
    else:
        raise ValueError(f"""
            Invalid columns: {edges.columns}
            Expected 2 (source, destination)
            or 3 (source, destination, weight)
            Read File: \n{edges.head(5)}
        """)
    # Node IDs must be non-negative so they can live in uint32
    if edges['src'].min() < 0 or edges['dst'].min() < 0:
        raise ValueError(f"""
            Invalid uint32 value in node IDs. Max/min :
            SRC: {edges['src'].max()}, {edges['src'].min()}
            DST: {edges['dst'].max()}, {edges['dst'].min()}
        """)
    edges.sort_values(by='src', inplace=True, ignore_index=True)
    # Every node appearing on either end of an edge
    unique_nodes = list(set(edges['src'].unique()).union(set(edges['dst'].unique())))
    # Categorical categories give a sorted, de-duplicated node array
    names = np.array(pd.Series(unique_nodes).astype('category').cat.categories)
    name_dict = dict(zip(names, np.arange(names.shape[0])))
    # Remap endpoints onto the dense 0..nnodes-1 ID space.
    # The mapping is monotonic (categories are sorted), so the earlier
    # sort by src still holds after remapping.
    edges['src'] = edges['src'].map(name_dict).astype(np.uint32)
    edges['dst'] = edges['dst'].map(name_dict).astype(np.uint32)
    n_nodes = names.shape[0]
    G = methods._edgelist_to_graph(edges, n_nodes, nodenames=names)
    # Drop the intermediate dataframe before returning
    edges = None
    gc.collect()
    return G
Example no. 2
0
    def random_walk_resample(self, walklen=4, epochs=30):
        """
        Build a new graph out of random-walk co-occurences.

        Random walks are generated on this graph; every pair of nodes
        that appears together in a walk becomes an edge, weighted by
        its co-occurence count.

        Recommendation: many short walks > fewer long walks

        TODO: add node2vec walk parameters
        """
        walk_matrix = self.random_walks(walklen=walklen, epochs=epochs)
        cooccurence_edges = random_walks.walks_to_edgelist(walk_matrix)
        return methods._edgelist_to_graph(cooccurence_edges,
                                          nnodes=self.nnodes,
                                          nodenames=self.names)
Example no. 3
0
def read_edgelist(f, directed=True, sep=r"\s+", header=None, keep_default_na=False, **readcsvkwargs):
    """
    Creates a csrgraph from an edgelist.

    The edgelist should be in the form
       [source  destination]
        or
       [source  destination  edge_weight]

    The first column needs to be the source, the second the destination.
    If there is a third column it's assumed to be edge weights.

    Otherwise, all arguments from pandas.read_csv can be used to read the file.

    f : str
        Filename to read
    directed : bool
        Whether the graph is directed or undirected.
        All csrgraphs are directed, undirected graphs simply add "return edges"
    sep : str
        CSV-style separator. Eg. Use "," if comma separated
    header : int or None
        pandas read_csv parameter. Use if column names are present
    keep_default_na: bool
        pandas read_csv argument to prevent casting any value to NaN
    read_csv_kwargs : keyword arguments for pd.read_csv
        Pass these kwargs as you would normally to pd.read_csv.
    Returns : csrgraph
    """
    # Read in csv correctly to each column
    elist = pd.read_csv(f, sep=sep, header=header, keep_default_na=keep_default_na, **readcsvkwargs)
    if len(elist.columns) == 2:
        elist.columns = ['src', 'dst']
        # Unweighted input: default every edge weight to 1.0
        elist['weight'] = np.ones(elist.shape[0])
    elif len(elist.columns) == 3:
        elist.columns = ['src', 'dst', 'weight']
    else:
        raise ValueError(f"""
            Invalid columns: {elist.columns}
            Expected 2 (source, destination)
            or 3 (source, destination, weight)
            Read File: \n{elist.head(5)}
        """)
    # Create name mapping to normalize node IDs
    # Somehow this is 1.5x faster than np.union1d. Shame on numpy.
    allnodes = list(
        set(elist.src.unique())
        .union(set(elist.dst.unique())))
    # Factor all nodes to unique IDs
    names = (
        pd.Series(allnodes).astype('category')
        .cat.categories
    )
    nnodes = names.shape[0]
    # Pick the smallest unsigned integer dtype that can hold every node ID
    if nnodes > UINT32_MAX:
        dtype = np.uint64
    elif nnodes > UINT16_MAX:
        dtype = np.uint32
    else:
        dtype = np.uint16
    name_dict = dict(zip(names,
                         np.arange(names.shape[0], dtype=dtype)))
    elist.src = elist.src.map(name_dict)
    elist.dst = elist.dst.map(name_dict)
    # Series.map can upcast; force back down when uint16 is the target dtype
    if dtype == np.uint16:
        elist.src = np.uint16(elist.src.values)
        elist.dst = np.uint16(elist.dst.values)
    # Store weights as float16 to halve memory
    # (NEED TO DOUBLE CHECK this doesn't affect the embedding output)
    elist.weight = np.float16(elist.weight.values)
    # clean up temp data
    allnodes = None
    name_dict = None
    gc.collect()
    # If undirected graph, append edgelist to reversed self
    if not directed:
        other_df = elist.copy()
        other_df.columns = ['dst', 'src', 'weight']
        elist = pd.concat([elist, other_df])
        other_df = None
        gc.collect()
    # Extract numpy arrays and release the dataframe before sorting,
    # which keeps peak memory lower than sorting the dataframe itself
    src = elist.src.to_numpy()
    dst = elist.dst.to_numpy()
    weight = elist.weight.to_numpy()
    elist = None
    gc.collect()
    # _edgelist_to_graph needs the edges sorted by source node
    sort_index = np.argsort(src)
    src = src[sort_index]
    dst = dst[sort_index]
    weight = weight[sort_index]
    # csrgraph doesn't accept np.float16: widen back to float32
    weight = np.float32(weight)
    del sort_index
    G = methods._edgelist_to_graph(
        src, dst, weight,
        nnodes, nodenames=names
    )
    return G
Example no. 4
0
def read_adjacency(f, directed=True):
    """
    Creates a csrgraph from a dense adjacency-matrix CSV.

    f : str
        Filename to read. The first column and the header row are
        taken as node names (pd.read_csv with index_col=0).
    directed : bool
        Whether the graph is directed or undirected.
        All csrgraphs are directed, undirected graphs simply add "return edges"
    Returns : csrgraph
    """
    adj_df = pd.read_csv(f, index_col=0)
    # float16 halves memory; NOTE(review): assumes weights fit float16 range
    adj_df = adj_df.astype('float16')
    allnodes = list(adj_df.columns)
    # Blank the diagonal (self-loops) so stack() drops those entries
    # (DataFrame.stack drops NaN by default)
    adj_df.values[tuple([np.arange(len(adj_df))]*2)] = np.nan
    elist = adj_df.stack().reset_index()
    del adj_df
    if len(elist.columns) == 3:
        elist.columns = ['src', 'dst', 'weight']
    else:
        raise ValueError(f"""
            Invalid columns: {elist.columns}
            Expected 3 (source, destination, weight)
            Read File: \n{elist.head(5)}
        """)
    # Factor all nodes to unique IDs (node set comes from the matrix columns)
    names = (
        pd.Series(allnodes).astype('category')
        .cat.categories
    )
    nnodes = names.shape[0]
    # Pick the smallest unsigned integer dtype that can hold every node ID
    if nnodes > UINT32_MAX:
        dtype = np.uint64
    elif nnodes > UINT16_MAX:
        dtype = np.uint32
    else:
        dtype = np.uint16
    name_dict = dict(zip(names,
                         np.arange(names.shape[0], dtype=dtype)))
    elist.src = elist.src.map(name_dict)
    elist.dst = elist.dst.map(name_dict)
    # Series.map can upcast; force back down when uint16 is the target dtype
    if dtype == np.uint16:
        elist.src = np.uint16(elist.src.values)
        elist.dst = np.uint16(elist.dst.values)
    # Store weights as float16 to halve memory
    # (NEED TO DOUBLE CHECK this doesn't affect the embedding output)
    elist.weight = np.float16(elist.weight.values)
    # clean up temp data
    allnodes = None
    name_dict = None
    gc.collect()
    # If undirected graph, append edgelist to reversed self
    if not directed:
        other_df = elist.copy()
        other_df.columns = ['dst', 'src', 'weight']
        elist = pd.concat([elist, other_df])
        other_df = None
        gc.collect()
    # Extract numpy arrays and release the dataframe before sorting,
    # which keeps peak memory lower than sorting the dataframe itself
    src = elist.src.to_numpy()
    dst = elist.dst.to_numpy()
    weight = elist.weight.to_numpy()
    elist = None
    gc.collect()
    # _edgelist_to_graph needs the edges sorted by source node
    sort_index = np.argsort(src)
    src = src[sort_index]
    dst = dst[sort_index]
    weight = weight[sort_index]
    # csrgraph doesn't accept np.float16: widen back to float32
    weight = np.float32(weight)
    del sort_index
    G = methods._edgelist_to_graph(
        src, dst, weight,
        nnodes, nodenames=names
    )
    return G
Example no. 5
0
def read_edgelist(f,
                  directed=True,
                  sep=r"\s+",
                  header=None,
                  keep_default_na=False,
                  **readcsvkwargs):
    """
    Load an edgelist file into a csrgraph.

    Expected file layout:
       [source  destination]
        or
       [source  destination  edge_weight]

    Column one is the source, column two the destination; a third
    column, if present, holds edge weights. Any other pandas.read_csv
    argument may be forwarded through **readcsvkwargs.

    f : str
        Filename to read
    directed : bool
        Whether the graph is directed or undirected.
        All csrgraphs are directed, undirected graphs simply add "return edges"
    sep : str
        CSV-style separator. Eg. Use "," if comma separated
    header : int or None
        pandas read_csv parameter. Use if column names are present
    keep_default_na: bool
        pandas read_csv argument to prevent casting any value to NaN
    read_csv_kwargs : keyword arguments for pd.read_csv
        Pass these kwargs as you would normally to pd.read_csv.
    Returns : csrgraph
    """
    edges = pd.read_csv(f,
                        sep=sep,
                        header=header,
                        keep_default_na=keep_default_na,
                        **readcsvkwargs)
    ncols = len(edges.columns)
    if ncols == 2:
        edges.columns = ['src', 'dst']
        # Unweighted input: every edge gets weight 1.0
        edges['weight'] = np.ones(edges.shape[0])
    elif ncols == 3:
        edges.columns = ['src', 'dst', 'weight']
    else:
        raise ValueError(f"""
            Invalid columns: {edges.columns}
            Expected 2 (source, destination)
            or 3 (source, destination, weight)
            Read File: \n{edges.head(5)}
        """)
    # Gather every node seen on either end of an edge
    # (set union is measurably faster than np.union1d here)
    node_pool = list(set(edges.src.unique()).union(set(edges.dst.unique())))
    # Categorical categories give a sorted, de-duplicated node array
    names = pd.Series(node_pool).astype('category').cat.categories
    nnodes = names.shape[0]
    # Smallest unsigned dtype that fits all node IDs
    dtype = np.uint64 if nnodes > UINT32_MAX else np.uint32
    name_dict = dict(zip(names, np.arange(names.shape[0], dtype=dtype)))
    edges.src = edges.src.map(name_dict)
    edges.dst = edges.dst.map(name_dict)
    # Drop temporaries before the potentially large concat below
    node_pool = None
    name_dict = None
    gc.collect()
    # Undirected: mirror every edge so both directions exist
    if not directed:
        mirrored = edges.copy()
        mirrored.columns = ['dst', 'src', 'weight']
        edges = pd.concat([edges, mirrored])
        mirrored = None
        gc.collect()
    # _edgelist_to_graph requires edges ordered by source node
    edges = edges.sort_values(by='src')
    src = edges.src.to_numpy()
    dst = edges.dst.to_numpy()
    weight = edges.weight.to_numpy()
    edges = None
    gc.collect()
    return methods._edgelist_to_graph(src, dst, weight, nnodes, nodenames=names)