Example #1
 def _transform_vertex_pair_base(self, G, v, u, distance, feature_list):
     # for all radii
     for radius in range(self.min_r, self.r + 2, 2):
         for label_index in range(G.graph['label_size']):
             if (radius < len(G.node[v]['neighborhood_graph_hash'][label_index])
                     and radius < len(G.node[u]['neighborhood_graph_hash'][label_index])):
                 # feature as a pair of neighborhoods at (radius, distance)
                 # canonicalization of the pair of neighborhoods
                 v_hash = G.node[v]['neighborhood_graph_hash'][label_index][radius]
                 u_hash = G.node[u]['neighborhood_graph_hash'][label_index][radius]
                 if v_hash < u_hash:
                     first_hash = v_hash
                     second_hash = u_hash
                 else:
                     first_hash = u_hash
                     second_hash = v_hash
                 t = [first_hash, second_hash, radius, distance]
                 feature = fast_hash(t, self.bitmask)
                 key = fast_hash([radius, distance], self.bitmask)
                 if G.graph.get('weighted', False) is False:
                     feature_list[key][feature] += 1
                 else:
                     feature_list[key][feature] += G.node[v][
                         'neighborhood_graph_weight'][radius] + G.node[u][
                             'neighborhood_graph_weight'][radius]
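Every snippet on this page leans on fast_hash (and the fast_hash_2 / fast_hash_vec variants), which is never defined here. A minimal sketch of a compatible interface, assuming the functions fold integer sequences into codes clipped by a bitmask; this illustrates the calling convention only, not EDeN's actual implementation:

def fast_hash(int_list, bitmask=2 ** 20 - 1):
    # fold a list of integers into one code, clipped to the feature-space
    # size given by bitmask (illustrative stand-in, not the real function)
    code = 1
    for x in int_list:
        code = (code * 31 + x) & bitmask
    return code + 1  # keep codes strictly positive


def fast_hash_2(a, b, bitmask=2 ** 20 - 1):
    # two-argument convenience wrapper, used for (label, degree) pairs
    return fast_hash([a, b], bitmask)


def fast_hash_vec(int_list, bitmask=2 ** 20 - 1):
    # one code per prefix, so the result can be indexed by radius
    return [fast_hash(int_list[:i + 1], bitmask) for i in range(len(int_list))]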
Example #2
def graph_hash(graph, hash_bitmask, node_name_label=lambda id, node: node['hlabel']):
    """
        so we calculate a hash of a graph
    """
    node_names = {n: calc_node_name(graph, n, hash_bitmask, node_name_label) for n in graph.nodes()}
    # combine the two endpoint hashes symmetrically, so edge direction is irrelevant
    tmp_fast_hash = lambda a, b: fast_hash([(a ^ b) + (a + b), min(a, b), max(a, b)])
    l = [tmp_fast_hash(node_names[a], node_names[b]) for (a, b) in graph.edges()]
    l.sort()
    # isolated nodes contribute their raw labels, since they have no edge hashes
    isolates = [n for (n, d) in graph.degree_iter() if d == 0]
    z = [node_name_label(node_id, graph.node[node_id]) for node_id in isolates]
    z.sort()
    return fast_hash(l + z, hash_bitmask)
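A hypothetical call site for graph_hash above. The function uses the networkx 1.x API (graph.node, degree_iter), so this sketch assumes that environment; the node labels and the bitmask are made up for illustration:

import networkx as nx

g = nx.Graph()
g.add_node(0, hlabel=11)
g.add_node(1, hlabel=22)
g.add_node(2, hlabel=33)  # isolated node, picked up via the degree check
g.add_edge(0, 1)

# structurally identical graphs with identical labels hash to the same value
print(graph_hash(g, 2 ** 20 - 1))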
Example #3
 def _compute_neighborhood_graph_hash(self, root, graph):
     # list all hashed labels at increasing distances
     hash_list = []
     # for all distances
     root_dist_dict = graph.node[root]['remote_neighbours']
     for node_set in root_dist_dict.itervalues():
         # create a list of hashed labels
         hash_label_list = []
         for v in node_set:
             # compute the vertex hashed label by hashing the hlabel field
             # with the degree of the vertex (obtained as the size of the
             # adjacency dictionary for the vertex v) or, in case positional
             # is set, using the relative position of the vertex v
             # w.r.t. the root vertex
             if self.positional:
                 vhlabel = fast_hash_2(
                     graph.node[v]['hlabel'], root - v)
             else:
                 vhlabel = fast_hash_2(
                     graph.node[v]['hlabel'], len(graph[v]))
             hash_label_list.append(vhlabel)
         # sort it
         hash_label_list.sort()
         # hash it
         hashed_nodes_at_distance_d_in_neighborhood = fast_hash(
             hash_label_list)
         hash_list.append(hashed_nodes_at_distance_d_in_neighborhood)
     # hash the sequence of hashes of the node set at increasing
     # distances into a list of features
     hash_neighborhood = fast_hash_vec(hash_list)
     graph.node[root]['neigh_graph_hash'] = hash_neighborhood
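This method expects graph.node[root]['remote_neighbours'] to map each distance to the set of nodes at exactly that distance from the root. The attribute is prepared elsewhere in the vectorizer; a hedged sketch of how such a structure could be built (the helper below is my own illustration, written against the networkx 1.x attribute style used in these examples):

import networkx as nx
from collections import defaultdict

def build_remote_neighbours(graph, root, max_dist):
    # distance -> set of nodes at exactly that distance from root
    dist = nx.single_source_shortest_path_length(graph, root, max_dist)
    remote = defaultdict(set)
    for node, d in dist.items():
        remote[d].add(node)
    graph.node[root]['remote_neighbours'] = remote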
Example #4
 def _compute_neighborhood_graph_hash(self, root, graph):
     # list all hashed labels at increasing distances
     hash_list = []
     # for all distances
     root_dist_dict = graph.node[root]['remote_neighbours']
     for node_set in root_dist_dict.itervalues():
         # create a list of hashed labels
         hash_label_list = []
         for v in node_set:
             # compute the vertex hashed label by hashing the hlabel field
             # with the degree of the vertex (obtained as the size of the
             # adjacency dictionary for the vertex v) or, in case positional
             # is set, using the relative position of the vertex v
             # w.r.t. the root vertex
             if self.positional:
                 vhlabel = fast_hash_2(
                     graph.node[v]['hlabel'],
                     root - v)
             else:
                 vhlabel = fast_hash_2(
                     graph.node[v]['hlabel'], len(graph[v]))
             hash_label_list.append(vhlabel)
         # sort it
         hash_label_list.sort()
         # hash it
         hashed_nodes_at_distance_d_in_neighborhood = fast_hash(
             hash_label_list)
         hash_list.append(hashed_nodes_at_distance_d_in_neighborhood)
     # hash the sequence of hashes of the node set at increasing
     # distances into a list of features
     hash_neighborhood = fast_hash_vec(hash_list)
     graph.node[root]['neigh_graph_hash'] = hash_neighborhood
Example #5
def graph_hash(graph, hash_bitmask, node_name_label=None):
    """
        so we calculate a hash of a graph
    """
    l = []
    node_name_cache = {}
    all_nodes = set(graph.nodes())
    visited = set()
    # all the edges
    for (a, b) in graph.edges():
        visited.add(a)
        visited.add(b)

        ha = node_name_cache.get(a, -1)
        if ha == -1:
            ha = calc_node_name(graph, a, hash_bitmask, node_name_label)
            node_name_cache[a] = ha
        hb = node_name_cache.get(b, -1)
        if hb == -1:
            hb = calc_node_name(graph, b, hash_bitmask, node_name_label)
            node_name_cache[b] = hb
        l.append((ha ^ hb) + (ha + hb))
    l.sort()

    # nodes that don't have edges
    if node_name_label is None:
        z = [graph.node[node_id]['hlabel'][0] for node_id in all_nodes - visited]
    else:
        z = [graph.node[node_id][node_name_label] for node_id in all_nodes - visited]
    z.sort()
    ihash = fast_hash(l + z, hash_bitmask)
    return ihash
Example #6
 def _compute_neighborhood_graph_hash(self, root, G):
     hash_neighborhood_list = []
     # for all labels
     for label_index in range(G.graph['label_size']):
         # list all hashed labels at increasing distances
         hash_list = []
         # for all distances
         root_dist_dict = G.node[root]['remote_neighbours']
         for node_set in root_dist_dict.itervalues():
             # create a list of hashed labels
             hash_label_list = []
             for v in node_set:
                 vhlabel = G.node[v]['hlabel'][label_index]
                 hash_label_list.append(vhlabel)
             # sort it
             hash_label_list.sort()
             # hash it
             hashed_nodes_at_distance_d_in_neighborhood_set = fast_hash(
                 hash_label_list, self.bitmask)
             hash_list.append(
                 hashed_nodes_at_distance_d_in_neighborhood_set)
         # hash the sequence of hashes of the node set at increasing
         # distances into a list of features
         hash_neighborhood = fast_hash_vec(hash_list, self.bitmask)
         hash_neighborhood_list.append(hash_neighborhood)
     G.node[root]['neighborhood_graph_hash'] = hash_neighborhood_list
Example #7
def calc_node_name(interfacegraph, node, hash_bitmask, node_name_label=lambda id, node: node['hlabel']):
    '''
    Calculate the hash of a single node, as one step of generating the hash of a graph.
    Note: the case where the node has no neighbors is currently untested.
    '''
    d = nx.single_source_shortest_path_length(interfacegraph, node, 20)
    l = [node_name_label(nid, interfacegraph.node[nid]) + dis for nid, dis in d.items()]
    l.sort()
    return fast_hash(l, hash_bitmask)
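On a small path graph the behaviour is easy to trace: every reachable node contributes its label plus its shortest-path distance to the root (capped at 20 by the cutoff), and the sorted list is hashed. A toy illustration, again assuming networkx 1.x attribute access and a fast_hash like the sketch near the top of the page:

import networkx as nx

g = nx.path_graph(3)  # 0 - 1 - 2
for n in g.nodes():
    g.node[n]['hlabel'] = 100 + n

# for root 0 the contributions are 100+0, 101+1, 102+2 -> sorted [100, 102, 104]
print(calc_node_name(g, 0, 2 ** 20 - 1))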
Example #8
 def _transform(self, instance_id, seq):
     if seq is None or len(seq) == 0:
         raise Exception('ERROR: something went wrong, empty instance at position %d.' % instance_id)
     # extract kmer hash codes for all kmers up to r in all positions in seq
     seq_len = len(seq)
     neighborhood_hash_cache = [self._compute_neighborhood_hash(seq, pos) for pos in range(seq_len)]
     # construct features as pairs of kmers up to distance d for all radii up to r
     feature_list = defaultdict(lambda: defaultdict(float))
     for pos in range(seq_len):
         for radius in range(self.min_r, self.r + 1):
             if radius < len(neighborhood_hash_cache[pos]):
                 feature = [neighborhood_hash_cache[pos][radius], radius]
                 for distance in range(self.min_d, self.d + 1):
                     if pos + distance + radius < seq_len:
                         dfeature = feature + [distance, neighborhood_hash_cache[pos + distance][radius]]
                         feature_code = fast_hash(dfeature, self.bitmask)
                         key = fast_hash([radius, distance], self.bitmask)
                         feature_list[key][feature_code] += 1
     return self._normalization(feature_list, instance_id)
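The index bookkeeping in the nested loops is easier to see on a toy sequence: for each position, the kmer of each radius is paired with the kmer of the same radius `distance` positions downstream, subject to the boundary check `pos + distance + radius < seq_len`. A standalone sketch with raw kmers standing in for the hash codes (none of this is EDeN's API):

seq = 'ACGTAC'
min_r, r, min_d, d = 0, 2, 0, 2

for pos in range(len(seq)):
    for radius in range(min_r, r + 1):
        if pos + radius < len(seq):
            left = seq[pos:pos + radius + 1]
            for distance in range(min_d, d + 1):
                if pos + distance + radius < len(seq):
                    right = seq[pos + distance:pos + distance + radius + 1]
                    print(pos, radius, distance, left, right)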
Example #9
 def _label_preprocessing(self, G):
     try:
         G.graph['label_size'] = self.label_size
         for n, d in G.nodes_iter(data=True):
             # for dense or sparse vectors
             if isinstance(d['label'], list) or isinstance(
                     d['label'], dict):
                 node_entity, data = self._extract_entity_and_label(d)
                 if isinstance(d['label'], dict):
                     data = self._convert_dict_to_sparse_vector(data)
                 # create a list of integer codes of size: label_size
                 # each integer code is determined as follows:
                 # for each entity, use the corresponding
                 # discretization_model_dict[node_entity] to extract the id
                 # of the nearest cluster centroid, and return the centroid
                 # id as the integer code
                 hlabel = []
                 for i in range(self.label_size):
                     if len(self.discretization_model_dict[node_entity]) <= i:
                         raise Exception(
                             'Error: discretization_model_dict for node entity: %s has length: %d but component %d was required'
                             % (node_entity,
                                len(self.discretization_model_dict[node_entity]),
                                i))
                     predictions = self.discretization_model_dict[
                         node_entity][i].predict(data)
                     if len(predictions) != 1:
                         raise Exception(
                             'Error: discretizer returned %d predictions instead of a single prediction'
                             % len(predictions))
                     discretization_code = predictions[0] + 1
                     code = fast_hash(
                         [hash(node_entity), discretization_code],
                         self.bitmask)
                     hlabel.append(code)
                 G.node[n]['hlabel'] = hlabel
             elif isinstance(d['label'], basestring):
                 # replicate the hashed string label self.label_size times
                 # so that qualitative (i.e. string) labels can be compared
                 # to the discretized labels
                 hlabel = int(hash(d['label']) & self.bitmask) + 1
                 G.node[n]['hlabel'] = [hlabel] * self.label_size
             else:
                 raise Exception(
                     'ERROR: something went wrong, type of node label is unknown: %s'
                     % d['label'])
     except Exception as e:
         import datetime
         curr_time = datetime.datetime.now().strftime(
             "%A, %d. %B %Y %I:%M%p")
         print("Program run failed on %s" % curr_time)
         print(e.__doc__)
         print(e.message)
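The string-label branch is the only part of this method that runs without a fitted discretization model, and it can be traced in isolation: the string is hashed once, masked, shifted by one so the code is non-zero, and replicated label_size times. A quick standalone trace (values are illustrative; note that in Python 3 basestring would be str and hash() is salted per process):

bitmask = 2 ** 20 - 1
label_size = 3
hlabel = int(hash('C') & bitmask) + 1
print([hlabel] * label_size)  # one code, replicated to align with discretized label vectors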
Example #10
 def _compute_vertex_based_features(self, seq):
     if seq is None or len(seq) == 0:
         raise Exception('ERROR: something went wrong, empty instance.')
     # extract kmer hash codes for all kmers up to r in all positions in seq
     feature_dict = {}
     seq_len = len(seq)
     neighborhood_hash_cache = [self._compute_neighborhood_hash(seq, pos) for pos in range(seq_len)]
     for pos in range(seq_len):
         # construct features as pairs of kmers up to distance d for all radii up to r
         feature_list = defaultdict(lambda: defaultdict(float))
         for radius in range(self.min_r, self.r + 1):
             if radius < len(neighborhood_hash_cache[pos]):
                 feature = [neighborhood_hash_cache[pos][radius], radius]
                 for distance in range(self.min_d, self.d + 1):
                     if pos + distance + radius < seq_len:
                         dfeature = feature + [distance, neighborhood_hash_cache[pos + distance][radius]]
                         feature_code = fast_hash(dfeature, self.bitmask)
                         key = fast_hash([radius, distance], self.bitmask)
                         feature_list[key][feature_code] += 1
         feature_dict.update(self._normalization(feature_list, pos))
     X = self._convert_dict_to_sparse_matrix(feature_dict)
     return X
Example #11
 def _transform_vertex_pair_base(self, G, v, u, distance, feature_list):
     # for all radii
     for radius in range(self.min_r, self.r + 2, 2):
         for label_index in range(G.graph['label_size']):
             if radius < len(G.node[v]['neighborhood_graph_hash'][label_index]) and radius < len(G.node[u]['neighborhood_graph_hash'][label_index]):
                 # feature as a pair of neighborhoods at (radius, distance)
                 # canonicalization of the pair of neighborhoods
                 v_hash = G.node[v]['neighborhood_graph_hash'][label_index][radius]
                 u_hash = G.node[u]['neighborhood_graph_hash'][label_index][radius]
                 if v_hash < u_hash:
                     first_hash = v_hash
                     second_hash = u_hash
                 else:
                     first_hash = u_hash
                     second_hash = v_hash
                 t = [first_hash, second_hash, radius, distance]
                 feature = fast_hash(t, self.bitmask)
                 key = fast_hash([radius, distance], self.bitmask)
                 if G.graph.get('weighted', False) is False:
                     feature_list[key][feature] += 1
                 else:
                     feature_list[key][feature] += G.node[v]['neighborhood_graph_weight'][radius] + G.node[u]['neighborhood_graph_weight'][radius]
Example #12
def calc_node_name(interfacegraph, node, hash_bitmask, node_name_label):
    '''
    Calculate the hash of a single node, as one step of generating the hash of a graph.
    '''
    d = nx.single_source_shortest_path_length(interfacegraph, node, 20)
    # d maps node id -> distance from the root node
    # l collects label + distance values, one per reachable node
    if node_name_label is None:
        l = [interfacegraph.node[nid]['hlabel'][0] + dis for nid, dis in d.items()]
    else:
        l = [interfacegraph.node[nid][node_name_label] + dis for nid, dis in d.items()]
    l.sort()
    return fast_hash(l, hash_bitmask)
Example #13
def calc_node_name2(interfacegraph, node, hash_bitmask, node_name_label):
    '''
    Calculate the hash of a single node, as one step of generating the hash of a graph.
    '''
    d = nx.single_source_shortest_path_length(interfacegraph, node, 20)
    # d maps node id -> distance from the root node
    # l collects the hashed label values of all reachable nodes
    if node_name_label is None:
        # unlike calc_node_name, the distance is ignored here: all hlabel
        # components of every reachable node are collected instead
        l = []
        for nid, dis in d.items():
            l.extend(interfacegraph.node[nid]['hlabel'])
    else:
        l = [interfacegraph.node[nid][node_name_label] + dis for nid, dis in d.items()]
    l.sort()
    return fast_hash(l, hash_bitmask)
Example #14
 def _label_preprocessing(self, G):
     try:
         G.graph['label_size'] = self.label_size
         for n, d in G.nodes_iter(data=True):
             # for dense or sparse vectors
             if isinstance(d['label'], list) or isinstance(d['label'], dict):
                 node_entity, data = self._extract_entity_and_label(d)
                 if isinstance(d['label'], dict):
                     data = self._convert_dict_to_sparse_vector(data)
                 # create a list of integer codes of size: label_size
                 # each integer code is determined as follows:
                 # for each entity, use the corresponding discretization_model_dict[node_entity] to extract the id of the
                 # nearest cluster centroid, return the centroid id as the
                 # integer code
                 hlabel = []
                 for i in range(self.label_size):
                     if len(self.discretization_model_dict[node_entity]) <= i:
                         raise Exception('Error: discretization_model_dict for node entity: %s has length: %d but component %d was required' % (
                             node_entity, len(self.discretization_model_dict[node_entity]), i))
                     predictions = self.discretization_model_dict[node_entity][i].predict(data)
                     if len(predictions) != 1:
                         raise Exception('Error: discretizer returned %d predictions instead of a single prediction' % len(predictions))
                     discretization_code = predictions[0] + 1
                     code = fast_hash([hash(node_entity), discretization_code], self.bitmask)
                     hlabel.append(code)
                 G.node[n]['hlabel'] = hlabel
             elif isinstance(d['label'], basestring):
                 # replicate the hashed string label self.label_size times
                 # so that qualitative (i.e. string) labels can be compared
                 # to the discretized labels
                 hlabel = int(hash(d['label']) & self.bitmask) + 1
                 G.node[n]['hlabel'] = [hlabel] * self.label_size
             else:
                 raise Exception('ERROR: something went wrong, type of node label is unknown: %s' % d['label'])
     except Exception as e:
         import datetime
         curr_time = datetime.datetime.now().strftime("%A, %d. %B %Y %I:%M%p")
         print("Program run failed on %s" % curr_time)
         print(e.__doc__)
         print(e.message)
Example #15
 def _compute_neighborhood_graph_hash(self, root, G):
     hash_neighborhood_list = []
     # for all labels
     for label_index in range(G.graph['label_size']):
         # list all hashed labels at increasing distances
         hash_list = []
         # for all distances
         root_dist_dict = G.node[root]['remote_neighbours']
         for node_set in root_dist_dict.itervalues():
             # create a list of hashed labels
             hash_label_list = []
             for v in node_set:
                 vhlabel = G.node[v]['hlabel'][label_index]
                 hash_label_list.append(vhlabel)
             # sort it
             hash_label_list.sort()
             # hash it
             hashed_nodes_at_distance_d_in_neighborhood_set = fast_hash(hash_label_list, self.bitmask)
             hash_list.append(hashed_nodes_at_distance_d_in_neighborhood_set)
         # hash the sequence of hashes of the node set at increasing
         # distances into a list of features
         hash_neighborhood = fast_hash_vec(hash_list, self.bitmask)
         hash_neighborhood_list.append(hash_neighborhood)
     G.node[root]['neighborhood_graph_hash'] = hash_neighborhood_list
Example #16
def _fhash(stuff):
    # clip feature codes to a fixed 20-bit feature space
    return eden.fast_hash(stuff, 2 ** 20 - 1)
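The hard-coded second argument fixes the feature-space size: 2 ** 20 - 1 is a 20-bit mask, so every code lands in a space of roughly one million features. A quick check of that arithmetic:

bitmask = 2 ** 20 - 1
print(bitmask == 0xFFFFF)   # True: the low 20 bits
print(bitmask + 1)          # 1048576 distinct values once masked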