def make_birel_matrix(self, relation='children'):
    """Build a dense matrix of (node id, related node id) pairs per graph.

    Args:
        relation: Name of the attribute on each graph struct that is
            indexed by node id and yields the ids of the related nodes
            (``'children'`` by default).

    Returns:
        An ``int32`` array of shape
        ``(len(self.graph_structs), self._max_nodes,
        self._max_bi_relations, 2)``. Entry ``[i, j, k]`` holds
        ``[nid, rel_nid]`` for the k-th related node of the j-th node
        (in ``graph.nodes`` iteration order) of the i-th graph. Slots
        that are not filled stay zero.

    Nodes past ``self._max_nodes`` and relations past
    ``self._max_bi_relations`` are silently dropped.
    """
    max_nodes = self._max_nodes
    max_relations = self._max_bi_relations
    birel = np.zeros(
        (len(self.graph_structs), max_nodes, max_relations, 2),
        dtype='int32')
    for i, gs in enumerate(self.graph_structs):
        for j, nid in enumerate(gs.graph.nodes):
            if j >= max_nodes:
                # No room left for further nodes of this graph.
                break
            for k, rel_nid in enumerate(getattr(gs, relation)[nid]):
                if k >= max_relations:
                    # No room left for further relations of this node.
                    break
                # Raw node ids are stored, not vocabulary indices.
                birel[i, j, k, :] = [nid, rel_nid]
    return birel
def make_vocabulary(self):
    """Return ``self.word2ind``, filling it from the graphs' node tokens.

    If ``self.word2ind`` already holds more than one entry it is returned
    untouched. Otherwise every node token of every graph in
    ``self.graph_structs`` is collected; tokens of nodes whose ``'type'``
    label equals ``'constant'`` are treated as constants, all others as
    special tokens. Each group is de-duplicated and sorted, and the
    tokens are then looked up in ``self.word2ind`` with the special
    tokens first.

    Returns:
        ``self.word2ind``.
    """
    existing = self.word2ind
    if existing is not None and len(existing) > 1:
        logging.info(
            'word2ind already exists ({0} entries). Reusing it. Some entries are: {1}'.format(
                len(existing), list(existing.items())[:10]))
        return existing

    logging.info('word2ind does not exist. Creating it.')
    token_counts = Counter()
    constant_tokens = set()
    special_tokens = set()
    for gs in self.graph_structs:
        graph = gs.graph
        for nid in graph.nodes:
            token = get_node_token(graph, nid)
            if get_label(graph, nid, 'type') == 'constant':
                constant_tokens.add(token)
            else:
                special_tokens.add(token)
            token_counts[token] += 1
    logging.info('Most common 10 tokens: {0}'.format(
        token_counts.most_common(10)))

    special = sorted(special_tokens)
    logging.info('Got {0} special tokens: {1}'.format(len(special), special))
    constants = sorted(constant_tokens)
    logging.info('Got {0} constant tokens. Some of them are: {1}'.format(
        len(constants), constants[:10]))

    vocab = special + constants
    assert '<unk>' not in vocab
    for word in vocab:
        # The lookup is done only for its effect on the mapping.
        # NOTE(review): presumably word2ind assigns a fresh index on
        # first access (e.g. a defaultdict) -- confirm.
        self.word2ind[word]
    logging.info('word2ind created. Some entries are: {0}'.format(
        list(self.word2ind.items())[:10]))
    return self.word2ind
def make_treelet_matrix(self, relation='treelet_predicate'):
    """Build a dense matrix of vocabulary-index triples for node treelets.

    Args:
        relation: Name of the attribute on each graph struct that is
            indexed by node id and yields pairs of related node ids
            (``'treelet_predicate'`` by default).

    Returns:
        An ``int32`` array of shape
        ``(len(self.graph_structs), self._max_nodes,
        self._max_treelets, 3)``. Entry ``[i, j, k]`` holds the
        ``self.word2ind`` values of the j-th node's token and of the
        tokens of the two nodes in its k-th pair. Unfilled slots stay
        zero.

    Raises:
        IndexError: If a graph has more nodes than ``self._max_nodes``
            that carry treelets, or a node has more than
            ``self._max_treelets`` pairs; no truncation is done here.
    """
    shape = (len(self.graph_structs), self._max_nodes, self._max_treelets, 3)
    treelets = np.zeros(shape, dtype='int32')
    for graph_idx, gs in enumerate(self.graph_structs):
        for node_idx, nid in enumerate(gs.graph.nodes):
            head_token = get_node_token(gs.graph, nid)
            pairs = getattr(gs, relation)[nid]
            for pair_idx, (first_nid, second_nid) in enumerate(pairs):
                triple = (
                    head_token,
                    get_node_token(gs.graph, first_nid),
                    get_node_token(gs.graph, second_nid),
                )
                # Lookups run in head, first, second order for every pair.
                treelets[graph_idx, node_idx, pair_idx, :] = [
                    self.word2ind[token] for token in triple
                ]
    return treelets
def make_vocabulary(self):
    """Fill ``self.word2ind`` from the graphs' node tokens and return it.

    Every node token of every graph in ``self.graph_structs`` is
    collected; tokens of nodes whose ``'type'`` label equals
    ``'constant'`` are treated as constants, all others as special
    tokens. Each group is de-duplicated and sorted, and the tokens are
    then looked up in ``self.word2ind`` with the special tokens first.
    Afterwards ``self.word2emb`` is set to a uniformly random array with
    one 2-dimensional row per ``word2ind`` entry.

    Returns:
        ``self.word2ind``.
    """
    token_counts = Counter()
    constant_tokens = set()
    special_tokens = set()
    for gs in self.graph_structs:
        graph = gs.graph
        for nid in graph.nodes:
            token = get_node_token(graph, nid)
            if get_label(graph, nid, 'type') == 'constant':
                constant_tokens.add(token)
            else:
                special_tokens.add(token)
            token_counts[token] += 1
    logging.info('Most common 10 tokens: {0}'.format(
        token_counts.most_common(10)))

    special = sorted(special_tokens)
    logging.info('Got {0} special tokens: {1}'.format(len(special), special))
    constants = sorted(constant_tokens)
    logging.info('Got {0} constant tokens. Some of them are: {1}'.format(
        len(constants), constants[:10]))

    vocab = special + constants
    assert '<unk>' not in vocab
    for word in vocab:
        # The lookup is done only for its effect on the mapping.
        # NOTE(review): presumably word2ind assigns a fresh index on
        # first access (e.g. a defaultdict) -- confirm.
        self.word2ind[word]

    embedding_shape = (len(self.word2ind), 2)
    self.word2emb = np.random.uniform(size=embedding_shape)
    return self.word2ind
def make_node_inds(self):
    """Build a per-graph table mapping node id to vocabulary index.

    Returns:
        A ``float32`` array of shape
        ``(len(self.graph_structs), self._max_nodes)`` where entry
        ``[i, nid]`` is ``self.word2ind`` of the token of node ``nid``
        in the i-th graph. Columns with no node stay zero.
    """
    graph_structs = self.graph_structs
    node_inds = np.zeros((len(graph_structs), self._max_nodes), dtype='float32')
    for row, gs in enumerate(graph_structs):
        graph = gs.graph
        for nid in graph.nodes:
            # The column is the node id itself, not its position in
            # graph.nodes.
            # NOTE(review): assumes node ids are ints below _max_nodes -- confirm.
            node_inds[row, nid] = self.word2ind[get_node_token(graph, nid)]
    return node_inds