from collections import defaultdict

# TokenParser comes from the surrounding package (not shown in this excerpt);
# FakeVocabulary is sketched below.


class UastIds2Bag:
    """
    Converts a UAST to a bag-of-identifiers.
    """

    def __init__(self, vocabulary, token_parser=None):
        """
        :param vocabulary: The mapping from tokens to bag keys. If None, no mapping is performed.
        :param token_parser: Specify the token parser if you want to use a custom one. \
            :class:`TokenParser` is used if it is not specified.
        """
        self._vocabulary = FakeVocabulary() if vocabulary is None else vocabulary
        self._token_parser = TokenParser() if token_parser is None else token_parser

    @property
    def vocabulary(self):
        return self._vocabulary

    def uast_to_bag(self, uast, roles_filter="//*[@roleIdentifier and not(@roleQualified)]"):
        """
        Converts a UAST to a bag-of-words. The weights are identifier frequencies.
        The identifiers are preprocessed by :class:`TokenParser`.

        :param uast: The UAST root node.
        :param roles_filter: The libuast xpath query to filter identifiers.
        :return: The bag which maps vocabulary keys to identifier frequencies.
        """
        import bblfsh
        nodes = bblfsh.filter(uast, roles_filter)
        bag = defaultdict(int)
        for node in nodes:
            for sub in self._token_parser.process_token(node.token):
                try:
                    bag[self._vocabulary[sub]] += 1
                except KeyError:
                    # Sub-tokens missing from the vocabulary are skipped.
                    continue
        return bag
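# FakeVocabulary is referenced above but not defined in this excerpt; judging
# by the docstring ("If None, no mapping is performed"), it is presumably an
# identity mapping. A minimal sketch:
class FakeVocabulary:
    def __getitem__(self, item):
        return item


# A hedged usage sketch for the xpath-based variant above. It assumes a
# Babelfish server listening on the default port; "example.py" is a
# placeholder file name.
import bblfsh

client = bblfsh.BblfshClient("0.0.0.0:9432")
uast = client.parse("example.py").uast
bag = UastIds2Bag(vocabulary=None).uast_to_bag(uast)  # sub-token -> frequency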
# A variant of UastIds2Bag that selects identifier nodes by role during an
# explicit traversal instead of an xpath query:
class UastIds2Bag:
    """
    Converts a UAST to a bag-of-identifiers.
    """

    def __init__(self, vocabulary, token_parser=None):
        """
        :param vocabulary: The mapping from tokens to bag keys. If None, no mapping is performed.
        :param token_parser: Specify the token parser if you want to use a custom one. \
            :class:`TokenParser` is used if it is not specified.
        """
        self._vocabulary = vocabulary if vocabulary is not None else FakeVocabulary()
        self._token_parser = TokenParser() if token_parser is None else token_parser

    @property
    def vocabulary(self):
        return self._vocabulary

    def uast_to_bag(self, uast, role=SIMPLE_IDENTIFIER):
        """
        Converts a UAST to a bag-of-words. The weights are identifier frequencies.
        The identifiers are preprocessed by :class:`TokenParser`.

        :param uast: The UAST root node.
        :param role: The bblfsh Node role to build the bag of words from.
        :return: The bag which maps vocabulary keys to identifier frequencies.
        """
        stack = [uast]
        bag = defaultdict(int)
        while stack:
            # pop(0) makes this a breadth-first traversal.
            node = stack.pop(0)
            if role in node.roles:
                for sub in self._token_parser.process_token(node.token):
                    try:
                        bag[self._vocabulary[sub]] += 1
                    except KeyError:
                        continue
            stack.extend(node.children)
        return bag
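# A usage sketch for the role-parameterized variant above, reusing `uast` from
# the earlier sketch. A custom token parser only needs a process_token() method
# yielding sub-tokens; this stub is illustrative, not from the source.
class LowercaseParser:
    def process_token(self, token):
        if token:
            yield token.lower()


bag = UastIds2Bag(None, token_parser=LowercaseParser()).uast_to_bag(uast)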
# A reduced variant of UastIds2Bag that hardcodes both the role and the token
# parser:
class UastIds2Bag:
    """
    Converts a UAST to a bag-of-identifiers.
    """

    def __init__(self, vocabulary):
        """
        :param vocabulary: The mapping from tokens to bag keys. \
            If None, no mapping is performed.
        """
        self._vocabulary = vocabulary if vocabulary is not None else FakeVocabulary()
        self._token_parser = TokenParser()

    @property
    def vocabulary(self):
        return self._vocabulary

    def uast_to_bag(self, uast):
        """
        Converts a UAST to a bag-of-words. The weights are identifier frequencies.
        The identifiers are preprocessed by :class:`TokenParser`.

        :param uast: The UAST root node.
        :return: The bag which maps vocabulary keys to identifier frequencies.
        """
        stack = [uast]
        bag = defaultdict(int)
        while stack:
            node = stack.pop(0)
            if SIMPLE_IDENTIFIER in node.roles:
                for sub in self._token_parser.process_token(node.token):
                    try:
                        bag[self._vocabulary[sub]] += 1
                    except KeyError:
                        continue
            stack.extend(node.children)
        return bag
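# A usage sketch with an explicit vocabulary, reusing `uast` from the earlier
# sketch. A plain dict works because unknown sub-tokens are skipped by the
# KeyError handler; the mapping below is hypothetical.
vocab = {"foo": 0, "bar": 1}
bag = UastIds2Bag(vocab).uast_to_bag(uast)  # keys are vocab values, weights are frequencies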
import numpy
from scipy.sparse import coo_matrix


class Repo2CooccBase(Repo2Base):
    """
    Converts UASTs to co-occurrence matrices.
    """

    def __init__(self, *args, **kwargs):
        super(Repo2CooccBase, self).__init__(*args, **kwargs)
        self._token_parser = TokenParser()

    def convert_uasts(self, file_uast_generator):
        word2ind = self._get_vocabulary()
        dok_matrix = defaultdict(int)
        for file_uast in file_uast_generator:
            self._traverse_uast(file_uast.response.uast, word2ind, dok_matrix)
        n_tokens = len(word2ind)
        mat = coo_matrix((n_tokens, n_tokens), dtype=numpy.float32)
        if n_tokens == 0:
            return [], mat
        mat.row = row = numpy.empty(len(dok_matrix), dtype=numpy.int32)
        mat.col = col = numpy.empty(len(dok_matrix), dtype=numpy.int32)
        mat.data = data = numpy.empty(len(dok_matrix), dtype=numpy.float32)
        for i, (coord, val) in enumerate(sorted(dok_matrix.items())):
            row[i], col[i] = coord
            data[i] = val
        return self._get_result(word2ind, mat)

    def _get_vocabulary(self):
        raise NotImplementedError

    def _get_result(self, word2ind, mat):
        raise NotImplementedError

    def _update_dict(self, generator, word2ind, tokens):
        raise NotImplementedError

    def _flatten_children(self, root):
        ids = []
        stack = list(root.children)
        for node in stack:
            if SIMPLE_IDENTIFIER in node.roles:
                ids.append(node)
            else:
                # Appending while iterating is intentional: the loop also
                # visits the newly added grandchildren.
                stack.extend(node.children)
        return ids

    @staticmethod
    def _all2all(words, word2ind):
        for i in range(len(words)):
            for j in range(i + 1, len(words)):
                try:
                    wi = word2ind[words[i]]
                    wj = word2ind[words[j]]
                except KeyError:
                    continue
                yield wi, wj, 1
                yield wj, wi, 1

    def _process_node(self, root, word2ind, mat):
        children = self._flatten_children(root)
        tokens = []
        for ch in children:
            self._update_dict(self._token_parser.process_token(ch.token),
                              word2ind, tokens)
        # str.strip() never returns None, so a truthiness check is enough.
        if root.token.strip() and SIMPLE_IDENTIFIER in root.roles:
            self._update_dict(self._token_parser.process_token(root.token),
                              word2ind, tokens)
        for triplet in self._all2all(tokens, word2ind):
            mat[(triplet[0], triplet[1])] += triplet[2]
        return children

    def _extract_ids(self, root):
        queue = [root]
        while queue:
            node = queue.pop()
            if SIMPLE_IDENTIFIER in node.roles:
                yield node.token
            queue.extend(node.children)

    def _traverse_uast(self, root, word2ind, dok_mat):
        """
        Traverses the UAST and extracts the co-occurrence matrix.
        """
        stack = [root]
        new_stack = []
        while stack:
            for node in stack:
                children = self._process_node(node, word2ind, dok_mat)
                new_stack.extend(children)
            stack = new_stack
            new_stack = []
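# A minimal sketch of a concrete subclass filling in the three abstract hooks,
# to show how they fit together; the class name and hook bodies are
# illustrative, not taken from the source.
class Repo2Coocc(Repo2CooccBase):
    def _get_vocabulary(self):
        # Start from an empty vocabulary that _update_dict grows on the fly.
        return {}

    def _get_result(self, word2ind, mat):
        # Order the tokens by their matrix index and return them with the matrix.
        words = [w for w, _ in sorted(word2ind.items(), key=lambda p: p[1])]
        return words, mat

    def _update_dict(self, generator, word2ind, tokens):
        # Register every sub-token and remember it for the all-to-all pass.
        for token in generator:
            word2ind.setdefault(token, len(word2ind))
            tokens.append(token)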
from collections import deque
from itertools import permutations, product
from typing import Dict, Tuple

from scipy.sparse import diags


class ProxBase(Model2Base):
    """
    Contains common utilities for proximity matrix models.

    The proximity matrix captures structural information of the graph.
    If A is the adjacency matrix, useful proximity matrices include A^2,
    A(A^k - I)/(A - I), etc. To get embeddings of the nodes (the entities
    corresponding to the proximity matrix rows), we simply decompose it.
    """
    MODEL_FROM_CLASS = UASTModel
    MODEL_TO_CLASS = Cooccurrences

    def __init__(self, edges=EDGE_TYPES, *args, **kwargs):
        super(ProxBase, self).__init__(*args, **kwargs)
        self.edges = set(edges)
        self._token_parser = TokenParser()
        self._clear()

    def convert_model(self, model) -> Cooccurrences:
        """
        Updates the attributes by processing the UASTs in the input model,
        then converts them into a Cooccurrences model.

        :param model: UASTModel instance.
        :return: Cooccurrences model for all the UASTs in ``model``.
        """
        for uast in model.uasts:
            self._traverse_uast(uast)
        roles_to_roles = defaultdict(int)
        tokens_to_tokens = defaultdict(int)
        roles_to_tokens = defaultdict(int)

        def add_permutations(edge_type, node_items_list, item_to_item):
            if edge_type in self.edges:
                for node_items in node_items_list:
                    for node_item_a, node_item_b in permutations(node_items, 2):
                        item_to_item[(node_item_a, node_item_b)] += 1

        def add_product(edge_type, items_a, items_b, item_to_item):
            if edge_type in self.edges:
                for item_a, item_b in product(items_a, items_b):
                    item_to_item[(item_a, item_b)] += 1

        add_permutations("r", self.roles, roles_to_roles)
        add_permutations("t", self.tokens, tokens_to_tokens)
        for node_roles, node_tokens in zip(self.roles, self.tokens):
            add_product("rt", node_roles, node_tokens, roles_to_tokens)
        for node_a, node_b in self.dok_matrix:
            roles_a = self.roles[node_a]
            roles_b = self.roles[node_b]
            tokens_a = self.tokens[node_a]
            tokens_b = self.tokens[node_b]
            add_product("R", roles_a, roles_b, roles_to_roles)
            add_product("T", tokens_a, tokens_b, tokens_to_tokens)
            add_product("RT", roles_a, tokens_b, roles_to_tokens)
        if roles_to_roles or roles_to_tokens:
            n_roles = len(self.role2ind)
        else:
            n_roles = 0
        if tokens_to_tokens or roles_to_tokens:
            n_tokens = len(self.token2ind)
        else:
            n_tokens = 0
        n_nodes = n_roles + n_tokens
        n_values = len(roles_to_roles) + len(tokens_to_tokens) + len(roles_to_tokens)
        mat = coo_matrix((n_nodes, n_nodes), dtype=numpy.float32)
        mat.row = row = numpy.empty(n_values, dtype=numpy.int32)
        mat.col = col = numpy.empty(n_values, dtype=numpy.int32)
        mat.data = data = numpy.empty(n_values, dtype=numpy.float32)

        def fill_mat(item_to_item, offset):
            for i, (coord, val) in enumerate(sorted(item_to_item.items())):
                row[i + fill_mat.count] = coord[0] + offset[0]
                col[i + fill_mat.count] = coord[1] + offset[1]
                data[i + fill_mat.count] = val
            fill_mat.count += len(item_to_item)

        fill_mat.count = 0
        fill_mat(roles_to_roles, (0, 0))
        fill_mat(roles_to_tokens, (0, n_roles))
        fill_mat(tokens_to_tokens, (n_roles, n_roles))
        # Symmetrize without double-counting the diagonal.
        mat = coo_matrix(mat + mat.T - diags(mat.diagonal()))
        tokens, mat = self._adj_to_feat(self.role2ind, self.token2ind, mat)
        self._clear()
        prox = Cooccurrences()
        prox.construct(tokens=tokens, matrix=mat)
        return prox

    def _adj_to_feat(self, role2ind: Dict[int, int], token2ind: Dict[int, int],
                     mat) -> Tuple:
        """
        This must be implemented in the child classes.

        :param role2ind: Mapping from roles to indices, starting with 0.
        :param token2ind: Mapping from tokens to indices, starting with 0.
        :param mat: Adjacency matrix (:class:`scipy.sparse.coo_matrix`) with rows
                    corresponding to the node roles followed by the node tokens.
        :return: tuple (``tokens``, ``matrix``). ``tokens`` are generalized tokens
                 (usually roles + tokens), ``matrix`` rows correspond to ``tokens``.
        """
        raise NotImplementedError

    def _clear(self):
        """
        Releases memory.
        """
        self.roles = list()
        self.tokens = list()
        self.role2ind = dict()
        self.token2ind = dict()
        self.dok_matrix = defaultdict(int)

    def _traverse_uast(self, root) -> None:
        """
        Traverses the UAST and extracts the adjacency matrix.

        :param root: UAST root node.
        :return: None
        """
        n_nodes = len(self.roles)
        queue = deque([(root, n_nodes)])  # (node, node_idx)
        while queue:
            node, node_idx = queue.popleft()
            node_tokens = list(self._token_parser.process_token(node.token))
            for role in node.roles:
                self.role2ind.setdefault(role, len(self.role2ind))
            for token in node_tokens:
                self.token2ind.setdefault(token, len(self.token2ind))
            self.roles.append([self.role2ind[role] for role in node.roles])
            self.tokens.append([self.token2ind[token] for token in node_tokens])
            for ch in node.children:
                n_nodes += 1
                self.dok_matrix[(node_idx, n_nodes)] += 1
                queue.append((ch, n_nodes))
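# A hedged sketch of a possible child class (the name and the labeling scheme
# are illustrative, not from the source). It keeps the adjacency matrix as the
# proximity matrix and labels the rows, roles first and then tokens, matching
# the row layout documented in _adj_to_feat; it assumes the default EDGE_TYPES
# so that both role and token rows are present.
class AdjacencyProx(ProxBase):
    def _adj_to_feat(self, role2ind, token2ind, mat):
        tokens = [None] * (len(role2ind) + len(token2ind))
        for role, ind in role2ind.items():
            tokens[ind] = "RoleId_%d" % role  # hypothetical label format
        for token, ind in token2ind.items():
            tokens[len(role2ind) + ind] = token
        # A real child could return any proximity transform of `mat` here,
        # e.g. (mat @ mat).tocoo() for two-hop proximity.
        return tokens, mat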