Example No. 1
 def bblfsh_node_to_node(
     self, bblfsh_node: BblfshNode, parent: Optional[Node]
 ) -> Node:
     """Create a `Node` given a `BblfshNode` and an optional parent."""
     position = bool(
         bblfsh_node.start_position.offset or bblfsh_node.end_position.offset
     )
     if position:
         start = bblfsh_node.start_position.offset
         end = bblfsh_node.end_position.offset
         if self.convert_to_utf8:
             start = self.binary_to_str[start]
             end = self.binary_to_str[end]
         token = self.file_content[start:end]
     else:
         start = None
         end = None
         token = bblfsh_node.token
         # Workaround https://github.com/bblfsh/javascript-driver/issues/65
         if not token and bblfsh_node.internal_type == "StringLiteralTypeAnnotation":
             token = bblfsh_node.properties["value"]
     return Node(
         token=token,
         internal_type=bblfsh_node.internal_type,
         roles=[role_name(role_id) for role_id in bblfsh_node.roles],
         parent=parent,
         start=start,
         end=end,
     )
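
For context, here is a rough sketch of how such a converter could be driven over a whole Babelfish tree. The recursive walk below is hypothetical: it only assumes that bblfsh_node_to_node behaves as above, that BblfshNode exposes a children sequence, and that the resulting Node keeps a mutable children list.

 def bblfsh_tree_to_tree(self, root: BblfshNode) -> Node:
     """Hypothetical helper: convert a whole bblfsh tree by walking it depth-first."""
     def convert(bblfsh_node: BblfshNode, parent: Optional[Node]) -> Node:
         node = self.bblfsh_node_to_node(bblfsh_node, parent)
         # assumption: Node stores its children in a plain list
         node.children = [convert(child, node) for child in bblfsh_node.children]
         return node
     return convert(root, None)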
Example No. 2
 def __str__(self):
     return ("BaseNode(roles=" + str([bblfsh.role_name(role) for role in self.roles]) +
             ", token=" + str(self.token) +
             ", internal_type=" + str(self.internal_type) +
             ", properties=" + str(self.properties) + ")")
Example No. 3
def node_to_roles(node: bblfsh.Node) -> str:
    """
    Convert the bblfsh roles of a node to a unique string representation.
    :param node: base_node
    :return: the node's roles or the string's hash (in case it is an UP/DOWN token or a leaf)
    """
    return " | ".join(bblfsh.role_name(r) for r in sorted(node.roles))
Example No. 4
def get_node_properties(tree):
    D = tree.get_dict()
    node_properties = {}
    node_properties['token'] = ""
    node_properties['roles'] = []
    if '@type' in D:
        node_properties['internal_type'] = tree.internal_type
    else:
        node_properties['internal_type'] = None
    try:
        node_properties['start_line'] = D['@pos']['start']['line']
        node_properties['start_col'] = D['@pos']['start']['col']
        node_properties['end_line'] = D['@pos']['end']['line']
        node_properties['end_col'] = D['@pos']['end']['col']
    except (KeyError, TypeError):
        # nodes without position information fall back to zeros
        node_properties['start_line'] = 0
        node_properties['start_col'] = 0
        node_properties['end_line'] = 0
        node_properties['end_col'] = 0

    try:
        node_properties['roles'] = [bblfsh.role_name(r) for r in tree.roles]
    except AttributeError:
        # some nodes expose no roles; keep the empty default
        pass

    if '@token' in D:
        node_properties['token'] = tree.token

    if node_properties['internal_type'] == 'uast:Identifier':
        node_properties['token'] = tree.get()['Name']

    return node_properties
Example No. 5
def save_roles(output, nodes_count):
    roles_count = [(tuple(bblfsh.role_name(role_id) for role_id in n), count)
                   for n, count in nodes_count]
    os.makedirs(output, exist_ok=True)
    with open(os.path.join(output, 'roles_count.pickle'), 'wb') as f:
        pickle.dump(roles_count, f)
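
Reading the counts back is then symmetric; a small sketch, assuming output is the same directory passed to save_roles above:

with open(os.path.join(output, 'roles_count.pickle'), 'rb') as f:
    roles_count = pickle.load(f)
# roles_count is a list of ((role_name, ...), count) tuples, as written above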
Example No. 6
def print_statistics(rules_count, nodes_count):
    print("Top twenty rules:")
    for i in range(20):
        print("{}. {}\n".format(i, rules_count[i][0]))

    print("Top twenty nodes:")
    for i in range(20):
        print("{}. {}\n\t{}\n".format(
            i, [bblfsh.role_name(role_id) for role_id in nodes_count[i][0]],
            nodes_count[i][1][0]))

    with open('../results/rules.bin', 'w') as rules_file:
        rules_file.write("\n".join(str(r) for r, _ in rules_count))
Example No. 7
def get_rule(node):
    if len(node.children) == 0:
        token = "{} {}".format([bblfsh.role_name(i) for i in node.roles],
                               node.token)
        return [], BaseNode(node.roles, token, node.internal_type,
                            node.properties)

    rules = []
    rhs = []
    tokens = "{}".format([bblfsh.role_name(i) for i in node.roles])
    for child in node.children:
        c_rules, c_node = get_rule(child)
        rhs.append(c_node)
        # a space separates the parent's role list from each child token
        tokens += " " + c_node.token
        rules.extend(c_rules)

    tokens += "\n"
    lhs = BaseNode(node.roles, tokens, node.internal_type, node.properties)
    rules.append(Rule(rhs, lhs))
    return rules, lhs
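
One possible driver for get_rule, counting how often each extracted rule occurs; it assumes the Rule and BaseNode classes from the snippets above and a parsed UAST root node named uast_root (both names hypothetical here):

from collections import Counter

rules, _ = get_rule(uast_root)                      # extract grammar-like rules from the UAST
rules_count = Counter(str(rule) for rule in rules)  # group identical rules by their string form
for rule_repr, count in rules_count.most_common(20):
    print(count, rule_repr)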
Example No. 8
def plot_tsne(nodes_dict):
    """
    Compute the t-SNE dimensionality reduction values of input parameter and plot them in 2D
    :param id_word_vec: vector containing the tuples (id, word, embedding) to be plotted
    """
    nodes = [node for node, _ in nodes_dict]
    counts = [len(instances) for node, instances in nodes_dict]
    tsne = TSNE(n_components=2)
    X_tsne = tsne.fit_transform([BaseNode.roles_as_vector(n) for n in nodes])
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], s=counts)

    for i, roles in enumerate(nodes):
        plt.annotate([bblfsh.role_name(role) for role in roles],
                     (X_tsne[i, 0], X_tsne[i, 1]))

    plt.show()
Example No. 9
 def _group_quote_predictions(
         self, vnodes_y: Sequence[VirtualNode],
         vnodes: Sequence[VirtualNode]) -> QuotedNodeTripleMapping:
     quotes_classes = frozenset(
         (CLASS_INDEX[CLS_DOUBLE_QUOTE], CLASS_INDEX[CLS_SINGLE_QUOTE]))
     y_indices = {id(vnode): i for i, vnode in enumerate(vnodes_y)}
     grouped_predictions = OrderedDict()
     for vnode1, vnode2, vnode3 in zip(vnodes, islice(vnodes, 1, None),
                                       islice(vnodes, 2, None)):
         if (id(vnode1) not in y_indices or id(vnode3) not in y_indices
                 or vnode2.node is None
                 or vnode1.y[-1] not in quotes_classes
                 or vnode3.y[0] != vnode1.y[-1]):
             continue
         vnode2_roles = frozenset(
             role_name(role_id) for role_id in vnode2.node.roles)
         if "STRING" in vnode2_roles:
             grouped_predictions[id(vnode1)] = vnode1, vnode2, vnode3
             grouped_predictions[id(vnode3)] = None
     return grouped_predictions
Example No. 10
 def testRoleIdName(self) -> None:
     self.assertEqual(role_id(role_name(1)), 1)
     self.assertEqual(role_name(role_id("IDENTIFIER")), "IDENTIFIER")
Example No. 11
 def _compute_row(self, node: bblfsh.Node) -> Iterable[Tuple[int, int]]:
     for role_id in node.roles:
         role = bblfsh.role_name(role_id)
         if role in self.selected_names_index:
             yield 1, self.selected_names_index[role]
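
For illustration, one way those (value, column) pairs might be assembled into a dense feature row; numpy is assumed, extractor stands for an instance of the surrounding class (name hypothetical), node is a bblfsh node, and selected_names_index is taken to map role names to column positions as implied above:

import numpy

row = numpy.zeros(len(extractor.selected_names_index), dtype=numpy.int8)
for value, column in extractor._compute_row(node):
    row[column] = value  # mark the selected roles present on this node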
Example No. 12
def main():
    """Entry point."""
    args = parse_args()
    slogging.setup(args.log_level, False)
    clients = threading.local()
    pool = ThreadPoolExecutor(max_workers=args.threads)
    log = logging.getLogger("main")
    log.info("Will parse %d files in %d threads", len(args.input),
             args.threads)
    internal_types = defaultdict(int)
    roles = defaultdict(int)
    reserved = set()
    language = args.parquet_language
    inputs = list(handle_input_arg(args.input))
    progress = tqdm(total=len(inputs))
    progress_lock = threading.Lock()
    errors = False

    def analyze_code_file(path: str):
        nonlocal errors
        if errors:
            return
        try:
            try:
                client = clients.client
            except AttributeError:
                client = bblfsh.BblfshClient(args.bblfsh)
                clients.client = client
            response = client.parse(path)
            nonlocal language
            if not language:
                language = response.language
            elif language != response.language:
                log.warning("dropped %s - language mismatch %s != %s", path,
                            language, response.language)
                return
            content = Path(path).read_text()
            analyze_uast(path, content, response.uast, internal_types, roles,
                         reserved)
        except:  # noqa: E722
            log.exception("Parsing %s", path)
            errors = True
        finally:
            with progress_lock:
                progress.disable = False  # this is needed, do not remove
                progress.update(1)

    def analyze_parquet_row(row: pandas.Series, filepath: str):
        nonlocal errors
        if errors:
            return
        nonlocal language
        try:
            path = "%s:%s" % (filepath, row.path)
            analyze_uast(path, row.content.decode(errors="ignore"),
                         bblfsh.Node.FromString(row.uast), internal_types,
                         roles, reserved)
        except DecodeError as e:
            log.warning(e)
        except:  # noqa: E722
            log.exception("Parsing %s", row.path)
            errors = True
        finally:
            with progress_lock:
                progress.disable = False  # this is needed, do not remove
                progress.update(1)

    try:
        if args.parquet:
            if not language:
                raise ValueError(
                    "--parquet-language must be specified with --parquet.")
            with progress:
                for filepath in inputs:
                    try:
                        data = pandas.read_parquet(filepath)
                    except:  # noqa: E722
                        log.warning("Bad parquet file %s", filepath)
                    else:
                        analyze = partial(analyze_parquet_row,
                                          filepath=filepath)
                        for _, row in data.iterrows():
                            progress.total += 1
                            pool.submit(analyze, row)
                    progress.update(1)
        else:
            with progress:
                for filepath in inputs:
                    pool.submit(analyze_code_file, filepath)
    finally:
        pool.shutdown()
    if errors:
        return 1
    reserved.discard("")
    log.info("Internal types: %d", len(internal_types))
    log.info("UAST roles: %d", len(roles))
    log.info("Reserved: %d", len(reserved))

    roles = {bblfsh.role_name(role_id): n for role_id, n in roles.items()}
    generate_files(args.output, internal_types, roles, reserved)
Example No. 13
 def merge_roles(roles: Iterable[int]) -> str:
     return " | ".join(bblfsh.role_name(r) for r in sorted(roles))
Example No. 14
 def __repr__(self):
     return "[" + " ".join(["{}/{}".format(bblfsh.role_name(role), role) for role in self.roles]) + "]"
Example No. 15
 def named_roles(self):
     return [bblfsh.role_name(role) for role in self.roles]
Example No. 16
 def testRoleIdName(self):
     assert role_id(role_name(1)) == 1
     assert role_name(role_id("IDENTIFIER")) == "IDENTIFIER"