def _add_per_subtoken_nodes(unsplittable_node_names: Set[str],
                            raw_data: Dict[str, Any]) -> None:
    """Augment the context graph in *raw_data* with one node per subtoken.

    Every splittable node label is broken into subtokens; each distinct
    subtoken gets a fresh node id (allocated after the highest existing id)
    and an edge from every node whose label contains it. Mutates *raw_data*
    in place (adds NodeLabels entries and a USES_SUBTOKEN_EDGE_NAME edge set).
    """
    node_labels = raw_data['ContextGraph']['NodeLabels']

    # Map each subtoken to the set of node ids whose label contains it.
    users_of_subtoken = defaultdict(set)
    highest_id = 0
    for raw_id, label in node_labels.items():
        numeric_id = int(raw_id)
        if numeric_id > highest_id:
            highest_id = numeric_id

        # Skip AST nodes and punctuation:
        if label in unsplittable_node_names:
            continue

        for piece in split_identifier_into_parts(label):
            # Only keep subtokens that contain at least one alphanumeric char.
            if re.search('[a-zA-Z0-9]', piece):
                users_of_subtoken[piece].add(numeric_id)

    # Allocate fresh ids for the subtoken nodes and wire up the edges.
    fresh_id = highest_id
    collected_edges = []
    for piece, user_ids in users_of_subtoken.items():
        fresh_id += 1
        node_labels[str(fresh_id)] = piece
        collected_edges.extend((user_id, fresh_id) for user_id in user_ids)

    raw_data['ContextGraph']['Edges'][USES_SUBTOKEN_EDGE_NAME] = collected_edges
Example #2
0
def process_code(code_string):
    """Tokenize *code_string* and normalize it into lowercase subtokens.

    Special tokens (INDENT/DEDENT/NEWLINE) are kept as-is; docstrings and
    other strings are expanded via process_string_tokes; identifiers are
    split into parts; in-line comments are dropped. Returns the output of
    divide_code_in_logical_lines(subtokens), or None when tokenization
    yields nothing or raises.
    """
    g = _tokenize_code_string(code_string)

    s = []
    prev_tok = ""
    def_tok_seen = False  # to deal with nested functions

    if not g:
        return None
    try:
        for toknum, tokval, _, _, _ in g:
            if toknum == ENCODING or toknum == ENDMARKER:
                continue
            # Prefer the symbolic name for special tokens, else the raw value
            # (single dict lookup; `x if x else y` is just `x or y`).
            tok = spl_tokens.get(toknum) or tokval
            # A STRING right after an INDENT inside a freshly seen `def`
            # is most likely a docstring.
            if (tok.startswith(('"""', "'''")) and prev_tok == spl_tokens[INDENT]
                    and toknum == STRING and def_tok_seen):
                s.extend(process_string_tokes(tok, is_docstr=True))
                prev_tok = tok
                def_tok_seen = False
                continue
            prev_tok = tok
            if tok in (spl_tokens[INDENT], spl_tokens[DEDENT], spl_tokens[NEWLINE]):
                # Structural tokens pass through unsplit.
                s.append(tok.strip().lower())
                continue
            if toknum == STRING:
                s.extend(process_string_tokes(tok))
                continue
            if tok == "def":
                def_tok_seen = True
            toks = split_identifier_into_parts(tok)
            # If the line itself is an in-line comment, skip it entirely.
            if not toks[0].startswith("#"):
                for t in toks:
                    # Drop in-line comments trailing the code (like this one).
                    if not t.startswith("#"):
                        s.append(t.strip().lower())

        # return " ".join(s)
        return divide_code_in_logical_lines(s)
    except Exception as e:
        # Best-effort: tokenizing arbitrary code can fail in many ways;
        # report and signal failure rather than crash the caller.
        print(e)
        return None
Example #3
0
 def run_test(self, identifier: str, expected: List[str]) -> None:
     """Assert that splitting *identifier* produces exactly *expected* parts."""
     self.assertEqual(expected, split.split_identifier_into_parts(identifier))