    def __init__(self, bpe_model, vocab, *args, **kwargs):
        super(SpacyPythonBpe, self).__init__(vocab, *args, **kwargs)

        from SourceCodeTools.nlp.embed.bpe import load_bpe_model, make_tokenizer

        self.bpe_tokenizer = make_tokenizer(load_bpe_model(bpe_model))
        self.__vocab = vocab
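
    # A minimal usage sketch, assuming a trained sentencepiece model at `bpe_path`
    # (the same wiring appears in `create_tokenizer("spacy_bpe", ...)` further down):
    #
    #     import spacy
    #     nlp = spacy.blank("en")
    #     nlp.tokenizer = SpacyPythonBpe(bpe_path, nlp.vocab)
    #     doc = nlp("def f(x): return x + 1")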
    def __init__(self, bpe_tokenizer_path, create_subword_instances,
                 connect_subwords):
        from SourceCodeTools.nlp.embed.bpe import make_tokenizer
        from SourceCodeTools.nlp.embed.bpe import load_bpe_model

        self.bpe = make_tokenizer(load_bpe_model(bpe_tokenizer_path)) \
            if bpe_tokenizer_path else None
        self.create_subword_instances = create_subword_instances
        self.connect_subwords = connect_subwords
    def init_subwords(self, elements, num_buckets, max_len):
        from SourceCodeTools.nlp.embed.bpe import load_bpe_model, make_tokenizer
        tokenize = make_tokenizer(load_bpe_model(self.tokenizer_path))

        names = elements['dst']
        reprs = names.map(tokenize) \
            .map(lambda tokens: (token_hasher(t, num_buckets) for t in tokens)) \
            .map(lambda int_tokens: np.fromiter(int_tokens, dtype=np.int32))\
            .map(lambda parts: create_fixed_length(parts, max_len, 0))

        self.name2repr = dict(zip(names, reprs))

        self.embed = nn.Embedding(num_buckets, self.emb_size, padding_idx=0)
        self.norm = nn.LayerNorm(self.emb_size)
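
    # A hypothetical sketch of the two helpers referenced above; these are not the
    # SourceCodeTools implementations, only stand-ins consistent with how they are
    # called (`token_hasher(t, num_buckets)` and `create_fixed_length(parts, max_len, 0)`):
    #
    #     import hashlib
    #     import numpy as np
    #
    #     def token_hasher(token, num_buckets):
    #         # map a subword to a stable bucket id in [0, num_buckets)
    #         digest = hashlib.md5(token.encode("utf-8")).hexdigest()
    #         return int(digest, 16) % num_buckets
    #
    #     def create_fixed_length(parts, max_len, pad_value):
    #         # truncate or right-pad an int array to exactly `max_len` entries
    #         fixed = np.full((max_len,), pad_value, dtype=np.int32)
    #         length = min(len(parts), max_len)
    #         fixed[:length] = parts[:length]
    #         return fixed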
    def instantiate(self, nodes, orig_edges, **kwargs):
        edges = orig_edges.copy()
        if "type_backup" in edges.columns:
            type_col = "type_backup"
        elif "type" in edges.columns:
            type_col = "type"
        else:
            raise Exception("Column `type` or `backup_type` not found")

        from SourceCodeTools.nlp.embed.bpe import load_bpe_model, make_tokenizer
        tokenize = make_tokenizer(load_bpe_model(kwargs['tokenizer_path']))

        subword_nodes = nodes.query("type_backup == 'subword'")
        subword2key = dict(
            zip(subword_nodes["name"],
                zip(subword_nodes["type"], subword_nodes["typed_id"])))

        node2type = dict(zip(nodes["id"], nodes["type"]))
        node2typed_id = dict(zip(nodes["id"], nodes["typed_id"]))

        node2name = kwargs["node2name"]
        node2name.eval("src_type = src.map(@node2type.get)",
                       local_dict={"node2type": node2type},
                       inplace=True)
        node2name.eval("src_typed_id = src.map(@node2typed_id.get)",
                       local_dict={"node2typed_id": node2typed_id},
                       inplace=True)

        self.lookup = {}
        for node_type, node_id, var_name in node2name[[
                "src_type", "src_typed_id", "dst"
        ]].values:
            key = (node_type, node_id)
            subwords = tokenize(var_name)
            if key not in self.lookup:
                self.lookup[key] = []

            # TODO
            #  Some subwords did not appear in the list of known subwords. Although this is not an issue,
            #  this can indicate that variable names are not extracted correctly. Need to verify.
            self.lookup[key].extend(
                [subword2key[sub] for sub in subwords if sub in subword2key])
Example #5
    def _op_tokens(self):
        if self.tokenizer_path is None:
            from SourceCodeTools.code.python_tokens_to_bpe_subwords import python_ops_to_bpe
            logging.info("Using heuristic tokenization for ops")

            # def op_tokenize(op_name):
            #     return python_ops_to_bpe[op_name] if op_name in python_ops_to_bpe else None
            return python_ops_to_bpe
        else:
            # from SourceCodeTools.code.python_tokens_to_bpe_subwords import op_tokenize_or_none

            tokenizer = make_tokenizer(load_bpe_model(self.tokenizer_path))

            # def op_tokenize(op_name):
            #     return op_tokenize_or_none(op_name, tokenizer)

            from SourceCodeTools.code.python_tokens_to_bpe_subwords import python_ops_to_literal
            return {
                op_name: tokenizer(op_literal)
                for op_name, op_literal in python_ops_to_literal.items()
            }
    def init_subwords(self, elements, num_buckets, max_len):
        from SourceCodeTools.nlp.embed.bpe import load_bpe_model, make_tokenizer
        self.tokenizer = load_bpe_model(self.tokenizer_path)
        tokenize = make_tokenizer(self.tokenizer)
        self.vocab = Vocabulary()
        self.num_buckets = self.tokenizer.vocab_size()
        self.pad_id = self.vocab.pad_id

        docs = self.elements['dst']
        tokens = docs.map(lambda text: ["<pad>"] + tokenize(text) + ["</s>"])
        lengths = tokens.map(lambda tokens: min(len(tokens) - 1, max_len))  # the pad will go away
        for doc in tokens:
            self.vocab.add(doc)
        self.num_buckets = len(self.vocab)

        reprs = tokens \
            .map(lambda tokens: map(lambda token: self.vocab[token], tokens))\
            .map(lambda int_tokens: np.fromiter(int_tokens, dtype=np.int32))\
            .map(lambda parts: create_fixed_length(parts, max_len, self.pad_id))

        self.id2repr = dict(zip(self.elements["id"], reprs))
        self.id2len = dict(zip(self.elements["id"], lengths))
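
    # The `Vocabulary` interface assumed above, sketched from its usage in this
    # method (a hypothetical stand-in, not the actual SourceCodeTools class):
    #
    #     class Vocabulary:
    #         def __init__(self):
    #             self._token2id = {"<pad>": 0}
    #             self.pad_id = 0
    #
    #         def add(self, tokens):
    #             # register each token from an iterable, assigning fresh integer ids
    #             for token in tokens:
    #                 if token not in self._token2id:
    #                     self._token2id[token] = len(self._token2id)
    #
    #         def __getitem__(self, token):
    #             return self._token2id[token]
    #
    #         def __len__(self):
    #             return len(self._token2id)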
Example #7
    def __init__(self, corpus, tokenizer, output_path):
        self.corpus = corpus
        self.output_path = output_path

        if tokenizer == "default":
            logging.info("Loading regex tokenizer")
            from nltk import RegexpTokenizer
            tok = RegexpTokenizer("[\\w]+|[^\\w\\s]|[0-9]+")

            def tokenize(text):
                return tok.tokenize(text)

            self.tokenize = tokenize
        elif os.path.isfile(tokenizer):
            logging.info("Loading bpe tokenizer")
            from SourceCodeTools.nlp.embed.bpe import load_bpe_model, make_tokenizer
            self.tokenize = make_tokenizer(load_bpe_model(tokenizer))
        else:
            raise ValueError(
                f"`tokenizer` should be 'default' or a path to a bpe model, got: {tokenizer}")

        self.dictionary = self.load_dictionary()

        self.corpus_mm = self.load_corpus()
        self.train, self.test = train_test_split(self.corpus_mm, test_size=0.1)
from SourceCodeTools.nlp.embed.bpe import load_bpe_model, make_tokenizer
from SourceCodeTools.nlp.embed.fasttext import train_wor2vec
import argparse

parser = argparse.ArgumentParser(description='Train word vectors')
parser.add_argument('tokenizer_model_path',
                    type=str,
                    default=None,
                    help='Path to sentencepiece tokenizer model')
parser.add_argument('input_file',
                    type=str,
                    default=None,
                    help='Path to corpus')
parser.add_argument('output_dir',
                    type=str,
                    default=None,
                    help='Output saving directory')
parser.add_argument('--emb_size', type=int, default=100, help='Embedding size')
args = parser.parse_args()

train_wor2vec(corpus_path=args.input_file,
              output_path=args.output_dir,
              tokenizer=make_tokenizer(
                  load_bpe_model(args.tokenizer_model_path)),
              emb_size=args.emb_size)
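
# Example invocation, assuming this script is saved as `train_word_vectors.py`
# (the file name and paths are illustrative):
#
#     python train_word_vectors.py bpe.model corpus.txt embeddings_out --emb_size 100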
Example #9
def _get_from_ast(bodies,
                  node_resolver,
                  bpe_tokenizer_path=None,
                  create_subword_instances=True,
                  connect_subwords=False):
    ast_edges = None

    bodies_with_replacements = {}

    subword_tokenizer = make_tokenizer(load_bpe_model(bpe_tokenizer_path)) \
        if bpe_tokenizer_path else None

    tokenizer = RegexpTokenizer("\w+|[^\w\s]")

    for ind_bodies, (_, row) in custom_tqdm(enumerate(bodies.iterrows()),
                                            message="Extracting AST edges",
                                            total=len(bodies)):
        orig_body = row['body_with_random_replacements']
        if not isinstance(orig_body, str):
            continue

        srctrl2original = get_srctrl2original_replacements(row)

        c = orig_body.lstrip()
        strip_len = len(orig_body) - len(c)

        try:
            ast.parse(c)
        except SyntaxError as e:
            print(e)
            continue

        replacements = row['random_2_srctrl']

        g = AstGraphGenerator(c)

        edges = g.get_edges()

        if len(edges) == 0:
            continue

        # replacements_lookup = lambda x: complex_replacement_lookup(x, replacements)
        replacements_lookup = lambda x: \
            GNode(name=random_replacement_lookup(x.name, x.type, replacements, tokenizer),
                  type=x.type) if "@" not in x.name else \
                GNode(name=random_replacement_lookup(x.name.split("@")[0], x.type, replacements, tokenizer) +
                           "@" + x.name.split("@")[1],
                      type=x.type)

        edges['src'] = edges['src'].apply(replacements_lookup)
        edges['dst'] = edges['dst'].apply(replacements_lookup)

        resolve = lambda node: node_resolver.resolve(node, srctrl2original)

        edges['src'] = edges['src'].apply(resolve)
        edges['dst'] = edges['dst'].apply(resolve)

        edges = replace_mentions_with_subword_instances(
            edges,
            subword_tokenizer,
            create_subword_instances=create_subword_instances,
            connect_subwords=connect_subwords)

        resolve_node_id = lambda node: node_resolver.resolve_node_id(
            node, row['id'])

        edges['src'] = edges['src'].apply(resolve_node_id)
        edges['dst'] = edges['dst'].apply(resolve_node_id)

        extract_id = lambda node: node.id
        edges['src'] = edges['src'].apply(extract_id)
        edges['dst'] = edges['dst'].apply(extract_id)

        # edges = edges.append(node_resolver.get_mention_edges())
        edges = edges.drop_duplicates(subset=["src", "dst", "type"])

        edges['id'] = 0

        ast_nodes = resolve_self_collision(
            filter_nodes(
                adjust_offsets(
                    to_offsets(c, get_ast_nodes(edges), as_bytes=True),
                    -strip_len), orig_body))

        srctrl_nodes = list(
            map(
                lambda x: (x[0], x[1],
                           node_resolver.resolve(GNode(name=x[2], type="Name"),
                                                 srctrl2original).global_id),
                to_offsets(row['body_with_random_replacements'],
                           format_replacement_offsets(
                               row['replacement_list']))))

        all_offsets = join_offsets(sorted(ast_nodes, key=lambda x: x[0]),
                                   sorted(srctrl_nodes, key=lambda x: x[0]))

        bodies_with_replacements[row['id']] = all_offsets

        # append_edges(path=edges_with_ast_name, edges=edges)
        edges['mentioned_in'] = row['id']
        ast_edges = append_edges(ast_edges=ast_edges, new_edges=edges)
        # print("\r%d/%d" % (ind_bodies, len(bodies['body_normalized'])), end="")

    # print(" " * 30, end="\r")

    bodies['graph_node_replacements'] = bodies['id'].apply(
        lambda id_: bodies_with_replacements.get(id_, None))

    # write_nodes(path=nodes_with_ast_name, node_resolver=node_resolver)

    # ast_nodes = pd.DataFrame(node_resolver.new_nodes)[['id', 'type', 'serialized_name', 'mentioned_in']].astype(
    #     {'mentioned_in': 'Int32'}
    # )

    ast_edges = ast_edges.append(node_resolver.get_mention_edges())
    ast_edges['id'] = 0

    ast_nodes = node_resolver.new_nodes_for_write()
    ast_edges = ast_edges.rename(
        {
            'src': 'source_node_id',
            'dst': 'target_node_id'
        }, axis=1).astype({'mentioned_in': 'Int32'})

    # assert leaf_nodes_are_leaf_types(ast_nodes, ast_edges)
    leaf_nodes_are_leaf_types(ast_nodes, ast_edges)

    return ast_nodes, ast_edges, bodies
Example #10
def create_tokenizer(type, bpe_path=None, regex=None):
    """
    Create tokenizer instance. Usage

    ```
    tok = create_tokenizer("spacy") # create spacy doc
    tokens = tok("string for tokenization")

    ...

    tok = create_tokenizer("bpe") # create list of tokens
    tokens = tok("string for tokenization")
    ```

    :param type: tokenizer type, one of [spacy|spacy_bpe|regex|bpe|codebert]. Spacy creates a blank English
        tokenizer with additional tokenization rules. Regex tokenizer is a simple tokenizer from nltk that uses
        the regular expression `[\w]+|[^\w\s]|[0-9]+`. BPE tokenizer is an instance of a sentencepiece model
        (requires a pretrained model). CodeBERT tokenizer wraps the pretrained `microsoft/codebert-base`
        RobertaTokenizer and returns a spaCy Doc.
    :param bpe_path: Path to a pretrained BPE model. Required for the spacy_bpe and bpe tokenizers.
    :param regex: Override the regular expression for the Regex tokenizer.
    :return: Returns a spacy pipeline (nlp) or a tokenize function.
    """
    if type == "spacy":
        import spacy
        return _inject_tokenizer(spacy.blank("en"))
    elif type == "spacy_bpe":
        import spacy
        nlp = spacy.blank("en")

        if bpe_path is None:
            raise Exception("Specify path for bpe tokenizer model")

        nlp.tokenizer = SpacyPythonBpe(bpe_path, nlp.vocab)
        return nlp
    elif type == "regex":
        from nltk import RegexpTokenizer
        if regex is None:
            regex = "[\w]+|[^\w\s]|[0-9]+"
        _tokenizer = RegexpTokenizer(regex)

        def default_tokenizer(text):
            return _tokenizer.tokenize(text)

        return default_tokenizer
    elif type == "bpe":
        if bpe_path is None:
            raise Exception("Specify path for bpe tokenizer model")

        from SourceCodeTools.nlp.embed.bpe import load_bpe_model, make_tokenizer

        return make_tokenizer(load_bpe_model(bpe_path))
    elif type == "codebert":
        from transformers import RobertaTokenizer
        import spacy
        from spacy.tokens import Doc

        tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
        nlp = spacy.blank("en")

        def tokenize(text):
            tokens = tokenizer.tokenize(text)
            doc = Doc(nlp.vocab, tokens, spaces=[False] * len(tokens))
            return doc

        return tokenize
    else:
        raise Exception("Supported tokenizer types: spacy, regex, bpe")