def create_from_dataset():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("dataset_path", type=str, help="")
    parser.add_argument("output_path", type=str, help="")
    parser.add_argument("--format",
                        "-f",
                        dest="format",
                        default="jsonl",
                        help="jsonl|csv")
    parser.add_argument("--remove_default", action="store_true", default=False)

    args = parser.parse_args()

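    # expose the CLI flag as a module-level global for helpers defined elsewhere in this module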
    global remove_default
    remove_default = args.remove_default

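    # load node maps, file contents, and per-function offset spans from the dataset tables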
    node_maps = get_node_maps(
        unpersist(join(args.dataset_path, "common_nodes.bz2")))
    filecontent = get_filecontent_maps(
        unpersist(join(args.dataset_path, "common_filecontent.bz2")))
    offsets = group_offsets(
        unpersist(join(args.dataset_path, "common_offsets.bz2")))

    data = []
    nlp = create_tokenizer("spacy")

    for f_body, f_offsets in iterate_functions(offsets, node_maps, filecontent):
        data.append(process_body(nlp, f_body, replacements=f_offsets))

    store(data, args)
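The `store` helper called above is not shown on this page; the following is only a sketch of how it might consume `args.format` and `args.output_path`, not the library's actual implementation.
# Sketch only: the real `store` ships with the library and may differ.
def store_sketch(data, args):
    import csv
    import json

    if args.format == "jsonl":
        # one JSON record per line
        with open(args.output_path, "w") as sink:
            for record in data:
                sink.write(json.dumps(record) + "\n")
    elif args.format == "csv":
        with open(args.output_path, "w", newline="") as sink:
            csv.writer(sink).writerows(data)
    else:
        raise ValueError(f"Unsupported format: {args.format}")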
def render_annotations(annotations):
    nlp = create_tokenizer("spacy")
    entries = ""
    for annotation in annotations:
        text, predicted, annotated = annotation
        doc = nlp(text[0])
        entries += entry.format(annotate(doc, predicted), annotate(doc, annotated))

    return html_template.format(entries)
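The `entry` and `html_template` strings used by `render_annotations` are module-level templates that do not appear on this page; minimal stand-ins could look like this.
# Hypothetical stand-ins for the templates referenced in render_annotations.
html_template = "<html><body>{}</body></html>"
entry = "<div class='annotation'><pre>{}</pre><pre>{}</pre></div>"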
def process_package(working_directory, global_names=None):
    """
    Find functions with annotations, extract annotation information, strip documentation and type annotations.
    :param working_directory: location of package related files
    :param global_names: optional, mapping from global node ids to names
    :return: list of entries in spacy compatible format
    """
    bodies = unpersist_if_present(
        os.path.join(working_directory, "source_graph_bodies.bz2"))
    if bodies is None:
        return []

    offsets_path = os.path.join(working_directory, "offsets.bz2")

    # offsets store information about spans for nodes referenced in the source code
    if os.path.isfile(offsets_path):
        offsets = unpersist(offsets_path)
    else:
        logging.warning(f"No file with offsets: {offsets_path}")
        offsets = None

    def load_local2global(working_directory):
        local2global = unpersist(
            os.path.join(working_directory, "local2global_with_ast.bz2"))
        id_maps = dict(zip(local2global['id'], local2global['global_id']))
        return id_maps

    id_maps = load_local2global(working_directory)

    local_names = load_names(
        os.path.join(working_directory, "nodes_with_ast.bz2"))

    nlp = create_tokenizer("spacy")

    data = []

    for _, row in tqdm(bodies.iterrows(),
                       total=len(bodies),
                       leave=True,
                       desc=os.path.basename(working_directory)):
        body = row['body']

        if offsets is not None:
            graph_node_spans = offsets_for_func(offsets, body, row["id"])
        else:
            graph_node_spans = []

        entry = process_body(nlp, body, replacements=graph_node_spans)

        if entry is not None:
            entry = to_global_ids(entry, id_maps, global_names, local_names)
            data.append(entry)

    return data
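`process_package` is typically invoked once per package directory; below is a hedged sketch of such a driver (the directory layout and the `packages_root` argument are assumptions for illustration only).
def collect_packages_sketch(packages_root, global_names=None):
    # Aggregate spacy-format entries from every package folder under packages_root.
    dataset = []
    for name in sorted(os.listdir(packages_root)):
        package_dir = os.path.join(packages_root, name)
        if os.path.isdir(package_dir):
            dataset.extend(process_package(package_dir, global_names=global_names))
    return dataset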
def source_code_graph_alignment(source_codes,
                                node_spans,
                                tokenizer="codebert"):
    supported_tokenizers = ["spacy", "codebert"]
    assert tokenizer in supported_tokenizers, f"Only these tokenizers supported for alignment: {supported_tokenizers}"
    nlp = create_tokenizer(tokenizer)

    for code, spans in zip(source_codes, node_spans):
        yield align_tokens_with_graph(nlp(code),
                                      resolve_self_collisions2(spans),
                                      tokenzer_name=tokenizer)
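An illustrative call to the generator above; the toy snippet and the empty span list are assumptions, and the structure of each yielded alignment depends on `align_tokens_with_graph`.
toy_codes = ["def add(a, b):\n    return a + b"]
toy_spans = [[]]  # no graph node spans for this toy example
for aligned in source_code_graph_alignment(toy_codes, toy_spans, tokenizer="codebert"):
    print(aligned)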
    def __init__(
            self, data, batch_size: int, seq_len: int,
            wordmap: Dict[str, int], *, graphmap: Optional[Dict[str, int]], tagmap: Optional[TagMap] = None,
            mask_unlabeled_declarations=True,
            class_weights=False, element_hash_size=1000, len_sort=True, tokenizer="spacy", no_localization=False
    ):

        self.create_cache()

        self.data = sorted(data, key=lambda x: len(x[0])) if len_sort else data
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.class_weights = None
        self.mask_unlabeled_declarations = mask_unlabeled_declarations
        self.tokenizer = tokenizer
        if tokenizer == "codebert":
            self.vocab = spacy.blank("en").vocab
        self.no_localization = no_localization

        self.nlp = create_tokenizer(tokenizer)
        if tagmap is None:
            self.tagmap = tag_map_from_sentences(list(zip(*[self.prepare_sent(sent) for sent in data]))[1])
        else:
            self.tagmap = tagmap

        self.graphpad = len(graphmap) if graphmap is not None else None
        self.wordpad = len(wordmap)
        self.tagpad = self.tagmap["O"]
        self.prefpad = element_hash_size
        self.suffpad = element_hash_size

        self.graphmap_func = (lambda g: graphmap.get(g, len(graphmap))) if graphmap is not None else None
        self.wordmap_func = lambda w: wordmap.get(w, len(wordmap))
        self.tagmap_func = lambda t: self.tagmap.get(t, self.tagmap["O"])
        self.prefmap_func = lambda w: token_hasher(w[:3], element_hash_size)
        self.suffmap_func = lambda w: token_hasher(w[-3:], element_hash_size)

        self.mask_unlblpad = 1.
        if mask_unlabeled_declarations:
            self.mask_unlbl_func = lambda t: 1 if t == "O" else 0
        else:
            self.mask_unlbl_func = lambda t: 1.

        self.classwpad = 1.
        if class_weights:
            self.class_weights = ClassWeightNormalizer()
            self.class_weights.init(list(zip(*[self.prepare_sent(sent) for sent in data]))[1])
            self.classw_func = lambda t: self.class_weights.get(t, self.classwpad)
        else:
            self.classw_func = lambda t: 1.
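To make the mapping conventions above concrete: unknown tokens fall back to the size of the corresponding vocabulary, and affixes are hashed into a fixed-size table. The toy `toy_wordmap` below is made up; `token_hasher` is the same helper used in the lambdas above.
# Illustration of the vocabulary fallback and affix hashing used above.
toy_wordmap = {"def": 0, "return": 1}
oov_index = len(toy_wordmap)                       # every unseen token maps here
word_id = toy_wordmap.get("lambda", oov_index)     # -> 2
prefix_bucket = token_hasher("lambda"[:3], 1000)   # same scheme as prefmap_func
suffix_bucket = token_hasher("lambda"[-3:], 1000)  # same scheme as suffmap_func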
def test_SpacyPythonBpe():
    from SourceCodeTools.nlp import create_tokenizer
    nlp = create_tokenizer("spacy_bpe", bpe_path="/Users/LTV/Dropbox (Personal)/sentencepiece_bpe.model")

    # code = """    def method2(self) :
    #     variable1 = self.field
    #     variable2 = str(variable1)
    #     return variable2"""

    code = """    def method2(self) :
 
        variable1 = self.field
        variable2 = str(variable1)
        return variable2"""

    doc = nlp(code)

    assert str(doc) == code

    print(doc)
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", "-m", dest="model_path", default=None)
    parser.add_argument("--vectors", "-v", dest="vectors", default=None)
    parser.add_argument("data_path")
    parser.add_argument("--output_model",
                        "-o",
                        dest="output_model",
                        default="spacy-typing-ner")
    parser.add_argument("--epochs", "-e", dest="epochs", default=90, type=int)
    parser.add_argument("--seed",
                        "-s",
                        dest="seed",
                        default=42,
                        type=int,
                        help="Seed for random dataset split")
    parser.add_argument("--bpe", dest="bpe", default=None, type=str, help="")
    args = parser.parse_args()

    with open(args.data_path, "r") as data_file:
        train_data, test_data = read_data(data_file.readlines(),
                                          include_only="categories",
                                          random_seed=args.seed)

    if args.model_path is not None:
        model = spacy.load(args.model_path)
    else:
        if args.vectors is not None:
            model = create_tokenizer("spacy_bpe", bpe_path=args.bpe)
            add_vectors(model, args.vectors)
        else:
            raise Exception(
                "You should provide either an initialized spacy model or pretrained vectors"
            )

    train_spacy_categorizer(train_data,
                            test_data,
                            model=model,
                            output_dir=args.output_model,
                            n_iter=args.epochs)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("bodies")
    parser.add_argument("bpe_path")
    parser.add_argument("--num_layers", default=8, type=int)

    args = parser.parse_args()

    bodies = unpersist(args.bodies)
    bpe = create_tokenizer(type="bpe", bpe_path=args.bpe_path)
    mention_tokenizer = MentionTokenizer(args.bpe_path,
                                         create_subword_instances=True,
                                         connect_subwords=False)

    lengths_tr = {}
    lengths_gnn = {}
    ratio = []

    for body in tqdm(bodies["body"]):
        if not has_valid_syntax(body):
            continue

        n_tokens = compute_transformer_passings(body, bpe)
        n_edges = compute_gnn_passings(body, mention_tokenizer)

        if n_tokens not in lengths_tr:
            lengths_tr[n_tokens] = []
        if n_tokens not in lengths_gnn:
            lengths_gnn[n_tokens] = []

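        # self-attention exchanges messages between every pair of tokens in each layer,
        # hence tokens^2 * num_layers; the GNN count is stored per single layer (one
        # message per edge) and scaled by the layer count when plotting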
        lengths_tr[n_tokens].append(n_tokens**2 * args.num_layers)
        lengths_gnn[n_tokens].append(n_edges)  # * args.num_layers)
        ratio.append((n_tokens, n_edges))

    for key in lengths_tr:
        data_tr = np.array(lengths_tr[key])
        data_gnn = np.array(lengths_gnn[key])

        lengths_tr[key] = np.mean(data_tr)  #, np.std(data_tr))
        lengths_gnn[key] = np.mean(data_gnn)  #, np.std(data_gnn))

    data_ratios = np.array(ratio)

    plt.plot(data_ratios[:, 0], data_ratios[:, 1], "*")
    plt.xlabel("Number of Tokens")
    plt.ylabel("Number of Edges")
    plt.savefig("tokens_edges.png")
    plt.close()

    plt.hist(data_ratios[:, 1] / data_ratios[:, 0], bins=20)
    plt.xlabel("Number of edges / Number of tokens")
    plt.savefig("ratio.png")
    plt.close()

    ratio = data_ratios[:, 1] / data_ratios[:, 0]
    ratio = (np.mean(ratio), np.std(ratio))

    plt.plot(list(lengths_tr.keys()),
             np.log10(np.array(list(lengths_tr.values()))), "*")
    plt.plot(list(lengths_gnn.keys()),
             np.log10(np.array(list(lengths_gnn.values()))), "*")
    plt.plot(list(lengths_gnn.keys()),
             np.log10(np.array(list(lengths_gnn.values())) * args.num_layers),
             "*")
    plt.legend([
        f"Transformer {args.num_layers} layers", "GNN L layers",
        f"GNN L*{args.num_layers} layers"
    ])
    plt.xlabel("Number of Tokens")
    plt.ylabel("log10(Number of Message Exchanges)")
    plt.savefig("avg_passings.png")
    plt.close()
import sys, json
import spacy
from spacy.gold import biluo_tags_from_offsets

from SourceCodeTools.nlp import create_tokenizer

nlp = create_tokenizer("spacy")

TRAIN_DATA = []
with open(sys.argv[1], "r") as data:
    for line in data:
        entry = json.loads(line)
        TRAIN_DATA.append([entry['text'], {'entities': entry['ents']}])
        TRAIN_DATA[-1][1]['entities'] = [
            (int(e[0]), int(e[1]), e[2]) for e in TRAIN_DATA[-1][1]['entities']
        ]

for text, ent in TRAIN_DATA:
    doc = nlp(text)
    entities = ent['entities']
    tags = biluo_tags_from_offsets(doc, entities)
    for token, tag in zip(doc, tags):
        print(token.text, tag, sep="\t")
    print("\t")
    # TODO
    # filter valid
    # if text.startswith("def format_percentiles("):
    #     print("-" in tags)
    #     print(tags)
    #     print(entities)
    #