def create_from_dataset():
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("dataset_path", type=str, help="Path to the directory with common_nodes.bz2, common_filecontent.bz2 and common_offsets.bz2")
    parser.add_argument("output_path", type=str, help="Path where the processed dataset is stored")
    parser.add_argument("--format", "-f", dest="format", default="jsonl", help="jsonl|csv")
    parser.add_argument("--remove_default", action="store_true", default=False)
    args = parser.parse_args()

    # module-level flag consumed by processing helpers
    global remove_default
    remove_default = args.remove_default

    node_maps = get_node_maps(unpersist(join(args.dataset_path, "common_nodes.bz2")))
    filecontent = get_filecontent_maps(unpersist(join(args.dataset_path, "common_filecontent.bz2")))
    offsets = group_offsets(unpersist(join(args.dataset_path, "common_offsets.bz2")))

    data = []
    nlp = create_tokenizer("spacy")
    for ind, (f_body, f_offsets) in enumerate(iterate_functions(offsets, node_maps, filecontent)):
        data.append(process_body(nlp, f_body, replacements=f_offsets))

    store(data, args)
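
# Hedged CLI sketch for create_from_dataset(): the script name below is hypothetical,
# the arguments mirror the parser defined above, and the paths are placeholders.
#
#   python create_dataset.py /path/to/dataset /path/to/output --format jsonl --remove_default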
def render_annotations(annotations):
    nlp = create_tokenizer("spacy")
    entries = ""
    for annotation in annotations:
        text, predicted, annotated = annotation
        doc = nlp(text[0])
        entries += entry.format(annotate(doc, predicted), annotate(doc, annotated))
    return html_template.format(entries)
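
# Hedged usage sketch for render_annotations(): the (text, predicted, annotated)
# record layout follows the tuple unpacking above; the helper name, the output
# file name and the sample span values are made up for illustration.
def _render_annotations_example():
    annotations = [
        (
            ["def f(x): return x"],   # text, indexed as text[0] above
            [(6, 7, "Any")],          # predicted spans (hypothetical format)
            [(6, 7, "int")],          # annotated spans (hypothetical format)
        ),
    ]
    html = render_annotations(annotations)
    with open("annotations.html", "w") as sink:
        sink.write(html)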
def process_package(working_directory, global_names=None):
    """
    Find functions with annotations, extract annotation information,
    strip documentation and type annotations.
    :param working_directory: location of package related files
    :param global_names: optional, mapping from global node ids to names
    :return: list of entries in spacy compatible format
    """
    bodies = unpersist_if_present(os.path.join(working_directory, "source_graph_bodies.bz2"))
    if bodies is None:
        return []

    offsets_path = os.path.join(working_directory, "offsets.bz2")
    # offsets store information about spans for nodes referenced in the source code
    if os.path.isfile(offsets_path):
        offsets = unpersist(offsets_path)
    else:
        logging.warning(f"No file with offsets: {offsets_path}")
        offsets = None

    def load_local2global(working_directory):
        local2global = unpersist(os.path.join(working_directory, "local2global_with_ast.bz2"))
        id_maps = dict(zip(local2global['id'], local2global['global_id']))
        return id_maps

    id_maps = load_local2global(working_directory)
    local_names = load_names(os.path.join(working_directory, "nodes_with_ast.bz2"))

    nlp = create_tokenizer("spacy")

    data = []
    for ind, (_, row) in tqdm(
            enumerate(bodies.iterrows()), total=len(bodies), leave=True,
            desc=os.path.basename(working_directory)):
        body = row['body']
        if offsets is not None:
            graph_node_spans = offsets_for_func(offsets, body, row["id"])
        else:
            graph_node_spans = []

        entry = process_body(nlp, body, replacements=graph_node_spans)
        if entry is not None:
            entry = to_global_ids(entry, id_maps, global_names, local_names)
            data.append(entry)

    return data
def source_code_graph_alignment(source_codes, node_spans, tokenizer="codebert"):
    supported_tokenizers = ["spacy", "codebert"]
    assert tokenizer in supported_tokenizers, \
        f"Only these tokenizers supported for alignment: {supported_tokenizers}"

    nlp = create_tokenizer(tokenizer)

    for code, spans in zip(source_codes, node_spans):
        yield align_tokens_with_graph(nlp(code), resolve_self_collisions2(spans), tokenzer_name=tokenizer)
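
# Hedged usage sketch for source_code_graph_alignment(): `functions` and `spans`
# are hypothetical parallel lists of source snippets and their graph-node span
# annotations; the generator yields one aligned function at a time.
def _alignment_example(functions, spans):
    for aligned in source_code_graph_alignment(functions, spans, tokenizer="codebert"):
        print(aligned)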
def __init__(
        self, data, batch_size: int, seq_len: int, wordmap: Dict[str, int], *,
        graphmap: Optional[Dict[str, int]], tagmap: Optional[TagMap] = None,
        mask_unlabeled_declarations=True, class_weights=False, element_hash_size=1000,
        len_sort=True, tokenizer="spacy", no_localization=False
):
    self.create_cache()

    self.data = sorted(data, key=lambda x: len(x[0])) if len_sort else data
    self.batch_size = batch_size
    self.seq_len = seq_len
    self.class_weights = None
    self.mask_unlabeled_declarations = mask_unlabeled_declarations
    self.tokenizer = tokenizer
    if tokenizer == "codebert":
        self.vocab = spacy.blank("en").vocab
    self.no_localization = no_localization
    self.nlp = create_tokenizer(tokenizer)

    if tagmap is None:
        self.tagmap = tag_map_from_sentences(list(zip(*[self.prepare_sent(sent) for sent in data]))[1])
    else:
        self.tagmap = tagmap

    self.graphpad = len(graphmap) if graphmap is not None else None
    self.wordpad = len(wordmap)
    self.tagpad = self.tagmap["O"]
    self.prefpad = element_hash_size
    self.suffpad = element_hash_size

    self.graphmap_func = (lambda g: graphmap.get(g, len(graphmap))) if graphmap is not None else None
    self.wordmap_func = lambda w: wordmap.get(w, len(wordmap))
    self.tagmap_func = lambda t: self.tagmap.get(t, self.tagmap["O"])
    self.prefmap_func = lambda w: token_hasher(w[:3], element_hash_size)
    self.suffmap_func = lambda w: token_hasher(w[-3:], element_hash_size)

    self.mask_unlblpad = 1.
    if mask_unlabeled_declarations:
        self.mask_unlbl_func = lambda t: 1 if t == "O" else 0
    else:
        self.mask_unlbl_func = lambda t: 1.

    self.classwpad = 1.
    if class_weights:
        self.class_weights = ClassWeightNormalizer()
        self.class_weights.init(list(zip(*[self.prepare_sent(sent) for sent in data]))[1])
        self.classw_func = lambda t: self.class_weights.get(t, self.classwpad)
    else:
        self.classw_func = lambda t: 1.
def test_SpacyPythonBpe():
    from SourceCodeTools.nlp import create_tokenizer
    nlp = create_tokenizer("spacy_bpe", bpe_path="/Users/LTV/Dropbox (Personal)/sentencepiece_bpe.model")
    # code = """    def method2(self) :
    #         variable1 = self.field
    #         variable2 = str(variable1)
    #         return variable2"""
    code = """    def method2(self) :
        variable1 = self.field
        variable2 = str(variable1)
        return variable2"""
    doc = nlp(code)
    assert str(doc) == code
    print(doc)
def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", "-m", dest="model_path", default=None)
    parser.add_argument("--vectors", "-v", dest="vectors", default=None)
    parser.add_argument("data_path")
    parser.add_argument("--output_model", "-o", dest="output_model", default="spacy-typing-ner")
    parser.add_argument("--epochs", "-e", dest="epochs", default=90, type=int)
    parser.add_argument("--seed", "-s", dest="seed", default=42, type=int, help="Seed for random dataset split")
    parser.add_argument("--bpe", dest="bpe", default=None, type=str, help="Path to a BPE model used by the spacy_bpe tokenizer")
    args = parser.parse_args()

    train_data, test_data = read_data(
        open(args.data_path, "r").readlines(), include_only="categories", random_seed=args.seed
    )

    if args.model_path is not None:
        model = spacy.load(args.model_path)
    else:
        if args.vectors is not None:
            model = create_tokenizer("spacy_bpe", bpe_path=args.bpe)
            add_vectors(model, args.vectors)
        else:
            raise Exception(
                "You should provide either an initialized spacy model or pretrained vectors"
            )

    train_spacy_categorizer(train_data, test_data, model=model, output_dir=args.output_model, n_iter=args.epochs)
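
# Hedged CLI sketch for the categorizer training entry point above: the script
# name and the input file names are hypothetical placeholders; the flags mirror
# the parser defined in main().
#
#   python train_categorizer.py annotations.jsonl --vectors vectors.txt \
#       --bpe sentencepiece_bpe.model --output_model spacy-typing-ner --epochs 90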
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("bodies")
    parser.add_argument("bpe_path")
    parser.add_argument("--num_layers", default=8, type=int)
    args = parser.parse_args()

    bodies = unpersist(args.bodies)
    bpe = create_tokenizer(type="bpe", bpe_path=args.bpe_path)
    mention_tokenizer = MentionTokenizer(args.bpe_path, create_subword_instances=True, connect_subwords=False)

    lengths_tr = {}
    lengths_gnn = {}
    ratio = []

    for body in tqdm(bodies["body"]):
        if not has_valid_syntax(body):
            continue

        n_tokens = compute_transformer_passings(body, bpe)
        n_edges = compute_gnn_passings(body, mention_tokenizer)

        if n_tokens not in lengths_tr:
            lengths_tr[n_tokens] = []
        if n_tokens not in lengths_gnn:
            lengths_gnn[n_tokens] = []

        lengths_tr[n_tokens].append(n_tokens ** 2 * args.num_layers)
        lengths_gnn[n_tokens].append(n_edges)  # * args.num_layers)
        ratio.append((n_tokens, n_edges))

    for key in lengths_tr:
        data_tr = np.array(lengths_tr[key])
        data_gnn = np.array(lengths_gnn[key])
        lengths_tr[key] = np.mean(data_tr)  # , np.std(data_tr))
        lengths_gnn[key] = np.mean(data_gnn)  # , np.std(data_gnn))

    data_ratios = np.array(ratio)
    plt.plot(data_ratios[:, 0], data_ratios[:, 1], "*")
    plt.xlabel("Number of Tokens")
    plt.ylabel("Number of Edges")
    plt.savefig("tokens_edges.png")
    plt.close()

    plt.hist(data_ratios[:, 1] / data_ratios[:, 0], bins=20)
    plt.xlabel("Number of edges / Number of tokens")
    plt.savefig("ratio.png")
    plt.close()

    ratio = data_ratios[:, 1] / data_ratios[:, 0]
    ratio = (np.mean(ratio), np.std(ratio))

    plt.plot(list(lengths_tr.keys()), np.log10(np.array(list(lengths_tr.values()))), "*")
    plt.plot(list(lengths_gnn.keys()), np.log10(np.array(list(lengths_gnn.values()))), "*")
    plt.plot(list(lengths_gnn.keys()), np.log10(np.array(list(lengths_gnn.values())) * args.num_layers), "*")
    plt.legend([
        f"Transformer {args.num_layers} layers",
        "GNN L layers",
        f"GNN L*{args.num_layers} layers"
    ])
    plt.xlabel("Number of Tokens")
    plt.ylabel("log10(Number of Message Exchanges)")
    plt.savefig("avg_passings.png")
    plt.close()
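
# Hedged CLI sketch for the message-passing comparison script above: the script
# name is hypothetical and the file names are placeholders for the stored bodies
# table and the sentencepiece BPE model expected by the parser.
#
#   python compare_passings.py source_graph_bodies.bz2 sentencepiece_bpe.model --num_layers 8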
import pickle
import sys
import json
import re

import spacy
from spacy.gold import biluo_tags_from_offsets

from SourceCodeTools.nlp import create_tokenizer

nlp = create_tokenizer("spacy")

TRAIN_DATA = []

with open(sys.argv[1], "r") as data:
    for line in data:
        entry = json.loads(line)
        TRAIN_DATA.append([entry['text'], {'entities': entry['ents']}])
        TRAIN_DATA[-1][1]['entities'] = [
            (int(e[0]), int(e[1]), e[2]) for e in TRAIN_DATA[-1][1]['entities']
        ]

for text, ent in TRAIN_DATA:
    doc = nlp(text)
    entities = ent['entities']
    tags = biluo_tags_from_offsets(doc, entities)
    for token, tag in zip(doc, tags):
        print(token.text, tag, sep="\t")
    print("\t")

# TODO
# filter valid
# if text.startswith("def format_percentiles("):
#     print("-" in tags)
#     print(tags)
#     print(entities)
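
# Hedged example of one input line expected by this script (the field names
# 'text' and 'ents' come from the parsing above; the concrete values are
# illustrative only):
#   {"text": "def f(x): return x", "ents": [[6, 7, "arg"]]}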