def index(
    *,
    uasts_dir: str,
    instance_file: str,
    configs_dir: str,
    encoder_edge_types: List[str],
    max_length: int,
    log_level: str,
) -> None:
    """Index UASTs with respect to some fields."""
    Config.from_arguments(locals(), ["uasts_dir", "instance_file"], "configs_dir").save(
        Path(configs_dir) / "index.json"
    )
    logger = setup_logging(__name__, log_level)

    uasts_dir_path = Path(uasts_dir).expanduser().resolve()
    instance_file_path = Path(instance_file).expanduser().resolve()

    instance = Instance(
        fields=[
            TypedDGLGraphField(
                name="typed_dgl_graph", type="graph", edge_types=encoder_edge_types
            ),
            MetadataField(name="metadata", type="metadata"),
            BinaryLabelsField(name="label", type="label"),
            IndexesField(name="indexes", type="indexes"),
            InternalTypeField(name="internal_type", type="input"),
            RolesField(name="roles", type="input"),
            LengthField(name="max_length", type="input", max_length=max_length),
        ]
    )

    logger.info("Indexing %s", uasts_dir_path)
    for file_path in uasts_dir_path.rglob("*.asdf"):
        with asdf_open(str(file_path)) as af:
            instance.index(
                {
                    Nodes: Nodes.from_tree(af.tree["nodes"]),
                    CodRepLabel: CodRepLabel.from_tree(af.tree["codrep_label"]),
                    str: af.tree["filepath"],
                }
            )
    instance.save(instance_file_path)
    logger.info("Indexed %s", uasts_dir_path)


def tensorize(
    *,
    uasts_dir: str,
    instance_file: str,
    tensors_dir: str,
    configs_dir: str,
    n_workers: int,
    pickle_protocol: int,
    log_level: str,
) -> None:
    """Tensorize the UASTs."""
    Config.from_arguments(
        locals(), ["uasts_dir", "instance_file", "tensors_dir"], "configs_dir"
    ).save(Path(configs_dir) / "tensorize.json")
    logger = setup_logging(__name__, log_level)

    uasts_dir_path = Path(uasts_dir).expanduser().resolve()
    tensors_dir_path = Path(tensors_dir).expanduser().resolve()

    with bz2_open(instance_file, "rb") as fh:
        instance = pickle_load(fh)

    worker = partial(
        _tensorize_worker,
        instance=instance,
        logger=logger,
        uasts_dir_path=uasts_dir_path,
        output_dir_path=tensors_dir_path,
        pickle_protocol=pickle_protocol,
    )

    logger.info("Tensorizing %s", uasts_dir_path)
    with Pool(n_workers) as pool:
        pool.map(
            worker,
            (p.relative_to(uasts_dir_path) for p in uasts_dir_path.rglob("*.asdf")),
        )
    logger.info("Tensorized %s", uasts_dir_path)


def parse(*, raw_dir: str, uasts_dir: str, configs_dir: str, log_level: str) -> None:
    """Parse a CodRep 2019 dataset into UASTs."""
    Config.from_arguments(locals(), ["raw_dir", "uasts_dir"], "configs_dir").save(
        Path(configs_dir) / "parse.json"
    )
    logger = setup_logging(__name__, log_level)

    raw_dir_path = Path(raw_dir).expanduser().resolve()
    uasts_dir_path = Path(uasts_dir).expanduser().resolve()
    uasts_dir_path.mkdir(parents=True, exist_ok=True)

    parser = JavaParser(split_formatting=True)
    logger.info("Parsing %s", raw_dir_path)

    # The CodRep out.txt file, when present, gives the error offset of each sample.
    labels_file = raw_dir_path / "out.txt"
    extract_labels = labels_file.is_file()
    if extract_labels:
        error_offsets = {}
        with labels_file.open("r", encoding="utf8") as fh_labels:
            for i, line in enumerate(fh_labels):
                error_offsets["%d.txt" % i] = int(line) - 1

    for file_path in raw_dir_path.rglob("*.txt"):
        # Skip the labels file itself (only check when it exists).
        if extract_labels and file_path.samefile(labels_file):
            continue
        file_path_relative = file_path.relative_to(raw_dir_path)
        start = time()
        logger.debug("Parsing %s", file_path_relative)
        try:
            nodes = parser.parse(raw_dir_path, file_path_relative)
        except ParsingException:
            continue
        logger.debug(
            "Parsed %s into %d nodes in %.2fms",
            file_path_relative,
            len(nodes.nodes),
            (time() - start) * 1000,
        )
        error_node_index = None
        if extract_labels:
            error_offset = error_offsets[file_path.name]
            for formatting_i, i in enumerate(nodes.formatting_indexes):
                node = nodes.nodes[i]
                if node.start == error_offset:
                    error_node_index = formatting_i
                    break
            else:
                logger.warning(
                    "Could not retrieve a formatting node for the error at offset %d "
                    "of file %s.",
                    error_offset,
                    file_path.with_suffix("").name,
                )
                continue
        codrep_label = CodRepLabel(
            error_index=error_node_index,
            n_formatting_nodes=len(nodes.formatting_indexes),
        )
        output_subdirectory = uasts_dir_path / file_path_relative.parent
        output_subdirectory.mkdir(parents=True, exist_ok=True)
        output_file = output_subdirectory / file_path.with_suffix(".asdf").name
        with output_file.open("wb") as fh:
            af = AsdfFile(
                dict(
                    nodes=nodes.to_tree(file_path.read_text(encoding="utf-8")),
                    codrep_label=codrep_label.to_tree(),
                    filepath=str(file_path_relative),
                )
            )
            af.write_to(fh, all_array_compression="bzp2")
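

# Hypothetical sketch (not part of the pipeline): inspecting one UAST file written by
# `parse`. The path below is an illustrative assumption; the tree keys ("nodes",
# "codrep_label", "filepath") match the ones read back by `index` above.
def _example_inspect_uast() -> None:
    """Open a single *.asdf file produced by `parse` and rebuild its objects."""
    uast_path = Path("~/codrep/uasts/0.asdf").expanduser()  # assumed output location
    with asdf_open(str(uast_path)) as af:
        nodes = Nodes.from_tree(af.tree["nodes"])
        codrep_label = CodRepLabel.from_tree(af.tree["codrep_label"])
        print(af.tree["filepath"], len(nodes.nodes), codrep_label.n_formatting_nodes)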


def run(
    *,
    raw_dir: str,
    uasts_dir: str,
    instance_file: str,
    tensors_dir: str,
    checkpoint_file: str,
    configs_dir: str,
    training_configs_dir: str,
    prefix: str,
    metadata_dir: Optional[str],
    log_level: str,
) -> None:
    """Run the model and output CodRep predictions."""
    arguments = locals()
    configs_dir_path = Path(configs_dir).expanduser().resolve()
    configs_dir_path.mkdir(parents=True, exist_ok=True)
    training_configs_dir_path = Path(training_configs_dir).expanduser().resolve()
    tensors_dir_path = Path(tensors_dir).expanduser().resolve()
    Config.from_arguments(
        arguments, ["instance_file", "checkpoint_file"], "configs_dir"
    ).save(configs_dir_path / "train.json")
    logger = setup_logging(__name__, log_level)

    training_configs = {}
    for step in ["parse", "tensorize", "train"]:
        with (training_configs_dir_path / step).with_suffix(".json").open(
            "r", encoding="utf8"
        ) as fh:
            training_configs[step] = json_load(fh)

    parse(
        raw_dir=raw_dir,
        uasts_dir=uasts_dir,
        configs_dir=configs_dir,
        log_level=log_level,
    )
    tensorize(
        uasts_dir=uasts_dir,
        instance_file=instance_file,
        tensors_dir=tensors_dir,
        configs_dir=configs_dir,
        n_workers=training_configs["tensorize"]["options"]["n_workers"],
        pickle_protocol=training_configs["tensorize"]["options"]["pickle_protocol"],
        log_level=log_level,
    )

    dataset = CodRepDataset(input_dir=tensors_dir_path)
    logger.info("Dataset of size %d", len(dataset))

    with bz2_open(instance_file, "rb") as fh_instance:
        instance = pickle_load(fh_instance)

    model = build_model(
        instance=instance,
        model_decoder_type=training_configs["train"]["options"]["model_decoder_type"],
        model_encoder_iterations=training_configs["train"]["options"][
            "model_encoder_iterations"
        ],
        model_encoder_output_dim=training_configs["train"]["options"][
            "model_encoder_output_dim"
        ],
        model_encoder_message_dim=training_configs["train"]["options"][
            "model_encoder_message_dim"
        ],
        model_learning_rate=training_configs["train"]["options"]["model_learning_rate"],
        model_batch_size=training_configs["train"]["options"]["model_batch_size"],
        train_dataset=dataset,
        eval_dataset=None,
        test_dataset=None,
    )
    # The model needs a forward to be completely initialized.
    model.training_step(instance.collate([dataset[0]]), 0)
    logger.info("Configured model %s", model)

    model.load_state_dict(
        torch_load(checkpoint_file, map_location="cpu")["model_state_dict"]
    )
    model.eval()
    logger.info("Loaded model parameters from %s", checkpoint_file)

    metadata = None if metadata_dir is None else model.build_metadata()
    metadata_output = (
        None if metadata_dir is None else Path(metadata_dir) / "metadata.json"
    )

    dataloader = model.train_dataloader()
    graph_field = instance.get_field_by_type("graph")
    label_field = instance.get_field_by_type("label")
    indexes_field = instance.get_field_by_type("indexes")
    metadata_field = instance.get_field_by_type("metadata")
    graph_input_fields = instance.get_fields_by_type("input")
    graph_input_dimensions = [48, 48, 32]
    feature_names = [field.name for field in graph_input_fields]

    with no_grad():
        for batch in dataloader:
            graph, etypes = batch[graph_field.name]
            features = [batch[field_name] for field_name in feature_names]
            indexes, offsets = batch[indexes_field.name].indexes
            forward = model.forward(graph, etypes, features, indexes)
            model.decode(
                batched_graph=graph,
                indexes=indexes,
                offsets=offsets,
                forward=forward,
                paths=batch[metadata_field.name],
                prefix=prefix,
                metadata=metadata,
            )

    if metadata_output is not None:
        with metadata_output.open("w", encoding="utf8") as fh:
            json_dump(metadata, fh)
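

# Hypothetical inference sketch: how `run` could be invoked on a CodRep test set,
# reusing the configs and checkpoint produced by a previous training run. Every path
# below is an illustrative assumption, not a documented layout.
def _example_run_usage() -> None:
    """Sketch of a `run` invocation with made-up paths."""
    run(
        raw_dir="~/datasets/codrep-2019/Datasets/Testing",
        uasts_dir="~/codrep/test-uasts",
        instance_file="~/codrep/instance.pickle.bz2",
        tensors_dir="~/codrep/test-tensors",
        checkpoint_file="~/codrep/train/checkpoint.ckpt",  # assumed checkpoint name
        configs_dir="~/codrep/run-configs",
        training_configs_dir="~/codrep/configs",
        prefix="",  # passed through to model.decode for the predictions output
        metadata_dir=None,
        log_level="INFO",
    )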


def train(
    *,
    instance_file: str,
    tensors_dir: str,
    train_dir: str,
    configs_dir: str,
    model_encoder_iterations: int,
    model_encoder_output_dim: int,
    model_encoder_message_dim: int,
    model_decoder_type: str,
    model_learning_rate: float,
    model_batch_size: int,
    trainer_epochs: int,
    trainer_eval_every: int,
    trainer_limit_epochs_at: Optional[int],
    trainer_train_eval_split: float,
    trainer_selection_metric: str,
    trainer_kept_checkpoints: int,
    trainer_cuda: Optional[int],
    log_level: str,
) -> None:
    """Run the training."""
    Config.from_arguments(
        locals(), ["instance_file", "tensors_dir", "train_dir"], "configs_dir"
    ).save(Path(configs_dir) / "train.json")
    logger = setup_logging(__name__, log_level)

    tensors_dir_path = Path(tensors_dir).expanduser().resolve()
    train_dir_path = Path(train_dir).expanduser().resolve()
    train_dir_path.mkdir(parents=True, exist_ok=True)

    with bz2_open(instance_file, "rb") as fh:
        instance = pickle_load(fh)

    dataset = CodRepDataset(input_dir=tensors_dir_path)
    logger.info("Dataset of size %d", len(dataset))

    train_length = round(0.9 * len(dataset))
    eval_length = round(0.05 * len(dataset))
    test_length = len(dataset) - train_length - eval_length
    train_dataset, eval_dataset, test_dataset = random_split(
        dataset, [train_length, eval_length, test_length]
    )

    if trainer_cuda is not None:
        if not cuda_is_available():
            raise RuntimeError("CUDA is not available on this system.")
        device = torch_device("cuda:%d" % trainer_cuda)
    else:
        device = torch_device("cpu")

    model = build_model(
        instance=instance,
        model_encoder_iterations=model_encoder_iterations,
        model_encoder_output_dim=model_encoder_output_dim,
        model_encoder_message_dim=model_encoder_message_dim,
        model_decoder_type=model_decoder_type,
        model_learning_rate=model_learning_rate,
        model_batch_size=model_batch_size,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        test_dataset=test_dataset,
    )
    # The model needs a forward to be completely initialized.
    model.training_step(instance.collate([dataset[0]]), 0)
    logger.info("Configured model %s", model)

    checkpoint_callback = ModelCheckpoint(
        filepath=train_dir,
        save_best_only=True,
        verbose=True,
        monitor="eval_mrr",
        mode="max",
        prefix="",
    )
    trainer = Trainer(
        default_save_path=train_dir, checkpoint_callback=checkpoint_callback
    )
    trainer.fit(model)
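

# Hypothetical end-to-end training sketch. `run` above already chains parse and
# tensorize for inference; this block shows the analogous training order
# (parse -> index -> tensorize -> train). Every path, edge type and hyperparameter
# value below is an illustrative assumption, not a recommended configuration.
def _example_training_pipeline() -> None:
    """Sketch of the training pipeline with made-up paths and settings."""
    parse(
        raw_dir="~/datasets/codrep-2019/Datasets/Learning",  # assumed dataset location
        uasts_dir="~/codrep/uasts",
        configs_dir="~/codrep/configs",
        log_level="INFO",
    )
    index(
        uasts_dir="~/codrep/uasts",
        instance_file="~/codrep/instance.pickle.bz2",
        configs_dir="~/codrep/configs",
        encoder_edge_types=["child", "parent"],  # assumed edge types
        max_length=128,  # assumed truncation length
        log_level="INFO",
    )
    tensorize(
        uasts_dir="~/codrep/uasts",
        instance_file="~/codrep/instance.pickle.bz2",
        tensors_dir="~/codrep/tensors",
        configs_dir="~/codrep/configs",
        n_workers=4,
        pickle_protocol=4,
        log_level="INFO",
    )
    train(
        instance_file="~/codrep/instance.pickle.bz2",
        tensors_dir="~/codrep/tensors",
        train_dir="~/codrep/train",
        configs_dir="~/codrep/configs",
        model_encoder_iterations=8,
        model_encoder_output_dim=128,
        model_encoder_message_dim=128,
        model_decoder_type="ff",  # assumed decoder identifier
        model_learning_rate=1e-3,
        model_batch_size=10,
        trainer_epochs=10,
        trainer_eval_every=1,
        trainer_limit_epochs_at=None,
        trainer_train_eval_split=0.9,
        trainer_selection_metric="eval_mrr",
        trainer_kept_checkpoints=1,
        trainer_cuda=None,
        log_level="INFO",
    )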