def save_resource(self, resource: Any) -> None:
    """Pickle a registered resource into the cache directory."""
    if not self._cache_dir:
        raise ResourceException(
            "Trying to save a resource but the cache dir was not set.")
    self._cache_dir.mkdir(parents=True, exist_ok=True)
    with bz2_open(self._resource_cache_path(resource), "wb") as fh:
        pickle_dump(resource, fh)
def save_log(self):
    for i, filename, messages, category in self.conversations:
        if not self._dest:
            continue
        if category == 'log':
            # Columns: time, message type, direction, ID, WeChat ID, nickname, alias, content
            header = ('时刻', '消息类型', '消息方向', 'ID', '微信号', '昵称', '备注名', '内容')
            fname = filename
        elif category == 'contacts':
            # Columns: ID, WeChat ID, nickname, alias, gender, country, province, city, signature
            header = ('ID', '微信号', '昵称', '备注', '性别', '国', '省', '市', '签名')
            fname = path_join(filename, 'contacts')
        elif category == 'group':
            # Columns: ID, WeChat ID, nickname, alias, country, province, city, signature
            header = ('ID', '微信号', '昵称', '备注', '国', '省', '市', '签名')
            fname = filename
        fpath = path_join(self._dest, i, fname)
        try:
            makedirs(dirname(fpath))
        except FileExistsError:
            pass
        encoding = 'utf-8-sig' if self._bom else 'utf8'
        if self._compress:
            fo = bz2_open(fpath + '.csv.bz2', 'wt', encoding=encoding, newline='')
        else:
            fo = open(fpath + '.csv', 'w', encoding=encoding, newline='')
        wt = csv_writer(fo)
        wt.writerow(header)
        wt.writerows(messages)
        fo.close()
def get_resource(self, name: str) -> Any:
    """Get the resource registered under a given name."""
    if name not in self._instance_registry:
        raise ResourceException(f"Resource {name} not found.")
    if self._cache_dir:
        cache_path = self._name_cache_path(name)
        if cache_path.is_file():
            with bz2_open(cache_path, "rb") as fh:
                return pickle_load(fh)
    return self._instance_registry[name]
def _tensorize_worker(self, file_path: Path) -> None:
    self._logger.debug(f"Tensorizing {file_path}")
    with asdf_open(str(self.parse_dir / file_path)) as af:
        nodes_instance = Nodes.from_tree(af.tree["nodes"])
        tensors = self.instance.tensorize(nodes_instance)
    output_dir = (self.tensor_dir / "pickle" / file_path).parent
    output_dir.mkdir(parents=True, exist_ok=True)
    with bz2_open((output_dir / file_path.name).with_suffix(".pickle.bz2"), "wb") as fh:
        dump(tensors, fh, protocol=self.pickle_protocol)
    self._logger.debug(f"Tensorized {file_path}")
def register_resource(self, name: str, resource: Any) -> None:
    """Register a resource under a given name, preferring its cached pickle if present."""
    if name in self._instance_registry:
        raise ResourceException(f"Resource name {name} already in use.")
    if self._cache_dir:
        cache_path = self._name_cache_path(name)
        if cache_path.is_file():
            self._logger.debug(
                f"Using the cached version of the {name} resource")
            with bz2_open(cache_path, "rb") as fh:
                resource = pickle_load(fh)
    self._instance_registry[name] = resource
    self._name_registry[id(resource)] = name
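# Minimal usage sketch for the resource registry methods above (register_resource,
# get_resource, save_resource). The class name ResourceRegistry and its constructor
# signature are hypothetical and only illustrate the intended flow; the three methods
# themselves are from the original code.
registry = ResourceRegistry(cache_dir=Path("/tmp/resource-cache"))  # hypothetical class/ctor
vocabulary = {"<pad>": 0, "<unk>": 1}
registry.register_resource("vocabulary", vocabulary)  # reuses the cached pickle if one exists
registry.save_resource(registry.get_resource("vocabulary"))  # writes the bz2-compressed pickle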
def tensorize(
    *,
    uasts_dir: str,
    instance_file: str,
    tensors_dir: str,
    configs_dir: str,
    n_workers: int,
    pickle_protocol: int,
    log_level: str,
) -> None:
    """Tensorize the UASTs."""
    Config.from_arguments(
        locals(), ["uasts_dir", "instance_file", "tensors_dir"], "configs_dir"
    ).save(Path(configs_dir) / "tensorize.json")
    logger = setup_logging(__name__, log_level)
    uasts_dir_path = Path(uasts_dir).expanduser().resolve()
    tensors_dir_path = Path(tensors_dir).expanduser().resolve()
    with bz2_open(instance_file, "rb") as fh:
        instance = pickle_load(fh)
    worker = partial(
        _tensorize_worker,
        instance=instance,
        logger=logger,
        uasts_dir_path=uasts_dir_path,
        output_dir_path=tensors_dir_path,
        pickle_protocol=pickle_protocol,
    )
    logger.info("Tensorizing %s", uasts_dir_path)
    with Pool(n_workers) as pool:
        pool.map(
            worker,
            (p.relative_to(uasts_dir_path) for p in uasts_dir_path.rglob("*.asdf")),
        )
    logger.info("Tensorized %s", uasts_dir_path)
def _tensorize_worker(
    file_path: Path,
    instance: Instance,
    logger: Logger,
    uasts_dir_path: Path,
    output_dir_path: Path,
    pickle_protocol: int,
) -> None:
    logger.debug(f"Tensorizing {file_path}")
    with asdf_open(str(uasts_dir_path / file_path)) as af:
        tensors = instance.tensorize(
            {
                Nodes: Nodes.from_tree(af.tree["nodes"]),
                CodRepLabel: CodRepLabel.from_tree(af.tree["codrep_label"]),
                str: af.tree["filepath"],
            }
        )
    output_dir = (output_dir_path / file_path).parent
    output_dir.mkdir(parents=True, exist_ok=True)
    with bz2_open((output_dir / file_path.name).with_suffix(".pickle.bz2"), "wb") as fh:
        pickle_dump(tensors, fh, protocol=pickle_protocol)
    logger.debug(f"Tensorized {file_path}")
def load_from_file(path) -> 'Python object':
    with bz2_open(path, mode='rb') as file:
        return loads(file.read())
def save_to_file(path, data):
    with bz2_open(path, mode='wb') as file:
        file.write(dumps(data, protocol=HIGHEST_PROTOCOL))
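# Minimal round-trip sketch for save_to_file / load_from_file above, assuming
# bz2_open, dumps, and loads are the bz2.open and pickle.dumps/pickle.loads
# aliases used throughout this module. The temporary path is illustrative only.
from os.path import join
from tempfile import TemporaryDirectory

with TemporaryDirectory() as tmp:
    path = join(tmp, "payload.pickle.bz2")
    save_to_file(path, {"answer": 42, "items": [1, 2, 3]})
    assert load_from_file(path) == {"answer": 42, "items": [1, 2, 3]}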
def save(self, file_path: Path) -> None:
    with bz2_open(file_path, "wb") as fh:
        pickle_dump(self, fh)
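# Hedged sketch of a loader mirroring save() above; the classmethod, its name
# `load`, and the type check are assumptions, not part of the original code.
@classmethod
def load(cls, file_path: Path) -> Any:
    with bz2_open(file_path, "rb") as fh:
        obj = pickle_load(fh)
    if not isinstance(obj, cls):
        raise TypeError(f"Expected a pickled {cls.__name__}, got {type(obj).__name__}")
    return obj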
def __getitem__(self, index: int) -> Dict[str, Any]:
    with bz2_open(self._pickles[index], "rb") as fh:
        return pickle_load(fh)
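# Hedged companion sketch for __getitem__ above, showing how a dataset such as
# CodRepDataset could populate self._pickles; the "*.pickle.bz2" rglob pattern
# and the sorted ordering are assumptions, not part of the original code.
def __init__(self, input_dir: Path) -> None:
    self._pickles = sorted(Path(input_dir).rglob("*.pickle.bz2"))

def __len__(self) -> int:
    return len(self._pickles)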
def train(
    *,
    instance_file: str,
    tensors_dir: str,
    train_dir: str,
    configs_dir: str,
    model_encoder_iterations: int,
    model_encoder_output_dim: int,
    model_encoder_message_dim: int,
    model_decoder_type: str,
    model_learning_rate: float,
    model_batch_size: int,
    trainer_epochs: int,
    trainer_eval_every: int,
    trainer_limit_epochs_at: Optional[int],
    trainer_train_eval_split: float,
    trainer_selection_metric: str,
    trainer_kept_checkpoints: int,
    trainer_cuda: Optional[int],
    log_level: str,
) -> None:
    """Run the training."""
    Config.from_arguments(
        locals(), ["instance_file", "tensors_dir", "train_dir"], "configs_dir"
    ).save(Path(configs_dir) / "train.json")
    logger = setup_logging(__name__, log_level)
    tensors_dir_path = Path(tensors_dir).expanduser().resolve()
    train_dir_path = Path(train_dir).expanduser().resolve()
    train_dir_path.mkdir(parents=True, exist_ok=True)
    with bz2_open(instance_file, "rb") as fh:
        instance = pickle_load(fh)
    dataset = CodRepDataset(input_dir=tensors_dir_path)
    logger.info("Dataset of size %d", len(dataset))
    train_length = round(0.9 * len(dataset))
    eval_length = round(0.05 * len(dataset))
    test_length = len(dataset) - train_length - eval_length
    train_dataset, eval_dataset, test_dataset = random_split(
        dataset, [train_length, eval_length, test_length]
    )
    if trainer_cuda is not None:
        if not cuda_is_available():
            raise RuntimeError("CUDA is not available on this system.")
        device = torch_device("cuda:%d" % trainer_cuda)
    else:
        device = torch_device("cpu")
    model = build_model(
        instance=instance,
        model_encoder_iterations=model_encoder_iterations,
        model_encoder_output_dim=model_encoder_output_dim,
        model_encoder_message_dim=model_encoder_message_dim,
        model_decoder_type=model_decoder_type,
        model_learning_rate=model_learning_rate,
        model_batch_size=model_batch_size,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        test_dataset=test_dataset,
    )
    # The model needs a forward to be completely initialized.
    model.training_step(instance.collate([dataset[0]]), 0)
    logger.info("Configured model %s", model)
    checkpoint_callback = ModelCheckpoint(
        filepath=train_dir,
        save_best_only=True,
        verbose=True,
        monitor="eval_mrr",
        mode="max",
        prefix="",
    )
    trainer = Trainer(default_save_path=train_dir, checkpoint_callback=checkpoint_callback)
    trainer.fit(model)
def run(
    *,
    raw_dir: str,
    uasts_dir: str,
    instance_file: str,
    tensors_dir: str,
    checkpoint_file: str,
    configs_dir: str,
    training_configs_dir: str,
    prefix: str,
    metadata_dir: Optional[str],
    log_level: str,
) -> None:
    """Run the model and output CodRep predictions."""
    arguments = locals()
    configs_dir_path = Path(configs_dir).expanduser().resolve()
    configs_dir_path.mkdir(parents=True, exist_ok=True)
    training_configs_dir_path = Path(training_configs_dir).expanduser().resolve()
    tensors_dir_path = Path(tensors_dir).expanduser().resolve()
    Config.from_arguments(
        arguments, ["instance_file", "checkpoint_file"], "configs_dir"
    ).save(configs_dir_path / "train.json")
    logger = setup_logging(__name__, log_level)
    training_configs = {}
    for step in ["parse", "tensorize", "train"]:
        with (training_configs_dir_path / step).with_suffix(".json").open(
            "r", encoding="utf8"
        ) as fh:
            training_configs[step] = json_load(fh)
    parse(
        raw_dir=raw_dir,
        uasts_dir=uasts_dir,
        configs_dir=configs_dir,
        log_level=log_level,
    )
    tensorize(
        uasts_dir=uasts_dir,
        instance_file=instance_file,
        tensors_dir=tensors_dir,
        configs_dir=configs_dir,
        n_workers=training_configs["tensorize"]["options"]["n_workers"],
        pickle_protocol=training_configs["tensorize"]["options"]["pickle_protocol"],
        log_level=log_level,
    )
    dataset = CodRepDataset(input_dir=tensors_dir_path)
    logger.info(f"Dataset of size {len(dataset)}")
    with bz2_open(instance_file, "rb") as fh:
        instance = pickle_load(fh)
    model = build_model(
        instance=instance,
        model_decoder_type=training_configs["train"]["options"]["model_decoder_type"],
        model_encoder_iterations=training_configs["train"]["options"][
            "model_encoder_iterations"
        ],
        model_encoder_output_dim=training_configs["train"]["options"][
            "model_encoder_output_dim"
        ],
        model_encoder_message_dim=training_configs["train"]["options"][
            "model_encoder_message_dim"
        ],
    )
    # The model needs a forward to be completely initialized.
    model(instance.collate([dataset[0]]))
    logger.info(f"Configured model {model}")
    model.load_state_dict(
        torch_load(checkpoint_file, map_location="cpu")["model_state_dict"]
    )
    model.eval()
    logger.info("Loaded model parameters from %s", checkpoint_file)
    dataloader = DataLoader(
        dataset,
        shuffle=False,
        collate_fn=instance.collate,
        batch_size=10,
        num_workers=1,
    )
    metadata = None if metadata_dir is None else model.build_metadata()
    metadata_output = (
        None if metadata_dir is None else Path(metadata_dir) / "metadata.json"
    )
    with no_grad():
        for sample in dataloader:
            sample = model(sample)
            model.decode(sample=sample, prefix=prefix, metadata=metadata)
    if metadata_output is not None:
        with metadata_output.open("w", encoding="utf8") as fh:
            json_dump(metadata, fh)
def train(
    *,
    instance_file: str,
    tensors_dir: str,
    train_dir: str,
    configs_dir: str,
    model_encoder_iterations: int,
    model_encoder_output_dim: int,
    model_encoder_message_dim: int,
    model_decoder_type: str,
    optimizer_type: str,
    optimizer_learning_rate: float,
    scheduler_step_size: int,
    scheduler_gamma: float,
    trainer_epochs: int,
    trainer_batch_size: int,
    trainer_eval_every: int,
    trainer_limit_epochs_at: Optional[int],
    trainer_train_eval_split: float,
    trainer_metric_names: List[str],
    trainer_selection_metric: str,
    trainer_kept_checkpoints: int,
    trainer_cuda: Optional[int],
    log_level: str,
) -> None:
    """Run the training."""
    Config.from_arguments(
        locals(), ["instance_file", "tensors_dir", "train_dir"], "configs_dir"
    ).save(Path(configs_dir) / "train.json")
    logger = setup_logging(__name__, log_level)
    tensors_dir_path = Path(tensors_dir).expanduser().resolve()
    train_dir_path = Path(train_dir).expanduser().resolve()
    train_dir_path.mkdir(parents=True, exist_ok=True)
    with bz2_open(instance_file, "rb") as fh:
        instance = pickle_load(fh)
    dataset = CodRepDataset(input_dir=tensors_dir_path)
    logger.info("Dataset of size %d", len(dataset))
    model = build_model(
        instance=instance,
        model_encoder_iterations=model_encoder_iterations,
        model_encoder_output_dim=model_encoder_output_dim,
        model_encoder_message_dim=model_encoder_message_dim,
        model_decoder_type=model_decoder_type,
    )
    # The model needs a forward to be completely initialized.
    model(instance.collate([dataset[0]]))
    logger.info("Configured model %s", model)
    if Optimizer(optimizer_type) is Optimizer.Adam:
        optimizer: TorchOptimizer = Adam(params=model.parameters(), lr=optimizer_learning_rate)
    else:
        optimizer = SGD(params=model.parameters(), lr=optimizer_learning_rate)
    scheduler = StepLR(optimizer=optimizer, step_size=scheduler_step_size, gamma=scheduler_gamma)
    trainer = Trainer(
        instance=instance,
        epochs=trainer_epochs,
        batch_size=trainer_batch_size,
        eval_every=trainer_eval_every,
        train_eval_split=trainer_train_eval_split,
        limit_epochs_at=trainer_limit_epochs_at,
        metric_names=trainer_metric_names,
        selection_metric=trainer_selection_metric,
        kept_checkpoints=trainer_kept_checkpoints,
        run_dir_path=train_dir_path,
        dataset=dataset,
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
        cuda_device=trainer_cuda,
    )
    trainer.train()
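# Hedged sketch of the Optimizer enum referenced in train() above. Only the two
# members implied by the Adam/SGD branches are shown; the string values used to
# construct the enum from optimizer_type are assumptions.
from enum import Enum, unique

@unique
class Optimizer(Enum):
    Adam = "Adam"
    SGD = "SGD"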
def run(
    *,
    raw_dir: str,
    uasts_dir: str,
    instance_file: str,
    tensors_dir: str,
    checkpoint_file: str,
    configs_dir: str,
    training_configs_dir: str,
    prefix: str,
    metadata_dir: Optional[str],
    log_level: str,
) -> None:
    """Run the model and output CodRep predictions."""
    arguments = locals()
    configs_dir_path = Path(configs_dir).expanduser().resolve()
    configs_dir_path.mkdir(parents=True, exist_ok=True)
    training_configs_dir_path = Path(training_configs_dir).expanduser().resolve()
    tensors_dir_path = Path(tensors_dir).expanduser().resolve()
    Config.from_arguments(
        arguments, ["instance_file", "checkpoint_file"], "configs_dir"
    ).save(configs_dir_path / "train.json")
    logger = setup_logging(__name__, log_level)
    training_configs = {}
    for step in ["parse", "tensorize", "train"]:
        with (training_configs_dir_path / step).with_suffix(".json").open(
            "r", encoding="utf8"
        ) as fh:
            training_configs[step] = json_load(fh)
    parse(
        raw_dir=raw_dir,
        uasts_dir=uasts_dir,
        configs_dir=configs_dir,
        log_level=log_level,
    )
    tensorize(
        uasts_dir=uasts_dir,
        instance_file=instance_file,
        tensors_dir=tensors_dir,
        configs_dir=configs_dir,
        n_workers=training_configs["tensorize"]["options"]["n_workers"],
        pickle_protocol=training_configs["tensorize"]["options"]["pickle_protocol"],
        log_level=log_level,
    )
    dataset = CodRepDataset(input_dir=tensors_dir_path)
    logger.info(f"Dataset of size {len(dataset)}")
    with bz2_open(instance_file, "rb") as fh_instance:
        instance = pickle_load(fh_instance)
    model = build_model(
        instance=instance,
        model_decoder_type=training_configs["train"]["options"]["model_decoder_type"],
        model_encoder_iterations=training_configs["train"]["options"][
            "model_encoder_iterations"
        ],
        model_encoder_output_dim=training_configs["train"]["options"][
            "model_encoder_output_dim"
        ],
        model_encoder_message_dim=training_configs["train"]["options"][
            "model_encoder_message_dim"
        ],
        model_learning_rate=training_configs["train"]["options"]["model_learning_rate"],
        model_batch_size=training_configs["train"]["options"]["model_batch_size"],
        train_dataset=dataset,
        eval_dataset=None,
        test_dataset=None,
    )
    # The model needs a forward to be completely initialized.
    model.training_step(instance.collate([dataset[0]]), 0)
    logger.info(f"Configured model {model}")
    model.load_state_dict(
        torch_load(checkpoint_file, map_location="cpu")["model_state_dict"]
    )
    model.eval()
    logger.info("Loaded model parameters from %s", checkpoint_file)
    metadata = None if metadata_dir is None else model.build_metadata()
    metadata_output = (
        None if metadata_dir is None else Path(metadata_dir) / "metadata.json"
    )
    dataloader = model.train_dataloader()
    graph_field = instance.get_field_by_type("graph")
    label_field = instance.get_field_by_type("label")
    indexes_field = instance.get_field_by_type("indexes")
    metadata_field = instance.get_field_by_type("metadata")
    graph_input_fields = instance.get_fields_by_type("input")
    graph_input_dimensions = [48, 48, 32]
    feature_names = [field.name for field in graph_input_fields]
    with no_grad():
        for batch in dataloader:
            graph, etypes = batch[graph_field.name]
            features = [batch[field_name] for field_name in feature_names]
            indexes, offsets = batch[indexes_field.name].indexes
            forward = model.forward(graph, etypes, features, indexes)
            model.decode(
                batched_graph=graph,
                indexes=indexes,
                offsets=offsets,
                forward=forward,
                paths=batch[metadata_field.name],
                prefix=prefix,
                metadata=metadata,
            )
    if metadata_output is not None:
        with metadata_output.open("w", encoding="utf8") as fh:
            json_dump(metadata, fh)