Example #1
 def save_resource(self, resource: Any) -> None:
     if not self._cache_dir:
         raise ResourceException(
             "Trying to save a resource but the cache dir was not set.")
     self._cache_dir.mkdir(parents=True, exist_ok=True)
     with bz2_open(self._resource_cache_path(resource), "wb") as fh:
         pickle_dump(resource, fh)
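
The snippets on this page call aliased standard-library functions (bz2_open, pickle_dump, pickle_load, loads, dumps), but none of them show their import lines. A plausible import block, inferred from the call sites rather than taken from any of the original sources, would be:

# Inferred imports for the snippets below; the aliases are an assumption
# based on the call sites, not shown in the original sources.
from bz2 import open as bz2_open
from pickle import (HIGHEST_PROTOCOL, dump as pickle_dump, dumps,
                    load as pickle_load, loads)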
Example #2
 def save_log(self):
   # Nothing to write when no destination directory is configured.
   if not self._dest:
     return
   for i, filename, messages, category in self.conversations:
     # Choose the CSV header and file name from the record category; the
     # column labels are the tool's original Chinese ones (时刻 = time,
     # 微信号 = WeChat ID, 昵称 = nickname, 备注 = remark, 签名 = signature).
     if category == 'log':
       header = ('时刻', '消息类型', '消息方向', 'ID', '微信号', '昵称', '备注名', '内容')
       fname = filename
     elif category == 'contacts':
       header = ('ID', '微信号', '昵称', '备注', '性别', '国', '省', '市', '签名')
       fname = path_join(filename, 'contacts')
     elif category == 'group':
       header = ('ID', '微信号', '昵称', '备注', '国', '省', '市', '签名')
       fname = filename
     else:
       # Unknown category: skip it rather than reuse a stale header/fname.
       continue
     fpath = path_join(self._dest, i, fname)
     makedirs(dirname(fpath), exist_ok=True)
     # utf-8-sig prepends a BOM so that spreadsheet tools detect the encoding.
     encoding = 'utf-8-sig' if self._bom else 'utf8'
     if self._compress:
       fo = bz2_open(fpath + '.csv.bz2', 'wt', encoding=encoding, newline='')
     else:
       fo = open(fpath + '.csv', 'w', encoding=encoding, newline='')
     # The with block also closes the file if writerows raises.
     with fo:
       wt = csv_writer(fo)
       wt.writerow(header)
       wt.writerows(messages)
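
A minimal, self-contained sketch of the optional-compression CSV pattern used above; the function name write_csv and its parameters are illustrative, not from the original source:

import bz2
import csv
from typing import Iterable, Sequence

def write_csv(stem: str, header: Sequence[str], rows: Iterable[Sequence[str]],
              compress: bool = False, bom: bool = False) -> None:
    # utf-8-sig prepends a BOM so spreadsheet tools detect the encoding.
    encoding = 'utf-8-sig' if bom else 'utf-8'
    if compress:
        # 'wt' opens the bz2 stream in text mode; newline='' leaves line
        # endings to the csv module, as the csv docs recommend.
        fo = bz2.open(stem + '.csv.bz2', 'wt', encoding=encoding, newline='')
    else:
        fo = open(stem + '.csv', 'w', encoding=encoding, newline='')
    with fo:
        writer = csv.writer(fo)
        writer.writerow(header)
        writer.writerows(rows)

write_csv('chat', ('id', 'nickname'), [('1', 'Alice'), ('2', 'Bob')])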
Example #3
 def get_resource(self, name: str) -> Any:
     """Get the resource registered under a given name."""
     if name not in self._instance_registry:
         raise ResourceException(f"Resource {name} not found.")
     if self._cache_dir:
         cache_path = self._name_cache_path(name)
         if cache_path.is_file():
             with bz2_open(cache_path, "rb") as fh:
                 return pickle_load(fh)
     return self._instance_registry[name]
Example #4
 def _tensorize_worker(self, file_path: Path) -> None:
     self._logger.debug(f"Tensorizing {file_path}")
     with asdf_open(str(self.parse_dir / file_path)) as af:
         nodes_instance = Nodes.from_tree(af.tree["nodes"])
     tensors = self.instance.tensorize(nodes_instance)
     output_dir = (self.tensor_dir / "pickle" / file_path).parent
     output_dir.mkdir(parents=True, exist_ok=True)
     with bz2_open((output_dir / file_path.name).with_suffix(".pickle.bz2"),
                   "wb") as fh:
         dump(tensors, fh, protocol=self.pickle_protocol)
     self._logger.debug(f"Tensorized  {file_path}")
Example #5
 def register_resource(self, name: str, resource: Any) -> None:
     if name in self._instance_registry:
         raise ResourceException(f"Resource name {name} already in use.")
     if self._cache_dir:
         cache_path = self._name_cache_path(name)
         if cache_path.is_file():
             self._logger.debug(
                 f"Using the cached version of the {name} resource")
             with bz2_open(cache_path, "rb") as fh:
                 resource = pickle_load(fh)
     self._instance_registry[name] = resource
     self._name_registry[id(resource)] = name
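
Examples 1, 3, and 5 appear to belong to the same resource-registry class. A hypothetical usage sketch (the class name ResourceManager and its constructor are assumptions, and _name_cache_path / _resource_cache_path are private helpers not shown in these snippets):

# All names below are illustrative; only the three methods shown above
# come from the original source.
from pathlib import Path

manager = ResourceManager(cache_dir=Path('~/.cache/app').expanduser())
manager.register_resource('vocab', {'<pad>': 0, '<unk>': 1})
vocab = manager.get_resource('vocab')  # returns the cached copy if one exists
manager.save_resource(vocab)           # writes a bz2-compressed pickle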
Example #6
def tensorize(
    *,
    uasts_dir: str,
    instance_file: str,
    tensors_dir: str,
    configs_dir: str,
    n_workers: int,
    pickle_protocol: int,
    log_level: str,
) -> None:
    """Tensorize the UASTs."""
    Config.from_arguments(locals(),
                          ["uasts_dir", "instance_file", "tensors_dir"],
                          "configs_dir").save(
                              Path(configs_dir) / "tensorize.json")
    logger = setup_logging(__name__, log_level)

    uasts_dir_path = Path(uasts_dir).expanduser().resolve()
    tensors_dir_path = Path(tensors_dir).expanduser().resolve()

    with bz2_open(instance_file, "rb") as fh:
        instance = pickle_load(fh)

    worker = partial(
        _tensorize_worker,
        instance=instance,
        logger=logger,
        uasts_dir_path=uasts_dir_path,
        output_dir_path=tensors_dir_path,
        pickle_protocol=pickle_protocol,
    )

    logger.info("Tensorizing %s", uasts_dir_path)
    with Pool(n_workers) as pool:
        pool.map(
            worker,
            (p.relative_to(uasts_dir_path)
             for p in uasts_dir_path.rglob("*.asdf")),
        )
    logger.info("Tensorized  %s", uasts_dir_path)
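
The partial/Pool.map idiom above binds the fixed keyword arguments once, so the pool streams only the varying file_path to each worker. A self-contained sketch of the same idiom (the worker and values are illustrative):

from functools import partial
from multiprocessing import Pool

def _scale(item: int, factor: int) -> int:
    # factor is bound by partial; Pool.map supplies item.
    return item * factor

if __name__ == '__main__':
    with Pool(2) as pool:
        print(pool.map(partial(_scale, factor=10), range(5)))
    # prints [0, 10, 20, 30, 40]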
Example #7
def _tensorize_worker(
    file_path: Path,
    instance: Instance,
    logger: Logger,
    uasts_dir_path: Path,
    output_dir_path: Path,
    pickle_protocol: int,
) -> None:
    logger.debug(f"Tensorizing {file_path}")
    with asdf_open(str(uasts_dir_path / file_path)) as af:
        tensors = instance.tensorize(
            {
                Nodes: Nodes.from_tree(af.tree["nodes"]),
                CodRepLabel: CodRepLabel.from_tree(af.tree["codrep_label"]),
                str: af.tree["filepath"],
            }
        )
    output_dir = (output_dir_path / file_path).parent
    output_dir.mkdir(parents=True, exist_ok=True)
    with bz2_open((output_dir / file_path.name).with_suffix(".pickle.bz2"), "wb") as fh:
        pickle_dump(tensors, fh, protocol=pickle_protocol)
    logger.debug(f"Tensorized  {file_path}")
Example #8
def load_from_file(path) -> 'Python object':
    with bz2_open(path, mode='rb') as file:
        return loads(file.read())
Example #9
def save_to_file(path, data):
    with bz2_open(path, mode='wb') as file:
        file.write(dumps(data, protocol=HIGHEST_PROTOCOL))
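
Assuming from pickle import loads, dumps, HIGHEST_PROTOCOL alongside the bz2_open alias, the pair above round-trips any picklable object:

# Round trip of the save/load pair; the file name is illustrative.
data = {'answer': 42}
save_to_file('data.pickle.bz2', data)
assert load_from_file('data.pickle.bz2') == data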
Example #10
 def save(self, file_path: Path) -> None:
     with bz2_open(file_path, "wb") as fh:
         pickle_dump(self, fh)
Example #11
 def __getitem__(self, index: int) -> Dict[str, Any]:
     with bz2_open(self._pickles[index], "rb") as fh:
         return pickle_load(fh)
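
Only __getitem__ appears in the source; a minimal self-contained version of the lazily loading dataset it implies (the class name and constructor are assumptions):

from bz2 import open as bz2_open
from pathlib import Path
from pickle import load as pickle_load
from typing import Any, Dict, List

class PickleDataset:
    def __init__(self, input_dir: Path) -> None:
        # Index the compressed pickles once; decompress lazily per item.
        self._pickles: List[Path] = sorted(input_dir.rglob('*.pickle.bz2'))

    def __len__(self) -> int:
        return len(self._pickles)

    def __getitem__(self, index: int) -> Dict[str, Any]:
        with bz2_open(self._pickles[index], 'rb') as fh:
            return pickle_load(fh)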
Example #12
def train(
    *,
    instance_file: str,
    tensors_dir: str,
    train_dir: str,
    configs_dir: str,
    model_encoder_iterations: int,
    model_encoder_output_dim: int,
    model_encoder_message_dim: int,
    model_decoder_type: str,
    model_learning_rate: float,
    model_batch_size: int,
    trainer_epochs: int,
    trainer_eval_every: int,
    trainer_limit_epochs_at: Optional[int],
    trainer_train_eval_split: float,
    trainer_selection_metric: str,
    trainer_kept_checkpoints: int,
    trainer_cuda: Optional[int],
    log_level: str,
) -> None:
    """Run the training."""
    Config.from_arguments(locals(),
                          ["instance_file", "tensors_dir", "train_dir"],
                          "configs_dir").save(
                              Path(configs_dir) / "train.json")
    logger = setup_logging(__name__, log_level)

    tensors_dir_path = Path(tensors_dir).expanduser().resolve()
    train_dir_path = Path(train_dir).expanduser().resolve()
    train_dir_path.mkdir(parents=True, exist_ok=True)

    with bz2_open(instance_file, "rb") as fh:
        instance = pickle_load(fh)

    dataset = CodRepDataset(input_dir=tensors_dir_path)
    logger.info("Dataset of size %d", len(dataset))

    train_length = round(0.9 * len(dataset))
    eval_length = round(0.05 * len(dataset))
    test_length = len(dataset) - train_length - eval_length

    train_dataset, eval_dataset, test_dataset = random_split(
        dataset, [train_length, eval_length, test_length])

    if trainer_cuda is not None:
        if not cuda_is_available():
            raise RuntimeError("CUDA is not available on this system.")
        device = torch_device("cuda:%d" % trainer_cuda)
    else:
        device = torch_device("cpu")
    model = build_model(
        instance=instance,
        model_encoder_iterations=model_encoder_iterations,
        model_encoder_output_dim=model_encoder_output_dim,
        model_encoder_message_dim=model_encoder_message_dim,
        model_decoder_type=model_decoder_type,
        model_learning_rate=model_learning_rate,
        model_batch_size=model_batch_size,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        test_dataset=test_dataset,
    )
    # The model needs a forward to be completely initialized.
    model.training_step(instance.collate([dataset[0]]), 0)
    logger.info("Configured model %s", model)

    checkpoint_callback = ModelCheckpoint(
        filepath=train_dir,
        save_best_only=True,
        verbose=True,
        monitor="eval_mrr",
        mode="max",
        prefix="",
    )

    trainer = Trainer(default_save_path=train_dir,
                      checkpoint_callback=checkpoint_callback)
    trainer.fit(model)
Example #13
def run(
    *,
    raw_dir: str,
    uasts_dir: str,
    instance_file: str,
    tensors_dir: str,
    checkpoint_file: str,
    configs_dir: str,
    training_configs_dir: str,
    prefix: str,
    metadata_dir: Optional[str],
    log_level: str,
) -> None:
    """Run the model and output CodRep predictions."""
    arguments = locals()
    configs_dir_path = Path(configs_dir).expanduser().resolve()
    configs_dir_path.mkdir(parents=True, exist_ok=True)
    training_configs_dir_path = Path(
        training_configs_dir).expanduser().resolve()
    tensors_dir_path = Path(tensors_dir).expanduser().resolve()
    Config.from_arguments(arguments, ["instance_file", "checkpoint_file"],
                          "configs_dir").save(configs_dir_path / "train.json")
    logger = setup_logging(__name__, log_level)

    training_configs = {}
    for step in ["parse", "tensorize", "train"]:
        with (training_configs_dir_path / step).with_suffix(".json").open(
                "r", encoding="utf8") as fh:
            training_configs[step] = json_load(fh)

    parse(
        raw_dir=raw_dir,
        uasts_dir=uasts_dir,
        configs_dir=configs_dir,
        log_level=log_level,
    )

    tensorize(
        uasts_dir=uasts_dir,
        instance_file=instance_file,
        tensors_dir=tensors_dir,
        configs_dir=configs_dir,
        n_workers=training_configs["tensorize"]["options"]["n_workers"],
        pickle_protocol=training_configs["tensorize"]["options"]
        ["pickle_protocol"],
        log_level=log_level,
    )

    dataset = CodRepDataset(input_dir=tensors_dir_path)
    logger.info(f"Dataset of size {len(dataset)}")

    with bz2_open(instance_file, "rb") as fh:
        instance = pickle_load(fh)

    model = build_model(
        instance=instance,
        model_decoder_type=training_configs["train"]["options"]
        ["model_decoder_type"],
        model_encoder_iterations=training_configs["train"]["options"]
        ["model_encoder_iterations"],
        model_encoder_output_dim=training_configs["train"]["options"]
        ["model_encoder_output_dim"],
        model_encoder_message_dim=training_configs["train"]["options"]
        ["model_encoder_message_dim"],
    )
    # The model needs a forward to be completely initialized.
    model(instance.collate([dataset[0]]))
    logger.info(f"Configured model {model}")

    model.load_state_dict(
        torch_load(checkpoint_file, map_location="cpu")["model_state_dict"])
    model.eval()
    logger.info("Loaded model parameters from %s", checkpoint_file)

    dataloader = DataLoader(
        dataset,
        shuffle=False,
        collate_fn=instance.collate,
        batch_size=10,
        num_workers=1,
    )

    metadata = None if metadata_dir is None else model.build_metadata()
    metadata_output = (None if metadata_dir is None else Path(metadata_dir) /
                       "metadata.json")

    with no_grad():
        for sample in dataloader:
            sample = model(sample)
            model.decode(sample=sample, prefix=prefix, metadata=metadata)

    if metadata_output is not None:
        with metadata_output.open("w", encoding="utf8") as fh:
            json_dump(metadata, fh)
Example #14
def train(
    *,
    instance_file: str,
    tensors_dir: str,
    train_dir: str,
    configs_dir: str,
    model_encoder_iterations: int,
    model_encoder_output_dim: int,
    model_encoder_message_dim: int,
    model_decoder_type: str,
    optimizer_type: str,
    optimizer_learning_rate: float,
    scheduler_step_size: int,
    scheduler_gamma: float,
    trainer_epochs: int,
    trainer_batch_size: int,
    trainer_eval_every: int,
    trainer_limit_epochs_at: Optional[int],
    trainer_train_eval_split: float,
    trainer_metric_names: List[str],
    trainer_selection_metric: str,
    trainer_kept_checkpoints: int,
    trainer_cuda: Optional[int],
    log_level: str,
) -> None:
    """Run the training."""
    Config.from_arguments(locals(),
                          ["instance_file", "tensors_dir", "train_dir"],
                          "configs_dir").save(
                              Path(configs_dir) / "train.json")
    logger = setup_logging(__name__, log_level)

    tensors_dir_path = Path(tensors_dir).expanduser().resolve()
    train_dir_path = Path(train_dir).expanduser().resolve()
    train_dir_path.mkdir(parents=True, exist_ok=True)

    with bz2_open(instance_file, "rb") as fh:
        instance = pickle_load(fh)

    dataset = CodRepDataset(input_dir=tensors_dir_path)
    logger.info("Dataset of size %d", len(dataset))

    model = build_model(
        instance=instance,
        model_encoder_iterations=model_encoder_iterations,
        model_encoder_output_dim=model_encoder_output_dim,
        model_encoder_message_dim=model_encoder_message_dim,
        model_decoder_type=model_decoder_type,
    )
    # The model needs a forward to be completely initialized.
    model(instance.collate([dataset[0]]))
    logger.info("Configured model %s", model)

    if Optimizer(optimizer_type) is Optimizer.Adam:
        optimizer: TorchOptimizer = Adam(params=model.parameters(),
                                         lr=optimizer_learning_rate)
    else:
        optimizer = SGD(params=model.parameters(), lr=optimizer_learning_rate)
    scheduler = StepLR(optimizer=optimizer,
                       step_size=scheduler_step_size,
                       gamma=scheduler_gamma)
    trainer = Trainer(
        instance=instance,
        epochs=trainer_epochs,
        batch_size=trainer_batch_size,
        eval_every=trainer_eval_every,
        train_eval_split=trainer_train_eval_split,
        limit_epochs_at=trainer_limit_epochs_at,
        metric_names=trainer_metric_names,
        selection_metric=trainer_selection_metric,
        kept_checkpoints=trainer_kept_checkpoints,
        run_dir_path=train_dir_path,
        dataset=dataset,
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
        cuda_device=trainer_cuda,
    )
    trainer.train()
Example #15
File: run.py Project: m09/mloncode
def run(
    *,
    raw_dir: str,
    uasts_dir: str,
    instance_file: str,
    tensors_dir: str,
    checkpoint_file: str,
    configs_dir: str,
    training_configs_dir: str,
    prefix: str,
    metadata_dir: Optional[str],
    log_level: str,
) -> None:
    """Run the model and output CodRep predictions."""
    arguments = locals()
    configs_dir_path = Path(configs_dir).expanduser().resolve()
    configs_dir_path.mkdir(parents=True, exist_ok=True)
    training_configs_dir_path = Path(training_configs_dir).expanduser().resolve()
    tensors_dir_path = Path(tensors_dir).expanduser().resolve()
    Config.from_arguments(
        arguments, ["instance_file", "checkpoint_file"], "configs_dir"
    ).save(configs_dir_path / "train.json")
    logger = setup_logging(__name__, log_level)

    training_configs = {}
    for step in ["parse", "tensorize", "train"]:
        with (training_configs_dir_path / step).with_suffix(".json").open(
            "r", encoding="utf8"
        ) as fh:
            training_configs[step] = json_load(fh)

    parse(
        raw_dir=raw_dir,
        uasts_dir=uasts_dir,
        configs_dir=configs_dir,
        log_level=log_level,
    )

    tensorize(
        uasts_dir=uasts_dir,
        instance_file=instance_file,
        tensors_dir=tensors_dir,
        configs_dir=configs_dir,
        n_workers=training_configs["tensorize"]["options"]["n_workers"],
        pickle_protocol=training_configs["tensorize"]["options"]["pickle_protocol"],
        log_level=log_level,
    )

    dataset = CodRepDataset(input_dir=tensors_dir_path)
    logger.info(f"Dataset of size {len(dataset)}")

    with bz2_open(instance_file, "rb") as fh_instance:
        instance = pickle_load(fh_instance)

    model = build_model(
        instance=instance,
        model_decoder_type=training_configs["train"]["options"]["model_decoder_type"],
        model_encoder_iterations=training_configs["train"]["options"][
            "model_encoder_iterations"
        ],
        model_encoder_output_dim=training_configs["train"]["options"][
            "model_encoder_output_dim"
        ],
        model_encoder_message_dim=training_configs["train"]["options"][
            "model_encoder_message_dim"
        ],
        model_learning_rate=training_configs["train"]["options"]["model_learning_rate"],
        model_batch_size=training_configs["train"]["options"]["model_batch_size"],
        train_dataset=dataset,
        eval_dataset=None,
        test_dataset=None,
    )

    # The model needs a forward to be completely initialized.
    model.training_step(instance.collate([dataset[0]]), 0)
    logger.info(f"Configured model {model}")

    model.load_state_dict(
        torch_load(checkpoint_file, map_location="cpu")["model_state_dict"]
    )
    model.eval()
    logger.info("Loaded model parameters from %s", checkpoint_file)

    metadata = None if metadata_dir is None else model.build_metadata()
    metadata_output = (
        None if metadata_dir is None else Path(metadata_dir) / "metadata.json"
    )

    dataloader = model.train_dataloader()

    graph_field = instance.get_field_by_type("graph")
    label_field = instance.get_field_by_type("label")
    indexes_field = instance.get_field_by_type("indexes")
    metadata_field = instance.get_field_by_type("metadata")
    graph_input_fields = instance.get_fields_by_type("input")
    graph_input_dimensions = [48, 48, 32]
    feature_names = [field.name for field in graph_input_fields]
    with no_grad():
        for batch in dataloader:
            graph, etypes = batch[graph_field.name]
            features = [batch[field_name] for field_name in feature_names]
            indexes, offsets = batch[indexes_field.name].indexes
            forward = model.forward(graph, etypes, features, indexes)
            model.decode(
                batched_graph=graph,
                indexes=indexes,
                offsets=offsets,
                forward=forward,
                paths=batch[metadata_field.name],
                prefix=prefix,
                metadata=metadata,
            )

    if metadata_output is not None:
        with metadata_output.open("w", encoding="utf8") as fh:
            json_dump(metadata, fh)