import argparse
import datetime
import os
import re

import numpy as np
import tensorflow as tf

# Helper modules distributed with the assignment.
from morpho_analyzer import MorphoAnalyzer
from morpho_dataset import MorphoDataset


def main(args: argparse.Namespace) -> None:
    # Fix random seeds and threads
    tf.keras.utils.set_random_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Create logdir name (argument names are abbreviated to their initials)
    args.logdir = os.path.join("logs", "{}-{}-{}".format(
        os.path.basename(globals().get("__file__", "notebook")),
        datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
        ",".join(("{}={}".format(re.sub("(.)[^_]*_?", r"\1", k), v) for k, v in sorted(vars(args).items())))
    ))

    # Load the data. Using analyses is only optional.
    morpho = MorphoDataset("czech_pdt_lemmas", add_bow_eow=True)
    analyses = MorphoAnalyzer("czech_pdt_analyses")

    # TODO: Create the model and train it. Preparing the `test` dataset used for
    # prediction below is also part of this TODO.
    model = ...
    test = ...

    # Generate test set annotations, but in `args.logdir` to allow parallel execution.
    os.makedirs(args.logdir, exist_ok=True)
    with open(os.path.join(args.logdir, "lemmatizer_competition.txt"), "w", encoding="utf-8") as predictions_file:
        # Predict the lemmas on the test set; update the following prediction
        # command if you use an output structure other than in lemmatizer_noattn.
        predictions = model.predict(test)
        for sentence in predictions:
            for word in sentence:
                print(word.numpy().decode("utf-8"), file=predictions_file)
            print(file=predictions_file)
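
# The TODO above asks for a trainable lemmatizer. Below is a minimal sketch of one
# possible character-level encoder-decoder without attention, in the spirit of
# lemmatizer_noattn. It is not the reference solution: the vocabulary size, the
# dimensions, and the teacher-forced decoder input are illustrative assumptions,
# and wiring it to the MorphoDataset batches is left out.
def build_toy_lemmatizer(num_chars: int = 100, dim: int = 64) -> tf.keras.Model:
    # Encoder: embed the characters of the input form and summarize them with a
    # bidirectional GRU into a single state vector.
    form_chars = tf.keras.layers.Input(shape=[None], dtype=tf.int32)
    encoded = tf.keras.layers.Embedding(num_chars, dim)(form_chars)
    encoded = tf.keras.layers.Bidirectional(
        tf.keras.layers.GRU(dim), merge_mode="sum")(encoded)

    # Decoder with teacher forcing: embed the gold lemma characters shifted right
    # by one position and run a GRU initialized with the encoder state.
    lemma_chars = tf.keras.layers.Input(shape=[None], dtype=tf.int32)
    decoded = tf.keras.layers.Embedding(num_chars, dim)(lemma_chars)
    decoded = tf.keras.layers.GRU(dim, return_sequences=True)(
        decoded, initial_state=encoded)
    logits = tf.keras.layers.Dense(num_chars)(decoded)

    model = tf.keras.Model(inputs=[form_chars, lemma_chars], outputs=logits)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
    return model
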
class Network:
    def __init__(self, pdt, args):
        # TODO: Define a suitable model.
        raise NotImplementedError()

    def train(self, pdt, args):
        # TODO: Train the network on a given dataset.
        raise NotImplementedError()

    def predict(self, dataset, args):
        # TODO: The `predict` method should return a list, each element corresponding
        # to one sentence. Each sentence should be a list/np.ndarray of words,
        # each word a list/np.ndarray of chosen characters, possibly ended
        # by MorphoDataset.Factor.EOW.
        raise NotImplementedError()


if __name__ == "__main__":
    # Parse arguments
    parser = argparse.ArgumentParser()
    # TODO: Define reasonable defaults and optionally more parameters
    parser.add_argument("--batch_size", default=None, type=int, help="Batch size.")
    parser.add_argument("--epochs", default=None, type=int, help="Number of epochs.")
    parser.add_argument("--seed", default=42, type=int, help="Random seed.")
    parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
    parser.add_argument("--verbose", default=False, action="store_true", help="Verbose TF logging.")
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Fix random seeds and threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Report only errors by default
    if not args.verbose:
        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

    # Create logdir name
    args.logdir = os.path.join("logs", "{}-{}-{}".format(
        os.path.basename(globals().get("__file__", "notebook")),
        datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
        ",".join(("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value) for key, value in sorted(vars(args).items())))
    ))

    # Load the data. Using analyses is only optional.
    morpho = MorphoDataset("czech_pdt")
    analyses = MorphoAnalyzer("czech_pdt_analyses")

    # Create the network and train
    network = Network(morpho, args)
    network.train(morpho, args)

    # Generate test set annotations; to allow parallel execution, create them
    # in args.logdir if it exists.
    out_path = "lemmatizer_competition_test.txt"
    if os.path.isdir(args.logdir):
        out_path = os.path.join(args.logdir, out_path)
    with open(out_path, "w", encoding="utf-8") as out_file:
        for i, sentence in enumerate(network.predict(morpho.test, args)):
            for j in range(len(morpho.test.data[morpho.test.FORMS].word_strings[i])):
                # Convert the predicted character ids into a lemma string, stopping at EOW.
                lemma = []
                for c in map(int, sentence[j]):
                    if c == MorphoDataset.Factor.EOW:
                        break
                    lemma.append(morpho.test.data[morpho.test.LEMMAS].alphabet[c])
                print(morpho.test.data[morpho.test.FORMS].word_strings[i][j],
                      "".join(lemma),
                      morpho.test.data[morpho.test.TAGS].word_strings[i][j],
                      sep="\t", file=out_file)
            print(file=out_file)
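
# A self-contained sketch of the greedy decoding that `Network.predict` is expected
# to perform. The tiny untrained decoder and the BOW/EOW ids below are illustrative
# assumptions; only the shape of the returned structure (sentences -> words ->
# chosen character ids, ended by EOW) follows the contract described in `predict`.
BOW, EOW, NUM_CHARS, DIM, MAX_LEN = 1, 2, 30, 16, 10

char_embedding = tf.keras.layers.Embedding(NUM_CHARS, DIM)
decoder_cell = tf.keras.layers.GRUCell(DIM)
output_layer = tf.keras.layers.Dense(NUM_CHARS)

def greedy_decode_word(encoder_state: tf.Tensor) -> list:
    # Start from BOW and keep emitting the most probable next character until EOW
    # (or a length limit, so an untrained model cannot loop forever).
    chars, last, states = [], tf.constant([BOW]), [encoder_state]
    for _ in range(MAX_LEN):
        hidden, states = decoder_cell(char_embedding(last), states)
        last = tf.argmax(output_layer(hidden), axis=-1, output_type=tf.int32)
        chars.append(int(last[0]))
        if chars[-1] == EOW:
            break
    return chars

# Usage: one "sentence" of two "words", each represented here by a random encoder state.
toy_sentences = [[tf.random.normal([1, DIM]) for _ in range(2)]]
toy_predictions = [[greedy_decode_word(state) for state in sentence] for sentence in toy_sentences]
print(toy_predictions)  # e.g. [[[7, 7, ..., 2], [5, 2]]] -- character ids per word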