def prepare_model(self, model: str, use_latest: bool = False, debug: bool = False, cross_proj: bool = False):
    """
    Create the work directory for `model` and delegate setup to the model's runner.

    :param model: the model name (e.g. "DeepCom", "Code2Seq", "Bi-LSTM", ...)
    :param use_latest: if True, use the "-latest" variant of the work directory
    :param debug: if True, suffix the work directory with "-debug"
    :param cross_proj: if True, suffix the work directory with "-cross-proj"
    :raises ValueError: if `model` is not one of the supported model names
    """
    # The directory name encodes which data/evaluation variant is being prepared.
    base_name = f"{model}-latest" if use_latest else model
    model_work_dir = self.work_dir / base_name
    if cross_proj:
        model_work_dir = Path(f"{model_work_dir}-cross-proj")
    if debug:
        model_work_dir = Path(f"{model_work_dir}-debug")
    IOUtils.mk_dir(model_work_dir)

    # Imports are function-local so only the selected model's dependencies load.
    if model == "DeepCom":
        from csevo.ml.DeepComRunner import DeepComRunner
        runner = DeepComRunner(model_work_dir, use_latest)
    elif model == "Seq2seq":
        from csevo.ml.Seq2seqRunner import Seq2seqRunner
        runner = Seq2seqRunner(model_work_dir, use_latest)
    elif model == "Seq2seqAtt":
        from csevo.ml.Seq2seqAttRunner import Seq2seqAttRunner
        runner = Seq2seqAttRunner(model_work_dir, use_latest)
    elif model == "DeepCom-SBT":
        from csevo.ml.DeepComSBTRunner import DeepComSBTRunner
        runner = DeepComSBTRunner(model_work_dir, use_latest)
    elif model == "DeepCom-Preorder":
        from csevo.ml.DeepComPreorderRunner import DeepComPreorderRunner
        runner = DeepComPreorderRunner(model_work_dir, use_latest)
    elif model == "Code2Seq":
        from csevo.ml.Code2SeqRunner import Code2SeqRunner
        runner = Code2SeqRunner(model_work_dir, use_latest, debug, cross_proj)
    elif model == "Bi-LSTM":
        from csevo.ml.BiLSTMRunner import BiLSTMRunner
        runner = BiLSTMRunner(model_work_dir, use_latest, debug, cross_proj)
    elif model == "no-split-Bi-LSTM":
        from csevo.ml.NoSplitBiLSTMRunner import BiLSTMRunner
        runner = BiLSTMRunner(model_work_dir, use_latest, debug, cross_proj)
    elif model == "Transformer":
        from csevo.ml.TransformerRunner import TransformerRunner
        runner = TransformerRunner(model_work_dir, use_latest)
    else:
        raise ValueError(f"Model {model} not ready yet")

    runner.prepare()
    return
def prepare_data(self):
    """
    Stage the train/valid/test_common/test_standard splits (and vocab files)
    for the current eval_setting/year into <work_dir>/data, replacing any
    previous contents.
    """
    data_prefix = f"{self.eval_setting}-{self.year}"
    data_dir = self.work_dir / "data"
    self.logger.info(f"Preparing the data for {self.eval_setting} {self.year} at {data_dir}")
    # Always start from a clean directory.
    IOUtils.rm_dir(data_dir)
    IOUtils.mk_dir(data_dir)
    # (source sub-path under model_data_dir, destination name under data_dir)
    copy_specs = [
        (f"{data_prefix}-{Macros.train}/train", "train"),
        (f"{data_prefix}-{Macros.val}/valid", "valid"),
        (f"{data_prefix}-{Macros.test_common}/test", f"{Macros.test_common}"),
        (f"{data_prefix}-{Macros.test_standard}/test", f"{Macros.test_standard}"),
    ]
    for src_rel, dst_name in copy_specs:
        BashUtils.run(f"cp -r {self.model_data_dir}/{src_rel} {data_dir}/{dst_name}", expected_return_code=0)
    # Copy vocab files produced alongside the training split.
    BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/vocab* {data_dir}/", expected_return_code=0)
    return
def prepare_configs_and_scripts(self, trials: List[int]):
    """
    Generate, for each requested trial, an OpenNMT config plus executable
    train/test shell scripts under <work_dir>/trial-<trial>.

    :param trials: trial indexes to prepare (one sub-directory per trial)
    """
    # FIX: yaml.load without an explicit Loader is deprecated (PyYAML >= 5.1)
    # and can construct arbitrary objects; the base config only needs plain
    # YAML, so safe_load is the correct call.
    with open(self.base_config_file, "r") as f:
        base_config = yaml.safe_load(f)
    exp_dir = self.work_dir
    for trial in trials:
        seed = random.randint(0, 9)  # per-trial training seed
        trial_dir = exp_dir / f"trial-{trial}"
        IOUtils.mk_dir(trial_dir)
        # Shallow copy suffices: only top-level keys are overridden below.
        config = copy.copy(base_config)
        config["data"] = str(self.data_dir / "biLSTM")
        config["save_model"] = str(trial_dir / "bestLSTM")
        config_file = trial_dir / "config.yaml"
        # "w" (not "w+"): the file is only written, never read back here.
        with open(config_file, "w") as f:
            yaml.dump(config, f)
        train_script_file = trial_dir / "train.sh"
        train_script = f"#!/bin/bash\n" \
                       f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                       f"module load cuda/10.1 cudnn/7.6.2\n" \
                       f"conda activate {self.CONDA_ENV}\n" \
                       f"cd {self.code_dir}\n" \
                       f"export MKL_SERVICE_FORCE_INTEL=1\n" \
                       f"python3 train.py --config {config_file} --world_size 1 --gpu_ranks 0 -keep_checkpoint 1 " \
                       f"--seed {seed} &> {trial_dir}/train-log.txt\n"
        IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
        BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)
        # One test script per test split (common / standard).
        for test_type in [Macros.test_common, Macros.test_standard]:
            test_script_file = trial_dir / f"{test_type}.sh"
            output_file = trial_dir / f"output_{test_type}.txt"
            test_script = f"#!/bin/bash\n" \
                          f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                          f"module load cuda/10.1 cudnn/7.6.2\n" \
                          f"conda activate {self.CONDA_ENV}\n" \
                          f"cd {self.code_dir}\n" \
                          f"export MKL_SERVICE_FORCE_INTEL=1\n" \
                          f"python3 translate.py " \
                          f"--model {trial_dir}/*.pt --output {output_file} --src {self.data_dir}/src-{test_type}.txt " \
                          f"&> {trial_dir}/{test_type}-log.txt\n" \
                          f"python3 eval_utils.py " \
                          f"{self.data_dir}/tgt-{test_type}.txt {output_file} {trial_dir}/results_{test_type}.json\n"
            IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)
    # end for
    return
def require_special_repo(cls, directory: Path, branch: str):
    """
    Ensure `directory` contains a checkout of `branch`: pull if the repo is
    already there, otherwise clone just that branch into a fresh directory.

    Raises if `directory` exists but is not a proper git repository.
    """
    cls.logger.info(f"Updating {directory} to {branch} branch")
    if not directory.exists():
        # Fresh single-branch clone into the newly created directory.
        IOUtils.mk_dir(directory)
        with IOUtils.cd(directory):
            BashUtils.run(
                f"git clone --single-branch -b {branch} -- {cls.get_git_url()} .",
                expected_return_code=0)
        return
    # Directory already exists: it must be a git repository we can update.
    if not directory.is_dir() or not (directory / ".git").is_dir():
        LoggingUtils.log_and_raise(
            cls.logger,
            f"Path {directory} already exists but is not a proper git repository!",
            Exception)
    with IOUtils.cd(directory):
        BashUtils.run(f"git pull", expected_return_code=0)
def process_data(
        self,
        data_dir: Path,
        output_processed_data_dir: Path,
        is_train: bool = False,
) -> NoReturn:
    """
    Process the raw data under `data_dir` into this model's intermediate
    format, written to a freshly recreated `output_processed_data_dir`.

    :param data_dir: directory holding the raw input data
    :param output_processed_data_dir: destination directory (wiped first)
    :param is_train: if True, run the training-data-dependent preprocessing
        step (e.g. anything that must see the train split first)
    """
    self.logger.info(self.logging_prefix + f"Processing data from {data_dir} to {output_processed_data_dir}")
    # Always start from an empty output directory.
    IOUtils.rm_dir(output_processed_data_dir)
    IOUtils.mk_dir(output_processed_data_dir)
    if is_train:
        # Preprocess with training data, if needed by the concrete model.
        self.preprocess_with_train_data(data_dir, output_processed_data_dir)
    self.process_data_impl(data_dir, output_processed_data_dir)
    return
def prepare_model(self, model: str, year: int, eval_setting: str, debug: bool = False):
    """
    Create the work directory for `model` under this experiment's
    year/eval_setting and delegate setup to the model's runner.

    :param model: the model name (e.g. "DeepCom", "Code2Seq", "Bi-LSTM", ...)
    :param year: the evaluation year passed through to the runner
    :param eval_setting: the evaluation setting passed through to the runner
    :param debug: if True, suffix the work directory with "-debug"
    :raises ValueError: if `model` is not one of the supported model names
    """
    sub_dir_name = f"{model}-debug" if debug else f"{model}"
    model_work_dir = self.work_dir / sub_dir_name
    IOUtils.mk_dir(model_work_dir)

    # Imports are function-local so only the selected model's dependencies load.
    if model == "DeepCom":
        from csevo.ml.DeepComRunner import DeepComRunner
        runner = DeepComRunner(model_work_dir, year, eval_setting)
    elif model == "Seq2seq":
        from csevo.ml.Seq2seqRunner import Seq2seqRunner
        runner = Seq2seqRunner(model_work_dir, year, eval_setting)
    elif model == "Seq2seqAtt":
        from csevo.ml.Seq2seqAttRunner import Seq2seqAttRunner
        runner = Seq2seqAttRunner(model_work_dir, year, eval_setting)
    elif model == "DeepCom-SBT":
        from csevo.ml.DeepComSBTRunner import DeepComSBTRunner
        runner = DeepComSBTRunner(model_work_dir, year, eval_setting)
    elif model == "DeepCom-Preorder":
        from csevo.ml.DeepComPreorderRunner import DeepComPreorderRunner
        runner = DeepComPreorderRunner(model_work_dir, year, eval_setting)
    elif model == "Code2Seq":
        from csevo.ml.Code2SeqRunner import Code2SeqRunner
        runner = Code2SeqRunner(model_work_dir, year, eval_setting)
    elif model == "Bi-LSTM":
        from csevo.ml.BiLSTMRunner import BiLSTMRunner
        runner = BiLSTMRunner(model_work_dir, year, eval_setting)
    elif model == "no-split-Bi-LSTM":
        from csevo.ml.NoSplitBiLSTMRunner import BiLSTMRunner
        runner = BiLSTMRunner(model_work_dir, year, eval_setting)
    elif model == "Transformer":
        from csevo.ml.TransformerRunner import TransformerRunner
        runner = TransformerRunner(model_work_dir, year, eval_setting)
    else:
        raise ValueError(f"Model {model} not ready yet")

    runner.prepare()
    return
def __init__(self):
    """Set up the figures output directory and paper-ready plot styling."""
    # All generated figures go under the paper's figs directory.
    self.plots_dir: Path = Macros.paper_dir / "figs"
    IOUtils.mk_dir(self.plots_dir)

    # Global seaborn theme.
    sns.set()
    sns.set_palette("Dark2")
    sns.set_context("paper")

    # Matplotlib font/tick sizing tweaks for readable paper figures.
    for rc_key, rc_value in {
        "axes.titlesize": 24,
        "axes.labelsize": 24,
        "font.size": 18,
        "xtick.labelsize": 24,
        "xtick.major.size": 14,
        "xtick.minor.size": 14,
        "ytick.labelsize": 24,
        "ytick.major.size": 14,
        "ytick.minor.size": 14,
        "legend.fontsize": 18,
        "legend.title_fontsize": 18,
    }.items():
        mpl.rcParams[rc_key] = rc_value
    return
def train(
        self,
        train_processed_data_dir: Path,
        val_processed_data_dir: Path,
        output_model_dir: Path,
        force_retrain: bool = False,
) -> NoReturn:
    """
    Trains the model on the training data and saves it to output_model_dir.

    A training-completed marker file is written at the end as proof that
    training ran to completion.

    :param train_processed_data_dir: the directory containing the processed train data
    :param val_processed_data_dir: the directory containing the processed val data
    :param output_model_dir: the directory to save the output model
    :param force_retrain: if set to True, re-train the model even if it was
        already trained (removes the previously trained model first)
    """
    # Skip entirely when a completed model exists and no retrain was requested.
    if not force_retrain and self.is_training_completed(output_model_dir):
        return

    self.logger.info(
        self.logging_prefix + f"Training model at {output_model_dir}; train: {train_processed_data_dir}, val: {val_processed_data_dir}"
    )
    IOUtils.rm_dir(output_model_dir)
    IOUtils.mk_dir(output_model_dir)

    # Persist the spec & configs of this model alongside the trained weights.
    IOUtils.dump(output_model_dir / "config-dict.json", IOUtils.jsonfy(self.config), IOUtils.Format.jsonPretty)
    IOUtils.dump(output_model_dir / "spec.json", IOUtils.jsonfy(self.spec), IOUtils.Format.jsonPretty)

    self.train_impl(train_processed_data_dir, val_processed_data_dir, output_model_dir)

    # Marker file (timestamped) proving training completed.
    IOUtils.dump(output_model_dir / self.TRAINING_COMPLETED_FILE_NAME, str(time.time_ns()), IOUtils.Format.txt)
    return
def prepare_data(self):
    """
    Stage the Open-NMT dataset files and the src/tgt test files for both
    test splits into self.data_dir, replacing any previous contents.
    """
    data_prefix = f"{self.eval_setting}-{self.year}"
    # Always start from a clean data directory.
    IOUtils.rm_dir(self.data_dir)
    IOUtils.mk_dir(self.data_dir)
    # Dataset files previously built by Open-NMT's preprocessing.
    BashUtils.run(f"cp {self.model_data_dir}/{data_prefix}-{Macros.train}/biLSTM* {self.data_dir}/", expected_return_code=0)
    # src/tgt files for each test split, renamed per split.
    for test_type in (Macros.test_common, Macros.test_standard):
        for side in ("src", "tgt"):
            BashUtils.run(
                f"cp {self.model_data_dir}/{data_prefix}-{test_type}/{side}-test.txt {self.data_dir}/{side}-{test_type}.txt",
                expected_return_code=0)
    return
def prepare_configs_and_scripts(self, trials: List[int]):
    """
    Generates, for each requested trial, the Code2Seq config plus executable
    train/test shell scripts under <work_dir>/trial-<trial>.

    :param trials: trial indexes to prepare (one sub-directory per trial)
    """
    exp_dir = self.work_dir
    for trial in trials:
        trial_dir = exp_dir / f"trial-{trial}"
        IOUtils.mk_dir(trial_dir)
        model_dir = trial_dir / "models"
        IOUtils.mk_dir(model_dir)
        # NOTE(review): log_dir is created but not referenced below — presumably
        # the training process writes into it; confirm before removing.
        log_dir = trial_dir / "logs"
        IOUtils.mk_dir(log_dir)
        # Shared dataset prefix; validation set uses the ".val.c2s" suffix.
        data = str(exp_dir / "data/code2seq")
        val_data = data + ".val.c2s"
        train_log = trial_dir / "training-trace.json"
        train_script_file = trial_dir / f"{Macros.train}.sh"
        # Copy config file
        BashUtils.run(f"cp {self.base_config_file} {trial_dir}/config.yaml", expected_return_code=0)
        output_file = trial_dir / "output_tmp.txt"
        reference_file = trial_dir / "ref_tmp.txt"
        config_file = trial_dir / "config.yaml"
        # Training script: sets up conda/CUDA on TACC, then runs code2seq
        # training; "$1" is the GPU id supplied at invocation time.
        train_script = f"#!/bin/bash\n" \
                       f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                       f"conda activate {self.CONDA_ENV}\n" \
                       f"module load cuda/10.0 cudnn/7.6.2\n" \
                       f"cd {self.code_dir}\n" \
                       f"python -u code2seq.py --data {data} --test {val_data} --log {train_log} --config {config_file} " \
                       f"--pred_file {output_file} --ref_file {reference_file} " \
                       f"--save_prefix {model_dir}/model --gpu_id $1 &> {trial_dir}/train-log.txt"
        IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
        BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)
        # One test script per test split: runs inference with the best model,
        # then scores predictions against references with eval_utils.py.
        for test_type in [Macros.test_common, Macros.test_standard]:
            test_data = exp_dir / "data" / f"code2seq.{test_type}.c2s"
            output_file = trial_dir / f"output_{test_type}.txt"
            reference_file = trial_dir / f"ref_{test_type}.txt"
            test_script_file = trial_dir / f"{test_type}.sh"
            test_script = f"#!/bin/bash\n" \
                          f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                          f"conda activate {self.CONDA_ENV}\n" \
                          f"module load cuda/10.0 cudnn/7.6.2\n" \
                          f"cd {self.code_dir}\n" \
                          f"python3 code2seq.py --load {model_dir}/model_best --test {test_data} --config {config_file} " \
                          f"--pred_file {output_file} --ref_file {reference_file} " \
                          f"--gpu_id $1 &> {trial_dir}/{test_type}-log.txt\n" \
                          f"python3 eval_utils.py {reference_file} {output_file} {trial_dir}/results_{test_type}.json\n"
            IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
            BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)
    return
def _prepare_trial_configs_and_scripts(self, exp_dir, base_config, trial: int):
    """
    Create the config file and train/test/val scripts for one trial under
    `exp_dir` (helper extracted from prepare_configs_and_scripts, whose two
    branches previously duplicated this entire body).

    :param exp_dir: experiment directory holding the data and trial dirs
    :param base_config: the loaded base config dict (copied, not mutated)
    :param trial: the trial index
    """
    trial_dir = exp_dir / f"trial-{trial}"
    IOUtils.mk_dir(trial_dir)
    output_file = trial_dir / "output.txt"
    # Shallow copy suffices: only top-level keys are overridden below.
    config = copy.copy(base_config)
    config["data_dir"] = str(exp_dir)
    config["model_dir"] = str(trial_dir / "model")
    config["output"] = str(output_file)
    config_file = trial_dir / "config.json"
    IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)
    # Training script; "$1" is the GPU id supplied at invocation time.
    train_script_file = trial_dir / "train.sh"
    train_script = f"#!/bin/bash\n" \
                   f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                   f"conda activate {self.CONDA_ENV}\n" \
                   f"module load cuda/10.0 cudnn/7.6.2\n" \
                   f"cd {self.code_dir}/translate\n" \
                   f"python3 __main__.py {config_file} --train -v --gpu-id $1 &> {trial_dir}/log-train.txt\n"
    IOUtils.dump(train_script_file, train_script, IOUtils.Format.txt)
    BashUtils.run(f"chmod +x {train_script_file}", expected_return_code=0)
    # Test script: runs evaluation over the test split files.
    test_script_file = trial_dir / "test.sh"
    test_script = f"#!/bin/bash\n" \
                  f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                  f"conda activate {self.CONDA_ENV}\n" \
                  f"module load cuda/10.0 cudnn/7.6.2\n" \
                  f"cd {self.code_dir}/translate\n" \
                  f"python3 __main__.py {config_file} --eval {exp_dir}/test/test.token.code {exp_dir}/test/test.token.sbt {exp_dir}/test/test.token.nl &> {trial_dir}/log-test.txt"
    IOUtils.dump(test_script_file, test_script, IOUtils.Format.txt)
    BashUtils.run(f"chmod +x {test_script_file}", expected_return_code=0)
    # Scoring script: BLEU of the generated output against references.
    eval_script_file = trial_dir / "val.sh"
    eval_script = f"#!/bin/bash\n" \
                  f"source {TACCRunnerConsts.conda_init_path[TACCRunnerConsts.get_cur_cluster()]}\n" \
                  f"conda activate {self.CONDA_ENV}\n" \
                  f"module load cuda/10.0 cudnn/7.6.2\n" \
                  f"cd {self.code_dir}/translate\n" \
                  f"python3 Bleu.py {exp_dir}/test/test.token.nl {trial_dir}/output.txt {trial_dir}\n"
    IOUtils.dump(eval_script_file, eval_script, IOUtils.Format.txt)
    BashUtils.run(f"chmod +x {eval_script_file}", expected_return_code=0)

def prepare_configs_and_scripts(self, trials: List[int]):
    """
    Generate configs and train/test/val scripts for every trial, either for
    each historical "<t><t+1>-train" experiment or for the "latest" one.

    :param trials: trial indexes to prepare (one sub-directory per trial)
    """
    base_config = IOUtils.load(self.base_config_file, IOUtils.Format.jsonPretty)
    if not self.use_latest:
        # One experiment directory per consecutive-year training window.
        exps = [f"{t}{t+1}-train" for t in range(13, 18)]
        for exp in exps:
            exp_dir = self.work_dir / exp
            for trial in trials:
                self._prepare_trial_configs_and_scripts(exp_dir, base_config, trial)
        # end for
    else:
        exp_dir = self.work_dir / "latest"
        for trial in trials:
            self._prepare_trial_configs_and_scripts(exp_dir, base_config, trial)
    return
def extract_data_from_corpus(
        cls,
        corpus_path: Path,
        trainevals: List[str],
        groups: List[str],
        output_path: Path,
):
    """
    Extract the lemmas/definitions subsets for each (group, traineval)
    combination from the corpus at `corpus_path` into `output_path`.

    :param corpus_path: root of the corpus managed by FilesManager
    :param trainevals: subset of Macros.DS_TRAINEVALS to extract
    :param groups: subset of Macros.DS_GROUPS (+ DS_GROUP_TA) to extract
    :param output_path: destination directory (created if missing; existing
        directory contents are overwritten; an existing *file* aborts)
    """
    # 1. Prepare output path
    if output_path.is_dir():
        cls.logger.warning(f"{output_path} already exists, will overwrite the files.")
    elif output_path.is_file():
        LoggingUtils.log_and_raise(cls.logger, f"{output_path} already exists as a file. Aborting.", Exception)
    else:
        IOUtils.mk_dir(output_path)
    # end if
    # Validate requested combinations against the known dataset constants.
    assert all([traineval in Macros.DS_TRAINEVALS for traineval in trainevals])
    assert all([group in Macros.DS_GROUPS + [Macros.DS_GROUP_TA] for group in groups])
    data_mgr = FilesManager(corpus_path)
    # 2. Load lemmas and definitions
    lemmas_filtered: List[Lemma] = data_mgr.load_data([FilesManager.LEMMAS_FILTERED], IOUtils.Format.json, is_batched=True, clz=Lemma)
    definitions: List[Definition] = data_mgr.load_data([FilesManager.DEFINITIONS, "definitions.json"], IOUtils.Format.json, clz=Definition)
    # 3. Output to output_path for each combination of traineval and group:
    # keep only the items whose data_index is in the split's index file.
    for traineval in trainevals:
        for group in groups:
            IOUtils.mk_dir(output_path / f"{group}-{traineval}")
            data_indexes = IOUtils.load(Macros.project_dir / "training" / f"{group}-{traineval}.json", IOUtils.Format.json)
            IOUtils.dump(output_path / f"{group}-{traineval}/lemmas.json", IOUtils.jsonfy([l for l in lemmas_filtered if l.data_index in data_indexes]), IOUtils.Format.json)
            IOUtils.dump(output_path / f"{group}-{traineval}/definitions.json", IOUtils.jsonfy([d for d in definitions if d.data_index in data_indexes]), IOUtils.Format.json)
        # end for
    # end for
    return
def extract_data_project(
        cls,
        project_path: Path,
        files: Optional[List[str]],
        exclude_files: Optional[List[str]],
        exclude_pattern: Optional[str],
        serapi_options: str,
        output_path: Path,
):
    """
    Extract lemmas and definitions from one Coq project by parsing each .v
    file with SerAPI (sercomp/sertok), and dump them as JSON to output_path.

    :param project_path: root of the Coq project to scan
    :param files: if given, only process these files (relative paths)
    :param exclude_files: if given, skip these files
    :param exclude_pattern: if given, skip files fully matching this regex
    :param serapi_options: extra options forwarded to sercomp/sertok
    :param output_path: destination directory (created if missing; existing
        directory contents are overwritten; an existing *file* aborts)
    """
    # 1. Prepare output path
    if output_path.is_dir():
        cls.logger.warning(f"{output_path} already exists, will overwrite the files.")
    elif output_path.is_file():
        LoggingUtils.log_and_raise(cls.logger, f"{output_path} already exists as a file. Aborting.", Exception)
    else:
        IOUtils.mk_dir(output_path)
    # end if
    # 2. Extract documents, tok.sexp and ast.sexp
    coq_documents: Dict[str, CoqDocument] = collections.OrderedDict()
    ast_sexp_lists: Dict[str, List[SexpNode]] = dict()
    tok_sexp_lists: Dict[str, List[SexpNode]] = dict()
    with IOUtils.cd(project_path):
        # `find` output ends with a trailing newline, hence the [:-1];
        # [2:] strips the leading "./" from each path.
        coq_files: List[str] = BashUtils.run(f"find -name '*.v' -type f").stdout.split("\n")[:-1]
        coq_files = [coq_file[2:] for coq_file in coq_files]
        if files is not None:
            coq_files = [f for f in coq_files if f in files]
        # end if
        if exclude_files is not None:
            coq_files = [f for f in coq_files if f not in exclude_files]
        # end if
        if exclude_pattern is not None:
            re_exclude_pattern = re.compile(exclude_pattern)
            coq_files = [f for f in coq_files if not re_exclude_pattern.fullmatch(f)]
        # end if
        for i, coq_file in enumerate(tqdm(coq_files)):
            try:
                # Read file (newline="" keeps original line endings so byte
                # offsets stay aligned with what SerAPI sees)
                with open(coq_file, "r", newline="") as f:
                    source_code = f.read()
                # end with
                # Get unicode offsets
                unicode_offsets = ParserUtils.get_unicode_offsets(source_code)
                # Call SerAPI
                ast_sexp_str: str = BashUtils.run(f"sercomp {serapi_options} --mode=sexp -- {coq_file}", expected_return_code=0).stdout
                tok_sexp_str: str = BashUtils.run(f"sertok {serapi_options} -- {coq_file}", expected_return_code=0).stdout
                # Parse ast sexp
                ast_sexp_list: List[SexpNode] = SexpParser.parse_list(ast_sexp_str)
                tok_sexp_list: List[SexpNode] = SexpParser.parse_list(tok_sexp_str)
                # Parse the document
                coq_document = CoqParser.parse_document(source_code, ast_sexp_list, tok_sexp_list, unicode_offsets=unicode_offsets)
                # Set meta data
                coq_document.file_name = coq_file
                coq_document.project_name = project_path.name
                coq_documents[coq_file] = coq_document
                ast_sexp_lists[coq_file] = ast_sexp_list
                tok_sexp_lists[coq_file] = tok_sexp_list
            except KeyboardInterrupt:
                cls.logger.warning("Keyboard interrupt!")
                raise
            # NOTE(review): deliberate best-effort — a file that fails to
            # parse is logged and skipped rather than aborting the project.
            except:
                cls.logger.warning(f"File {coq_file} failed! Exception was: {traceback.format_exc()}")
                continue
            # end try
        # end for
        # 3. Extract and save lemmas and definitions
        lemmas: List[Lemma] = list()
        definitions: List[Definition] = list()
        # Increase recursion limit because the backend sexps are CRAZZZZY deep
        sys.setrecursionlimit(10000)
        for file_path, doc in tqdm(coq_documents.items()):
            ast_sexp_list = ast_sexp_lists[file_path]
            lemmas_doc = cls.collect_lemmas_doc(doc, ast_sexp_list, serapi_options)
            lemmas.extend(lemmas_doc)
            definitions_doc = cls.collect_definitions_doc(doc, ast_sexp_list)
            definitions.extend(definitions_doc)
        # end for
        IOUtils.dump(output_path / "lemmas.json", IOUtils.jsonfy(lemmas), IOUtils.Format.json)
        IOUtils.dump(output_path / "definitions.json", IOUtils.jsonfy(definitions), IOUtils.Format.json)
    # end with
    return
def process_shared(self, output_dir: Path, years: List[str], eval_settings: List[str], task: str = "CG"):
    """
    Extracts the train/val/test method-data for all eval_setting/year.
    This is a shared step for the processing for all models, so do this first (and once).
    1. split the data into train/val/test
    2. extract for every setting

    :param output_dir: root output directory; results go to <output_dir>/<task>-shared
    :param years: years to extract (strings, converted to int per iteration)
    :param eval_settings: subset of {"evo", "crossproj-evo", "crossproj", "mixedproj"}
    :param task: the task name ("CG" or "MN"), used in file/dir names
    """
    shared_data_dir = output_dir / f"{task}-shared"
    IOUtils.mk_dir(shared_data_dir)
    # Load project list
    projects = IOUtils.load(Macros.data_dir / f"projects-github-{task}-100.json")
    # Load data
    projects_2_data_list: Dict[str, List] = dict()
    for proj in tqdm(projects):
        # split data split method in the projects, create 19-20-methods-train.json and latest-methods-val.json files
        ds = DataSpliter()
        ds.project_data_split(proj, task)
        method_data_list = IOUtils.load(Macros.repos_results_dir / proj / "collector" / "method-data.json")
        projects_2_data_list[proj] = method_data_list
    # split data across projects (deterministic shuffle via the fixed seed)
    num_proj = len(projects)
    random.seed(Environment.random_seed)
    random.Random(Environment.random_seed).shuffle(projects)
    train_index = round(num_proj * self.TRAIN_RATIO)
    valid_index = train_index + round(num_proj * self.VAL_RATIO)
    train_projs = projects[:train_index]
    valid_projs = projects[train_index:valid_index]
    test_projs = projects[valid_index:]
    project_split = {
        "train": train_projs,
        "val": valid_projs,
        "test": test_projs
    }
    #project_split = IOUtils.load(Macros.data_dir/f"projects-split-{task}-100.json")
    IOUtils.dump(Macros.data_dir / f"projects-split-{task}-100.json", project_split, IOUtils.Format.jsonNoSort)
    assert len(project_split["test"]) > len(project_split["val"])
    data_type_2_project_list: Dict[str, List] = {
        Macros.train: project_split["train"],
        Macros.val: project_split["val"],
        Macros.test: project_split["test"],
    }
    for year in years:
        data_type_2_data_list: Dict[str, List] = dict()
        year = int(year)
        # test_common: D_test(P_test, year-1, year)
        data_type_2_data_list[f"{year}-{Macros.test_common}"] = list()
        for proj in tqdm(data_type_2_project_list[Macros.test]):
            filter_indexes_file = Macros.repos_results_dir / proj / "collector" / f"19-20-methods-{task}-test.json"
            filter_indexes = IOUtils.load(filter_indexes_file)
            data_type_2_data_list[f"{year}-{Macros.test_common}"] += [projects_2_data_list[proj][i] for i in filter_indexes]
        for eval_setting in eval_settings:
            if eval_setting == "evo":
                # train: D(P, year-3, year-2)
                # val: D(P, year-2, year-1)
                # test_standard: D(P, year-1, year)
                data_type_2_data_list[f"{eval_setting}-{year}-{Macros.train}"] = list()
                data_type_2_data_list[f"{eval_setting}-{year}-{Macros.val}"] = list()
                data_type_2_data_list[f"{eval_setting}-{year}-{Macros.test_standard}"] = list()
                for proj in tqdm(projects):
                    all_filter_indexes_file = Macros.repos_results_dir / proj / "collector" / f"method-project-{task}-filtered.json"
                    all_filter_indexes = IOUtils.load(all_filter_indexes_file)
                    # Training window spans two consecutive year slices.
                    train_filter_indexes = [af["method_ids"] for af in all_filter_indexes if af["time"] == f"{year-3}_Jan_1-{year-2}_Jan_1"][0]
                    train_filter_indexes += [af["method_ids"] for af in all_filter_indexes if af["time"] == f"{year-4}_Jan_1-{year-3}_Jan_1"][0]
                    val_filter_indexes = [af["method_ids"] for af in all_filter_indexes if af["time"] == f"{year-2}_Jan_1-{year-1}_Jan_1"][0]
                    test_standard_filter_indexes = [af["method_ids"] for af in all_filter_indexes if af["time"] == f"{year-1}_Jan_1-{year}_Jan_1"][0]
                    proj_data_list = projects_2_data_list[proj]
                    data_type_2_data_list[f"{eval_setting}-{year}-{Macros.train}"] += [proj_data_list[i] for i in train_filter_indexes]
                    data_type_2_data_list[f"{eval_setting}-{year}-{Macros.val}"] += [proj_data_list[i] for i in val_filter_indexes]
                    data_type_2_data_list[f"{eval_setting}-{year}-{Macros.test_standard}"] += [proj_data_list[i] for i in test_standard_filter_indexes]
            elif eval_setting == "crossproj-evo":
                # train: D(P_train, year-3, year-2)
                # val: D(P_val, year-2, year-1)
                # test: D(P_test, year-1, year)
                data_type_2_data_list[f"{eval_setting}-{year}-{Macros.train}"] = list()
                data_type_2_data_list[f"{eval_setting}-{year}-{Macros.val}"] = list()
                data_type_2_data_list[f"{eval_setting}-{year}-{Macros.test_standard}"] = list()
                for data_type_tvt, project_list in data_type_2_project_list.items():
                    if data_type_tvt == Macros.test:
                        # NOTE(review): this aliases (not copies) the test_common
                        # list built above — both entries are dumped identically.
                        data_type_2_data_list[f"{eval_setting}-{year}-{Macros.test_standard}"] = data_type_2_data_list[f"{year}-{Macros.test_common}"]
                    else:
                        for proj in tqdm(project_list):
                            all_filter_indexes_file = Macros.repos_results_dir / proj / "collector" / f"method-project-{task}-filtered.json"
                            all_filter_indexes = IOUtils.load(all_filter_indexes_file)
                            proj_data_list = projects_2_data_list[proj]
                            if data_type_tvt == Macros.train:
                                train_filter_indexes = [af["method_ids"] for af in all_filter_indexes if af["time"] == f"{year - 3}_Jan_1-{year - 2}_Jan_1"][0]
                                train_filter_indexes += [af["method_ids"] for af in all_filter_indexes if af["time"] == f"{year - 4}_Jan_1-{year - 3}_Jan_1"][0]
                                data_type_2_data_list[f"{eval_setting}-{year}-{Macros.train}"] += [proj_data_list[i] for i in train_filter_indexes]
                            elif data_type_tvt == Macros.val:
                                val_filter_indexes = [af["method_ids"] for af in all_filter_indexes if af["time"] == f"{year - 2}_Jan_1-{year - 1}_Jan_1"][0]
                                data_type_2_data_list[f"{eval_setting}-{year}-{Macros.val}"] += [proj_data_list[i] for i in val_filter_indexes]
            elif eval_setting == "crossproj":
                # train: D(P_train, year)
                # val: D(P_val, year)
                # test_standard: D(P_test, year)
                data_type_2_data_list[f"{eval_setting}-{year}-{Macros.train}"] = list()
                data_type_2_data_list[f"{eval_setting}-{year}-{Macros.val}"] = list()
                data_type_2_data_list[f"{eval_setting}-{year}-{Macros.test_standard}"] = list()
                for data_type_tvt, project_list in data_type_2_project_list.items():
                    # The test project split maps to the "test_standard" key.
                    data_type = data_type_tvt if data_type_tvt != Macros.test else Macros.test_standard
                    for proj in tqdm(project_list):
                        # "Latest" data uses all three per-project index files.
                        latest_filter_indexes = list()
                        for t in [Macros.train, Macros.val, Macros.test]:
                            filter_indexes_file = Macros.repos_results_dir / proj / "collector" / f"latest-methods-{task}-{t}.json"
                            latest_filter_indexes += IOUtils.load(filter_indexes_file)
                        data_type_2_data_list[f"{eval_setting}-{year}-{data_type}"] += [projects_2_data_list[proj][i] for i in latest_filter_indexes]
            elif eval_setting == "mixedproj":
                # train: D_train(P, year)
                # val: D_val(P, year)
                # test_standard: D_test(P, year)
                data_type_2_data_list[f"{eval_setting}-{year}-{Macros.train}"] = list()
                data_type_2_data_list[f"{eval_setting}-{year}-{Macros.val}"] = list()
                data_type_2_data_list[f"{eval_setting}-{year}-{Macros.test_standard}"] = list()
                for proj in tqdm(projects):
                    proj_data_list = projects_2_data_list[proj]
                    for data_type_tvt, data_type in zip([Macros.train, Macros.val, Macros.test], [Macros.train, Macros.val, Macros.test_standard]):
                        filter_indexes_file = Macros.repos_results_dir / proj / "collector" / f"latest-methods-{task}-{data_type_tvt}.json"
                        filter_indexes = IOUtils.load(filter_indexes_file)
                        data_type_2_data_list[f"{eval_setting}-{year}-{data_type}"] += [proj_data_list[i] for i in filter_indexes]
        # Dump every collected split for this year to the shared directory.
        for dt, data_list in data_type_2_data_list.items():
            IOUtils.dump(shared_data_dir / f"{dt}.json", data_list, IOUtils.Format.json)
    return
def process(self, model: str, output_dir: Path, task: str, year: int, eval_setting: str):
    """
    Main entry for processors of different models.
    :param model: the model name, one of {"DeepCom", "ast-attendgru"}
    :param output_dir: the output directory (usually data/models)
    :param task: the task name, either "CG" or "MN"
    :param year: the year that the testing data should be on
    :param eval_setting: the evaluation setting, one of {"evo", "crossproj", "mixedproj"}
    """
    assert year == self.EVO_YEARS[-1]  # TODO: Only support the latest year for now
    assert task in self.TASKS.keys()
    model_data_dir = output_dir / model
    # Select the processor; imports are local so only the chosen model's
    # dependencies are loaded.
    if model == "DeepCom":
        from csevo.processor.DeepComProcessor import DeepComProcessor
        processor = DeepComProcessor()
    elif model == "DeepCom-Preorder":
        from csevo.processor.DeepComProcessor import DeepComProcessor
        processor = DeepComProcessor()
    elif model == "Bi-LSTM":
        from csevo.processor.BiLSTMProcessor import BiLSTMProcessor
        processor = BiLSTMProcessor()
    elif model == "no-split-Bi-LSTM":
        from csevo.processor.BiLSTMProcessor import BiLSTMProcessor
        processor = BiLSTMProcessor()
    elif model == "Transformer":
        # Transformer handles its own data loading; process and return early.
        from csevo.processor.TransformerProcessor import TransformerProcessor
        processor = TransformerProcessor()
        data_prefix = f"{eval_setting}-{year}"
        processor.process_data(model_data_dir, data_prefix)
        return
    elif model == "ASTAttendGRU":
        # NOTE(review): this processor is constructed but no dispatch branch
        # below calls it, so only the output directories get created — confirm
        # whether a process_data call is missing.
        from csevo.processor.ASTAttendGRUProcessor import ASTAttendGRUProcessor
        processor = ASTAttendGRUProcessor()
    elif model == "Code2Seq":
        from csevo.processor.Code2SeqProcessor import Code2SeqProcessor
        processor = Code2SeqProcessor()
    else:
        raise ValueError(f"Illegal model {model}")
    # end if
    error_ids = None
    # Load dataset after split (from shared directory)
    shared_data_dir = output_dir / f"{task}-shared"
    self.logger.info(f"Loading dataset from {shared_data_dir}")
    data_type_2_data_list: Dict[str, List] = dict()
    data_type_2_data_list[Macros.test_common] = IOUtils.load(shared_data_dir / f"{year}-{Macros.test_common}.json", IOUtils.Format.json)
    for dt in [Macros.train, Macros.val, Macros.test_standard]:
        data_type_2_data_list[dt] = IOUtils.load(shared_data_dir / f"{eval_setting}-{year}-{dt}.json", IOUtils.Format.json)
    # Process each set
    for data_type, data_list in data_type_2_data_list.items():
        sub_dir_name = f"{eval_setting}-{year}-{data_type}"
        # Both test splits map to the processors' "test" mode.
        if data_type in [Macros.test_common, Macros.test_standard]:
            data_type_tvt = Macros.test
        else:
            data_type_tvt = data_type
        model_dt_output_dir = model_data_dir / sub_dir_name
        IOUtils.mk_dir(model_dt_output_dir)
        if model == "DeepCom":
            error_ids = processor.process_data(data_list, data_type_tvt, model_dt_output_dir, "sbt")
        elif model == "DeepCom-Preorder":
            error_ids = processor.process_data(data_list, data_type_tvt, model_dt_output_dir, "Preorder")
        elif model == "Code2Seq":
            error_ids = processor.process_data(data_list, data_type_tvt, model_dt_output_dir)
        elif model == "Bi-LSTM":
            processor.process_data(data_list, data_type_tvt, model_dt_output_dir)
        elif model == "no-split-Bi-LSTM":
            processor.process_data(data_list, data_type_tvt, model_dt_output_dir, split=False)
        # Record indexes of data items that failed processing, if any.
        if error_ids is not None:
            self.logger.warning(f"Error data count: {len(error_ids)}")
            IOUtils.dump(model_data_dir / f"error-ids-{sub_dir_name}.json", error_ids, IOUtils.Format.json)
    # extra step for Open-NMT data
    if model == "Bi-LSTM" or model == "no-split-Bi-LSTM":
        # build dataset used by Open-NMT (the split f-strings concatenate
        # into a single "--src_seq_length_trunc 200" option)
        BashUtils.run(
            f"onmt_preprocess -train_src {model_data_dir}/{eval_setting}-{year}-{Macros.train}/src-train.txt "
            f"-train_tgt {model_data_dir}/{eval_setting}-{year}-{Macros.train}/tgt-train.txt "
            f"-valid_src {model_data_dir}/{eval_setting}-{year}-{Macros.val}/src-val.txt "
            f"-valid_tgt {model_data_dir}/{eval_setting}-{year}-{Macros.val}/tgt-val.txt "
            f"-save_data {model_data_dir}/{eval_setting}-{year}-{Macros.train}/biLSTM --src_seq_length 200 --src_seq_"
            f"length_trunc 200", expected_return_code=0)
    return
def process_data(self, method_data_list, data_type, output_dir, traversal) -> List[int]:
    """
    Process raw method data into DeepCom's input format, in parallel.

    Spawns Macros.multi_processing worker processes, each handling an even
    slice of method_data_list and writing into output_dir/<pid>/; then merges
    the per-worker token files into output_dir/<data_type>/ and, for the
    training split only, writes the vocab.{code,nl,sbt} files.

    :param method_data_list: list of raw method data entries to process
    :param data_type: one of Macros.train/val/test or "debug"
    :param output_dir: directory that receives the merged outputs
    :param traversal: AST traversal mode forwarded to process_data_mp
    :return: sorted list of ids of entries that failed processing
    """
    self.logger.info(f"Start processing")
    # Use DeepCom's required names
    data_type = {
        Macros.train: "train",
        Macros.val: "valid",
        Macros.test: "test",
        "debug": "debug",
    }[data_type]
    # Initialize vocab, error_ids (shared between processes)
    manager = multiprocessing.Manager()
    code_vocab = manager.dict()
    nl_vocab = manager.dict()
    sbt_vocab = manager.dict()
    vocabs_lock = manager.Lock()
    error_ids = manager.list()
    error_ids_lock = manager.Lock()
    # Multi-processing, split the tasks evenly
    tasks_each_process = len(method_data_list) // Macros.multi_processing + 1
    processes = list()
    for pid in range(Macros.multi_processing):
        beg = pid * tasks_each_process
        method_data_list_p = method_data_list[beg:beg + tasks_each_process]
        output_dir_p = output_dir / str(pid)
        IOUtils.mk_dir(output_dir_p)
        process = multiprocessing.Process(
            target=self.process_data_mp,
            args=(method_data_list_p, data_type, output_dir_p, pid, beg,
                  code_vocab, nl_vocab, sbt_vocab, vocabs_lock,
                  error_ids, error_ids_lock, traversal))
        process.start()
        processes.append(process)
    # end for
    for process in processes:
        process.join()
    # end for
    # Merge per-worker results into one file per kind, then clean up
    code_file_name = data_type + ".token.code"
    nl_file_name = data_type + ".token.nl"
    sbt_file_name = data_type + ".token.sbt"
    data_type_output_dir = output_dir / data_type
    IOUtils.mk_dir(data_type_output_dir)
    for pid in range(Macros.multi_processing):
        for fname in [code_file_name, nl_file_name, sbt_file_name]:
            BashUtils.run(
                f"cat {output_dir}/{pid}/{fname} >> {data_type_output_dir}/{fname}"
            )
        # end for
        IOUtils.rm_dir(output_dir / str(pid))
    # end for
    error_ids.sort()
    # Build vocab, only from the training split
    if data_type == "train":
        special_tokens = [
            '<S>', '</S>', '<UNK>', '<KEEP>', '<DEL>', '<INS>', '<SUB>', '<NONE>'
        ]
        # NOTE(review): the original comment said "filter based on frequency",
        # but this keeps the first MAX_VOCAB keys in insertion order, not the
        # most frequent ones — behavior preserved as-is; confirm intent.
        # Context managers guarantee the files are closed even on error
        # (the original opened three files and closed them manually).
        for vocab_file_name, vocab in [("vocab.code", code_vocab),
                                       ("vocab.nl", nl_vocab),
                                       ("vocab.sbt", sbt_vocab)]:
            vocabs_list = special_tokens + list(vocab.keys())[:self.MAX_VOCAB]
            with open(output_dir / vocab_file_name, "w") as f:
                for v in vocabs_list:
                    f.write(v + "\n")
        # end for
    # end if
    return list(error_ids)
def make_plot_draft_learning_curve(self, training_log_path: Path, output_name: str, ):
    """
    Plot draft learning curves (training loss + eval F1 vs. batch) as a 2x2
    grid of subplots, one per training/testing setting combination, and save
    the figure as <output_name>.eps under plots_dir/draft-learning-curve.

    All four subplots share the same axis ranges, computed over all settings
    in a first pass. The left y-axis (loss) uses a log scale; the right
    y-axis (F1) is linear.

    :param training_log_path: root dir containing <tvt>/trial-0/logs/train_log.json
    :param output_name: base name (without extension) of the saved figure
    """
    special_plots_dir = self.plots_dir / "draft-learning-curve"
    IOUtils.mk_dir(special_plots_dir)
    fig: plt.Figure = plt.figure(figsize=(12,9))
    # TODO: these metrics may be specific to Code2Seq only
    x_field = "batch"
    yl_field = "training_loss"
    yr_field = "eval F1"
    # NOTE(review): x_min is anchored at 0 while the other bounds start at
    # +/-Inf — presumably intentional (x-axis should include batch 0); confirm.
    x_min = 0
    x_max = -np.Inf
    yl_min = np.Inf
    yl_max = -np.Inf
    yr_min = np.Inf
    yr_max = -np.Inf
    # First, get ranges for all metrics (we want to use same ranges in all subplots)
    tvt_2_training_log = dict()
    tvt_2_x = dict()
    tvt_2_yl = dict()
    tvt_2_yr = dict()
    for tvt in [Macros.lat_lat, Macros.evo_lat, Macros.lat_evo, Macros.evo_evo]:
        # TODO: this path is hardcoded and work for Code2Seq 1 trial
        training_log = IOUtils.load(training_log_path / tvt / "trial-0" / "logs" / "train_log.json", IOUtils.Format.json)
        x = [d[x_field] for d in training_log]
        yl = [d[yl_field] for d in training_log]
        yr = [d[yr_field] for d in training_log]
        tvt_2_training_log[tvt] = training_log
        tvt_2_x[tvt] = x
        tvt_2_yl[tvt] = yl
        tvt_2_yr[tvt] = yr
        x_min = min(x_min, min(x))
        x_max = max(x_max, max(x))
        yl_min = min(yl_min, min(yl))
        yl_max = max(yl_max, max(yl))
        yr_min = min(yr_min, min(yr))
        yr_max = max(yr_max, max(yr))
    # end for
    # Pad each range by 1/30 of its span; the loss range is padded in log
    # space to match its log-scale axis.
    x_lim = (x_min - (x_max - x_min) / 30, x_max + (x_max - x_min) / 30)
    yl_lim = (np.exp(np.log(yl_min) - (np.log(yl_max) - np.log(yl_min)) / 30),
              np.exp(np.log(yl_max) + (np.log(yl_max) - np.log(yl_min)) / 30))
    yr_lim = (yr_min - (yr_max - yr_min) / 30, yr_max + (yr_max - yr_min) / 30)
    # One subplot per (training setting, testing setting) pair, laid out
    # row-major in a 2x2 grid.
    for t_i, t in enumerate([Macros.lat, Macros.evo]):
        for vt_i, vt in enumerate([Macros.lat, Macros.evo]):
            tvt = f"{t}-{vt}"
            tvt_i = (t_i)*2+(vt_i)+1
            x = tvt_2_x[tvt]
            yl = tvt_2_yl[tvt]
            yr = tvt_2_yr[tvt]
            # Twin axes: loss on the left (red, log scale), F1 on the right (blue).
            axl: plt.Axes = fig.add_subplot(2, 2, tvt_i)
            axr = axl.twinx()
            colorl = "tab:red"
            colorr = "tab:blue"
            axl.plot(x, yl, color=colorl)
            axr.plot(x, yr, color=colorr)
            axl.set_xlabel(x_field)
            axl.set_xlim(x_lim[0], x_lim[1])
            axl.set_ylabel(yl_field, color=colorl)
            axl.set_yscale("log")
            axl.set_ylim(yl_lim[0], yl_lim[1])
            axr.set_ylabel(yr_field, color=colorr)
            axr.set_ylim(yr_lim[0], yr_lim[1])
            axl.set_title(tvt)
        # end for
    # end for
    fig.tight_layout()
    with IOUtils.cd(special_plots_dir):
        fig.savefig(f"{output_name}.eps")
    # end with
    return
def collect_project(self, project_name: str, project_url: str):
    """
    Clone one project, collect its revision metadata, and run the Java
    collector on it.

    Side effects: clones into repos_downloads_dir/<project_name> (removed at
    the end), writes results (project.json, collector outputs, logs) into
    repos_results_dir/<project_name>, and on one specific host/user moves
    the results to a dedicated disk instead.

    :param project_name: name used for the local clone and results dirs
    :param project_url: git URL to clone from
    """
    Environment.require_collector()
    # 0. Download repo
    downloads_dir = self.repos_downloads_dir / project_name
    results_dir = self.repos_results_dir / project_name
    # Remove previous results if any
    IOUtils.rm_dir(results_dir)
    IOUtils.mk_dir(results_dir)
    # Clone the repo if not exists; the clone is time-limited to 300s so a
    # hung clone cannot block the whole collection run.
    if not downloads_dir.exists():
        with IOUtils.cd(self.repos_downloads_dir):
            with TimeUtils.time_limit(300):
                BashUtils.run(f"git clone {project_url} {project_name}", expected_return_code=0)
            # end with
        # end with
    # end if
    project_data = ProjectData.create()
    project_data.name = project_name
    project_data.url = project_url
    # 1. Get list of revisions (newest first, capped at MAX_REVISIONS),
    # recording each revision's parent shas.
    with IOUtils.cd(downloads_dir):
        git_log_out = BashUtils.run(f"git log --pretty=format:'%H %P'", expected_return_code=0).stdout
        for line in git_log_out.splitlines()[:self.MAX_REVISIONS]:
            shas = line.split()
            project_data.revisions.append(shas[0])
            project_data.parent_revisions[shas[0]] = shas[1:]
        # end for
    # end with
    # 2. Get revisions in different year (the last revision before Jan 1 of
    # each year in self.YEARS)
    with IOUtils.cd(downloads_dir):
        for year in self.YEARS:
            git_log_out = BashUtils.run(
                f"git rev-list -1 --before=\"Jan 1 {year}\" origin",
                expected_return_code=0).stdout
            project_data.year_revisions[str(year) + "_Jan_1"] = git_log_out.rstrip()
        # end for
    # end with
    project_data_file = results_dir / "project.json"
    IOUtils.dump(project_data_file, IOUtils.jsonfy(project_data), IOUtils.Format.jsonPretty)
    # 2. Start java collector
    # Prepare config
    log_file = results_dir / "collector-log.txt"
    output_dir = results_dir / "collector"
    config = {
        "collect": True,
        "projectDir": str(downloads_dir),
        "projectDataFile": str(project_data_file),
        "logFile": str(log_file),
        "outputDir": str(output_dir),
        "year": True  # To indicate whether to collect all evo data or yearly data
    }
    config_file = results_dir / "collector-config.json"
    IOUtils.dump(config_file, config, IOUtils.Format.jsonPretty)
    self.logger.info(f"Starting the Java collector. Check log at {log_file} and outputs at {output_dir}")
    rr = BashUtils.run(
        f"java -jar {Environment.collector_jar} {config_file}",
        expected_return_code=0)
    if rr.stderr:
        self.logger.warning(f"Stderr of collector:\n{rr.stderr}")
    # end if
    # 3. In some cases, save collected data to appropriate location or database
    # TODO private info
    # On luzhou server for user pynie, move it to a dedicated location at /user/disk2
    if BashUtils.run(f"hostname").stdout.strip() == "luzhou" and BashUtils.run(f"echo $USER").stdout.strip() == "pynie":
        alter_results_dir = Path("/home/disk2/pynie/csevo-results") / project_name
        IOUtils.rm_dir(alter_results_dir)
        IOUtils.mk_dir(alter_results_dir.parent)
        BashUtils.run(f"mv {results_dir} {alter_results_dir}")
        self.logger.info(f"Results moved to {alter_results_dir}")
    # end if
    # -1. Remove repo
    IOUtils.rm_dir(downloads_dir)
    return
def split_dataset(
        cls,
        assignments_path: Path,
        output_dir: Path,
        seed,
        use_new_sub_tokenizer: bool,
):
    """
    Split the assignments dataset into train/val/test with an 8:1:1 ratio.

    The data is shuffled (with `seed`) before splitting; test entries whose
    token-level content also appears in train or val are removed; then all
    entries are sub-tokenized in place. Writes train/val/test.json and
    statistics.json to output_dir.

    :param assignments_path: path to the flattened assignments dataset
    :param output_dir: directory receiving the split files (created if needed)
    :param seed: shuffle seed forwarded to cls.shuffle_data
    :param use_new_sub_tokenizer: forwarded to cls.sub_tokenize_data
    """

    def data_key(data):
        # Token-level identity of one assignment: LHS, RHS, and all
        # MAX_PA_IN_MODEL prior-assignment fields. Shared by the duplicate
        # detection below (same expression was previously duplicated inline).
        return hash((tuple(data["l"]), tuple(data["r"]),
                     tuple(tuple(data[f"pa{pa_i + 1}"])
                           for pa_i in range(Macros.MAX_PA_IN_MODEL))))

    # Load the assignments dataset, as a flattened list
    data_list = cls.load_data_list(assignments_path)
    # Shuffle the data before splitting
    data_list = cls.shuffle_data(data_list, seed)
    # Split the data with 8:1:1 ratio
    split_index = len(data_list) // 10
    val_data_list = data_list[:split_index]
    test_data_list = data_list[split_index:2 * split_index]
    train_data_list = data_list[2 * split_index:]
    # Remove the data in the testing set that appeared in train/val
    seen_data_in_train_val = {data_key(data) for data in train_data_list + val_data_list}
    test_duplicate_indexes = [
        i for i, data in enumerate(test_data_list)
        if data_key(data) in seen_data_in_train_val
    ]
    # Delete from the back so earlier indexes stay valid
    for i in reversed(test_duplicate_indexes):
        del test_data_list[i]
    # end for
    # Sub-tokenize; this is after recording duplicates, as we
    # detect duplicates on token level
    for data in train_data_list + val_data_list + test_data_list:
        cls.sub_tokenize_data(data, use_new_sub_tokenizer)
    # end for
    # Collect statistics
    statistics = {
        "num-data": len(data_list),
        "num-data-train": len(train_data_list),
        "num-data-val": len(val_data_list),
        "num-data-test": len(test_data_list),
        "num-test-duplicate": len(test_duplicate_indexes),
    }
    # Save dataset after splitting; the tokens for each field of
    # data are joined to a single string separated by space
    IOUtils.mk_dir(output_dir)
    cls.dump_data_list(output_dir / "train.json", train_data_list)
    cls.dump_data_list(output_dir / "val.json", val_data_list)
    cls.dump_data_list(output_dir / "test.json", test_data_list)
    IOUtils.dump(output_dir / "statistics.json", statistics, IOUtils.Format.jsonNoSort)
    return
def __init__(self):
    """Set up the output directory for tables and record the metrics directory."""
    # LaTeX tables are written here; make sure the directory exists up front.
    self.tables_dir: Path = Macros.paper_dir / "tables"
    IOUtils.mk_dir(self.tables_dir)
    # Previously computed metrics are read from here.
    self.metrics_dir: Path = Macros.results_dir / "metrics"
def split_dataset_always_end(
        cls,
        assignments_path: Path,
        output_dir: Path,
        seed,
        use_new_sub_tokenizer: bool,
):
    """
    Split the dataset so that val/test examples are always drawn from the
    END of each file's assignment list, and train takes the remaining prefix.

    Files are shuffled with `seed`; test and val are then each filled to 10%
    of the data by repeatedly picking a random file and taking its
    next-from-the-end assignment. Writes train/val/test.json,
    statistics.json, and files.json to output_dir.

    Note: `use_new_sub_tokenizer` is accepted for signature compatibility
    with the sibling split methods but no sub-tokenization happens here.

    :param assignments_path: path to the flattened assignments dataset
    :param output_dir: directory receiving the split files (created if needed)
    :param seed: shuffle seed forwarded to cls.shuffle_data
    :param use_new_sub_tokenizer: unused (kept for interface compatibility)
    """
    data_list = cls.load_data_list(assignments_path)
    file_list = cls.shuffle_data(cls.extract_file_list(data_list), seed)
    file_list = [item for sublist in file_list for item in sublist]
    val_data_list = list()
    test_data_list = list()
    train_data_list = list()
    # Keep only files that actually have assignments
    files_to_ass = dict()
    for fsha in file_list:
        assignments = cls.extract_assignments_from([fsha], data_list)
        if len(assignments) > 0:
            files_to_ass[fsha] = assignments
    file_list = list(files_to_ass.keys())
    # files_to_ix[f] is the negative index of the next assignment to take
    # from the end of file f's list; -1 means nothing taken yet.
    files_to_ix = dict.fromkeys(file_list, -1)
    bound = int(len(data_list) * 0.1)

    def sample_from_ends(target_list):
        # Randomly pick files and take their next assignment from the end
        # until target_list reaches the 10% bound.
        # BUG FIX: np.random.randint's upper bound is EXCLUSIVE; the original
        # used len(file_list) - 1, which could never sample the last file.
        while len(target_list) < bound:
            fsha = file_list[np.random.randint(0, len(file_list))]
            ix = files_to_ix[fsha]
            assignments = files_to_ass[fsha]
            if len(assignments) >= -ix:
                target_list.append(assignments[ix])
                files_to_ix[fsha] = ix - 1

    sample_from_ends(test_data_list)
    sample_from_ends(val_data_list)
    # The untouched prefix of each file's assignments goes to train
    for fsha in file_list:
        assignments = files_to_ass[fsha]
        ix = files_to_ix[fsha]
        if ix == -1:
            ix = len(assignments)
        else:
            ix = ix + 1
        if len(assignments) >= -ix:
            train_data_list.extend(assignments[0:ix])
    statistics = {
        "num-data": len(data_list),
        "num-data-train": len(train_data_list),
        "num-data-val": len(val_data_list),
        "num-data-test": len(test_data_list),
        "num-files": len(file_list),
    }
    IOUtils.mk_dir(output_dir)
    cls.dump_data_list(output_dir / "train.json", train_data_list)
    cls.dump_data_list(output_dir / "val.json", val_data_list)
    cls.dump_data_list(output_dir / "test.json", test_data_list)
    IOUtils.dump(output_dir / "statistics.json", statistics, IOUtils.Format.jsonNoSort)
    IOUtils.dump(output_dir / "files.json", file_list, IOUtils.Format.jsonNoSort)
    return
def split_dataset_cross_project(
        cls,
        assignments_path: Path,
        output_dir: Path,
        seed,
        use_new_sub_tokenizer: bool,
):
    """
    Split the assignments dataset by PROJECT with a roughly 8:1:1 ratio.

    Each file (cksum) is assigned to exactly one project; projects are then
    drawn (in shuffled order) into test until >=10% of the data, into train
    until >=80%, and the remainder goes to val. Test entries whose token-level
    content appears in train/val are removed, then all entries are
    sub-tokenized. Writes train/val/test.json, per-split project lists, and
    statistics.json to output_dir.

    :param assignments_path: path to the flattened assignments dataset
    :param output_dir: directory receiving the split files (created if needed)
    :param seed: shuffle seed forwarded to cls.shuffle_data
    :param use_new_sub_tokenizer: forwarded to cls.sub_tokenize_data
    """

    def data_key(data):
        # Token-level identity of one assignment: LHS, RHS, and all
        # MAX_PA_IN_MODEL prior-assignment fields.
        return hash((tuple(data["l"]), tuple(data["r"]),
                     tuple(tuple(data[f"pa{pa_i + 1}"])
                           for pa_i in range(Macros.MAX_PA_IN_MODEL))))

    # Load the assignments dataset, as a flattened list
    data_list = cls.load_data_list(assignments_path)
    # Load the mapping from project name to file cksums
    proj_2_cksums: Dict[str, List[str]] = cls.load_proj_2_cksums()
    # Each file can only be assigned to one project; shuffle the project list and assign in order
    projs_shuffle = cls.shuffle_data(sorted(list(proj_2_cksums.keys())), seed)
    seen_cksum = set()
    for proj in projs_shuffle:
        remaining_cksums = [c for c in proj_2_cksums[proj] if c not in seen_cksum]
        # Remove the project if all files in it has been seen.
        # BUG FIX: the original deleted the entry and then unconditionally
        # read proj_2_cksums[proj] again, raising KeyError for any project
        # whose files were all already seen; skip to the next project instead.
        if len(remaining_cksums) == 0:
            del proj_2_cksums[proj]
            continue
        proj_2_cksums[proj] = remaining_cksums
        seen_cksum.update(remaining_cksums)
    # end for
    # Shuffle projects list once again as some projects may be removed due to no data
    projs_shuffle = cls.shuffle_data(sorted(list(proj_2_cksums.keys())), seed)
    # Split the data by project with roughly 8:1:1 ratio
    num_data = len(data_list)
    # First take test set until >= 10% data
    test_proj_list = list()
    test_data_list = list()
    while len(test_data_list) < 0.1 * num_data:
        proj = projs_shuffle.pop()
        test_proj_list.append(proj)
        test_data_list += [d for d in data_list if d["file_sha"][0] in proj_2_cksums[proj]]
    # end while
    test_data_list = cls.shuffle_data(test_data_list, seed)
    # Then take train set until >= 80% data
    train_proj_list = list()
    train_data_list = list()
    while len(train_data_list) < 0.8 * num_data:
        proj = projs_shuffle.pop()
        train_proj_list.append(proj)
        train_data_list += [d for d in data_list if d["file_sha"][0] in proj_2_cksums[proj]]
    # end while
    train_data_list = cls.shuffle_data(train_data_list, seed)
    # Remaining are assigned to val
    val_proj_list = projs_shuffle
    val_data_list = list()
    for proj in val_proj_list:
        val_data_list += [d for d in data_list if d["file_sha"][0] in proj_2_cksums[proj]]
    # end for
    val_data_list = cls.shuffle_data(val_data_list, seed)
    # Remove the data in the testing set that appeared in train/val
    seen_data_in_train_val = {data_key(data) for data in train_data_list + val_data_list}
    test_duplicate_indexes = [
        i for i, data in enumerate(test_data_list)
        if data_key(data) in seen_data_in_train_val
    ]
    # Delete from the back so earlier indexes stay valid
    for i in reversed(test_duplicate_indexes):
        del test_data_list[i]
    # end for
    # Sub-tokenize; this is after recording duplicates, as we detect duplicates on token level
    for data in train_data_list + val_data_list + test_data_list:
        cls.sub_tokenize_data(data, use_new_sub_tokenizer)
    # end for
    # Collect statistics
    statistics = {
        "num-data": len(data_list),
        "num-data-train": len(train_data_list),
        "num-data-val": len(val_data_list),
        "num-data-test": len(test_data_list),
        "num-proj": len(train_proj_list) + len(val_proj_list) + len(test_proj_list),
        "num-proj-train": len(train_proj_list),
        "num-proj-val": len(val_proj_list),
        "num-proj-test": len(test_proj_list),
        "num-test-duplicate": len(test_duplicate_indexes),
    }
    # Save dataset after splitting; the tokens for each field of data are joined to a single string separated by space
    IOUtils.mk_dir(output_dir)
    cls.dump_data_list(output_dir / "train.json", train_data_list)
    cls.dump_data_list(output_dir / "val.json", val_data_list)
    cls.dump_data_list(output_dir / "test.json", test_data_list)
    IOUtils.dump(output_dir / "train-proj-list.json", train_proj_list, IOUtils.Format.jsonPretty)
    IOUtils.dump(output_dir / "val-proj-list.json", val_proj_list, IOUtils.Format.jsonPretty)
    IOUtils.dump(output_dir / "test-proj-list.json", test_proj_list, IOUtils.Format.jsonPretty)
    IOUtils.dump(output_dir / "statistics.json", statistics, IOUtils.Format.jsonNoSort)
    return
def make_table_draft_model_results(
        self,
        results_path: Path,
        output_name: str,
):
    """
    Write a draft LaTeX table of model test results: one row per
    training-testing setting, one column per metric, saved as
    <output_name>.tex under tables_dir/draft-model-results.

    :param results_path: root dir containing <tvt>/test_results.json files
    :param output_name: base name (without extension) of the .tex file
    """
    special_tables_dir = self.tables_dir / "draft-model-results"
    IOUtils.mk_dir(special_tables_dir)
    tex_file = latex.File(special_tables_dir / f"{output_name}.tex")
    # Table preamble
    tex_file.append(r"\begin{table*}")
    tex_file.append(r"\begin{small}")
    tex_file.append(r"\begin{center}")
    tex_file.append(r"\caption{Model Results (Draft) from " + str(results_path).replace("_", r"\_") + "}")
    metrics = None
    for tvt in [Macros.lat_lat, Macros.evo_lat, Macros.lat_evo, Macros.evo_evo]:
        results = IOUtils.load(results_path / tvt / "test_results.json")
        # Flatten the nested Rouge scores into scalar Rouge{1,2,L}-F1 entries
        if "Rouge" in results:
            rouge = results.pop("Rouge")
            if rouge == 0:
                results["Rouge1-F1"] = 0
                results["Rouge2-F1"] = 0
                results["RougeL-F1"] = 0
            else:
                results["Rouge1-F1"] = rouge["rouge-1"]["f"]
                results["Rouge2-F1"] = rouge["rouge-2"]["f"]
                results["RougeL-F1"] = rouge["rouge-l"]["f"]
        # Emit the column header once the metric names are known
        # (first iteration only)
        if metrics is None:
            metrics = sorted(results.keys())
            tex_file.append(r"\begin{tabular}{l | " + "r" * len(metrics) + "}")
            tex_file.append(r"\toprule")
            tex_file.append("Training-Testing & " + " & ".join(metrics) + r"\\")
            tex_file.append(r"\midrule")
        # One row per training-testing setting
        tex_file.append(tvt)
        for metric in metrics:
            tex_file.append(f"& {results[metric]:.2f}")
        tex_file.append(r"\\")
    # Table footer
    tex_file.append(r"\bottomrule")
    tex_file.append(r"\end{tabular}")
    tex_file.append(r"\end{center}")
    tex_file.append(r"\end{small}")
    tex_file.append(r"\end{table*}")
    tex_file.save()
    return
def submit_script(cls,
                  cluster: str,
                  name: str,
                  log_path: Path,
                  script: str,
                  queue: str = None,
                  timeout: str = None,
                  require_conda: bool = True,
                  conda_env: str = None,
                  modules: List[str] = None,
                  ) -> int:
    """
    Submit a job to a TACC cluster via sbatch and return its job id.

    Builds an SBATCH batch script (single node, single task) that loads the
    requested modules, optionally activates a conda environment, cds to the
    project's python dir, and runs `script`; submits it, then saves a copy
    of the batch script as <log_path>/<job_id>.sh.

    :param cluster: TACC cluster key used to look up defaults in TACCRunnerConsts
    :param name: job name (SBATCH -J)
    :param log_path: directory for %j.stdout / %j.stderr files (created if needed)
    :param script: the shell command(s) to run inside the job
    :param queue: queue/partition name; defaults per-cluster from TACCRunnerConsts
    :param timeout: max run time (hh:mm:ss); defaults per-cluster
    :param require_conda: whether to source conda and activate an environment
    :param conda_env: conda environment name; defaults per-cluster
    :param modules: modules to load; defaults per-cluster
    :return: the submitted job's id, parsed from the sbatch output
    """
    # Get default values
    if modules is None:
        modules = TACCRunnerConsts.modules[cluster]
    # end if
    if queue is None:
        queue = TACCRunnerConsts.queue[cluster]
    # end if
    if timeout is None:
        timeout = TACCRunnerConsts.timeout[cluster]
    # end if
    if conda_env is None:
        conda_env = TACCRunnerConsts.conda_env[cluster]
    # end if
    # Prepare submit script
    IOUtils.mk_dir(log_path)
    # NOTE(review): the --mail-user address below looks like a redacted
    # placeholder — confirm it should be a real address before relying on
    # job mail notifications.
    s = f"""#!/bin/bash
#SBATCH -J {name} # Job name
#SBATCH -o {log_path}/%j.stdout # Name of stdout output file(%j expands to jobId)
#SBATCH -e {log_path}/%j.stderr # Name of stderr output file(%j expands to jobId)
#SBATCH -p {queue} # Queue name
#SBATCH -N 1 # Total number of nodes requested
#SBATCH -n 1 # Total number of mpi tasks requested
#SBATCH -t {timeout} # Max run time (hh:mm:ss)
#SBATCH [email protected]
#SBATCH --mail-type=ALL
# The next line is required if the user has more than one project
#SBATCH -A {TACCRunnerConsts.allocation} # Allocation name to charge job against

module reset
module unload python2
"""
    for m in modules:
        s += f"module load {m}\n"
    # end for
    s += f"""
module list
echo "START: $(date)"

# Launch serial code...
# Do not use ibrun or any other MPI launcher
"""
    # Conda setup is optional because some jobs run system python / binaries.
    if require_conda:
        s += f"""
unset PYTHONPATH
source {TACCRunnerConsts.conda_init_path[cluster]}
conda activate {conda_env}
"""
    s += f"""
cd {Macros.python_dir}
{script}
echo "END: $(date)"
"""
    # Submit the script
    submit_script = BashUtils.get_temp_file()
    IOUtils.dump(submit_script, s, IOUtils.Format.txt)
    receipt = BashUtils.run(f"sbatch {submit_script}", expected_return_code=0).stdout
    # Get job id as the last number in output
    job_id = int(receipt.splitlines()[-1].split()[-1])
    # Save the script at log_path as well
    BashUtils.run(f"mv {submit_script} {log_path}/{job_id}.sh")
    return job_id