def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', dest='inputs', type=str, nargs="+",
                        help="Input files (JSON) for SPR1 splits.")
    parser.add_argument('-o', dest='output_dir', type=str, required=True,
                        help="Output directory.")
    args = parser.parse_args(args)

    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)

    pd.options.display.float_format = '{:.2f}'.format
    for fname in args.inputs:
        log.info("Converting %s", fname)
        source_records = list(utils.load_json_data(fname))
        converted_records = (convert_record(r) for r in tqdm(source_records))
        stats = utils.EdgeProbingDatasetStats()
        converted_records = stats.passthrough(converted_records)
        target_fname = os.path.join(args.output_dir, os.path.basename(fname))
        utils.write_json_data(target_fname, converted_records)
        log.info("Wrote examples to %s", target_fname)
        log.info(stats.format())
def _vi_to_zalo():
    squad_dir = "squad_data"
    zalo_samples = []
    _id = 0
    for file_name in ["vi_train-v2.0.json", "vi_dev-v2.0.json"]:
        file_path = "{}/{}".format(squad_dir, file_name)
        samples = read_json_data(file_path)
        for sample in samples["data"]:
            title = sample["title"]
            for p in sample["paragraphs"]:
                context = p["context"]
                for qa in p["qas"]:
                    zalo_sample = {
                        "id": "squad-{}".format(_id),
                        "title": title,
                        "question": qa["question"],
                        "text": context,
                        "label": not qa["is_impossible"],
                    }
                    zalo_samples.append(zalo_sample)
                    _id += 1

    out_path = "qna_data/squad.json"
    write_json_data(out_path, zalo_samples)
    print("Write file {}".format(out_path))
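# read_json_data / write_json_data are shared helpers defined elsewhere in
# this codebase. A minimal sketch of what they are assumed to do follows; this
# is a hypothetical reference implementation, not the repo's actual one (the
# real helpers may differ in encoding, indent, or argument order).
import json

def read_json_data(file_path):
    # Load a whole JSON document from disk.
    with open(file_path, "r", encoding="utf-8") as fd:
        return json.load(fd)

def write_json_data(file_path, data):
    # Serialize `data` as JSON; ensure_ascii=False keeps Vietnamese text readable.
    with open(file_path, "w", encoding="utf-8") as fd:
        json.dump(data, fd, ensure_ascii=False)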
def convert_with_stats(source_records, target_fname, convert_fn):
    converted_records = (convert_fn(r) for r in tqdm(source_records))
    stats = utils.EdgeProbingDatasetStats()
    converted_records = stats.passthrough(converted_records)
    utils.write_json_data(target_fname, converted_records)
    log.info("Wrote examples to %s", target_fname)
    log.info(stats.format())
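# Example usage (hypothetical): the SPR1 main() above could be rewritten in
# terms of convert_with_stats, assuming utils.load_json_data yields records:
#
#     for fname in args.inputs:
#         source_records = list(utils.load_json_data(fname))
#         target_fname = os.path.join(args.output_dir, os.path.basename(fname))
#         convert_with_stats(source_records, target_fname, convert_record)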
def main(args):
    import argparse
    import pandas as pd

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ontonotes",
        type=str,
        required=True,
        help="Path to OntoNotes, e.g. /path/to/conll-formatted-ontonotes-5.0",
    )
    parser.add_argument(
        "--tasks",
        type=str,
        nargs="+",
        help="Tasks, one or more of {const, coref, ner, srl}.",
    )
    parser.add_argument(
        "--splits",
        type=str,
        nargs="+",
        default=["train", "development", "test", "conll-2012-test"],
        help="Splits, one or more of {train, development, test, conll-2012-test}.",
    )
    parser.add_argument("-o", dest="output_dir", type=str, default=".",
                        help="Output directory for JSON files.")
    args = parser.parse_args(args)

    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)
    pd.options.display.float_format = "{:.2f}".format

    # Load OntoNotes reader.
    ontonotes = Ontonotes()
    for split in args.splits:
        for task in args.tasks:
            source_path = os.path.join(args.ontonotes, "data", split)
            log.info("Reading OntoNotes split from %s", source_path)
            ontonotes_reader = ontonotes.dataset_iterator(file_path=source_path)

            log.info("Processing split '%s' for task '%s'", split, task)
            task_dir = os.path.join(args.output_dir, task)
            if not os.path.isdir(task_dir):
                os.mkdir(task_dir)
            target_fname = os.path.join(task_dir, f"{split}.json")
            ontonotes_stats = collections.Counter()
            converted_records = process_task_split(tqdm(ontonotes_reader),
                                                   task, ontonotes_stats)

            stats = utils.EdgeProbingDatasetStats()
            converted_records = stats.passthrough(converted_records)
            utils.write_json_data(target_fname, converted_records)
            log.info("Wrote examples to %s", target_fname)
            log.info(stats.format())
            log.info(str(pd.Series(ontonotes_stats, dtype=object)))
def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", dest="input_files", type=str, nargs="+",
                        help="Input file(s), e.g. en_ewt-ud-*.conllu")
    parser.add_argument(
        "-o",
        dest="output_dir",
        type=str,
        required=True,
        help="Output directory, e.g. /path/to/edges/data/ud_ewt",
    )
    args = parser.parse_args(args)

    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)

    for filename in args.input_files:
        with open(filename) as fd:
            records = convert_ud_file(fd)
        stats = utils.EdgeProbingDatasetStats()
        records = stats.passthrough(records)
        target_basename = os.path.basename(filename).replace(".conllu", ".json")
        target_fname = os.path.join(args.output_dir, target_basename)
        utils.write_json_data(target_fname, records)
        log.info("Wrote examples to %s", target_fname)
        log.info(stats.format())
    log.info("Done!")
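# Note on the stats.passthrough(...) pattern used by the converters above: it
# is assumed to wrap the record generator and tally counts as records stream
# through, so stats.format() is only meaningful after utils.write_json_data
# has consumed the generator. A minimal sketch of the assumed shape
# (hypothetical; the real EdgeProbingDatasetStats may differ):
#
#     def passthrough(self, records):
#         for record in records:
#             self.update(record)  # accumulate per-record counts
#             yield record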
def do_notes(in_dir):
    """
    Read track info from the in_dir folder and write it to the tracks.json file.

    :param in_dir: Input dir
    :return: None
    """
    # Create or read the tracks JSON.
    out_file = f"{in_dir}/tracks.json"
    if os.path.exists(out_file):
        out_json = read_json_data(out_file)
    else:
        out_json = {"instruments": {}}

    # We update this object in place.
    instruments_json = out_json["instruments"]

    # Get instrument info and update the tracks JSON.
    for path in glob.glob(f"{in_dir}/scores/*.mid"):
        # os.path.basename is portable; the original split on "\\" and only
        # worked on Windows.
        instrument_name = os.path.basename(path).replace(".mid", "")
        instrument_info = get_instrument_info(path)
        instruments_json[instrument_name] = instrument_info

    # Write the tracks JSON.
    write_json_data(out_file, out_json)
    print(f"Write file {out_file}")
def save_json(self, file_path):
    save_data = {
        "unk_id": self.unk_id,
        "max_sent_len": self.max_sent_len,
        "word2id": self.word2id,
        "id2word": self.id2word,
    }
    write_json_data(file_path, save_data)
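# Counterpart loader to save_json above. VocabEntry.from_json is referenced
# elsewhere in this codebase; a minimal sketch of what it is assumed to do
# (hypothetical: VocabEntry's construction details are assumptions):
#
#     @classmethod
#     def from_json(cls, file_path):
#         data = read_json_data(file_path)
#         vocab = cls.__new__(cls)
#         vocab.unk_id = data["unk_id"]
#         vocab.max_sent_len = data["max_sent_len"]
#         vocab.word2id = data["word2id"]
#         # JSON object keys are strings, so restore integer ids.
#         vocab.id2word = {int(k): v for k, v in data["id2word"].items()}
#         return vocab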
def _translate_to_qnli(dataset):
    # Produces e.g.: vi_train_questions.json, vi_train_sentences.json, vi_train.tsv
    src_folder = "glue_data/qnli/en"
    trg_folder = "glue_data/qnli/vi"
    tran_ids_to_questions_dict = _from_translate_to_json(
        dataset, src_folder, trg_folder, "questions")
    tran_ids_to_sentences_dict = _from_translate_to_json(
        dataset, src_folder, trg_folder, "sentences")

    # Build the translated table.
    table_path = "glue_data/qnli/{}_table.tsv".format(dataset)
    df_table = pd.read_csv(table_path, encoding="utf-8",
                           quoting=csv.QUOTE_NONE, sep="\t")
    tran_dict = {
        "index": [],
        "question": [],
        "sentence": [],
        "label": [],
    }
    for i in range(0, df_table.shape[0]):
        index_id = df_table["index"][i]
        label = df_table["label"][i]
        question = tran_ids_to_questions_dict[str(df_table["question"][i])]
        sentence = tran_ids_to_sentences_dict[str(df_table["sentence"][i])]

        # Remove the trailing question mark from the question.
        if question[-1] == "?":
            question = question[:-1].strip()

        tran_dict["index"].append(index_id)
        tran_dict["question"].append(question)
        tran_dict["sentence"].append(sentence)
        tran_dict["label"].append(label)

    tran_df = pd.DataFrame(tran_dict)
    tran_dataset_path = "glue_data/qnli/vi_{}.tsv".format(dataset)
    _write_tsv(tran_df, tran_dataset_path)
    print("Write file {}".format(tran_dataset_path))

    # Write the translated questions/sentences JSON files.
    tran_ids_to_questions_path = "glue_data/qnli/vi_{}_questions.json".format(dataset)
    tran_ids_to_sentences_path = "glue_data/qnli/vi_{}_sentences.json".format(dataset)
    write_json_data(tran_ids_to_questions_path, tran_ids_to_questions_dict)
    print("Write file {}".format(tran_ids_to_questions_path))
    write_json_data(tran_ids_to_sentences_path, tran_ids_to_sentences_dict)
    print("Write file {}".format(tran_ids_to_sentences_path))
def _build_data(dataset):
    # Produces e.g.: train_questions.json, train_sentences.json, train_table.tsv
    tsv_path = "glue_data/qnli/{}.tsv".format(dataset)
    table_path = "glue_data/qnli/{}_table.tsv".format(dataset)
    df = pd.read_csv(tsv_path, encoding="utf-8", quoting=csv.QUOTE_NONE, sep="\t")
    table = {
        "index": [],
        "question": [],
        "sentence": [],
        "label": [],
    }
    questions_to_id_dict = {}
    sentences_to_id_dict = {}
    curr_q_id = 0
    curr_s_id = 0
    for i in range(0, df.shape[0]):
        index = df["index"][i]
        question = df["question"][i]
        sentence = df["sentence"][i]
        label = df["label"][i]
        if question not in questions_to_id_dict:
            questions_to_id_dict[question] = curr_q_id
            curr_q_id += 1
        if sentence not in sentences_to_id_dict:
            sentences_to_id_dict[sentence] = curr_s_id
            curr_s_id += 1
        q_id = questions_to_id_dict[question]
        s_id = sentences_to_id_dict[sentence]
        table["index"].append(index)
        table["label"].append(label)
        table["question"].append(q_id)
        table["sentence"].append(s_id)

    # Invert the text->id maps to id->text for JSON output.
    id_to_qs_dict = {v: k for k, v in questions_to_id_dict.items()}
    id_to_ss_dict = {v: k for k, v in sentences_to_id_dict.items()}
    id_to_qs_path = "glue_data/qnli/{}_questions.json".format(dataset)
    id_to_ss_path = "glue_data/qnli/{}_sentences.json".format(dataset)
    write_json_data(id_to_qs_path, id_to_qs_dict)
    print("Write file {}".format(id_to_qs_path))
    write_json_data(id_to_ss_path, id_to_ss_dict)
    print("Write file {}".format(id_to_ss_path))

    table_df = pd.DataFrame(table)
    _write_tsv(table_df, table_path)
    print("Write file {}".format(table_path))
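# _write_tsv() is defined elsewhere. Given that the read side uses
# encoding="utf-8", quoting=csv.QUOTE_NONE and sep="\t", it is assumed to be
# roughly the following (hypothetical sketch):
#
#     def _write_tsv(df, path):
#         df.to_csv(path, encoding="utf-8", quoting=csv.QUOTE_NONE,
#                   sep="\t", index=False)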
def _to_zalo():
    split_folder = "glue_data/qnli/split_tsv"
    en_folder_zalo = "glue_data/qnli/en"
    for tsv_path in glob.glob("{}/*.tsv".format(split_folder)):
        df = pd.read_csv(tsv_path, encoding="utf-8", quoting=csv.QUOTE_NONE, sep="\t")
        json_examples = _tsv_to_zalo(df)
        # os.path.basename is portable; the original split on '\\' and only
        # worked on Windows.
        json_file_name = os.path.basename(tsv_path).replace(".tsv", ".json")
        json_path = "{}/{}".format(en_folder_zalo, json_file_name)
        write_json_data(json_path, json_examples)
        print("Write to file {}".format(json_path))
def preprocess_qna_data(self, method, bert_type, dataset_types):
    for dataset_type in dataset_types:
        data_file = "qna_data/en_{}.json".format(dataset_type)

        # Init feature columns.
        if self.for_train:
            features_columns = {
                "id": [],
                "question": [],
                "text": [],
                "label": [],
                "pid": [],
            }

        json_samples = read_json_data(data_file)
        for json_sample in json_samples:
            if self.for_train:
                features_columns["id"].append(json_sample["id"])
                features_columns["label"].append(1 if json_sample["label"] else 0)
                features_columns["pid"].append(json_sample["pid"])

            for key in ["question", "text"]:
                pre_key = "{}_{}_{}".format(method, bert_type, key)
                pre_text, tokens_id = self.pre_process_text(
                    json_sample[key], method, self.for_train)
                json_sample[pre_key] = pre_text
                if self.for_train:
                    features_columns[key].append(tokens_id)

        # Samples now carry the preprocessed keys.
        write_json_data(data_file, json_samples)
        print("{}. Length {}. Done write to file {}".format(
            dataset_type, len(json_samples), data_file))

        # Generate the featured dataset.
        if self.for_train:
            folder_name = "{}_{}".format(method, bert_type)
            self.write_features_columns(features_columns, folder_name, dataset_type)
def _summary(tsv_dir):
    summary_dict = {}
    for tsv_path in glob.glob("{}/*.tsv".format(tsv_dir)):
        # os.path.basename is portable; the original split on "\\" and only
        # worked on Windows.
        file_name = os.path.basename(tsv_path).replace(".tsv", "")
        res_dict, q_len_df, s_len_df, outlier_df = _get_tsv_summary(tsv_path)
        if res_dict:
            summary_dict[file_name] = res_dict
        if q_len_df is not None and s_len_df is not None:
            q_len_df.to_csv("{}/{}_question_length_dist.csv".format(tsv_dir, file_name),
                            header=True, index=False)
            s_len_df.to_csv("{}/{}_sentence_length_dist.csv".format(tsv_dir, file_name),
                            header=True, index=False)
        if outlier_df is not None:
            outlier_df.to_csv(
                "{}/{}_outlier.csv".format(tsv_dir, file_name),
                header=False, encoding="utf-8", sep="\t", quoting=csv.QUOTE_NONE)

    summary_path = "{}/summary.json".format(tsv_dir)
    write_json_data(summary_path, summary_dict)
    print("Write file {}".format(summary_path))
def convert_data(dataset_type, include_txt=True):
    file_path = "qna_data/{}.json".format(dataset_type)
    data_json = read_json_data(file_path)
    converted_samples = []
    for sample_json in data_json:
        if dataset_type in ["train", "squad"]:
            converted_sample = sample_json
            converted_sample["pid"] = "p1"
            converted_samples.append(converted_sample)
        elif dataset_type in ["test", "private", "ltest"]:
            for p in sample_json["paragraphs"]:
                label = p["label"] == "1" if "label" in p else False
                converted_sample = {
                    "id": sample_json["__id__"],
                    "title": sample_json["title"],
                    "question": sample_json["question"],
                    "text": p["text"],
                    "label": label,
                    "pid": p["id"],
                }
                converted_samples.append(converted_sample)

    new_file_path = "qna_data/vi_{}.json".format(dataset_type)
    write_json_data(new_file_path, converted_samples)
    print("Length {}. Done write to file {}".format(len(converted_samples), new_file_path))

    # Write only the vi raw files for translation. Gate on include_txt, which
    # was previously accepted but never checked.
    if include_txt:
        write_txt_for_translation(converted_samples, dataset_type)
        print("Done write raw files for translation")
def build_data(self):
    corpus = (self.train_paragraph_texts + self.train_question_texts +
              self.test_question_texts + self.test_paragraph_texts)
    vocab = VocabEntry.from_corpus(corpus, freq_cutoff=1)
    vocab_file = "qna_data/{}_vocab.json".format(self.method)
    vocab.save_json(vocab_file)

    self.train_questions = vocab.padd_sents(self.train_question_texts, start_end=False)
    self.train_paragraphs = vocab.padd_sents(self.train_paragraph_texts, start_end=False)
    self.test_questions = vocab.padd_sents(self.test_question_texts, start_end=False)
    self.test_paragraphs = vocab.padd_sents(self.test_paragraph_texts, start_end=False)

    save_data = {
        "train_questions": self.train_questions,
        "train_paragraphs": self.train_paragraphs,
        "test_questions": self.test_questions,
        "test_paragraphs": self.test_paragraphs,
    }
    save_file = "qna_data/{}_dataset.json".format(self.method)
    write_json_data(save_file, save_data)

    self.vocab = vocab
    self._to_numpy()
    print("corpus len: ", len(corpus))
    print(corpus[0])
    print("max length: ", vocab.max_sent_len)
def convert_raw_en_to_json(dataset_type):
    raw_id_type_file = "qna_data/back_tran/raw_id_type_{}.txt".format(dataset_type)
    raw_en_file = "qna_data/back_tran/raw_vi_{}.txt".format(dataset_type)
    en_file = "qna_data/vi_{}.json".format(dataset_type)

    # Used only to recover the title field.
    vi_json_file = "qna_data/vi_{}.json".format(dataset_type[1:])
    vi_json_samples = read_json_data(vi_json_file)

    en_json_samples = []
    id_lines = [line.strip() for line in open(raw_id_type_file, "r", encoding="utf-8")]
    en_lines = [line.strip() for line in open(raw_en_file, "r", encoding="utf-8")]

    current_question = None
    text_idx = 0
    for i, id_line in enumerate(id_lines):
        parts = id_line.split("\t")
        if parts[1] == "question":
            current_question = {
                "id": parts[0],
                "question": en_lines[i],
            }
        elif parts[1] == "text":
            en_json_sample = copy.deepcopy(current_question)
            en_json_sample["title"] = vi_json_samples[text_idx]["title"]
            en_json_sample["text"] = en_lines[i]
            en_json_sample["label"] = parts[3] == "True"
            en_json_sample["pid"] = parts[2]
            en_json_samples.append(en_json_sample)
            text_idx += 1

    write_json_data(en_file, en_json_samples)
    print("{}. Length {}. Done write to file {}".format(
        dataset_type, len(en_json_samples), en_file))
def do_mappings(in_dir):
    """
    Update the mappings, e.g. convert BBT (bar:beat:tick) positions to seconds.

    :param in_dir: The input project dir
    :return: None
    """
    tracks_file = f"{in_dir}/tracks.json"
    tracks_json = read_json_data(tracks_file)
    tracks_data = tracks_json["tracks_data"]
    bpm = tracks_data["bpm"]
    ppq = tracks_data["ppq"]
    time_signature = tracks_data["time_signature"]
    mappings = tracks_data["mappings"]
    for mapping in mappings:
        loops_data = mapping.get("loops_data")
        if loops_data:
            between_first_s = bbt_to_second(
                bpm, ppq, loops_data.get("between_first_bbt", "1:01:00"),
                time_signature)
            between_second_s = bbt_to_second(
                bpm, ppq, loops_data.get("between_second_bbt", "1:01:00"),
                time_signature)
            loops_data["between"] = between_second_s - between_first_s
            for loop in loops_data["loops"]:
                loop["start"] = bbt_to_second(bpm, ppq, loop["start_bbt"],
                                              time_signature)

    write_json_data(tracks_file, tracks_json)
    print(f"Write file {tracks_file}")
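# bbt_to_second() is defined elsewhere in this project. A minimal sketch of
# the assumed conversion (hypothetical; the real implementation may differ):
# a "bar:beat:ticks" position is turned into quarter notes via the time
# signature, then into seconds via the tempo.
def bbt_to_second(bpm, ppq, bbt, time_signature):
    bar, beat, ticks = (int(part) for part in bbt.split(":"))
    numerator, denominator = (int(part) for part in time_signature.split("/"))
    quarters_per_beat = 4.0 / denominator  # e.g. 1.0 in x/4 time
    # Bars and beats are 1-based: "1:01:00" is the very start of the piece.
    quarters = ((bar - 1) * numerator + (beat - 1)) * quarters_per_beat
    quarters += ticks / ppq  # ppq = ticks (pulses) per quarter note
    return quarters * 60.0 / bpm  # one quarter note lasts 60/bpm seconds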
def preprocess_qna_data(self, method, cased, dataset_types):
    folder_name = "{}_{}".format(method, cased)
    folder_path = "qna_data/pre_data/vi_{}".format(folder_name)
    create_folder(folder_path)

    # Preprocess fields.
    dataset_features_columns = {}
    for dataset_type in dataset_types:
        data_file = "qna_data/vi_{}.json".format(dataset_type)

        # Init feature columns.
        if self.for_train:
            features_columns = {
                "id": [],
                "question": [],
                "text": [],
                "label": [],
                "pid": [],
            }

        json_samples = read_json_data(data_file)
        for json_sample in json_samples:
            if self.for_train:
                features_columns["id"].append(json_sample["id"])
                features_columns["label"].append(1 if json_sample["label"] else 0)
                features_columns["pid"].append(json_sample["pid"])

            for key in ["question", "text"]:
                pre_key = "{}_{}_{}".format(method, cased, key)
                pre_text, tokens = self.pre_process_text(
                    json_sample[key], method, cased, self.for_train, key)
                json_sample[pre_key] = pre_text
                if self.for_train:
                    features_columns[key].append(tokens)

        # Samples now carry the preprocessed keys.
        write_json_data(data_file, json_samples)
        print("{}. Length {}. Done write to file {}".format(
            dataset_type, len(json_samples), data_file))

        # Save for writing later, once we have the vocab.
        if self.for_train:
            dataset_features_columns[dataset_type] = features_columns

    # Build the vocab.
    vocab_file = "{}/vocab.json".format(folder_path)
    if self.build_vocab:
        self._build_vocab(vocab_file, method, cased)
    else:
        self.vocab = VocabEntry.from_json(vocab_file)

    # Write configs.
    configs = {
        "vocab_size": len(self.vocab),
        "question_size": self.question_size,
        "text_size": self.text_size,
    }
    configs_file = "{}/configs.json".format(folder_path)
    write_json_data(configs_file, configs)
    print("Done write config file {}".format(configs_file))

    # Write feature columns / generate the featured dataset.
    if self.for_train:
        for dataset_type, features_columns in dataset_features_columns.items():
            self.write_features_columns(features_columns, folder_name, dataset_type)
scratch = ScratchCode()
if mode == '1':
    filename = input('Enter the name of the file to write the scratch codes to: ')
    directory, file = os.path.split(filename)
    # The target must not itself be a directory, and its parent directory (if
    # any) must exist. The original condition mixed `and`/`or` without
    # parentheses, so it also accepted paths whose parent existed even when
    # `filename` was a directory.
    if not os.path.isdir(filename) and (not directory or os.path.exists(directory)):
        scratch_codes_count = get_value('scratch_codes_count')
        serial_number_length = len(
            str(int(first_serial_number) + scratch_codes_count)
            .zfill(first_serial_number_length))
        scratch_codes = scratch.generate(serial_number_length, hash_type,
                                         hash_length, scratch_codes_count)
        write_json_data(scratch_codes, filename)
    else:
        print('Invalid filename.')

if mode == '2':
    filename = input('Enter the name of the file to read the scratch codes to check from: ')
    if os.path.isfile(filename):
        scratch_codes = get_json_data(filename)
        serial_number_length = len(
            str(int(first_serial_number) + len(scratch_codes))
            .zfill(first_serial_number_length))
        scratch.create_activated_codes_table()
        checked_codes, right_codes = scratch.check(scratch_codes,
def create_match_details_directory(self, directory):
    # Create the output directory once, before the download loop. The original
    # called os.makedirs(os.path.dirname(directory), ...), which created only
    # the parent and left `directory` itself missing unless the path ended
    # with a trailing separator.
    os.makedirs(directory, exist_ok=True)
    for index in range(self.start, self.end):
        url = self.get_url(index)
        write_json_data(
            os.path.join(directory, f"{self.get_file_name(url)}.json"),
            get_detail_json(url))