def unambiguize(dir):
    root = "/Users/alexwarstadt/Workspace/data_generation/outputs/inductive_biases/"
    dir_path = os.path.join(root, dir)
    train_file = jsonlines.open(os.path.join(dir_path, "train.jsonl"))
    train_control_file = jsonlines.open(
        os.path.join(dir_path, "control_train.jsonl"))
    train_pairs = read_pairs(train_file)
    train_control_pairs = read_pairs(train_control_file)
    percents = [0.0026, 0.005, 0.01, 0.02, 0.04, 0.08, 0.16]
    for p in percents:
        data = random.sample(train_pairs, int(5000.0 * (1 - p)))
        data.extend(random.sample(train_control_pairs, int(5000 * p)))
        output_file = open(os.path.join(dir_path, "train_%s.jsonl" % str(p)), "w")
        w = jsonlines.Writer(output_file)
        data = unzip_pairs(data)
        w.write_all(data)
        w.close()
    test_file = jsonlines.open(os.path.join(dir_path, "test.jsonl"))
    test_control_file = jsonlines.open(
        os.path.join(dir_path, "control_test.jsonl"))
    test_data = [x for x in test_file]
    test_data.extend([x for x in test_control_file])
    output_file = open(os.path.join(dir_path, "test_combined.jsonl"), "w")
    w = jsonlines.Writer(output_file)
    w.write_all(test_data)
    w.close()
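# Illustrative sketch (not part of the original code) of the sampling arithmetic above:
# for each mixing proportion p, the 5,000-pair training set is split into
# int(5000 * (1 - p)) ambiguous pairs and int(5000 * p) control pairs.
def _mix_counts(total=5000, percents=(0.0026, 0.005, 0.01, 0.02, 0.04, 0.08, 0.16)):
    # Returns (p, n_ambiguous, n_control) per proportion; the name is hypothetical.
    return [(p, int(total * (1 - p)), int(total * p)) for p in percents]

# e.g. p = 0.01 yields roughly 4950 ambiguous and 50 control pairs
# (exact counts depend on int() truncation of the float products).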
def __init__(
    self,
    owner,
    filename: str,
    format: str = "jsonl",
    compression: Optional[str] = None,
    **kwargs: Any,
):
    super().__init__(owner)
    dirname = os.path.abspath(os.path.dirname(filename))
    basename = os.path.basename(filename)
    nodes_filename = os.path.join(dirname if dirname else "", f"{basename}_nodes.{format}")
    edges_filename = os.path.join(dirname if dirname else "", f"{basename}_edges.{format}")
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    if compression == "gz":
        nodes_filename += f".{compression}"
        edges_filename += f".{compression}"
        NFH = gzip.open(nodes_filename, "wb")
        self.NFH = jsonlines.Writer(NFH)
        EFH = gzip.open(edges_filename, "wb")
        self.EFH = jsonlines.Writer(EFH)
    else:
        self.NFH = jsonlines.open(nodes_filename, "w")
        self.EFH = jsonlines.open(edges_filename, "w")
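# Standalone sketch of the gzip branch above: jsonlines.Writer accepts a binary
# file object, so wrapping gzip.open(..., "wb") yields a compressed .jsonl.gz.
# The path and record below are placeholders, not part of the original class.
import gzip
import jsonlines

def _write_gzipped_jsonl(path="example_nodes.jsonl.gz"):
    with gzip.open(path, "wb") as fh:
        with jsonlines.Writer(fh) as writer:
            writer.write({"id": 1, "label": "node"})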
def generate_paradigm(self, number_to_generate=1000, rel_output_path=None, absolute_path=None):
    """
    Contains the main loop for generating a full dataset for a given paradigm.
    Also contains exception handling: some exceptions are tolerated because
    sometimes no matching arguments can be found, but if more than 20% of
    attempts raise an exception, this probably indicates a bug in the code and
    could otherwise cause an infinite loop, so generation should be aborted.
    :param number_to_generate: number of minimal pairs/sets to generate
    :param rel_output_path: relative path of output file
    :param absolute_path: absolute path of output file
    :return: None
    """
    if rel_output_path is not None:
        project_root = "/".join(
            os.path.join(os.path.dirname(
                os.path.abspath(__file__))).split("/")[:-1])
        output = open(os.path.join(project_root, rel_output_path), "w")
    elif absolute_path is not None:
        output = open(absolute_path, "w")
    else:
        raise Exception("You need to give an output path")
    past_sentences = set()
    generated_data = []
    pairID = 0
    error_counter = 0
    constant_data = self.make_metadata_dict()
    print("Generating data for " + constant_data["UID"])
    self.make_logger(constant_data)
    output_writer = jsonlines.Writer(output, flush=True)
    while len(past_sentences) < number_to_generate:
        try:
            new_data, track_sentence = self.sample()
            if track_sentence not in past_sentences:
                past_sentences.add(track_sentence)
                for field in self.data_fields:
                    if field in new_data:
                        new_data[field] = string_beautify(new_data[field])
                new_data.update(constant_data)
                new_data["pairID"] = str(pairID)
                pairID += 1
                if pairID % 100 == 0:
                    print("%d sentences generated" % pairID)
                output_writer.write(new_data)
        except Exception as e:
            self.log_exception(e)
            print(self.get_stack_trace(e))
            error_counter += 1
            if error_counter > number_to_generate // 5:
                pass
                # raise Exception("Over 20% of samples result in errors. You should fix this.")
    jsonlines.Writer(output).write_all(generated_data)
def execute(self, context):
    """
    Executed by task_instance at runtime
    """
    mongo_conn = MongoHook(self.mongo_conn_id).get_conn()
    s3_conn = S3Hook(self.s3_conn_id)

    # Grab collection and execute query according to whether or not it is a pipeline
    collection = mongo_conn.get_database(self.mongo_db).get_collection(
        self.mongo_collection)
    results = collection.aggregate(
        self.mongo_query) if self.is_pipeline else collection.find(
            self.mongo_query)

    # Performs transform then stringifies the docs results into json format
    # docs_str = self._stringify(self.transform(results))

    # delete=False so the file still exists after close() for the S3 upload below;
    # it is removed explicitly with os.unlink() once the upload finishes.
    tmp_file = NamedTemporaryFile(delete=False)
    print("writing results to temp file")
    start = datetime.now()
    with jsonlines.Writer(tmp_file) as writer:
        writer.write_all(results)
    tmp_file.close()
    end = datetime.now()
    print("took %i seconds" % (end - start).total_seconds())

    s3_conn.load_file(tmp_file.name,
                      self.s3_key,
                      bucket_name=self.s3_bucket,
                      replace=self.replace)
    os.unlink(tmp_file.name)
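# Reduced sketch (illustrative only) of the temp-file pattern used in execute():
# write records with jsonlines, keep the file after closing so a later step can
# pick it up by name, then remove it explicitly.
import os
import jsonlines
from tempfile import NamedTemporaryFile

def _dump_to_named_tempfile(records):
    tmp = NamedTemporaryFile(delete=False)
    try:
        with jsonlines.Writer(tmp) as writer:
            writer.write_all(records)
        tmp.close()
        return tmp.name  # caller uploads/reads the file by name, then deletes it
    except Exception:
        tmp.close()
        os.unlink(tmp.name)
        raise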
def main():
    parser = argparse.ArgumentParser(prog="examples-to-dataset.py")
    parser.add_argument("examples_directory", help="Directory with WAV files and JSON intents")
    parser.add_argument("output_file", help="jsonl output file")
    parser.add_argument(
        "--dataset-directory",
        default=os.getcwd(),
        help="Base directory of dataset (default: cwd)",
    )
    args = parser.parse_args()

    examples_dir = Path(args.examples_directory)
    dataset_dir = Path(args.dataset_directory)

    with open(args.output_file, "w") as output_file:
        with jsonlines.Writer(output_file) as out:
            for wav_path in examples_dir.glob("*.wav"):
                json_path = wav_path.with_suffix(".json")
                if not json_path.is_file():
                    continue

                wav_name = str(wav_path.relative_to(dataset_dir))
                with open(json_path, "r") as json_file:
                    intent = json.load(json_file)

                intent["wav_name"] = wav_name
                out.write(intent)
def myfilter():
    funs0 = {'Concat', 'Length', 'Substr', 'Index', 'Replace'}
    funs1 = {'ToLower', 'ToUpper', 'Trim', 'TrimEnd', 'TrimStart'}
    infile = open('./__exprs.jsonl', mode='r')
    outfile = open(input('Enter output file name: '), mode='w')
    reader = jsonlines.Reader(infile)
    writer = jsonlines.Writer(outfile)
    pred = lambda x: len(x['text']) < 300 and \
        allin(x['functions'], funs0.union(funs1)) and \
        oneof(x['functions'], funs0) and \
        oneof(x['functions'], funs1)
    filtered = dict((x['text'], x) for x in reader if pred(x)).values()
    sorted_ = sorted(filtered, key=lambda x: len(x['functions']), reverse=True)
    writer.write_all(sorted_)
    # slice = itertools.islice(filtered, 10)
    # print(list(slice))
    reader.close()
    infile.close()
    writer.close()
    outfile.close()
def query_all_training_data(self):
    """
    Iterate through training data, query and write to file
    :return:
    """
    # open the output writer once, then read and transform the data file line by line
    with jsonlines.Writer(open(self.out_path, 'a')) as writer:
        with open(self.data_path, 'r') as f:
            for line in tqdm.tqdm(f):
                try:
                    dataline = json.loads(line)
                    s_ent = dataline['source_ent']
                    t_ent = dataline['target_ent']
                    label = dataline['label']
                    s_ent_new = self.update_json_ent(s_ent)
                    t_ent_new = self.update_json_ent(t_ent)
                    line = {
                        'source_ent': s_ent_new,
                        'target_ent': t_ent_new,
                        'label': label
                    }
                    writer.write(line)
                except Exception:
                    continue
    return
def save_documents(jsonl_out: pathlib.Path, documents: t.Iterator[dict]) -> None:
    """
    Saves the documents to a `JSONL` file

    Parameters
    ----------
    jsonl_out : pathlib.Path
        The JSONL file to contain all the documents
    documents : Iterator[dict]
        The JSON documents to save
    """
    bar_i = 0
    widgets = [
        'Saving JSONL # ', pb.Counter(), ' ',
        pb.Timer(), ' ',
        pb.BouncingBar(marker='.', left='[', right=']')
    ]
    with pb.ProgressBar(widgets=widgets) as bar:
        with open(jsonl_out, 'w', encoding='utf-8') as fp:
            with jl.Writer(fp, compact=True, sort_keys=True) as writer:
                for document in documents:
                    writer.write(document)
                    bar_i = bar_i + 1
                    bar.update(bar_i)
def write_jsonls(file_path, data, log=log):
    "Stream-write to JSON Lines. 'data' must be an iterable of namedtuples."
    import jsonlines

    s_t = now()
    l_t = now()
    msg_dlt = 10000
    counter = 0
    counter2 = 0

    with open(file_path, 'wb') as f:
        w = jsonlines.Writer(f)
        for row in data:
            w.write(row)
            counter += 1
            counter2 += 1

            # progress message
            if counter2 % msg_dlt == 0:
                secs_l = (now() - l_t).total_seconds()
                if secs_l >= 20:
                    secs = (now() - s_t).total_seconds()
                    rate = round(counter2 / secs_l, 1)
                    mins = round(secs / 60, 1)
                    log("{} min ## Writing to JSON: {} rows @ {} r/s.".format(
                        mins, counter, rate))
                    l_t = now()
                    counter2 = 0

    secs = (now() - s_t).total_seconds()
    rate = round(counter / secs, 1)
    log("Wrote {} rows to {} [{} r/s].".format(counter, file_path, rate))
def filter_dataset_parallel(path: str, out_path: str):
    full_path = pathlib.Path(path).resolve()
    read_f = gzip.open(full_path, "rb") if path.endswith(".jsonl.gz") else full_path.open("r")
    reader = jsonlines.Reader(read_f)

    full_out_path = pathlib.Path(out_path).resolve()
    write_f = gzip.open(full_out_path, "wb") if out_path.endswith(".jsonl.gz") else full_out_path.open("w")
    writer = jsonlines.Writer(write_f)
    logger.debug(f"Writing output to {full_out_path}...")

    examples = []
    logger.debug(f"Loading {full_path}")
    pool = Pool(processes=16)
    total_lines = 0
    num_written = 0
    for has_error, json_dict in tqdm.tqdm(pool.imap(has_transform_error, reader, chunksize=4),
                                          desc=full_path.name):
        total_lines += 1
        if not has_error:
            writer.write(json_dict)
            num_written += 1
        if total_lines % 1000 == 0:
            logger.debug(f"Filtered jsonl to {num_written}/{total_lines}")
    logger.debug(f"DONE: Filtered jsonl to {num_written}/{total_lines}")
    read_f.close()
    write_f.close()
def write_jsonl(entities: List[dict], fp: Union[BinaryIO, TextIO], max_size: int = None, default=None):
    """
    Converts the provided list of dicts to jsonl and writes to the file handle

    :param entities: The list of dicts to be converted to jsonl
    :param fp: The file handle to which the output should be written
    :param max_size: A per-line size limit - jsonl lines over this size will be dropped from the output
    :param default: A function that should return a serializable version of the dict
    """
    def bytes_serializer(obj):
        if isinstance(obj, bytes):
            return obj.decode('utf8', errors='backslashreplace')
        elif isinstance(obj, dict):
            return json.dumps(obj, separators=(',', ':'), default=default)
        raise TypeError(repr(obj) + ' is not JSONL serializable')

    # Convert the result dictionary of metadata into serialized bytes and then store them
    with jsonlines.Writer(fp, compact=True, flush=True, dumps=bytes_serializer) as writer:
        for obj in entities:
            # Verify that the object isn't over a maximum size limit
            if max_size is not None:
                obj_size = get_size(obj)
                if obj_size > max_size:
                    logger.warning(
                        f'Dropping result that is over size limit ({obj_size} bytes > {max_size} byte limit)'
                    )
                    continue
            writer.write(obj)
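# Hypothetical usage of write_jsonl above (the names 'set_default' and 'payload'
# are illustrative): a `default` hook converts otherwise unserializable values,
# such as sets, before the compact writer emits each line.
import io

def _write_jsonl_example():
    def set_default(value):
        if isinstance(value, set):
            return sorted(value)
        raise TypeError(repr(value) + ' is not JSON serializable')

    payload = [{"id": 1, "tags": {"b", "a"}}, {"id": 2, "tags": set()}]
    buf = io.StringIO()
    write_jsonl(payload, buf, default=set_default)
    return buf.getvalue()  # two compact JSON lines with the tag sets rendered as sorted lists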
def do_phonemize(args):
    """
    Reads JSONL from stdin with "clean_words" property.

    Looks up or guesses phonetic pronunciation(s) for all clean words.

    Prints a line of JSON for each input line.
    """
    from .lang import get_phonemizer
    from .commands import phonemize

    word_break = IPA.BREAK_WORD if args.word_breaks else None
    phonemizer = get_phonemizer(
        args.language,
        args.lang_dir,
        use_word_indexes=args.word_indexes,
        word_break=word_break,
        no_g2p=args.no_g2p,
    )

    if os.isatty(sys.stdin.fileno()):
        print("Reading tokenize JSONL from stdin...", file=sys.stderr)

    def sentence_generator():
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue
            yield json.loads(line)

    writer = jsonlines.Writer(sys.stdout, flush=True)
    for utt_json in phonemize(phonemizer, sentence_generator()):
        writer.write(utt_json)
def yaml2nedb(yamlfile):
    output = []
    with open(yamlfile, "r") as reader:
        for obj in yaml.safe_load_all(reader):
            output.append(obj)
    jsonwriter = jsonlines.Writer(sys.stdout, compact=True)
    jsonwriter.write_all(output)
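# Small self-contained check of the conversion above (the sample documents are
# made up): two YAML documents become two compact JSON lines, the line-delimited
# format NeDB-style datastores expect.
import io
import sys
import yaml
import jsonlines

def _yaml2nedb_demo():
    docs = yaml.safe_load_all(io.StringIO("a: 1\n---\nb: 2\n"))
    writer = jsonlines.Writer(sys.stdout, compact=True)
    writer.write_all(docs)  # prints {"a":1} and {"b":2} on separate lines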
def send_event(topic, payload_dict={}):
    print(topic, end=" ", file=events_out_file)
    with jsonlines.Writer(events_out_file) as out:
        out.write(payload_dict)
    events_out_file.flush()
def do_tokenize(args):
    """
    Split lines from stdin into sentences, tokenize and clean.

    Prints a line of JSON for each sentence.
    """
    from .commands import tokenize
    from .lang import get_tokenizer

    tokenizer = get_tokenizer(args.language, lang_dir=args.lang_dir, no_pos=args.no_pos)

    if args.text:
        # Use arguments
        lines = args.text
    else:
        # Use stdin
        lines = sys.stdin
        if os.isatty(sys.stdin.fileno()):
            print("Reading text from stdin...", file=sys.stderr)

    writer = jsonlines.Writer(sys.stdout, flush=True)
    for sent_json in tokenize(
        tokenizer,
        lines,
        is_csv=args.csv,
        csv_delimiter=args.csv_delimiter,
        split_sentences=args.split_sentences,
    ):
        writer.write(sent_json)
def main(path_corpus_root: str, path_processed_jsonl: str, path_morph_freq_file: str):
    __morphs = []
    fp = open(path_processed_jsonl, 'w')
    writer = jsonlines.Writer(fp)
    for root, subdirs, files in os.walk(path_corpus_root):
        logging.info('processing {}...'.format(files))
        if len(subdirs) > 0:
            continue
        for f_name in tqdm.tqdm(files):
            __root, dir_name = os.path.split(root)
            path_text_file = os.path.join(root, f_name)
            processed_obj = load_liverdoor_corpus(path_text_file, f_name, dir_name)
            writer.write(processed_obj)
            __morphs += processed_obj['morphs']
    writer.close()
    fp.close()

    morphs_freq = [(k, v) for k, v in Counter(__morphs).items()]
    fp = open(path_morph_freq_file, 'w')
    for t in morphs_freq:
        fp.write('{} {}'.format(t[0], t[1]) + '\n')
    fp.close()
def do_phones_to_phonemes(config, args):
    """Transform/group phones in a pronunciation into language phonemes"""
    phonemes_path = Path(pydash.get(config, "language.phonemes"))
    with open(phonemes_path, "r") as phonemes_file:
        phonemes = gruut_ipa.Phonemes.from_text(phonemes_file)

    keep_stress = pydash.get(config, "language.keep_stress", False)

    if args.phones:
        phones = args.phones
    else:
        # Read from stdin
        phones = sys.stdin
        if os.isatty(sys.stdin.fileno()):
            print("Reading pronunciations from stdin...", file=sys.stderr)

    writer = jsonlines.Writer(sys.stdout, flush=True)
    for line in phones:
        line = line.strip()
        if line:
            line_phonemes = phonemes.split(line, keep_stress=keep_stress)
            phonemes_list = [p.text for p in line_phonemes]

            writer.write({
                "language": args.language,
                "raw_text": line,
                "phonemes_text": " ".join(phonemes_list),
                "phonemes_list": phonemes_list,
                "phonemes": [p.to_dict() for p in line_phonemes],
            })
def do_print_phoneme_counts(config, args):
    """
    Print counts of all phonemes from the lexicon.
    """
    gruut_lang = try_load_language(args)
    writer = jsonlines.Writer(sys.stdout, flush=True)
    phoneme_counts = Counter()

    for phoneme in gruut_lang.phonemes:
        phoneme_counts[phoneme.text] = 0

    for word_prons in gruut_lang.phonemizer.lexicon.values():
        for word_pron in word_prons:
            for phoneme in word_pron:
                if not phoneme:
                    continue

                while phoneme and (gruut_ipa.IPA.is_stress(phoneme[0])
                                   or gruut_ipa.IPA.is_accent(phoneme[0])):
                    phoneme = phoneme[1:]

                if phoneme:
                    phoneme_counts[phoneme] += 1

    writer.write(phoneme_counts.most_common())
def main():
    parser = argparse.ArgumentParser("vocab_g2p")
    parser.add_argument("--output", default=None, help="Path to write guesses as JSON (default: stdout)")
    parser.add_argument("--debug", action="store_true", help="Print DEBUG messages to console")
    args, _ = parser.parse_known_args()

    # -------------------------------------------------------------------------

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)

    logger.debug(args)

    output_file = sys.stdout
    if args.output:
        output_file = open(args.output, "w")

    pronunciations = defaultdict(list)
    for line in sys.stdin:
        word, phonemes = re.split(r"\s+", line.strip(), maxsplit=1)
        pronunciations[word].append(phonemes)

    with jsonlines.Writer(output_file) as out:
        out.write(pronunciations)
def generate(gpt2, split, out_dir, start=None, end=None):
    print('Getting prefixes')
    id_prefixs = get_prefixs(split)
    if (start is not None) and (end is not None):
        id_prefixs = id_prefixs[start:end]
    print(len(id_prefixs))

    # get out path
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    out_fn = '{}'.format(split)
    if (start is not None) and (end is not None):
        out_fn += '_{}-{}'.format(start, end)
    out_fn += '.jsonl'
    out_fp = os.path.join(out_dir, out_fn)
    print(out_fp)

    print('Generating text')
    output = []
    step = 10
    writer = jsonlines.Writer(open(out_fp, 'w'), flush=True)
    for i in range(0, len(id_prefixs), step):
        print('GENERATED: ', i)
        cur_ids, cur_prefixs = zip(*id_prefixs[i:i+step])
        cur_gentexts = gpt2.generate_conditional(prompts=cur_prefixs)

        # add to batch
        cur_batch = [{'id': cur_ids[j], 'text': cur_gentexts[j]} for j in range(len(cur_ids))]
        for item in cur_batch:
            writer.write(item)
def write(self, tweets: Union[Any, List[Any]]) -> None:
    """Write Tweet or list of Tweets to file"""
    with jl.Writer(self.f) as writer:
        if not isinstance(tweets, list):
            writer.write(tweets)
        else:
            writer.write_all(tweets)
    self.f.close()
def test_writer_flags():
    fp = io.BytesIO()
    with jsonlines.Writer(fp, compact=True, sort_keys=True) as writer:
        writer.write(collections.OrderedDict([
            ('b', 2),
            ('a', 1),
        ]))
    assert fp.getvalue() == b'{"a":1,"b":2}\n'
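# Companion sketch to the test above: the compact, key-sorted line written by
# jsonlines.Writer can be read back with jsonlines.Reader from the same buffer.
import io
import jsonlines

def _roundtrip_compact():
    fp = io.BytesIO()
    with jsonlines.Writer(fp, compact=True, sort_keys=True) as writer:
        writer.write({"b": 2, "a": 1})
    fp.seek(0)
    with jsonlines.Reader(fp) as reader:
        assert reader.read() == {"a": 1, "b": 2}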
def send_event(topic, payload_dict={}, show_event=True):
    if show_event:
        print(topic, end=" ")
    with jsonlines.Writer(events_out_file) as out:
        out.write(payload_dict)
    events_out_file.flush()
def _save_articles_to_jsonl(results: t.Iterator[dict], jsonl_out: pathlib.Path) -> None:
    """
    Writes the relevant data to disk
    """
    with open(jsonl_out, 'w', encoding='utf-8') as fp:
        with jl.Writer(fp, compact=True, sort_keys=True) as writer:
            for item in results:
                writer.write(item)
def print_json(value: typing.Any, out_file=sys.stdout) -> None:
    """Print a single line of JSON to stdout."""
    import jsonlines

    with jsonlines.Writer(out_file) as out:
        # pylint: disable=E1101
        out.write(value)

    out_file.flush()
def merge_sorted_jsonl_files(input_file_path_1: str, input_file_path_2: str,
                             result_file_path: str) -> None:
    with open(input_file_path_1, "r") as data_read_1, open(
            input_file_path_2, "r") as data_read_2, open(result_file_path, "w") as data_write:
        pairs = list(create_pairs(data_read_1, data_read_2))
        writer = jsonlines.Writer(data_write)
        while pairs:
            write_next_min_item(pairs, writer)
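# The helpers create_pairs / write_next_min_item are not shown here. As an
# alternative sketch only, heapq.merge performs the same two-way merge when each
# record carries a known sort key (the "id" field below is an assumption):
import heapq
import json
import jsonlines

def _merge_sorted_jsonl(path_1, path_2, out_path, key="id"):
    with open(path_1) as f1, open(path_2) as f2, open(out_path, "w") as out:
        records_1 = (json.loads(line) for line in f1 if line.strip())
        records_2 = (json.loads(line) for line in f2 if line.strip())
        writer = jsonlines.Writer(out)
        writer.write_all(heapq.merge(records_1, records_2, key=lambda r: r[key]))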
def fix_ngram():
    output_path = os.path.join(results_dir, "blimp_ngram_simplelm_peephole.jsonl")
    # Open the output file once; reopening it in "w" mode for every object would
    # truncate it and keep only the last record.
    with jsonlines.open(file) as reader, \
            jsonlines.Writer(open(output_path, "w")) as writer:
        for obj in reader:
            writer.write_all(obj)
def main():
    reader = csv.DictReader(sys.stdin)
    writer = jsonlines.Writer(sys.stdout.buffer)
    try:
        writer.write_all(reader)
    except BrokenPipeError:
        pass
    return 0
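# In-memory illustration of the CSV-to-JSONL conversion above (the sample rows
# are made up): csv.DictReader yields plain dicts, which jsonlines writes as-is.
import csv
import io
import jsonlines

def _csv_to_jsonl_demo():
    csv_text = "name,age\nada,36\nalan,41\n"
    reader = csv.DictReader(io.StringIO(csv_text))
    buf = io.BytesIO()
    jsonlines.Writer(buf).write_all(reader)
    return buf.getvalue()  # two JSON lines; note DictReader leaves all values as strings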
def __init__(self, file_name: str):
    """Create a file object"""
    self.file_name = file_name
    self._clear()
    self.writer = jsonlines.Writer(fp=open(file_name, mode="a", encoding="utf-8"),
                                   dumps=NumpyEncoder().encode,
                                   flush=False)
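# Why a custom dumps is passed above: the standard json encoder cannot serialize
# numpy arrays or scalars. A minimal stand-in for NumpyEncoder (assumption: the
# real class behaves similarly) converts them to plain Python types first.
import json
import numpy as np

class _NumpyEncoderSketch(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, np.ndarray):
            return o.tolist()
        if isinstance(o, np.generic):
            return o.item()
        return super().default(o)

# jsonlines.Writer(fp, dumps=_NumpyEncoderSketch().encode) would then emit
# {"x": [1, 2, 3]} for a record like {"x": np.array([1, 2, 3])}.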