def unambiguize(dir):
    root = "/Users/alexwarstadt/Workspace/data_generation/outputs/inductive_biases/"
    dir_path = os.path.join(root, dir)
    train_file = jsonlines.open(os.path.join(dir_path, "train.jsonl"))
    train_control_file = jsonlines.open(
        os.path.join(dir_path, "control_train.jsonl"))
    train_pairs = read_pairs(train_file)
    train_control_pairs = read_pairs(train_control_file)

    percents = [0.0026, 0.005, 0.01, 0.02, 0.04, 0.08, 0.16]
    for p in percents:
        data = random.sample(train_pairs, int(5000.0 * (1 - p)))
        data.extend(random.sample(train_control_pairs, int(5000 * p)))
        output_file = open(os.path.join(dir_path, "train_%s.jsonl" % str(p)),
                           "w")
        w = jsonlines.Writer(output_file)
        data = unzip_pairs(data)
        w.write_all(data)
        w.close()

    test_file = jsonlines.open(os.path.join(dir_path, "test.jsonl"))
    test_control_file = jsonlines.open(
        os.path.join(dir_path, "control_test.jsonl"))
    test_data = [x for x in test_file]
    test_data.extend([x for x in test_control_file])
    output_file = open(os.path.join(dir_path, "test_combined.jsonl"), "w")
    w = jsonlines.Writer(output_file)
    w.write_all(test_data)
    w.close()
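The snippet above relies on two helpers, read_pairs and unzip_pairs, that are not shown. A minimal sketch of what they might look like, assuming each minimal pair is stored as two consecutive JSONL lines (an assumption, not the original implementation):

def read_pairs(reader):
    # Group consecutive objects from a jsonlines.Reader into 2-tuples.
    lines = list(reader)
    return list(zip(lines[0::2], lines[1::2]))


def unzip_pairs(pairs):
    # Flatten the sampled 2-tuples back into a flat list of objects.
    return [obj for pair in pairs for obj in pair]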
Example #2
 def __init__(
     self,
     owner,
     filename: str,
     format: str = "jsonl",
     compression: Optional[str] = None,
     **kwargs: Any,
 ):
     super().__init__(owner)
     dirname = os.path.abspath(os.path.dirname(filename))
     basename = os.path.basename(filename)
     nodes_filename = os.path.join(dirname if dirname else "",
                                   f"{basename}_nodes.{format}")
     edges_filename = os.path.join(dirname if dirname else "",
                                   f"{basename}_edges.{format}")
     if dirname:
         os.makedirs(dirname, exist_ok=True)
     if compression == "gz":
         nodes_filename += f".{compression}"
         edges_filename += f".{compression}"
         NFH = gzip.open(nodes_filename, "wb")
         self.NFH = jsonlines.Writer(NFH)
         EFH = gzip.open(edges_filename, "wb")
         self.EFH = jsonlines.Writer(EFH)
     else:
         self.NFH = jsonlines.open(nodes_filename, "w")
         self.EFH = jsonlines.open(edges_filename, "w")
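Both branches work because jsonlines.Writer accepts either text-mode or binary file objects; with a binary handle such as the gzip stream above, each line is encoded as UTF-8. A quick standalone check (the file name is a placeholder, not taken from the original class):

import gzip

import jsonlines

with gzip.open("nodes.jsonl.gz", "wb") as fh:
    with jsonlines.Writer(fh) as writer:
        writer.write({"id": "n1", "label": "node"})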
Example #3
 def generate_paradigm(self,
                       number_to_generate=1000,
                       rel_output_path=None,
                       absolute_path=None):
     """
     Contains the main loop for generating a full dataset for a given paradigm.
     Also contains exception handling: some exceptions are tolerated, because sometimes no matching arguments can be found;
     but if more than 20% of cases raise an exception, generation should be aborted, since this probably indicates a bug in
     the code and could otherwise cause an infinite loop.
     :param number_to_generate: number of minimal pairs/sets to generate
     :param rel_output_path: relative path of output file
     :param absolute_path: absolute path of output file
     :return: None
     """
     if rel_output_path is not None:
         project_root = "/".join(
             os.path.join(os.path.dirname(
                 os.path.abspath(__file__))).split("/")[:-1])
         output = open(os.path.join(project_root, rel_output_path), "w")
     elif absolute_path is not None:
         output = open(absolute_path, "w")
     else:
         raise Exception("You need to give an output path")
     past_sentences = set()
     generated_data = []
     pairID = 0
     error_counter = 0
     constant_data = self.make_metadata_dict()
     print("Generating data for " + constant_data["UID"])
     self.make_logger(constant_data)
     output_writer = jsonlines.Writer(output, flush=True)
     while len(past_sentences) < number_to_generate:
         try:
             new_data, track_sentence = self.sample()
             if track_sentence not in past_sentences:
                 past_sentences.add(track_sentence)
                 for field in self.data_fields:
                     if field in new_data:
                         new_data[field] = string_beautify(new_data[field])
                         new_data.update(constant_data)
                 new_data["pairID"] = str(pairID)
                 pairID += 1
                 if pairID % 100 == 0:
                     print("%d sentences generated" % pairID)
                 output_writer.write(new_data)
         except Exception as e:
             self.log_exception(e)
             print(self.get_stack_trace(e))
             error_counter += 1
             if error_counter > number_to_generate // 5:
                 pass
                 # raise Exception("Over 20\% of samples result in errors. You should fix this.")
     jsonlines.Writer(output).write_all(generated_data)
Example #4
    def execute(self, context):
        """
        Executed by task_instance at runtime
        """
        mongo_conn = MongoHook(self.mongo_conn_id).get_conn()
        s3_conn = S3Hook(self.s3_conn_id)

        # Grab collection and execute query according to whether or not it is a pipeline
        collection = mongo_conn.get_database(self.mongo_db).get_collection(
            self.mongo_collection)
        results = collection.aggregate(
            self.mongo_query) if self.is_pipeline else collection.find(
                self.mongo_query)

        # Write the result documents to a temporary JSONL file
        # (the transform/stringify step below is currently disabled)
        # docs_str = self._stringify(self.transform(results))
        tmp_file = NamedTemporaryFile(delete=False)  # keep the file after close() so it can be uploaded to S3
        print("writing results to temp file")
        start = datetime.now()
        with jsonlines.Writer(tmp_file) as writer:
            writer.write_all(results)
        tmp_file.close()
        end = datetime.now()
        print("took %i seconds" % (end - start).total_seconds())

        s3_conn.load_file(tmp_file.name,
                          self.s3_key,
                          bucket_name=self.s3_bucket,
                          replace=self.replace)
        os.unlink(tmp_file.name)
Example #5
def main():
    parser = argparse.ArgumentParser(prog="examples-to-dataset.py")
    parser.add_argument("examples_directory",
                        help="Directory with WAV files and JSON intents")
    parser.add_argument("output_file", help="jsonl output file")
    parser.add_argument(
        "--dataset-directory",
        default=os.getcwd(),
        help="Base directory of dataset (default: cwd)",
    )
    args = parser.parse_args()

    examples_dir = Path(args.examples_directory)
    dataset_dir = Path(args.dataset_directory)

    with open(args.output_file, "w") as output_file:
        with jsonlines.Writer(output_file) as out:
            for wav_path in examples_dir.glob("*.wav"):
                json_path = wav_path.with_suffix(".json")
                if not json_path.is_file():
                    continue

                wav_name = str(wav_path.relative_to(dataset_dir))

                with open(json_path, "r") as json_file:
                    intent = json.load(json_file)

                intent["wav_name"] = wav_name
                out.write(intent)
Example #6
def myfilter():
    funs0 = {'Concat', 'Length', 'Substr', 'Index', 'Replace'}
    funs1 = {'ToLower', 'ToUpper', 'Trim', 'TrimEnd', 'TrimStart'}

    infile = open('./__exprs.jsonl', mode='r')
    outfile = open(input('Enter output file name: '), mode='w')

    reader = jsonlines.Reader(infile)
    writer = jsonlines.Writer(outfile)

    pred = lambda x: len(x['text']) < 300 and \
        allin(x['functions'], funs0.union(funs1)) and \
        oneof(x['functions'], funs0) and \
        oneof(x['functions'], funs1)

    filtered = dict((x['text'], x) for x in reader if pred(x)).values()

    sorted_ = sorted(filtered, key=lambda x: len(x['functions']), reverse=True)
    writer.write_all(sorted_)

    # slice = itertools.islice(filtered, 10)
    # print(list(slice))

    reader.close()
    infile.close()

    writer.close()
    outfile.close()
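The predicate uses two small helpers, allin and oneof, that are not defined in the snippet; presumably they behave roughly like this (hypothetical reconstruction):

def allin(items, allowed):
    # True if every item is contained in the allowed set.
    return all(x in allowed for x in items)


def oneof(items, candidates):
    # True if at least one item is contained in the candidate set.
    return any(x in candidates for x in items)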
Example #7
    def query_all_training_data(self):
        """
        Iterate through training data, query and write to file
        :return:
        """
        # open the input file and a single output writer for the whole run
        with open(self.data_path, 'r') as f, open(self.out_path, 'a') as out_f:
            writer = jsonlines.Writer(out_f)
            for line in tqdm.tqdm(f):
                try:
                    dataline = json.loads(line)
                    s_ent = dataline['source_ent']
                    t_ent = dataline['target_ent']
                    label = dataline['label']

                    s_ent_new = self.update_json_ent(s_ent)
                    t_ent_new = self.update_json_ent(t_ent)

                    new_line = {
                        'source_ent': s_ent_new,
                        'target_ent': t_ent_new,
                        'label': label
                    }
                    writer.write(new_line)
                except Exception:
                    continue
            writer.close()
        return
Example #8
def save_documents(jsonl_out: pathlib.Path,
                   documents: t.Iterator[dict]) -> None:
    """
    Saves the documents to a `JSONL` file

    Parameters
    ----------
    jsonl_out : pathlib.Path
        The JSONL file to contain all the documents
    documents : Iterator[dict]
        The JSON documents to save
    """
    bar_i = 0
    widgets = [
        'Saving JSONL # ',
        pb.Counter(), ' ',
        pb.Timer(), ' ',
        pb.BouncingBar(marker='.', left='[', right=']')
    ]
    with pb.ProgressBar(widgets=widgets) as bar:
        with open(jsonl_out, 'w', encoding='utf-8') as fp:
            with jl.Writer(fp, compact=True, sort_keys=True) as writer:
                for document in documents:
                    writer.write(document)
                    bar_i = bar_i + 1
                    bar.update(bar_i)
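For example, it could be called with any iterator of dicts (the path and fields below are placeholders):

import pathlib

docs = ({"id": i, "text": "document %d" % i} for i in range(3))
save_documents(pathlib.Path("documents.jsonl"), docs)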
Example #9
def write_jsonls(file_path, data, log=log):
    "Sream Write to JSON Lines. 'data' must be namedtuple. schema is a dict of field to data-type"
    import jsonlines
    s_t = now()
    l_t = now()
    msg_dlt = 10000
    counter = 0
    counter2 = 0

    with open(file_path, 'wb') as f:
        w = jsonlines.Writer(f)
        for row in data:
            w.write(row)
            counter += 1
            counter2 += 1

            # progress message
            if counter2 % msg_dlt == 0:
                secs_l = (now() - l_t).total_seconds()
                if secs_l >= 20:
                    secs = (now() - s_t).total_seconds()
                    rate = round(counter2 / secs_l, 1)
                    mins = round(secs / 60, 1)
                    log("{} min ## Writing to JSON: {} rows @ {} r/s.".format(
                        mins, counter, rate))
                    l_t = now()
                    counter2 = 0

    secs = (now() - s_t).total_seconds()
    rate = round(counter / secs, 1)
    log("Wrote {} rows to {} [{} r/s].".format(counter, file_path, rate))
Example #10
def filter_dataset_parallel(path: str, out_path: str):
    full_path = pathlib.Path(path).resolve()
    read_f = gzip.open(full_path, "rb") if path.endswith(".jsonl.gz") else full_path.open("r")
    reader = jsonlines.Reader(read_f)

    full_out_path = pathlib.Path(out_path).resolve()
    write_f = gzip.open(full_out_path, "wb") if out_path.endswith(".jsonl.gz") else full_out_path.open("w")
    writer = jsonlines.Writer(write_f)
    logger.debug(f"Writing output to {full_out_path}...")

    examples = []
    logger.debug(f"Loading {full_path}")
    pool = Pool(processes=16)
    total_lines = 0
    num_written = 0
    for has_error, json_dict in tqdm.tqdm(pool.imap(has_transform_error, reader, chunksize=4), desc=full_path.name):
        total_lines += 1
        if not has_error:
            writer.write(json_dict)
            num_written += 1

        if total_lines % 1000 == 0:
            logger.debug(f"Filtered jsonl to {num_written}/{total_lines}")
    logger.debug(f"DONE: Filtered jsonl to {num_written}/{total_lines}")
    read_f.close()
    write_f.close()
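has_transform_error is imported from elsewhere; for the pool.imap call above to work it must return (has_error, original_dict) tuples. A hypothetical stand-in showing only that required shape:

import json

def has_transform_error(json_dict):
    # Stand-in worker: the real check is project-specific; here we just
    # verify the record is serializable and return (has_error, record).
    try:
        json.dumps(json_dict)
        return False, json_dict
    except (TypeError, ValueError):
        return True, json_dict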
Example #11
def write_jsonl(entities: List[dict],
                fp: Union[BinaryIO, TextIO],
                max_size: int = None,
                default=None):
    """
    Converts the provided list of dicts to jsonl and writes to the file handle
    :param entities: The list of dicts to be converted to jsonl
    :param fp: The file handle to which the output should be written
    :param max_size: A per-line size limit - jsonl lines over this size will be dropped from the output
    :param default: A function that should return a serializable version of the dict
    """
    def bytes_serializer(obj):
        if isinstance(obj, bytes):
            return obj.decode('utf8', errors='backslashreplace')
        elif isinstance(obj, dict):
            return json.dumps(obj, separators=(',', ':'), default=default)
        raise TypeError(repr(obj) + ' is not JSONL serializable')

    # Convert the result dictionary of metadata into serialized bytes and then store them
    with jsonlines.Writer(fp, compact=True, flush=True,
                          dumps=bytes_serializer) as writer:
        for obj in entities:
            # Verify that the object isn't over a maximum size limit
            if max_size is not None:
                obj_size = get_size(obj)
                if obj_size > max_size:
                    logger.warning(
                        f'Dropping result that is over size limit ({obj_size} bytes > {max_size} byte limit)'
                    )
                    continue

            writer.write(obj)
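get_size is not shown; a hypothetical stand-in that measures the compact JSON encoding in bytes would be:

import json

def get_size(obj) -> int:
    # Hypothetical size check: length in bytes of the compact JSON encoding.
    return len(json.dumps(obj, separators=(',', ':'), default=str).encode('utf8'))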
Example #12
File: __main__.py Project: rhasspy/gruut
def do_phonemize(args):
    """
    Reads JSONL from stdin with "clean_words" property.

    Looks up or guesses phonetic pronunciation(s) for all clean words.

    Prints a line of JSON for each input line.
    """
    from .lang import get_phonemizer
    from .commands import phonemize

    word_break = IPA.BREAK_WORD if args.word_breaks else None
    phonemizer = get_phonemizer(
        args.language,
        args.lang_dir,
        use_word_indexes=args.word_indexes,
        word_break=word_break,
        no_g2p=args.no_g2p,
    )

    if os.isatty(sys.stdin.fileno()):
        print("Reading tokenize JSONL from stdin...", file=sys.stderr)

    def sentence_generator():
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue

            yield json.loads(line)

    writer = jsonlines.Writer(sys.stdout, flush=True)
    for utt_json in phonemize(phonemizer, sentence_generator()):
        writer.write(utt_json)
Example #13
def yaml2nedb(yamlfile):
    output = []
    with open(yamlfile, "r") as reader:
        for obj in yaml.safe_load_all(reader):
            output.append(obj)
    jsonwriter = jsonlines.Writer(sys.stdout, compact=True)
    jsonwriter.write_all(output)
Example #14
    def send_event(topic, payload_dict={}):
        print(topic, end=" ", file=events_out_file)

        with jsonlines.Writer(events_out_file) as out:
            out.write(payload_dict)

        events_out_file.flush()
Example #15
File: __main__.py Project: rhasspy/gruut
def do_tokenize(args):
    """
    Split lines from stdin into sentences, tokenize and clean.

    Prints a line of JSON for each sentence.
    """
    from .commands import tokenize
    from .lang import get_tokenizer

    tokenizer = get_tokenizer(args.language, lang_dir=args.lang_dir, no_pos=args.no_pos)

    if args.text:
        # Use arguments
        lines = args.text
    else:
        # Use stdin
        lines = sys.stdin

        if os.isatty(sys.stdin.fileno()):
            print("Reading text from stdin...", file=sys.stderr)

    writer = jsonlines.Writer(sys.stdout, flush=True)
    for sent_json in tokenize(
        tokenizer,
        lines,
        is_csv=args.csv,
        csv_delimiter=args.csv_delimiter,
        split_sentences=args.split_sentences,
    ):
        writer.write(sent_json)
Example #16
def main(path_corpus_root: str, path_processed_jsonl: str,
         path_morph_freq_file: str):
    __morphs = []
    fp = open(path_processed_jsonl, 'w')
    writer = jsonlines.Writer(fp)
    for root, subdirs, files in os.walk(path_corpus_root):
        logging.info('processing {}...'.format(files))
        if len(subdirs) > 0:
            continue
        for f_name in tqdm.tqdm(files):
            __root, dir_name = os.path.split(root)
            path_text_file = os.path.join(root, f_name)
            processed_obj = load_liverdoor_corpus(path_text_file, f_name,
                                                  dir_name)
            writer.write(processed_obj)
            __morphs += processed_obj['morphs']
    writer.close()
    fp.close()

    morphs_freq = [(k, v) for k, v in Counter(__morphs).items()]
    fp = open(path_morph_freq_file, 'w')
    for t in morphs_freq:
        fp.write('{} {}'.format(t[0], t[1]) + '\n')
    fp.close()
Example #17
def do_phones_to_phonemes(config, args):
    """Transform/group phones in a pronuncation into language phonemes"""
    phonemes_path = Path(pydash.get(config, "language.phonemes"))

    with open(phonemes_path, "r") as phonemes_file:
        phonemes = gruut_ipa.Phonemes.from_text(phonemes_file)

    keep_stress = pydash.get(config, "language.keep_stress", False)

    if args.phones:
        phones = args.phones
    else:
        # Read from stdin
        phones = sys.stdin
        if os.isatty(sys.stdin.fileno()):
            print("Reading pronunciations from stdin...", file=sys.stderr)

    writer = jsonlines.Writer(sys.stdout, flush=True)
    for line in phones:
        line = line.strip()
        if line:
            line_phonemes = phonemes.split(line, keep_stress=keep_stress)
            phonemes_list = [p.text for p in line_phonemes]

            writer.write({
                "language": args.language,
                "raw_text": line,
                "phonemes_text": " ".join(phonemes_list),
                "phonemes_list": phonemes_list,
                "phonemes": [p.to_dict() for p in line_phonemes],
            })
Example #18
def do_print_phoneme_counts(config, args):
    """
    Print counts of all phonemes from the lexicon.
    """
    gruut_lang = try_load_language(args)
    writer = jsonlines.Writer(sys.stdout, flush=True)
    phoneme_counts = Counter()

    for phoneme in gruut_lang.phonemes:
        phoneme_counts[phoneme.text] = 0

    for word_prons in gruut_lang.phonemizer.lexicon.values():
        for word_pron in word_prons:
            for phoneme in word_pron:
                if not phoneme:
                    continue

                while phoneme and (gruut_ipa.IPA.is_stress(phoneme[0])
                                   or gruut_ipa.IPA.is_accent(phoneme[0])):
                    phoneme = phoneme[1:]

                if phoneme:
                    phoneme_counts[phoneme] += 1

    writer.write(phoneme_counts.most_common())
Example #19
def main():
    parser = argparse.ArgumentParser("vocab_g2p")
    parser.add_argument("--output",
                        default=None,
                        help="Path to write guesses as JSON (default: stdout)")
    parser.add_argument("--debug",
                        action="store_true",
                        help="Print DEBUG messages to console")
    args, _ = parser.parse_known_args()

    # -------------------------------------------------------------------------

    if args.debug:
        logging.basicConfig(level=logging.DEBUG)

    logger.debug(args)

    output_file = sys.stdout
    if args.output:
        output_file = open(args.output, "w")

    pronunciations = defaultdict(list)
    for line in sys.stdin:
        word, phonemes = re.split(r"\s+", line.strip(), maxsplit=1)
        pronunciations[word].append(phonemes)

    with jsonlines.Writer(output_file) as out:
        out.write(pronunciations)
Example #20
def generate(gpt2, split, out_dir, start=None, end=None):
    print('Getting prefixes')
    id_prefixs = get_prefixs(split)
    if (start is not None) and (end is not None):
        id_prefixs = id_prefixs[start:end]
        print(len(id_prefixs))

    # get out path
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    out_fn = '{}'.format(split)
    if (start is not None) and (end is not None):
        out_fn += '_{}-{}'.format(start, end)
    out_fn += '.jsonl'
    out_fp = os.path.join(out_dir,  out_fn)
    print(out_fp)

    print('Generating text')
    output = []
    step = 10
    writer = jsonlines.Writer(open(out_fp, 'w'), flush=True)
    for i in range(0, len(id_prefixs), step):
        print('GENERATED: ', i)
        cur_ids, cur_prefixs = zip(*id_prefixs[i:i+step])
        cur_gentexts = gpt2.generate_conditional(prompts=cur_prefixs)

        # add to batch
        cur_batch = [{'id': cur_ids[j], 'text': cur_gentexts[j]} for j in range(len(cur_ids))]
        for item in cur_batch:
            writer.write(item)
Example #21
 def write(self, tweets: Union[Any, List[Any]]) -> None:
     """Write Tweet or list of Tweets to file"""
     with jl.Writer(self.f) as writer:
         if not isinstance(tweets, list):
             writer.write(tweets)
         else:
             writer.write_all(tweets)
     self.f.close()
Example #22
def test_writer_flags():
    fp = io.BytesIO()
    with jsonlines.Writer(fp, compact=True, sort_keys=True) as writer:
        writer.write(collections.OrderedDict([
            ('b', 2),
            ('a', 1),
        ]))
    assert fp.getvalue() == b'{"a":1,"b":2}\n'
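For comparison, the reading side round-trips the same compact bytes (not part of the original test, and assuming the same module-level io/jsonlines imports):

def test_reader_roundtrip():
    fp = io.BytesIO(b'{"a":1,"b":2}\n')
    with jsonlines.Reader(fp) as reader:
        assert reader.read() == {"a": 1, "b": 2}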
Example #23
    def send_event(topic, payload_dict={}, show_event=True):
        if show_event:
            print(topic, end=" ")

        with jsonlines.Writer(events_out_file) as out:
            out.write(payload_dict)

        events_out_file.flush()
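The flush() after the with-block relies on the Writer not closing a caller-supplied file object when the context exits (only handles opened via jsonlines.open are closed). A minimal check of that assumption:

import io

import jsonlines

buf = io.StringIO()
with jsonlines.Writer(buf) as out:
    out.write({"event": "demo"})
assert not buf.closed  # the underlying file object is still usable
print(buf.getvalue(), end="")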
Example #24
def _save_articles_to_jsonl(results: t.Iterator[dict],
                            jsonl_out: pathlib.Path) -> None:
    """
    Writes the relevant data to disk
    """
    with open(jsonl_out, 'w', encoding='utf-8') as fp:
        with jl.Writer(fp, compact=True, sort_keys=True) as writer:
            for item in results:
                writer.write(item)
Example #25
def print_json(value: typing.Any, out_file=sys.stdout) -> None:
    """Print a single line of JSON to stdout."""
    import jsonlines

    with jsonlines.Writer(out_file) as out:
        # pylint: disable=E1101
        out.write(value)

    out_file.flush()
Example #26
def merge_sorted_jsonl_files(input_file_path_1: str, input_file_path_2: str,
                             result_file_path: str) -> None:
    with open(input_file_path_1, "r") as data_read_1, open(
            input_file_path_2,
            "r") as data_read_2, open(result_file_path, "w") as data_write:
        pairs = list(create_pairs(data_read_1, data_read_2))
        writer = jsonlines.Writer(data_write)
        while pairs:
            write_next_min_item(pairs, writer)
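create_pairs and write_next_min_item are not shown here. For reference, the same two-way merge of pre-sorted JSONL files can be sketched with heapq.merge (the "key" field name is an assumption):

import heapq

import jsonlines

def merge_sorted_jsonl(path_1: str, path_2: str, out_path: str, key: str = "key") -> None:
    with jsonlines.open(path_1) as r1, jsonlines.open(path_2) as r2, \
            jsonlines.open(out_path, "w") as writer:
        # heapq.merge lazily interleaves the two already-sorted streams.
        for obj in heapq.merge(r1, r2, key=lambda o: o[key]):
            writer.write(obj)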
Example #27
def fix_ngram():
    # open the reader and a single output writer, then copy every object across
    out_path = os.path.join(results_dir, "blimp_ngram_simplelm_peephole.jsonl")
    with jsonlines.open(file) as reader, \
            jsonlines.Writer(open(out_path, "w")) as writer:
        writer.write_all(reader)
Example #28
def main():
    reader = csv.DictReader(sys.stdin)
    writer = jsonlines.Writer(sys.stdout.buffer)
    try:
        writer.write_all(reader)
    except BrokenPipeError:
        pass

    return 0
Example #29
 def __init__(self, file_name: str):
     """Create a file object"""
     self.file_name = file_name
     self._clear()
     self.writer = jsonlines.Writer(fp=open(file_name,
                                            mode="a",
                                            encoding="utf-8"),
                                    dumps=NumpyEncoder().encode,
                                    flush=False)