def test_smart_open_without_suffix():
    """
    Check that utils.smart_open reads a file whose name has no extension, both when
    the file was created compressed and when it was created uncompressed.
    """
    with TemporaryDirectory() as tmp_dir:
        path = os.path.join(tmp_dir, 'test')
        # Compressed first, then plain: the second pass overwrites the same path.
        for is_compressed in (True, False):
            _touch_file(path, compressed=is_compressed, empty=False)
            with utils.smart_open(path) as handle:
                lines = handle.readlines()
            # The non-empty fixture is expected to yield 10 lines.
            assert len(lines) == 10
def get_output_handler(output_type: str,
                       output_fname: Optional[str] = None) -> 'OutputHandler':
    """
    Create the output handler that matches the requested output type.

    The handler class is resolved before the output stream is opened, so an unknown
    output_type raises without opening output_fname for writing and without leaving
    an unclosed file handle behind.

    :param output_type: Type of output handler.
    :param output_fname: Output filename. If None, sys.stdout is used.
    :raises: ValueError for unknown output_type.
    :return: Output handler.
    """
    if output_type == C.OUTPUT_HANDLER_TRANSLATION:
        handler_class = StringOutputHandler
    elif output_type == C.OUTPUT_HANDLER_SCORE:
        handler_class = ScoreOutputHandler
    elif output_type == C.OUTPUT_HANDLER_PAIR_WITH_SCORE:
        handler_class = PairWithScoreOutputHandler
    elif output_type == C.OUTPUT_HANDLER_TRANSLATION_WITH_SCORE:
        handler_class = StringWithScoreOutputHandler
    elif output_type == C.OUTPUT_HANDLER_BENCHMARK:
        handler_class = BenchmarkOutputHandler
    elif output_type == C.OUTPUT_HANDLER_JSON:
        handler_class = JSONOutputHandler
    elif output_type == C.OUTPUT_HANDLER_TRANSLATION_WITH_FACTORS:
        handler_class = FactoredStringOutputHandler
    else:
        raise ValueError("unknown output type")

    # Open the stream only once the output type is known to be valid.
    output_stream = sys.stdout if output_fname is None else smart_open(output_fname, mode='w')
    return handler_class(output_stream)
def merge_spm(input_fname: str, output_fname: str):
    """
    Merge sub-word pieces whose word boundaries are marked with U+2581
    (presumably SentencePiece output) back into whitespace-separated words.

    Every space in a line is removed, every U+2581 marker becomes a space, and
    leading whitespace is stripped. Lines that end up empty are written as empty
    lines so that the output keeps one line per input line.

    :param input_fname: Path of the sub-word encoded input file, opened through utils.smart_open.
    :param output_fname: Path of the merged output file, written as UTF-8 plain text.
    """
    with utils.smart_open(input_fname, "r") as inp, open(output_fname, "w", encoding="utf-8") as out:
        for line in inp:
            sentence = line.replace(' ', '').replace('\u2581', ' ').lstrip()
            # lstrip() also removes the newline of an empty or whitespace-only line;
            # put it back so such lines are not silently dropped from the output.
            if not sentence and line.endswith('\n'):
                sentence = '\n'
            out.write(sentence)
def make_inputs(
        input_file: Optional[str],
        translator: inference.Translator,
        input_is_json: bool,
        input_factors: Optional[List[str]] = None
) -> Generator[inference.TranslatorInput, None, None]:
    """
    Lazily generate one TranslatorInput per line of input, numbering sentences from 1.

    If input_file is None, lines are read from sys.stdin and no factor files may be given;
    each line is parsed either as a JSON string or as a factored string.
    Otherwise input_file and all input_factors files are opened and read in lockstep.
    For JSON input only the line from input_file is used; for non-JSON input the number
    of files must equal translator.num_source_factors and the parallel lines are passed
    on together.

    :param input_file: The source file (possibly None, meaning STDIN).
    :param translator: Translator that will translate each line of input.
    :param input_is_json: Whether the input is in json format.
    :param input_factors: Source factor files.
    :return: TranslatorInput objects.
    """
    if input_file is None:
        check_condition(input_factors is None,
                        "Translating from STDIN, not expecting any factor files.")
        for sentence_id, line in enumerate(sys.stdin, 1):
            if not input_is_json:
                yield inference.make_input_from_factored_string(sentence_id=sentence_id,
                                                                factored_string=line,
                                                                translator=translator)
            else:
                yield inference.make_input_from_json_string(sentence_id=sentence_id,
                                                            json_string=line,
                                                            translator=translator)
        return

    factor_files = [] if input_factors is None else input_factors
    input_paths = [input_file] + factor_files
    if not input_is_json:
        # One file per source factor is required when factors come from separate files.
        check_condition(translator.num_source_factors == len(input_paths),
                        "Model(s) require %d factors, but %d given (through --input and --input-factors)." % (
                            translator.num_source_factors, len(input_paths)))

    with ExitStack() as exit_stack:
        # ExitStack closes every opened stream when the generator finishes or is closed.
        streams = [exit_stack.enter_context(smart_open(path)) for path in input_paths]
        for sentence_id, parallel_lines in enumerate(zip(*streams), 1):
            if not input_is_json:
                yield inference.make_input_from_multiple_strings(sentence_id=sentence_id,
                                                                 strings=list(parallel_lines))
            else:
                yield inference.make_input_from_json_string(sentence_id=sentence_id,
                                                            json_string=parallel_lines[0],
                                                            translator=translator)
def merge_bpe(input_fname: str, output_fname: str):
    """
    Undo byte-pair encoding by joining sub-words back into tokens.

    :param input_fname: Path of byte-pair encoded input file, plain text or gzipped.
    :param output_fname: Path of tokenized output file, plain text.
    """
    with utils.smart_open(input_fname, "r") as source, \
            open(output_fname, "w", encoding="utf-8") as target:
        # A marker followed by a space joins two sub-words.
        join_marker = SUBWORD_SPECIAL + " "
        # Any marker left after joining is stray (e.g. at end of line) and is removed.
        target.writelines(
            encoded.replace(join_marker, "").replace(SUBWORD_SPECIAL, "")
            for encoded in source
        )