def read_and_translate(translator: sockeye.inference.Translator,
                       output_handler: sockeye.output_handler.OutputHandler,
                       chunk_size: int,
                       source: Optional[str] = None) -> None:
    """
    Reads from either a file or stdin and translates each line, calling the output_handler with the result.

    :param output_handler: Handler that will write output to a stream.
    :param translator: Translator that will translate each line of input.
    :param chunk_size: The size of the portion to read at a time from the input.
    :param source: Path to file which will be translated line-by-line if included, if none use stdin.
    """
    source_data = sys.stdin if source is None else sockeye.data_io.smart_open(source)

    logger.info("Translating...")
    total_time, total_lines = 0.0, 0
    for chunk in grouper(source_data, chunk_size):
        chunk_time = translate(output_handler, chunk, translator, total_lines)
        total_lines += len(chunk)
        total_time += chunk_time

    if total_lines != 0:
        logger.info("Processed %d lines in %d batches. Total time: %.4f, sec/sent: %.4f, sent/sec: %.4f",
                    total_lines, ceil(total_lines / translator.batch_size), total_time,
                    total_time / total_lines, total_lines / total_time)
    else:
        logger.info("Processed 0 lines.")
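# All variants here rely on a `grouper` helper that yields fixed-size chunks
# from an iterable. A minimal sketch using only the standard library; the
# actual implementation in sockeye may differ in detail.
import itertools
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

def grouper(iterable: Iterable[T], size: int) -> Iterator[List[T]]:
    """Yield lists of up to `size` items; the final chunk may be shorter."""
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, size))
        if not chunk:
            return
        yield chunk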
def read_and_translate(translator: sockeye.inference.Translator,
                       output_handler: sockeye.output_handler.OutputHandler,
                       chunk_size: Optional[int],
                       source: Optional[str] = None) -> None:
    """
    Reads from either a file or stdin and translates each line, calling the output_handler with the result.

    :param output_handler: Handler that will write output to a stream.
    :param translator: Translator that will translate each line of input.
    :param chunk_size: The size of the portion to read at a time from the input.
    :param source: Path to file which will be translated line-by-line if included, if none use stdin.
    """
    if source is None:
        source_data = sys.stdin
    elif "scp" in source:
        source_data = data_io.read_content(source, "scp")
    elif "lab" in source:
        source_data = data_io.read_content(source, "lab")
    else:
        source_data = sockeye.data_io.smart_open(source)

    batch_size = translator.batch_size

    if chunk_size is None:
        if translator.batch_size == 1:
            # No batching, therefore there is no need to read segments in chunks.
            chunk_size = C.CHUNK_SIZE_NO_BATCHING
        else:
            # Get a constant number of batches per call to Translator.translate.
            chunk_size = C.CHUNK_SIZE_PER_BATCH_SEGMENT * translator.batch_size
    else:
        if chunk_size < translator.batch_size:
            logger.warning("You specified a chunk size (%d) smaller than the batch size (%d). This will lead to "
                           "a degradation of translation speed. Consider choosing a larger chunk size.",
                           chunk_size, batch_size)

    logger.info("Translating...")
    total_time, total_lines = 0.0, 0
    cnt = 0
    p = 0
    for chunk in grouper(source_data, chunk_size):
        # p carries the chunk counter on every tenth chunk and 0 otherwise.
        if cnt % 10 == 0:
            p = cnt
        else:
            p = 0
        cnt += 1
        chunk_time = translate(output_handler, chunk, translator, total_lines, p)
        total_lines += len(chunk)
        total_time += chunk_time

    if total_lines != 0:
        logger.info("Processed %d lines in %d batches. Total time: %.4f, sec/sent: %.4f, sent/sec: %.4f",
                    total_lines, ceil(total_lines / batch_size), total_time,
                    total_time / total_lines, total_lines / total_time)
    else:
        logger.info("Processed 0 lines.")
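# The chunk-size defaulting rule shared by the variants above and below,
# factored into a pure function for illustration. The constant values here are
# placeholders; the real values live in sockeye.constants (imported as C).
CHUNK_SIZE_NO_BATCHING = 1
CHUNK_SIZE_PER_BATCH_SEGMENT = 10

def effective_chunk_size(chunk_size: Optional[int], batch_size: int) -> int:
    if chunk_size is not None:
        return chunk_size  # caller's choice wins, even if smaller than batch_size
    if batch_size == 1:
        return CHUNK_SIZE_NO_BATCHING  # no batching: read one segment at a time
    # Keep a constant number of batches per Translator.translate call.
    return CHUNK_SIZE_PER_BATCH_SEGMENT * batch_size

assert effective_chunk_size(None, 1) == 1
assert effective_chunk_size(None, 16) == 160
assert effective_chunk_size(500, 16) == 500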
def read_and_translate(translator: inference.Translator,
                       output_handler: OutputHandler,
                       chunk_size: Optional[int],
                       input_file: Optional[str] = None,
                       input_factors: Optional[List[str]] = None,
                       input_is_json: bool = False,
                       num_translations: int = 3) -> None:
    """
    Reads from either a file or stdin and translates each line, calling the output_handler with the result.

    :param output_handler: Handler that will write output to a stream.
    :param translator: Translator that will translate each line of input.
    :param chunk_size: The size of the portion to read at a time from the input.
    :param input_file: Optional path to file which will be translated line-by-line if included, if none use stdin.
    :param input_factors: Optional list of paths to files that contain source factors.
    :param input_is_json: Whether the input is in json format.
    :param num_translations: Number of translations to produce per input.
    """
    batch_size = translator.batch_size

    if chunk_size is None:
        if translator.batch_size == 1:
            # No batching, therefore there is no need to read segments in chunks.
            chunk_size = C.CHUNK_SIZE_NO_BATCHING
        else:
            # Get a constant number of batches per call to Translator.translate.
            chunk_size = C.CHUNK_SIZE_PER_BATCH_SEGMENT * translator.batch_size
    else:
        if chunk_size < translator.batch_size:
            logger.warning("You specified a chunk size (%d) smaller than the batch size (%d). This will lead to "
                           "a reduction in translation speed. Consider choosing a larger chunk size.",
                           chunk_size, batch_size)

    logger.info("Translating...")
    total_time, total_lines = 0.0, 0
    for chunk in grouper(make_inputs(input_file, translator, input_is_json, input_factors), size=chunk_size):
        chunk_time = translate(output_handler, chunk, translator, num_translations)
        total_lines += len(chunk)
        total_time += chunk_time
        # Stop after the first chunk: only the first chunk_size inputs are translated.
        break

    if total_lines != 0:
        logger.info("Processed %d lines in %d batches. Total time: %.4f, sec/sent: %.4f, sent/sec: %.4f",
                    total_lines, ceil(total_lines / batch_size), total_time,
                    total_time / total_lines, total_lines / total_time)
    else:
        logger.info("Processed 0 lines.")
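# Because of the early `break` in the variant above, only the first chunk is
# ever consumed, so at most chunk_size inputs are translated per call. A small
# illustration using the grouper sketch from earlier (hypothetical inputs):
first_chunk = next(grouper(range(25), 10))
assert first_chunk == list(range(10))  # inputs 10..24 are never translated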
def read_and_translate(translator: inference.Translator,
                       output_handler: OutputHandler,
                       chunk_size: Optional[int],
                       input_file: Optional[str] = None,
                       input_factors: Optional[List[str]] = None,
                       input_is_json: bool = False) -> None:
    """
    Reads from either a file or stdin and translates each line, calling the output_handler with the result.

    :param output_handler: Handler that will write output to a stream.
    :param translator: Translator that will translate each line of input.
    :param chunk_size: The size of the portion to read at a time from the input.
    :param input_file: Optional path to file which will be translated line-by-line if included, if none use stdin.
    :param input_factors: Optional list of paths to files that contain source factors.
    :param input_is_json: Whether the input is in json format.
    """
    batch_size = translator.batch_size

    if chunk_size is None:
        if translator.batch_size == 1:
            # No batching, therefore there is no need to read segments in chunks.
            chunk_size = C.CHUNK_SIZE_NO_BATCHING
        else:
            # Get a constant number of batches per call to Translator.translate.
            chunk_size = C.CHUNK_SIZE_PER_BATCH_SEGMENT * translator.batch_size
    else:
        if chunk_size < translator.batch_size:
            logger.warning("You specified a chunk size (%d) smaller than the batch size (%d). This will lead to "
                           "a reduction in translation speed. Consider choosing a larger chunk size.",
                           chunk_size, batch_size)

    logger.info("Translating...")
    total_time, total_lines = 0.0, 0
    for chunk in grouper(make_inputs(input_file, translator, input_is_json, input_factors), size=chunk_size):
        chunk_time = translate(output_handler, chunk, translator)
        total_lines += len(chunk)
        total_time += chunk_time

    if total_lines != 0:
        logger.info("Processed %d lines in %d batches. Total time: %.4f, sec/sent: %.4f, sent/sec: %.4f",
                    total_lines, ceil(total_lines / batch_size), total_time,
                    total_time / total_lines, total_lines / total_time)
    else:
        logger.info("Processed 0 lines.")
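# A hedged sketch of what make_inputs(...) is expected to yield for the loop
# above: one TranslatorInput per line, read from a file or stdin. The real
# make_inputs also handles JSON inputs and source factors; this sketch covers
# only the plain-text path and assumes inference.make_input_from_plain_string
# exists with this signature.
import sys
from contextlib import ExitStack

def make_plain_inputs(input_file: Optional[str]):
    with ExitStack() as exit_stack:
        stream = sys.stdin if input_file is None else exit_stack.enter_context(
            data_io.smart_open(input_file))
        for sentence_id, line in enumerate(stream):
            yield inference.make_input_from_plain_string(sentence_id, line.rstrip())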
def read_and_translate(translator: inference.Translator,
                       output_handler: OutputHandler,
                       chunk_size: Optional[int],
                       input_file: Optional[str] = None,
                       input_factors: Optional[List[str]] = None,
                       dynamic_batch_mode_enabled: bool = False,
                       input_is_json: bool = False) -> None:
    """
    Reads from either a file or stdin and translates each line, calling the output_handler with the result.

    :param output_handler: Handler that will write output to a stream.
    :param translator: Translator that will translate each line of input.
    :param chunk_size: The size of the portion to read at a time from the input.
    :param input_file: Optional path to file which will be translated line-by-line if included, if none use stdin.
    :param input_factors: Optional list of paths to files that contain source factors.
    :param dynamic_batch_mode_enabled: Flag set to allow dynamic batches in translation, rather than a fixed value.
    :param input_is_json: Whether the input is in json format.
    """
    batch_size = translator.batch_size
    logger.info("Translating...")
    total_time, total_lines = 0.0, 0

    # We allow for dynamic batch calls if reading from stdin with json inputs.
    if dynamic_batch_mode_enabled and input_file is None and input_is_json:
        logger.info("Dynamic batch mode enabled, translating in batches as delivered...")
        for translation_in in make_input_lists():
            # If the input goes beyond the max batch_size, split it into batch_size chunks.
            max_batches = [translation_in[i:i + batch_size]
                           for i in range(0, len(translation_in), batch_size)]
            for max_batch in max_batches:
                translate_time = translate(output_handler=output_handler,
                                           dynamic_batch_mode_enabled=dynamic_batch_mode_enabled,
                                           trans_inputs=max_batch,
                                           translator=translator)
                total_lines += len(max_batch)
                total_time += translate_time
    else:
        if chunk_size is None:
            if translator.batch_size == 1:
                # No batching, therefore there is no need to read segments in chunks.
                chunk_size = C.CHUNK_SIZE_NO_BATCHING
            else:
                # Get a constant number of batches per call to Translator.translate.
                chunk_size = C.CHUNK_SIZE_PER_BATCH_SEGMENT * translator.batch_size
        else:
            if chunk_size < translator.batch_size:
                logger.warning("You specified a chunk size (%d) smaller than the batch size (%d). This will lead to "
                               "a reduction in translation speed. Consider choosing a larger chunk size.",
                               chunk_size, batch_size)

        for chunk in grouper(make_inputs(input_file, translator, input_is_json, input_factors),
                             size=chunk_size):
            chunk_time = translate(output_handler=output_handler,
                                   dynamic_batch_mode_enabled=dynamic_batch_mode_enabled,
                                   trans_inputs=chunk,
                                   translator=translator)
            total_lines += len(chunk)
            total_time += chunk_time

    if total_lines != 0:
        logger.info("Processed %d lines in %d batches. Total time: %.4f, sec/sent: %.4f, sent/sec: %.4f",
                    total_lines, ceil(total_lines / batch_size), total_time,
                    total_time / total_lines, total_lines / total_time)
    else:
        logger.info("Processed 0 lines.")
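# The dynamic-batch path above splits an oversized request into batch_size
# slices with a list comprehension. The same idea as a standalone helper:
def split_into_batches(items: List, batch_size: int) -> List[List]:
    return [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

assert split_into_batches(list("abcdefg"), 3) == [list("abc"), list("def"), list("g")]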