Exemplo n.º 1
0
    def generate(self, bilingual_corpora, monolingual_corpora, output, log_file=None):
        """Run the Java domain tool and build the training folder from its output.

        The tool is invoked on the distinct folders of ``bilingual_corpora``;
        its standard output is read as one tab-separated ``domain``/``name``
        pair per line and turned into a ``name -> domain`` dictionary.

        :param bilingual_corpora: corpora exposing ``get_folder()``
        :param monolingual_corpora: forwarded to ``_make_training_folder``
        :param output: forwarded to ``_make_training_folder``
        :param log_file: optional path that receives the tool's stderr; when
            omitted, stderr is sent to ``shell.DEVNULL``
        :return: the result of ``_make_training_folder``
        :raises ValueError: if an output line contains no tab separator
        """
        fileutils.makedirs(self._model, exist_ok=True)

        args = ['--db', os.path.join(self._model, 'domains.db'), '-l', self._source_lang, '-c']
        args.extend({corpus.get_folder() for corpus in bilingual_corpora})

        command = cli.mmt_javamain(self._java_mainclass, args)

        if log_file is None:
            stdout, _ = shell.execute(command, stderr=shell.DEVNULL)
        else:
            # The context manager closes only the handle opened here, so the
            # module-level shell.DEVNULL is never closed, even if open() fails.
            with open(log_file, 'w') as log:
                stdout, _ = shell.execute(command, stderr=log)

        domains = {}
        for line in stdout.splitlines():
            # maxsplit=1 always yields at most two fields, so a name that
            # itself contains a tab no longer breaks the unpacking.
            domain, name = line.split('\t', 1)
            domains[name] = domain

        return self._make_training_folder(bilingual_corpora, monolingual_corpora, domains, output)
Exemplo n.º 2
0
    def process(self, corpora, output_path, data_path=None):
        """Run the Java preprocessing main class and list what it produced.

        :param corpora: objects exposing ``get_folder()``; every distinct
            folder is passed after ``--input``
        :param output_path: value of ``--output`` and root of the returned listing
        :param data_path: optional base folder; when given, its
            ``DEV_FOLDER_NAME`` and ``TEST_FOLDER_NAME`` sub-folders are passed
            as ``--dev`` and ``--test``
        :return: ``BilingualCorpus.splitlist`` for the configured language pair
        """
        args = ['-s', self._source_lang, '-t', self._target_lang,
                '-v', self._vocabulary_path, '--output', output_path, '--input']
        args.extend({corpus.get_folder() for corpus in corpora})

        if data_path is not None:
            dev_folder = os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME)
            test_folder = os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME)
            args += ['--dev', dev_folder, '--test', test_folder]

        # stdin, stdout and stderr are all bound to shell.DEVNULL.
        shell.execute(mmt_javamain(self._java_mainclass, args),
                      stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

        return BilingualCorpus.splitlist(self._source_lang, self._target_lang, roots=output_path)
Exemplo n.º 3
0
    def _start_process(self):
        """Launch ``eu.modernmt.cli.ClusterNodeMain`` and return its ``Popen`` handle.

        Makes sure the engine runtime folder exists, resolves the log file,
        builds the command line from the node settings, removes any existing
        status file and finally spawns the process. The child's stdout is
        discarded and its stderr is appended to the log file.

        :return: the ``subprocess.Popen`` object of the spawned process
        """
        runtime_path = self.engine.get_runtime_path()
        if not os.path.isdir(runtime_path):
            fileutils.makedirs(runtime_path, exist_ok=True)
        self._log_file = self.engine.get_logfile(ClusterNode.__LOG_FILENAME, ensure=True)

        args = ['-e', self.engine.name, '-p', str(self._cluster_ports[0]), str(self._cluster_ports[1]),
                '--status-file', self._status_file]

        if self._start_rest_server:
            args += ['-a', str(self._api_port)]

        if self._verbosity is not None:
            args += ['-v', str(self._verbosity)]

        if self._sibling is not None:
            args += ['--member', str(self._sibling)]

        command = mmt_javamain('eu.modernmt.cli.ClusterNodeMain', args,
                               hserr_path=os.path.abspath(os.path.join(self._log_file, os.pardir)))

        # Remove the status file of a previous run before the new process starts.
        if os.path.isfile(self._status_file):
            os.remove(self._status_file)

        # 'wa' is not a valid open() mode on Python 3 (ValueError); append mode
        # is used so earlier log content is kept. The null device is opened for
        # writing because it backs the child's stdout. Both parent-side handles
        # are closed once Popen has handed them to the child.
        with open(self._log_file, 'a') as log, open(os.devnull, 'w') as devnull:
            return subprocess.Popen(command, stdout=devnull, stderr=log, shell=False)
Exemplo n.º 4
0
    def process(self, corpora, output_path, test_data_path=None, dev_data_path=None, log=None):
        """Run the Java main class over ``corpora`` and list the output corpora.

        :param corpora: objects exposing ``get_folder()``; every distinct
            folder is passed after ``--input``
        :param output_path: value of ``--output`` and folder that is listed afterwards
        :param test_data_path: optional value for ``--test``
        :param dev_data_path: optional value for ``--dev``
        :param log: stream used for both stdout and stderr of the command;
            defaults to ``osutils.DEVNULL``
        :return: ``BilingualCorpus.list`` for the configured language pair
        """
        sink = osutils.DEVNULL if log is None else log

        args = ['-s', self._source_lang, '-t', self._target_lang,
                '--output', output_path, '--input']
        args.extend({corpus.get_folder() for corpus in corpora})

        # Optional folders, in the order the flags are emitted.
        for flag, path in (('--dev', dev_data_path), ('--test', test_data_path)):
            if path is not None:
                args += [flag, path]

        osutils.shell_exec(mmt_javamain(self._java_main, args), stdout=sink, stderr=sink)

        return BilingualCorpus.list(self._source_lang, self._target_lang, output_path)
Exemplo n.º 5
0
    def clean(self, corpora, output_path, log=None):
        """Run the Java cleaning main class with an enlarged heap.

        The maximum heap passed to ``mmt_javamain`` is 90% of the physical
        memory reported by ``os.sysconf``, truncated to whole megabytes.

        :param corpora: objects exposing ``get_folder()``; every distinct
            folder is passed after ``--input``
        :param output_path: value of ``--output`` and folder that is listed afterwards
        :param log: stream used for both stdout and stderr of the command;
            defaults to ``shell.DEVNULL``
        :return: ``BilingualCorpus.list(output_path)``
        """
        sink = shell.DEVNULL if log is None else log

        # Physical memory in bytes: page size times number of physical pages.
        total_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
        total_mb = total_bytes / (1024. ** 2)
        extended_heap_mb = int(total_mb * 90 / 100)

        args = ['-s', self._source_lang, '-t', self._target_lang,
                '--output', output_path, '--input']
        args.extend({corpus.get_folder() for corpus in corpora})

        command = mmt_javamain(self._java_mainclass, args=args, max_heap_mb=extended_heap_mb)
        shell.execute(command, stdout=sink, stderr=sink)

        return BilingualCorpus.list(output_path)
Exemplo n.º 6
0
    def generate(self, bilingual_corpora, monolingual_corpora, output, log_file=None):
        """Run the Java domain tool and build the training folder from its output.

        The tool is invoked on the distinct folders of ``bilingual_corpora``;
        its standard output is read as one tab-separated ``domain``/``name``
        pair per line and turned into a ``name -> domain`` dictionary.

        :param bilingual_corpora: corpora exposing ``get_folder()``
        :param monolingual_corpora: forwarded to ``_make_training_folder``
        :param output: forwarded to ``_make_training_folder``
        :param log_file: optional path that receives the tool's stderr; when
            omitted, stderr is sent to ``shell.DEVNULL``
        :return: the result of ``_make_training_folder``
        :raises ValueError: if an output line contains no tab separator
        """
        fileutils.makedirs(self._model, exist_ok=True)

        args = ['--db', os.path.join(self._model, 'domains.db'), '-l', self._source_lang, '-c']
        args.extend({corpus.get_folder() for corpus in bilingual_corpora})

        command = cli.mmt_javamain(self._java_mainclass, args)

        if log_file is None:
            stdout, _ = shell.execute(command, stderr=shell.DEVNULL)
        else:
            # The context manager closes only the handle opened here, so the
            # module-level shell.DEVNULL is never closed, even if open() fails.
            with open(log_file, 'w') as log:
                stdout, _ = shell.execute(command, stderr=log)

        domains = {}
        for line in stdout.splitlines():
            # maxsplit=1 always yields at most two fields, so a name that
            # itself contains a tab no longer breaks the unpacking.
            domain, name = line.split('\t', 1)
            domains[name] = domain

        return self._make_training_folder(bilingual_corpora, monolingual_corpora, domains, output)
Exemplo n.º 7
0
    def __get_command(self, lang, print_tags, print_placeholders, original_spacing):
        """Build the command for the configured Java main class.

        :param lang: value passed after ``--lang``
        :param print_tags: when false, ``--no-tags`` is added
        :param print_placeholders: when true, ``--print-placeholders`` is added
        :param original_spacing: when true, ``--original-spacing`` is added
        :return: whatever ``mmt_javamain`` returns for the assembled arguments
        """
        # (condition, flag) pairs, listed in the order the flags are emitted.
        optional_flags = (
            (original_spacing, '--original-spacing'),
            (not print_tags, '--no-tags'),
            (print_placeholders, '--print-placeholders'),
        )
        args = ['--lang', lang] + [flag for enabled, flag in optional_flags if enabled]

        return mmt_javamain(self._java_mainclass, args)
Exemplo n.º 8
0
    def clean(self, source, target, input_paths, output_path):
        """Run the Java cleaning main class and return the first listing entry.

        :param source: value of ``-s``
        :param target: value of ``-t``
        :param input_paths: iterable of folders passed after ``--input``
        :param output_path: value of ``--output`` and root that is listed afterwards
        :return: element ``0`` of ``BilingualCorpus.splitlist`` for ``output_path``
        """
        args = ['-s', source, '-t', target, '--output', output_path, '--input']
        args.extend(input_paths)

        # stdin, stdout and stderr are all bound to shell.DEVNULL.
        shell.execute(mmt_javamain(self._java_mainclass, args),
                      stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

        listing = BilingualCorpus.splitlist(source, target, roots=output_path)
        return listing[0]
Exemplo n.º 9
0
    def process_file(self, source, dest, lang):
        """Pipe the file ``source`` through the Java main class into ``dest``.

        NOTE(review): ``--lang`` is taken from ``self._lang`` while the ``lang``
        argument only selects the destination file -- confirm this is intended.

        :param source: path of the file fed to the command's stdin
        :param dest: object exposing ``get_file(lang)``; that file receives stdout
        :param lang: language key passed to ``dest.get_file``
        """
        optional_flags = (
            (not self._print_tags, '--no-tags'),
            (self._print_placeholders, '--print-placeholders'),
        )
        args = ['--lang', self._lang] + [flag for enabled, flag in optional_flags if enabled]

        command = mmt_javamain(self._java_mainclass, args=args)

        with open(source) as input_stream, open(dest.get_file(lang), 'w') as output_stream:
            shell.execute(command, stdin=input_stream, stdout=output_stream)
Exemplo n.º 10
0
    def clean(self, corpora, output_path):
        """Run the Java cleaning main class and list the output folder.

        :param corpora: objects exposing ``get_folder()``; every distinct
            folder is passed after ``--input``
        :param output_path: value of ``--output`` and folder that is listed afterwards
        :return: ``BilingualCorpus.list(output_path)``
        """
        args = ['-s', self._source_lang, '-t', self._target_lang, '--output', output_path, '--input']
        args.extend({corpus.get_folder() for corpus in corpora})

        # stdin, stdout and stderr are all bound to shell.DEVNULL.
        shell.execute(mmt_javamain(self._java_mainclass, args),
                      stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

        return BilingualCorpus.list(output_path)
Exemplo n.º 11
0
    def clean(self, corpora, output_path, log=None):
        """Run the Java cleaning main class and list the output folder.

        :param corpora: objects exposing ``get_folder()``; every distinct
            folder is passed after ``--input``
        :param output_path: value of ``--output`` and folder that is listed afterwards
        :param log: stream used for both stdout and stderr of the command;
            defaults to ``shell.DEVNULL``
        :return: ``BilingualCorpus.list(output_path)``
        """
        sink = shell.DEVNULL if log is None else log

        args = ['-s', self._source_lang, '-t', self._target_lang, '--output', output_path, '--input']
        args.extend({corpus.get_folder() for corpus in corpora})

        shell.execute(mmt_javamain(self._java_mainclass, args), stdout=sink, stderr=sink)

        return BilingualCorpus.list(output_path)
Exemplo n.º 12
0
    def process(self, source, target, input_paths, output_path, data_path=None):
        """Run the Java preprocessing main class and list what it produced.

        :param source: value of ``-s``
        :param target: value of ``-t``
        :param input_paths: iterable of folders passed after ``--input``
        :param output_path: value of ``--output`` and root of the returned listing
        :param data_path: optional base folder; when given, its
            ``DEV_FOLDER_NAME`` and ``TEST_FOLDER_NAME`` sub-folders are passed
            as ``--dev`` and ``--test``
        :return: ``BilingualCorpus.splitlist(source, target, roots=output_path)``
        """
        args = ['-s', source, '-t', target, '--output', output_path, '--input']
        args.extend(input_paths)

        if data_path is not None:
            dev_folder = os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME)
            test_folder = os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME)
            args += ['--dev', dev_folder, '--test', test_folder]

        # stdin, stdout and stderr are all bound to shell.DEVNULL.
        shell.execute(mmt_javamain(self._java_mainclass, args),
                      stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

        return BilingualCorpus.splitlist(source, target, roots=output_path)
Exemplo n.º 13
0
    def reduce(self, corpora, output_path, word_limit, log=None):
        """Run the Java reduce main class and list the output folder.

        :param corpora: objects exposing ``get_folder()``; every distinct
            folder is passed after ``--input``
        :param output_path: value of ``--output`` and folder that is listed afterwards
        :param word_limit: value passed, as a string, after ``--words``
        :param log: stream used for both stdout and stderr of the command;
            defaults to ``shell.DEVNULL``
        :return: ``BilingualCorpus.list(output_path)``
        """
        sink = shell.DEVNULL if log is None else log

        args = ['-s', self._source_lang, '-t', self._target_lang,
                '--words', str(word_limit), '--output', output_path, '--input']
        args.extend({corpus.get_folder() for corpus in corpora})

        shell.execute(mmt_javamain(self._reduce_mainclass, args=args), stdout=sink, stderr=sink)

        return BilingualCorpus.list(output_path)
Exemplo n.º 14
0
    def _start_process(self, api_port, cluster_port, datastream_port, db_port, leader, verbosity):
        """Launch ``eu.modernmt.cli.ClusterNodeMain`` and return its ``Popen`` handle.

        Each optional argument is added to the command line only when it is
        not ``None``. Any existing status file is removed before spawning.

        :param api_port: value for ``--api-port``
        :param cluster_port: value for ``--cluster-port``
        :param datastream_port: value for ``--datastream-port``
        :param db_port: value for ``--db-port``
        :param leader: value for ``--leader`` (passed through unconverted)
        :param verbosity: value for ``-v``
        :return: the ``subprocess.Popen`` object of the spawned process
        """
        if not os.path.isdir(self.engine.runtime_path):
            fileutils.makedirs(self.engine.runtime_path, exist_ok=True)

        # Folder that contains the log file.
        logs_folder = os.path.abspath(os.path.join(self._log_file, os.pardir))

        args = ['-e', self.engine.name, '--status-file', self._status_file, '--logs', logs_folder]

        # Options whose value is converted with str(), in emission order.
        stringified_options = (
            ('--cluster-port', cluster_port),
            ('--api-port', api_port),
            ('--datastream-port', datastream_port),
            ('--db-port', db_port),
            ('-v', verbosity),
        )
        for flag, value in stringified_options:
            if value is not None:
                args += [flag, str(value)]

        if leader is not None:
            args += ['--leader', leader]

        command = mmt_javamain('eu.modernmt.cli.ClusterNodeMain', args, hserr_path=logs_folder)

        if os.path.isfile(self._status_file):
            os.remove(self._status_file)

        return subprocess.Popen(command, stdout=shell.DEVNULL, stderr=shell.DEVNULL, shell=False)
Exemplo n.º 15
0
    def create_index(self, corpora, log=None):
        """Rebuild the index folder by running the Java main class on ``corpora``.

        :param corpora: objects exposing ``get_folder()``; every distinct
            folder is passed after ``-c``
        :param log: stream used for both stdout and stderr of the command;
            defaults to ``shell.DEVNULL``
        """
        sink = shell.DEVNULL if log is None else log

        folders = {corpus.get_folder() for corpus in corpora}

        # Remove any existing index tree (errors ignored), then call
        # fileutils.makedirs on the same path.
        shutil.rmtree(self._index, ignore_errors=True)
        fileutils.makedirs(self._index, exist_ok=True)

        args = ['-s', self._source_lang, '-t', self._target_lang, '-i', self._index, '-c']
        args.extend(folders)

        shell.execute(mmt_javamain(self._java_mainclass, args), stdout=sink, stderr=sink)
Exemplo n.º 16
0
    def process(self, corpora, output_path, data_path=None):
        """Run the Java preprocessing main class and list what it produced.

        :param corpora: objects exposing ``get_folder()``; every distinct
            folder is passed after ``--input``
        :param output_path: value of ``--output`` and root of the returned listing
        :param data_path: optional base folder; when given, its
            ``DEV_FOLDER_NAME`` and ``TEST_FOLDER_NAME`` sub-folders are passed
            as ``--dev`` and ``--test``
        :return: ``BilingualCorpus.splitlist`` for the configured language pair
        """
        args = ['-s', self._source_lang, '-t', self._target_lang,
                '-v', self._vocabulary_path,
                '--output', output_path,
                '--input']
        args.extend({corpus.get_folder() for corpus in corpora})

        if data_path is not None:
            dev_folder = os.path.join(data_path, TrainingPreprocessor.DEV_FOLDER_NAME)
            test_folder = os.path.join(data_path, TrainingPreprocessor.TEST_FOLDER_NAME)
            args += ['--dev', dev_folder, '--test', test_folder]

        # stdin, stdout and stderr are all bound to shell.DEVNULL.
        shell.execute(mmt_javamain(self._java_mainclass, args),
                      stdin=shell.DEVNULL, stdout=shell.DEVNULL, stderr=shell.DEVNULL)

        return BilingualCorpus.splitlist(self._source_lang, self._target_lang, roots=output_path)
Exemplo n.º 17
0
    def process_file(self, input_path, output_path, lang):
        """Pipe ``input_path`` through the Java main class into ``output_path``.

        The ``-s``/``-t`` pair is the configured source/target pair when
        ``lang`` is the source language, and the swapped pair when it is the
        target language.

        :param input_path: file fed to the command's stdin
        :param output_path: file that receives the command's stdout
        :param lang: one of the two configured languages
        :raises ValueError: if ``lang`` is neither configured language
        """
        if lang == self._source_lang:
            first, second = self._source_lang, self._target_lang
        elif lang == self._target_lang:
            first, second = self._target_lang, self._source_lang
        else:
            raise ValueError('Unsupported language "%s"' % lang)

        optional_flags = (
            (not self._print_tags, '--no-tags'),
            (self._print_placeholders, '--print-placeholders'),
        )
        args = ['-s', first, '-t', second] + [flag for enabled, flag in optional_flags if enabled]

        command = mmt_javamain(self._java_main, args=args)

        with open(input_path) as input_stream, open(output_path, 'w') as output_stream:
            osutils.shell_exec(command, stdin=input_stream, stdout=output_stream)
Exemplo n.º 18
0
    def generate(self,
                 bilingual_corpora,
                 monolingual_corpora,
                 output,
                 log=None):
        """Run the Java domain tool and symlink the corpora into ``output``.

        The tool is invoked on the distinct folders of ``bilingual_corpora``;
        its standard output is read as one tab-separated ``domain``/``name``
        pair per line. Each bilingual corpus is then symlinked under the
        domain recorded for its name; monolingual corpora are symlinked as is.

        :param bilingual_corpora: corpora exposing ``get_folder()``, ``name``
            and ``symlink()``
        :param monolingual_corpora: corpora exposing ``symlink()``
        :param output: destination passed to every ``symlink`` call
        :param log: stream that receives the tool's stderr; defaults to
            ``shell.DEVNULL``
        :return: tuple ``(bilingual_corpora, monolingual_corpora)`` holding the
            results of the ``symlink`` calls
        :raises ValueError: if an output line contains no tab separator
        :raises KeyError: if a bilingual corpus name is missing from the output
        """
        if log is None:
            log = shell.DEVNULL

        fileutils.makedirs(self._model, exist_ok=True)

        args = [
            '--db', os.path.join(self._model, 'domains.db'),
            '-s', self._source_lang,
            '-t', self._target_lang,
            '-c',
        ]
        args.extend({corpus.get_folder() for corpus in bilingual_corpora})

        command = cli.mmt_javamain(self._java_mainclass, args)
        stdout, _ = shell.execute(command, stderr=log)

        domains = {}
        for line in stdout.splitlines():
            # maxsplit=1 always yields at most two fields, so a name that
            # itself contains a tab no longer breaks the unpacking.
            domain, name = line.split('\t', 1)
            domains[name] = domain

        bilingual_corpora = [
            corpus.symlink(output, name=domains[corpus.name])
            for corpus in bilingual_corpora
        ]
        monolingual_corpora = [
            corpus.symlink(output) for corpus in monolingual_corpora
        ]

        return bilingual_corpora, monolingual_corpora
Exemplo n.º 19
0
    def _start_process(self):
        """Launch ``eu.modernmt.cli.ClusterNodeMain`` and return its ``Popen`` handle.

        Makes sure the engine runtime folder exists, resolves the log file,
        builds the command line from the node settings, removes any existing
        status file and finally spawns the process. The child's stdout is
        discarded and its stderr is appended to the log file.

        :return: the ``subprocess.Popen`` object of the spawned process
        """
        runtime_path = self.engine.get_runtime_path()
        if not os.path.isdir(runtime_path):
            fileutils.makedirs(runtime_path, exist_ok=True)
        self._log_file = self.engine.get_logfile(ClusterNode.__LOG_FILENAME,
                                                 ensure=True)

        args = [
            '-e', self.engine.name,
            '-p', str(self._cluster_ports[0]), str(self._cluster_ports[1]),
            '--status-file', self._status_file,
        ]

        if self._start_rest_server:
            args += ['-a', str(self._api_port)]

        if self._verbosity is not None:
            args += ['-v', str(self._verbosity)]

        if self._sibling is not None:
            args += ['--member', str(self._sibling)]

        command = mmt_javamain('eu.modernmt.cli.ClusterNodeMain',
                               args,
                               hserr_path=os.path.abspath(
                                   os.path.join(self._log_file, os.pardir)))

        # Remove the status file of a previous run before the new process starts.
        if os.path.isfile(self._status_file):
            os.remove(self._status_file)

        # 'wa' is not a valid open() mode on Python 3 (ValueError); append mode
        # is used so earlier log content is kept. The null device is opened for
        # writing because it backs the child's stdout. Both parent-side handles
        # are closed once Popen has handed them to the child.
        with open(self._log_file, 'a') as log, open(os.devnull, 'w') as devnull:
            return subprocess.Popen(command,
                                    stdout=devnull,
                                    stderr=log,
                                    shell=False)
Exemplo n.º 20
0
    def clean(self, corpora, output_path, log=None):
        """Run the Java cleaning main class with an enlarged heap.

        The maximum heap passed to ``mmt_javamain`` is 90% of
        ``osutils.mem_size()``, truncated to an integer.

        :param corpora: objects exposing ``get_folder()``; every distinct
            folder is passed after ``--input``
        :param output_path: value of ``--output`` and folder that is listed afterwards
        :param log: stream used for both stdout and stderr of the command;
            defaults to ``osutils.DEVNULL``
        :return: ``BilingualCorpus.list`` for the configured language pair
        """
        sink = osutils.DEVNULL if log is None else log

        args = ['-s', self._source_lang, '-t', self._target_lang,
                '--output', output_path, '--input']
        args.extend({corpus.get_folder() for corpus in corpora})

        heap_limit_mb = int(osutils.mem_size() * 90 / 100)

        command = mmt_javamain(self._java_main, args=args, max_heap_mb=heap_limit_mb)
        osutils.shell_exec(command, stdout=sink, stderr=sink)

        return BilingualCorpus.list(self._source_lang, self._target_lang, output_path)
Exemplo n.º 21
0
    def create_index(self, corpora, lang, log_file=None):
        """Build the index by running the Java main class on ``corpora``.

        :param corpora: objects exposing ``get_folder()``; every distinct
            folder is passed after ``-c``
        :param lang: value passed after ``-l``
        :param log_file: optional path that receives both stdout and stderr of
            the command; when omitted both go to ``shell.DEVNULL``
        """
        source_paths = {corpus.get_folder() for corpus in corpora}

        fileutils.makedirs(self._index, exist_ok=True)

        args = ['-l', lang, '-i', self._index, '-c']
        args.extend(source_paths)

        command = mmt_javamain(self._java_mainclass, args)

        if log_file is None:
            shell.execute(command, stdout=shell.DEVNULL, stderr=shell.DEVNULL)
        else:
            # The context manager closes only the handle opened here, so the
            # module-level shell.DEVNULL is never closed, even if open() fails.
            with open(log_file, 'w') as log:
                shell.execute(command, stdout=log, stderr=log)
Exemplo n.º 22
0
    def _start_process(self):
        """Launch ``eu.modernmt.cli.ClusterNodeMain`` and return its ``Popen`` handle.

        The command line is built from the node settings; any existing status
        file is removed before spawning. Both stdout and stderr of the child
        are bound to ``shell.DEVNULL``.

        :return: the ``subprocess.Popen`` object of the spawned process
        """
        if not os.path.isdir(self.engine.runtime_path):
            fileutils.makedirs(self.engine.runtime_path, exist_ok=True)

        # Folder that contains the log file.
        logs_folder = os.path.abspath(os.path.join(self._log_file, os.pardir))

        args = ['-e', self.engine.name,
                '-p', str(self._cluster_ports[0]), str(self._cluster_ports[1]),
                '--datastream-port', str(self._datastream_port),
                '--status-file', self._status_file,
                '--logs', logs_folder]

        if self._start_rest_server:
            args += ['-a', str(self._api_port)]

        if self._verbosity is not None:
            args += ['-v', str(self._verbosity)]

        if self._sibling is not None:
            args += ['--member', str(self._sibling)]

        command = mmt_javamain('eu.modernmt.cli.ClusterNodeMain', args, hserr_path=logs_folder)

        if os.path.isfile(self._status_file):
            os.remove(self._status_file)

        return subprocess.Popen(command, stdout=shell.DEVNULL, stderr=shell.DEVNULL, shell=False)
Exemplo n.º 23
0
    def start(self, api_port=None, cluster_port=None, datastream_port=None, db_port=None,
              leader=None, verbosity=None, remote_debug=False, log_file=None):
        """Start the cluster node through ``eu.modernmt.cli.ClusterNodeMain``.

        Each optional port, ``verbosity`` and ``leader`` is added to the
        command line only when it is not ``None``. The JVM heap is a quarter
        of physical memory, clamped to the range 1024..16384 MB and rounded
        down to a multiple of 1024 MB.

        :param api_port: value for ``--api-port``
        :param cluster_port: value for ``--cluster-port``
        :param datastream_port: value for ``--datastream-port``
        :param db_port: value for ``--db-port``
        :param leader: value for ``--leader`` (passed through unconverted)
        :param verbosity: value for ``-v``
        :param remote_debug: forwarded to ``mmt_javamain``
        :param log_file: when given, replaces ``self._log_file``
        :raises Exception: if the parent class ``_start`` returns a false value
        """
        if log_file is not None:
            self._log_file = log_file

        if not os.path.isdir(self.engine.runtime_path):
            os.makedirs(self.engine.runtime_path)

        args = ['-e', self.engine.name,
                '--status-file', self._status_file,
                '--log-file', self._log_file]

        # Options whose value is converted with str(), in emission order.
        stringified_options = (
            ('--cluster-port', cluster_port),
            ('--api-port', api_port),
            ('--datastream-port', datastream_port),
            ('--db-port', db_port),
            ('-v', verbosity),
        )
        for flag, value in stringified_options:
            if value is not None:
                args += [flag, str(value)]

        if leader is not None:
            args += ['--leader', leader]

        # Physical memory in megabytes: page size times physical page count.
        total_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
        total_mb = total_bytes / (1024. ** 2)

        clamped_mb = max(min(total_mb / 4, 16 * 1024), 1024)
        heap_mb = int(clamped_mb / 1024) * 1024

        logs_folder = os.path.abspath(os.path.join(self._log_file, os.pardir))
        command = mmt_javamain('eu.modernmt.cli.ClusterNodeMain', args,
                               logs_path=logs_folder,
                               remote_debug=remote_debug,
                               max_heap_mb=heap_mb,
                               server=True)

        if os.path.isfile(self._status_file):
            os.remove(self._status_file)

        started = super(ClusterNode, self)._start(command)
        if not started:
            raise Exception('failed to start node, check log file for more details: %s' % self._log_file)