Exemplo n.º 1
0
    def start(self):
        """Start the embedded ZooKeeper and Kafka processes, in that order.

        Both processes receive the same log file handle; the log file path is
        obtained from the engine under the name 'embedded-kafka'. When both
        processes have been started their pids are stored with ``_set_pids``.

        If anything fails, the processes that were actually started are
        killed, the log file is closed and the error is re-raised.

        Raises:
            IllegalStateException: if the process is already running, or if
                ZooKeeper or Kafka could not be started.
        """
        if self.is_running():
            raise IllegalStateException('process is already running')

        self._log_file = self._engine.get_logfile('embedded-kafka',
                                                  ensure=True)

        # None means "not started (yet)". The cleanup below must never pass
        # such a value to daemon.kill: pid 0 addresses the caller's whole
        # process group at the OS level, and None is not a pid at all.
        zpid, kpid = None, None

        # NOTE(review): the handle is left open on success, as before --
        # presumably the spawned processes keep writing to it; confirm
        # whether the parent's handle could be closed here.
        log = open(self._log_file, 'w')

        try:
            zpid = self._start_zookeeper(log)
            if zpid is None:
                raise IllegalStateException(
                    'failed to start zookeeper, check log file for more details: '
                    + self._log_file)

            kpid = self._start_kafka(log)
            if kpid is None:
                raise IllegalStateException(
                    'failed to start kafka, check log file for more details: '
                    + self._log_file)

            self._set_pids(kpid, zpid)
        except:
            # Bare except is deliberate: cleanup must also run on interrupts,
            # and the error is always re-raised.
            for pid in (kpid, zpid):
                if pid:
                    daemon.kill(pid)
            log.close()
            raise
Exemplo n.º 2
0
    def start(self):
        """Start the node process, plus the embedded Kafka when one is set.

        After the process has been spawned and its pid stored with
        ``_set_pid``, ``is_running`` is polled up to five times, sleeping one
        second after every unsuccessful check.

        Whenever the node does not come up -- including the case where
        spawning it raises -- the embedded Kafka started by this call is
        stopped again so it is not left running on its own.

        Raises:
            IllegalStateException: if the process is already running.
            Exception: if the node is not running after the polling period.
        """
        if self.is_running():
            raise IllegalStateException('process is already running')

        if self._kafka:
            self._kafka.start()

        success = False

        try:
            process = self._start_process()
            pid = process.pid

            if pid > 0:
                self._set_pid(pid)

                for _ in range(0, 5):
                    success = self.is_running()
                    if success:
                        break

                    time.sleep(1)
        finally:
            # Runs both on a plain startup failure and when an exception is
            # propagating, so Kafka never outlives a node that failed to start.
            if not success and self._kafka:
                self._kafka.stop()

        if not success:
            raise Exception(
                'failed to start node, check log file for more details: ' +
                self._log_file)
Exemplo n.º 3
0
    def stop(self):
        """Kill the two processes whose pids are returned by ``_get_pids``.

        Raises:
            IllegalStateException: if the process is not running.
        """
        # The pids are read before the liveness check, in (first, second)
        # order as returned by _get_pids -- presumably Kafka then ZooKeeper,
        # mirroring _set_pids(kpid, zpid); confirm against the caller.
        kafka_pid, zookeeper_pid = self._get_pids()

        running = self.is_running()
        if not running:
            raise IllegalStateException('process is not running')

        # NOTE(review): the second argument is presumably a timeout; confirm
        # against daemon.kill.
        daemon.kill(kafka_pid, 5)
        daemon.kill(zookeeper_pid)
Exemplo n.º 4
0
    def start(self):
        """Start the embedded ZooKeeper and Kafka processes on a fresh runtime directory.

        The runtime directory is removed and re-created, ZooKeeper is started
        on a free TCP port picked at call time, and Kafka is started with that
        same ZooKeeper port. Both processes receive the same log file handle.
        When both have been started their pids are stored with ``_set_pids``.

        If anything fails, the processes that were actually started are
        killed, the log file is closed and the error is re-raised.

        Raises:
            IllegalStateException: if Kafka is already running, if
                ``self.port`` is not free, or if ZooKeeper or Kafka could not
                be started.
        """
        if self.is_running():
            raise IllegalStateException(
                'Cannot start Kafka process. Kafka process is already running')

        if not netutils.is_free(self.port):
            raise IllegalStateException(
                'port %d is already in use, please specify another port with --datastream-port'
                % self.port)

        self._log_file = self._engine.get_logfile('embedded-kafka',
                                                  ensure=True)

        # Always start from an empty runtime directory.
        shutil.rmtree(self._runtime, ignore_errors=True)
        fileutils.makedirs(self._runtime, exist_ok=True)

        # None means "not started (yet)". The cleanup below must never pass
        # such a value to daemon.kill: pid 0 addresses the caller's whole
        # process group at the OS level, and None is not a pid at all.
        zpid, kpid = None, None

        # NOTE(review): the handle is left open on success, as before --
        # presumably the spawned processes keep writing to it; confirm
        # whether the parent's handle could be closed here.
        log = open(self._log_file, 'w')

        try:
            zookeeper_port = netutils.get_free_tcp_port()

            zpid = self._start_zookeeper(log, zookeeper_port)
            if zpid is None:
                raise IllegalStateException(
                    'failed to start zookeeper, check log file for more details: '
                    + self._log_file)

            kpid = self._start_kafka(log, zookeeper_port)
            if kpid is None:
                raise IllegalStateException(
                    'failed to start kafka, check log file for more details: '
                    + self._log_file)

            self._set_pids(kpid, zpid)
        except:
            # Bare except is deliberate: cleanup must also run on interrupts,
            # and the error is always re-raised.
            for pid in (kpid, zpid):
                if pid:
                    daemon.kill(pid)
            log.close()
            raise
Exemplo n.º 5
0
    def stop(self):
        """Kill the node process and then stop the embedded Kafka, if any.

        Raises:
            IllegalStateException: if the process is not running.
        """
        # The pid is read before the liveness check, as in the original flow.
        node_pid = self._get_pid()

        running = self.is_running()
        if not running:
            raise IllegalStateException('process is not running')

        daemon.kill(node_pid, ClusterNode.__SIGTERM_TIMEOUT)

        # Kafka is only stopped after the node itself has been killed.
        if self._kafka:
            self._kafka.stop()
Exemplo n.º 6
0
    def _on_fields_injected(self, injector):
        """Finish building the engine from its injected fields.

        Resolves the language pair (falling back to the values stored in the
        configuration), selects default language-model and aligner types when
        none were given, instantiates the engine components, registers the
        Moses features and sets the default feature weights. When no
        configuration object exists yet, one is created from the injector and
        the language pair is written into it.

        Raises:
            IllegalStateException: if the source or target language is still
                unset after consulting the configuration.
        """
        # When either language is missing, BOTH are reloaded from the config.
        languages_unset = self.target_lang is None or self.source_lang is None
        if languages_unset:
            stored = self.config

            if stored is not None:
                self.target_lang = stored.get(self.injector_section, 'target_lang')
                self.source_lang = stored.get(self.injector_section, 'source_lang')

        if self.target_lang is None or self.source_lang is None:
            raise IllegalStateException('Engine target language or source language must be specified')

        # Fall back to the first available implementation of each component.
        if self._lm_type is None:
            self._lm_type = LanguageModel.available_types[0]
        if self._aligner_type is None:
            self._aligner_type = WordAligner.available_types[0]

        source, target = self.source_lang, self.target_lang

        self.analyzer = injector.inject(ContextAnalyzer(self._context_index))
        self.cleaner = TMCleaner(source, target)

        phrase_table = SuffixArraysPhraseTable(self._pt_model, (source, target))
        self.pt = injector.inject(phrase_table)

        word_aligner = WordAligner.instantiate(self._aligner_type, self._aligner_model, source, target)
        self.aligner = injector.inject(word_aligner)

        language_model = LanguageModel.instantiate(self._lm_type, self._lm_model)
        self.lm = injector.inject(language_model)

        preprocessor = TrainingPreprocessor(source, target, self._vocabulary_model)
        self.training_preprocessor = injector.inject(preprocessor)

        self.db = _DomainMapBuilder(self._db_path, source, target)

        # Features are registered in a fixed order: the stateless penalties
        # first, then the phrase table and finally the language model.
        self.moses = injector.inject(Moses(self._moses_ini_file))
        for feature_name in ('UnknownWordPenalty', 'WordPenalty', 'Distortion', 'PhrasePenalty'):
            self.moses.add_feature(MosesFeature(feature_name))
        self.moses.add_feature(self.pt, 'Sapt')
        # self.moses.add_feature(LexicalReordering(), 'DM0')
        self.moses.add_feature(self.lm, 'InterpolatedLM')

        self._optimal_weights = {
            'InterpolatedLM': [0.24759],
            'Sapt': [0.118797, 0.172922, 0.0134384, 0.0143003],
            'Distortion0': [0.197845],
            'WordPenalty0': [-0.217267],
            'PhrasePenalty0': [0.0178411],
        }

        if self._config is not None:
            return

        # No configuration yet: derive one from the injector and record the
        # language pair in it.
        self._config = injector.to_config()
        self._config.set(self.injector_section, 'source_lang', source)
        self._config.set(self.injector_section, 'target_lang', target)
Exemplo n.º 7
0
    def start(self):
        """Start the embedded Cassandra process on a fresh runtime directory.

        The runtime directory is removed and re-created, Cassandra is started
        with the log file obtained from the engine under the name
        'embedded-cassandra', and its pid is stored with ``_set_pid``.

        If anything fails, the process is killed when it was actually started,
        the log file is closed and the error is re-raised.

        Raises:
            IllegalStateException: if Cassandra is already running, if
                ``self.port`` is not free, or if Cassandra could not be
                started.
        """
        if self.is_running():
            raise IllegalStateException(
                'Cannot start Cassandra process. Cassandra process is already running'
            )

        if not netutils.is_free(self.port):
            raise IllegalStateException(
                'port %d is already in use, please specify another port with --db-port'
                % self.port)

        self._log_file = self._engine.get_logfile('embedded-cassandra',
                                                  ensure=True)

        # Always start from an empty runtime directory.
        shutil.rmtree(self._runtime, ignore_errors=True)
        fileutils.makedirs(self._runtime, exist_ok=True)

        # None means "not started (yet)". The cleanup below must never pass
        # such a value to daemon.kill: pid 0 addresses the caller's whole
        # process group at the OS level, and None is not a pid at all.
        cpid = None

        # NOTE(review): the handle is left open on success, as before --
        # presumably the spawned process keeps writing to it; confirm whether
        # the parent's handle could be closed here.
        log = open(self._log_file, 'w')

        try:
            cpid = self._start_cassandra(log)

            if cpid is None:
                raise IllegalStateException(
                    'failed to start Cassandra, check log file for more details: '
                    + self._log_file)
            self._set_pid(cpid)
        except:
            # Bare except is deliberate: cleanup must also run on interrupts,
            # and the error is always re-raised.
            if cpid:
                daemon.kill(cpid)
            log.close()
            raise
Exemplo n.º 8
0
    def execute(self, line):
        if len(line) == 0:
            return

        try:
            translation = self._translate(line)

            if self._print_nbest is not None:
                for nbest in translation['nbest']:
                    self._nbest_out.write((u' ||| '.join(self._encode_nbest(nbest))).encode('utf-8'))
                    self._nbest_out.write('\n')

            print '>>', self._encode_translation(translation)
        except requests.exceptions.ConnectionError:
            raise IllegalStateException('connection problem: MMT server not running, start it with "./mmt start"')
        except requests.exceptions.HTTPError as e:
            raise Exception('HTTP ERROR: ' + e.message)
Exemplo n.º 9
0
    def tune(self, corpora=None, debug=False, context_enabled=True):
        """Tune the decoder feature weights by running the MERT script on a set of corpora.

        The reference (target-language) side of the corpora is tokenized and
        merged, the MERT script is executed through the shell against the
        running server, and the weights found in the resulting ``moses.ini``
        are sent to the server with ``self.api.update_features``.

        :param corpora: corpora to tune on; when None they are listed from the
            ``TrainingPreprocessor.DEV_FOLDER_NAME`` folder under the engine
            data path
        :param debug: when True the engine temporary directory is NOT cleared
            at the end, so intermediate files can be inspected
        :param context_enabled: when False, ``--skip-context-analysis 1`` is
            added to the decoder flags passed to the MERT script
        :raises IllegalArgumentException: if the list of corpora is empty
        :raises IllegalStateException: if ``is_running()`` reports that no
            server is running
        """
        if corpora is None:
            corpora = BilingualCorpus.list(
                os.path.join(self.engine.data_path,
                             TrainingPreprocessor.DEV_FOLDER_NAME))

        if len(corpora) == 0:
            raise IllegalArgumentException('empty corpora')

        if not self.is_running():
            raise IllegalStateException(
                'No MMT Server running, start the engine first')

        tokenizer = Tokenizer()

        target_lang = self.engine.target_lang
        source_lang = self.engine.source_lang

        # Split every corpus into a source-only and a target-only view that
        # share the original name and folder.
        source_corpora = [
            BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(),
                                          [source_lang]) for corpus in corpora
        ]
        reference_corpora = [
            BilingualCorpus.make_parallel(corpus.name, corpus.get_folder(),
                                          [target_lang]) for corpus in corpora
        ]

        # NOTE(review): 4 presumably is the number of steps reported below
        # (four `step` blocks follow) -- confirm against _tuning_logger.
        cmdlogger = _tuning_logger(4)
        cmdlogger.start(self, corpora)

        working_dir = self.engine.get_tempdir('tuning')
        mert_wd = os.path.join(working_dir, 'mert')

        try:
            # Tokenization (only the reference corpora are tokenized here)
            tokenized_output = os.path.join(working_dir, 'reference_corpora')
            fileutils.makedirs(tokenized_output, exist_ok=True)

            with cmdlogger.step('Corpora tokenization') as _:
                reference_corpora = tokenizer.process_corpora(
                    reference_corpora, tokenized_output)

            # Create merged corpus
            with cmdlogger.step('Merging corpus') as _:
                # source: one line per corpus holding the value returned by
                # get_file() -- presumably the file path, not its content;
                # confirm against what the MERT script expects.
                source_merged_corpus = os.path.join(working_dir,
                                                    'corpus.' + source_lang)

                with open(source_merged_corpus, 'wb') as out:
                    for corpus in source_corpora:
                        out.write(corpus.get_file(source_lang) + '\n')

                # target: the tokenized reference files are combined with
                # fileutils.merge into a single file
                target_merged_corpus = os.path.join(working_dir,
                                                    'corpus.' + target_lang)
                fileutils.merge([
                    corpus.get_file(target_lang)
                    for corpus in reference_corpora
                ], target_merged_corpus)

            # Run MERT algorithm
            with cmdlogger.step('Tuning') as _:
                # Start MERT
                decoder_flags = ['--port', str(self.api.port)]

                if not context_enabled:
                    decoder_flags.append('--skip-context-analysis')
                    decoder_flags.append('1')

                fileutils.makedirs(mert_wd, exist_ok=True)

                # Only the NAME of the temporary file is handed to the script;
                # the file is removed when the `with` block exits.
                with tempfile.NamedTemporaryFile() as runtime_moses_ini:
                    # The arguments are joined into ONE shell string below,
                    # hence the explicit quoting around --mertargs and
                    # --decoder-flags values.
                    command = [
                        self._mert_script, source_merged_corpus,
                        target_merged_corpus, self._mert_i_script,
                        runtime_moses_ini.name, '--threads',
                        str(multiprocessing.cpu_count()), '--mertdir',
                        cli.BIN_DIR, '--mertargs',
                        '\'--binary --sctype BLEU\'', '--working-dir', mert_wd,
                        '--nbest', '100', '--decoder-flags',
                        '"' + ' '.join(decoder_flags) + '"', '--nonorm',
                        '--closest', '--no-filter-phrase-table'
                    ]

                    # stdout and stderr of the script both go to the 'mert' log
                    with open(self.engine.get_logfile('mert'), 'wb') as log:
                        shell.execute(' '.join(command),
                                      stdout=log,
                                      stderr=log)

            # Read optimized configuration
            with cmdlogger.step('Applying changes') as _:
                bleu_score = 0
                weights = {}
                found_weights = False

                with open(os.path.join(mert_wd, 'moses.ini')) as moses_ini:
                    for line in moses_ini:
                        line = line.strip()

                        if len(line) == 0:
                            continue
                        elif found_weights:
                            # After the '[weight]' header EVERY non-empty line
                            # is read as "<name>= <v1> <v2> ...".
                            tokens = line.split()
                            weights[tokens[0].rstrip('=')] = [
                                float(val) for val in tokens[1:]
                            ]
                        elif line.startswith('# BLEU'):
                            # The score is the third whitespace-separated token
                            bleu_score = float(line.split()[2])
                        elif line == '[weight]':
                            found_weights = True

                # The return value of update_features is intentionally ignored
                _ = self.api.update_features(weights)

            cmdlogger.completed(bleu_score)
        finally:
            # Keep the temporary files only when debugging
            if not debug:
                self.engine.clear_tempdir()