def test_parallel_first_and_second(self):
      """Exercise ``first()`` and ``second()`` after a split wire.

      The pipeline is a split wire followed by two stages: one marked
      ``first()`` that yields the input's 'pi' value under the key 'PI',
      and one marked ``second()`` that yields the 'e' value under 'E'.
      The evaluated result is expected to be the pair
      ``({'PI': pi}, {'E': e})``.
      """
      pi = 3.141
      e = 2.718
      value = {'pi': pi, 'e': e}

      # Each parenthesised stage is: dictionary wire (key mapping) followed by
      # a function component that reads the mapped key back out.
      pi_stage = (cons_dictionary_wire({'pi': 'PI'}) >>
                  cons_function_component(lambda a, s: {'PI': a['PI']}))
      e_stage = (cons_dictionary_wire({'e': 'E'}) >>
                 cons_function_component(lambda a, s: {'E': a['E']}))
      pipeline = cons_split_wire() >> pi_stage.first() >> e_stage.second()

      result = ParallelPypelineHelperUnitTest.test(1, pipeline, value, None, eval_pipeline)

      # assertEqual, not assertEquals: the latter is a deprecated alias that
      # no longer exists in Python 3.12+.
      self.assertEqual(({'PI': pi}, {'E': e}), result)
 def test_parallel_split(self):
      """Exercise split, ``first()``, ``**`` composition and an unsplit wire.

      The pipeline passes the input through an identity component and a
      split wire, maps 'pi' to 'PI' in the stage marked ``first()``, runs a
      'PI'-preserving component composed via ``**`` with an identity
      component, and finally merges both sides with an unsplit wire. The
      evaluated result is expected to be ``{'PI': pi, 'pi': pi}``.
      """
      pi = 3.141
      value = {'pi': pi}

      pipeline = (cons_function_component(lambda a, s: a) >>
                  cons_split_wire() >>
                  cons_function_component(lambda a, s: {'PI': a['pi']}).first() >>
                  (cons_function_component(lambda a, s: {'PI': a['PI']}) **
                   cons_function_component(lambda a, s: a)) >>
                  # The merge reads 'PI' from its first argument and 'pi'
                  # from its second.
                  cons_unsplit_wire(lambda t, b: {'PI': t['PI'], 'pi': b['pi']}))

      result = ParallelPypelineHelperUnitTest.test(1, pipeline, value, None, eval_pipeline)

      # assertEqual, not assertEquals: the latter is a deprecated alias that
      # no longer exists in Python 3.12+.
      self.assertEqual({'PI': pi, 'pi': pi}, result)
# Example no. 3
def clip_last_bit(filename):
    """Return ``filename`` without its last '.'-delimited component.

    Only the base name is examined, so dots in directory names are left
    untouched. A base name that contains no '.' is returned as is.
    """
    directory, base_name = os.path.split(filename)
    stem, dot, _ = base_name.rpartition(".")
    if not dot:
        # Nothing to clip; keep the name rather than collapsing it to ''.
        stem = base_name
    return os.path.join(directory, stem)


def main(src_lang, trg_lang, src_filename, trg_filename):
    """Build the training components, wire them into a pipeline and run it.

    Args:
        src_lang: Stored as ``src_lang`` in the component configuration.
        trg_lang: Stored as ``trg_lang`` in the component configuration.
        src_filename: Passed to the pipeline as the initial ``src_filename``.
        trg_filename: Passed to the pipeline as the initial ``trg_filename``.

    Raises:
        KeyError: If ``MOSES_HOME``, ``IRSTLM`` or ``GIZA_HOME`` is missing
            from the environment.
    """
    # Global configuration
    # One day, this configuration shall be constructed from
    # command line options, or a properties file.
    configuration = {
        'moses_installation_dir': os.environ['MOSES_HOME'],
        'irstlm_installation_dir': os.environ['IRSTLM'],
        'giza_installation_dir': os.environ['GIZA_HOME'],
        'src_lang': src_lang,
        'src_tokenisation_dir': './tokenisation',
        'trg_lang': trg_lang,
        'trg_tokenisation_dir': './tokenisation',
        'segment_length_limit': 60,
        'irstlm_smoothing_method': 'improved-kneser-ney',
        'language_model_directory': './language-model',
        'translation_model_directory': './translation-model',
        'mert_working_directory': './mert',
        'evaluation_data_size': 100,
        'development_data_size': 100,
    }

    # The modules to load
    # In the future, the components shall be specified in some kind
    # pipeline description file.
    component_modules = {
        'src_tokenizer': 'training.components.tokenizer.src_tokenizer',
        'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer',
        'cleanup': 'training.components.cleanup.cleanup',
        'data_split': 'training.components.data_split.data_split',
        'irstlm_build': 'training.components.irstlm_build.irstlm_build',
        'model_training': 'training.components.model_training.model_training',
        'mert': 'training.components.mert.mert',
    }

    # The thread pool
    executor = ThreadPoolExecutor(max_workers=3)

    # Phew, build the required components
    components, component_config = build_components(component_modules,
                                                    configuration, executor)

    # Wire up components
    # Description of wiring should be, in the future, alongside the component
    # specification in some kind of configuration file. Components shall be
    # declared then used, i.e., bind a component instance to a unique component
    # identifier, then wire component instances together by identifier.

    # Tokenisation of source and target...
    # IRSTLM Build components
    irstlm_build_component = (
        cons_split_wire() >>
        (cons_wire(lambda a, s: {'input_filename': a['tokenised_trg_filename']}) >>
         components['irstlm_build']).second() >>
        cons_unsplit_wire(lambda t, b: {
            'tokenised_trg_filename': t['tokenised_trg_filename'],
            'trg_language_model_filename': b['compiled_lm_filename'],
        })
    )

    # The complete tokenisation component
    tokenisation_component = (
        (components['src_tokenizer'] & components['trg_tokenizer']) >>
        irstlm_build_component.second() >>
        cons_unsplit_wire(lambda t, b: {
            'src_filename': t['tokenised_src_filename'],
            'trg_filename': b['tokenised_trg_filename'],
            'trg_language_model_filename': b['trg_language_model_filename'],
        })
    )

    # Cleanup and Data Splitting...
    # The training data filename is the split-off training file with its
    # last '.'-delimited component removed (see clip_last_bit).
    cleanup_datasplit_component = (
        components['cleanup'] >>
        cons_wire(lambda a, s: {
            'src_filename': a['cleaned_src_filename'],
            'trg_filename': a['cleaned_trg_filename'],
        }) >>
        components['data_split'] >>
        cons_wire(lambda a, s: {
            'training_data_filename': clip_last_bit(a['train_src_filename']),
            'eval_src_filename': a['eval_src_filename'],
            'eval_trg_filename': a['eval_trg_filename'],
        })
    )

    # Translation model training
    translation_model_component = (
        cons_split_wire() >>
        components['model_training'].first() >>
        cons_unsplit_wire(lambda t, b: {
            'moses_ini_file': t['moses_ini_file'],
            'development_data_filename': b['eval_src_filename'],
        })
    )

    # The whole pipeline
    # NOTE(review): the previous text of this expression ended in a dangling
    # `>>`. 'mert' is the only loaded component that was not wired anywhere
    # and the wire before it emits the language-model settings, so it is
    # assumed to be the intended final stage -- confirm.
    pipeline = (
        tokenisation_component >>
        cons_split_wire() >>
        (cleanup_datasplit_component >> translation_model_component).first() >>
        cons_unsplit_wire(lambda t, b: {
            'moses_ini_file': t['moses_ini_file'],
            'development_data_filename': clip_last_bit(t['development_data_filename']),
            'trg_language_model_filename': b['trg_language_model_filename'],
            'trg_language_model_order': 3,
            'trg_language_model_type': 9,
        }) >>
        components['mert']
    )

    # The input to the pipeline
    value = {'src_filename': src_filename, 'trg_filename': trg_filename}

    # Evaluate the pipeline
    # NOTE: a progress message ("Evaluating pipeline with input [%s]..." % value)
    # was left disabled here.
    try:
        new_value = eval_pipeline(executor, pipeline, value, component_config)
    finally:
        # Wait for all components to finish, even if evaluation failed, so
        # the pool's worker threads are always released.
        executor.shutdown(True)

    # Single-argument print() call works as written on Python 2 and 3.
    print("Pipeline evaluated to %s" % new_value)
# Example no. 4
def clip_last_bit(filename):
  """Strip the final '.'-delimited component from the base name of ``filename``.

  The directory part is preserved unchanged. When the base name has no '.',
  the path is returned unmodified.
  """
  directory, base_name = os.path.split(filename)
  stem, dot, _ = base_name.rpartition(".")
  if not dot:
    # No '.' present: there is nothing to clip.
    stem = base_name
  return os.path.join(directory, stem)


def main(src_lang, trg_lang, src_filename, trg_filename):
  """Assemble the training pipeline from its components and evaluate it.

  Args:
    src_lang: Value placed under ``src_lang`` in the configuration.
    trg_lang: Value placed under ``trg_lang`` in the configuration.
    src_filename: Initial ``src_filename`` entry of the pipeline input.
    trg_filename: Initial ``trg_filename`` entry of the pipeline input.

  Raises:
    KeyError: If ``MOSES_HOME``, ``IRSTLM`` or ``GIZA_HOME`` is not set in
      the environment.
  """
  # Global configuration
  # One day, this configuration shall be constructed from
  # command line options, or a properties file.
  configuration = {
    'moses_installation_dir': os.environ['MOSES_HOME'],
    'irstlm_installation_dir': os.environ['IRSTLM'],
    'giza_installation_dir': os.environ['GIZA_HOME'],
    'src_lang': src_lang,
    'src_tokenisation_dir': './tokenisation',
    'trg_lang': trg_lang,
    'trg_tokenisation_dir': './tokenisation',
    'segment_length_limit': 60,
    'irstlm_smoothing_method': 'improved-kneser-ney',
    'language_model_directory': './language-model',
    'translation_model_directory': './translation-model',
    'mert_working_directory': './mert',
    'evaluation_data_size': 100,
    'development_data_size': 100,
  }

  # The modules to load
  # In the future, the components shall be specified in some kind
  # pipeline description file.
  component_modules = {
    'src_tokenizer': 'training.components.tokenizer.src_tokenizer',
    'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer',
    'cleanup': 'training.components.cleanup.cleanup',
    'data_split': 'training.components.data_split.data_split',
    'irstlm_build': 'training.components.irstlm_build.irstlm_build',
    'model_training': 'training.components.model_training.model_training',
    'mert': 'training.components.mert.mert',
  }

  # The thread pool
  executor = ThreadPoolExecutor(max_workers = 3)

  # Phew, build the required components
  components, component_config = build_components(component_modules, configuration, executor)

  # Wire up components
  # Description of wiring should be, in the future, alongside the component
  # specification in some kind of configuration file. Components shall be
  # declared then used, i.e., bind a component instance to a unique component
  # identifier, then wire component instances together by identifier.

  # Tokenisation of source and target...
  # IRSTLM Build components
  build_lm_input = cons_wire(lambda a, s: {'input_filename': a['tokenised_trg_filename']})
  irstlm_build_component = (
    cons_split_wire() >>
    (build_lm_input >> components['irstlm_build']).second() >>
    cons_unsplit_wire(lambda t, b: {'tokenised_trg_filename': t['tokenised_trg_filename'],
                                    'trg_language_model_filename': b['compiled_lm_filename']})
  )

  # The complete tokenisation component
  tokenisation_component = (
    (components['src_tokenizer'] & components['trg_tokenizer']) >>
    irstlm_build_component.second() >>
    cons_unsplit_wire(lambda t, b: {'src_filename': t['tokenised_src_filename'],
                                    'trg_filename': b['tokenised_trg_filename'],
                                    'trg_language_model_filename': b['trg_language_model_filename']})
  )

  # Cleanup and Data Splitting...
  # clip_last_bit drops the last '.'-delimited component of the training
  # file name before it is handed on as 'training_data_filename'.
  cleanup_datasplit_component = (
    components['cleanup'] >>
    cons_wire(lambda a, s: {'src_filename': a['cleaned_src_filename'],
                            'trg_filename': a['cleaned_trg_filename']}) >>
    components['data_split'] >>
    cons_wire(lambda a, s: {'training_data_filename': clip_last_bit(a['train_src_filename']),
                            'eval_src_filename': a['eval_src_filename'],
                            'eval_trg_filename': a['eval_trg_filename']})
  )

  # Translation model training
  translation_model_component = (
    cons_split_wire() >>
    components['model_training'].first() >>
    cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
                                    'development_data_filename': b['eval_src_filename']})
  )

  # The whole pipeline
  # NOTE(review): this chain previously stopped at a dangling `>>`. The 'mert'
  # component is loaded but was wired nowhere, and the wire feeding this point
  # emits the language-model settings, so 'mert' is assumed to be the missing
  # last stage -- confirm.
  pipeline = (
    tokenisation_component >>
    cons_split_wire() >>
    (cleanup_datasplit_component >> translation_model_component).first() >>
    cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
                                    'development_data_filename': clip_last_bit(t['development_data_filename']),
                                    'trg_language_model_filename': b['trg_language_model_filename'],
                                    'trg_language_model_order': 3,
                                    'trg_language_model_type': 9}) >>
    components['mert']
  )

  # The input to the pipeline
  value = {'src_filename': src_filename,
           'trg_filename': trg_filename}

  # Evaluate the pipeline
  # NOTE: a progress message ("Evaluating pipeline with input [%s]..." % value)
  # was left disabled here.
  try:
    new_value = eval_pipeline(executor, pipeline, value, component_config)
  finally:
    # Wait for all components to finish; done in `finally` so the worker
    # threads are released even when evaluation raises.
    executor.shutdown(True)

  # Single-argument print() call behaves the same on Python 2 and 3.
  print("Pipeline evaluated to %s" % new_value)