Example #1
def should_do_shrinkage(iter, model_file, shrink_saturation_threshold,
                        get_raw_nnet_from_am=True):
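    """Return True if parameter shrinkage should be applied on this
    iteration: always on iteration 0, otherwise when the mean saturation
    of the network's nonlinearities, as reported by
    steps/nnet3/get_saturation.pl, exceeds shrink_saturation_threshold.
    """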

    if iter == 0:
        return True

    if get_raw_nnet_from_am:
        output = common_lib.get_command_stdout(
            "nnet3-am-info {0} 2>/dev/null | "
            "steps/nnet3/get_saturation.pl".format(model_file))
    else:
        output = common_lib.get_command_stdout(
            "nnet3-info 2>/dev/null {0} | "
            "steps/nnet3/get_saturation.pl".format(model_file))
    output = output.strip().split("\n")
    try:
        assert len(output) == 1
        saturation = float(output[0])
        assert saturation >= 0 and saturation <= 1
    except Exception:
        raise Exception("Something went wrong, could not get "
                        "saturation from the output '{0}' of "
                        "get_saturation.pl on the info of "
                        "model {1}".format(output, model_file))
    return saturation > shrink_saturation_threshold
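
Every example on this page shells out through common_lib.get_command_stdout, which in Kaldi lives in steps/libs/common.py. For orientation, a minimal sketch of such a helper, assuming only the standard library (the real Kaldi version differs in details such as logging), could look like this:

import logging
import subprocess

logger = logging.getLogger(__name__)


def get_command_stdout(command, require_zero_status=True):
    """Run 'command' through the shell and return its stdout as a string.
    On a non-zero exit status, raise if require_zero_status is True,
    otherwise just log a warning."""
    proc = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
    stdout = proc.communicate()[0]
    if proc.returncode != 0:
        message = "Command exited with status {0}: {1}".format(
            proc.returncode, command)
        if require_zero_status:
            raise Exception(message)
        logger.warning(message)
    return stdout.decode() if isinstance(stdout, bytes) else stdout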
Example #2
def parse_prob_logs(exp_dir, key='accuracy', output="output"):
    train_prob_files = "%s/log/compute_prob_train.*.log" % (exp_dir)
    valid_prob_files = "%s/log/compute_prob_valid.*.log" % (exp_dir)
    train_prob_strings = common_lib.get_command_stdout(
        'grep -e {0} {1}'.format(key, train_prob_files))
    valid_prob_strings = common_lib.get_command_stdout(
        'grep -e {0} {1}'.format(key, valid_prob_files))

    # LOG
    # (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:149)
    # Overall log-probability for 'output' is -0.399395 + -0.013437 = -0.412832
    # per frame, over 20000 fra

    # LOG
    # (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:144)
    # Overall log-probability for 'output' is -0.307255 per frame, over 20000
    # frames.

    parse_regex = re.compile(
        r".*compute_prob_.*\.([0-9]+).log:LOG "
        r".nnet3.*compute-prob.*:PrintTotalStats..:"
        r"nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for "
        r"'{output}'.*is ([0-9.\-e]+) .*per frame".format(output=output))

    train_objf = {}
    valid_objf = {}

    for line in train_prob_strings.split('\n'):
        mat_obj = parse_regex.search(line)
        if mat_obj is not None:
            groups = mat_obj.groups()
            if groups[1] == key:
                train_objf[int(groups[0])] = groups[2]
    if not train_objf:
        raise KaldiLogParseException("Could not find any lines with {k} in "
                                     " {l}".format(k=key, l=train_prob_files))

    for line in valid_prob_strings.split('\n'):
        mat_obj = parse_regex.search(line)
        if mat_obj is not None:
            groups = mat_obj.groups()
            if groups[1] == key:
                valid_objf[int(groups[0])] = groups[2]

    if not valid_objf:
        raise KaldiLogParseException("Could not find any lines with {k} in "
                                     " {l}".format(k=key, l=valid_prob_files))

    iters = list(set(valid_objf.keys()).intersection(train_objf.keys()))
    if not iters:
        raise KaldiLogParseException("Could not any common iterations with"
                                     " key {k} in both {tl} and {vl}".format(
                                         k=key,
                                         tl=train_prob_files,
                                         vl=valid_prob_files))
    iters.sort()
    return list(
        map(lambda x: (int(x), float(train_objf[x]), float(valid_objf[x])),
            iters))
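
The (iteration, train_objf, valid_objf) tuples returned above are easy to turn into a plain-text report. A small, hypothetical consumer (Kaldi's own reporting is done by generate_acc_logprob_report) might look like:

def format_prob_report(exp_dir, key='accuracy'):
    # Tabulate train/valid objectives and their difference per iteration,
    # using parse_prob_logs() defined above.
    lines = ["%Iter\ttrain_{0}\tvalid_{0}\tdifference".format(key)]
    for iteration, train_objf, valid_objf in parse_prob_logs(exp_dir, key):
        lines.append("{0}\t{1:.4f}\t{2:.4f}\t{3:.4f}".format(
            iteration, train_objf, valid_objf, train_objf - valid_objf))
    return "\n".join(lines)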
Example #3
def parse_prob_logs(exp_dir, key='accuracy', output="output"):
    train_prob_files = "%s/log/compute_prob_train.*.log" % (exp_dir)
    valid_prob_files = "%s/log/compute_prob_valid.*.log" % (exp_dir)
    train_prob_strings = common_lib.get_command_stdout(
        'grep -e {0} {1}'.format(key, train_prob_files))
    valid_prob_strings = common_lib.get_command_stdout(
        'grep -e {0} {1}'.format(key, valid_prob_files))

    # LOG
    # (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:149)
    # Overall log-probability for 'output' is -0.399395 + -0.013437 = -0.412832
    # per frame, over 20000 fra

    # LOG
    # (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:144)
    # Overall log-probability for 'output' is -0.307255 per frame, over 20000
    # frames.

    parse_regex = re.compile(
        r".*compute_prob_.*\.([0-9]+).log:LOG "
        r".nnet3.*compute-prob.*:PrintTotalStats..:"
        r"nnet.*diagnostics.cc:[0-9]+. Overall ([a-zA-Z\-]+) for "
        r"'{output}'.*is ([0-9.\-e]+) .*per frame".format(output=output))

    train_objf = {}
    valid_objf = {}

    for line in train_prob_strings.split('\n'):
        mat_obj = parse_regex.search(line)
        if mat_obj is not None:
            groups = mat_obj.groups()
            if groups[1] == key:
                train_objf[int(groups[0])] = groups[2]
    if not train_objf:
        raise KaldiLogParseException("Could not find any lines with {k} in "
                " {l}".format(k=key, l=train_prob_files))

    for line in valid_prob_strings.split('\n'):
        mat_obj = parse_regex.search(line)
        if mat_obj is not None:
            groups = mat_obj.groups()
            if groups[1] == key:
                valid_objf[int(groups[0])] = groups[2]

    if not valid_objf:
        raise KaldiLogParseException("Could not find any lines with {k} in "
                " {l}".format(k=key, l=valid_prob_files))

    iters = list(set(valid_objf.keys()).intersection(train_objf.keys()))
    if not iters:
        raise KaldiLogParseException("Could not any common iterations with"
                " key {k} in both {tl} and {vl}".format(
                    k=key, tl=train_prob_files, vl=valid_prob_files))
    iters.sort()
    return list(map(lambda x: (int(x), float(train_objf[x]),
                               float(valid_objf[x])), iters))
Example #4
def parse_progress_logs_for_nonlinearity_stats(exp_dir):
    """Parse progress logs for mean and std stats for non-linearities.
    e.g. for a line that is parsed from progress.*.log:
    exp/nnet3/lstm_self_repair_ld5_sp/log/progress.9.log:component name=Lstm3_i
    type=SigmoidComponent, dim=1280, self-repair-scale=1e-05, count=1.96e+05,
    value-avg=[percentiles(0,1,2,5 10,20,50,80,90
    95,98,99,100)=(0.05,0.09,0.11,0.15 0.19,0.27,0.50,0.72,0.83
    0.88,0.92,0.94,0.99), mean=0.502, stddev=0.23],
    deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90
    95,98,99,100)=(0.009,0.04,0.05,0.06 0.08,0.10,0.14,0.17,0.18
    0.19,0.20,0.20,0.21), mean=0.134, stddev=0.0397]
    """

    progress_log_files = "%s/log/progress.*.log" % (exp_dir)
    stats_per_component_per_iter = {}

    progress_log_lines = common_lib.get_command_stdout(
        'grep -e "value-avg.*deriv-avg.*oderiv" {0}'.format(
            progress_log_files),
        require_zero_status=False,
    )

    if progress_log_lines:
        # cases with oderiv-rms
        parse_regex = re.compile(g_normal_nonlin_regex_pattern_with_oderiv)
    else:
        # cases with only value-avg and deriv-avg
        progress_log_lines = common_lib.get_command_stdout(
            'grep -e "value-avg.*deriv-avg" {0}'.format(progress_log_files),
            require_zero_status=False,
        )
        parse_regex = re.compile(g_normal_nonlin_regex_pattern)

    for line in progress_log_lines.split("\n"):
        mat_obj = parse_regex.search(line)
        if mat_obj is None:
            continue
        # groups = ('9', 'Lstm3_i', 'Sigmoid', '0.05...0.99', '0.502', '0.23',
        # '0.009...0.21', '0.134', '0.0397')
        groups = mat_obj.groups()
        component_type = groups[2]
        if component_type == "LstmNonlinearity":
            parse_regex_lstmp = re.compile(g_lstmp_nonlin_regex_pattern)
            mat_obj = parse_regex_lstmp.search(line)
            groups = mat_obj.groups()
            assert len(groups) == 33
            for i in range(5):
                fill_nonlin_stats_table_with_regex_result(
                    groups, i, stats_per_component_per_iter)
        else:
            fill_nonlin_stats_table_with_regex_result(
                groups, 0, stats_per_component_per_iter)
    return stats_per_component_per_iter
Example #5
def parse_progress_logs_for_nonlinearity_stats(exp_dir):

    """ Parse progress logs for mean and std stats for non-linearities.
    e.g. for a line that is parsed from progress.*.log:
    exp/nnet3/lstm_self_repair_ld5_sp/log/progress.9.log:component name=Lstm3_i
    type=SigmoidComponent, dim=1280, self-repair-scale=1e-05, count=1.96e+05,
    value-avg=[percentiles(0,1,2,5 10,20,50,80,90
    95,98,99,100)=(0.05,0.09,0.11,0.15 0.19,0.27,0.50,0.72,0.83
    0.88,0.92,0.94,0.99), mean=0.502, stddev=0.23],
    deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90
    95,98,99,100)=(0.009,0.04,0.05,0.06 0.08,0.10,0.14,0.17,0.18
    0.19,0.20,0.20,0.21), mean=0.134, stddev=0.0397]
    """

    progress_log_files = "%s/log/progress.*.log" % (exp_dir)
    stats_per_component_per_iter = {}

    progress_log_lines = common_lib.get_command_stdout(
        'grep -e "value-avg.*deriv-avg.*oderiv" {0}'.format(progress_log_files),
        require_zero_status=False)

    if progress_log_lines:
        # cases with oderiv-rms
        parse_regex = re.compile(g_normal_nonlin_regex_pattern_with_oderiv)
    else:
        # cases with only value-avg and deriv-avg
        progress_log_lines = common_lib.get_command_stdout(
        'grep -e "value-avg.*deriv-avg" {0}'.format(progress_log_files),
        require_zero_status = False)
        parse_regex = re.compile(g_normal_nonlin_regex_pattern)

    for line in progress_log_lines.split("\n"):
        mat_obj = parse_regex.search(line)
        if mat_obj is None:
            continue
        # groups = ('9', 'Lstm3_i', 'Sigmoid', '0.05...0.99', '0.502', '0.23',
        # '0.009...0.21', '0.134', '0.0397')
        groups = mat_obj.groups()
        component_type = groups[2]
        if component_type == 'LstmNonlinearity':
            parse_regex_lstmp = re.compile(g_lstmp_nonlin_regex_pattern)
            mat_obj = parse_regex_lstmp.search(line)
            groups = mat_obj.groups()
            assert len(groups) == 33
            for i in range(5):
                fill_nonlin_stats_table_with_regex_result(
                    groups, i, stats_per_component_per_iter)
        else:
            fill_nonlin_stats_table_with_regex_result(
                groups, 0, stats_per_component_per_iter)
    return stats_per_component_per_iter
Example #6
def get_outputs_list(model_file, get_raw_nnet_from_am=True):
    """ Generates list of output-node-names used in nnet3 model configuration.
        It will normally return 'output'.
    """
    if get_raw_nnet_from_am:
        outputs_list = common_lib.get_command_stdout(
            "nnet3-am-info --print-args=false {0} | "
            "grep -e 'output-node' | cut -f2 -d' ' | cut -f2 -d'=' ".format(model_file))
    else:
        outputs_list = common_lib.get_command_stdout(
            "nnet3-info --print-args=false {0} | "
            "grep -e 'output-node' | cut -f2 -d' ' | cut -f2 -d'=' ".format(model_file))

    return outputs_list.split()
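
A typical caller simply iterates over the returned node names, e.g. to produce one report per output. A hypothetical snippet (the model path is made up):

model = "exp/nnet3/tdnn/final.mdl"  # hypothetical path
for output_name in get_outputs_list(model, get_raw_nnet_from_am=True):
    print("found output node: {0}".format(output_name))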
Example #7
def add_nnet_context_info(config_dir):
    """This will be removed when python script refactoring is done."""

    common_lib.execute_command("nnet3-init {0}/ref.config "
                               "{0}/ref.raw".format(config_dir))
    out = common_lib.get_command_stdout("nnet3-info {0}/ref.raw | "
                                        "head -4".format(config_dir))
    # out looks like this
    # left-context: 7
    # right-context: 0
    # num-parameters: 90543902
    # modulus: 1
    info = {}
    for line in out.split("\n"):
        parts = line.split(":")
        if len(parts) != 2:
            continue
        info[parts[0].strip()] = int(parts[1].strip())

    # Writing the 'vars' file:
    #   model_left_context=7
    #   model_right_context=0
    vf = open('{0}/vars'.format(config_dir), 'w')
    vf.write('model_left_context={0}\n'.format(info['left-context']))
    vf.write('model_right_context={0}\n'.format(info['right-context']))
    vf.close()
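
The 'vars' file written above is plain key=value text, so reading it back takes only a few lines. A minimal sketch of such a reader (Kaldi's own, more general version is parse_generic_config_vars_file):

def read_vars_file(vars_filename):
    # Parse lines like 'model_left_context=7' into a dict of ints.
    variables = {}
    with open(vars_filename) as f:
        for line in f:
            parts = line.strip().split("=")
            if len(parts) == 2:
                variables[parts[0]] = int(parts[1])
    return variables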
Example #8
def get_model_component_info(model_filename):
    """
    This function reads an existing model (*.raw or *.mdl) and returns an
    array of XconfigExistingLayer objects, one per {input,output}-node or
    component-node, with the same 'name' as in the raw model and 'dim'
    equal to 'output-dim' for component-nodes and 'dim' for
    {input,output}-nodes.

    e.g. layer in *.mdl -> corresponding 'XconfigExistingLayer' layer
         'input-node name=ivector dim=100' ->
         'existing name=ivector dim=100'
         'component-node name=tdnn1.affine ... input-dim=1000 '
         'output-dim=500' ->
         'existing name=tdnn1.affine dim=500'
    """

    all_layers = []
    try:
        f = open(model_filename, 'r')
    except Exception as e:
        sys.exit("{0}: error reading model file '{1}'".format(sys.argv[0],
                                                              model_filename,
                                                              repr(e)))

    # use nnet3-info to get component names in the model.
    out = common_lib.get_command_stdout("""nnet3-info {0} | grep '\-node' """
                                        """ """.format(model_filename))

    # out contains all {output, input, component}-nodes used in model_filename
    # It can parse lines in out like:
    # i.e. input-node name=input dim=40
    #   component-node name=tdnn1.affine component=tdnn1.affine input=lda
    #   input-dim=300 output-dim=512
    layer_names = []
    for line in out.split("\n"):
        parts = line.split(" ")
        dim = -1
        layer_name = None
        for field in parts:
            key_value = field.split("=")
            if len(key_value) == 2:
                key = key_value[0]
                value = key_value[1]
                if key == "name":           # name=**
                    layer_name = value
                elif key == "dim":          # for input-node
                    dim = int(value)
                elif key == "output-dim":   # for component-node
                    dim = int(value)

        if layer_name is not None and layer_name not in layer_names:
            layer_names.append(layer_name)
            assert dim != -1
            key_to_value = {'name': layer_name, 'dim': dim}
            all_layers.append(xlayers.XconfigExistingLayer(
                'existing', key_to_value, all_layers))
    if len(all_layers) == 0:
        raise RuntimeError("{0}: model filename '{1}' is empty.".format(
            sys.argv[0], model_filename))
    f.close()
    return all_layers
Example #9
def add_nnet_context_info(config_dir, nnet_edits=None, existing_model=None):
    """Create the 'vars' file that specifies model_left_context, etc."""

    common_lib.execute_command(
        "nnet3-init {0} {1}/ref.config "
        "{1}/ref.raw"
        "".format(existing_model if existing_model is not None else "",
                  config_dir))
    model = "{0}/ref.raw".format(config_dir)
    if nnet_edits is not None:
        model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits, model)
    out = common_lib.get_command_stdout('nnet3-info "{0}"'.format(model))
    # out looks like this
    # left-context: 7
    # right-context: 0
    # num-parameters: 90543902
    # modulus: 1
    # ...
    info = {}
    for line in out.split("\n")[:4]:  # take 4 initial lines,
        parts = line.split(":")
        if len(parts) != 2:
            continue
        info[parts[0].strip()] = int(parts[1].strip())

    # Writing the 'vars' file:
    #   model_left_context=7
    #   model_right_context=0
    vf = open("{0}/vars".format(config_dir), "w")
    vf.write("model_left_context={0}\n".format(info["left-context"]))
    vf.write("model_right_context={0}\n".format(info["right-context"]))
    vf.close()
Example #10
def get_input_model_info(input_model):
    """ This function returns a dictionary with keys "model_left_context" and
        "model_right_context" and values equal to the left/right model contexts
        for input_model.
        This function is useful when using the --trainer.input-model option
        instead of initializing the model using configs.
    """
    variables = {}
    try:
        out = common_lib.get_command_stdout("""nnet3-info {0} | """
                                            """head -4 """.format(input_model))
        # out looks like this
        # left-context: 7
        # right-context: 0
        # num-parameters: 90543902
        # modulus: 1
        for line in out.split("\n"):
            parts = line.split(":")
            if len(parts) != 2:
                continue
            if parts[0].strip() == 'left-context':
                variables['model_left_context'] = int(parts[1].strip())
            elif parts[0].strip() == 'right-context':
                variables['model_right_context'] = int(parts[1].strip())

    except ValueError:
        pass
    return variables
Example #11
def add_nnet_context_info(config_dir, nnet_edits=None):
    """Create the 'vars' file that specifies model_left_context, etc."""

    common_lib.execute_command("nnet3-init {0}/ref.config "
                               "{0}/ref.raw".format(config_dir))
    model = "{0}/ref.raw".format(config_dir)
    if nnet_edits is not None:
        model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits, model)
    out = common_lib.get_command_stdout(
        'nnet3-info "{0}" | head -n 4 '.format(model))
    # out looks like this
    # left-context: 7
    # right-context: 0
    # num-parameters: 90543902
    # modulus: 1
    info = {}
    for line in out.split("\n"):
        parts = line.split(":")
        if len(parts) != 2:
            continue
        info[parts[0].strip()] = int(parts[1].strip())

    # Writing the 'vars' file:
    #   model_left_context=7
    #   model_right_context=0
    vf = open('{0}/vars'.format(config_dir), 'w')
    vf.write('model_left_context={0}\n'.format(info['left-context']))
    vf.write('model_right_context={0}\n'.format(info['right-context']))
    vf.close()
Example #12
def add_nnet_context_info(config_dir, nnet_edits=None,
                          existing_model=None):
    """Create the 'vars' file that specifies model_left_context, etc."""

    common_lib.execute_command("nnet3-init {0} {1}/ref.config "
                               "{1}/ref.raw"
                               "".format(existing_model if
                                         existing_model is not None else "",
                                         config_dir))
    model = "{0}/ref.raw".format(config_dir)
    if nnet_edits is not None:
        model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits,
                                                          model)
    out = common_lib.get_command_stdout('nnet3-info "{0}" | head -n 4 '
                                        .format(model))
    # out looks like this
    # left-context: 7
    # right-context: 0
    # num-parameters: 90543902
    # modulus: 1
    info = {}
    for line in out.split("\n"):
        parts = line.split(":")
        if len(parts) != 2:
            continue
        info[parts[0].strip()] = int(parts[1].strip())

    # Writing the 'vars' file:
    #   model_left_context=7
    #   model_right_context=0
    vf = open('{0}/vars'.format(config_dir), 'w')
    vf.write('model_left_context={0}\n'.format(info['left-context']))
    vf.write('model_right_context={0}\n'.format(info['right-context']))
    vf.close()
Example #13
def check_model_contexts(config_dir, nnet_edits=None, existing_model=None):
    contexts = {}
    for file_name in ["init", "ref"]:
        if os.path.exists("{0}/{1}.config".format(config_dir, file_name)):
            contexts[file_name] = {}
            common_lib.execute_command(
                "nnet3-init {0} {1}/{2}.config "
                "{1}/{2}.raw"
                "".format(
                    existing_model if existing_model is not None else "",
                    config_dir,
                    file_name,
                ))
            model = "{0}/{1}.raw".format(config_dir, file_name)
            if nnet_edits is not None and file_name != "init":
                model = "nnet3-copy --edits='{0}' {1} - |".format(
                    nnet_edits, model)
            out = common_lib.get_command_stdout(
                'nnet3-info "{0}"'.format(model))
            # out looks like this
            # left-context: 7
            # right-context: 0
            # num-parameters: 90543902
            # modulus: 1
            # ...
            for line in out.split("\n")[:4]:  # take 4 initial lines,
                parts = line.split(":")
                if len(parts) != 2:
                    continue
                key = parts[0].strip()
                value = int(parts[1].strip())
                if key in ["left-context", "right-context"]:
                    contexts[file_name][key] = value

    if "init" in contexts:
        assert "ref" in contexts
        if "left-context" in contexts["init"] and "left-context" in contexts[
                "ref"]:
            if (contexts["init"]["left-context"] >
                    contexts["ref"]["left-context"]) or (
                        contexts["init"]["right-context"] >
                        contexts["ref"]["right-context"]):
                raise Exception(
                    "Model specified in {0}/init.config requires greater"
                    " context than the model specified in {0}/ref.config."
                    " This might be due to use of label-delay at the output"
                    " in ref.config. Please use delay=$label_delay in the"
                    " initial fixed-affine-layer of the network, to avoid"
                    " this issue.")
Example #14
def check_model_contexts(config_dir, nnet_edits=None, existing_model=None):
    contexts = {}
    for file_name in ['init', 'ref']:
        if os.path.exists('{0}/{1}.config'.format(config_dir, file_name)):
            contexts[file_name] = {}
            common_lib.execute_command("nnet3-init {0} {1}/{2}.config "
                                       "{1}/{2}.raw"
                                       "".format(existing_model if
                                                 existing_model is not
                                                 None else '',
                                                 config_dir, file_name))
            model = "{0}/{1}.raw".format(config_dir, file_name)
            if nnet_edits is not None:
                model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits,
                                                                  model)
            out = common_lib.get_command_stdout('nnet3-info "{0}" | head -n 4 '
                                                .format(model))
            # out looks like this
            # left-context: 7
            # right-context: 0
            # num-parameters: 90543902
            # modulus: 1
            for line in out.split("\n"):
                parts = line.split(":")
                if len(parts) != 2:
                    continue
                key = parts[0].strip()
                value = int(parts[1].strip())
                if key in ['left-context', 'right-context']:
                    contexts[file_name][key] = value

    if 'init' in contexts:
        assert 'ref' in contexts
        if ('left-context' in contexts['init'] and
                'left-context' in contexts['ref']):
            if ((contexts['init']['left-context']
                 > contexts['ref']['left-context'])
                or (contexts['init']['right-context']
                    > contexts['ref']['right-context'])):
                raise Exception(
                    "Model specified in {0}/init.config requires greater"
                    " context than the model specified in {0}/ref.config."
                    " This might be due to use of label-delay at the output"
                    " in ref.config. Please use delay=$label_delay in the"
                    " initial fixed-affine-layer of the network, to avoid"
                    " this issue.")
Example #15
def check_model_contexts(config_dir, nnet_edits=None, existing_model=None):
    contexts = {}
    for file_name in ['init', 'ref']:
        if os.path.exists('{0}/{1}.config'.format(config_dir, file_name)):
            contexts[file_name] = {}
            common_lib.execute_command("nnet3-init {0} {1}/{2}.config "
                                       "{1}/{2}.raw"
                                       "".format(existing_model if
                                                 existing_model is not
                                                 None else '',
                                                 config_dir, file_name))
            model = "{0}/{1}.raw".format(config_dir, file_name)
            if nnet_edits is not None:
                model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits,
                                                                  model)
            out = common_lib.get_command_stdout('nnet3-info "{0}" | head -n 4 '
                                                .format(model))
            # out looks like this
            # left-context: 7
            # right-context: 0
            # num-parameters: 90543902
            # modulus: 1
            for line in out.split("\n"):
                parts = line.split(":")
                if len(parts) != 2:
                    continue
                key = parts[0].strip()
                value = int(parts[1].strip())
                if key in ['left-context', 'right-context']:
                    contexts[file_name][key] = value

    if 'init' in contexts:
        assert 'ref' in contexts
        if ('left-context' in contexts['init'] and
                'left-context' in contexts['ref']):
            if ((contexts['init']['left-context']
                 > contexts['ref']['left-context'])
                or (contexts['init']['right-context']
                    > contexts['ref']['right-context'])):
                raise Exception(
                    "Model specified in {0}/init.config requires greater"
                    " context than the model specified in {0}/ref.config."
                    " This might be due to use of label-delay at the output"
                    " in ref.config. Please use delay=$label_delay in the"
                    " initial fixed-affine-layer of the network, to avoid"
                    " this issue.")
Example #16
def add_nnet_context_info(config_dir, nnet_edits=None, existing_model=None):
    """Create the 'vars' file that specifies model_left_context, etc."""

    common_lib.execute_command(
        "nnet3-init {0} {1}/ref.config "
        "{1}/ref.raw"
        "".format(existing_model if existing_model is not None else "",
                  config_dir))
    model = "{0}/ref.raw".format(config_dir)
    if nnet_edits is not None:
        model = "nnet3-copy --edits='{0}' {1} - |".format(nnet_edits, model)
    out = common_lib.get_command_stdout('nnet3-info "{0}"'.format(model))
    # Added locally: also dump the nnet3-info output to a file.
    f_info = open("{0}/ref.raw.info".format(config_dir), 'w')
    print('# This file was created by the command:\n'
          '# nnet3-info "{0}"\n'
          '# which is called in func:\n'
          '#     add_nnet_context_info(args.config_dir, args.nnet_edits,'
          '# existing_model=args.existing_model)\n'
          '# This func is in steps/nnet3/xconfig_to_configs.py'.format(model),
          file=f_info)
    print(out, file=f_info)
    f_info.close()
    # out looks like this
    # left-context: 7
    # right-context: 0
    # num-parameters: 90543902
    # modulus: 1
    # ...
    info = {}
    for line in out.split("\n")[:4]:  # take 4 initial lines,
        parts = line.split(":")
        if len(parts) != 2:
            continue
        info[parts[0].strip()] = int(parts[1].strip())

    # Writing the 'vars' file:
    #   model_left_context=7
    #   model_right_context=0
    vf = open('{0}/vars'.format(config_dir), 'w')
    vf.write('model_left_context={0}\n'.format(info['left-context']))
    vf.write('model_right_context={0}\n'.format(info['right-context']))
    vf.close()
Example #17
def parse_train_logs(exp_dir):
    train_log_files = "%s/log/train.*.log" % (exp_dir)
    train_log_lines = common_lib.get_command_stdout(
        'grep -e Accounting {0}'.format(train_log_files))
    parse_regex = re.compile(".*train\.([0-9]+)\.([0-9]+)\.log:# "
                             "Accounting: time=([0-9]+) thread.*")

    train_times = {}
    for line in train_log_lines.split('\n'):
        mat_obj = parse_regex.search(line)
        if mat_obj is not None:
            groups = mat_obj.groups()
            try:
                train_times[int(groups[0])][int(groups[1])] = float(groups[2])
            except KeyError:
                train_times[int(groups[0])] = {}
                train_times[int(groups[0])][int(groups[1])] = float(groups[2])
    iters = train_times.keys()
    for iter in iters:
        values = train_times[iter].values()
        train_times[iter] = max(values)
    return train_times
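
Since the returned dictionary maps each iteration to the wall-clock time of its slowest job, summing the values gives a rough total. Hypothetical usage (the experiment directory is made up):

train_times = parse_train_logs("exp/nnet3/tdnn")  # hypothetical exp dir
total_seconds = sum(train_times.values())
print("{0} iterations, roughly {1:.1f} hours of job time".format(
    len(train_times), total_seconds / 3600.0))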
Example #18
def get_train_times(exp_dir):
    train_log_files = "%s/log/" % (exp_dir)
    train_log_names = "train.*.log"
    train_log_lines = common_lib.get_command_stdout(
        'find {0} -name "{1}" | xargs grep -H -e Accounting'.format(
            train_log_files, train_log_names))
    parse_regex = re.compile(".*train\.([0-9]+)\.([0-9]+)\.log:# "
                             "Accounting: time=([0-9]+) thread.*")

    train_times = {}
    for line in train_log_lines.split('\n'):
        mat_obj = parse_regex.search(line)
        if mat_obj is not None:
            groups = mat_obj.groups()
            try:
                train_times[int(groups[0])][int(groups[1])] = float(groups[2])
            except KeyError:
                train_times[int(groups[0])] = {}
                train_times[int(groups[0])][int(groups[1])] = float(groups[2])
    iters = train_times.keys()
    for iter in iters:
        values = train_times[iter].values()
        train_times[iter] = max(values)
    return train_times
Example #19
def train(args, run_opts):
    """ The main function for training.

    Args:
        args: a Namespace object with the required parameters
            obtained from the function process_args()
        run_opts: RunOpts object obtained from the process_args()
    """

    arg_string = pprint.pformat(vars(args))
    logger.info("Arguments for the experiment\n{0}".format(arg_string))

    # Set some variables.
    config_dir = '{0}/configs'.format(args.dir)
    am_var_file = '{0}/vars_am'.format(config_dir)
    xvec_var_file = '{0}/vars_xvec'.format(config_dir)
    am_variables = common_train_lib.parse_generic_config_vars_file(am_var_file)
    xvec_variables = common_train_lib.parse_generic_config_vars_file(
        xvec_var_file)

    # Set some variables.
    try:
        am_model_left_context = am_variables['model_left_context']
        am_model_right_context = am_variables['model_right_context']
        xvec_model_left_context = xvec_variables['model_left_context']
        xvec_model_right_context = xvec_variables['model_right_context']
    except KeyError as e:
        raise Exception("KeyError {0}: Variables need to be defined in "
                        "{1}".format(str(e), '{0}/configs'.format(args.dir)))

    am_left_context = am_model_left_context
    am_right_context = am_model_right_context
    xvec_left_context = xvec_model_left_context
    xvec_right_context = xvec_model_right_context

    # Initialize as "raw" nnet, prior to training the LDA-like preconditioning
    # matrix.  This first config just does any initial splicing that we do;
    # we do this as it's a convenient way to get the stats for the 'lda-like'
    # transform.
    if (args.stage <= -5) and os.path.exists(args.dir+"/configs/init.config"):
        logger.info("Initializing a basic network for estimating "
                    "preconditioning matrix")
        common_lib.execute_command(
            """{command} {dir}/log/nnet_init.log \
                    nnet3-init --srand=-2 {dir}/configs/init.config \
                    {dir}/init.raw""".format(command=run_opts.command,
                                             dir=args.dir))

    am_egs_dir = args.am_egs_dir
    xvec_egs_dir = args.xvec_egs_dir
    am_output_name = args.am_output_name
    xvec_output_name = args.xvec_output_name
    am_weight = args.am_weight
    xvec_weight = args.xvec_weight

    feat_dim = int(common_lib.get_command_stdout(
        "cat {0}/info/feat_dim".format(am_egs_dir)))
    num_archives = int(common_lib.get_command_stdout(
        "cat {0}/info/num_archives".format(am_egs_dir)))

    tmp_feat_dim = int(common_lib.get_command_stdout(
        "cat {0}/info/feat_dim".format(xvec_egs_dir)))
    tmp_num_archives = int(common_lib.get_command_stdout(
        "cat {0}/info/num_archives".format(xvec_egs_dir)))

    # frames_per_eg is no longer a parameter; it is read from
    # am_egs/info/frames_per_eg
    am_frames_per_eg = int(common_lib.get_command_stdout(
        "cat {0}/info/frames_per_eg".format(am_egs_dir)))

    if (feat_dim != tmp_feat_dim
            or num_archives * am_frames_per_eg != tmp_num_archives):
        raise Exception('The am egs and xvec egs do not match')

    if args.num_jobs_final > num_archives:
        raise Exception('num_jobs_final cannot exceed the number of archives '
                        'in the egs directory')

    # # No need to copy files for decoding
    # common_train_lib.copy_egs_properties_to_exp_dir(am_egs_dir, args.dir)

    if args.stage <= -3 and os.path.exists(args.dir+"/configs/init.config"):
        logger.info('Computing the preconditioning matrix for input features')

        train_lib.common.compute_preconditioning_matrix(
            args.dir, am_egs_dir, num_archives, run_opts,
            max_lda_jobs=args.max_lda_jobs,
            rand_prune=args.rand_prune)

    if args.stage <= -1:
        logger.info("Preparing the initial network.")
        common_train_lib.prepare_initial_network(args.dir, run_opts)

    # set num_iters so that as close as possible, we process the data
    # $num_epochs times, i.e. $num_iters * $avg_num_jobs ==
    # $num_epochs * $num_archives, where
    # avg_num_jobs = (num_jobs_initial + num_jobs_final) / 2.
    num_archives_expanded = num_archives * am_frames_per_eg
    num_archives_to_process = int(args.num_epochs * num_archives_expanded)
    num_archives_processed = 0
    num_iters = ((num_archives_to_process * 2)
                 // (args.num_jobs_initial + args.num_jobs_final))

    # If do_final_combination is True, compute the set of models_to_combine.
    # Otherwise, models_to_combine will be none.
    if args.do_final_combination:
        models_to_combine = common_train_lib.get_model_combine_iters(
            num_iters, args.num_epochs,
            num_archives_expanded, args.max_models_combine,
            args.num_jobs_final)
    else:
        models_to_combine = None

    logger.info("Training will run for {0} epochs = "
                "{1} iterations".format(args.num_epochs, num_iters))

    for iter in range(num_iters):
        if (args.exit_stage is not None) and (iter == args.exit_stage):
            logger.info("Exiting early due to --exit-stage {0}".format(iter))
            return
        current_num_jobs = int(0.5 + args.num_jobs_initial
                               + (args.num_jobs_final - args.num_jobs_initial)
                               * float(iter) / num_iters)

        if args.stage <= iter:
            lrate = common_train_lib.get_learning_rate(iter, current_num_jobs,
                                                       num_iters,
                                                       num_archives_processed,
                                                       num_archives_to_process,
                                                       args.initial_effective_lrate,
                                                       args.final_effective_lrate)
            shrinkage_value = 1.0 - (args.proportional_shrink * lrate)
            if shrinkage_value <= 0.5:
                raise Exception("proportional-shrink={0} is too large, it gives "
                                "shrink-value={1}".format(args.proportional_shrink,
                                                          shrinkage_value))

            percent = num_archives_processed * 100.0 / num_archives_to_process
            epoch = (num_archives_processed * args.num_epochs
                     / num_archives_to_process)
            shrink_info_str = ''
            if shrinkage_value != 1.0:
                shrink_info_str = 'shrink: {0:0.5f}'.format(shrinkage_value)
            logger.info("Iter: {0}/{1}    "
                        "Epoch: {2:0.2f}/{3:0.1f} ({4:0.1f}% complete)    "
                        "lr: {5:0.6f}    {6}".format(iter, num_iters - 1,
                                                     epoch, args.num_epochs,
                                                     percent,
                                                     lrate, shrink_info_str))
            train_lib.common.train_cvector_one_iteration(
                dir=args.dir,
                iter=iter,
                srand=args.srand,
                am_output_name=am_output_name,
                am_weight=am_weight,
                am_egs_dir=am_egs_dir,
                xvec_output_name=xvec_output_name,
                xvec_weight=xvec_weight,
                xvec_egs_dir=xvec_egs_dir,
                num_jobs=current_num_jobs,
                num_archives_processed=num_archives_processed,
                num_archives=num_archives,
                learning_rate=lrate,
                minibatch_size_str=args.minibatch_size,
                momentum=args.momentum,
                max_param_change=args.max_param_change,
                shuffle_buffer_size=args.shuffle_buffer_size,
                run_opts=run_opts,
                am_frames_per_eg=am_frames_per_eg,
                dropout_edit_string=common_train_lib.get_dropout_edit_string(
                    args.dropout_schedule,
                    float(num_archives_processed) / num_archives_to_process,
                    iter),
                shrinkage_value=shrinkage_value,
                get_raw_nnet_from_am=False,
                backstitch_training_scale=args.backstitch_training_scale,
                backstitch_training_interval=args.backstitch_training_interval)

            if args.cleanup:
                # clean up everything but the last 2 models, under certain
                # conditions
                common_train_lib.remove_model(
                    args.dir, iter-2, num_iters, models_to_combine,
                    args.preserve_model_interval,
                    get_raw_nnet_from_am=False)

            if args.email is not None:
                reporting_iter_interval = num_iters * args.reporting_interval
                if iter % reporting_iter_interval == 0:
                    # let's do some reporting
                    [report, times, data] = (
                        nnet3_log_parse.generate_acc_logprob_report(args.dir))
                    message = report
                    subject = ("Update : Expt {dir} : "
                               "Iter {iter}".format(dir=args.dir, iter=iter))
                    common_lib.send_mail(message, subject, args.email)

        num_archives_processed = num_archives_processed + current_num_jobs

    # when we do final combination, just use the xvector egs
    if args.stage <= num_iters:
        if args.do_final_combination:
            logger.info("Doing final combination to produce final.mdl")

            train_lib.common.combine_models(
                dir=args.dir, num_iters=num_iters,
                models_to_combine=models_to_combine,
                egs_dir=xvec_egs_dir,
                minibatch_size_str="64", run_opts=run_opts,
                get_raw_nnet_from_am=False,
                max_objective_evaluations=args.max_objective_evaluations,
                use_egs=True)
                # sum_to_one_penalty=args.combine_sum_to_one_penalty,
        else:
            common_lib.force_symlink("{0}.raw".format(num_iters),
                                     "{0}/final.raw".format(args.dir))
    
    if args.cleanup:
        logger.info("Cleaning up the experiment directory "
                    "{0}".format(args.dir))
        remove_egs = False

        common_train_lib.clean_nnet_dir(
            nnet_dir=args.dir, num_iters=num_iters, egs_dir=am_egs_dir,
            preserve_model_interval=args.preserve_model_interval,
            remove_egs=remove_egs,
            get_raw_nnet_from_am=False)

    # TODO: we may trace other output nodes except for "output"
    # do some reporting
    outputs_list = common_train_lib.get_outputs_list("{0}/final.raw".format(
        args.dir), get_raw_nnet_from_am=False)
    if 'output' in outputs_list:
        [report, times, data] = nnet3_log_parse.generate_acc_logprob_report(args.dir)
        if args.email is not None:
            common_lib.send_mail(report, "Update : Expt {0} : "
                                         "complete".format(args.dir),
                                 args.email)
            with open("{dir}/accuracy.{output_name}.report".format(dir=args.dir,
                                                                   output_name="output"),
                      "w") as f:
                f.write(report)

    common_lib.execute_command("subtools/kaldi/steps/info/nnet3_dir_info.pl "
                               "{0}".format(args.dir))
Example #20
def parse_progress_logs_for_clipped_proportion(exp_dir):
    """ Parse progress logs for clipped proportion stats.

    e.g. for a line that is parsed from progress.*.log:
    exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:component
    name=BLstm1_forward_c type=ClipGradientComponent, dim=512,
    norm-based-clipping=true, clipping-threshold=30,
    clipped-proportion=0.000565527,
    self-repair-clipped-proportion-threshold=0.01, self-repair-target=0,
    self-repair-scale=1
    """

    progress_log_files = "%s/log/progress.*.log" % (exp_dir)
    progress_log_lines = common_lib.get_command_stdout(
        'grep -e "{0}" {1}'.format(
            "clipped-proportion", progress_log_files),
        require_zero_status=False)
    parse_regex = re.compile(".*progress\.([0-9]+)\.log:component "
                             "name=(.*) type=.* "
                             "clipped-proportion=([0-9\.e\-]+)")

    cp_per_component_per_iter = {}

    max_iteration = 0
    component_names = set([])
    for line in progress_log_lines.split("\n"):
        mat_obj = parse_regex.search(line)
        if mat_obj is None:
            if line.strip() == "":
                continue
            raise MalformedClippedProportionLineException(line)
        groups = mat_obj.groups()
        iteration = int(groups[0])
        max_iteration = max(max_iteration, iteration)
        name = groups[1]
        clipped_proportion = float(groups[2])
        if clipped_proportion > 1:
            raise MalformedClippedProportionLineException(line)
        if iteration not in cp_per_component_per_iter:
            cp_per_component_per_iter[iteration] = {}
        cp_per_component_per_iter[iteration][name] = clipped_proportion
        component_names.add(name)
    component_names = list(component_names)
    component_names.sort()

    # rearranging the data into a table (one row per iteration) and into
    # cp_per_iter_per_component
    cp_per_iter_per_component = {}
    for component_name in component_names:
        cp_per_iter_per_component[component_name] = []
    data = []
    data.append(["iteration"] + component_names)
    for iter in range(max_iteration + 1):
        if iter not in cp_per_component_per_iter:
            continue
        comp_dict = cp_per_component_per_iter[iter]
        row = [iter]
        for component in component_names:
            try:
                row.append(comp_dict[component])
                cp_per_iter_per_component[component].append(
                    [iter, comp_dict[component]])
            except KeyError:
                # if clipped proportion is not available for a particular
                # component it is set to None
                # this usually happens during layer-wise discriminative
                # training
                row.append(None)
        data.append(row)

    return {'table': data,
            'cp_per_component_per_iter': cp_per_component_per_iter,
            'cp_per_iter_per_component': cp_per_iter_per_component}
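
The 'table' entry is a list of rows with the header row first, so it can be dumped as TSV for plotting. A small sketch along those lines:

def write_cp_table(exp_dir, output_filename):
    # Write the clipped-proportion table as tab-separated values;
    # missing entries (None) become empty fields.
    stats = parse_progress_logs_for_clipped_proportion(exp_dir)
    with open(output_filename, "w") as f:
        for row in stats['table']:
            f.write("\t".join("" if v is None else str(v) for v in row))
            f.write("\n")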
Example #21
def parse_progress_logs_for_param_diff(exp_dir, pattern):
    """ Parse progress logs for per-component parameter differences.

    e.g. for a line that is parsed from progress.*.log:
    exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:LOG
    (nnet3-show-progress:main():nnet3-show-progress.cc:144) Relative parameter
    differences per layer are [ Cwrnn1_T3_W_r:0.0171537
    Cwrnn1_T3_W_x:1.33338e-07 Cwrnn1_T2_W_r:0.048075 Cwrnn1_T2_W_x:1.34088e-07
    Cwrnn1_T1_W_r:0.0157277 Cwrnn1_T1_W_x:0.0212704 Final_affine:0.0321521
    Cwrnn2_T3_W_r:0.0212082 Cwrnn2_T3_W_x:1.33691e-07 Cwrnn2_T2_W_r:0.0212978
    Cwrnn2_T2_W_x:1.33401e-07 Cwrnn2_T1_W_r:0.014976 Cwrnn2_T1_W_x:0.0233588
    Cwrnn3_T3_W_r:0.0237165 Cwrnn3_T3_W_x:1.33184e-07 Cwrnn3_T2_W_r:0.0239754
    Cwrnn3_T2_W_x:1.3296e-07 Cwrnn3_T1_W_r:0.0194809 Cwrnn3_T1_W_x:0.0271934 ]
    """

    if pattern not in set(["Relative parameter differences",
                           "Parameter differences"]):
        raise Exception("Unknown value for pattern : {0}".format(pattern))

    progress_log_files = "%s/log/progress.*.log" % (exp_dir)
    progress_per_iter = {}
    component_names = set([])
    progress_log_lines = common_lib.get_command_stdout(
        'grep -e "{0}" {1}'.format(pattern, progress_log_files))
    parse_regex = re.compile(".*progress\.([0-9]+)\.log:"
                             "LOG.*{0}.*\[(.*)\]".format(pattern))
    for line in progress_log_lines.split("\n"):
        mat_obj = parse_regex.search(line)
        if mat_obj is None:
            continue
        groups = mat_obj.groups()
        iteration = groups[0]
        differences = parse_difference_string(groups[1])
        component_names = component_names.union(differences.keys())
        progress_per_iter[int(iteration)] = differences

    component_names = list(component_names)
    component_names.sort()
    # rearranging the parameter differences available per iter
    # into parameter differences per component
    progress_per_component = {}
    for cn in component_names:
        progress_per_component[cn] = {}

    max_iter = max(progress_per_iter.keys())
    total_missing_iterations = 0
    gave_user_warning = False
    for iter in range(max_iter + 1):
        try:
            component_dict = progress_per_iter[iter]
        except KeyError:
            continue

        for component_name in component_names:
            try:
                progress_per_component[component_name][iter] = component_dict[
                    component_name]
            except KeyError:
                total_missing_iterations += 1
                # the component was not found this iteration, may be because of
                # layerwise discriminative training
                pass
        if (total_missing_iterations/len(component_names) > 20
                and not gave_user_warning and logger is not None):
            logger.warning("There are more than {0} missing iterations per "
                           "component. Something might be wrong.".format(
                                total_missing_iterations/len(component_names)))
            gave_user_warning = True

    return {'progress_per_component': progress_per_component,
            'component_names': component_names,
            'max_iter': max_iter}
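
One way to use progress_per_component is to flag layers whose parameters barely move. A hedged sketch (the threshold is arbitrary):

def find_stuck_components(exp_dir, threshold=1e-6):
    # Report components whose relative parameter difference is below
    # 'threshold' at the last iteration for which they have a value.
    stats = parse_progress_logs_for_param_diff(
        exp_dir, "Relative parameter differences")
    stuck = []
    for name in stats['component_names']:
        per_iter = stats['progress_per_component'][name]
        if per_iter and per_iter[max(per_iter.keys())] < threshold:
            stuck.append(name)
    return stuck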
Example #22
def parse_rnnlm_prob_logs(exp_dir, key='objf'):
    train_prob_files = "%s/log/train.*.*.log" % (exp_dir)
    valid_prob_files = "%s/log/compute_prob.*.log" % (exp_dir)
    train_prob_strings = common_lib.get_command_stdout(
        'grep -e {0} {1}'.format(key, train_prob_files))
    valid_prob_strings = common_lib.get_command_stdout(
        'grep -e {0} {1}'.format(key, valid_prob_files))

    # LOG
    # (rnnlm-train[5.3.36~8-2ec51]:PrintStatsOverall():rnnlm-core-training.cc:118)
    # Overall objf is (-4.426 + -0.008287) = -4.435 over 4.503e+06 words (weighted)
    # in 1117 minibatches; exact = (-4.426 + 0) = -4.426

    # LOG
    # (rnnlm-compute-prob[5.3.36~8-2ec51]:PrintStatsOverall():rnnlm-core-training.cc:118)
    # Overall objf is (-4.677 + -0.002067) = -4.679 over 1.08e+05 words (weighted)
    # in 27 minibatches; exact = (-4.677 + 0.002667) = -4.674

    parse_regex_train = re.compile(
        r".*train\.([0-9]+).1.log:LOG "
        r".rnnlm-train.*:PrintStatsOverall..:"
        r"rnnlm.*training.cc:[0-9]+. Overall ([a-zA-Z\-]+) is "
        r".*exact = \(.+\) = ([0-9.\-\+e]+)")

    parse_regex_valid = re.compile(
        r".*compute_prob\.([0-9]+).log:LOG "
        r".rnnlm.*compute-prob.*:PrintStatsOverall..:"
        r"rnnlm.*training.cc:[0-9]+. Overall ([a-zA-Z\-]+) is "
        r".*exact = \(.+\) = ([0-9.\-\+e]+)")

    train_objf = {}
    valid_objf = {}

    for line in train_prob_strings.split('\n'):
        mat_obj = parse_regex_train.search(line)
        if mat_obj is not None:
            groups = mat_obj.groups()
            if groups[1] == key:
                train_objf[int(groups[0])] = groups[2]
    if not train_objf:
        raise KaldiLogParseException("Could not find any lines with {k} in "
                " {l}".format(k=key, l=train_prob_files))

    for line in valid_prob_strings.split('\n'):
        mat_obj = parse_regex_valid.search(line)
        if mat_obj is not None:
            groups = mat_obj.groups()
            if groups[1] == key:
                valid_objf[int(groups[0])] = groups[2]

    if not valid_objf:
        raise KaldiLogParseException("Could not find any lines with {k} in "
                " {l}".format(k=key, l=valid_prob_files))

    iters = list(set(valid_objf.keys()).intersection(train_objf.keys()))
    if not iters:
        raise KaldiLogParseException("Could not any common iterations with"
                " key {k} in both {tl} and {vl}".format(
                    k=key, tl=train_prob_files, vl=valid_prob_files))
    iters.sort()
    return list(map(lambda x: (int(x), float(train_objf[x]),
                               float(valid_objf[x])), iters))
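
As with parse_prob_logs, the result can be scanned for the iteration with the best validation objective. Hypothetical usage (the directory is made up):

results = parse_rnnlm_prob_logs("exp/rnnlm")  # hypothetical exp dir
best_iter, best_train, best_valid = max(results, key=lambda t: t[2])
print("best valid objf {0:.3f} at iteration {1}".format(
    best_valid, best_iter))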