Пример #1
0
def convert(in_file, out_file, feat_no, use_feat_names, cpos_chars):
    """\
    Convert a CoNLL-formatted file to ARFF.

    Reads `in_file` line by line; blank lines mark sentence boundaries.
    Each token line yields one instance carrying the form, lemma, POS tag,
    the form-lemma edit script, lemma suffixes of length 1-8, a coarse POS
    tag (first `cpos_chars` characters of the POS tag) and up to `feat_no`
    morphological features taken from the FEAT column (split on '|').

    @param in_file: path to the input CoNLL file
    @param out_file: path to the output ARFF file
    @param feat_no: number of morphological feature slots to emit
    @param use_feat_names: if True, 'Name=Value' features become \
        Tag_<Name> attributes; otherwise positional Tag_FEATn are used
    @param cpos_chars: number of leading POS characters in the coarse POS
    """
    fh_in = file_stream(in_file)

    buf = []
    sent_id = 1
    word_id = 1

    for line in fh_in:
        line = line.rstrip('\r\n')
        if not line:
            # blank line = sentence boundary
            sent_id += 1
            word_id = 1
            continue
        # split the CoNLL format, removing unwanted stuff
        _, form, lemma, _, pos, _, feat, _ = line.split('\t', 7)
        # copy attributes
        inst = {
            'Form': form,
            'Lemma': lemma,
            'Tag_POS': pos,
            'word_id': word_id,
            'sent_id': in_file + '-' + str(sent_id)
        }
        # computing form-lemma diff (edit script)
        escr_front, escr_midback = edit_script(lemma, form)
        inst['LemmaFormDiff_Front'] = escr_front
        inst['LemmaFormDiff_Back'] = escr_midback
        # lemma suffixes of length 1..8
        for i in xrange(1, 9):
            inst['LemmaSuff_' + str(i)] = lemma[-i:]
        # coarse POS
        inst['Tag_CPOS'] = pos[:cpos_chars]
        # POS features, padded with '' up to feat_no slots
        feats = feat.split('|', feat_no - 1)
        feats += [''] * (feat_no - len(feats))
        for feat_ord, feat_str in enumerate(feats, start=1):
            if use_feat_names:
                # BUGFIX: padding (and any unnamed feature) contains no '=';
                # splitting it would raise ValueError on unpacking, so skip
                if '=' in feat_str:
                    feat_name, feat_val = feat_str.split('=', 1)
                    inst['Tag_' + feat_name] = feat_val
            else:
                inst['Tag_FEAT' + str(feat_ord)] = feat_str
        # increase word number
        word_id += 1
        # save the instance to the buffer
        buf.append(inst)
    # write this all out as ARFF
    data = DataSet()
    attr_order = [
        'sent_id', 'word_id', 'Lemma', 'Form', 'LemmaFormDiff_Front',
        'LemmaFormDiff_Back'
    ]
    for i in xrange(1, 9):
        attr_order.append('LemmaSuff_' + str(i))
    attr_order.extend(['Tag_POS', 'Tag_CPOS'])
    data.load_from_dict(buf, {'word_id': 'numeric'}, attr_order)
    data.save_to_arff(out_file)
Пример #2
0
def main():
    """\
    Application entry point: parse the command line and run the requested
    attribute-combination steps on an ARFF data set.
    """
    opts, filenames = getopt.getopt(sys.argv[1:], 'ca:s:n:')
    show_help = False
    combine_cng = False
    subsets = []
    neighbors = []
    substrs = []
    for opt, arg in opts:
        if opt == '-c':
            combine_cng = True
        elif opt == '-s':
            length, attr_name = arg.split(':', 1)
            substrs.append((int(length), attr_name))
        elif opt == '-a':
            set_size, attr_spec = arg.split(':', 1)
            subsets.append((int(set_size), re.split(r'[, ]+', attr_spec)))
        elif opt == '-n':
            offset, attr_spec = arg.split(':', 1)
            neighbors.append((int(offset), re.split(r'[, ]+', attr_spec)))
    # nothing to do, wrong number of files, or help requested: bail out
    have_work = combine_cng or substrs or subsets or neighbors
    if show_help or not have_work or len(filenames) != 2:
        display_usage()
        sys.exit(1)
    # run the training
    filename_in, filename_out = filenames
    data = DataSet()
    log_info('Loading data: ' + filename_in)
    data.load_from_arff(filename_in)
    # substring attributes (empty lists simply loop zero times)
    for sub_len, attr in substrs:
        where = 'beginning' if sub_len > 0 else 'end'
        log_info(('Adding substrings from the %s of %s ' +
                  'up to %d characters long ...') %
                 (where, attr, abs(sub_len)))
        add_substr_attributes(data, sub_len, attr)
    # case + number + gender combination
    if combine_cng:
        log_info('Combining case, number, gender ...')
        combine_tag_num_gen_cas(data)
    # attribute-subset combinations
    for set_size, set_attrs in subsets:
        log_info('Combining up to %d attributes from [%s] ...' %
                 (set_size, ','.join(set_attrs)))
        combine_subsets(data, set_attrs, set_size)
    # neighboring words' attributes
    for shift, attrs in neighbors:
        log_info('Adding neighbor %d\'s attributes [%s] ...' %
                 (shift, ','.join(attrs)))
        add_neighbor_attributes(data, shift, attrs)
    log_info('Saving data: ' + filename_out)
    data.save_to_arff(filename_out)
Пример #3
0
def convert(in_file, out_file, feat_no, use_feat_names, cpos_chars):
    """\
    Convert a CoNLL-formatted file to ARFF.

    Blank input lines mark sentence boundaries. Every token line becomes
    one instance with the form, lemma, POS tag, the form-lemma edit
    script, lemma suffixes of length 1-8, a coarse POS tag (first
    `cpos_chars` characters of POS) and up to `feat_no` morphological
    features from the FEAT column (split on '|').

    @param in_file: path to the input CoNLL file
    @param out_file: path to the output ARFF file
    @param feat_no: number of morphological feature slots to emit
    @param use_feat_names: if True, 'Name=Value' features become \
        Tag_<Name> attributes; otherwise positional Tag_FEATn are used
    @param cpos_chars: number of leading POS characters in the coarse POS
    """
    fh_in = file_stream(in_file)

    buf = []
    sent_id = 1
    word_id = 1

    for line in fh_in:
        line = line.rstrip('\r\n')
        if not line:
            # blank line = sentence boundary
            sent_id += 1
            word_id = 1
            continue
        # split the CoNLL format, removing unwanted stuff
        _, form, lemma, _, pos, _, feat, _ = line.split('\t', 7)
        # copy attributes
        inst = {
                'Form': form, 'Lemma': lemma, 'Tag_POS': pos,
                'word_id': word_id, 'sent_id': in_file + '-' + str(sent_id)
               }
        # computing form-lemma diff (edit script)
        escr_front, escr_midback = edit_script(lemma, form)
        inst['LemmaFormDiff_Front'] = escr_front
        inst['LemmaFormDiff_Back'] = escr_midback
        # lemma suffixes of length 1..8
        for i in xrange(1, 9):
            inst['LemmaSuff_' + str(i)] = lemma[-i:]
        # coarse POS
        inst['Tag_CPOS'] = pos[:cpos_chars]
        # POS features, padded with '' up to feat_no slots
        feats = feat.split('|', feat_no - 1)
        feats += [''] * (feat_no - len(feats))
        for feat_ord, feat_str in enumerate(feats, start=1):
            if use_feat_names:
                # BUGFIX: padding (and any unnamed feature) contains no '=';
                # splitting it would raise ValueError on unpacking, so skip
                if '=' in feat_str:
                    feat_name, feat_val = feat_str.split('=', 1)
                    inst['Tag_' + feat_name] = feat_val
            else:
                inst['Tag_FEAT' + str(feat_ord)] = feat_str
        # increase word number
        word_id += 1
        # save the instance to the buffer
        buf.append(inst)
    # write this all out as ARFF
    data = DataSet()
    attr_order = ['sent_id', 'word_id', 'Lemma', 'Form',
            'LemmaFormDiff_Front', 'LemmaFormDiff_Back']
    for i in xrange(1, 9):
        attr_order.append('LemmaSuff_' + str(i))
    attr_order.extend(['Tag_POS', 'Tag_CPOS'])
    data.load_from_dict(buf, {'word_id': 'numeric'}, attr_order)
    data.save_to_arff(out_file)
Пример #4
0
def main():
    """\
    Application entry point: parse the command line and run the requested
    attribute-combination steps on an ARFF data set.
    """
    opts, filenames = getopt.getopt(sys.argv[1:], 'ca:s:n:')
    show_help = False
    combine_cng = False
    subsets = []
    neighbors = []
    substrs = []
    for opt, arg in opts:
        if opt == '-c':
            combine_cng = True
        elif opt == '-s':
            length, attr_name = arg.split(':', 1)
            substrs.append((int(length), attr_name))
        elif opt == '-a':
            set_size, attr_spec = arg.split(':', 1)
            subsets.append((int(set_size), re.split(r'[, ]+', attr_spec)))
        elif opt == '-n':
            offset, attr_spec = arg.split(':', 1)
            neighbors.append((int(offset), re.split(r'[, ]+', attr_spec)))
    # nothing to do, wrong number of files, or help requested: bail out
    have_work = combine_cng or substrs or subsets or neighbors
    if show_help or not have_work or len(filenames) != 2:
        display_usage()
        sys.exit(1)
    # run the training
    filename_in, filename_out = filenames
    data = DataSet()
    log_info('Loading data: ' + filename_in)
    data.load_from_arff(filename_in)
    # substring attributes (empty lists simply loop zero times)
    for sub_len, attr in substrs:
        where = 'beginning' if sub_len > 0 else 'end'
        log_info(('Adding substrings from the %s of %s ' +
                  'up to %d characters long ...') %
                 (where, attr, abs(sub_len)))
        add_substr_attributes(data, sub_len, attr)
    # case + number + gender combination
    if combine_cng:
        log_info('Combining case, number, gender ...')
        combine_tag_num_gen_cas(data)
    # attribute-subset combinations
    for set_size, set_attrs in subsets:
        log_info('Combining up to %d attributes from [%s] ...' %
                 (set_size, ','.join(set_attrs)))
        combine_subsets(data, set_attrs, set_size)
    # neighboring words' attributes
    for shift, attrs in neighbors:
        log_info('Adding neighbor %d\'s attributes [%s] ...' %
                 (shift, ','.join(attrs)))
        add_neighbor_attributes(data, shift, attrs)
    log_info('Saving data: ' + filename_out)
    data.save_to_arff(filename_out)
Пример #5
0
def test_models(file_in, file_out, model_files, source_attr, target_attr,
                oov_test_file, oov_part, pos_attr, test_indiv):
    """\
    Test all the given models on the selected file and save the target.

    Models are applied in a chain: each model's predicted rules inflect
    the forms produced by the previous model (starting from the values of
    `source_attr`), and both the rules and the resulting forms are stored
    as new attributes (OUTPUT_Mn / FORMS_Mn).

    If oov_test_file is set, performs also OOV evaluation.
    If pos_attr is set, prints detailed results for various POSs.
    If test_indiv is True, prints each model's individual accuracy.

    @raise ValueError: if model_files is empty (there would be no final \
        output attribute to evaluate)
    """
    # BUGFIX: with no models, forms_attr below would be unbound (NameError);
    # fail early with a clear message instead
    if not model_files:
        raise ValueError('test_models requires at least one model file')
    # load testing data
    log_info('Loading data: ' + file_in)
    data = DataSet()
    data.load_from_arff(file_in)
    forms = data[source_attr]
    # apply all models
    for model_num, model_file in enumerate(model_files, start=1):
        model = Model.load_from_file(model_file)
        log_info('Applying model: ' + model_file)
        rules = model.classify(data)
        output_attr = 'OUTPUT_M' + str(model_num)
        data.add_attrib(Attribute(output_attr, 'string'), rules)
        if test_indiv:
            # per-model accuracy against the model's own class attribute
            good = count_correct(data, model.class_attr, output_attr)
            print_score(good, len(data), 'Model accuracy')
        forms = [inflect(form, rule) for form, rule in zip(forms, rules)]
        forms_attr = 'FORMS_M' + str(model_num)
        data.add_attrib(Attribute(forms_attr, 'string'), forms)
    # test the final performance
    log_info('Evaluating...')
    good = count_correct(data, target_attr, forms_attr)
    print_score(good, len(data), 'ALL')
    # evaluate without punctuation
    evaluate_nopunct(data, source_attr, target_attr, forms_attr)
    # evaluate forms different from lemma
    evaluate_nolemma(data, source_attr, target_attr, forms_attr)
    # load training data for OOV tests, evaluate on OOV
    if oov_test_file:
        evaluate_oov(data, source_attr, target_attr, forms_attr,
                     oov_test_file, oov_part)
    # test on different POSes
    if pos_attr:
        evaluate_poses(data, target_attr, forms_attr, pos_attr)
    # save the classification results
    log_info('Saving data: ' + file_out)
    data.save_to_arff(file_out)
Пример #6
0
def test_models(file_in, file_out, model_files, source_attr, target_attr,
                oov_test_file, oov_part, pos_attr, test_indiv):
    """\
    Test all the given models on the selected file and save the target.

    Models are applied in a chain: each model's predicted rules inflect
    the forms produced by the previous model (starting from the values of
    `source_attr`), and both the rules and the resulting forms are stored
    as new attributes (OUTPUT_Mn / FORMS_Mn).

    If oov_test_file is set, performs also OOV evaluation.
    If pos_attr is set, prints detailed results for various POSs.
    If test_indiv is True, prints each model's individual accuracy.

    @raise ValueError: if model_files is empty (there would be no final \
        output attribute to evaluate)
    """
    # BUGFIX: with no models, forms_attr below would be unbound (NameError);
    # fail early with a clear message instead
    if not model_files:
        raise ValueError('test_models requires at least one model file')
    # load testing data
    log_info('Loading data: ' + file_in)
    data = DataSet()
    data.load_from_arff(file_in)
    forms = data[source_attr]
    # apply all models
    for model_num, model_file in enumerate(model_files, start=1):
        model = Model.load_from_file(model_file)
        log_info('Applying model: ' + model_file)
        rules = model.classify(data)
        output_attr = 'OUTPUT_M' + str(model_num)
        data.add_attrib(Attribute(output_attr, 'string'), rules)
        if test_indiv:
            # per-model accuracy against the model's own class attribute
            good = count_correct(data, model.class_attr, output_attr)
            print_score(good, len(data), 'Model accuracy')
        forms = [inflect(form, rule) for form, rule in zip(forms, rules)]
        forms_attr = 'FORMS_M' + str(model_num)
        data.add_attrib(Attribute(forms_attr, 'string'), forms)
    # test the final performance
    log_info('Evaluating...')
    good = count_correct(data, target_attr, forms_attr)
    print_score(good, len(data), 'ALL')
    # evaluate without punctuation
    evaluate_nopunct(data, source_attr, target_attr, forms_attr)
    # evaluate forms different from lemma
    evaluate_nolemma(data, source_attr, target_attr, forms_attr)
    # load training data for OOV tests, evaluate on OOV
    if oov_test_file:
        evaluate_oov(data, source_attr, target_attr, forms_attr, oov_test_file,
                     oov_part)
    # test on different POSes
    if pos_attr:
        evaluate_poses(data, target_attr, forms_attr, pos_attr)
    # save the classification results
    log_info('Saving data: ' + file_out)
    data.save_to_arff(file_out)
Пример #7
0
def main():
    """\
    Application entry point: parse the command line, then either annotate
    or select the instances whose prediction differs from the gold value.
    """
    opts, filenames = getopt.getopt(sys.argv[1:], 'g:p:ai')
    show_help = False
    annot_errors = False
    gold = None
    predicted = 'PREDICTED'
    ignore_case = False
    for opt, arg in opts:
        if opt == '-g':
            gold = arg
        elif opt == '-p':
            predicted = arg
        elif opt == '-a':
            annot_errors = True
        elif opt == '-i':
            ignore_case = True
    # display help and exit
    if show_help or not gold or len(filenames) != 2:
        display_usage()
        sys.exit(1)
    # run the training
    filename_in, filename_out = filenames
    data = DataSet()
    log_info('Loading data: ' + filename_in)
    data.load_from_arff(filename_in)
    # choose the comparison: case-sensitive or case-insensitive mismatch
    if ignore_case:
        def differs(a, b):
            return a.lower() != b.lower()
    else:
        def differs(a, b):
            return a != b
    if annot_errors:
        log_info('Annotating errors...')
        marks = ['ERR' if differs(inst[gold], inst[predicted]) else ''
                 for inst in data]
        data.add_attrib(Attribute('ERROR_IND', 'string'), marks)
    else:
        log_info('Selecting errors...')
        data = data[lambda _, inst: differs(inst[gold], inst[predicted])]
    log_info('Saving data: ' + filename_out)
    data.save_to_arff(filename_out)
Пример #8
0
def main():
    """\
    Application entry point: parse the command line, then either annotate
    or select the instances whose prediction differs from the gold value.
    """
    opts, filenames = getopt.getopt(sys.argv[1:], "g:p:ai")
    show_help = False
    annot_errors = False
    gold = None
    predicted = "PREDICTED"
    ignore_case = False
    for opt, arg in opts:
        if opt == "-g":
            gold = arg
        elif opt == "-p":
            predicted = arg
        elif opt == "-a":
            annot_errors = True
        elif opt == "-i":
            ignore_case = True
    # display help and exit
    if show_help or not gold or len(filenames) != 2:
        display_usage()
        sys.exit(1)
    # run the training
    filename_in, filename_out = filenames
    data = DataSet()
    log_info("Loading data: " + filename_in)
    data.load_from_arff(filename_in)
    # choose the comparison: case-sensitive or case-insensitive mismatch
    if ignore_case:
        def differs(a, b):
            return a.lower() != b.lower()
    else:
        def differs(a, b):
            return a != b
    if annot_errors:
        log_info("Annotating errors...")
        marks = ["ERR" if differs(inst[gold], inst[predicted]) else ""
                 for inst in data]
        data.add_attrib(Attribute("ERROR_IND", "string"), marks)
    else:
        log_info("Selecting errors...")
        data = data[lambda _, inst: differs(inst[gold], inst[predicted])]
    log_info("Saving data: " + filename_out)
    data.save_to_arff(filename_out)