Пример #1
0
    # Build the training inputs from 'tag_train.txt' (the file name is passed
    # together with `path`). Both the character and the tag index maps are
    # supplied, and three values come back; the third is presumably the
    # longest training sequence length -- TODO confirm against toolbox.
    train_x, train_y, max_len_train = toolbox.get_input_vec(
        path,
        'tag_train.txt',
        char2idx,
        tag2idx,
        limit=args.sent_limit,
        sent_seg=args.sent_seg,
        is_space=is_space,
        train_size=args.train_size,
        ignore_space=args.ignore_space)

    # Build the development inputs from 'raw_dev.txt'. Unlike the training
    # call above, no tag index map and no train_size are passed, and only two
    # values are returned.
    dev_x, max_len_dev = toolbox.get_input_vec_raw(
        path,
        'raw_dev.txt',
        char2idx,
        limit=args.sent_limit,
        sent_seg=args.sent_seg,
        is_space=is_space,
        ignore_space=args.ignore_space)
    # Report what was loaded. Instance counts are printed only when
    # args.sent_seg is off; they are taken from the length of the first
    # element of train_x and dev_x.
    if args.sent_seg:
        print 'Joint sentence segmentation...'
    else:
        print 'Training set: %d instances; Dev set: %d instances.' % (len(
            train_x[0]), len(dev_x[0]))

    # Both start unset. NOTE(review): presumably filled in later when
    # args.ngram > 1 -- the code that would do so is outside this excerpt.
    nums_grams = None
    ng_embs = None

    if args.ngram > 1 and (
            args.reset
            or not os.path.isfile(path + '/' + str(args.ngram) + 'gram.txt')):
Пример #2
0
        # Collect characters from the raw file with respect to the current
        # char2idx map (presumably those not yet present in it -- TODO confirm
        # against toolbox.get_new_chars).
        new_chars = toolbox.get_new_chars(raw_file, char2idx, type='raw')

        # Stays None unless args.embeddings is set.
        valid_chars = None

        if args.embeddings is not None:

            # NOTE(review): presumably narrows new_chars to the ones covered by
            # the embeddings given in args.embeddings -- confirm in toolbox.
            valid_chars = toolbox.get_valid_chars(new_chars, args.embeddings)

        # Rebuild the character dictionaries from the old map, the new
        # characters and the (possibly None) valid subset.
        char2idx, idx2char, unk_char2idx = toolbox.update_char_dict(
            char2idx, new_chars, valid_chars)

        if not args.tag_large:

            # Vectorise the whole raw file in one go; the second returned
            # value is used as the step bound.
            raw_x, raw_len = toolbox.get_input_vec_raw(None,
                                                       raw_file,
                                                       char2idx,
                                                       rad_dic=rad_dic)
            print 'Numbers of sentences: %d.' % len(raw_x[0])
            max_step = raw_len

        else:
            # Large-file mode: raw_x is not built in this branch; the step
            # bound is derived from the file and args.bucket_size instead.
            max_step = toolbox.get_maxstep(raw_file, args.bucket_size)

        print 'Longest sentence is %d. ' % max_step

        if graphic:
            # Extend the existing `pixels` with entries generated for the new
            # characters using the configured font and picture size.
            new_pixels = toolbox.get_new_pixels(new_chars, font, pic_size)
            pixels += new_pixels

        if ngram > 1:
Пример #3
0
            char2idx, new_chars, unk_chars_idx, valid_chars)

        # Choose how the step bound (max_step) is obtained: by reading the raw
        # file now, or, in large-input mode, from the configured limit.
        if not args.segment_large:

            # Both branches read raw_file with a limit 100 above
            # args.sent_limit; they differ in which toolbox reader is used and
            # in whether sent_seg is forwarded to it.
            if sent_seg:
                raw_x, raw_len = toolbox.get_input_vec_tag(
                    None,
                    raw_file,
                    char2idx,
                    limit=args.sent_limit + 100,
                    is_space=is_space)
            else:
                raw_x, raw_len = toolbox.get_input_vec_raw(
                    None,
                    raw_file,
                    char2idx,
                    limit=args.sent_limit + 100,
                    sent_seg=sent_seg,
                    is_space=is_space)

            # The instance count is reported only when sent_seg is off.
            # NOTE(review): 'setences' in the message below is a typo for
            # 'sentences'; left as-is here because it is runtime output.
            if sent_seg:
                print 'Joint sentence segmentation...'
            else:
                print 'Raw setences: %d instances.' % len(raw_x[0])

            # The second value returned by the reader is used as the step
            # bound.
            max_step = raw_len

        else:

            # Large-input mode: nothing is read in this branch; the step bound
            # is simply args.sent_limit (without the +100 margin used above).
            max_step = args.sent_limit