import json

# NOTE: Feature, DataUtil, LogUtil, PostProcessor and the load_* / parse_*
# helpers used below come from the surrounding project and are not defined
# in this excerpt.


def generate(config, argv):
    # load question-topic pairs of the training set
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/question_topic_train_set.txt'
    qid_list, tid_list = load_question_topic_set(topic_info_fp)

    # count the fraction of training questions labeled with each topic
    tid_rate = dict()
    for tids in tid_list:
        for tid in tids:
            tid_rate[tid] = tid_rate.get(tid, 0.) + 1.
    for tid in tid_rate:
        tid_rate[tid] /= (1. * len(tid_list))

    # load hash table of label
    label2id_fp = '%s/%s' % (config.get('DIRECTORY', 'hash_pt'),
                             config.get('TITLE_CONTENT_CNN', 'label2id_fn'))
    label2id = json.load(open(label2id_fp, 'r'))

    # scatter the rates into a dense vector indexed by label id (1999 topics)
    features = [0.] * 1999
    for tid in tid_rate:
        features[int(label2id[tid])] = tid_rate[tid]

    feature_file_path = '%s/topic_fs_rate.%s.smat' % (config.get('DIRECTORY', 'dataset_pt'), 'all')
    feature_file = open(feature_file_path, 'w')
    # header line: <number of rows> <number of columns>
    feature_file.write('%d 1\n' % len(features))
    for feature in features:
        Feature.save_feature([feature], feature_file)
    feature_file.close()
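
# NOTE: `Feature.save_feature` is project-internal and its implementation is
# not shown in this excerpt. A minimal sketch consistent with the
# '<rows> <cols>' header written above is given below; the exact on-disk
# .smat row format is an assumption for illustration, not the project's
# confirmed layout.
def save_feature_sketch(feature, feature_file):
    # write one matrix row per call: space-separated numeric values
    feature_file.write(' '.join(str(value) for value in feature) + '\n')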

def generate(config, argv):
    # load topic info
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/topic_info.txt'
    tid_list, father_list, tc_list, tw_list, dc_list, dw_list = load_topic_info(topic_info_fp)

    # load hash table of label
    label2id_fp = '%s/%s' % (config.get('DIRECTORY', 'hash_pt'),
                             config.get('TITLE_CONTENT_CNN', 'label2id_fn'))
    label2id = json.load(open(label2id_fp, 'r'))

    feature_file_path = '%s/topic_fs_length.%s.smat' % (config.get('DIRECTORY', 'dataset_pt'), 'all')
    feature_file = open(feature_file_path, 'w')

    # five length features per topic: parent topics, title chars/words,
    # description chars/words
    features = [0] * len(tid_list)
    for line_id in range(len(tid_list)):
        feature = list()
        feature.append(len(father_list[line_id]))
        feature.append(len(tc_list[line_id]))
        feature.append(len(tw_list[line_id]))
        feature.append(len(dc_list[line_id]))
        feature.append(len(dw_list[line_id]))
        # rows are ordered by label id, not by file order
        label_id = int(label2id[tid_list[line_id]])
        features[label_id] = feature

    feature_file.write('%d %d\n' % (len(features), len(features[0])))
    for feature in features:
        Feature.save_feature(feature, feature_file)
    feature_file.close()
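
# NOTE: `load_topic_info` is defined elsewhere in the project. A plausible
# sketch is shown below, assuming topic_info.txt holds six tab-separated
# columns (topic id, parent topic ids, title chars, title words, description
# chars, description words) with comma-separated tokens inside each column;
# the exact column layout is an assumption.
def load_topic_info_sketch(topic_info_fp):
    tid_list, father_list = [], []
    tc_list, tw_list, dc_list, dw_list = [], [], [], []
    with open(topic_info_fp, 'r') as topic_info_f:
        for line in topic_info_f:
            parts = line.strip('\n').split('\t')
            tid_list.append(parts[0])
            father_list.append(parts[1].split(','))
            tc_list.append(parts[2].split(','))
            tw_list.append(parts[3].split(','))
            dc_list.append(parts[4].split(','))
            dw_list.append(parts[5].split(','))
    return tid_list, father_list, tc_list, tw_list, dc_list, dw_list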

def rescale(config, online_preds_fp):
    online_preds = DataUtil.load_vector(online_preds_fp, 'float')

    # load the max-clique-size graph feature of the test set
    feature_name = 'graph_edge_max_clique_size'
    feature_pt = config.get('DEFAULT', 'feature_pt')
    test_feature_fp = '%s/%s.test.smat' % (feature_pt, feature_name)
    test_features_mc = Feature.load(test_feature_fp).toarray()

    # load the connected-component-size graph feature of the test set
    feature_name = 'graph_edge_cc_size'
    feature_pt = config.get('DEFAULT', 'feature_pt')
    test_feature_fp = '%s/%s.test.smat' % (feature_pt, feature_name)
    test_features_cc = Feature.load(test_feature_fp).toarray()

    # adjust each prediction with a bucket-specific prior correction,
    # bucketed by clique size and connected-component size
    for index in range(len(online_preds)):
        score = online_preds[index]
        if test_features_mc[index][0] == 3.:
            score = PostProcessor.adj(score, te=0.40883512, tr=0.623191)
        elif test_features_mc[index][0] > 3.:
            score = PostProcessor.adj(score, te=0.96503024, tr=0.972554)
        else:
            if test_features_cc[index][0] < 3.:
                score = PostProcessor.adj(score, te=0.05739666, tr=0.233473)
            else:
                score = PostProcessor.adj(score, te=0.04503431, tr=0.149471)
        online_preds[index] = score
    DataUtil.save_vector(online_preds_fp + '.rescale', online_preds)
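
# NOTE: `PostProcessor.adj` is not shown in this excerpt. Its (te, tr)
# arguments read like a test-set positive rate and a training-set positive
# rate, which suggests the standard prior-correction transform sketched
# below; treat this as an assumed reconstruction, not the project's
# confirmed implementation.
def adj_sketch(x, te, tr):
    # rescale probability x from a training-set prior tr to a test-set prior te
    a = te / tr
    b = (1. - te) / (1. - tr)
    return a * x / (a * x + b * (1. - x))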

def generate(config, argv):
    data_name = argv[0]
    LogUtil.log('INFO', 'data_name=%s' % data_name)

    # load data set
    if 'offline' == data_name:
        # load offline valid dataset index
        valid_index_off_fp = '%s/%s.offline.index' % (config.get('DIRECTORY', 'index_pt'),
                                                      config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
        valid_index_off = DataUtil.load_vector(valid_index_off_fp, 'int')
        # index file is 1-based; convert to 0-based line offsets
        valid_index_off = [num - 1 for num in valid_index_off]
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_train_set.txt'
        source_data = load_raw_line_from_file(config, source_file_path, valid_index_off)
    elif 'online' == data_name:
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_eval_set.txt'
        source_data = open(source_file_path, 'r').readlines()
    else:
        source_data = None

    feature_file_path = '%s/instance_fs_length.%s.smat' % (config.get('DIRECTORY', 'dataset_pt'), data_name)
    feature_file = open(feature_file_path, 'w')
    feature_file.write('%d %d\n' % (len(source_data), 4))

    # four length features per question: title chars/words,
    # description chars/words
    for line in source_data:
        qid, tc, tw, dc, dw = parse_question_set(line)
        feature = list()
        feature.append(len(tc))
        feature.append(len(tw))
        feature.append(len(dc))
        feature.append(len(dw))
        Feature.save_feature(feature, feature_file)
    feature_file.close()
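
# NOTE: `parse_question_set` is defined elsewhere. A plausible sketch follows,
# assuming each question line carries five tab-separated columns (question id,
# title chars, title words, description chars, description words) with
# comma-separated tokens; the exact layout is an assumption for illustration.
def parse_question_set_sketch(line):
    parts = line.strip('\n').split('\t')
    qid = parts[0]
    tc = parts[1].split(',') if len(parts) > 1 and len(parts[1]) else []
    tw = parts[2].split(',') if len(parts) > 2 and len(parts[2]) else []
    dc = parts[3].split(',') if len(parts) > 3 and len(parts[3]) else []
    dw = parts[4].split(',') if len(parts) > 4 and len(parts[4]) else []
    return qid, tc, tw, dc, dw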

def generate(config, argv):
    data_name = argv[0]

    # word/char idf tables are loaded here but never used by this feature;
    # kept to match the original code
    word_idf_fp = '%s/words.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(word_idf_fp, 'r') as word_idf_f:
        word_idf = json.load(word_idf_f)
    LogUtil.log("INFO", "load word_idf done, len(word_idf)=%d" % len(word_idf))

    char_idf_fp = '%s/chars.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(char_idf_fp, 'r') as char_idf_f:
        char_idf = json.load(char_idf_f)
    LogUtil.log("INFO", "load char_idf done, len(char_idf)=%d" % len(char_idf))

    # load data set
    if 'offline' == data_name:
        # load offline valid dataset index
        valid_index_off_fp = '%s/%s.offline.index' % (config.get('DIRECTORY', 'index_pt'),
                                                      config.get('TITLE_CONTENT_CNN', 'valid_index_offline_fn'))
        valid_index_off = DataUtil.load_vector(valid_index_off_fp, 'int')
        valid_index_off = [num - 1 for num in valid_index_off]
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_train_set.txt'
        source_data = load_raw_line_from_file(config, source_file_path, valid_index_off)
        # offline ids are the original training-set line indices
        features = valid_index_off
    elif 'online' == data_name:
        source_file_path = config.get('DIRECTORY', 'source_pt') + '/question_eval_set.txt'
        source_data = open(source_file_path, 'r').readlines()
        features = range(len(source_data))
    else:
        source_data = None
        features = None

    id_feature_file_path = '%s/instance_fs_id.%s.smat' % (config.get('DIRECTORY', 'dataset_pt'), data_name)
    feature_file = open(id_feature_file_path, 'w')
    feature_file.write('%d %d\n' % (len(source_data), 1))
    # single feature per instance: the instance id modulo 100000
    for id_num in features:
        feature = list()
        feature.append(id_num % 100000)
        Feature.save_feature(feature, feature_file)
    feature_file.close()
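
# NOTE: `DataUtil.load_vector` and `load_raw_line_from_file` are project
# helpers not shown here. Minimal sketches consistent with how they are used
# above (a typed one-value-per-line reader, and an index-filtered line
# reader); names and behavior are assumptions for illustration.
def load_vector_sketch(fp, value_type):
    caster = int if value_type == 'int' else float
    with open(fp, 'r') as f:
        return [caster(line.strip()) for line in f]

def load_raw_line_from_file_sketch(config, fp, line_indexes):
    # keep only the lines whose 0-based index appears in `line_indexes`
    wanted = set(line_indexes)
    with open(fp, 'r') as f:
        return [line for index, line in enumerate(f) if index in wanted]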

def generate(config, argv):
    # load word idf
    word_idf_fp = '%s/words.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(word_idf_fp, 'r') as word_idf_f:
        word_idf = json.load(word_idf_f)
    LogUtil.log("INFO", "load word_idf done, len(word_idf)=%d" % len(word_idf))

    # load char idf
    char_idf_fp = '%s/chars.idf' % config.get('DIRECTORY', 'devel_pt')
    with open(char_idf_fp, 'r') as char_idf_f:
        char_idf = json.load(char_idf_f)
    LogUtil.log("INFO", "load char_idf done, len(char_idf)=%d" % len(char_idf))

    # load topic info
    topic_info_fp = config.get('DIRECTORY', 'source_pt') + '/topic_info.txt'
    tid_list, father_list, tc_list, tw_list, dc_list, dw_list = load_topic_info(topic_info_fp)

    # load hash table of label
    label2id_fp = '%s/%s' % (config.get('DIRECTORY', 'hash_pt'),
                             config.get('TITLE_CONTENT_CNN', 'label2id_fn'))
    label2id = json.load(open(label2id_fp, 'r'))

    feature_file_path = '%s/topic_fs_idf_sum.%s.smat' % (config.get('DIRECTORY', 'dataset_pt'), 'all')
    feature_file = open(feature_file_path, 'w')

    # four features per topic: summed idf of title chars/words and
    # description chars/words
    features = [0] * len(tid_list)
    for line_id in range(len(tid_list)):
        tc = tc_list[line_id]
        tw = tw_list[line_id]
        dc = dc_list[line_id]
        dw = dw_list[line_id]
        feature = list()
        feature.append(sum([char_idf[char] for char in tc if len(char) > 0 and char in char_idf]))
        feature.append(sum([word_idf[word] for word in tw if len(word) > 0 and word in word_idf]))
        feature.append(sum([char_idf[char] for char in dc if len(char) > 0 and char in char_idf]))
        feature.append(sum([word_idf[word] for word in dw if len(word) > 0 and word in word_idf]))
        # rows are ordered by label id, not by file order
        label_id = int(label2id[tid_list[line_id]])
        features[label_id] = feature

    feature_file.write('%d %d\n' % (len(features), len(features[0])))
    for feature in features:
        Feature.save_feature(feature, feature_file)
    feature_file.close()
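
# NOTE: words.idf / chars.idf are precomputed JSON dicts mapping a token to
# its idf weight. A minimal sketch of how such a table could be built from a
# tokenized corpus is shown below (idf = log(N / df)); the exact smoothing
# choice is an assumption, not necessarily what the project used.
import math

def build_idf_sketch(documents, idf_fp):
    # documents: list of token lists; df counts documents containing a token
    df = {}
    for tokens in documents:
        for token in set(tokens):
            df[token] = df.get(token, 0) + 1
    idf = dict((token, math.log(len(documents) / (1. * count)))
               for token, count in df.items())
    with open(idf_fp, 'w') as idf_f:
        json.dump(idf, idf_f)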