Python DataUtil 예제들, coprocessor.Corpus.ood_dataset.stable_vesion.data_util.DataUtil Python 예제들

예제 #1

0

파일 보기

파일: 2222.py 프로젝트: JDwangmo/coprocessor

print('=' * 30)
print 'start running!'
logging.debug('=' * 30)
logging.debug(config['describe'])
logging.debug('=' * 30)
logging.debug('start running!')
logging.debug('=' * 20)

# ****************************************************************
# +++++++++++++ region start : 1. 加载训练数据和测试数据 +++++++++++++
# ****************************************************************


from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil

data_util = DataUtil()
train_data,test_data = data_util.load_train_test_data(config)

label_to_index,index_to_label = data_util.get_label_index()

# ****************************************************************
# ------------- region end : 1. 加载训练数据和测试数据 -------------
# ****************************************************************

# ****************************************************************
# +++++++++++++ region start : 2. 转换数据的格式并特征编码 +++++++++++++
# ****************************************************************
logging.debug('=' * 20)
logging.debug('2. 转换数据的格式并特征编码')

예제 #2

0

파일 보기

파일: dataset_size_learning_curue.py 프로젝트: JDwangmo/coprocessor

print('树：%s'%estimators)

print('=' * 30)

logging.debug('=' * 20)
# ****************************************************************
# ------------- region end : 参数设置 -------------
# ****************************************************************

# ------------------------------------------------------------------------------
# -------------- region start : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
from traditional_classify.bow_rf.bow_rf_model import BowRandomForest

data_util = DataUtil()
train_data,test_data = data_util.load_train_test_data(config)
label_to_index,index_to_label = data_util.get_label_index(version=config['label_version'])
# print train_data['LABEL_INDEX'].unique()
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()
test_y = test_data['LABEL_INDEX'].as_matrix()
train_y = train_data['LABEL_INDEX'].as_matrix()
# ------------------------------------------------------------------------------
# -------------- region end : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# -------------- region start : 将数据转为特征 -------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('将数据转为特征')

예제 #3

0

파일 보기

파일: cv.py 프로젝트: JDwangmo/coprocessor

logging.debug('=' * 30)
logging.debug(config['describe'])
logging.debug('=' * 30)
logging.debug('start running!')
logging.debug('=' * 20)

import os

# ****************************************************************
# +++++++++++++ region start : 1. 加载训练数据和测试数据 +++++++++++++
# ****************************************************************


from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil

data_util = DataUtil()
train_data,test_data = data_util.load_train_test_data(config)

label_to_index,index_to_label = data_util.get_label_index()

# ****************************************************************
# ------------- region end : 1. 加载训练数据和测试数据 -------------
# ****************************************************************

# ****************************************************************
# +++++++++++++ region start : 2. 转换数据的格式并特征编码 +++++++++++++
# ****************************************************************
logging.debug('=' * 20)
logging.debug('2. 转换数据的格式并特征编码')

예제 #4

0

파일 보기

파일: multi_bow2_main.py 프로젝트: JDwangmo/coprocessor

print config['describe']
print('=' * 30)
print 'start running!'
logging.debug('=' * 30)
logging.debug(config['describe'])
logging.debug('=' * 30)
logging.debug('start running!')
logging.debug('=' * 20)

# ------------------------------------------------------------------------------
# -------------- region start : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

data_util = DataUtil()
train_data,test_data = data_util.load_train_test_data(config)
label_to_index,index_to_label = data_util.get_label_index()
# print train_data['LABEL_INDEX'].unique()
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()
# ------------------------------------------------------------------------------
# -------------- region end : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# -------------- region start : 将数据转为特征 ---------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('开始生成特征向量...')

logging.debug('使用 %s 提取特征向量'%(config['model']))

예제 #5

0

파일 보기

파일: single_bow1_main.py 프로젝트: JDwangmo/coprocessor

print('l2_conv_filter_type:%s'%l2_conv_filter_type)
print('k:%s'%k)

print('=' * 150)
logging.debug('=' * 20)
# ****************************************************************
# ------------- region end : 参数设置 -------------
# ****************************************************************

# ------------------------------------------------------------------------------
# -------------- region start : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

data_util = DataUtil()
train_data, test_data = data_util.load_train_test_data(config)
label_to_index, index_to_label = data_util.get_label_index()
# print train_data['LABEL_INDEX'].unique()
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()
# ------------------------------------------------------------------------------
# -------------- region end : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# -------------- region start : 将数据转为特征 ---------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('开始生成特征向量...')

logging.debug('使用 %s 提取特征向量' % (config['model']))

예제 #6

0

파일 보기

파일: one_conv_layer_wordEmbedding_cnn_cv.py 프로젝트: JDwangmo/coprocessor

logging.basicConfig(filename=''.join(config['log_file_path']), filemode='w',
                    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
start_time = timeit.default_timer()
print('=' * 30)
print config['describe']
print('=' * 30)
print 'start running!'
logging.debug('=' * 30)
logging.debug(config['describe'])
logging.debug('=' * 30)
logging.debug('start running!')
logging.debug('=' * 20)

from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil

data_util = DataUtil()
train_data, test_data = data_util.load_train_test_data(config)
label_to_index, index_to_label = data_util.get_label_index()

train_x = train_data['SENTENCE'].as_matrix()
train_y = train_data['LABEL_INDEX'].as_matrix()
test_x = test_data['SENTENCE'].as_matrix()
test_y = test_data['LABEL_INDEX'].as_matrix()

from deep_learning.cnn.wordEmbedding_cnn.example.one_conv_layer_wordEmbedding_cnn import WordEmbeddingCNNWithOneConv

input_length = 14
word_embedding_dim = 50
WordEmbeddingCNNWithOneConv.cross_validation(
    train_data=(train_x, train_y),
    test_data=(test_x, test_y),

예제 #7

0

파일 보기

파일: multi_bow2_main_cv.py 프로젝트: JDwangmo/coprocessor

print config['describe']
print('=' * 30)
print 'start running!'
logging.debug('=' * 30)
logging.debug(config['describe'])
logging.debug('=' * 30)
logging.debug('start running!')
logging.debug('=' * 20)

# ------------------------------------------------------------------------------
# -------------- region start : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

data_util = DataUtil()
train_data,test_data = data_util.load_train_test_data(config)
label_to_index,index_to_label = data_util.get_label_index()
# print train_data['LABEL_INDEX'].unique()
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()
# ------------------------------------------------------------------------------
# -------------- region end : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# -------------- region start : 将数据转为特征 ---------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('开始生成特征向量...')

logging.debug('使用 %s 提取特征向量'%(config['model']))

예제 #8

0

파일 보기

파일: data_util.py 프로젝트: JDwangmo/coprocessor

                   )
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_val_%d.csv'%(dataset_type,
                                                        feature_type,
                                                        counter,
                                                        len(val)),
            val,
            fmt='%d',
            delimiter=',',
        )




if __name__ == '__main__':
    # Ad-hoc entry point. The preprocessing steps listed here are disabled;
    # re-enable one at a time when the corresponding data must be rebuilt:
    #   process_train_data_Sa()
    #   process_train_data_S()
    #   process_train_data_L()
    #   process_test_data()
    #   genernate_aiml_file()
    #   process_train_data_sentence_length()
    #   process_train_data_for_k_fold(3)
    dutil = DataUtil()

    # Only action currently enabled: print a summary of one training file.
    train_sa = dutil.load_data('data/v2.3_train_Sa_891.csv')
    dutil.print_data_detail(train_sa)
    quit()

예제 #9

0

파일 보기

파일: data_util.py 프로젝트: JDwangmo/coprocessor

def process_train_data_for_k_fold(k=3):
    """Split the training set into ``k`` folds and dump them for cross-validation.

    The ``v2.3(<dataset_type>)`` train/test data is loaded through
    ``DataUtil.load_train_test_data`` and the training sentences are split
    with ``data_split_k_fold`` (fixed ``rand_seed=3``; the original note says
    the split follows the class distribution as far as possible).

    Files written under ``result/cv_data/``:

    * ``v2.3_train_<dataset_type>_i<fold>_<size>.csv`` -- one file per fold
      with the raw ``LABEL`` / ``SENTENCE`` columns (``fold`` is 0-based);
    * ``v2.3_train_<dataset_type>_<feature_type>_i<n>_dev_<rows>.csv`` and
      ``..._val_<rows>.csv`` -- one pair per item yielded by
      ``transform_cv_data`` (``n`` is 1-based). The first column is the label
      index, the remaining columns are the BOW feature vector.

    :param k: number of folds to split the training data into.
    :return: None
    """
    import os

    from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
    from data_processing_util.cross_validation_util import transform_cv_data, data_split_k_fold
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

    data_util = DataUtil()

    feature_type = 'seg'
    # L or Sa
    dataset_type = 'S'

    # np.savetxt cannot create missing directories, so make sure the output
    # location exists before anything is written.
    output_dir = 'result/cv_data'
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    config = {'dataset_type': 'v2.3(%s)' % dataset_type,
              'verbose': 1,
              'label_version': 'v2.0'
              }
    train_data, test_data = data_util.load_train_test_data(config)
    # Only the index -> label mapping is needed, to write readable labels.
    _, index_to_label = data_util.get_label_index(config['label_version'])

    # ``Series.as_matrix()`` was removed in pandas 1.0; ``.values`` returns
    # the same ndarray on both old and new pandas versions.
    train_X = train_data['SENTENCE'].values
    train_y = train_data['LABEL_INDEX'].values
    test_X = test_data['SENTENCE'].values
    test_y = test_data['LABEL_INDEX'].values

    cv_x = []
    cv_y = []
    folds = data_split_k_fold(k=k, data=(train_X, train_y), rand_seed=3)
    for index, (x, y) in enumerate(folds):
        cv_x.append(x)
        cv_y.append(y)
        # Store label names rather than indices in the per-fold CSV.
        fold_labels = [index_to_label[item] for item in y]
        cv_data = pd.DataFrame(data={'LABEL': fold_labels, 'SENTENCE': x})
        data_util.save_data(
            cv_data,
            'result/cv_data/v2.3_train_%s_i%d_%d.csv' % (dataset_type, index, len(cv_data)))
        print(len(x))

    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=False,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        feature_type=feature_type,
        max_features=2000,
    )

    all_cv_data = transform_cv_data(feature_encoder, (cv_x, cv_y), (test_X, test_y), verbose=1)

    # File numbering for the encoded splits starts at 1.
    for counter, (dev_X, dev_y, val_X, val_y) in enumerate(all_cv_data, 1):
        # Prepend the label column to the feature matrix.
        dev = np.concatenate((dev_y.reshape(-1, 1), dev_X), axis=1)
        val = np.concatenate((val_y.reshape(-1, 1), val_X), axis=1)
        print(dev_X.shape)
        print(len(dev_y))
        print(dev.shape)
        print(val_X.shape)
        print(len(val_y))
        print(val.shape)
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_dev_%d.csv' % (
                dataset_type, feature_type, counter, len(dev)),
            dev,
            fmt='%d',
            delimiter=',',
        )
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_val_%d.csv' % (
                dataset_type, feature_type, counter, len(val)),
            val,
            fmt='%d',
            delimiter=',',
        )

예제 #10

0

파일 보기

            (dataset_type, feature_type, counter, len(dev)),
            dev,
            fmt='%d',
            delimiter=',',
        )
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_val_%d.csv' %
            (dataset_type, feature_type, counter, len(val)),
            val,
            fmt='%d',
            delimiter=',',
        )


if __name__ == '__main__':
    # Manual driver for this module. The data-preparation steps below are
    # switched off; uncomment the one that needs to be re-run:
    #   process_train_data_Sa()
    #   process_train_data_S()
    #   process_train_data_L()
    #   process_test_data()
    #   genernate_aiml_file()
    #   process_train_data_sentence_length()
    #   process_train_data_for_k_fold(3)
    dutil = DataUtil()

    # Currently the script only reports the details of one training file.
    loaded = dutil.load_data('data/v2.3_train_Sa_891.csv')
    dutil.print_data_detail(loaded)
    quit()

예제 #11

0

파일 보기

def process_train_data_for_k_fold(k=3):
    """Split the training set into ``k`` folds and dump them for cross-validation.

    The ``v2.3(<dataset_type>)`` train/test data is loaded through
    ``DataUtil.load_train_test_data`` and the training sentences are split
    with ``data_split_k_fold`` (fixed ``rand_seed=3``; the original note says
    the split follows the class distribution as far as possible).

    Files written under ``result/cv_data/``:

    * ``v2.3_train_<dataset_type>_i<fold>_<size>.csv`` -- one file per fold
      with the raw ``LABEL`` / ``SENTENCE`` columns (``fold`` is 0-based);
    * ``v2.3_train_<dataset_type>_<feature_type>_i<n>_dev_<rows>.csv`` and
      ``..._val_<rows>.csv`` -- one pair per item yielded by
      ``transform_cv_data`` (``n`` is 1-based). The first column is the label
      index, the remaining columns are the BOW feature vector.

    :param k: number of folds to split the training data into.
    :return: None
    """
    import os

    from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
    from data_processing_util.cross_validation_util import transform_cv_data, data_split_k_fold
    from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

    data_util = DataUtil()

    feature_type = 'seg'
    # L or Sa
    dataset_type = 'S'

    # np.savetxt cannot create missing directories, so make sure the output
    # location exists before anything is written.
    output_dir = 'result/cv_data'
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    config = {
        'dataset_type': 'v2.3(%s)' % dataset_type,
        'verbose': 1,
        'label_version': 'v2.0'
    }
    train_data, test_data = data_util.load_train_test_data(config)
    # Only the index -> label mapping is needed, to write readable labels.
    _, index_to_label = data_util.get_label_index(
        config['label_version'])

    # ``Series.as_matrix()`` was removed in pandas 1.0; ``.values`` returns
    # the same ndarray on both old and new pandas versions.
    train_X = train_data['SENTENCE'].values
    train_y = train_data['LABEL_INDEX'].values
    test_X = test_data['SENTENCE'].values
    test_y = test_data['LABEL_INDEX'].values

    cv_x = []
    cv_y = []
    for index, (x, y) in enumerate(
            data_split_k_fold(k=k, data=(train_X, train_y), rand_seed=3)):
        cv_x.append(x)
        cv_y.append(y)
        # Store label names rather than indices in the per-fold CSV.
        label_names = [index_to_label[item] for item in y]
        cv_data = pd.DataFrame(data={'LABEL': label_names, 'SENTENCE': x})
        data_util.save_data(
            cv_data, 'result/cv_data/v2.3_train_%s_i%d_%d.csv' %
            (dataset_type, index, len(cv_data)))
        print(len(x))

    feature_encoder = FeatureEncoder(
        verbose=0,
        need_segmented=True,
        full_mode=True,
        remove_stopword=False,
        replace_number=True,
        lowercase=True,
        zhs2zht=True,
        remove_url=True,
        feature_method='bow',
        feature_type=feature_type,
        max_features=2000,
    )

    all_cv_data = transform_cv_data(feature_encoder, (cv_x, cv_y),
                                    (test_X, test_y),
                                    verbose=1)

    # File numbering for the encoded splits starts at 1.
    for counter, (dev_X, dev_y, val_X, val_y) in enumerate(all_cv_data, 1):
        # Prepend the label column to the feature matrix.
        dev = np.concatenate((dev_y.reshape(-1, 1), dev_X), axis=1)
        val = np.concatenate((val_y.reshape(-1, 1), val_X), axis=1)
        print(dev_X.shape)
        print(len(dev_y))
        print(dev.shape)
        print(val_X.shape)
        print(len(val_y))
        print(val.shape)
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_dev_%d.csv' %
            (dataset_type, feature_type, counter, len(dev)),
            dev,
            fmt='%d',
            delimiter=',',
        )
        np.savetxt(
            'result/cv_data/v2.3_train_%s_%s_i%d_val_%d.csv' %
            (dataset_type, feature_type, counter, len(val)),
            val,
            fmt='%d',
            delimiter=',',
        )

예제 #12

0

파일 보기

파일: single_bow1_main_cv.py 프로젝트: JDwangmo/coprocessor

fout.write('l2_conv_filter_type:%s\n'%l2_conv_filter_type)
fout.write('k:%s\n'%k)
fout.write('=' * 150+'\n')

logging.debug('=' * 20)
# ****************************************************************
# ------------- region end : 参数设置 -------------
# ****************************************************************

# ------------------------------------------------------------------------------
# -------------- region start : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

data_util = DataUtil()
train_data,test_data = data_util.load_train_test_data(config)
label_to_index,index_to_label = data_util.get_label_index()
# print train_data['LABEL_INDEX'].unique()
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()
# ------------------------------------------------------------------------------
# -------------- region end : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# -------------- region start : 将数据转为特征 ---------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('开始生成特征向量...')

logging.debug('使用 %s 提取特征向量'%(config['model']))

예제 #13

0

파일 보기

파일: main.py 프로젝트: JDwangmo/coprocessor

logging.debug('=' * 30)
logging.debug('start running!')
logging.debug('=' * 20)

from data_processing_util.jiebanlp.jieba_util import Jieba_Util
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from gensim.models import Word2Vec


# ------------------------------------------------------------------------------
# -------------- region start : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------

from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
data_util = DataUtil()

train_data,test_data = data_util.load_train_test_data(config)
label_to_index,index_to_label = data_util.get_label_index(version=config['label_version'])
# print train_data['LABEL_INDEX'].unique()
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()
test_y = test_data['LABEL_INDEX'].as_matrix()
train_y = train_data['LABEL_INDEX'].as_matrix()
# ------------------------------------------------------------------------------
# -------------- region end : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------

train_data = train_data[['LABEL','SENTENCE']]
test_data = test_data[['LABEL','SENTENCE']]

예제 #14

0

파일 보기

파일: single_bow1_main.py 프로젝트: JDwangmo/coprocessor

print('l2_conv_filter_type:%s' % l2_conv_filter_type)
print('k:%s' % k)

print('=' * 150)
logging.debug('=' * 20)
# ****************************************************************
# ------------- region end : 参数设置 -------------
# ****************************************************************

# ------------------------------------------------------------------------------
# -------------- region start : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder

data_util = DataUtil()
train_data, test_data = data_util.load_train_test_data(config)
label_to_index, index_to_label = data_util.get_label_index()
# print train_data['LABEL_INDEX'].unique()
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()
# ------------------------------------------------------------------------------
# -------------- region end : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# -------------- region start : 将数据转为特征 ---------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('开始生成特征向量...')
#
# logging.debug('使用 %s 提取特征向量' % (config['model']))

예제 #15

0

파일 보기

파일: main.py 프로젝트: JDwangmo/coprocessor

config = config['main']
logging.basicConfig(filename=''.join(config['log_file_path']), filemode='w',
                    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
start_time = timeit.default_timer()
print('=' * 30)
print(config['describe'])
print('=' * 30)
print('start running!')
logging.debug('=' * 30)
logging.debug(config['describe'])
logging.debug('=' * 30)
logging.debug('start running!')
logging.debug('=' * 20)

from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
data_util = DataUtil()

# ****************************************************************
# +++++++++++++ region start : 参数调节 +++++++++++++
# ****************************************************************
logging.debug('=' * 20)
logging.debug('参数调节')
print('='*50)
batch_size = 100
feature_type = 'word'
full_mode = False
sentence_padding_length = 15
word_embedding_dim = 50
word2vec_model_file_path = data_util.transform_word2vec_model_name('%dd_v2.3Sa_word' % word_embedding_dim)
# word2vec_model_file_path = data_util.transform_word2vec_model_name('%dd_weibo_100w' % word_embedding_dim)

예제 #16

0

파일 보기

파일: main.py 프로젝트: JDwangmo/coprocessor

logging.debug('=' * 30)
logging.debug(config['describe'])
logging.debug('=' * 30)
logging.debug('start running!')
logging.debug('=' * 20)



# ------------------------------------------------------------------------------
# -------------- region start : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil
from data_processing_util.feature_encoder.bow_feature_encoder import FeatureEncoder
from traditional_classify.bow_rf.bow_rf_model import BowRandomForest

data_util = DataUtil()
train_data,test_data = data_util.load_train_test_data(config)
label_to_index,index_to_label = data_util.get_label_index()
# print train_data['LABEL_INDEX'].unique()
# print ','.join(train_data['LABEL'].unique())
# print test_data['LABEL_INDEX'].as_matrix()

# ------------------------------------------------------------------------------
# -------------- region end : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# -------------- region start : 将数据转为特征 -------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('将数据转为特征')

예제 #17

0

파일 보기

파일: bow_RF.py 프로젝트: JDwangmo/coprocessor

# encoding=utf8
"""
    Author:  'jdwang'
    Date:    'create date: 2016-09-27'; 'last updated date: 2016-09-27'
    Email:   '*****@*****.**'
    Describe: IALP paper - Dialogue Act Recognition for Chinese Out-of-Domain Utterances using Hybrid CNN-RF
        RF (BOC/BOW) model
"""
from __future__ import print_function

from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil

# Shared data helper, created once at import time of this script.
data_util = DataUtil()

# ****************************************************************
# +++++++++++++ region start : parameter settings +++++++++++++
# ****************************************************************

print('=' * 30)
# Dataset selection; the same keys are passed to
# DataUtil.load_train_test_data in the project's other scripts.
config = {
    'dataset_type': 'v2.3(Sa)',
    'label_version': 'v2.0',
    'verbose': 1,
}
# NOTE(review): presumably toggles a word2vec fallback for out-of-vocabulary
# words -- confirm against the code that consumes this flag.
word2vec_to_solve_oov = False
feature_type = 'seg'
# Reported as "rand_seed" in the summary printed below.
seed = 64003
# NOTE(review): looks like the candidate estimator counts for the random
# forest that will be tried in turn -- confirm further down in the script.
estimator_paramter_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 1000, 2000, 3000, 4000, 5000]
# estimator_paramter_list = [2000]
# Echo the chosen settings so they appear in the run output.
print('word2vec_to_solve_oov:%s\nrand_seed:%s\nfeature_type:%s' % (word2vec_to_solve_oov, seed,
                                                                   feature_type))

예제 #18

0

파일 보기

파일: expected_cross_entropy.py 프로젝트: JDwangmo/coprocessor

from jiebanlp.toolSet import seg
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from gensim.models import Word2Vec
import pickle



# ------------------------------------------------------------------------------
# -------------- region start : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------
from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil

data_util = DataUtil()
train_data,test_data,label_to_index,index_to_label = data_util.load_train_test_data(config)

# ------------------------------------------------------------------------------
# -------------- region end : 加载训练数据和测试数据 -------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# -------------- region start : 将数据转为特征 -------------
# ------------------------------------------------------------------------------
logging.debug('=' * 20)
logging.debug('将数据转为特征')

logging.debug('=' * 20)
logging.debug('对数据进行分词...')
logging.debug('-' * 20)

예제 #19

0

파일 보기

파일: multi_onehot_main.py 프로젝트: JDwangmo/coprocessor

layer1,hidden1 = 5,500
verbose = 1
print('word_input_length:%d\nseg_input_length:%d'%(word_input_length,seg_input_length))
print('layer1:%d\nhidden1:%d'%(layer1,hidden1))



print('=' * 30)
# ****************************************************************
# +++++++++++++ region start : 1. 加载训练数据和测试数据 +++++++++++++
# ****************************************************************


from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil

data_util = DataUtil()
train_data,test_data = data_util.load_train_test_data(config)

label_to_index,index_to_label = data_util.get_label_index()

# ****************************************************************
# ------------- region end : 1. 加载训练数据和测试数据 -------------
# ****************************************************************

# ****************************************************************
# +++++++++++++ region start : 2. 转换数据的格式并特征编码 +++++++++++++
# ****************************************************************
logging.debug('=' * 20)
logging.debug('2. 转换数据的格式并特征编码')
from deep_learning.cnn.wordEmbedding_cnn.multichannel_onehot_cnn_model import MultiChannelOnehotBowCNN
# 获取该分类器的编码器

예제 #20

0

파일 보기

파일: main.py 프로젝트: JDwangmo/coprocessor

print('=' * 30)
print 'start running!'
logging.debug('=' * 30)
logging.debug(config['describe'])
logging.debug('=' * 30)
logging.debug('start running!')
logging.debug('=' * 20)

# ****************************************************************
# +++++++++++++ region start : 1. 加载训练数据和测试数据 +++++++++++++
# ****************************************************************


from coprocessor.Corpus.ood_dataset.stable_vesion.data_util import DataUtil

data_util = DataUtil()
train_data,test_data = data_util.load_train_test_data(config)

label_to_index,index_to_label = data_util.get_label_index()

# ****************************************************************
# ------------- region end : 1. 加载训练数据和测试数据 -------------
# ****************************************************************

# ****************************************************************
# +++++++++++++ region start : 2. 转换数据的格式并特征编码 +++++++++++++
# ****************************************************************
logging.debug('=' * 20)
logging.debug('2. 转换数据的格式并特征编码')