Пример #1
0
def x_2func(traindir,testdir):
    """Pick the 100 best-scoring tokens of each category and store them.

    Loads the score table saved under the name 'token_x'. For each
    category index 0..9, ``tokens_x[i]`` is used as a token -> score
    mapping; its tokens are ranked by descending score and the first 100
    are kept.

    Two objects are stored through ``nb.write``:

    * 'x_category'   -- dict mapping category index to its token list
    * 'tokens_all_x' -- set holding the union of all selected tokens

    The 'token_x' table is not computed here. It is expected to have been
    produced beforehand by the separate step::

        train = nb.read('train_nb_eventmodel')
        sta = nb.sta_count(train)
        category_tokens = get_category_tokens()
        token_x = x_2(train, sta, category_tokens)
        nb.write(token_x, 'token_x')

    NOTE(review): the scores are presumably chi-square statistics (the
    name x_2 suggests it) -- confirm. A score cut-off of 10.83 is a
    possible alternative to the fixed top-100 cut used here.

    Args:
        traindir: unused; kept so existing callers keep working.
        testdir: unused; kept so existing callers keep working.
    """
    tokens_x = nb.read('token_x')

    x_category = {}
    # Accumulate into one set instead of rebuilding the union every pass.
    tokens_all_x = set()
    for i in range(10):
        scores = tokens_x[i]
        ranked = sorted(scores, key=scores.get, reverse=True)
        x_category[i] = ranked[:100]
        # A category with fewer than 100 tokens yields a shorter list,
        # so the actual size is reported. Single-argument print(...)
        # behaves the same under Python 2 and Python 3.
        print(len(x_category[i]))
        tokens_all_x.update(x_category[i])
    print(len(tokens_all_x))

    nb.write(x_category,'x_category')
    nb.write(tokens_all_x,'tokens_all_x')
Пример #2
0
def dffunc():
    """Pick the 200 highest-valued tokens of the 'tokens_df' table.

    Loads the table saved under the name 'tokens_df', which is used as a
    token -> value mapping, ranks its tokens by descending value and
    keeps the first 200. The same selection is recorded for every
    category index 0..9.

    Two objects are stored through ``nb.write``:

    * 'df_category'   -- dict mapping category index to the token list
    * 'tokens_all_df' -- set of the selected tokens

    The 'tokens_df' table is not computed here. It is expected to have
    been produced beforehand by the separate step::

        train = nb.read('train_nb_eventmodel')
        sta = nb.sta_count(train)
        all_tokens = get_all_tokens()
        tokens_df = df(train, sta, all_tokens)
        nb.write(tokens_df, 'tokens_df')

    Original author's note (translated): document frequency is used
    here first; it requires counting, within a given category, how many
    documents the term t appears in.

    NOTE(review): that note talks about per-category counts, but the
    table is ranked as one flat mapping, so all ten categories receive
    identical lists -- confirm this is intended.
    """
    tokens_df = nb.read('tokens_df')

    # The ranking does not depend on the category index: sort once
    # rather than once per category.
    ranked = sorted(tokens_df, key=tokens_df.get, reverse=True)

    df_category = {}
    for i in range(10):
        # Slice inside the loop so each category owns a separate list.
        df_category[i] = ranked[:200]
    # The union of ten identical lists is just the set of that list.
    tokens_all_df = set(ranked[:200])
    # Single-argument print(...) behaves the same under Python 2 and 3.
    print(tokens_all_df)

    nb.write(df_category,'df_category')
    nb.write(tokens_all_df,'tokens_all_df')
Пример #3
0
def mi_func():
    """Pick the 500 best-scoring tokens of each category and store them.

    Loads the score table saved under the name 'token_mi'. For each
    category index 0..9, ``token_mi[i]`` is used as a token -> score
    mapping; its tokens are ranked by descending score and the first 500
    are kept.

    Two objects are stored through ``nb.write``:

    * 'mi_category'   -- dict mapping category index to its token list
    * 'tokens_all_mi' -- set holding the union of all selected tokens

    The 'token_mi' table is not computed here. It is expected to have
    been produced beforehand by the separate step::

        train = nb.read('train_nb_eventmodel')
        sta = nb.sta_count(train)
        category_tokens = get_category_tokens()
        token_mi = mi(train, sta, category_tokens)
        nb.write(token_mi, 'token_mi')

    NOTE(review): "mi" presumably stands for mutual information --
    confirm against the mi() helper.
    """
    token_mi = nb.read('token_mi')

    mi_category = {}
    # Accumulate into one set instead of rebuilding the union every pass.
    tokens_all_mi = set()
    for i in range(10):
        scores = token_mi[i]
        ranked = sorted(scores, key=scores.get, reverse=True)
        mi_category[i] = ranked[:500]
        tokens_all_mi.update(mi_category[i])
    # Single-argument print(...) behaves the same under Python 2 and 3.
    print(len(tokens_all_mi))

    nb.write(mi_category,'mi_category')
    nb.write(tokens_all_mi,'tokens_all_mi')
    
    '''
Пример #4
0
def gifunc():
    """Compute the 'tokens_gi' table, then store its 100 top tokens.

    Step 1 builds the table: the training data saved under
    'train_nb_eventmodel' is loaded, ``nb.sta_count`` is run on it, and
    ``gi(train, sta, all_tokens)`` is evaluated over the tokens returned
    by ``get_all_tokens()``. The result is stored as 'tokens_gi'.

    Step 2 selects features: the stored table is read back and used as
    a token -> value mapping; its tokens are ranked by descending value
    and the first 100 are kept. The same selection is recorded for every
    category index 0..9.

    Two further objects are stored through ``nb.write``:

    * 'gi_category'   -- dict mapping category index to the token list
    * 'tokens_all_gi' -- set of the selected tokens

    NOTE(review): the table is ranked as one flat mapping, so all ten
    categories receive identical lists -- confirm this is intended.
    """
    # Step 1: compute and persist the score table.
    train = nb.read('train_nb_eventmodel')
    sta = nb.sta_count(train)
    all_tokens = get_all_tokens()
    tokens_gi = gi(train,sta,all_tokens)
    nb.write(tokens_gi,'tokens_gi')

    # Step 2: rank what was persisted (read back, as the original did).
    tokens_gi = nb.read('tokens_gi')

    # The ranking does not depend on the category index: sort once
    # rather than once per category.
    ranked = sorted(tokens_gi, key=tokens_gi.get, reverse=True)

    gi_category = {}
    for i in range(10):
        # Slice inside the loop so each category owns a separate list.
        gi_category[i] = ranked[:100]
    # The union of ten identical lists is just the set of that list.
    tokens_all_gi = set(ranked[:100])
    # Single-argument print(...) behaves the same under Python 2 and 3.
    print(tokens_all_gi)

    nb.write(gi_category,'gi_category')
    nb.write(tokens_all_gi,'tokens_all_gi')
def preprocess(traindir='./data/training',testdir='./data/test'):
    """Load the training and test data and store the prepared pieces.

    ``nb.func2(traindir)`` supplies ``train_x``, ``train_y`` and
    ``category``; ``nb.func3(testdir, category)`` supplies ``result``,
    ``test_x`` and ``test_file``. The four sequence results are wrapped
    with ``np.array`` and everything is stored through ``nb.write``
    under the names 'train_x', 'train_y', 'category', 'result',
    'test_x' and 'test_file'.

    NOTE(review): this file defines preprocess a second time further
    down; that later definition is the one bound after import.

    Args:
        traindir: directory handed to ``nb.func2``; defaults to the
            previously hard-coded './data/training'.
        testdir: directory handed to ``nb.func3``; defaults to the
            previously hard-coded './data/test'.
    """
    train_x,train_y,category = nb.func2(traindir)
    train_x = np.array(train_x)
    train_y = np.array(train_y)

    # The test side is built against the categories found in training.
    result,test_x,test_file = nb.func3(testdir,category)
    test_x = np.array(test_x)
    test_file = np.array(test_file)

    nb.write(train_x,'train_x')
    nb.write(train_y,'train_y')
    nb.write(category,'category')
    nb.write(result,'result')
    nb.write(test_x,'test_x')
    nb.write(test_file,'test_file')
def preprocess(traindir='./data/training', testdir='./data/test'):
    """Load the training and test data and store the prepared pieces.

    ``nb.func2(traindir)`` supplies ``train_x``, ``train_y`` and
    ``category``; ``nb.func3(testdir, category)`` supplies ``result``,
    ``test_x`` and ``test_file``. The four sequence results are wrapped
    with ``np.array`` and everything is stored through ``nb.write``
    under the names 'train_x', 'train_y', 'category', 'result',
    'test_x' and 'test_file'.

    NOTE(review): an earlier definition with the same name exists above
    in this file; this later one is the binding callers get.

    Args:
        traindir: directory handed to ``nb.func2``; defaults to the
            previously hard-coded './data/training'.
        testdir: directory handed to ``nb.func3``; defaults to the
            previously hard-coded './data/test'.
    """
    train_x, train_y, category = nb.func2(traindir)
    train_x = np.array(train_x)
    train_y = np.array(train_y)

    # The test side is built against the categories found in training.
    result, test_x, test_file = nb.func3(testdir, category)
    test_x = np.array(test_x)
    test_file = np.array(test_file)

    nb.write(train_x, 'train_x')
    nb.write(train_y, 'train_y')
    nb.write(category, 'category')
    nb.write(result, 'result')
    nb.write(test_x, 'test_x')
    nb.write(test_file, 'test_file')