示例#1
0
文件: classify.py 项目: imclab/nytml
def multistage_classify(n, s, e, folds):
    """Cross-validate a classifier on features derived in a second stage.

    Loads features, metafeatures and labels through ``get_data`` (with
    ``metadata=True``), builds a derived feature set with ``new_features``,
    converts the derived features and the labels with ``npa``, and hands
    them to ``kfold_crossval``.

    Args:
        n, s, e: Forwarded unchanged to ``get_data``. Presumably a sample
            size plus start and end dates -- confirm against ``get_data``.
        folds: Forwarded unchanged to ``kfold_crossval``.

    Returns:
        Whatever ``kfold_crossval`` returns.
    """
    raw_features, meta, targets = get_data(n, s, e, metadata=True)
    derived = new_features(raw_features, targets, meta)

    derived_arr = npa(derived)
    targets_arr = npa(targets)

    # NOTE(review): the literal 2 is passed positionally; its meaning is
    # defined by kfold_crossval, which is not visible here.
    return kfold_crossval(derived_arr, targets_arr, folds, 2)
示例#2
0
文件: classify.py 项目: imclab/nytml
    # v_features = (v.fit_transform(POS_features)).toarray()
    # transform text into count vectors
    v = CountVectorizer(min_df=1, max_features=2000)
    v_features = (v.fit_transform(features)).toarray()
    print 'Finished vectorizing text data'

    X, Y = randomize(v_features, labels)

    print len(X[0])

    if len(X) == len(Y): print 'Data check ... OK'
    else: 
        print 'Data check failed. Aborting execution'
        return None

    return kfold_crossval(X, Y, folds, 2, RF=True)

def iterateMNB(n_trials, d_range, s_size, folds):

    C0L, C1L, WL = [], [], []
    # upper and lower date limits
    ll = datetime.date(2012,01,01)
    ul = datetime.date(2013,06,01)

    v = CountVectorizer(min_df=1, max_features=2000)

    for i in range(n_trials):
        s = tools.randomDate(ll, ul)
        e = date.fromordinal(s.toordinal() + d_range)
        # c0, c1, w = multistage_classify(s_size, s, e, folds)
        c0, c1, w = classify_text(s_size, s, e, folds)