示例#1
0
def main():
    """Demonstrate several feature-extraction vectorizers and print their output.

    The walkthrough covers, in order:

    1. ``DictVectorizer`` on a small list of dict records.
    2. ``FeatureHasher`` on two tokenized sentences.
    3. ``CountVectorizer`` followed by ``TfidfTransformer`` on the first ten
       items of ``raw_datasets.data``.
    4. ``HashingVectorizer`` on the same ten items.

    NOTE(review): the vectorizer classes are presumably the scikit-learn
    classes of the same names, and ``Datasets`` is a project helper; their
    imports are outside this view -- confirm.

    Every ``print`` call passes exactly one argument so the output is the
    same on Python 2 (where the parentheses are plain grouping) and on
    Python 3 (where ``print`` is a function).
    """
    records = [
        {"city": "beijing", "age": 500, "temperature": 26},
        {"city": "shanghai", "age": 550, "temperature": 27},
        {"city": "shenzheng", "age": 300, "temperature": 30},
    ]

    dict_vectorizer = DictVectorizer()
    dv_datasets = dict_vectorizer.fit_transform(records)
    print(dv_datasets.toarray())
    print(dict_vectorizer.vocabulary_)
    print(dict_vectorizer.feature_names_)
    print("-" * 80)

    # Alternative kept for reference: hash dict samples instead of tokens.
    # fh_vectorizer = FeatureHasher(n_features=10, input_type="dict")
    # fh_datasets = fh_vectorizer.fit_transform([{"text": 10, "words": 7}, {"name": 1, "words": 5}, {"gender": 1}])
    sentences = ["Liming love football", "Zhansan likes baseball"]
    fh_vectorizer = FeatureHasher(n_features=10, input_type="string")
    # With input_type="string" each sample must be an iterable of string
    # tokens. Passing the raw sentence would make the hasher iterate over
    # single characters (or reject the input outright), so split into words.
    fh_datasets = fh_vectorizer.fit_transform(
        [sentence.split() for sentence in sentences]
    )
    print(fh_datasets.toarray())

    raw_datasets, _ = Datasets.load_datasets()
    # Only the first ten items are used to keep the printed output small.
    documents = list(raw_datasets.data[:10])

    count_vectorizer = CountVectorizer(decode_error="ignore")
    cv_datasets = count_vectorizer.fit_transform(documents)
    print(count_vectorizer.vocabulary_)

    # Re-weight the raw term counts produced above.
    tfidf_transformer = TfidfTransformer(smooth_idf=True)
    tfidft_datasets = tfidf_transformer.fit_transform(cv_datasets)
    print(tfidft_datasets.toarray())
    print(tfidf_transformer.idf_)

    hash_vectorizer = HashingVectorizer(n_features=100, decode_error="ignore")
    hv_datasets = hash_vectorizer.fit_transform(documents)
    print(hv_datasets.toarray().shape)
def main():
    """Demonstrate two feature-selection approaches on count-vectorized data.

    Steps:

    1. Load the raw data with ``Datasets.load_datasets()`` and turn it into
       samples ``X`` and labels ``Y`` via ``gen_datasets``.
    2. Vectorize ``X`` with ``CountVectorizer`` into a dense array.
    3. Fit an ``ExtraTreesClassifier``, print its feature importances, and
       use ``SelectFromModel`` on the already-fitted classifier to keep a
       subset of columns.
    4. Binarize the counts and drop low-variance columns with
       ``VarianceThreshold``.

    NOTE(review): the estimator classes are presumably scikit-learn's, and
    ``Datasets`` / ``gen_datasets`` are project helpers defined outside this
    view -- confirm.

    NOTE(review): another ``main`` is defined earlier in this file; if both
    live in the same module, this definition replaces the earlier one.

    Every ``print`` call passes exactly one argument so the output is the
    same on Python 2 and Python 3.
    """
    raw_datasets, _ = Datasets.load_datasets()
    X, Y = gen_datasets(raw_datasets)

    vectorizer = CountVectorizer(decode_error="ignore")
    # Densify once; every later step works on this array.
    cv_datasets = vectorizer.fit_transform(X).toarray()

    clf = ExtraTreesClassifier()
    clf = clf.fit(cv_datasets, Y)
    print(cv_datasets.shape)

    print(clf.feature_importances_)

    # prefit=True: reuse the classifier fitted above instead of refitting.
    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(cv_datasets)
    print(X_new.shape)

    # NOTE(review): with threshold=1.0 a count of exactly 1 presumably maps
    # to 0 (only values above the threshold become 1) -- confirm this is the
    # intended cut-off rather than 0.0.
    binarizer = Binarizer(threshold=1.0)
    b_datasets = binarizer.fit_transform(cv_datasets)
    # .8 * (1 - .8) is p * (1 - p) for p = 0.8, i.e. the variance of a 0/1
    # feature that takes one value in 80% of the samples.
    variance_threshold = VarianceThreshold(.8 * (1 - .8))
    v_datasets = variance_threshold.fit_transform(b_datasets)
    print(v_datasets.shape)