Python fetch_20newsgroups示例，sklearn.datasets.twenty_newsgroups.fetch_20newsgroups Python示例

示例#1

0

显示文件

def _flatten_images(images):
    """Return `images` with every axis after the first collapsed into one.

    Inputs that already have two or fewer dimensions are returned unchanged.
    """
    if len(images.shape) > 2:
        return np.reshape(images, (images.shape[0], -1))
    return images


def dispBanner(stage):
    """Print an experiment banner and load the matching train/test data.

    Parameters
    ----------
    stage : str
        'MNIST' loads the MNIST digits through `ms.MNISTcontrol`; any other
        value loads four categories of the 20 newsgroups corpus.

    Returns
    -------
    tuple
        (trX_images, trX, trY, deX, deY) where `trX_images` is the raw
        training input (images for MNIST, document strings for newsgroups),
        `trX`/`deX` are the train/test feature matrices and `trY`/`deY` the
        train/test labels. Newsgroup labels are reshaped to column vectors.
    """
    # Single-argument print(...) calls behave identically on Python 2 and 3.
    if stage == 'MNIST':
        print('************************************************************************')
        print('**  MNIST DIGITS EXPERIMENTS')
        print('************************************************************************')

        print('First we load the data ')
        dataset = ms.MNISTcontrol("../MNIST/")
        trX_images, trY = dataset.load_mnist('training')
        # The feature matrix must have one flat row per sample. Always assign
        # trX so the return below cannot hit an unbound name when the images
        # are already flat.
        trX = _flatten_images(trX_images)
        # Read in the test data and flatten it the same way.
        deX, deY = dataset.load_mnist('test')
        deX = _flatten_images(deX)

        return trX_images, trX, trY, deX, deY

    print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
    print('********************************************************************')
    print('** 20 NEWSGROUPS EXPERIMENTS')
    print('********************************************************************\n')
    print('First we load the data \n')
    cats = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
    news_train = fetch_20newsgroups(subset='train', categories=cats,
                                    shuffle=True, random_state=42)
    news_test = fetch_20newsgroups(subset='test', categories=cats,
                                   shuffle=True, random_state=42)
    for i, target_name in enumerate(news_train.target_names):
        print("Target number " + str(i) + " is " + target_name)

    vectorizer = TfidfVectorizer()
    trX = vectorizer.fit_transform(news_train.data)
    trX_images = news_train.data
    trY = news_train.target
    trY = np.reshape(trY, (trY.shape[0], 1))
    # Reuse the vocabulary and IDF weights learned from the training set.
    # Refitting on the test documents would produce columns that do not line
    # up with trX.
    deX = vectorizer.transform(news_test.data)
    deY = news_test.target
    deY = np.reshape(deY, (deY.shape[0], 1))

    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        prompt = raw_input
    except NameError:
        prompt = input
    prompt('\nPress enter to continue...')
    return trX_images, trX, trY, deX, deY

示例#2

0

显示文件

文件： datasets.py 项目： ickabob/ml

def fetch_20newsgroups_bows(subset='all',
                            data_home="/home/ian/School/CS5950/datasets/"):
    """
    Load the 20 newsgroups dataset and transform it into bag-of-words.
    This is a wrapper around sklearn.fetch_20newsgroups

    Parameters:
    -----------

    subset: ['train'|'test'|'all'], optional
        Select the dataset to load

    data_home: optional, default: "/home/ian/School/CS5950/datasets/"
        Specify a download and cache folder for the datasets.

    Returns
    -------

    bunch : Bunch object
        bunch.data: sparse matrix, shape [n_samples, n_features]
        bunch.target: array, shape [n_samples]
        bunch.target_names: list, length [n_classes]
    """
    # NOTE(review): the visible part of this function never reads `subset`
    # and stops before any return statement -- confirm against the full file.
    data_home = get_data_home(data_home=data_home)
    target_file = os.path.join(data_home, "20newsgroup_bow.pk")

    # The keyword is spelled `categories`; the misspelled `catagories` is
    # rejected by fetch_20newsgroups as an unexpected keyword argument.
    data_train = fetch_20newsgroups(data_home=data_home,
                                    subset='train',
                                    categories=None,
                                    shuffle=True,
                                    random_state=12)
    data_test = fetch_20newsgroups(data_home=data_home,
                                   subset='test',
                                   categories=None,
                                   shuffle=True,
                                   random_state=12)

    if os.path.exists(target_file):
        # Reuse the cached matrices saved by an earlier run.
        X_train, X_test = joblib.load(target_file)
    else:
        # NOTE(review): `vocab_path` is not defined in the visible code --
        # presumably a module-level path; confirm. Keys are the raw lines of
        # the file (line terminators included), values are line indices.
        with open(vocab_path) as vocab_file:
            vocabulary = dict((t, i) for i, t in enumerate(vocab_file))

示例#3

0

显示文件

文件： LR_conf_intervals.py 项目： moghimis/CPR

def check_ci(crit_val=95):
    """Fit a logistic regression on two newsgroups and evaluate it with
    interval-based heuristics.

    `crit_val` is looked up in CRIT_VALS to obtain the z-score passed to
    `get_probs`. When the key is missing, the accepted keys are printed and
    the function returns without doing anything else. Nothing is returned on
    the normal path either; results are reported through print and
    `evaluate_using_heuristic`.
    """
    try:
        z = CRIT_VALS[crit_val]
    except KeyError:
        print('Provide a value one of %s' % list(CRIT_VALS.keys()))
        return None

    print('Using threshold %d%%' % crit_val)

    # Feature pipeline pieces: TF-IDF -> LinearSVC-driven selection -> scaling.
    vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')
    selector = SelectFromModel(LinearSVC(C=0.3, max_iter=1000), prefit=False)
    scaler = StandardScaler()
    classifier = LogisticRegression(C=1, dual=False, solver='lbfgs',
                                    max_iter=1000)

    corpus = twenty_newsgroups.fetch_20newsgroups(
        categories=['alt.atheism', 'talk.religion.misc'])
    docs = corpus['data']
    labels = corpus['target']

    # Hold out 20% for testing, then halve the remainder into fit/validation.
    docs_rest, docs_test, labels_rest, labels_test = train_test_split(
        docs, labels, test_size=0.2, random_state=42)
    docs_fit, docs_hold, labels_fit, labels_hold = train_test_split(
        docs_rest, labels_rest, test_size=0.5, random_state=42)

    # Every transformer is fitted on the fit split only.
    tfidf_fit = vectorizer.fit_transform(docs_fit)
    selected_fit = selector.fit_transform(tfidf_fit, labels_fit)
    feats_fit = scaler.fit_transform(selected_fit.todense())

    def featurize(raw_docs):
        """Apply the already-fitted transformers to unseen documents."""
        selected = selector.transform(vectorizer.transform(raw_docs))
        return scaler.transform(selected.todense())

    feats_hold = featurize(docs_hold)
    feats_test = featurize(docs_test)

    classifier.fit(feats_fit, labels_fit)

    # The standard error comes from the validation split; the original author
    # noted it could be estimated by cross-validation instead.
    std_err = get_se(feats_hold, labels_hold, classifier)
    prob, up, lo = get_probs(classifier, feats_test, std_err, z=z)
    # The original author recorded 90.6% next to this line.
    print('Accuracy normally:', classifier.score(feats_test, labels_test))
    evaluate_using_heuristic(prob, up, lo, labels_test)

示例#4

0

显示文件

文件： bicluster_newsgroups.py 项目： Comy/scikit-learn

    token_pattern = re.compile(u'(?u)\\b\\w\\w+\\b')
    tokens = token_pattern.findall(doc)
    tokens = ["#NUMBER" if token[0] in "0123456789_" else token
              for token in tokens]
    return tokens

# Script section: vectorize a subset of the 20 newsgroups corpus and start
# timing a spectral co-clustering fit.

# exclude 'comp.os.ms-windows.misc'
# (the list below holds the remaining nineteen newsgroup names)
categories = ['alt.atheism', 'comp.graphics',
              'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
              'comp.windows.x', 'misc.forsale', 'rec.autos',
              'rec.motorcycles', 'rec.sport.baseball',
              'rec.sport.hockey', 'sci.crypt', 'sci.electronics',
              'sci.med', 'sci.space', 'soc.religion.christian',
              'talk.politics.guns', 'talk.politics.mideast',
              'talk.politics.misc', 'talk.religion.misc']
# No `subset` is passed, so the loader's default subset is used.
newsgroups = fetch_20newsgroups(categories=categories)
# Ground-truth labels, kept under a separate name from the cluster results.
y_true = newsgroups.target

# TF-IDF with English stop words, dropping terms seen in fewer than 5
# documents; tokens come from number_aware_tokenizer, which rewrites tokens
# starting with a digit or underscore to "#NUMBER".
vectorizer = TfidfVectorizer(stop_words='english', min_df=5,
                             tokenizer=number_aware_tokenizer)
# Both models are asked for one cluster per selected category and share the
# same fixed seed.
cocluster = SpectralCoclustering(n_clusters=len(categories),
                                 svd_method='arpack', random_state=0)
kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,
                         random_state=0)

print("Vectorizing...")
X = vectorizer.fit_transform(newsgroups.data)

print("Coclustering...")
# start_time is taken immediately before the fit so the fit can be timed.
start_time = time()
cocluster.fit(X)

示例#5

0

显示文件

文件： plot_bicluster_newsgroups.py 项目： johnnykwwang/GPGPU_Final_Python_CUDA

    tokens = token_pattern.findall(doc)
    tokens = ["#NUMBER" if token[0] in "0123456789_" else token
              for token in tokens]
    return tokens


# Script section: vectorize a subset of the 20 newsgroups corpus and start
# timing a spectral co-clustering fit.

# exclude 'comp.os.ms-windows.misc'
# (the list below holds the remaining nineteen newsgroup names)
categories = ['alt.atheism', 'comp.graphics',
              'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
              'comp.windows.x', 'misc.forsale', 'rec.autos',
              'rec.motorcycles', 'rec.sport.baseball',
              'rec.sport.hockey', 'sci.crypt', 'sci.electronics',
              'sci.med', 'sci.space', 'soc.religion.christian',
              'talk.politics.guns', 'talk.politics.mideast',
              'talk.politics.misc', 'talk.religion.misc']
# No `subset` is passed, so the loader's default subset is used.
newsgroups = fetch_20newsgroups(categories=categories)
# Ground-truth labels, kept under a separate name from the cluster results.
y_true = newsgroups.target

# TF-IDF with English stop words, dropping terms seen in fewer than 5
# documents; tokens come from number_aware_tokenizer, which rewrites tokens
# starting with a digit or underscore to "#NUMBER".
vectorizer = TfidfVectorizer(stop_words='english', min_df=5,
                             tokenizer=number_aware_tokenizer)
# Both models are asked for one cluster per selected category and share the
# same fixed seed.
cocluster = SpectralCoclustering(n_clusters=len(categories),
                                 svd_method='arpack', random_state=0)
kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000,
                         random_state=0)

print("Vectorizing...")
X = vectorizer.fit_transform(newsgroups.data)

print("Coclustering...")
# start_time is taken immediately before the fit so the fit can be timed.
start_time = time()
cocluster.fit(X)

示例#6

0

显示文件

文件： bn_new_cate.py 项目： yxdongshine/machineLearning

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors.classification import KNeighborsClassifier
from sklearn.svm.classes import SVC, LinearSVC
import matplotlib.pyplot as plt

# Matplotlib configuration: use the simHei font for sans-serif text and turn
# off the unicode minus glyph (presumably so CJK labels and negative tick
# values render correctly -- confirm against the plots this script draws).
mpl.rcParams['font.sans-serif'] = [u'simHei']
mpl.rcParams['axes.unicode_minus'] = False

# Load the data
load_start_time = time()
# Parts of each post that the loader is asked to strip.
remove = ('headers', 'footers', 'quotes')
# Categories (a tuple of four newsgroup names)
categories = 'alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'
# The train and test subsets are fetched with identical settings so they
# differ only in `subset`.
data_train = fetch_20newsgroups(data_home='../../data/',
                                subset='train',
                                categories=categories,
                                shuffle=True,
                                random_state=0,
                                remove=remove)
data_test = fetch_20newsgroups(data_home='../../data/',
                               subset='test',
                               categories=categories,
                               shuffle=True,
                               random_state=0,
                               remove=remove)
# Report the seconds elapsed since load_start_time (Python 2 print statement).
print u"完成数据加载过程耗时:%.3fs" % (time() - load_start_time)


#查看测试数据和训练数据大小
def size_mb(docs):
    """Return the combined UTF-8 encoded size of `docs` in units of 1e6 bytes."""
    total_bytes = 0
    for text in docs:
        total_bytes += len(text.encode('utf-8'))
    return total_bytes / 1e6

示例#7

0

显示文件

文件： text.py 项目： ingochris/Charlie

# Four newsgroup names used to restrict the corpus.
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

import sklearn.datasets.twenty_newsgroups as news
from sklearn.feature_extraction.text import CountVectorizer

# Shuffled training subset, limited to the categories above, with a fixed seed.
twenty_train = news.fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

# Build a document-term count matrix from the raw posts and print its shape.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print X_train_counts.shape

# NOTE(review): looks like a leftover debug print -- confirm it is wanted.
print "hello!"

示例#8

0

显示文件

文件： text.py 项目： andrely/mimir

from sklearn.datasets.twenty_newsgroups import fetch_20newsgroups

# Call the loader with all default arguments; the returned value is discarded
# (presumably run only to download/cache the corpus -- confirm).
fetch_20newsgroups()

示例#9

0

显示文件

"""
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.datasets.twenty_newsgroups import fetch_20newsgroups
from global_variables import stop, topic_mapping, inverse_mapping, leaf_to_topic, inverse_leaf_to_topic, cats
from functions import build_hierarchy, train_classifiers, build_classifier_map, predict_class
import time

# time.clock() was removed in Python 3.8; use process_time(), which this
# script already relies on for start_time below.
start = time.process_time()

# Shuffled training subset restricted to the project's category list.
train_dataset = fetch_20newsgroups(subset='train',
                                   categories=cats,
                                   shuffle=True,
                                   random_state=42)

# Adjacency list represents the hierarchy tree
# node_int_map maps the node label to the adjacency list index
# node_int_inverse_map represents the inverse of node_int_map
# parent_nos represents the number of nodes which are not leaves
[adjacency_list, node_int_map, node_int_inverse_map,
 parent_nos] = build_hierarchy()
start_time = time.process_time()
# Unigram + bigram counts with the project's stop-word list, then TF-IDF
# weighting of those counts.
count_vectorizer = CountVectorizer(stop_words=stop, ngram_range=(1, 2))
tfidf_transformer = TfidfTransformer()
features = count_vectorizer.fit_transform(train_dataset.data)
features = tfidf_transformer.fit_transform(features)
print("----Built Features----")

示例#10

0

显示文件

def load_data(dataset_name, test_size=0.2, random_state=None):
    """Load a named dataset and split it into train and test parts.

    Loader modules are imported lazily inside each branch, so only the
    dependencies of the requested dataset need to be installed.

    Parameters
    ----------
    dataset_name : str
        One of the names handled below (e.g. 'iris', 'digits',
        '20newsgroups', 'covtype', ...).
    test_size : float, optional
        Forwarded to `train_test_split`; defaults to the previously
        hard-coded 0.2.
    random_state : optional
        Forwarded to `train_test_split`. The default None keeps the previous
        behaviour of an unseeded split.

    Returns
    -------
    tuple
        ((x_train, y_train), (x_test, y_test), num_cls)

    Raises
    ------
    ValueError
        If `dataset_name` is not recognised.
    """
    # scikit-learn loaders are imported from the public `sklearn.datasets`
    # namespace; the private submodules (sklearn.datasets.twenty_newsgroups,
    # .olivetti_faces, .lfw, .rcv1, .kddcup99) were deprecated and later
    # removed.
    if dataset_name == 'iris':
        from sklearn.datasets import load_iris
        iris = load_iris()
        X, y = iris.data, iris.target
        num_cls = 3
    elif dataset_name == 'digits':
        from sklearn.datasets import load_digits
        digits = load_digits()
        X, y = digits.data, digits.target
        num_cls = 10
    elif dataset_name == 'wine':
        from sklearn.datasets import load_wine
        wine = load_wine()
        X, y = wine.data, wine.target
        num_cls = 3
    elif dataset_name == 'breast_cancer':
        from sklearn.datasets import load_breast_cancer
        data = load_breast_cancer()
        X, y = data.data, data.target
        num_cls = 2
    elif dataset_name == 'olivetti_faces':
        from sklearn.datasets import fetch_olivetti_faces
        data = fetch_olivetti_faces()
        X, y = data.data, data.target
        num_cls = 40
    elif dataset_name == '20newsgroups':
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.datasets import fetch_20newsgroups
        data = fetch_20newsgroups()
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(data.data)
        y = data.target
        num_cls = 20
    elif dataset_name == '20newsgroups_vectorized':
        from sklearn.datasets import fetch_20newsgroups_vectorized
        data = fetch_20newsgroups_vectorized()
        X, y = data.data, data.target
        num_cls = 20
    elif dataset_name == 'lfw_people':
        from sklearn.datasets import fetch_lfw_people
        # Only people with at least 50 face images are kept; the original
        # author noted that the unfiltered dataset has 5749 classes.
        data = fetch_lfw_people(min_faces_per_person=50)
        X, y = data.data, data.target
        num_cls = 12
        assert num_cls == len(set(y))
    elif dataset_name == 'covtype':
        from benchmark.utils.dataset_scripts.covtype import load_covtype
        X, y = load_covtype()
        num_cls = 7
    elif dataset_name == 'rcv1':
        from sklearn.datasets import fetch_rcv1
        data = fetch_rcv1(subset='train')
        # Reduce each row of the target matrix to the index of its largest
        # entry, then pass the result through trans_label.
        X, y = data.data, trans_label(np.argmax(data.target.A, axis=1))
        num_cls = len(set(y))
        assert num_cls == 37
    elif dataset_name == 'kddcup99-sa':
        from sklearn.datasets import fetch_kddcup99
        data = fetch_kddcup99(subset='SA')

        y = trans_label(data.target)
        # Columns 1, 2 and 3 are label-encoded in place on a copy. The
        # original author noted that column 2 has too many categories for
        # one-hot encoding; one-hot encoding of columns 1 and 3 had been
        # tried and was disabled.
        X = data.data.copy()
        X[:, 2] = trans_label(data.data[:, 2])
        X[:, 1] = trans_label(data.data[:, 1])
        X[:, 3] = trans_label(data.data[:, 3])
        num_cls = len(set(y))
    elif dataset_name == 'kddcup99-smtp':
        from sklearn import preprocessing
        from sklearn.datasets import fetch_kddcup99
        data = fetch_kddcup99(subset='smtp')
        le = preprocessing.LabelEncoder()
        le.fit(data.target)
        y = le.transform(data.target)
        X = data.data
        num_cls = 3
    elif dataset_name == 'fall_detection':
        from benchmark.utils.dataset_scripts.fall_detection import load_fall_detection
        X, y = load_fall_detection()
        num_cls = 6
    elif dataset_name == 'banana':
        from benchmark.utils.dataset_scripts.banana import load_banana
        X, y = load_banana()
        num_cls = 2
    elif dataset_name == 'talkingdata':
        # NOTE(review): `load_talkinigdata` is spelled as the project exports
        # it here -- confirm before renaming.
        from benchmark.utils.dataset_scripts.talkingdata import load_talkinigdata
        X, y = load_talkinigdata()
        num_cls = 2
    elif dataset_name == 'biomechanical2C':
        from benchmark.utils.dataset_scripts.biomechanical import load_biomechanical2C
        X, y = load_biomechanical2C()
        num_cls = 2
    elif dataset_name == 'biomechanical3C':
        from benchmark.utils.dataset_scripts.biomechanical import load_biomechanical3C
        X, y = load_biomechanical3C()
        num_cls = 3
    elif dataset_name == 'susy':
        from benchmark.utils.dataset_scripts.susy import load_susy
        X, y = load_susy()
        num_cls = 2
    elif dataset_name == 'higgs':
        from benchmark.utils.dataset_scripts.higgs import load_higgs
        X, y = load_higgs()
        num_cls = 2
    elif dataset_name == 'hepmass':
        from benchmark.utils.dataset_scripts.hepmass import load_hepmass
        X, y = load_hepmass()
        num_cls = 2
    elif dataset_name == 'letter':
        from benchmark.utils.dataset_scripts.letter import load_letter
        X, y = load_letter()
        num_cls = 26
    elif dataset_name == 'usps':
        from benchmark.utils.dataset_scripts.usps import load_usps
        X, y = load_usps()
        num_cls = 10
    elif dataset_name == 'epsilon':
        # NOTE(review): load_epsilon is imported from the `usps` module --
        # confirm this is intended and not a copy/paste slip.
        from benchmark.utils.dataset_scripts.usps import load_epsilon
        X, y = load_epsilon()
        num_cls = 2
    elif dataset_name == 'dermatology':
        from benchmark.utils.dataset_scripts.dermatology import load_dermatology
        X, y = load_dermatology()
        num_cls = 4
    elif dataset_name == 'poker':
        from benchmark.utils.dataset_scripts.poker import load_poker
        X, y = load_poker()
        num_cls = 10
    elif dataset_name == 'sensorless':
        from benchmark.utils.dataset_scripts.sensorless import load_sensorless
        X, y = load_sensorless()
        num_cls = 11
    elif dataset_name == 'phishing':
        # NOTE(review): the module name `phishinig` is kept exactly as the
        # project spells it -- confirm before renaming.
        from benchmark.utils.dataset_scripts.phishinig import load_phishing
        X, y = load_phishing()
        num_cls = 2
    elif dataset_name == 'w8a':
        from benchmark.utils.dataset_scripts.w8a import load_w8a
        X, y = load_w8a()
        num_cls = 2
    elif dataset_name == 'a8a':
        from benchmark.utils.dataset_scripts.a8a import load_a8a
        X, y = load_a8a()
        num_cls = 2
    elif dataset_name == 'sector':
        from benchmark.utils.dataset_scripts.sector import load_sector
        X, y = load_sector()
        num_cls = 105
    elif dataset_name == 'protein':
        from benchmark.utils.dataset_scripts.protein import load_protein
        X, y = load_protein()
        num_cls = 3
    elif dataset_name == 'shuttle':
        from benchmark.utils.dataset_scripts.shuttle import load_shuttle
        X, y = load_shuttle()
        num_cls = 7
    elif dataset_name == 'vowel':
        from benchmark.utils.dataset_scripts.vowel import load_vowel
        X, y = load_vowel()
        num_cls = 11
    elif dataset_name == 'splice':
        from benchmark.utils.dataset_scripts.splice import load_splice
        X, y = load_splice()
        num_cls = 2
    elif dataset_name == 'codrna':
        from benchmark.utils.dataset_scripts.codrna import load_codrna
        X, y = load_codrna()
        num_cls = 2
    elif dataset_name == 'australian':
        from benchmark.utils.dataset_scripts.australian import load_australian
        X, y = load_australian()
        num_cls = 2
    else:
        raise ValueError('Invalid dataset name!')

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return (x_train, y_train), (x_test, y_test), num_cls

示例#11

0

显示文件

文件： 20NewsBags.py 项目： ttthhh/ballpark

###########
### The following file generates bags of instances for the 20Newsgroups data set (see description in paper)
##########
from sklearn.datasets.twenty_newsgroups import fetch_20newsgroups
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Candidate binary tasks: each entry is a pair of newsgroup names.
class_pair_list = [("sci.space","sci.med"),("comp.sys.mac.hardware","comp.sys.ibm.pc.hardware"),
                   ("rec.sport.hockey",'rec.sport.baseball')]

### Select binary classification task (category pair)
# Index 2 picks the hockey/baseball pair.
c1,c2 = class_pair_list[2]

#load train,test data, represent with TF-IDF
# The "test" subset is used as the validation set.
newsgroups_valid = fetch_20newsgroups(subset = "test",categories=[c1, c2])
y_valid = newsgroups_valid.target
# Sub-linear term frequency, English stop words, and terms appearing in more
# than half of the documents are dropped.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words="english")

newsgroups_train = fetch_20newsgroups(subset = "train",categories=[c1, c2])
# Copy the training documents and labels, then convert both to numpy arrays.
dd = list(newsgroups_train.data)
y_train = list(newsgroups_train.target)
y_train =np.array(y_train)
dd = np.array(dd)

# The vocabulary is learned from the training documents only and then
# applied unchanged to the validation documents.
X_train = vectorizer.fit_transform(dd)
X_valid = vectorizer.transform(newsgroups_valid.data)

# Re-encode the labels: 0 becomes -1, every other value becomes +1.
y_train = np.where(y_train==0,-1,1)
y_valid = np.where(y_valid==0,-1,1)

示例#12

0

显示文件

from sklearn.datasets.twenty_newsgroups import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer  # Using inverse document frequency to filter the noise
from sklearn.pipeline import Pipeline
from sklearn import metrics
from global_variables import stop, cats
import time

# Shuffled training subset restricted to the project's category list, with
# headers, footers and quoted text stripped from every post.
train_dataset = fetch_20newsgroups(subset='train',
                                   categories=cats,
                                   remove=('headers', 'footers', 'quotes'),
                                   shuffle=True,
                                   random_state=42)
# remove=('headers', 'footers', 'quotes'),
# Using pipeline to fit data to model and then convert it to tf-idf counts
# Steps run in order: unigram+bigram counts -> TF-IDF weighting -> SGD
# classifier.
pipeline = Pipeline([('count_vectorizer',
                      CountVectorizer(stop_words=stop, ngram_range=(1, 2))),
                     ('tfidf_transformer', TfidfTransformer()),
                     ('classifier', SGDClassifier(alpha=1e-3,
                                                  random_state=42))])

# Start the training timer; the elapsed value is printed after fit().
start_time = time.process_time()

# SGDClassifier(alpha=1e-3, random_state=42)
pipeline.fit(train_dataset.data, train_dataset.target)
print("Training Time : " + str(time.process_time() - start_time))

test_dataset = fetch_20newsgroups(subset='test',
                                  remove=('headers', 'footers', 'quotes'),