Example #1
0
def vectorize(train_words, test_words):
    # stop-word list
    with open('dict/stopwords.txt', 'r') as f:
        stopwords = set([w.strip() for w in f])

    v = HashingVectorizer(non_negative=True, stop_words=stopwords, n_features=30000)
    train_data = v.fit_transform(train_words)
    test_data = v.fit_transform(test_words)
    return train_data, test_data
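Note for newer scikit-learn: the non_negative=True flag used above was deprecated and later removed; alternate_sign=False is the replacement, and because HashingVectorizer is stateless, transform() is enough for both sets. A minimal sketch of the same step against the current API (the stop-word file path is reused from the snippet above):

from sklearn.feature_extraction.text import HashingVectorizer

def vectorize_modern(train_words, test_words):
    # Load the stop-word list, one word per line.
    with open('dict/stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = [w.strip() for w in f if w.strip()]

    # alternate_sign=False keeps all hashed values non-negative,
    # which is what non_negative=True used to do.
    v = HashingVectorizer(alternate_sign=False, stop_words=stopwords, n_features=30000)
    train_data = v.transform(train_words)  # stateless: no fit needed
    test_data = v.transform(test_words)
    return train_data, test_data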
Example #2
0
 def feature_extraction(self, test):
     """
     function: feature extraction
     :param test:
     :return: training features, test features
     """
     train = self.load_train_set()
     vectorizer = HashingVectorizer(stop_words='english', non_negative=True, n_features=25000)
     fea_train = vectorizer.fit_transform(train)  # extract features
     fea_test = vectorizer.fit_transform(test)  # extract features
     return fea_train, fea_test
Example #3
0
File: code.py Project: calippo/hackerrank
def main(new):
  with open("trainingdata.txt","r") as f:
    int(f.readline())
    training_set = [r.split(" ") for r in f]
  y = [int(doc[0]) for doc in training_set]
  corpus = [reduce(lambda x, y: x + " " + y, doc[1::]) for doc in training_set]
  # vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), stop_words='english', lowercase=True)
  vectorizer = HashingVectorizer()
  # vectorizer = HashingVectorizer()
  X_train = vectorizer.fit_transform(corpus)
  y_train = np.array(y)
  data = vectorizer.fit_transform(new)
  clf = (LinearSVC(), "SVM")
  # print corpus
  test(clf, X_train, y_train, data)
Example #4
0
def get_hashing(data):
  t0 = time.time()
  print("* Making hashing vectorizor with the data ...")
  hasher = HashingVectorizer(stop_words='english', ngram_range=(1,3), norm='l2', non_negative=True) #l2 projected on the euclidean unit sphere
  hX = hasher.fit_transform(data)
  print("done in %0.3fs." % (time.time() - t0))
  return hX, hasher
Example #5
0
def do_training():
    global X_train, X_test, feature_names, ch2
    print("Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train_data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.25,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train_data)
    duration = time() - t0
    #print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test_data)
    duration = time() - t0
    #print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()

    if True:#opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" % 20000)
        t0 = time()
        ch2 = SelectKBest(chi2, k=20000)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [feature_names[i] for i
                             in ch2.get_support(indices=True)]
        print("done in %fs" % (time() - t0))
        print()
    
    if feature_names:
        feature_names = np.asarray(feature_names)

    results = []

    #for penalty in ["l2", "l1"]:
    penalty = 'l2'
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    clf = LinearSVC(loss='l2', penalty=penalty,dual=False, tol=1e-3)
    results.append(benchmark(clf))
        
    joblib.dump(vectorizer, 'vectorizer.pkl', compress=9)
    joblib.dump(ch2, 'feature_selector.pkl', compress=9)
    joblib.dump(clf, 'linearsvc_classifier.pkl', compress=9)
Example #6
0
def sim_char10(text1, text2):
    vect = HashingVectorizer(analyzer='char_wb', tokenizer=normalize, stop_words='english', ngram_range=(10, 10))
    texts = [text1, text2]
    matrix = vect.fit_transform(texts)
    cosine_similarities = linear_kernel(matrix[0:1], matrix).flatten()
    simmax = max(cosine_similarities[1:])
    return simmax
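A self-contained variant of the character 10-gram similarity above. Note that tokenizer and stop_words only apply when analyzer='word', so the normalize tokenizer and the English stop list in the snippet have no effect with analyzer='char_wb'; this sketch simply drops them (the example strings are made up):

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics.pairwise import linear_kernel

def sim_char10_minimal(text1, text2):
    # Hash overlapping 10-character windows, padded at word boundaries.
    vect = HashingVectorizer(analyzer='char_wb', ngram_range=(10, 10))
    matrix = vect.fit_transform([text1, text2])
    # Rows are L2-normalised by default, so the dot product is the cosine similarity.
    return linear_kernel(matrix[0:1], matrix).flatten()[1]

print(sim_char10_minimal("the quick brown fox jumps over the lazy dog",
                         "the quick brown fox leaps over the lazy dog"))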
def trainOnModel(x_VariableList, y_VariableList, testSetList, classifier, hashing=False, chi_squared=False):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.feature_selection import SelectKBest, chi2
    from sklearn.linear_model import RidgeClassifier
    from sklearn.svm import LinearSVC
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import PassiveAggressiveClassifier
    from sklearn.utils.extmath import density
    y_train = y_VariableList
    if hashing == True:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=2 ** 16)
        X_train = vectorizer.transform(x_VariableList)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(x_VariableList)

    X_test = vectorizer.transform(testSetList)

    if chi_squared == True:
        print("Extracting best features by a chi-squared test")
        ch2 = SelectKBest(chi2, k=2 * 16)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)

    classifierObject = ""
    print "Using :", classifier

    if classifier == "LinearSVC":
        classifierObject = LinearSVC(penalty='l2', dual=False, tol=1e-3)

    elif classifier == "PassiveAggressiveClassifier":
        classifierObject = PassiveAggressiveClassifier(C=1.0, fit_intercept=True, loss='hinge',
                                                       n_iter=50, n_jobs=1, random_state=None, shuffle=True,
                                                       verbose=0, warm_start=False)

    elif classifier == "RidgeClassifier":
        classifierObject = RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                                           max_iter=None, normalize=False, solver='lsqr', tol=0.01)

    elif classifier == "Perceptron":
        classifierObject = Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
                                      n_iter=50, n_jobs=1, penalty=None, random_state=0, shuffle=True,
                                      verbose=0, warm_start=False)

    elif classifier == "SGDClassifier":
        classifierObject = SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
                                         eta0=0.0, fit_intercept=True, l1_ratio=0.15,
                                         learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
                                         penalty='l2', power_t=0.5, random_state=None, shuffle=True,
                                         verbose=0, warm_start=False)

    classifierObject.fit(X_train, y_train)
    pred = classifierObject.predict(X_test)
    return pred[0]
class Featurizer:
    def __init__(self):
        self.vectorizer = HashingVectorizer(stop_words="english")

    def train_feature(self, examples):
        return self.vectorizer.fit_transform(examples)

    def test_feature(self, examples):
        return self.vectorizer.transform(examples)
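Because the hashing trick needs no fitted vocabulary, train_feature and test_feature above are interchangeable; a quick check along those lines (toy documents, standalone HashingVectorizer rather than the Featurizer wrapper):

from sklearn.feature_extraction.text import HashingVectorizer

docs = ["the quick brown fox", "jumps over the lazy dog"]
vec = HashingVectorizer(stop_words="english")

# fit_transform() and transform() produce identical matrices: fit() is a no-op.
a = vec.fit_transform(docs)
b = vec.transform(docs)
assert (a - b).nnz == 0
print(a.shape)  # (2, 1048576) with the default n_features of 2**20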
Example #9
0
def ngrams_hashing_vectorizer(strings, n, n_features):
    """ Return the a disctionary with the count of every
    unique n-gram in the string.
    """
    hv = HashingVectorizer(analyzer='char', ngram_range=(n, n),
                           n_features=n_features, norm=None,
                           alternate_sign=False)
    hash_matrix = hv.fit_transform(strings)
    return hash_matrix
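A quick usage sketch for the helper above (the strings and the feature-space size are illustrative):

strings = ['London', 'Londres', 'Paris']
counts = ngrams_hashing_vectorizer(strings, n=3, n_features=128)
print(counts.shape)   # (3, 128)
print(counts[0].nnz)  # number of distinct hashed 3-grams in 'London'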
Example #10
0
    def get_x(text,ngram_range):

        hash_vect_object = HashingVectorizer(ngram_range=ngram_range, stop_words="english", strip_accents="unicode")
        tfidf_transformer_object = TfidfTransformer(use_idf=True)

        x_train_counts = hash_vect_object.fit_transform(text)
        x_train_tfidf = tfidf_transformer_object.fit_transform(x_train_counts)

        return x_train_tfidf
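The same hash-then-reweight idea can also be written as a single scikit-learn pipeline, which keeps the IDF statistics attached to one vectorizer object; a minimal sketch mirroring the parameters above (the function name and default ngram_range are illustrative):

from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline

def get_x_pipeline(text, ngram_range=(1, 2)):
    # HashingVectorizer hashes the raw text; TfidfTransformer then applies IDF weighting.
    vectorizer = make_pipeline(
        HashingVectorizer(ngram_range=ngram_range, stop_words="english",
                          strip_accents="unicode"),
        TfidfTransformer(use_idf=True),
    )
    return vectorizer.fit_transform(text)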
Example #11
0
File: sentiment.py Project: nobsu/grape
def vectorize(docs):
    """
    Vectorize documents
    :param docs list: iterable over raw text documents
    :return:
    """
    v = HashingVectorizer(tokenizer=comma_tokenizer, n_features=30000, non_negative=True)
    train_data = v.fit_transform(docs)
    return train_data
Example #12
0
def vectorize_data(train_data, test_data):
    global app_vocabulary
    
    # vectorize=CountVectorizer(vocabulary=list(app_vocabulary))
    # counts_train=vectorize.fit_transform(train_data)
    # counts_test=vectorize.fit_transform(test_data)

    # tfidftransformer = TfidfTransformer();
    # counts_train=tfidftransformer.fit(counts_train).transform(counts_train);
    # counts_test=tfidftransformer.fit(counts_test).transform(counts_test);

    # f=open('model/vector.pkl','w')
    # pickle.dump(vectorize, f)
    vectorizer=HashingVectorizer()
    counts_train=vectorizer.fit_transform(train_data)
    counts_test=vectorizer.fit_transform(test_data)

    return counts_train, counts_test
Example #13
0
def vector_func_char(l):
    vectorizer = HashingVectorizer(
        analyzer="char",
        input="content",
        decode_error="ignore",
        strip_accents="ascii",
        ngram_range=(2, 2),
        n_features=524288,
    )

    return str(l).split(" ")[0], vectorizer.fit_transform(str(l).replace(str(l).split(" ")[0], ""))
Example #14
0
def vectorize_2(test_words):
    input_words = jieba.lcut(test_words[0])
    print check_neg(input_words)

    #  if len(jieba.lcut(test_words[0])) < 2:
    if len(jieba.lcut(test_words[0])) < 2:
        return None, False
    else:
        v = HashingVectorizer(tokenizer=comma_tokenizer, stop_words=stopwords, n_features=100000, non_negative=True)
        test_data = v.fit_transform(test_words)
        print test_data
        return test_data, check_neg(input_words)
Example #15
0
def vector_func_word(l):
    vectorizer = HashingVectorizer(
        non_negative=True,
        stop_words="english",
        input="content",
        decode_error="ignore",
        strip_accents="ascii",
        n_features=262144,
    )

    # return str(l).split(" ")[0],vectorizer.fit_transform(str(l).replace(str(l).split(" ")[0],""))
    return vectorizer.fit_transform(l).shape
Example #16
0
def tfidfVectorizeData(listOfSentences, useHashTable=False, nFeatures=100):
    
    if useHashTable:
        from sklearn.feature_extraction.text import HashingVectorizer
        vec = HashingVectorizer(stop_words='english', non_negative=True, n_features=nFeatures)
        X_noProcess = vec.transform(listOfSentences).toarray()
    else:
        from sklearn.feature_extraction.text import TfidfVectorizer
        vec = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
        X_noProcess = vec.fit_transform(listOfSentences).toarray()

    return vec, X_noProcess
Example #17
0
    def get_x(text,ngram_range):

        hash_vect_object = HashingVectorizer(ngram_range=ngram_range,
                                             stop_words="english",
                                             strip_accents="unicode",
                                             token_pattern=r"(?u)\b[a-zA-Z_][a-zA-Z_]+\b") # tokens are character strings of 2 or more characters
        tfidf_transformer_object = TfidfTransformer(use_idf=True)

        x_train_counts = hash_vect_object.fit_transform(text)
        x_train_tfidf = tfidf_transformer_object.fit_transform(x_train_counts)

        return x_train_tfidf
 def trainFeatureExtract(self, opts, trainData, trainDataSize):
     print 'Extracting features from the training dataset using a sparse vectorizer'
     t0 = time()
     if opts.use_hashing:
         vectorizer = HashingVectorizer(stop_words='english', non_negative=True, n_features=opts.n_features)
         dataTrain = vectorizer.transform(trainData.data)
     else:
         vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
         dataTrain = vectorizer.fit_transform(trainData.data)
     duration = time() - t0
     print 'done in %fs at %0.3fMB/s' % (duration, trainDataSize / duration)
     print 'n_samples: %d, n_features: %d' % dataTrain.shape
     print 
     return dataTrain, vectorizer
Example #19
0
class MultiNBClass:
    def __init__ (self, corpus, classes, method):
        # Set up vectorizier
        if method == 'count':
            self.vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 3)) 
        elif method == 'tfidf':
            self.vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 3))
        elif method == 'hashing':
            self.vectorizer = HashingVectorizer(non_negative = True)
        else:
            print 'Method must be count, tfidf, or hashing'
        # vectorize and set up classifier. 
        self.X = self.vectorizer.fit_transform(corpus)
        classifier = MultinomialNB()
        self.classifier = classifier.fit(self.X, classes)
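MultinomialNB only accepts non-negative feature values, which is why the hashing branch above needs non_negative=True (alternate_sign=False in newer scikit-learn releases). A minimal sketch with a made-up corpus:

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import MultinomialNB

corpus = ["good movie", "bad movie", "great film", "awful film"]
classes = [1, 0, 1, 0]

# alternate_sign=False keeps the hashed values >= 0 so MultinomialNB will accept them.
vectorizer = HashingVectorizer(alternate_sign=False)
X = vectorizer.fit_transform(corpus)
clf = MultinomialNB().fit(X, classes)
print(clf.predict(vectorizer.transform(["good film"])))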
Example #20
0
class CompanyPrefix(BaseEstimator, TransformerMixin):
    def __init__(self, filepath, key=""):
        self.key = key
        self.vect = HashingVectorizer(decode_error='strict', n_features = 2**18, binary=True)
        self.gcp_length_table = pd.read_csv(filepath, sep="\t", dtype=str)
        lens = self.gcp_length_table['prefix'].str.len()
        self.max_key_len = lens.max()
        self.min_key_len = lens.min()

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        # Pad GTINs with zeroes so that they are all 13 chars long
        data_dict[self.key] = data_dict[self.key].fillna(0).astype(int).astype(str).str.pad(13, fillchar="0")
        # Create a column for keeping track of company prefix lengths
        data_dict['gcp_length'] = np.nan
        # Iterate through the different lengths of prefixes in the
        # company prefix lookup table. Then extract possible prefixes from the
        # GTIN's for each prefix length.
        # Join these possible prefixes with the lookup table prefixes
        # to get the length of this gcp prefix
        for length in range(self.min_key_len, self.max_key_len+1):
            # Generate column gtin_<length> with the first length digits of each
            # row's GTIN
            data_dict['gtin_'+str(length)] = data_dict[self.key].str[0:length]
            # Join this GTIN prefix with the lookup table to see if it exists.
            # If it doesn't, the gcp_len will be NaN
            data_dict = data_dict.merge(self.gcp_length_table, how="left", left_on="gtin_"+str(length), right_on='prefix', sort=False)
            is_not_nan = pd.notnull(data_dict['gcp_len'])
            # set gcp_length column for rows that aren't NaN
            data_dict.ix[is_not_nan, 'gcp_length'] = data_dict.ix[is_not_nan, 'gcp_len']
            # Drop the temporary columns that were created
            data_dict.drop(['gtin_'+str(length), 'prefix', 'gcp_len'], axis=1, inplace=True)
        # Fill NaN's with 0's
        data_dict['gcp_length'] = data_dict['gcp_length'].fillna('0').astype(int)
        # Create string columns for storing actual gcp's
        data_dict['gcp'] = ''
        # Only apply to strings that actually have a GTIN
        # (would have been padded with 13 zeroes in earlier steps)
        isvalid = data_dict[self.key] != '0000000000000'
        # lambda to substring gtins based on gcp_length column value
        substring_GTIN = lambda row: row[self.key][0:row['gcp_length']]
        # apply substring
        data_dict.ix[isvalid, 'gcp'] = data_dict.ix[isvalid].apply(substring_GTIN, axis=1)
        # apply HashingVectorizer to result
        return self.vect.fit_transform(data_dict['gcp'])
    def test_dummy_analyzer(self):
        X, X_rdd = self.generate_text_dataset()

        def splitter(x):
            return x.split()
        X = map(splitter, X)
        X_rdd = X_rdd.map(lambda x: map(splitter, x))

        local = HashingVectorizer(analyzer=lambda x: x)
        dist = SparkHashingVectorizer(analyzer=lambda x: x)

        result_local = local.transform(X)
        result_dist = sp.vstack(dist.transform(X_rdd).collect())
        assert_array_equal(result_local.toarray(), result_dist.toarray())

        result_local = local.fit_transform(X)
        result_dist = sp.vstack(dist.fit_transform(X_rdd).collect())
        assert_array_equal(result_local.toarray(), result_dist.toarray())
    def test_chunked_hashing_vectorizer(self):
        # results should not depend on chunk size
        _, X = _extract_reads(Artifact.import_data(
            'FeatureData[Sequence]',
            self.get_data_path('se-dna-sequences.fasta')).view(DNAIterator))

        params = {'analyzer': 'char',
                  'n_features': 8192,
                  'ngram_range': [8, 8],
                  'alternate_sign': False}
        hv = HashingVectorizer(**params)
        unchunked = hv.fit_transform(X)

        for chunk_size in (-1, 3, 13):
            chv = ChunkedHashingVectorizer(chunk_size=chunk_size, **params)
            chunked = chv.fit_transform(X)
            for x1, x2 in zip(chunked, unchunked):
                self.assertTrue((x1.todense() == x2.todense()).all())
Example #23
0
    def test_dummy_analyzer(self):
        X, X_rdd = self.make_text_rdd()

        def splitter(x):
            return x.split()
        X = list(map(splitter, X))
        X_rdd = X_rdd.map(lambda x: list(map(splitter, x)))

        local = HashingVectorizer(analyzer=lambda x: x)
        dist = SparkHashingVectorizer(analyzer=lambda x: x)

        result_local = local.transform(X).toarray()
        result_dist = dist.transform(X_rdd).toarray()
        assert_array_equal(result_local, result_dist)

        result_local = local.fit_transform(X).toarray()
        result_dist = dist.fit_transform(X_rdd).toarray()
        assert_array_equal(result_local, result_dist)
    def vectorize(self, wsl):
        print("loading wiki documents dataset")
        # wsl = WikiSampleLoader()
        data = wsl.load_dataset()
        self._cluster_list = data.target_names
        self._labels = data.target
        print("%d documents" % len(data.data))
        print("%d categories" % len(data.target_names))
        print()
        print("Extracting features from the training dataset using a sparse vectorizer")
        t0 = time()
        if self._use_hashing:
            if self._use_idf:
                # Perform an IDF normalization on the output of HashingVectorizer
                hasher = HashingVectorizer(
                    n_features=self._n_features,
                    stop_words=self._stop_words,
                    non_negative=self._non_negative,
                    norm=self._norm,
                    binary=self._binary,
                )
                vectorizer = make_pipeline(hasher, TfidfTransformer())
            else:
                vectorizer = HashingVectorizer(
                    n_features=self._n_features,
                    stop_words=self._stop_words,
                    non_negative=self._non_negative,
                    norm="l2",
                    binary=self._binary,
                )
        else:
            vectorizer = TfidfVectorizer(
                max_df=self._max_df,
                max_features=self._n_features,
                min_df=self._min_df,
                stop_words=self._stop_words,
                use_idf=self._use_idf,
            )
        self._X = vectorizer.fit_transform(data.data)
        self._vectorizer = vectorizer

        print("done in %fs" % (time() - t0))
        print("n_samples: %d, n_features: %d" % self._X.shape)
        print()
Example #25
0
class svm_text(SVC):
#    svm_ = SVC(C=500, kernel='poly', gamma=.01, shrinking=True, probability=False, degree= 10, coef0=2,
#        tol=0.001, cache_size=20000, class_weight=None, verbose=False, max_iter=-1)
    def __init__(self, train_data, C=5, kernel='poly', gamma=.001, degree=10, coef0=2, n_features=10000000,
                 ngram_range=(1, 10), tfidf=False, dfrange=(2, 1.0), probability=False, class_weight=None):
        self.conn = None
        self.is_tfidf = tfidf
        if tfidf:
            self.vectorizer = TfidfVectorizer(stop_words=None, min_df=dfrange[0], max_df=dfrange[1],
                                              max_features=n_features, strip_accents='unicode',
                                              ngram_range=ngram_range, analyzer='word', norm='l2')
        else:
            self.vectorizer = HashingVectorizer(stop_words=None, non_negative=True,
                                                n_features=n_features, strip_accents='unicode',
                                                ngram_range=ngram_range, analyzer='word', norm='l2')
        self.param_set = {'C': str(C), 'kernel': str(kernel), 'gamma': str(gamma),
                          'degree': str(degree), 'coef0': str(coef0), 'n_features': str(n_features)}
        if class_weight == 'auto':
            class_weight = {}
            for item in train_data.target:
                if class_weight.get(item):
                    class_weight.update({item: class_weight[item] + 1.0})
                else:
                    class_weight.update({item: 1.0})
            for key in class_weight:
                class_weight.update({key: 1.0 / class_weight[key]})
        self.class_weight_dict = class_weight
        super(svm_text, self).__init__(C=C, kernel=kernel, gamma=gamma, shrinking=True, probability=probability, degree=degree, coef0=coef0,
                                       tol=0.001, cache_size=20000, class_weight=class_weight, verbose=False, max_iter=-1)
        if self.is_tfidf:
            train_x = self.vectorizer.fit_transform(train_data.data)
        else:
            train_x = self.vectorizer.transform(train_data.data)
        self.fit(train_x, train_data.target)
    def test_data(self, test_data):
        test_x = self.vectorizer.transform(test_data.data)
        predicted_values = self.predict(test_x)
        test_y = test_data.target
        self.score = metrics.f1_score(test_y, predicted_values)
        self.accuracy = metrics.accuracy_score(test_y, predicted_values)
    def guess_text(self, text_text):
        text_x = self.vectorizer.transform([pre_proc(text_text, removestop=False, alwayskeep=True, word_punc=True, unquote=True),])
        return self.predict(text_x)
def build_sentiment_classifier(X, y, bids, dates):
	'''
	Train and pickle the sentiment classifier
	'''
	n_train_samples = y.shape[0]

	tfidf = HashingVectorizer(tokenizer=word_tokenize, stop_words='english', \
		ngram_range=(1, 3), n_features=10000)
	X_tfidf = tfidf.fit_transform(X)#.todense()
	'''
	X1 = X[:n_train_samples]
	X2 = X[n_train_samples:]
	'''
	X1_tfidf = X_tfidf[:n_train_samples, :]
	X2_tfidf = X_tfidf[n_train_samples:, :]

	# Uncomment the section below to enable Grid Search for optimal parameter
	# search

	'''
	clf_SVM = Pipeline([('clf_SVM',	LinearSVC())])

	params = {
          'clf_SVM__C': [0.01, 0.5, 1, 10],
          'clf_SVM__tol': [1e-2, 1e-3, 1e-4],
          'clf_SVM__dual': [True, False]
          }

	gs = GridSearchCV(clf_SVM, params, cv=5, scoring='f1')
	
	gs.fit(X1_tfidf, y)
	print gs.best_score_
	print gs.best_estimator_.get_params()
	'''

	clf_SVM = LinearSVC(C=0.5, tol=1e-2, dual=False)

	clf_SVM.fit(X1_tfidf, y)

	y2 = clf_SVM.predict(X2_tfidf)
	y2 = np.vstack((dates, bids, y2))

	return y2
Example #27
0
def calssify(text):
    # Multinomial Naive Bayes Classifier
    clf = MultinomialNB()
    clf = joblib.load('model/'+str(type(clf))[8:-2]+'.model')


    with open('dict/stopwords.txt', 'r') as f:
        stopwords = set([w.strip() for w in f])
    v = HashingVectorizer(non_negative=True, stop_words=stopwords, n_features=30000)

    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    text = ' '.join(jieba.cut(text, cut_all=False))
    text = re.sub(u'[$^()-=~!@#¥%……&*()——+·{}|:“”《》?【】、;‘’,。、]+', u'', text)

    text = text.encode('utf-8')
    test_data = v.fit_transform([text])
    pred = clf.predict(test_data)
    return pred[0][0]
def extractFeatures():

    print("Extracting features from the training dataset using a sparse vectorizer")

    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', 
                                       non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, 
                                     max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train.data)
    
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()


    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()


    if opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" %
              opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        print("done in %fs" % (time() - t0))
        print()

    return X_train, X_test
Example #29
0
def main():
    vectorizer = HashingVectorizer(stop_words="english", binary=True,
            tokenizer=lambda text: text.split(),
            token_pattern=r"(?u)\b(?:\w|\?)(?:\w|\?)+\b",
            ngram_range=(1,2))
    (X, Y) = get_train()
    (Xcv, Ycv) = get_cv()
    Xt = vectorizer.fit_transform(X)
    #selector = SelectPercentile(f_classif, percentile=40)
    #Xtt = selector.fit_transform(Xt, Y)
    Xtt = Xt
    Xcvt = vectorizer.transform(Xcv)
    #Xcvtt = selector.transform(Xcvt)
    Xcvtt = Xcvt
    #model = LinearSVC()
    #model = SVC(kernel='rbf', gamma=1.0, cache_size=1000)
    model = MultinomialNB(fit_prior=False)
    model.fit(Xtt, Y)
    Pcv = model.predict(Xcvtt)
    print_stats(Ycv, Pcv)
Example #30
0
class svm_multi_label_text(OneVsRestClassifier):
#    svm_ = SVC(C=500, kernel='poly', gamma=.01, shrinking=True, probability=False, degree= 10, coef0=2,
#        tol=0.001, cache_size=20000, class_weight=None, verbose=False, max_iter=-1)
    def __init__(self, train_data,  C=None, n_features=10000000, loss='l2', penalty='l1',
                 ngram_range=(1, 10), tfidf=False, dfrange=(2, 1.0), dual=True, tol=1e-4):
        self.conn = None
        self.is_tfidf = tfidf
        if tfidf:
            self.vectorizer = TfidfVectorizer(stop_words=None, min_df=dfrange[0], max_df=dfrange[1],
                                              max_features=n_features, strip_accents='unicode',
                                              ngram_range=ngram_range, analyzer='word')
        else:
            self.vectorizer = HashingVectorizer(stop_words=None, non_negative=True,
                                                n_features=n_features, strip_accents='unicode',
                                                ngram_range=ngram_range, analyzer='word')
        self.param_set = {'C': str(), 'kernel': str(), 'gamma': str(),
                          'degree': str(), 'coef0': str(), 'n_features': str(n_features)}
        super(svm_multi_label_text, self).__init__(LinearSVC(C=C, loss=loss, penalty=penalty,
                                                             dual=(False if penalty == 'l1' else dual), tol=tol))
        if self.is_tfidf:
            train_x = self.vectorizer.fit_transform(train_data.data)
        else:
            train_x = self.vectorizer.transform(train_data.data)
        train_y = train_data.target
        self.fit(train_x, train_y)
    def test_data(self, test_data):
        test_x = self.vectorizer.transform(test_data.data)
        predicted_values = self.predict(test_x)
        test_y = test_data.target
        try:
            self.score = metrics.f1_score(test_y, predicted_values)
        except ZeroDivisionError:
            self.score = -0.1
        try:
            self.accuracy = metrics.accuracy_score(test_y, predicted_values)
        except ZeroDivisionError:
            self.accuracy = -0.1
    def guess_text(self, text_text):
        text_x = self.vectorizer.transform([pre_proc(text_text, removestop=False, alwayskeep=True, word_punc=True, unquote=True),])
        return self.predict(text_x)
Example #31
0
from sklearn.feature_extraction.text import HashingVectorizer
from bin2op import parse, unique, counts, nextIndex
import numpy as np
import math
import sys
np.set_printoptions(threshold=sys.maxsize)

file = './a.exe'
syntax = "intel"
shellcode, code, opcodes, operands, instructions = parse(file, syntax, None)

sentences = instructions
ops = unique(operands + opcodes)
ops.sort()
unique_ops_count = len(ops)

vectorizer = HashingVectorizer(norm=None, n_features=unique_ops_count)
sentence_vectors = vectorizer.fit_transform(sentences)
vector2array = sentence_vectors.toarray()
arr = np.array(vector2array)
print(arr[0:3])
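Setting n_features to the exact number of unique tokens (as above) makes hash collisions quite likely, so several operands can end up sharing a column. A small check of that effect, with a made-up token list standing in for unique(operands + opcodes):

from sklearn.feature_extraction.text import HashingVectorizer

ops = ['mov', 'push', 'pop', 'call', 'ret', 'eax', 'ebx', 'esp']  # stand-in token list
vectorizer = HashingVectorizer(norm=None, n_features=len(ops))

# Hash each token on its own and record which column it lands in.
columns = {vectorizer.transform([op]).nonzero()[1][0] for op in ops}
print(len(ops), "tokens ->", len(columns), "distinct columns")  # fewer columns = collisions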
Example #32
0
# Build tokenizer (removes upper case )
tokenizer = TweetTokenizer(preserve_case=False,
                           reduce_len=True,
                           strip_handles=False)
# Make a callable function for the vectorizer
tok_func = lambda s: tokenizer.tokenize(s)

#############################
# VECTORIZER and CLASSIFIER #
#############################

vectorizer = HashingVectorizer(tokenizer=tok_func, ngram_range=(1, 1))

# Vectorize the tweets
train_vectors = vectorizer.fit_transform(train_tweets)
dev_vectors = vectorizer.transform(dev_tweets)
test_vectors = vectorizer.transform(test_tweets)

# Add lexicon information
train_vectors = hstack((train_vectors, train_polarities))
dev_vectors = hstack((dev_vectors, dev_polarities))
test_vectors = hstack((test_vectors, test_polarities))

classifier = LinearSVC(C=0.1)

#########
# TRAIN #
#########

classifier.fit(train_vectors, train_labels)
Example #33
0
import codecs
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.cluster import KMeans

fr = open('weibo_fenci_result.txt', 'r', encoding='utf-8')
id_list = []
data_list = []
for line in fr.readlines():
    term = line.strip().split("\t")
    if len(term) == 2 and term[1] != " ":
        id_list.append(term[0])
        data_list.append(term[1])
hv = HashingVectorizer(n_features=10000, non_negative=True)
post_tfidf = hv.fit_transform(data_list)
print('Size of fea_train:' + repr(post_tfidf.shape))
print(post_tfidf.nnz)
print("tfidf has done!!!")

id = id_list
tfidf_vec = post_tfidf
kmean = KMeans(n_clusters=300)
kmean.fit(tfidf_vec)
pred = kmean.predict(tfidf_vec)
print(pred)
fo = open("cluster.txt", "a+", encoding="utf-8")
count = 0
for i in range(len(pred)):
    count += 1
    fo.write(id[i] + "\t" + str(pred[i]) + "\n")
Example #34
0
# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    vectorizer = HashingVectorizer(stop_words='english',
                                   non_negative=True,
                                   n_features=opts.n_features)
    X_train = vectorizer.transform(data_train.data)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

# mapping from integer feature name to original token string
if opts.use_hashing:

# Import HashingVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# Get text data: text_data
text_data = combine_text_columns(X_train)

# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Instantiate the HashingVectorizer: hashing_vec
hashing_vec = HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC)

# Fit and transform the Hashing Vectorizer
hashed_text = hashing_vec.fit_transform(text_data)

# Create DataFrame and print the head
hashed_df = pd.DataFrame(hashed_text.data)
print(hashed_df.head())

# Import the hashing vectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# Instantiate the winning model pipeline: pl
pl = Pipeline([
    ('union',
     FeatureUnion(transformer_list=[
         ('numeric_features',
          Pipeline([('selector', get_numeric_data), ('imputer', Imputer())])),
         ('text_features',
Example #36
0
File: example.py Project: sevengram/ml-hw
                        default=False,
                        action='store_true',
                        help="Use dictionary features")
    parser.add_argument('--limit',
                        default=-1,
                        type=int,
                        help="How many sentences to use")

    flags = parser.parse_args()

    analyzer = Analyzer(flags.word, flags.all_before, flags.all_after,
                        flags.one_before, flags.one_after, flags.characters,
                        flags.dictionary)
    vectorizer = HashingVectorizer(analyzer=analyzer)

    x_train = vectorizer.fit_transform(
        ex for ex, tgt in all_examples(flags.limit))
    x_test = vectorizer.fit_transform(
        ex for ex, tgt in all_examples(flags.limit, train=False))

    for ex, tgt in all_examples(1):
        print(" ".join(analyzer(ex)))

    y_train = array(list(tgt for ex, tgt in all_examples(flags.limit)))
    y_test = array(
        list(tgt for ex, tgt in all_examples(flags.limit, train=False)))

    lr = SGDClassifier(loss='log', penalty='l2', shuffle=True)
    lr.fit(x_train, y_train)

    print("TRAIN\n-------------------------")
    accuracy(lr, x_train, y_train, all_examples(flags.limit))
Example #37
0
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        hasher = HashingVectorizer(n_features=opts.n_features,
                                   stop_words='english', alternate_sign=False,
                                   norm=None, binary=False)
        vectorizer = make_pipeline(hasher, TfidfTransformer())
    else:
        vectorizer = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       alternate_sign=False, norm='l2',
                                       binary=False)
else:
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                 min_df=2, stop_words='english',
                                 use_idf=opts.use_idf)
X = vectorizer.fit_transform(dataset.data)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

if opts.n_components:
		print("Performing dimensionality reduction using LSA")
		t0 = time()
		# Vectorizer results are normalized, which makes KMeans behave as
		# spherical k-means for better results. Since LSA/SVD results are
		# not normalized, we have to redo the normalization.
		svd = TruncatedSVD(opts.n_components)
		normalizer = Normalizer(copy=False)
		lsa = make_pipeline(svd, normalizer)
Example #38
0
cosine_list = clean_concat(lm_total)
cosine_list['concat_data'] = cosine_list['concat_data'].str.lower()

del lm_total
#
# a = cosine_list.concat_data.str.split(expand=True).stack().value_counts().head(2000)
# # keep the stop-words, not the digits
# ind_word_a = [ind for ind in a.index if ~ind.isdigit()]
# a = a[a.index.isin(ind_word_a)]
'''CONCAT_DATA  '''
# vectorizer = TfidfVectorizer(analyzer=ngrams, min_df=1)
stop_words = get_stop_words('de')
vectorizer = HashingVectorizer(stop_words=stop_words)

vector_1 = vectorizer.fit_transform(cosine_sign_ups.concat_data)
vector_2 = vectorizer.fit_transform(cosine_list.concat_data)
t1 = time.time()
matches = awesome_cossim_top(vector_1, vector_2.transpose(), 1, 0.1)
t = time.time() - t1
print("SELFTIMED:", t)

matches_df = get_matches_df(matches,
                            name_vector_1=cosine_sign_ups,
                            name_vector_2=cosine_list,
                            col_name='concat_data',
                            top=cosine_list.shape[0])
matches_df.sort_values('similarity', inplace=True)
''''''
matches_df = pd.read_csv('matches_df_left_overs.csv')
matches_df = matches_df.query('similarity > 0.35')
Example #39
0
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        hasher = HashingVectorizer(n_features=opts.n_features,
                                   stop_words='english', non_negative=True,
                                   norm=None, binary=False)
        vectorizer = make_pipeline(hasher, TfidfTransformer())
    else:
        vectorizer = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       non_negative=False, norm='l2',
                                       binary=False)
else:
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                 min_df=2, stop_words='english',
                                 use_idf=opts.use_idf)
X = vectorizer.fit_transform(corpus)

# print(X)


print("done in %fs" % (time() - t0))
# n_samples: how many articles are there
# n_features: how many different words in all articles are there
print("n_samples: %d, n_features: %d" % X.shape)
print()

if opts.n_components:
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are
Example #40
0
class Reddit:
    # initialize class variables
    text_matrix = None
    unstemmed_text_matrix = None
    vectorized_text_matrix = None
    text_matrix_reduced = None
    sub_list = None
    sub_to_index = None
    index_to_sub = None
    commonality_matrix = None

    def __init__(self, from_db=True, encoding_type='tfidf', distance_method='cosine'):
        # check if everything already exists
        # get it from the db or from the web if not
        try:
            print("Checking if information is available...", end="")
            Reddit.text_matrix = np.load('text_matrix.npy')
            Reddit.unstemmed_text_matrix = np.load('unstemmed_text_matrix.npy')
            Reddit.sub_list = pickle.load(open("sub_list.p", "rb"))
            Reddit.sub_to_index = pickle.load(open("sub_to_index.p", "rb"))
            print("Done.\n")

        except FileNotFoundError:
            print("Not available.\n")
            print("Loading from database.\n")
            print("This will take a minute...\n")

            if from_db:
                Reddit.text_matrix, Reddit.sub_list, Reddit.sub_to_index = self.data_from_db()

            else:
                Reddit.text_matrix, Reddit.sub_list, Reddit.sub_to_index = self.data_from_scrape()

        Reddit.index_to_sub = {value: key for key, value in Reddit.sub_to_index.items()}

        if encoding_type == 'tfidf':
            self.vectorizer = TfidfVectorizer()

        elif encoding_type == 'count':
            self.vectorizer = CountVectorizer()

        elif encoding_type == 'hash':
            self.vectorizer = HashingVectorizer()

        if distance_method == 'cosine':
            self.distance = self.cosine_distance

        try:
            Reddit.vectorized_text_matrix = np.load('vectorized_text_matrix.npy')
            Reddit.text_matrix_reduced = np.load('text_matrix_reduced.npy')

        except FileNotFoundError:
            print("Vectorizing and reducing text matrix.\n")
            Reddit.vectorized_text_matrix, Reddit.text_matrix_reduced = self.process_text_matrix()
            print("Done\n")

        # check if the commonality matrix already exists, build it if it doesn't
        try:
            Reddit.commonality_matrix = np.load('commonality_matrix.npy')

        except FileNotFoundError:
            Reddit.commonality_matrix = self.build_matrix()

    def data_from_db(self):
        """
        get subreddit corpus from database reddit.db

        :return:
        text_matrix: matrix of text in subreddits. rows are subreddits.
        sub_list: list of subreddits included in the matrix
        sub_to_index: dictionary for converting from subreddit name to index in the matrix
        """

        sub_list = []
        text_matrix = []
        unstemmed_text_matrix = []  # used for word cloud later

        connecting_to_db = True

        sql_command = "SELECT subreddit, GROUP_CONCAT(body, ' ') as all_comments FROM comments GROUP BY subreddit"

        while connecting_to_db:
            try:
                print("Connecting to DB.\n")
                pwd = os.getcwd()
                db_conn = sqlite3.connect(pwd + '/../db/reddit.db')
                c = db_conn.cursor()
                results = c.execute(sql_command)

            except sqlite3.OperationalError:
                print("Table does not exist yet. Creating from CSV.\n")
                create_db(db_conn)
                continue

            print("Done.")

            break

        english_stop_words = stopwords.words('english')

        r = praw.Reddit(user_agent='daniel_scraper')

        for i, row in enumerate(list(results)):
            print("Loading subreddit {}: {}....".format(i, row[0]), end="")

            '''
            try:
                if r.get_subreddit(row[0]).subscribers < 50000:
                    print("Done")
                    continue

            except:
                print("Something went wrong. Continuing.")
                continue
            '''

            sub_list.append(row[0].lower())
            text_matrix.append(process_text(row[1], punctuation, english_stop_words))

            unstemmed_text_matrix.append(process_text(row[1], punctuation, english_stop_words, stem=False))

            print("Done")

        sub_to_index = {sub_name: index for sub_name, index in zip(sub_list, range(len(sub_list)))}

        print("Done.\n")

        text_matrix = np.array(text_matrix)
        unstemmed_text_matrix = np.array(unstemmed_text_matrix)

        np.save('unstemmed_text_matrix.npy', unstemmed_text_matrix)
        np.save('text_matrix.npy', text_matrix)
        pickle.dump(sub_list, open("sub_list.p", "wb"))
        pickle.dump(sub_to_index, open("sub_to_index.p", "wb"))

        return text_matrix, sub_list, sub_to_index

    def data_from_scrape(self):
        """
        get subreddit corpus from web scrape if database is not available

        :return:
        text_matrix: matrix of text in subreddits. rows are subreddits.
        sub_list: list of subreddits included in the matrix.
        sub_to_index: dictionary for converting from subreddit name to index in the matrix.
        """

        text_matrix = []

        response = requests.get('http://redditlist.com/sfw')

        sub_list = re.findall('/r/(\w+)\\\'', response.text)
        sub_list = set(sub_list)

        r = praw.Reddit(user_agent='daniel_scraper')

        for sub in self.sub_list:
            if r.get_subreddit(sub).subscribers < 50000:
                self.sub_list.pop(sub)

        sub_list = list(sub_list)

        for sub in sub_list:
            # instantiate string of submission and comments for this specific subreddit
            this_subs_submissions = ''
            this_subs_comments = ''

            submissions = r.get_subreddit(sub).get_hot(limit=25)  # get the top 25 submissions

            for submission in submissions:
                this_subs_submissions += " "
                this_subs_submissions += submission.title.lower()  # add submission to all submissions

                for comment in submission.comments:
                    this_subs_comments += " "
                    this_subs_comments += comment.body.lower()  # add comment to all comments

            text_matrix.append(this_subs_submissions + this_subs_comments)

        text_matrix = np.array(text_matrix)
        sub_to_index = {sub_name: index for sub_name, index in zip(sub_list, range(len(sub_list)))}

        np.save('text_matrix.npy', text_matrix)

        return text_matrix, sub_list, sub_to_index

    def process_text_matrix(self, n_components=100):
        """
        :param n_components: number of singular values to retain
        :return: reduced dimension text matrix using truncated SVD
        """

        vectorized_text_matrix = self.vectorizer.fit_transform(self.text_matrix)
        reducer = TruncatedSVD(n_components=n_components)
        text_matrix_reduced = reducer.fit_transform(vectorized_text_matrix)

        np.save('vectorized_text_matrix.npy', vectorized_text_matrix)
        np.save('text_matrix_reduced.npy', text_matrix_reduced)

        return vectorized_text_matrix, text_matrix_reduced

    @staticmethod
    def cosine_distance(vec1, vec2):
        """
        :param vec1: 1D numpy array
        :param vec2: 1D numpy array
        :return: cosine distance between the two vectors
        """

        # confirm they're numpy arrays
        vec1 = np.array(vec1)
        vec2 = np.array(vec2)

        return vec1.dot(vec2)/(np.linalg.norm(vec1)*np.linalg.norm(vec2))

    def build_matrix(self):
        """
        :return: Reddit "commonality matrix" C

        C[i,j] corresponds to the similarity between subreddit i and subreddit j
        Distance measure is a parameter to the class, defaults to cosine distance
        """

        # initialize a commonality matrix
        commonality_matrix = np.zeros((Reddit.text_matrix.shape[0], Reddit.text_matrix.shape[0]))

        for i in range(len(commonality_matrix)):
            for j in range(i, len(commonality_matrix)):
                commonality = self.distance(Reddit.text_matrix_reduced[i], Reddit.text_matrix_reduced[j])

                commonality_matrix[i, j] = commonality

            commonality_matrix[(i+1):, i] = commonality_matrix[i, (i+1):]

        # save commonality matrix for later use
        np.save('commonality_matrix.npy', commonality_matrix)

        return commonality_matrix
Example #41
0
 if x < -0.05:
  return 0
 elif -0.05 < x < 0.05:
  return 1
 else :
  return 2

#Labeling based on returned values:
data_df['label_stemmed'] = data_df['sentiment_stemmed'].apply(lambda x: convert(x['compound']))
#importing HashingVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split
#hashing vectorization
X= data_df['tweet_stemmed']
hashing_vectorizer = HashingVectorizer(stop_words = 'english',alternate_sign= False)
hash_stem = hashing_vectorizer.fit_transform(X)
y= data_df['label_stemmed']
#print("Data vectorized")

#vectorization time
Vectorizing_time = time.time()
#print("Vectorizing_time :",Vectorizing_time - start_time)

#train and test set formed
hashing_trainset = hash_stem[:319685, :]
hashing_testset  = hash_stem[319685:,:]
x_train, x_test , y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
x_train = hashing_trainset[y_train.index]
x_test= hashing_trainset[y_test.index]
print("Data split into train and test set")
take = .98
filtered_df = films[
    films['opening_wknd'] < films['opening_wknd'].quantile(.97)].reset_index(
        drop=True)
filtered_df = filtered_df[
    filtered_df['pct_profit'] < filtered_df['pct_profit'].quantile(take)]
filtered_df = filtered_df[filtered_df['pct_profit'] > filtered_df['pct_profit']
                          .quantile(1 - take)].reset_index(drop=True)

#******this filters the films we suspect reported bad data**********
# filtered_df = filtered_df.drop(filtered_df[(filtered_df['budget'] != filtered_df['opening_wknd']) &
#                                            (filtered_df['budget'] < 150000)].index).reset_index(drop=True)

# Make the vector from the strings
vectorizer = HashingVectorizer(n_features=1000)
vector = vectorizer.fit_transform(filtered_df['train_string'].to_numpy())
vec_df = pd.DataFrame.sparse.from_spmatrix(vector)

#make dummies from our curated columns
dum = pd.get_dummies(filtered_df[[
    'release_month', 'actor1_class', 'actor2_class', 'actor3_class', 'rating'
]])  #'actor1_class', 'actor2_class', 'actor3_class',
dum.head(1)

#pull the columns we want from the main DF
use_cols = filtered_df[[
    'budget', 'action', 'adventure', 'animated', 'biography', 'drama',
    'documentary', 'comedy', 'crime', 'fantasy', 'family', 'musical', 'horror',
    'war', 'mystery', 'sci-fi', 'thriller', 'romance'
]]
print()

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    print('using hashing...')
    vectorizer = HashingVectorizer(non_negative=True,
                                   n_features=opts.n_features,
                                   tokenizer=jieba_tokenizer)
    X_train = vectorizer.transform(x_train)
else:
    print('using tfidf...')
    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.5,
                                 tokenizer=jieba_tokenizer)
    X_train = vectorizer.fit_transform(x_train)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(x_test)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

# mapping from integer feature name to original token string
if opts.use_hashing:
Example #44
0
def plot():

    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    
    
    # parse commandline arguments
    op = OptionParser()
    op.add_option("--report",
                  action="store_true", dest="print_report",
                  help="Print a detailed classification report.")
    op.add_option("--chi2_select",
                  action="store", type="int", dest="select_chi2",
                  help="Select some number of features using a chi-squared test")
    op.add_option("--confusion_matrix",
                  action="store_true", dest="print_cm",
                  help="Print the confusion matrix.")
    op.add_option("--top10",
                  action="store_true", dest="print_top10",
                  help="Print ten most discriminative terms per class"
                       " for every classifier.")
    op.add_option("--all_categories",
                  action="store_true", dest="all_categories",
                  help="Whether to use all categories or not.")
    op.add_option("--use_hashing",
                  action="store_true",
                  help="Use a hashing vectorizer.")
    op.add_option("--n_features",
                  action="store", type=int, default=2 ** 16,
                  help="n_features when using the hashing vectorizer.")
    op.add_option("--filtered",
                  action="store_true",
                  help="Remove newsgroup information that is easily overfit: "
                       "headers, signatures, and quoting.")
    
    (opts, args) = op.parse_args()
    if len(args) > 0:
        op.error("this script takes no arguments.")
        sys.exit(1)
    
    print(__doc__)
    op.print_help()
    print()
    
    
    ###############################################################################
    # Load some categories from the training set
    if opts.all_categories:
        categories = None
    else:
        categories = [
            'alt.atheism',
            'talk.religion.misc',
            'comp.graphics',
            'sci.space',
        ]
    
    if opts.filtered:
        remove = ('headers', 'footers', 'quotes')
    else:
        remove = ()
    
    print("Loading 20 newsgroups dataset for categories:")
    print(categories if categories else "all")
    
    data_train = fetch_20newsgroups(subset='train', categories=categories,
                                    shuffle=True, random_state=42,
                                    remove=remove)
    
    data_test = fetch_20newsgroups(subset='test', categories=categories,
                                   shuffle=True, random_state=42,
                                   remove=remove)
    print('data loaded')
    
    categories = data_train.target_names    # for case categories == None
    
    
    def size_mb(docs):
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6
    
    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)
    
    print("%d documents - %0.3fMB (training set)" % (
        len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (
        len(data_test.data), data_test_size_mb))
    print("%d categories" % len(categories))
    print()
    
    # split a training set and a test set
    y_train, y_test = data_train.target, data_test.target
    
    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()
    
    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()
    
    if opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" %
              opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        print("done in %fs" % (time() - t0))
        print()
    
    
    def trim(s):
        """Trim string to fit on terminal (assuming 80-column display)"""
        return s if len(s) <= 80 else s[:77] + "..."
    
    
    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = np.asarray(vectorizer.get_feature_names())
    
    
    ###############################################################################
    # Benchmark classifiers
    def benchmark(clf):
        print('_' * 80)
        print("Training: ")
        print(clf)
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)
    
        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time:  %0.3fs" % test_time)
    
        score = metrics.f1_score(y_test, pred)
        print("f1-score:   %0.3f" % score)
    
        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))
    
            if opts.print_top10 and feature_names is not None:
                print("top 10 keywords per class:")
                for i, category in enumerate(categories):
                    top10 = np.argsort(clf.coef_[i])[-10:]
                    print(trim("%s: %s"
                          % (category, " ".join(feature_names[top10]))))
            print()
    
        if opts.print_report:
            print("classification report:")
            print(metrics.classification_report(y_test, pred,
                                                target_names=categories))
    
        if opts.print_cm:
            print("confusion matrix:")
            print(metrics.confusion_matrix(y_test, pred))
    
        print()
        clf_descr = str(clf).split('(')[0]
        return clf_descr, score, train_time, test_time
    
    
    results = []
    for clf, name in (
            (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
            (Perceptron(n_iter=50), "Perceptron"),
            (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
            (KNeighborsClassifier(n_neighbors=10), "kNN")):
        print('=' * 80)
        print(name)
        results.append(benchmark(clf))
    
    for penalty in ["l2", "l1"]:
        print('=' * 80)
        print("%s penalty" % penalty.upper())
        # Train Liblinear model
        results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,
                                                dual=False, tol=1e-3)))
    
        # Train SGD model
        results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                               penalty=penalty)))
    
    # Train SGD with Elastic Net penalty
    print('=' * 80)
    print("Elastic-Net penalty")
    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                           penalty="elasticnet")))
    
    # Train NearestCentroid without threshold
    print('=' * 80)
    print("NearestCentroid (aka Rocchio classifier)")
    results.append(benchmark(NearestCentroid()))
    
    # Train sparse Naive Bayes classifiers
    print('=' * 80)
    print("Naive Bayes")
    results.append(benchmark(MultinomialNB(alpha=.01)))
    results.append(benchmark(BernoulliNB(alpha=.01)))
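    # note: MultinomialNB expects non-negative feature values, which is why the hashing
    # vectorizer above is constructed with non_negative=True when hashing is enabled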
    
    
    class L1LinearSVC(LinearSVC):
    
        def fit(self, X, y):
            # The smaller C, the stronger the regularization.
            # The more regularization, the more sparsity.
            self.transformer_ = LinearSVC(penalty="l1",
                                          dual=False, tol=1e-3)
            X = self.transformer_.fit_transform(X, y)
            return LinearSVC.fit(self, X, y)
    
        def predict(self, X):
            X = self.transformer_.transform(X)
            return LinearSVC.predict(self, X)
    
    print('=' * 80)
    print("LinearSVC with L1-based feature selection")
    results.append(benchmark(L1LinearSVC()))
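    # A sketch of an alternative (not part of the original script): SelectFromModel can play the
    # role of the custom L1LinearSVC wrapper above. The pipeline below is illustrative and simply
    # reuses the X_train/X_test already built in this script.
    from sklearn.pipeline import Pipeline
    from sklearn.feature_selection import SelectFromModel

    print('=' * 80)
    print("LinearSVC with L1-based feature selection (SelectFromModel sketch)")
    results.append(benchmark(Pipeline([
        ('select', SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3))),
        ('clf', LinearSVC())
    ])))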
    
    
    # make some plots
    
    indices = np.arange(len(results))
    
    results = [[x[i] for x in results] for i in range(4)]
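    # results now holds four parallel lists (classifier name, score, train time, test time);
    # below, the times are scaled by the slowest run so all three bars share one axis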
    
    clf_names, score, training_time, test_time = results
    training_time = np.array(training_time) / np.max(training_time)
    test_time = np.array(test_time) / np.max(test_time)
    
    pl.figure(figsize=(12,8))
    pl.title("Score")
    pl.barh(indices, score, .2, label="score", color='r')
    pl.barh(indices + .3, training_time, .2, label="training time", color='g')
    pl.barh(indices + .6, test_time, .2, label="test time", color='b')
    pl.yticks(())
    pl.legend(loc='best')
    pl.subplots_adjust(left=.25)
    pl.subplots_adjust(top=.95)
    pl.subplots_adjust(bottom=.05)
    
    for i, c in zip(indices, clf_names):
        pl.text(-.3, i, c)

    pl.show()
예제 #45
0
### Most real
sorted(zip(clf.coef_[0], feature_names), reverse=True)[:20]

### Most fake
sorted(zip(clf.coef_[0], feature_names))[:20]
# note: several of the top "fake" features (such as "corporate" and "establishment") hint at the articles' political slant and source

tokens_with_weights = sorted(list(zip(feature_names, clf.coef_[0])))
#print(tokens_with_weights)

#--------------------------------------------------------------
# HashingVectorizer: requires less memory and is faster (its output is sparse and it hashes tokens to columns instead of storing a vocabulary)
#--------------------------------------------------------------


hash_vectorizer = HashingVectorizer(stop_words='english', non_negative=True)
hash_train = hash_vectorizer.fit_transform(X_train)
hash_test = hash_vectorizer.transform(X_test)
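# An illustrative aside (not from the original code): HashingVectorizer is stateless, so
# transform works without any prior fit and fit_transform returns exactly the same matrix;
# nothing about the training vocabulary is kept in memory.
_demo_hash = HashingVectorizer(n_features=2 ** 10)
_a = _demo_hash.transform(['the quick brown fox'])      # no fit needed
_b = _demo_hash.fit_transform(['the quick brown fox'])  # identical result
assert (_a.toarray() == _b.toarray()).all()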

#--------------------------------------------------------------
# Naive Bayes classifier for Multinomial model 
#-------------------------------------------------------------- 

clf = MultinomialNB(alpha=.01)

clf.fit(hash_train, y_train)
pred = clf.predict(hash_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
print(cm)
예제 #46
0
            if word != ' ' and word not in stopwords:
                words.append(word)
        sentences.append(' '.join(words))
    return sentences


# whatever vocabulary CountVectorizer was fitted with at training time must also be used at test time
#vectorizer = CountVectorizer()
#FLAG='countvectorizer'
#vectorizer = TfidfVectorizer()
#FLAG='tfidfvectorizer'
vectorizer = HashingVectorizer()
FLAG = 'hashingvectorizer'

reviews = pd.read_csv('./data/train.csv')
vectorizer.fit_transform(get_sentences(reviews.review.values, stopwords))
print('Loading model...')
model = joblib.load('./lr_weibo_output/' + 'weibo_lr_' + FLAG + '_model.pkl')
print('Model loaded...')


def predict():
    print('Please enter text:')
    review = str(input())
    if review == 'exit':
        exit(0)
    else:
        try:
            sentences = get_sentences([review], stopwords)
            print(sentences)
            review_ids = vectorizer.transform(sentences)
예제 #47
0
#y.ravel()
#y = np.array([y])
#data_train_size_mb = size_mb(X_train.data)
#data_test_size_mb = size_mb(X_test.data)
print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    vectorizer = HashingVectorizer(stop_words='english',
                                   non_negative=True,
                                   n_features=opts.n_features)
    X_train = vectorizer.transform(bunch.data)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(bunch.data)
#y_train = y_train.reshape(y_train.shape[0],1)
duration = time() - t0
#print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
#y_train = np.array([y_train]).T
#y_train = y_train.reshape(y_train.shape[0],1)
#np.transpose(X_train)  # no-op in the original: the transposed result was never used
print("samples %d features %d" % X_train.shape)
print(X_train.shape)
#print(y_train.shape)

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(bunch.data)
#y_test = y_test.reshape(y_test.shape[0],1)
#y_test = np.array([y_test]).T
예제 #48
0
def vectorize(df_test):
    v = HashingVectorizer()
    train_vectors = v.fit_transform(df_test)
    return train_vectors
예제 #49
0
  (0, 3)	1
  (0, 15)	2
  (0, 4)	1
  (1, 5)	1
  (1, 9)	1
  (1, 2)	1
  (1, 6)	1
  (1, 14)	1
  (1, 3)	1
  (2, 1)	1
  (2, 0)	1
  (2, 12)	1
  (2, 7)	1
  (3, 10)	1
  (3, 8)	1
  (3, 11)	1
  (3, 18)	1
  (3, 17)	1
  (3, 13)	1
  (3, 5)	1
  (3, 6)	1
  (3, 15)	1
  
  In the parentheses on the left, the first number is the document index and the second is the
  token index (token indices are built over all documents); the third number is the term frequency.
'''

# use the hashing trick to reduce dimensionality
vectorizer2 = HashingVectorizer(n_features=6, norm= None)
print(vectorizer2.fit_transform(corpus))
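# An illustrative aside (an assumption, not part of the original example): with only 6 hash
# buckets, distinct tokens can collide into the same column, and depending on the scikit-learn
# version tokens may also receive alternating signs so that collisions partially cancel.
demo_hash = HashingVectorizer(n_features=6, norm=None)
for word in ['apple', 'banana', 'cherry', 'date']:
    col = demo_hash.transform([word]).nonzero()[1][0]
    print('%s -> column %d' % (word, col))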
예제 #50
0
파일: visu.py 프로젝트: fraka6/mlboost
def main(args=None):
    args = args.split(' ') if isinstance(args, str) else args
    args = args or sys.argv[1:]
    import logging
    import numpy as np
    from optparse import OptionParser
    from time import time
    from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
    from sklearn.feature_selection import SelectKBest, chi2
    import dim_reduction as dr

    # Display progress logs on stdout
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')

    # parse commandline arguments
    op = OptionParser()

    op.add_option(
        "--chi2_select",
        default=-1,
        action="store",
        type="int",
        dest="select_chi2",
        help=
        "Select some number of features using a chi-squared test; all set -1")
    op.add_option('-f',
                  "--filename",
                  default="data.tsv",
                  dest="fname",
                  help="data filename")
    op.add_option("-d",
                  "--dataset",
                  default='news',
                  dest="dataset",
                  help="dataset to load (%s)" % list(_load_map.keys()))
    op.add_option('-n',
                  "--n_features",
                  action="store",
                  type=int,
                  default=1000,
                  help="n_features when using the hashing vectorizer.")
    op.add_option("--use_hashing",
                  default=False,
                  action="store_true",
                  help="Use a hashing vectorizer.")
    op.add_option("--hack",
                  default=False,
                  action="store_true",
                  dest="hack",
                  help="use test instead on train to speedup process")
    op.add_option("--no-text",
                  default=True,
                  action="store_false",
                  dest="text",
                  help="features are not text (default = text)")
    op.add_option("--class_sample",
                  default=2,
                  type=int,
                  dest="n_sample_by_class",
                  help="show only [%default%] sample by class")
    op.add_option("--lnob",
                  default=True,
                  action='store_true',
                  dest='legend_outside_box',
                  help="legend not outside of the box")
    op.add_option("--legend",
                  default=False,
                  action='store_true',
                  dest='enable_legend_picking',
                  help='set legend picking not points')
    op.add_option(
        "--noX",
        default=False,
        action='store_true',
        dest='nox',
        help=
        "if you just want to generate graph and don't have acess to the X server "
    )
    op.add_option(
        "-m",
        "--methods",
        default=dr.METHODS,
        dest="methods",
        help="dimension reduction method to try (split by ','); default = %s" %
        dr.METHODS)
    op.add_option("-e",
                  dest='exclude',
                  default=None,
                  help="exclude class (separarated by ,)")
    op.add_option("-o",
                  dest='only',
                  default=None,
                  help="include only class (separarated by ,)")
    op.add_option("-v",
                  dest='verbose',
                  default=False,
                  action='store_true',
                  help="verbose")

    (opts, args) = op.parse_args(args)

    if len(args) > 0:
        op.error("this script takes no arguments.")
        sys.exit(1)

    if opts.nox:
        matplotlib.use('Agg')
    # warning: pylab should be import after call to matplotlib.use(...)
    import pylab

    # load data
    data_train, data_test, legend_labels = _load_map[opts.dataset](opts.fname)

    if opts.hack:
        print("hack: working on test dataset")
        data_train = data_test
        opts.dataset += '_test'

    if opts.verbose:
        print("----------example data loaded--------------")
        print("data:", data_train.data[0].strip())
        print("target:", data_train.target[0])
        print("-------------------------------------------")
    y_train, y_test = data_train.target, data_test.target

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)

    print(("%d documents - %0.3fMB (training set)" %
           (len(data_train.data), data_train_size_mb)))
    print(("%d documents - %0.3fMB (test set)" %
           (len(data_test.data), data_test_size_mb)))

    print(
        "Extracting features from the training dataset using a sparse vectorizer"
    )
    t0 = time()
    if not opts.text:
        print("std features")
        X_train = np.array(data_train.data, ndmin=2)
        features_names = data_train.features

    else:  # its text features dood
        print(
            "features are extracted from text -> words vectorization is required, hey Samu!"
        )
        if opts.use_hashing:
            print(("Use feature hashing %s" % opts.n_features))
            vectorizer = HashingVectorizer(stop_words='english',
                                           non_negative=True,
                                           n_features=opts.n_features)
            X_train = vectorizer.transform(data_train.data)
        else:
            vectorizer = TfidfVectorizer(sublinear_tf=True,
                                         max_df=0.5,
                                         stop_words='english')
            # mapping from integer feature name to original token string
            X_train = vectorizer.fit_transform(data_train.data)
            feature_names = vectorizer.get_feature_names()

    if opts.verbose:
        print("----------example data transformed--------------")
        print("data:", X_train[0])
        print("target:", y_train[0])
        print("-------------------------------------------")

    duration = time() - t0
    print(("done in %fs at %0.3fMB/s" %
           (duration, data_train_size_mb / duration)))
    print(("n_samples: %d, n_features: %d" % X_train.shape))
    print()

    print(
        "Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    if not opts.text:
        X_test = np.array(data_test.data, ndmin=2)
    else:
        X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print(("done in %fs at %0.3fMB/s" %
           (duration, data_test_size_mb / duration)))
    print(("n_samples: %d, n_features: %d" % X_test.shape))
    print()

    if opts.select_chi2 != -1:
        print(("Extracting %d best features by a chi-squared test" %
               opts.select_chi2))
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        print("data:", X_train[0])
        print("target", y_train[0])
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        print(("done in %fs" % (time() - t0)))
        print()

    X = X_train.todense() if "todense" in dir(X_train) else X_train
    X_test = X_test.todense() if "todense" in dir(X_test) else X_test
    print("data shape: (%i,%i)" % (X.shape))

    if opts.only:
        idx = opts.only.split(',')
        X, y_train = filter_classes(X, y_train, idx, False)
        X_test, y_test = filter_classes(X_test, y_test, idx, False)

    if opts.exclude:
        idx = opts.exclude.split(',')
        X, y_train = filter_classes(X, y_train, idx, True)
        X_test, y_test = filter_classes(X_test, y_test, idx, True)

    # run all dim reduction algo
    for method in opts.methods.split(','):
        t0 = time()
        try:
            resdr = dr.dim_reduce(method, X=X, Y=y_train)
            if resdr == None:
                continue
            trans, X_trans, title = resdr
            print(('Projecting {} on test set'.format(method)))
            if hasattr(trans, "transform"):
                X_trans_test = trans.transform(X_test)
            elif hasattr(trans, "fit_transform"):
                warnings.warn(
                    "the method as no transform (fallback to fit_transform",
                    Warning)
                X_trans_test = trans.fit_transform(X_test)
            title = "%s (time %.2fs)" % (title, (time() - t0))
            print(('Rendering plot {}'.format(title)))
            has_plot = dr.plot_embedding(
                X=X_trans_test,
                Y=y_test,
                title=title,
                n_sample_by_class=opts.n_sample_by_class,
                source=data_test.data,
                legend_outside_box=opts.legend_outside_box,
                enable_legend_picking=opts.enable_legend_picking,
                legend_labels=legend_labels)
            if has_plot:
                fname = "%s_%s.png" % (opts.dataset, method)
                print("saving %s" % fname)
                pylab.savefig(fname, bbox_inches=0)
            else:
                print('Nothing to plot.')

        except Exception as ex:
            print(method, ex)
            print(traceback.format_exc())

    pylab.show()
예제 #51
0
]
newsgroup_train = fetch_20newsgroups(subset='train', categories=categories)

#print category names
from pprint import pprint

pprint(list(newsgroup_train.target_names))

#newsgroup_train.data is the original documents, but we need to extract the
#feature vectors in order to model the text data
from sklearn.feature_extraction.text import HashingVectorizer

vectorizer = HashingVectorizer(stop_words='english',
                               non_negative=True,
                               n_features=10000)
fea_train = vectorizer.fit_transform(newsgroup_train.data)
fea_test = vectorizer.fit_transform(newsgroups_test.data)

#return feature vector 'fea_train' [n_samples,n_features]
print 'Size of fea_train:' + repr(fea_train.shape)
print 'Size of fea_test:' + repr(fea_test.shape)
#11314 documents, 130107 vectors for all categories
print 'The average feature sparsity is {0:.3f}%'.format(
    fea_train.nnz / float(fea_train.shape[0] * fea_train.shape[1]) * 100)

#----------------------------------------------------
#method 1:CountVectorizer+TfidfTransformer
print '*************************\nCountVectorizer+TfidfTransformer\n*************************'
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

count_v1 = CountVectorizer(stop_words='english', max_df=0.5)
예제 #52
0
    u'连衣裙': 12,
    u'其它': 13
}
data[u'二级类目'] = data[u'二级类目'].map(secondtype_mapping)
data = data.fillna(0)  # fillna returns a new DataFrame, so assign it back

# print(vectorizer.fit_transform(data[u'产品标题']))
print(vectorizer.fit_transform(data[u'产品标题']).toarray())
# print(vectorizer.get_feature_names())

from sklearn.feature_extraction.text import HashingVectorizer
vectorizer2 = HashingVectorizer(n_features=100, norm=None)

data_Y = data[u'二级类目']
#data_X=vectorizer.fit_transform(data[u'产品标题']).toarray()
data_X = vectorizer2.fit_transform(data[u'产品标题']).toarray()
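# A hedged sketch (not part of the original script) of how the hashed features might be used
# downstream; the split ratio and classifier choice here are illustrative assumptions.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X_tr, X_te, y_tr, y_te = train_test_split(data_X, data_Y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_tr, y_tr)
print('holdout accuracy: %.3f' % rf.score(X_te, y_te))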
'''
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import learning_curve
from sklearn.metrics import precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import recall_score
예제 #53
0
 def func():
     hv = HashingVectorizer()
     hv.fit_transform(['hello world', np.nan, 'hello hello'])
예제 #54
0
if __name__ == '__main__':
    use_hashing = True
    select_chi2 = True
    X_train, X_test, y_train, y_test, target_names = get_data()
    print("%d rows: "  %len(y_train) + "\n")
    print("%d features:" % len(X_train[0] + "\n"))
    if use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                   n_features=400)
        X_train = vectorizer.transform(X_train)
        X_test = vectorizer.transform(X_test)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
        X_train = vectorizer.fit_transform(X_train)
        X_test = vectorizer.fit_transform(X_test)

    # feature select
    if select_chi2:
        ch2 = SelectKBest(chi2, k=30)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)

    # classify, train model
    mnb = MultinomialNB(alpha=1)
    mnb.fit(X_train, y_train)
    mnb_result = mnb.predict(X_test)
    scoremnb = metrics.accuracy_score(y_test, mnb_result)

    nb = multiNBayes.multiNByes(X_train.toarray(), y_train)
예제 #55
0
file.drop("label", axis=1)
"""
Chooses random rows of the dataset to be training and testing data. test_size sets what portion of the data
will be test data. random_state seeds the random number generator so the split is reproducible.
"""
X_train, X_test, y_train, y_test = train_test_split(file['text'],
                                                    y,
                                                    test_size=0.4,
                                                    random_state=53)

#Hashes tokens to numerical column indexes (no vocabulary is stored)

hash_vect = HashingVectorizer(stop_words='english', non_negative=True)

#transform the training text into hashed feature vectors
hash_train = hash_vect.fit_transform(X_train)
hash_test = hash_vect.transform(X_test)

#Creates instance of passive aggressive classifier
classifier = PassiveAggressiveClassifier()
#fit classifier onto training data
classifier.fit(hash_train, y_train)
#using 'learned' features from training data, predicts whether news is fake or real
prediction = classifier.predict(hash_test)

accuracy = accuracy_score(y_test, prediction) * 100
#print out total accuracy of classifier
print("The accuracy is %0.5f" % accuracy + " percent.")

#creates confusion matrix
matrix = confusion_matrix(y_test, prediction, labels=['FAKE', 'REAL'])
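# A short worked addition (not in the original): read precision and recall for the 'FAKE' class
# straight off the 2x2 confusion matrix, whose rows follow the labels order ['FAKE', 'REAL'].
true_fake, false_real = matrix[0]   # true FAKE predicted FAKE / predicted REAL
false_fake, true_real = matrix[1]   # true REAL predicted FAKE / predicted REAL
precision_fake = true_fake / (true_fake + false_fake)
recall_fake = true_fake / (true_fake + false_real)
print("FAKE precision: %0.3f, recall: %0.3f" % (precision_fake, recall_fake))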
예제 #56
0
 def create_sentence_vectors(self):
     vectorizer = HashingVectorizer(norm=None, n_features=17)
     return (vectorizer.fit_transform(
         self.formatted_article_text)).toarray()
예제 #57
0
f = open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'snippetCollection_text.pkl'), 'rb')
snippetCollection_text = pickle_zloads(f.read())
f.close()
'''

from nltk.stem.snowball import SnowballStemmer
import re
from sklearn.feature_extraction.text import HashingVectorizer
import numpy
import sys


def stemTokenize(doc):
    stemmer = SnowballStemmer('english')
    return [stemmer.stem(word) for word in re.findall(r'\b\w+\b', doc)]


vectorizer = HashingVectorizer(tokenizer=stemTokenize,
                               ngram_range=(1, 3),
                               token_pattern=r'\b\w+\b',
                               stop_words='english',
                               binary=False,
                               norm='l2',
                               n_features=2**19)
trainedVectorArray = vectorizer.fit_transform(snippetCollection_text)
anchorVector = vectorizer.transform([sys.argv[1]]).toarray()
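# because both the anchor vector and the snippet vectors are l2-normalized (norm='l2' above),
# this dot product is their cosine similarity, so larger values mean more similar snippets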
distances = (anchorVector * trainedVectorArray.T)[0]
nonzeroIndices = numpy.nonzero(distances)[0]
sortedIndices = nonzeroIndices[numpy.argsort(distances[nonzeroIndices])][::-1]
for i in sortedIndices[:int(sys.argv[2])]:
    print(snippetCollection_text[i] + '\n')
예제 #58
0
import numpy as np
import pickle
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

data = pickle.load(open('sklearn-data.pickle', 'rb'))
x_train = data["x_train"]
y_train = data["y_train"]
x_test = data["x_test"]
y_test = data["y_test"]

#####  Vectorizing data for sklearn
vectorizer = HashingVectorizer(stop_words="english",
                               lowercase=True,
                               binary=True,
                               n_features=2**18)
x_train_hash = vectorizer.fit_transform(x_train)
x_test_hash = vectorizer.fit_transform(x_test)

classifier_DT = DecisionTreeClassifier()
classifier_DT.fit(x_train_hash, y_train)
y_DT = classifier_DT.predict(x_test_hash)

acc_DT = accuracy_score(y_DT, y_test)

print("\nDecision tree accuracy: ", round(acc_DT, 4) * 100, "%")
예제 #59
0
def predict_and_cluster(opts, mode):

    n_digits = 3
    # n_samples, n_features = (25,1927)
    n_samples, n_features = (25, 491)

    labels = array([
        0, 1, 2, 1, 1, 2, 2, 1, 2, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2,
        1
    ])
    true_k = np.unique(labels).shape[0]

    corpus, news = jieba_tokenizer()

    print(
        "Extracting features from the training dataset using a sparse vectorizer"
    )
    t0 = time()
    if opts.use_hashing:
        if opts.use_idf:
            # Perform an IDF normalization on the output of HashingVectorizer
            hasher = HashingVectorizer(n_features=opts.n_features,
                                       stop_words='english',
                                       non_negative=True,
                                       norm=None,
                                       binary=False)
            vectorizer = make_pipeline(hasher, TfidfTransformer())
        else:
            vectorizer = HashingVectorizer(n_features=opts.n_features,
                                           stop_words='english',
                                           non_negative=False,
                                           norm='l2',
                                           binary=False)
    else:
        vectorizer = TfidfVectorizer(max_df=0.5,
                                     max_features=opts.n_features,
                                     min_df=2,
                                     stop_words='english',
                                     use_idf=opts.use_idf)

    X = vectorizer.fit_transform(corpus)

    print("done in %fs" % (time() - t0))
    # n_samples: how many articles are there
    # n_features: how many different words in all articles are there
    print("n_samples: %d, n_features: %d" % X.shape)
    print()

    if opts.n_components:
        print("Performing dimensionality reduction using LSA")
        t0 = time()
        # Vectorizer results are normalized, which makes KMeans behave as
        # spherical k-means for better results. Since LSA/SVD results are
        # not normalized, we have to redo the normalization.
        svd = TruncatedSVD(opts.n_components)
        lsa = make_pipeline(svd, Normalizer(copy=False))

        X = lsa.fit_transform(X)

        print("done in %fs" % (time() - t0))

        svd = TruncatedSVD().fit(X)
        X_proj = svd.transform(X)
        explained_variances = np.var(X_proj, axis=0) / np.var(X, axis=0).sum()

        print("Explained variance of the SVD step: {}%".format(
            int(explained_variances[0] * 100)))

        print()

    # =================================================
    # KMeans clustering

    # if opts.minibatch:
    #     km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
    #                          init_size=1000, batch_size=1000, verbose=True)
    # else:
    print('*' * 80)

    km = KMeans(n_clusters=true_k,
                init='k-means++',
                max_iter=100,
                n_init=1,
                verbose=True)  # always better

    print("Clustering sparse data with %s" % km)
    t0 = time()
    km.fit(X)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" %
          metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f" %
          metrics.adjusted_rand_score(labels, km.labels_))

    print("Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(X, labels, sample_size=None))
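    # note: the silhouette above is computed against the ground-truth labels; passing
    # km.labels_ instead would score the clustering that was actually produced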

    print("labels    ", labels)
    print("my_labels ", km.labels_)

    if not (opts.n_components or opts.use_hashing):
        print("Top terms per cluster:")
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        terms = vectorizer.get_feature_names()
        for i in range(true_k):
            print("Cluster %d:" % i, end='')
            for ind in order_centroids[i, :10]:
                print(' %s' % terms[ind], end='')
            print()

    for i in range(len(news)):
        news[i].category = labels[i]

    from sklearn.metrics.pairwise import cosine_similarity

    FG = nx.Graph()

    for i in range(len(news)):
        news[i].similarity = cosine_similarity(X[i:i + 1], X)[0]
        cs = news[i].similarity
        # print (cs)
        for j in range(len(news)):
            if i != j:
                FG.add_weighted_edges_from([(i, j, cs[j])])

    print()
    print('*' * 80)

    print(X.shape[0])
    print(X.shape)
    # print(self)  # 'self' is not defined in this module-level function

    gmm(X)

    print()
    print('*' * 80)

    best_part(FG)

    print()
    print('*' * 80)
예제 #60
0
def numberize_hash(filename, number_of_features):
    vectorizer = HashingVectorizer(n_features=number_of_features)
    return vectorizer, vectorizer.fit_transform(read_file(filename))