def run_online_classifier():
    vect = HashingVectorizer(
        decode_error='ignore',
        n_features=2**21,
        preprocessor=None,
        tokenizer=tokenizer_streaming,
    )
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)

    csv_filename = os.path.join('datasets', 'movie_data.csv')
    doc_stream = stream_docs(path=csv_filename)

    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if X_train is None:
            break
        else:
            X_train = vect.transform(X_train)
            clf.partial_fit(X_train, y_train, classes=classes)

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print("Test accuracy: %.3f" % clf.score(X_test, y_test))

    clf = clf.partial_fit(X_test, y_test)
Example #2
File: cluster.py, Project: aolieman/xtas
def big_kmeans(docs, k, batch_size=1000, n_features=(2 ** 20),
               single_pass=True):
    """k-means for very large sets of documents.

    See kmeans for documentation. Differs from that function in that it does
    not compute tf-idf or LSA, and fetches the documents in a streaming
    fashion, so they don't need to be held in memory. It does not do random
    restarts.

    If the option single_pass is set to False, the documents are visited
    twice: once to fit a k-means model, once to determine their label in
    this model.
    """
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.text import HashingVectorizer

    v = HashingVectorizer(input="content", n_features=n_features, norm="l2")
    km = MiniBatchKMeans(n_clusters=k)

    labels = []
    for batch in batches(docs, batch_size):
        batch = map(fetch, batch)
        batch = v.transform(batch)
        y = km.fit_predict(batch)
        if single_pass:
            labels.extend(y.tolist())

    if not single_pass:
        for batch in batches(docs, batch_size):
            batch = map(fetch, batch)
            batch = v.transform(batch)
            labels.extend(km.predict(batch).tolist())

    return labels
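A self-contained sketch of the same streaming idea, assuming a small in-memory list of raw documents stands in for the fetched batches; it uses MiniBatchKMeans.partial_fit plus predict instead of the per-batch fit_predict call above, to keep the clustering genuinely incremental.

from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import HashingVectorizer

docs = ["the cat sat", "cats purr softly", "kittens and cats",
        "dogs bark loudly", "a barking dog", "puppies and dogs"]
v = HashingVectorizer(input="content", n_features=2 ** 10, norm="l2")
km = MiniBatchKMeans(n_clusters=2, n_init=3, random_state=0)

labels = []
batch_size = 3
for start in range(0, len(docs), batch_size):
    batch = v.transform(docs[start:start + batch_size])  # hash one chunk of raw text
    km.partial_fit(batch)                                # update the centroids incrementally
    labels.extend(km.predict(batch).tolist())            # label the chunk in a single pass

print(labels)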
Example #3
def get_hashing(data):
  t0 = time.time()
  print("* Making hashing vectorizor with the data ...")
  hasher = HashingVectorizer(stop_words='english', ngram_range=(1,3), norm='l2', non_negative=True) #l2 projected on the euclidean unit sphere
  hX = hasher.fit_transform(data)
  print("done in %0.3fs." % (time.time() - t0))
  return hX, hasher
Example #4
File: ooc.py, Project: audy/bfc
def main():
    '''
        >>> main() # stuff happens
    '''

    args = parse_args()
    setup_logging(args.log, verbose=args.verbose)

    chunks = sequence_chunk_generator(args.fasta_file,
                                      chunk_size=args.chunk_size)

    hasher = HashingVectorizer(analyzer='char',
                               n_features = 2 ** 18,
                               ngram_range=(args.ngram_min, args.ngram_max),
                               )

    estimator = AffinityPropagation()

    for chunk in chunks:

        logging.info('hashing chunk')
        chunk_vector = hasher.transform([ str(i.seq) for i in chunk ])

        logging.info('clustering')

        estimator.fit(chunk_vector)

        logging.info('got %s clusters' % len(set(estimator.labels_)))
def train():
    vect = HashingVectorizer(decode_error='ignore',
                             n_features=2**21,
                             preprocessor=None,
                             ngram_range=(1, 3),
                             tokenizer=tokenizer)
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
    stream_path = os.path.join(work_path, 'movie_data.csv')
    doc_stream = stream_docs(path=stream_path)

    pbar = pyprind.ProgBar(45)
    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if not X_train:
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
        pbar.update()

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print('Accuracy: %.3f' % clf.score(X_test, y_test))

    clf = clf.partial_fit(X_test, y_test)

    return clf
Example #6
def sim_char10(text1, text2):
    vect = HashingVectorizer(analyzer='char_wb', tokenizer=normalize, stop_words='english', ngram_range=(10, 10))
    texts = [text1, text2]
    matrix = vect.fit_transform(texts)
    cosine_similarities = linear_kernel(matrix[0:1], matrix).flatten()
    simmax = max(cosine_similarities[1:])
    return simmax
Example #7
def sim_char5(text1, text2):
    vect = HashingVectorizer(analyzer='word', tokenizer=normalize, stop_words='english')
    texts = [text1, text2]
    matrix = vect.transform(texts)
    cosine_similarities = linear_kernel(matrix[0:1], matrix).flatten()
    simmax = max(cosine_similarities[1:])
    return simmax
Example #8
 def __init__(self, train_data, C=5, kernel='poly', gamma=.001, degree=10, coef0=2, n_features=10000000,
              ngram_range=(1, 10), tfidf=False, dfrange=(2, 1.0), probability=False, class_weight=None):
     self.conn = None
     self.is_tfidf = tfidf
     if tfidf:
         self.vectorizer = TfidfVectorizer(stop_words=None, min_df=dfrange[0], max_df=dfrange[1],
                                           max_features=n_features, strip_accents='unicode',
                                           ngram_range=ngram_range, analyzer='word', norm='l2')
     else:
         self.vectorizer = HashingVectorizer(stop_words=None, non_negative=True,
                                             n_features=n_features, strip_accents='unicode',
                                             ngram_range=ngram_range, analyzer='word', norm='l2')
     self.param_set = {'C': str(C), 'kernel': str(kernel), 'gamma': str(gamma),
                       'degree': str(degree), 'coef0': str(coef0), 'n_features': str(n_features)}
     if class_weight == 'auto':
         class_weight = {}
         for item in train_data.target:
             if class_weight.get(item):
                 class_weight.update({item: class_weight[item] + 1.0})
             else:
                 class_weight.update({item: 1.0})
         for key in class_weight:
             class_weight.update({key: 1.0 / class_weight[key]})
     self.class_weight_dict = class_weight
     super(svm_text, self).__init__(C=C, kernel=kernel, gamma=gamma, shrinking=True, probability=probability, degree=degree, coef0=coef0,
                                    tol=0.001, cache_size=20000, class_weight=class_weight, verbose=False, max_iter=-1)
     if self.is_tfidf:
         train_x = self.vectorizer.fit_transform(train_data.data)
     else:
         train_x = self.vectorizer.transform(train_data.data)
     self.fit(train_x, train_data.target)
Example #9
File: tfidf.py, Project: fallingleaf/rsweb
def tfidf_classify(user):
    train_set, y, src, test_set = extract_data(user.id)
    if not train_set:
        return []
    # Analyse using tf-idf
    # vector = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
    vector = HashingVectorizer(n_features=1000, non_negative=True, stop_words='english')
    # List of topic extracted from text
    # feature_names = vector.get_feature_names()
    # print feature_names
    xtrain = vector.transform(train_set)
    xtest = vector.transform(test_set)

    # Select sample using chi-square
    ch2 = SelectKBest(chi2)
    xtrain = ch2.fit_transform(xtrain, y)
    xtest = ch2.transform(xtest)

    # Predict testing set
    # classifier = DecisionTreeClassifier()
    classifier = KNeighborsClassifier(n_neighbors=4)
    classifier = classifier.fit(xtrain, y)
    result = classifier.predict(xtest)
    final = []
    for i in xrange(len(result)):
        if result[i]:
            final.append(src[i])
    print len(final)
    return final
Example #10
def big_kmeans(docs, k, batch_size=1000, n_features=(2 ** 20),
               single_pass=True):
    """k-means for very large sets of documents.

    """
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.text import HashingVectorizer

    v = HashingVectorizer(input="content", n_features=n_features, norm="l2")
    km = MiniBatchKMeans(n_clusters=k)

    labels = []
    for batch in batches(docs, batch_size):
        batch = map(fetch, batch)
        batch = v.transform(batch)
        y = km.fit_predict(batch)
        if single_pass:
            labels.extend(y.tolist())

    if not single_pass:
        for batch in batches(docs, batch_size):
            batch = map(fetch, batch)
            batch = v.transform(batch)
            labels.extend(km.predict(batch).tolist())

    return labels
Example #11
    def test_same_output(self):
        X, X_rdd = self.make_text_rdd()
        local = HashingVectorizer()
        dist = SparkHashingVectorizer()

        result_local = local.transform(X).toarray()
        result_dist = dist.transform(X_rdd).toarray()
        assert_array_equal(result_local, result_dist)
    def test_same_output(self):
        X, X_rdd = self.generate_text_dataset()
        local = HashingVectorizer()
        dist = SparkHashingVectorizer()

        result_local = local.transform(X)
        result_dist = sp.vstack(dist.transform(X_rdd).collect())
        assert_array_equal(result_local.toarray(), result_dist.toarray())
Example #13
File: sentiment.py, Project: jannson/crfseg
def predict(line, tagger):
    tok_cn = lambda (x): crfseg.cut_zh(x, tagger)

    hasher = HashingVectorizer(n_features=2**16,
                               tokenizer=tok_cn, non_negative=True,
                               norm=None, binary=False)
    x_test = hasher.transform([line])
    return clf_global.predict_proba(x_test)
Example #14
class ReviewTrainer(TrainerModel):
	def __init__(self):
		pass

	#get rids of stopwords
	def preprocess(self, l):
		res = {}
		sw = stopwords.words('english')
		clean = ' '.join([w for w in l['text'].split() if w not in sw])
		res[l['review_id']] = {'text' : clean, 'label' : l['votes']['useful']}
		return res

	#the labels are already given for this review
	def group_labels(self, fname):
		pass

	#vectorizes data and selects K best feats.
	def prepare_data(self, x, y):
		self.hv = HashingVectorizer(strip_accents='ascii', non_negative=True)
		self.feats = self.hv.transform(x)
		self.labels = np.array(y)
		
		self.ch2 = SelectKBest(chi2, k=K_FEAT)
		self.feats = self.ch2.fit_transform(self.feats, self.labels)
		
	def get_error(self, pred, y):
		return super(ReviewTrainer, self).get_error(pred,y)
	
	#optimizes for hyper-parameter alpha
	def _cross_validate(self):
		grid = dict(alpha=10.0 ** np.arange(-4,1))
		return super(ReviewTrainer, self)._cross_validate_base(
			Ridge(), grid) 
	
	#builds examples to feed trainer
	#MUST RUN BEFORE train
	def build_examples(self, data, labels=None):
		feats = []
		labels = []
		ex = {}
		for k,v in data.items():
			feats.append(v['text'])
			labels.append(v['label'])
		ex['feats'] = feats
		ex['labels'] = labels
		return ex

	#fits model using optimal parameters
	def train(self):
		self.clf = self._cross_validate()
		self.clf.fit(self.feats, self.labels)

	#predicts Y given X
	def predict(self, data):
		data = self.hv.transform(data)
		data = self.ch2.transform(data)
		pred = self.clf.predict(data)
		return pred			
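A standalone sketch of the prepare_data/train/predict flow above, with a toy dataset and a small k standing in for K_FEAT; alternate_sign=False is used here to play roughly the role of the older non_negative=True so that chi2 sees non-negative features.

import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import Ridge

texts = ["great service and helpful staff", "terrible food",
         "helpful and detailed review", "awful and rude"]
votes = np.array([3.0, 0.0, 2.0, 0.0])          # "useful" vote counts as the target

hv = HashingVectorizer(strip_accents='ascii', alternate_sign=False)
feats = hv.transform(texts)
ch2 = SelectKBest(chi2, k=10)                   # keep the 10 best hashed features
feats = ch2.fit_transform(feats, votes)

clf = Ridge(alpha=1.0).fit(feats, votes)
print(clf.predict(ch2.transform(hv.transform(["rude staff"]))))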
Example #15
def trainOnModel(x_VariableList, y_VariableList, testSetList, classifier, hashing=False, chi_squared=False):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.feature_selection import SelectKBest, chi2
    from sklearn.linear_model import RidgeClassifier
    from sklearn.svm import LinearSVC
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import PassiveAggressiveClassifier
    from sklearn.utils.extmath import density
    y_train = y_VariableList
    if hashing == True:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=2 ** 16)
        X_train = vectorizer.transform(x_VariableList)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(x_VariableList)

    X_test = vectorizer.transform(testSetList)

    if chi_squared == True:
        print("Extracting best features by a chi-squared test")
        ch2 = SelectKBest(chi2, k=2 * 16)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)

    classifierObject = ""
    print "Using :", classifier

    if classifier == "LinearSVC":
        classifierObject = LinearSVC(penalty='l2', dual=False, tol=1e-3)

    elif classifier == "PassiveAggressiveClassifier":
        classifierObject = PassiveAggressiveClassifier(C=1.0, fit_intercept=True, loss='hinge',
                                                       n_iter=50, n_jobs=1, random_state=None, shuffle=True,
                                                       verbose=0, warm_start=False)

    elif classifier == "RidgeClassifier":
        classifierObject = RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                                           max_iter=None, normalize=False, solver='lsqr', tol=0.01)

    elif classifier == "Perceptron":
        classifierObject = Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
                                      n_iter=50, n_jobs=1, penalty=None, random_state=0, shuffle=True,
                                      verbose=0, warm_start=False)

    elif classifier == "SGDClassifier":
        classifierObject = SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
                                         eta0=0.0, fit_intercept=True, l1_ratio=0.15,
                                         learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
                                         penalty='l2', power_t=0.5, random_state=None, shuffle=True,
                                         verbose=0, warm_start=False)

    classifierObject.fit(X_train, y_train)
    pred = classifierObject.predict(X_test)
    return pred[0]
Example #16
def vectorize(train_words, test_words):
    # stop-word list
    with open('dict/stopwords.txt', 'r') as f:
        stopwords = set([w.strip() for w in f])

    v = HashingVectorizer(non_negative=True, stop_words=stopwords, n_features=30000)
    train_data = v.fit_transform(train_words)
    test_data = v.fit_transform(test_words)
    return train_data, test_data
Example #17
    def get_x(text,ngram_range):

        hash_vect_object = HashingVectorizer(ngram_range=ngram_range, stop_words="english", strip_accents="unicode")
        tfidf_transformer_object = TfidfTransformer(use_idf=True)

        x_train_counts = hash_vect_object.fit_transform(text)
        x_train_tfidf = tfidf_transformer_object.fit_transform(x_train_counts)

        return x_train_tfidf
class Featurizer:
    def __init__(self):
        self.vectorizer = HashingVectorizer(stop_words="english")

    def train_feature(self, examples):
        return self.vectorizer.fit_transform(examples)

    def test_feature(self, examples):
        return self.vectorizer.transform(examples)
Example #19
def main(output=RESULTS):
    # change ROOT ID in config.py to your computer's path so that it writes to the correct file
    # load and puts data and desired numpy format
    movies = load_balanced_movies(MOVIES_DATA, False) # True is for debugging
    data = pd.DataFrame(movies)
    pd.options.mode.chained_assignment = None  # default='warn' ignore
    summaries = data[['summary']]
    summaries['summary'] = summaries['summary'].str.replace('[^\w\s]','').str.lower()  ## cleans out punctuation and special characters
    Y = np.array(data[['year']])
    Y = np.ravel(Y)
    X = np.array(summaries['summary'])

    # standard CountVectorizer for bag of words
    # vectorizer = CountVectorizer()
    # X = vectorizer.fit_transform(X)

    # print "Old Shape Dim" 
    # print X.shape 

    # uses random projections to reduce dimensionality
    # transformer = random_projection.SparseRandomProjection()
    # X_new = transformer.fit_transform(X)
    # print "New Shape Dim"
    # print X_new.shape 

    # perform vectorization and dim reduction using Hashing Vectorizer (counts # of times a word appears)
    vectorizer = HashingVectorizer(stop_words='english', n_features=80000)  # uses 80,000 word instances as k
    X = vectorizer.transform(X)

    # instantiate scaling of data for preprocessing
    X = StandardScaler(with_mean=False).fit_transform(X)

    # splits training and test data equally
    xtrain, xtest, ytrain, ytest = train_test_split(X, Y)

    names = ["SGDClassifier", "Linear SVC", "SVC Kernel RBF", "PerceptronL1", "PerceptronL2", "Nearest Neighbors", "Ridge Classifier"] # 
    classifiers = [
        SGDClassifier(loss="hinge", penalty="l2"),
        LinearSVC(),
        SVC(kernel="rbf"),
        Perceptron(penalty='l1'),
        Perceptron(penalty='l2', n_iter=25),
        KNeighborsClassifier(),
        RidgeClassifier(),
        ]

    print "Calculating accuracies"
    # fits chosen classifier on training data
    for name, clf in zip(names, classifiers):
        print name
        clf.fit(xtrain, ytrain)
        print "Accuracy: %0.2f%%" % (100 * clf.score(xtest, ytest)) # Predict and score accuracy

        with open(output, "a+") as outputFile:  # write results to file 
            score = 100 * clf.score(xtest, ytest) 
            outputFile.write("Ran classifier {}    ".format(name) + '\n'
            " Achieved accuracy {}   ".format(score) )
Example #20
File: sentiment.py, Project: nobsu/grape
def vectorize(docs):
    """
    Vectorize the documents
    :param docs list: iterable over raw text documents
    :return:
    """
    v = HashingVectorizer(tokenizer=comma_tokenizer, n_features=30000, non_negative=True)
    train_data = v.fit_transform(docs)
    return train_data
Example #21
def ngrams_hashing_vectorizer(strings, n, n_features):
    """ Return the a disctionary with the count of every
    unique n-gram in the string.
    """
    hv = HashingVectorizer(analyzer='char', ngram_range=(n, n),
                           n_features=n_features, norm=None,
                           alternate_sign=False)
    hash_matrix = hv.fit_transform(strings)
    return hash_matrix
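A hypothetical call to the helper above, hashing the character 3-grams of two short strings into 128 buckets:

counts = ngrams_hashing_vectorizer(["Paris", "Parys"], n=3, n_features=128)
print(counts.shape)   # (2, 128) sparse matrix of raw n-gram counts
print(counts.nnz)     # number of non-empty hashed n-gram buckets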
Example #22
 def feature_extraction(self, test):
     """
     function:特征提取
     :param test:
     :return:训练特征,测试特征
     """
     train = self.load_train_set()
     vectorizer = HashingVectorizer(stop_words='english', non_negative=True, n_features=25000)
     fea_train = vectorizer.fit_transform(train)  # feature extraction
     fea_test = vectorizer.fit_transform(test)  # feature extraction
     return fea_train, fea_test
Example #23
def feature_extraction(feature, target_name, df):
    vect = HashingVectorizer(decode_error='ignore', ngram_range=(1,2), n_features = 2**18, binary=True, norm="l2")
    le = preprocessing.LabelEncoder()
    # for multiple features replace this with http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html
    df[feature] = df[feature].fillna('')
    titles = vect.transform(df[feature])

    X = titles
    #y = le.fit_transform(df[target_name])
    y = df[target_name]
    return X, y
Example #24
def vector_func_char(l):
    vectorizer = HashingVectorizer(
        analyzer="char",
        input="content",
        decode_error="ignore",
        strip_accents="ascii",
        ngram_range=(2, 2),
        n_features=524288,
    )

    return str(l).split(" ")[0], vectorizer.fit_transform(str(l).replace(str(l).split(" ")[0], ""))
Example #25
    def get_x(text,ngram_range):

        hash_vect_object = HashingVectorizer(ngram_range=ngram_range,
                                             stop_words="english",
                                             strip_accents="unicode",
                                             token_pattern=r"(?u)\b[a-zA-Z_][a-zA-Z_]+\b") # tokens are character strings of 2 or more characters
        tfidf_transformer_object = TfidfTransformer(use_idf=True)

        x_train_counts = hash_vect_object.fit_transform(text)
        x_train_tfidf = tfidf_transformer_object.fit_transform(x_train_counts)

        return x_train_tfidf
Example #26
def vector_func_word(l):
    vectorizer = HashingVectorizer(
        non_negative=True,
        stop_words="english",
        input="content",
        decode_error="ignore",
        strip_accents="ascii",
        n_features=262144,
    )

    # return str(l).split(" ")[0],vectorizer.fit_transform(str(l).replace(str(l).split(" ")[0],""))
    return vectorizer.fit_transform(l).shape
Example #27
def tfidfVectorizeData(listOfSentences, useHashTable=False, nFeatures=100):
    
    if useHashTable:
        from sklearn.feature_extraction.text import HashingVectorizer
        vec = HashingVectorizer(stop_words='english', non_negative=True, n_features=nFeatures)
        X_noProcess = vec.transform(listOfSentences).toarray()
    else:
        from sklearn.feature_extraction.text import TfidfVectorizer
        vec = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
        X_noProcess = vec.fit_transform(listOfSentences).toarray()

    return vec, X_noProcess
Example #28
def vectorize_2(test_words):
    input_words = jieba.lcut(test_words[0])
    print check_neg(input_words)

    #  if len(jieba.lcut(test_words[0])) < 2:
    if len(jieba.lcut(test_words[0])) < 2:
        return None, False
    else:
        v = HashingVectorizer(tokenizer=comma_tokenizer, stop_words=stopwords, n_features=100000, non_negative=True)
        test_data = v.fit_transform(test_words)
        print test_data
        return test_data, check_neg(input_words)
Example #29
def vectorize(concepts):
    """
    This vectorizes a list or a string of concepts;
    the regular `vectorize` method is meant to vectorize text documents;
    it is trained for that kind of data and thus is inappropriate for concepts.
    So instead we just use a simple hashing vectorizer.
    """
    h = HashingVectorizer(input='content', stop_words='english', norm=None, tokenizer=Tokenizer())
    if type(concepts) is str:
        # Extract and return the vector for the single document.
        return h.transform([concepts]).toarray()[0]
    else:
        return h.transform(concepts)
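A minimal sketch of the two code paths above, assuming a plain whitespace split stands in for the project's Tokenizer class and a much smaller hash space is used:

from sklearn.feature_extraction.text import HashingVectorizer

h = HashingVectorizer(input='content', stop_words='english', norm=None,
                      tokenizer=str.split, n_features=2 ** 10)

single = h.transform(["machine learning"]).toarray()[0]   # 1-D dense vector for one concept
many = h.transform(["machine learning", "databases"])     # sparse matrix for a list of concepts
print(single.shape, many.shape)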
Example #30
 def __init__ (self, corpus, classes, method):
     # Set up vectorizier
     if method == 'count':
         self.vectorizer = CountVectorizer(min_df=2, ngram_range=(1, 3)) 
     elif method == 'tfidf':
         self.vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 3))
     elif method == 'hashing':
         self.vectorizer = HashingVectorizer(non_negative = True)
     else:
         print 'Method must be count, tfidf, or hashing'
     # vectorize and set up classifier. 
     self.X = self.vectorizer.fit_transform(corpus)
     classifier = MultinomialNB()
     self.classifier = classifier.fit(self.X, classes)
Example #31
class Bayes(object):
    def __init__(self, config_dir):
        """Initialize a bayes model.
		Args:
			alpha: parameter for bayes model.
		"""
        self.name = 'Bayes'
        self.config_dir = config_dir
        self.config = dict()
        self.Vec = None
        self.clf = None

        self.output_path = './result/Bayes/'
        if not os.path.exists(self.output_path):
            os.mkdir(self.output_path)
        self.model_path = './model/Bayes/'
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)

    def get_config(self):
        """Get the config from the config file. Put them into the config dictionary.
		Raises:
			ValueError: the config file does not exists.
		"""
        if os.path.exists(self.config_dir):
            print('Read in configuration from dir %s.\n' % self.config_dir)
            with open(self.config_dir + 'Bayes_config', 'rt') as f:
                for line in f.readlines():
                    tmp = line.strip().split(':')
                    self.config[tmp[0]] = tmp[1]
            for key, value in self.config.items():
                print('%s:%s' % (key, value))

            print('Build up the model according to configuration.')
            if self.config.get('alpha') == None:
                raise ValueError(
                    'Parameter Alpha has not been set. Please re-edit the configuration file.'
                )
            # Build up the feature vector.
            if self.config.get('feature') == None:
                raise ValueError(
                    'Feature parameter has not been set. Please re-edit the configuration file.'
                )
            else:
                if self.config['feature'] == 'CountVectorizer':
                    self.Vec = CountVectorizer()
                elif self.config['feature'] == 'TfidfVectorizer':
                    self.Vec = TfidfVectorizer()
                elif self.config['feature'] == 'HashingVectorizer':
                    self.Vec = HashingVectorizer()
                else:
                    raise ValueError(
                        'Can not use %s as a feature, please re-edit your configuration file.'
                        % self.config['feature'])

            # Build up model.
            if self.config.get('model') == None:
                raise ValueError(
                    'Model parameter has not been set. Please re-edit the configuration file.'
                )
            else:
                if self.config['model'] == 'GaussianNB':
                    self.clf = GaussianNB()
                elif self.config['model'] == 'MultinomialNB':
                    self.clf = MultinomialNB(
                        alpha=(float)(self.config['alpha']))
                elif self.config['model'] == 'BernoulliNB':
                    self.clf = BernoulliNB()
                else:
                    raise ValueError(
                        'No model named %s, please re-edit your configuration file.'
                        % self.config['model'])

        else:
            print('Configuration file %s does not exists.' % self.config_dir)

    def fit(self, train):
        """Fit the data into the model and train.
		Args:
			train: training data in format (data,label)
		Return:
			None
		Raises:
			ValueError: invalid config value.
		"""
        fea = self.Vec.fit_transform(train.data)
        fea = fea.todense()
        self.clf.fit(fea, train.target)

    def predict(self, test_data):
        """Run the model for a single step to get the predicted result.
		Args:
			data_piece: a piece of data fit into the model.
		Return:
			predicted result.
		"""
        fea = self.Vec.transform(test_data)
        fea = fea.todense()
        return self.clf.predict(fea)

    def test(self, test):
        """Test the model.
		Args:
			model: the model to be evaluated.
			test: test dataset in format (data,label)
		Output:
			result: predicted label. A label each line.
		Return:
			accuracy:
		"""
        preds = self.predict(test.data)
        # Write into files.
        filename = self.output_path + self.config['model'] + '_' + self.config[
            'feature']
        with open(filename, 'wt', encoding='utf-8') as f:
            for pred in preds:
                f.write(str(pred) + '\n')
        return evaluate(test.target, preds)
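A hypothetical Bayes_config file consistent with get_config above: one colon-separated key:value pair per line, with the feature and model values matching the names checked in the code. GaussianNB is picked here because the default HashingVectorizer output contains negative values that MultinomialNB would reject.

alpha:1.0
feature:HashingVectorizer
model:GaussianNB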
Example #32
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer

os.chdir("E:/graduate/class/非结构化/文本分析_李翠平/垃圾短信数据集与代码")
mescon_all = pd.read_csv('result.csv', header=None, encoding='gbk')
listtodel = []
# drop messages with empty content, keeping only messages that can be turned into feature vectors
for i, line in enumerate(mescon_all[1]):
    if type(line) != np.unicode:
        listtodel.append(i)
mescon_all = mescon_all.drop(listtodel)

#vector=TfidfVectorizer(CountVectorizer())
#temp = vector.fit_transform(mescon_all[1]).todense()
outfile = open('features.txt', 'w')

vector = HashingVectorizer(n_features=100)
temp = vector.transform(mescon_all[1]).todense()

x = [[i, j] for i, j in enumerate(mescon_all[0])]
temp = temp.tolist()
for i, line in enumerate(temp):
    outstr = ''
    for word in line:
        outstr += str(word + 1)
        outstr += ' '
    outfile.write((str(mescon_all[0][x[i][1]]) + ',' + outstr) + '\n')

outfile.close()

# unicode here refers to numpy's np.unicode type
Example #33
import os

import numpy as np
from tqdm import tqdm

from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

# == Load data ==
print("Loading dataset...")
files = os.listdir()
D = []

# Label:
# - dry: 0
# - normal: 1
# - oil: 2
label = np.array([])

print(len(D), " documents, ", len(label), " labels.")

# == Chinese Segmentation ==
for 

# == Vector Transformation ==
print("Extracting features from the dataset...")
vectorizer = Pipeline([
                       ('vect', HashingVectorizer(n_features=(2 ** 21), non_negative=True, lowercase=False)),
                       ('tfidf', TfidfTransformer(norm='l2')),
                       ])

if __name__ == '__main__':
    vectorizer.fit()
split_size = int(len(news.data) * SPLIT_PERC)
X_train = news.data[:split_size]
X_test = news.data[split_size:]
y_train = news.target[:split_size]
y_test = news.target[split_size:]

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
clf_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

clf_2 = Pipeline([
    ('vect', HashingVectorizer(non_negative=True)),
    ('clf', MultinomialNB()),
])

clf_3 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

from sklearn.cross_validation import cross_val_score, KFold
from scipy.stats import sem


def evaluate_cross_validation(clf, X, y, K):
    # create a k-fold cross-validation iterator of k=5 folds
    cv = KFold(len(y), K, shuffle=True, random_state=0)
Example #35
#print('using counts of rel. entity cluster nodes')
##clusters = int(X.shape[1]/(10*math.sqrt(s))) #use number of tfidf extracted features as basis for word vector clustering
#kmeans = KMeans(n_clusters = clusters)
#nodes = kmeans.fit_predict(np.asarray(rel_vectors))
#X = np.zeros([s+1, clusters])
#for source, node in zip(rel_sources, nodes):
#    X[source, node] += 1
#evaluate(MultinomialNB(), X)
#print('repeat with -',names[0])
#evaluate(classifiers[0], X)

for model, name in zip(classifiers, names):

    print('using hashed count vectors -', name)
    hasher = HashingVectorizer(n_features=1000,
                               stop_words='english',
                               norm='l2')
    vectorizer = make_pipeline(hasher, TfidfTransformer())
    X = vectorizer.fit_transform(files)
    evaluate(model, X)

    print('using avg softmax -', name)
    X = np.asarray(probs)
    evaluate(model, X)

    print('now try with each feature in turn -', name)
    for feature in range(X.shape[1]):
        X = np.asarray([[f[feature]] for f in probs])
        evaluate(model, X)

    print('using avg similarity -', name)
Example #36
def trainAndEvaluateModels(dataFrame):

    # We'll test-drive two vectorizers. HashingVectorizer is famed to be memory-efficient!
    vectorizers = {'CountVectorizer', 'HashingVectorizer', 'TfidfVectorizer'}

    # We'll also try out some classifiers. Should be fun!
    classifiers = {
        'MultinomialNB', 'BernoulliNB', 'SGDClassifier',
        'PassiveAggressiveClassifier'
    }

    for vectorizer in vectorizers:  #For each combination of vectorizer and classifier
        for classifier in classifiers:
            vect = None
            if (vectorizer == 'CountVectorizer'):
                vect = CountVectorizer()
            elif (vectorizer == 'HashingVectorizer'):
                vect = HashingVectorizer(stop_words='english',
                                         non_negative=True,
                                         norm=None,
                                         binary=False)
            elif (vectorizer == 'TfidfVectorizer'):
                vect = TfidfVectorizer(max_df=0.5,
                                       min_df=2,
                                       stop_words='english',
                                       use_idf=True)

            clf = None
            if (classifier == 'MultinomialNB'):
                clf = MultinomialNB()
            elif (classifier == 'BernoulliNB'):
                clf = BernoulliNB()
            elif (classifier == 'SGDClassifier'):
                clf = SGDClassifier()
            elif (classifier == 'PassiveAggressiveClassifier'):
                clf = PassiveAggressiveClassifier()

            #Some spirits don't mix!
            if (vectorizer == 'HashingVectorizer'
                    and classifier == 'BernoulliNB'):
                continue

            if (vectorizer != 'TfidfVectorizer'):
                #Setup a pipeline to vectorize and classify data.
                pipeline = Pipeline([('vectorizer', vect),
                                     ('classifier', clf)])

                #We will divide our dataset into 10 pieces, train on 9 of them and test on the remaining one.
                #We will do this until each piece has been the test piece atleast once.
                k_fold = KFold(n=len(dataFrame), n_folds=10)
                totalScore = 0

                for train_indices, test_indices in k_fold:
                    train_text = dataFrame.iloc[train_indices]['text'].values
                    train_y = dataFrame.iloc[train_indices]['label'].values

                    test_text = dataFrame.iloc[test_indices]['text'].values
                    test_y = dataFrame.iloc[test_indices]['label'].values

                    #Train the model on the training set
                    pipeline.fit(train_text, train_y)

                    #Test the model on the test set
                    predictions = pipeline.predict(test_text)

                    #print predictions
                    score = f1_score(test_y, predictions)
                    totalScore = totalScore + score

                print 'Vectorizer: ', vectorizer, ' Classifier: ', classifier, ' Average prediction score: ', (
                    totalScore / 10)

            #Some spirits don't mix!!
            if classifier == 'MultinomialNB':
                continue

            #Setup a pipeline to work with a transformer too.
            pipeline1 = Pipeline([('vector', vect),
                                  ('transform', TfidfTransformer()),
                                  ('classifier', clf)])

            #We will divide our dataset into 10 pieces, train on 9 of them and test on the remaining one.
            #We will do this until each piece has been the test piece atleast once.
            k_fold = KFold(n=len(dataFrame), n_folds=10)
            totalScore = 0

            for train_indices, test_indices in k_fold:
                train_content = dataFrame.iloc[train_indices]['text'].values
                train_labels = dataFrame.iloc[train_indices]['label'].values

                test_content = dataFrame.iloc[test_indices]['text'].values
                test_labels = dataFrame.iloc[test_indices]['label'].values

                #Train the model on the training set
                pipeline1.fit(train_content, train_labels)

                #Test the model on the test set
                predictions = pipeline1.predict(test_content)
                score = f1_score(test_labels, predictions)
                totalScore = totalScore + score

            print 'Vectorizer: ', vectorizer, ' TfIdfTransformer, Classifier: ', classifier, ' Average prediction score: ', (
                totalScore / 10)
Example #37
import os
import re

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import HashingVectorizer

curdir = os.path.dirname(__file__)


def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    tokenized = [
        w for w in text.split() if w not in stopwords.words('english')
    ]
    return tokenized


vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
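A hypothetical sanity check for the tokenizer above (it assumes the NLTK stopword corpus has been downloaded): HTML tags are stripped, the emoticon is kept and appended, and stop words are dropped.

print(tokenizer('<p>This movie :) was not bad at all!</p>'))
# expected output along the lines of: ['movie', 'bad', ':)']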
Example #38
 def _get_vectorizer(vectorizer,
                     training_mode,
                     vectorizer_file="vectorizer.pkl"):
     token_pattern = r'\S+'
     if not training_mode and vectorizer not in [
             DocumentPoolEmbeddings.__name__
     ]:
         v = pickle_manager.load(vectorizer_file)
         assert vectorizer == v.__class__.__name__
     elif vectorizer == TfidfVectorizer.__name__:
         v = TfidfVectorizer(input='content',
                             encoding='utf-8',
                             decode_error='strict',
                             strip_accents=None,
                             lowercase=True,
                             preprocessor=None,
                             tokenizer=None,
                             analyzer='word',
                             stop_words=[],
                             token_pattern=token_pattern,
                             ngram_range=(1, 1),
                             max_df=1.0,
                             min_df=1,
                             max_features=None,
                             vocabulary=None,
                             binary=False,
                             dtype=np.float64,
                             norm='l2',
                             use_idf=True,
                             smooth_idf=True,
                             sublinear_tf=False)
     elif vectorizer == CountVectorizer.__name__:
         v = CountVectorizer(input='content',
                             encoding='utf-8',
                             decode_error='strict',
                             strip_accents=None,
                             lowercase=True,
                             preprocessor=None,
                             tokenizer=None,
                             stop_words=[],
                             token_pattern=token_pattern,
                             ngram_range=(1, 1),
                             analyzer='word',
                             max_df=1.0,
                             min_df=1,
                             max_features=None,
                             vocabulary=None,
                             binary=False,
                             dtype=np.int64)
     elif vectorizer == HashingVectorizer.__name__:
         v = HashingVectorizer(input='content',
                               encoding='utf-8',
                               decode_error='strict',
                               strip_accents=None,
                               lowercase=True,
                               preprocessor=None,
                               tokenizer=None,
                               stop_words=[],
                               token_pattern=token_pattern,
                               ngram_range=(1, 1),
                               analyzer='word',
                               n_features=1048576,
                               binary=False,
                               norm='l2',
                               alternate_sign=True,
                               non_negative=False,
                               dtype=np.float64)
     elif vectorizer == DocumentPoolEmbeddings.__name__:
         v = DocumentPoolEmbeddings(
             [BertEmbeddings('bert-base-multilingual-uncased')])
     else:
         raise ValueError("Invalid vectorizer: %s" % (vectorizer))
     return v
Example #39
def jieba_tokenize(text):
    return jieba.cut(text)


tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenize, lowercase=False)
'''
tokenizer: specifies the tokenization function
lowercase: converts all text to lowercase before tokenizing; since we are dealing
with Chinese text here, it is best set to False
'''
text_list = [
    "今天天气真好啊啊啊啊", "小明上了清华大学", "我今天拿到了Google的Offer", "清华大学在自然语言处理方面真厉害"
]

hv = HashingVectorizer(tokenizer=jieba_tokenize, n_features=10)
tt = hv.transform(text_list)

print(tt)

# the collection of texts to be clustered
tfidf_matrix = tfidf_vectorizer.fit_transform(text_list)

print(tfidf_vectorizer)

# print(tfidf_matrix)

terms = tfidf_vectorizer.get_feature_names()

print(terms)
print(len(terms))
import pandas as pd
import matplotlib.pyplot as plt
y_train = pd.read_table('F:\\WeiWeiHe\\data_y.txt',header=None)
y_train = np.array(y_train).ravel()
X_train = corpus
'''Get the total number of label classes in Y'''
true_k = np.unique(y_train).shape[0]
print y_train
X_test = X_train
Y_test = y_train


from sklearn.feature_extraction.text import TfidfVectorizer,HashingVectorizer,TfidfTransformer
from sklearn.pipeline import make_pipeline
hasher = HashingVectorizer(n_features=n_features, non_negative=True, binary=False)
'''Different parameters may be chosen depending on the situation;
1) build an option parser for the parameter choices'''
from optparse import OptionParser
import sys
op = OptionParser()
op.add_option("--lsa",dest="n_components", type="int",help="Preprocess documents with latent semantic analysis.")
op.add_option("--no-minibatch",action="store_false", dest="minibatch", default=True,
              help="Use ordinary k-means algorithm (in batch mode).")
op.add_option("--no-idf",action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing",action="store_true",default=False,help="using a hashing feature vectorizer")
op.add_option("--n-features",type=int,default=10000,help="Maximum number of features(dimensions to extract from text.)")
op.add_option("--verbose",action="store_true",dest="verbose",default=False,help="print report inside k-means")
op.print_help()
(opts,args) = op.parse_args()
Example #41
train = train[train.year != 1]
train = train.merge(artist, how='inner', on='artist_id')
test = test.merge(artist, how='inner', on='artist_id')
del artist

train.song_hotttnesss[train.song_hotttnesss > train.song_hotttnesss.mean()] = 1
train.song_hotttnesss[train.song_hotttnesss < train.song_hotttnesss.mean()] = 0

train.year = (train.year // 10) * 10
test.year = (test.year // 10) * 10

CategoricalFeatures = train[['artist_id', 'title', 'audio_md5']]

from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer

vectorizer = HashingVectorizer(n_features=750)
#vectorizer = TfidfVectorizer(min_df  = 0.0002)

TfidfVectorizerObject = vectorizer.fit(pd.concat([train.title, test.title]))
CountVectorizerTrainData = TfidfVectorizerObject.transform(train["title"])
CountVectorizerTestData = TfidfVectorizerObject.transform(test["title"])

DropFeatures = [
    'song_id', 'artist_id', 'title', 'audio_md5', 'analysis_sample_rate',
    'key_confidence', 'audio_md5', 'year', 'end_of_fade_in', 'duration',
    'time_signature_confidence', 'artist_latitude', 'artist_longitude'
]

trainSongId = train[['song_id']]
train = train.drop(DropFeatures, axis=1)
song_id = test['song_id']
Example #42
def text_vectorizer_hashing_vectorizer(plots):
    vectorizer = HashingVectorizer(n_features=1000)
    return vectorizer.transform(plots)
Example #43
sorted(zip(clf.coef_[0], feature_names), reverse=True)[:20]

### Most fake
sorted(zip(clf.coef_[0], feature_names))[:20]
# clearly there are certain words which might show political intent and source in the
# top fake features (such as the words corporate and establishment).

tokens_with_weights = sorted(list(zip(feature_names, clf.coef_[0])))
# print(tokens_with_weights)

# --------------------------------------------------------------
# HashingVectorizer : require less memory and are faster (because they are sparse and use hashes rather than tokens)
# --------------------------------------------------------------

hash_vectorizer = HashingVectorizer(stop_words='english', non_negative=True)
hash_train = hash_vectorizer.fit_transform(X_train)
hash_test = hash_vectorizer.transform(X_test)

# --------------------------------------------------------------
# Naive Bayes classifier for Multinomial model
# --------------------------------------------------------------

clf = MultinomialNB(alpha=.01)

clf.fit(hash_train, y_train)
pred = clf.predict(hash_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])
Example #44
class Classifier:
    """
    Wrapper class for model. Exposes two functions, train and predict.
    """
    _model: OneVsOneClassifier
    _modelTags: dict
    _labelDict: dict
    _Encoder: LabelEncoder
    _hashVect: HashingVectorizer
    modelName: str
    savePath: str

    _labelAll = ["negative", "neutral", "positive"]
    _stopwords = [
        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
        "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
        'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
        'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
        'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
        'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was',
        'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do',
        'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
        'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with',
        'about', 'against', 'between', 'into', 'through', 'during', 'before',
        'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out',
        'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
        'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both',
        'each', 'few', 'more', 'most', 'other', 'some', 'such', 'only', 'own',
        'same', 'so', 'than', 'too', 'very', 's', 'can', 'will', 'just', 'don',
        "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've',
        'y', 'ain', 'ma'
    ]
    _auxverbs = {"wo": "will", "sha": "shall"}

    def __init__(self, makeNewModel: bool, modelName: str = None):
        """
        Constructor for the class.\n
        :param makeNewModel: Set to true if making new model. False otherwise.
        :param modelName: If makeNewModel is True, this parameter will be ignored. Otherwise, loads a model with the name supplied.
        """
        if not makeNewModel and re.search("classifier_\\d{11}(\\.P)?",
                                          modelName) is None:
            makeNewModel = True

        if not makeNewModel:
            if not modelName.endswith(PICKLE_FILE_EXTENSION):
                modelName += PICKLE_FILE_EXTENSION
            self.modelName = modelName
            self.savePath = model_folder + self.modelName
            self._model, self._modelTags = self._loadModel(self.savePath)
        else:
            self.modelName = "classifier_" + getEpochIdentifier(
            ) + PICKLE_FILE_EXTENSION
            self.savePath = model_folder + self.modelName
            self._initUser()
        self._Encoder = LabelEncoder()
        self._Encoder.fit_transform(self._labelAll)
        self._labelDict = dict(
            zip(self._Encoder.classes_,
                self._Encoder.transform(self._Encoder.classes_)))
        self._hashVect = HashingVectorizer(decode_error='replace',
                                           n_features=2**20,
                                           alternate_sign=False)

        # initialize model for each sentiment by passing blank data
        for sents in self._labelAll:
            self.train(text='', sentiment=sents)

    def train(self,
              text: str or list,
              sentiment: str,
              tags=None,
              returnProcessedText=False,
              fromDB=False):
        """
        Wrapper method that trains the model on only a single data. Set fromDB = True if the data is from the database.

        :param text: A string or a list. List should only be used on data already preprocessed, i.e. from database.
        :param sentiment: The sentiment for the data. Accepted inputs are only 'positive', 'negative' or 'neutral'.
        :param returnProcessedText: Set to true to return the processed text. Use to save to database.
        :param fromDB: Set to true only if the data (list) is from the database.
        :return: The processed text as a list. Only returns if returnProcessedText = True.
        """
        if type(text) is str:
            text = self._preprocessText(text)
        elif type(text) is list:
            if fromDB:  # required for already processed text stored in dbs
                text = self._preprocessFromDB(text)
            text = self._getSeriesFromList(text)

        if type(tags) is str:
            tags = separateTags(tags)

        self._saveModel(self._train(text, sentiment, tags), self.savePath)

        if returnProcessedText:
            text = list(text)
            return text

    def predict(self, rawText: str):
        """
        Wrapper method that predicts a single piece of data.

        :param rawText: The raw (unprocessed) text to predict on.
        :return: The sentiment of the predicted text and the tags.
        """
        processedText = self._preprocessText(rawText)

        return self._predict(processedText)

    @staticmethod
    def _preprocessFromDB(processedText: str) -> list:
        """
        Wrapper method for when the data is from the database. Separates the string data into a list of words.

        :param processedText: The already-processed text from the database.
        :return: Returns a list to be trained.
        """
        return _separateTags(processedText)

    @staticmethod
    def _getSeriesFromList(processedTextList: list) -> pd.Series:
        """
        Gets a pandas series from a list of processed text.

        :param processedTextList: A list containing the processed text/words.
        :return: Returns a pandas series of the text.
        """
        return pd.Series(' '.join(processedTextList))

    def _preprocessText(self, pre_text):
        """
        Performs preprocessing on the text.

        :param pre_text: A unprocessed string to perform text on. Accepts tuple, list and str.
        :return: A pandas series of processed text.
        """
        if type(pre_text) is list or type(pre_text) is tuple:
            pre_text = ' '.join(pre_text)

        # remove HTMl tags, if any
        pre_text = self._strip_html(pre_text)

        # split into multiple sentences
        text_tuple = sent_tokenize(pre_text, language='english')

        # tokenize
        text_tuple = [
            word_tokenize(sentence, language='english')
            for sentence in text_tuple
        ]

        for sentence in text_tuple:
            for index, word in enumerate(sentence):
                sentence[index] = word.lower()
                if word == "n't":
                    sentence[index] = "not"
                    if sentence[index - 1] in self._auxverbs:
                        sentence[index - 1] = self._auxverbs[sentence[index -
                                                                      1]]

        # WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc.
        tag_map = defaultdict(lambda: wn.NOUN)
        tag_map['J'] = wn.ADJ
        tag_map['V'] = wn.VERB
        tag_map['R'] = wn.ADV

        # remove stop words + lemmatization
        filtered_final = []
        for index, entry in enumerate(text_tuple):
            word_Lemmatized = WordNetLemmatizer()
            for word, tag in pos_tag(entry):
                if word not in self._stopwords and word.isalpha():
                    word_Final = word_Lemmatized.lemmatize(
                        word, tag_map[tag[0]])
                    filtered_final.append(word_Final)

        # change all text to lower case
        filtered_final = [_entry.lower() for _entry in filtered_final]

        # concatenate into one data instead of multiple individual words
        _textSeries = pd.Series(' '.join(filtered_final))

        return _textSeries

    def _initUser(self):
        """
        Wrapper method to initialize a first time user. Creates a new model and saves the model.
        """
        self._model = self._makeNewClassifier()
        self._modelTags = dict()
        self._saveModel([self._model, self._modelTags], self.savePath)

    @staticmethod
    def _strip_html(text: str):
        """
        Strips HTMl tags from a string text.

        :param text: A raw unprocessed text.
        :return: The same text with HTMl tags stripped.
        """
        soup = BeautifulSoup(text, "html.parser")
        return soup.getText()

    @staticmethod
    def _makeNewClassifier():
        """
        Makes a new model using OneVsOne classification. Model is a Logistic Regression algorithm fitted with SGD.

        :return: A OneVsOneClassifier object.
        """
        return OneVsOneClassifier(
            SGDClassifier(loss='log',
                          penalty='l2',
                          max_iter=150,
                          learning_rate='optimal',
                          eta0=0.00,
                          alpha=1e-04))

    def _makeNewModelDict(self, tags):
        """
        Creates a dict and initializes into the modelTags using the tag name as the key and the corresponding model as the value.\n
        The model is a binary class where class [0] is the tag (positive class) and class [1] is not the tag (negative class).
        For N number of tags, this dict will have N number of key-value pairs. This dict is essentially multiple OneVsRestClassifiers in one list.\n
        :param tags: List of string tags.
        """
        for eachTag in tags:
            self._modelTags[eachTag] = MultinomialNB()

    @staticmethod
    def _saveModel(save_classifier, _filename):
        """
        Saves a model locally. The models are saved in the ./Models/ folder.

        :param save_classifier: The model/classifier to save.
        :param _filename: The filename to save as.
        """
        saveFile = open(_filename, 'wb')
        pickle.dump(save_classifier, saveFile)
        saveFile.close()

    @staticmethod
    def _loadModel(_filename):
        """
        Loads a model from ./Models/ folder.

        :param _filename: The filename to load the classifier from.
        :return: The loaded model/classifier.
        """
        loadFile = open(_filename, 'rb')
        classifier, modelTags = pickle.load(loadFile)
        loadFile.close()

        return classifier, modelTags

    def _train(self, _text, _sentiment, userTags):
        """
        The inner method for training the model. The model is trained using partial_fit function.

        :param _text: A preprocessed pandas series.
        :param _sentiment: The raw string sentiment, either 'positive', 'negative' or 'neutral'.
        :return: The trained model.
        """
        encSentiment = self._labelDict.get(_sentiment)

        X_new = self._hashVect.transform(_text)

        self._model.partial_fit(
            X_new, [encSentiment],
            self._Encoder.transform(self._Encoder.classes_))

        # if not training the tag model
        if userTags is None:
            return self._model, self._modelTags

        # check if modelTags contain any existing tag classes
        # if yes, check if user added any new tags
        if len(self._modelTags.keys()) != 0:
            newTags = list(set(userTags) - set(self._modelTags.keys()))

            # adds the new tag to the dict and add a new model if there are new tags
            if len(newTags) != 0:
                for eachNewTag in newTags:
                    self._modelTags[eachNewTag] = MultinomialNB()
        else:  # if modelTag doesn't contain any tag classes, initialize it
            for eachTag in userTags:
                self._modelTags[eachTag] = MultinomialNB()

        for eachTag in self._modelTags:
            if eachTag in userTags:
                self._modelTags[eachTag].partial_fit(X_new, [0], [0, 1])
            else:
                self._modelTags[eachTag].partial_fit(X_new, [1], [0, 1])

        return self._model, self._modelTags

    def _predict(self, _text):
        """
        The inner method for getting predictions from the model.

        :param _text: A preprocessed pandas series.
        :return: The string sentiment, either 'positive', 'negative' or 'neutral'.
        """
        X_new = self._hashVect.transform(_text)

        sentiment = getDictKey(self._labelDict, self._model.predict(X_new))
        retTags = list()

        for eachTag in self._modelTags:
            if self._modelTags[eachTag].predict(X_new) == [0]:
                retTags.append(eachTag)

        return sentiment, retTags
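
# A minimal, self-contained sketch of the per-tag scheme the class above implements.
# Tag names and texts are made up; label 0 means the document has the tag, 1 means it
# does not, and a non-negative hashing configuration is assumed so MultinomialNB
# accepts the features.
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import MultinomialNB

# alternate_sign=False keeps hashed features non-negative for MultinomialNB.
vect = HashingVectorizer(n_features=2**18, alternate_sign=False)
tag_models = {tag: MultinomialNB() for tag in ('billing', 'delivery')}

docs = ['late delivery and a damaged box', 'charged twice on my card']
doc_tags = [['delivery'], ['billing']]

for text, tags in zip(docs, doc_tags):
    X = vect.transform([text])
    for tag, model in tag_models.items():
        # 0 = document has this tag (positive class), 1 = it does not.
        model.partial_fit(X, [0] if tag in tags else [1], classes=[0, 1])

X_new = vect.transform(['my parcel never arrived'])
print([tag for tag, model in tag_models.items()
       if model.predict(X_new)[0] == 0])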
Example #45
0
from sklearn.datasets import fetch_20newsgroups

use_hashing = False
use_idf = False
minibatch = True
verbose = False
n_features = 10000
n_components = 10
true_k = 10

if use_hashing:
    if use_idf:
        hasher = HashingVectorizer(n_features=n_features,
                                   stop_words='english',
                                   alternate_sign=False,
                                   norm=None,
                                   binary=False)
        vectorizer = make_pipeline(hasher, TfidfTransformer())
    else:
        vectorizer = HashingVectorizer(n_features=n_features,
                                       stop_words='english',
                                       alternate_sign=False,
                                       norm='l2',
                                       binary=False)
else:
    vectorizer = TfidfVectorizer(max_df=0.5,
                                 max_features=n_features,
                                 min_df=2,
                                 stop_words='english',
                                 use_idf=use_idf)
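
# The snippet above stops after choosing a vectorizer. The following is a hedged
# sketch of the clustering step that the true_k / minibatch flags point at; the
# small 20 newsgroups sample is assumed purely for illustration and is not part
# of the original example.
import numpy as np
from sklearn.cluster import KMeans, MiniBatchKMeans

dataset = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
X = vectorizer.fit_transform(dataset.data[:2000])

if minibatch:
    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++',
                         n_init=1, batch_size=1000, verbose=verbose)
else:
    km = KMeans(n_clusters=true_k, init='k-means++',
                n_init=1, verbose=verbose)

km.fit(X)
print("cluster sizes:", np.bincount(km.labels_))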
Example #46
0
 def __init__(self, chunksize=100000, **kwargs):
     self.chunksize = chunksize
     HashingVectorizer.__init__(
         self, **kwargs)
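
# The original class body is not shown beyond __init__. Below is a purely
# illustrative sketch of what a chunk-wise transform built on this idea could
# look like; the class and method names are assumptions, not the original API.
import itertools

import scipy.sparse as sp
from sklearn.feature_extraction.text import HashingVectorizer


class ChunkedHashingVectorizer(HashingVectorizer):
    """Illustrative sketch: hash documents a fixed-size chunk at a time."""

    def __init__(self, chunksize=100000, **kwargs):
        self.chunksize = chunksize
        HashingVectorizer.__init__(self, **kwargs)

    def transform_chunked(self, docs):
        # Consume `docs` lazily, hashing `chunksize` documents at a time,
        # then stack the per-chunk sparse matrices vertically.
        docs = iter(docs)
        parts = []
        while True:
            chunk = list(itertools.islice(docs, self.chunksize))
            if not chunk:
                break
            parts.append(self.transform(chunk))
        if not parts:
            return sp.csr_matrix((0, self.n_features))
        return sp.vstack(parts, format='csr')


# Example: three documents processed two at a time.
cv = ChunkedHashingVectorizer(chunksize=2, n_features=2**10)
print(cv.transform_chunked(['apple banana', 'banana cherry', 'cherry date']).shape)  # (3, 1024)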
Example #47
0
    X = np.array(["numpy", "scipy", "sklearn"])
    vectorizer = TfidfVectorizer(dtype=vectorizer_dtype)

    warning_msg_match = "'dtype' should be used."
    warning_cls = UserWarning
    expected_warning_cls = warning_cls if warning_expected else None
    with pytest.warns(expected_warning_cls, match=warning_msg_match) as record:
        X_idf = vectorizer.fit_transform(X)
    if expected_warning_cls is None:
        relevant_warnings = [w for w in record if isinstance(w.message, warning_cls)]
        assert len(relevant_warnings) == 0
    assert X_idf.dtype == output_dtype


@pytest.mark.parametrize("vec", [
    HashingVectorizer(ngram_range=(2, 1)),
    CountVectorizer(ngram_range=(2, 1)),
    TfidfVectorizer(ngram_range=(2, 1))
])
def test_vectorizers_invalid_ngram_range(vec):
    # vectorizers could be initialized with invalid ngram range
    # test for raising error message
    invalid_range = vec.ngram_range
    message = ("Invalid value for ngram_range=%s "
               "lower boundary larger than the upper boundary." %
               str(invalid_range))
    if isinstance(vec, HashingVectorizer):
        pytest.xfail(reason='HashingVectorizer not supported on PyPy')

    assert_raise_message(ValueError, message, vec.fit, ["good news everyone"])
    assert_raise_message(ValueError, message, vec.fit_transform,
                         ["good news everyone"])
Example #48
0
next(stream_docs(path='./movie_data.csv'))
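
# stream_docs is called here but not defined in this fragment. A minimal sketch
# consistent with these calls might look like the following; it assumes
# movie_data.csv has a header row and rows of the form (review_text, integer_label).
import csv

def stream_docs(path):
    # Yield one (text, label) pair at a time so the full CSV never has to
    # sit in memory.
    with open(path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for row in reader:
            yield row[0], int(row[1])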

def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y


vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')

pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])

for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()
Example #49
0
        )),
        ('scale', MaxAbsScaler()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])


TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'
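
# For reference: the (?=\s+) lookahead only keeps alphanumeric runs that are
# followed by whitespace, so a trailing token is dropped. A quick check with a
# hypothetical sample string:
import re

print(re.findall(TOKENS_ALPHANUMERIC, 'math books grade 3'))
# ['math', 'books', 'grade'] -- '3' is dropped because nothing follows it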

# Import the hashing vectorizer
from sklearn.feature_extraction.text import HashingVectorizer

p2 = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                     non_negative=True, norm=None, binary=False,
                                                     ngram_range=(1,2))),
                    ('dim_red', SelectKBest(chi2, chi_k))
                ]))
             ]
        )),
        ('int', SparseInteractions(degree=2)),
        ('scale', MaxAbsScaler()),
        ('clf', XGBClassifier())
    ])
Example #50
0
from sklearn.feature_extraction.text import HashingVectorizer

corpus = [
    "Preach lol! :) RT @mention: #alliwantis this type of weather all the time.. I live for beautiful days like this! #minneapolis",
    "@mention good morning sunshine", "rhode island",
    "RT @mention: I absolutely love thunderstorms!",
    "@mention right this weather is something else",
    "TOP CHOICE --&gt; {link} - Today is awesome!!! Free comic books, lunch with my mama, sunshine & DJ'n ... (via @mention)",
    "CCAk Trail Update: Archangel Road, Mat-Su - 8:00 PM, Thu May 05, 2011: Snow column beginning to break up especia...  {link}"
]

counts = [[3, 0, 1], [2, 0, 0], [3, 0, 0], [4, 0, 0], [3, 2, 0], [3, 0, 2]]

# count vectorizer
vectorizer = CountVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus)
X.toarray()

# tfidf transformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(counts)
tfidf.toarray()

# combination: TfidfVectorizer is equivalent to CountVectorizer followed by TfidfTransformer
vectorizer = TfidfVectorizer(min_df=1)
X = vectorizer.fit_transform(corpus)
X.toarray()

# hashing vectorizer: saves time and memory (no vocabulary is fitted or stored)
hv = HashingVectorizer()
hv.transform(corpus)
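
# A small illustrative check of why hashing saves memory: the output width is
# fixed by n_features rather than by the corpus, and no vocabulary is learned.
from sklearn.feature_extraction.text import CountVectorizer

X_hashed = hv.transform(corpus)
print(X_hashed.shape)               # width = n_features (2**20 by default), independent of the corpus
print(hasattr(hv, 'vocabulary_'))   # False: nothing was fitted, so nothing has to be kept in memory

X_counted = CountVectorizer(min_df=1).fit_transform(corpus)
print(X_counted.shape)              # width grows with the number of distinct terms in the corpus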
Example #51
0
#!/usr/bin/python
#coding:utf-8
import sys
from sklearn.svm import LinearSVC
#from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import HashingVectorizer
if sys.version_info[0]>=3: raw_input=input
transformer=HashingVectorizer(stop_words='english')

_train=[]
train_label=[]
f=open('trainingdata.txt')
for i in range(int(f.readline())):
	s=f.readline().rstrip()
	idx=s.find(' ')
	_train.append(s[idx+1:])
	train_label.append(int(s[:idx]))
f.close()
train = transformer.fit_transform(_train)
svm=LinearSVC()
svm.fit(train,train_label)

_test=[]
for i in range(int(raw_input())):
	s=raw_input().rstrip()
	_test.append(s)
test = transformer.transform(_test)
test_label=svm.predict(test)
for e in test_label: print(e)
Example #52
0
def run(keyn, nPart):
    all_classes = np.array([0, 1])
    allKeys = [l.split()[0] for l in open('keywordsAll.txt').readlines()]
    keyFreqs = [
        float(l.split()[1]) / 4205907
        for l in open('keywordsAll.txt').readlines()
    ]
    key = allKeys[keyn]
    freq = keyFreqs[keyn]

    opt = 'body+title+code'
    bv = 'True'
    nneg = 'True'
    nv = 'None'
    #testopt = 'c'
    #testopt = 'w'
    #testopt = 'l2'
    testopt = 'l1'

    if testopt == 'c':
        cls = SGDClassifier(loss='hinge',
                            learning_rate="constant",
                            alpha=1e-6,
                            eta0=1e-2,
                            penalty='l2')
    elif testopt == 'w':
        cls = SGDClassifier(class_weight={1: 1.0 / freq / 8.0, 0: 1})
    elif testopt == 'l2':
        cls = SGDClassifier(loss='log', alpha=1e-5, penalty='l2')
    elif testopt == 'l1':
        cls = SGDClassifier(loss='log', alpha=1e-5, penalty='l1')

    outputName = 'key_' + str(
        keyn) + '_SGDtune_' + opt + '_partialfit_' + testopt + '.txt'
    pklName = 'SGD_key_' + str(keyn) + '_' + testopt + '.pkl'
    n0, ntrain = resumeJob(outputName, pklName)

    body_test, y_test = getTestSet(10, key, opt, testSize=0.2, seed=123)
    tot_pos = sum(y_test)
    vectorizer = HashingVectorizer(decode_error='ignore',
                                   n_features=2**20,
                                   token_pattern=r"\b\w[\w#+.-]*(?<!\.$)",
                                   binary=str2bool(bv),
                                   norm=normOpt(nv),
                                   non_negative=str2bool(nneg))

    X_test = vectorizer.transform(body_test)
    #print 'test case:', len(y_test), 'positive', tot_pos, 'key:', key, 'X norm:', X_test.sum(), 'binary:', bv, 'norm:', nv, 'nneg:', nneg
    if n0 >= 2:
        cls = joblib.load(pklName)
    for n in xrange(n0, 10):
        outfile = open(outputName, 'a')
        data = json.load(gzip.open('Train.rdup.' + str(n) + '.json.gz'))
        minibatch_size = len(data) / nPart + 1
        for i in xrange(nPart):
            n1 = i * minibatch_size
            n2 = (i + 1) * minibatch_size
            if i == nPart - 1:
                n2 = len(data)
            ntrain += (n2 - n1)
            body_train, y_train = getMiniBatch(data, n1, n2, key, opt)
            X_train = vectorizer.transform(body_train)
            shuffledRange = range(n2 - n1)
            for n_iter in xrange(5):
                X_train, y_train = shuffle(X_train, y_train)
            cls.partial_fit(X_train, y_train, classes=all_classes)
            y_pred = cls.predict(X_test)
            f1 = metrics.f1_score(y_test, y_pred)
            p = metrics.precision_score(y_test, y_pred)
            r = metrics.recall_score(y_test, y_pred)
            accu = cls.score(X_train, y_train)
            y_pred = cls.predict(X_train)
            f1t = metrics.f1_score(y_train, y_pred)
            outfile.write(
                "%3d %8d %.4f %.3f %.3f %.3f %.3f %5d  %5d\n" %
                (n, ntrain, accu, f1t, f1, p, r, sum(y_pred), tot_pos))
        _ = joblib.dump(cls, pklName, compress=9)
        outfile.close()
Example #53
0
import re
import os
import pickle

from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import HashingVectorizer

cur_dir = os.path.dirname(__file__)
stop = pickle.load(
    open(os.path.join(cur_dir, 'pkl_objects', 'stopwords.pkl'), 'rb'))


def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text


def tokenizer_porter(text):
    porter = PorterStemmer()
    line = [porter.stem(word) for word in text.split()]
    return line


vect_optimized = HashingVectorizer(decode_error='ignore',
                                   norm=None,
                                   n_features=2**21,
                                   preprocessor=preprocessor,
                                   stop_words=stop,
                                   tokenizer=tokenizer_porter)
Example #54
0
print("%d documents - %0.3fMB (training set)" %
      (len(data_train.data), data_train_size_mb))
print("%d documents - %0.3fMB (test set)" %
      (len(data_test.data), data_test_size_mb))

print("%d categories" % len(categories))
print()
# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    vectorizer = HashingVectorizer(stop_words='english',
                                   non_negative=True,
                                   n_features=opts.n_features)
    X_train = vectorizer.transform(data_train.data)
else:
    vectorizer = CountVectorizer(ngram_range=(2, 2),
                                 max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
Example #55
0
sdf = spark.createDataFrame(pdf)
cols = [F.col(str(c)) for c in sdf.columns]

# apply predict UDFs and select prediction output
prediction_df = (sdf.withColumn("scores", predict_proba(*cols)).withColumn(
    "preds", predict(*cols)).select("preds", "scores"))
prediction_df.show()

# single text feature
data = fetch_20newsgroups(shuffle=True,
                          random_state=1,
                          remove=("headers", "footers", "quotes"))
X = data["data"][:100]
y = data["target"][:100]
model = Pipeline([
    ("vec", HashingVectorizer()),
    ("clf", LogisticRegression(solver="liblinear", multi_class="auto")),
])
model.fit(X, y)

# get UDFs with 'text' feature types
predict = get_prediction_udf(model, method="predict", feature_type="text")
predict_proba = get_prediction_udf(model,
                                   method="predict_proba",
                                   feature_type="text")

# create PySpark DataFrame from features
pdf = pd.DataFrame(X)
sdf = spark.createDataFrame(pdf)
cols = [F.col(str(c)) for c in sdf.columns]
Example #56
0
clf = RandomForestClassifier(n_estimators=50)
clf = clf.fit(train_data_features[0:8301], topics_has_earn_word[0:8301])
print "CountVecorizer BoW encoding \n" \
      "80% train data, 20% test data \n" \
      "Using 50 trees in RandomForestClassifier \n" \
      "Execution time: " , datetime.now() - old_time

score = clf.score(train_data_features[8301:], topics_has_earn_word[8301:])
print "Score:", score * 100, "\n"

# ------------------ BoW using feature hashing -----------------

# from http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html#sphx-glr-auto-examples-text-document-classification-20newsgroups-py

vectorizer = HashingVectorizer(stop_words='english',
                               non_negative=True,
                               n_features=1000)
X_train = vectorizer.transform(data_clean_train)

old_time = datetime.now()

clf = RandomForestClassifier(n_estimators=50)
clf.fit(X_train[0:8301], topics_has_earn_word[0:8301])
print "HashingVecorizer BoW encoding \n" \
      "80% train data, 20% test data \n" \
      "Using 50 trees in RandomForestClassifier \n" \
      "Execution time: " , datetime.now() - old_time

score = clf.score(X_train[8301:], topics_has_earn_word[8301:])

print "Score:", score * 100
Example #57
0
print("%d documents - %0.3fMB (training set)" %
      (len(data_train.data), data_train_size_mb))
print("%d documents - %0.3fMB (test set)" %
      (len(data_test.data), data_test_size_mb))
print("%d categories" % len(target_names))
print()

# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    vectorizer = HashingVectorizer(stop_words='english',
                                   alternate_sign=False,
                                   n_features=opts.n_features)
    X_train = vectorizer.transform(data_train.data)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
Example #58
0
    parser = ReutersParser()
    for filename in glob(os.path.join(data_path, "*.sgm")):
        for doc in parser.parse(open(filename, 'rb')):
            yield doc


###############################################################################
# Main
# ----
#
# Create the vectorizer and limit the number of features to a reasonable
# maximum

vectorizer = HashingVectorizer(decode_error='ignore',
                               n_features=2**18,
                               alternate_sign=False)

# Iterator over parsed Reuters SGML files.
data_stream = stream_reuters_documents()

# We learn a binary classification between the "acq" class and all the others.
# "acq" was chosen as it is more or less evenly distributed in the Reuters
# files. For other datasets, one should take care of creating a test set with
# a realistic portion of positive instances.
all_classes = np.array([0, 1])
positive_class = 'acq'
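
# A hedged sketch of how the streamed documents could be turned into hashed
# minibatches for the binary 'acq' task. It assumes, as in scikit-learn's
# out-of-core example, that each parsed document is a dict with 'title',
# 'body' and 'topics' keys; the helper name below is an assumption.
from itertools import islice

def get_minibatch(doc_iter, size, pos_class=positive_class):
    # Pull up to `size` parsed documents and build (X, y) for the binary
    # task: y = 1 if the document carries the positive topic, else 0.
    data = [('{title}\n\n{body}'.format(**doc), pos_class in doc['topics'])
            for doc in islice(doc_iter, size) if doc.get('topics')]
    if not data:
        return None, None
    texts, labels = zip(*data)
    return vectorizer.transform(texts), np.asarray(labels, dtype=int)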

# Here are some classifiers that support the `partial_fit` method
partial_fit_classifiers = {
    'SGD': SGDClassifier(),
Example #59
0
 def func():
     hv = HashingVectorizer()
     hv.fit_transform(['hello world', np.nan, 'hello hello'])
Example #60
0
	"WordBag",
]

data_sizes= [40000, 80000, 160000, 320000, 640000, 1280000]

for task in tasks:
	for data_size in data_sizes:
		texts_chunk = texts[:data_size]
		print("Task:", task, "Data size:", data_size)
		for backend in backends:
			batcher = Batcher(procs=16, minibatch_size=5000, backend=backend[0], backend_handle=backend[1])
			#try:
			with timer("Completed: ["+task+","+str(len(texts_chunk))+","+backend[0]+"]"), warnings.catch_warnings():
				warnings.simplefilter("ignore")
				if task=="ApplyBatch":
					hv = HashingVectorizer(decode_error='ignore', n_features=2 ** 25, preprocessor=normalize_text,
										   ngram_range=(1, 2), norm='l2')
					t= ApplyBatch(hv.transform, batcher=batcher).transform(texts_chunk)
					print(t.shape, t.data[:5])

				if task=="WordBag":
					wb = WordBatch(normalize_text=normalize_text,
					               dictionary=Dictionary(min_df=10, max_words=1000000, verbose=0),
					               tokenizer= Tokenizer(spellcor_count=2, spellcor_dist=2, stemmer= stemmer),
					               extractor=WordBag(hash_ngrams=0, norm= 'l2', tf= 'binary', idf= 50.0),
					               batcher= batcher,
					               verbose= 0)
					t = wb.fit_transform(texts_chunk)
					print(t.shape, t.data[:5])
			# except:
			# 	print("Failed ["+task+","+str(len(texts_chunk))+","+backend[0]+"]")
		print("")