def run_online_classifier():
    vect = HashingVectorizer(
        decode_error='ignore',
        n_features=2**21,
        preprocessor=None,
        tokenizer=tokenizer_streaming,
    )
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)

    csv_filename = os.path.join('datasets', 'movie_data.csv')
    doc_stream = stream_docs(path=csv_filename)

    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if X_train is None:
            break
        else:
            X_train = vect.transform(X_train)
            clf.partial_fit(X_train, y_train, classes=classes)

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print("Test accuracy: %.3f" % clf.score(X_test, y_test))

    clf = clf.partial_fit(X_test, y_test)
Example #2
def do_training():
    global X_train, X_test, feature_names, ch2
    print("Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train_data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.25,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train_data)
    duration = time() - t0
    #print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test_data)
    duration = time() - t0
    #print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()

    if True:#opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" % 20000)
        t0 = time()
        ch2 = SelectKBest(chi2, k=20000)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [feature_names[i] for i
                             in ch2.get_support(indices=True)]
        print("done in %fs" % (time() - t0))
        print()
    
    if feature_names:
        feature_names = np.asarray(feature_names)

    results = []

    #for penalty in ["l2", "l1"]:
    penalty = 'l2'
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    clf = LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)
    results.append(benchmark(clf))
        
    joblib.dump(vectorizer, 'vectorizer.pkl', compress=9)
    joblib.dump(ch2, 'feature_selector.pkl', compress=9)
    joblib.dump(clf, 'linearsvc_classifier.pkl', compress=9)
Example #3
File: cluster.py Project: aolieman/xtas
def big_kmeans(docs, k, batch_size=1000, n_features=(2 ** 20),
               single_pass=True):
    """k-means for very large sets of documents.

    See kmeans for documentation. Differs from that function in that it does
    not compute tf-idf or LSA, and fetches the documents in a streaming
    fashion, so they don't need to be held in memory. It does not do random
    restarts.

    If the option single_pass is set to False, the documents are visited
    twice: once to fit a k-means model, once to determine their label in
    this model.
    """
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.text import HashingVectorizer

    v = HashingVectorizer(input="content", n_features=n_features, norm="l2")
    km = MiniBatchKMeans(n_clusters=k)

    labels = []
    for batch in batches(docs, batch_size):
        batch = map(fetch, batch)
        batch = v.transform(batch)
        y = km.fit_predict(batch)
        if single_pass:
            labels.extend(y.tolist())

    if not single_pass:
        for batch in batches(docs, batch_size):
            batch = map(fetch, batch)
            batch = v.transform(batch)
            labels.extend(km.predict(batch).tolist())

    return labels
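# A minimal usage sketch for big_kmeans above, assuming stand-in versions of the
# `batches` and `fetch` helpers (in xtas these live elsewhere in the project and
# fetch documents from a store); here the documents are plain in-memory strings.
def fetch(doc):
    # documents are already raw text in this sketch
    return doc

def batches(iterable, size):
    # yield consecutive slices of `size` items
    items = list(iterable)
    for start in range(0, len(items), size):
        yield items[start:start + size]

example_docs = ["the cat sat on the mat", "dogs chase cats and mice",
                "stock markets fell sharply today", "investors sold shares",
                "the dog barked at the cat", "bond yields rose again"]
print(big_kmeans(example_docs, k=2, batch_size=3))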
def train():
    vect = HashingVectorizer(decode_error='ignore',
                             n_features=2**21,
                             preprocessor=None,
                             ngram_range=(1, 3),
                             tokenizer=tokenizer)
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
    stream_path = os.path.join(work_path, 'movie_data.csv')
    doc_stream = stream_docs(path=stream_path)

    pbar = pyprind.ProgBar(45)
    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if not X_train:
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
        pbar.update()

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print('Accuracy: %.3f' % clf.score(X_test, y_test))

    clf = clf.partial_fit(X_test, y_test)

    return clf
Example #5
File: tfidf.py Project: fallingleaf/rsweb
def tfidf_classify(user):
    train_set, y, src, test_set = extract_data(user.id)
    if not train_set:
        return []
    # Analyse using tf-idf
    # vector = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
    vector = HashingVectorizer(n_features=1000, non_negative=True, stop_words='english')
    # List of topic extracted from text
    # feature_names = vector.get_feature_names()
    # print feature_names
    xtrain = vector.transform(train_set)
    xtest = vector.transform(test_set)

    # Select sample using chi-square
    ch2 = SelectKBest(chi2)
    xtrain = ch2.fit_transform(xtrain, y)
    xtest = ch2.transform(xtest)

    # Predict testing set
    # classifier = DecisionTreeClassifier()
    classifier = KNeighborsClassifier(n_neighbors=4)
    classifier = classifier.fit(xtrain, y)
    result = classifier.predict(xtest)
    final = []
    for i in xrange(len(result)):
        if result[i]:
            final.append(src[i])
    print len(final)
    return final
Example #6
def big_kmeans(docs, k, batch_size=1000, n_features=(2 ** 20),
               single_pass=True):
    """k-means for very large sets of documents.

    """
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.text import HashingVectorizer

    v = HashingVectorizer(input="content", n_features=n_features, norm="l2")
    km = MiniBatchKMeans(n_clusters=k)

    labels = []
    for batch in batches(docs, batch_size):
        batch = map(fetch, batch)
        batch = v.transform(batch)
        y = km.fit_predict(batch)
        if single_pass:
            labels.extend(y.tolist())

    if not single_pass:
        for batch in batches(docs, batch_size):
            batch = map(fetch, batch)
            batch = v.transform(batch)
            labels.extend(km.predict(batch).tolist())

    return labels
def trainOnModel(x_VariableList, y_VariableList, testSetList, classifier, hashing=False, chi_squared=False):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.feature_selection import SelectKBest, chi2
    from sklearn.linear_model import RidgeClassifier
    from sklearn.svm import LinearSVC
    from sklearn.linear_model import SGDClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import PassiveAggressiveClassifier
    from sklearn.utils.extmath import density
    y_train = y_VariableList
    if hashing == True:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=2 ** 16)
        X_train = vectorizer.transform(x_VariableList)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(x_VariableList)

    X_test = vectorizer.transform(testSetList)

    if chi_squared == True:
        print("Extracting best features by a chi-squared test")
        ch2 = SelectKBest(chi2, k=2 * 16)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)

    classifierObject = ""
    print "Using :", classifier

    if classifier == "LinearSVC":
        classifierObject = LinearSVC(penalty='l2', dual=False, tol=1e-3)

    elif classifier == "PassiveAggressiveClassifier":
        classifierObject = PassiveAggressiveClassifier(C=1.0, fit_intercept=True, loss='hinge',
                                                       n_iter=50, n_jobs=1, random_state=None, shuffle=True,
                                                       verbose=0, warm_start=False)

    elif classifier == "RidgeClassifier":
        classifierObject = RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                                           max_iter=None, normalize=False, solver='lsqr', tol=0.01)

    elif classifier == "Perceptron":
        classifierObject = Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
                                      n_iter=50, n_jobs=1, penalty=None, random_state=0, shuffle=True,
                                      verbose=0, warm_start=False)

    elif classifier == "SGDClassifier":
        classifierObject = SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
                                         eta0=0.0, fit_intercept=True, l1_ratio=0.15,
                                         learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1,
                                         penalty='l2', power_t=0.5, random_state=None, shuffle=True,
                                         verbose=0, warm_start=False)

    classifierObject.fit(X_train, y_train)
    pred = classifierObject.predict(X_test)
    return pred[0]
Example #8
class ReviewTrainer(TrainerModel):
	def __init__(self):
		pass

	# gets rid of stopwords
	def preprocess(self, l):
		res = {}
		sw = stopwords.words('english')
		clean = ' '.join([w for w in l['text'].split() if w not in sw])
		res[l['review_id']] = {'text' : clean, 'label' : l['votes']['useful']}
		return res

	#the labels are already given for this review
	def group_labels(self, fname):
		pass

	#vectorizes data and selects K best feats.
	def prepare_data(self, x, y):
		self.hv = HashingVectorizer(strip_accents='ascii', non_negative=True)
		self.feats = self.hv.transform(x)
		self.labels = np.array(y)
		
		self.ch2 = SelectKBest(chi2, k=K_FEAT)
		self.feats = self.ch2.fit_transform(self.feats, self.labels)
		
	def get_error(self, pred, y):
		return super(ReviewTrainer, self).get_error(pred,y)
	
	#optimizes for hyper-parameter alpha
	def _cross_validate(self):
		grid = dict(alpha=10.0 ** np.arange(-4,1))
		return super(ReviewTrainer, self)._cross_validate_base(
			Ridge(), grid) 
	
	#builds examples to feed trainer
	#MUST RUN BEFORE train
	def build_examples(self, data, labels=None):
		feats = []
		labels = []
		ex = {}
		for k,v in data.items():
			feats.append(v['text'])
			labels.append(v['label'])
		ex['feats'] = feats
		ex['labels'] = labels
		return ex

	#fits model using optimal parameters
	def train(self):
		self.clf = self._cross_validate()
		self.clf.fit(self.feats, self.labels)

	#predicts Y given X
	def predict(self, data):
		data = self.hv.transform(data)
		data = self.ch2.transform(data)
		pred = self.clf.predict(data)
		return pred			
Example #9
def vectorize(concepts):
    """
    This vectorizes a list or a string of concepts;
    the regular `vectorize` method is meant to vectorize text documents;
    it is trained for that kind of data and thus is inappropriate for concepts.
    So instead we just use a simple hashing vectorizer.
    """
    h = HashingVectorizer(input='content', stop_words='english', norm=None, tokenizer=Tokenizer())
    if type(concepts) is str:
        # Extract and return the vector for the single document.
        return h.transform([concepts]).toarray()[0]
    else:
        return h.transform(concepts)
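# To make the str-versus-list behaviour above concrete, a self-contained sketch
# using scikit-learn's default tokenizer instead of the project's Tokenizer
# (an assumption for illustration only):
from sklearn.feature_extraction.text import HashingVectorizer

h = HashingVectorizer(input='content', stop_words='english', norm=None)
single = h.transform(["machine learning"]).toarray()[0]    # dense 1-D vector
batch = h.transform(["machine learning", "graph theory"])  # sparse matrix
print(single.shape, batch.shape)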
Example #10
File: learn.py Project: blagarde/rssadio
class App(object):
    def __init__(self):
        self.hv = HashingVectorizer(norm=None, non_negative=True)
        if os.path.isfile(MODEL):
            self.clf = load(MODEL)
        else:
            self.clf = linear_model.SGDClassifier(warm_start=True)
            zeros = self.vector('seed')
            self.clf.partial_fit(zeros, np.unique([GOOD]), classes=(GOOD, BAD))

    def feed(self):
        for doc in stories:
            self.pred_X = self.vector(doc)
            self.pred_y = self.clf.predict(self.pred_X)
            if self.pred_y == GOOD:
                yield doc

    def save_model(self):
        dump(self.clf, MODEL)

    def train(self, doc, y):
        X = self.vector(doc)
        self.clf.partial_fit(X, [y])

    def vector(self, doc):
        clean = preprocess(doc)
        return self.hv.transform([clean])

    def score(self, y):
        print self.clf.score(self.pred_X, [y])
Example #11
class FeatureExtractor(object):

    def __init__(self, csv_filename, batch_size = 1000):
        self.vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18, non_negative=True)
        self.train_pd = pandas.read_csv(csv_filename)
        self.index = 0
        self.batch_size = batch_size

    def nextBatch(self):
        """
        Return a generator for a batch_size number of train (X, y) pairs
        """
        train_length = len(self.train_pd)
        while self.index < train_length:
            end_index = min(self.index + self.batch_size, train_length)
            print "Reading ", self.index, ": ", end_index
            X_train = list()
            y_train = list()

            for i in range(self.index, end_index):
                filename = "data/" + self.train_pd["file"][i]
                text = open(filename, "rb").readlines()
                X_train.append("\n".join(text))
                y_train.append(int(self.train_pd["sponsored"][i]))
            self.index = end_index
            yield (self.vectorizer.transform(X_train), y_train)

    def dumpBatch(self, batch, filename):
        with open(filename, "wb") as f:
            cPickle.dump(batch, f)

    def dump(self, filename):
        with open(filename, "wb") as f:
            cPickle.dump(self, f)
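# A usage sketch for FeatureExtractor.nextBatch above, assuming a hypothetical
# train.csv with 'file' and 'sponsored' columns and the referenced files under
# data/; each hashed batch is fed to an out-of-core learner via partial_fit.
from sklearn.linear_model import SGDClassifier

extractor = FeatureExtractor("train.csv", batch_size=1000)  # hypothetical path
sgd = SGDClassifier(loss='log', random_state=1)
for X_batch, y_batch in extractor.nextBatch():
    sgd.partial_fit(X_batch, y_batch, classes=[0, 1])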
Example #12
File: ooc.py Project: audy/bfc
def main():
    '''
        >>> main() # stuff happens
    '''

    args = parse_args()
    setup_logging(args.log, verbose=args.verbose)

    chunks = sequence_chunk_generator(args.fasta_file,
                                      chunk_size=args.chunk_size)

    hasher = HashingVectorizer(analyzer='char',
                               n_features = 2 ** 18,
                               ngram_range=(args.ngram_min, args.ngram_max),
                               )

    estimator = AffinityPropagation()

    for chunk in chunks:

        logging.info('hashing chunk')
        chunk_vector = hasher.transform([ str(i.seq) for i in chunk ])

        logging.info('clustering')

        estimator.fit(chunk_vector)

        logging.info('got %s clusters' % len(set(estimator.labels_)))
Example #13
def sim_char5(text1, text2):
    vect = HashingVectorizer(analyzer='word', tokenizer=normalize, stop_words='english')
    texts = [text1, text2]
    matrix = vect.transform(texts)
    cosine_similarities = linear_kernel(matrix[0:1], matrix).flatten()
    simmax = max(cosine_similarities[1:])
    return simmax
Example #14
    def test_same_output(self):
        X, X_rdd = self.make_text_rdd()
        local = HashingVectorizer()
        dist = SparkHashingVectorizer()

        result_local = local.transform(X).toarray()
        result_dist = dist.transform(X_rdd).toarray()
        assert_array_equal(result_local, result_dist)
    def test_same_output(self):
        X, X_rdd = self.generate_text_dataset()
        local = HashingVectorizer()
        dist = SparkHashingVectorizer()

        result_local = local.transform(X)
        result_dist = sp.vstack(dist.transform(X_rdd).collect())
        assert_array_equal(result_local.toarray(), result_dist.toarray())
Example #16
File: sentiment.py Project: jannson/crfseg
def predict(line, tagger):
    tok_cn = lambda (x): crfseg.cut_zh(x, tagger)

    hasher = HashingVectorizer(n_features=2**16,
                               tokenizer=tok_cn, non_negative=True,
                               norm=None, binary=False)
    x_test = hasher.transform([line])
    return clf_global.predict_proba(x_test)
Example #17
File: __init__.py Project: keho98/argos
def vectorize(docs):
    """
    Vectorizes a list of documents.

    Args:
        | docs (list)       -- the documents to vectorize.
        | docs (str)        -- a single document to vectorize.

    Returns:
        | scipy sparse matrix (CSR/Compressed Sparse Row format)
    """
    h = HashingVectorizer(input='content', stop_words='english', norm=None, tokenizer=Tokenizer())

    if type(docs) is str:
        # Extract and return the vector for the single document.
        return h.transform([docs]).toarray()[0]
    else:
        return h.transform(docs)
class Featurizer:
    def __init__(self):
        self.vectorizer = HashingVectorizer(stop_words="english")

    def train_feature(self, examples):
        return self.vectorizer.fit_transform(examples)

    def test_feature(self, examples):
        return self.vectorizer.transform(examples)
Example #19
def main(output=RESULTS):
    # change ROOT ID in config.py to your computer's path so that it writes to the correct file
    # loads the data and puts it in the desired numpy format
    movies = load_balanced_movies(MOVIES_DATA, False) # True is for debugging
    data = pd.DataFrame(movies)
    pd.options.mode.chained_assignment = None  # default='warn' ignore
    summaries = data[['summary']]
    summaries['summary'] = summaries['summary'].str.replace('[^\w\s]','').str.lower()  ## cleans out punctuation and other non-word characters
    Y = np.array(data[['year']])
    Y = np.ravel(Y)
    X = np.array(summaries['summary'])

    # standard CountVectorizer for bag of words
    # vectorizer = CountVectorizer()
    # X = vectorizer.fit_transform(X)

    # print "Old Shape Dim" 
    # print X.shape 

    # uses random projections to reduce dimensionality
    # transformer = random_projection.SparseRandomProjection()
    # X_new = transformer.fit_transform(X)
    # print "New Shape Dim"
    # print X_new.shape 

    # perform vectorization and dim reduction using Hashing Vectorizer (counts # of times a word appears)
    vectorizer = HashingVectorizer(stop_words='english', n_features=80000)  # uses 80,000 word instances as k
    X = vectorizer.transform(X)

    # instantiate scaling of data for preprocessing
    X = StandardScaler(with_mean=False).fit_transform(X)

    # splits training and test data equally
    xtrain, xtest, ytrain, ytest = train_test_split(X, Y)

    names = ["SGDClassifier", "Linear SVC", "SVC Kernel RBF", "PerceptronL1", "PerceptronL2", "Nearest Neighbors", "Ridge Classifier"] # 
    classifiers = [
        SGDClassifier(loss="hinge", penalty="l2"),
        LinearSVC(),
        SVC(kernel="rbf"),
        Perceptron(penalty='l1'),
        Perceptron(penalty='l2', n_iter=25),
        KNeighborsClassifier(),
        RidgeClassifier(),
        ]

    print "Calculating accuracies"
    # fits chosen classifier on training data
    for name, clf in zip(names, classifiers):
        print name
        clf.fit(xtrain, ytrain)
        print "Accuracy: %0.2f%%" % (100 * clf.score(xtest, ytest)) # Predict and score accuracy

        with open(output, "a+") as outputFile:  # write results to file 
            score = 100 * clf.score(xtest, ytest) 
            outputFile.write("Ran classifier {}    ".format(name) + '\n'
            " Achieved accuracy {}   ".format(score) )
Example #20
class svm_text(SVC):
#    svm_ = SVC(C=500, kernel='poly', gamma=.01, shrinking=True, probability=False, degree= 10, coef0=2,
#        tol=0.001, cache_size=20000, class_weight=None, verbose=False, max_iter=-1)
    def __init__(self, train_data, C=5, kernel='poly', gamma=.001, degree=10, coef0=2, n_features=10000000,
                 ngram_range=(1, 10), tfidf=False, dfrange=(2, 1.0), probability=False, class_weight=None):
        self.conn = None
        self.is_tfidf = tfidf
        if tfidf:
            self.vectorizer = TfidfVectorizer(stop_words=None, min_df=dfrange[0], max_df=dfrange[1],
                                              max_features=n_features, strip_accents='unicode',
                                              ngram_range=ngram_range, analyzer='word', norm='l2')
        else:
            self.vectorizer = HashingVectorizer(stop_words=None, non_negative=True,
                                                n_features=n_features, strip_accents='unicode',
                                                ngram_range=ngram_range, analyzer='word', norm='l2')
        self.param_set = {'C': str(C), 'kernel': str(kernel), 'gamma': str(gamma),
                          'degree': str(degree), 'coef0': str(coef0), 'n_features': str(n_features)}
        if class_weight == 'auto':
            class_weight = {}
            for item in train_data.target:
                if class_weight.get(item):
                    class_weight.update({item: class_weight[item] + 1.0})
                else:
                    class_weight.update({item: 1.0})
            for key in class_weight:
                class_weight.update({key: 1.0 / class_weight[key]})
        self.class_weight_dict = class_weight
        super(svm_text, self).__init__(C=C, kernel=kernel, gamma=gamma, shrinking=True, probability=probability, degree=degree, coef0=coef0,
                                       tol=0.001, cache_size=20000, class_weight=class_weight, verbose=False, max_iter=-1)
        if self.is_tfidf:
            train_x = self.vectorizer.fit_transform(train_data.data)
        else:
            train_x = self.vectorizer.transform(train_data.data)
        self.fit(train_x, train_data.target)
    def test_data(self, test_data):
        test_x = self.vectorizer.transform(test_data.data)
        predicted_values = self.predict(test_x)
        test_y = test_data.target
        self.score = metrics.f1_score(test_y, predicted_values)
        self.accuracy = metrics.accuracy_score(test_y, predicted_values)
    def guess_text(self, text_text):
        text_x = self.vectorizer.transform([pre_proc(text_text, removestop=False, alwayskeep=True, word_punc=True, unquote=True),])
        return self.predict(text_x)
def extractFeatures():

    print("Extracting features from the training dataset using a sparse vectorizer")

    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', 
                                       non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, 
                                     max_df=0.5,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train.data)
    
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()


    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()


    if opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" %
              opts.select_chi2)
        t0 = time()
        ch2 = SelectKBest(chi2, k=opts.select_chi2)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        print("done in %fs" % (time() - t0))
        print()

    return X_train, X_test
Example #22
def feature_extraction(feature, target_name, df):
    vect = HashingVectorizer(decode_error='ignore', ngram_range=(1,2), n_features = 2**18, binary=True, norm="l2")
    le = preprocessing.LabelEncoder()
    # for multiple features replace this with http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html
    df[feature] = df[feature].fillna('')
    titles = vect.transform(df[feature])

    X = titles
    #y = le.fit_transform(df[target_name])
    y = df[target_name]
    return X, y
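# Following the comment above about handling multiple features, a minimal sketch
# of one way to hash several text columns at once; it uses ColumnTransformer
# rather than the linked FeatureUnion recipe, and the 'title'/'body' column
# names are hypothetical placeholders.
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import HashingVectorizer

multi_vect = ColumnTransformer([
    ('title', HashingVectorizer(ngram_range=(1, 2), n_features=2**18), 'title'),
    ('body', HashingVectorizer(ngram_range=(1, 2), n_features=2**18), 'body'),
])
# X_multi = multi_vect.fit_transform(df[['title', 'body']].fillna(''))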
Example #23
def test_hashed_binary_occurrences():
    # by default multiple occurrences are counted as longs
    test_data = ["aaabc", "abbde"]
    vect = HashingVectorizer(analyzer="char", non_negative=True, norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X[0:1].data), 3)
    assert_equal(np.max(X[1:2].data), 2)
    assert_equal(X.dtype, np.float64)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = HashingVectorizer(analyzer="char", non_negative=True, binary=True, norm=None)
    X = vect.transform(test_data)
    assert_equal(np.max(X.data), 1)
    assert_equal(X.dtype, np.float64)

    # check the ability to change the dtype
    vect = HashingVectorizer(analyzer="char", non_negative=True, binary=True, norm=None, dtype=np.float64)
    X = vect.transform(test_data)
    assert_equal(X.dtype, np.float64)
Example #24
def tfidfVectorizeData(listOfSentences, useHashTable=False, nFeatures=100):
    
    if useHashTable:
        from sklearn.feature_extraction.text import HashingVectorizer
        vec = HashingVectorizer(stop_words='english', non_negative=True, n_features=nFeatures)
        X_noProcess = vec.transform(listOfSentences).toarray()
    else:
        from sklearn.feature_extraction.text import TfidfVectorizer
        vec = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
        X_noProcess = vec.fit_transform(listOfSentences).toarray()

    return vec, X_noProcess
Example #25
class svm_multi_label_text(OneVsRestClassifier):
#    svm_ = SVC(C=500, kernel='poly', gamma=.01, shrinking=True, probability=False, degree= 10, coef0=2,
#        tol=0.001, cache_size=20000, class_weight=None, verbose=False, max_iter=-1)
    def __init__(self, train_data,  C=None, n_features=10000000, loss='l2', penalty='l1',
                 ngram_range=(1, 10), tfidf=False, dfrange=(2, 1.0), dual=True, tol=1e-4):
        self.conn = None
        self.is_tfidf = tfidf
        if tfidf:
            self.vectorizer = TfidfVectorizer(stop_words=None, min_df=dfrange[0], max_df=dfrange[1],
                                              max_features=n_features, strip_accents='unicode',
                                              ngram_range=ngram_range, analyzer='word')
        else:
            self.vectorizer = HashingVectorizer(stop_words=None, non_negative=True,
                                                n_features=n_features, strip_accents='unicode',
                                                ngram_range=ngram_range, analyzer='word')
        self.param_set = {'C': str(), 'kernel': str(), 'gamma': str(),
                          'degree': str(), 'coef0': str(), 'n_features': str(n_features)}
        super(svm_multi_label_text, self).__init__(LinearSVC(C=C, loss=loss, penalty=penalty,
                                                             dual=(False if penalty == 'l1' else dual), tol=tol))
        if self.is_tfidf:
            train_x = self.vectorizer.fit_transform(train_data.data)
        else:
            train_x = self.vectorizer.transform(train_data.data)
        train_y = train_data.target
        self.fit(train_x, train_y)
    def test_data(self, test_data):
        test_x = self.vectorizer.transform(test_data.data)
        predicted_values = self.predict(test_x)
        test_y = test_data.target
        try:
            self.score = metrics.f1_score(test_y, predicted_values)
        except ZeroDivisionError:
            self.score = -0.1
        try:
            self.accuracy = metrics.accuracy_score(test_y, predicted_values)
        except ZeroDivisionError:
            self.accuracy = -0.1
    def guess_text(self, text_text):
        text_x = self.vectorizer.transform([pre_proc(text_text, removestop=False, alwayskeep=True, word_punc=True, unquote=True),])
        return self.predict(text_x)
class TwitterSentiment:
    def __init__(self):
        self.vec = HashingVectorizer(stop_words=stopwords.words("english"), non_negative=True)
        self.pp = PreProcessor(full_pp=True)
        self.cls = None

    def predict(self, text):
        '''predict an emoticon for any string given by text by using a trained classifier'''
        return self.predict_all([text])[0]

    def predict_all(self, seq):
        '''predict all emoticons for a list of strings by using a trained classifier'''
        return self.cls.predict(self.vec.transform(map(self.pp.process_tweet, seq)))
 def trainFeatureExtract(self, opts, trainData, trainDataSize):
     print 'Extracting features from the training dataset using a sparse vectorizer'
     t0 = time()
     if opts.use_hashing:
         vectorizer = HashingVectorizer(stop_words='english', non_negative=True, n_features=opts.n_features)
         dataTrain = vectorizer.transform(trainData.data)
     else:
         vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
         dataTrain = vectorizer.fit_transform(trainData.data)
     duration = time() - t0
     print 'done in %fs at %0.3fMB/s' % (duration, trainDataSize / duration)
     print 'n_samples: %d, n_features: %d' % dataTrain.shape
     print 
     return dataTrain, vectorizer
Example #28
    def __wordhash_features(self, data, vect=None, num_features=3000):
        '''
        extracts word ngram features from the provided data
        '''
        if vect is None:
            vect = HashingVectorizer(n_features=num_features,
                                    analyzer="word", stop_words='english',
                                    strip_accents='unicode',
                                    ngram_range=(1, 4))
            vect.fit(data)

        features = vect.transform(data)

        return features, vect
Example #29
File: test.py Project: calippo/hackerrank
def main():
  with open("trainingdata.txt","r") as f:
    int(f.readline())
    training_set = [r.split(" ") for r in f]
  y = [doc[0] for doc in training_set]
  corpus = [reduce(lambda x, y: x + " " + y, doc[1::]) for doc in training_set]
  N = len(corpus) // 2
  X_train = corpus[:N]
  data = corpus[N:]
  y_train = y[:N]
  y_test = y[N:]
  vectorizer = HashingVectorizer(non_negative=True, analyzer='word')
  X_train = vectorizer.transform(X_train)
  data_test = vectorizer.transform(data)
  y_train = np.array(y_train)
  y_test = np.array(y_test)

  # Run classifier
  classifier = SVC(kernel='linear', probability=True, random_state=0)  # LinearSVC has no predict_proba; SVC is needed here
  probas_ = classifier.fit(X_train, y_train).predict_proba(data_test)

  # Compute ROC curve and area the curve
  fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
  roc_auc = auc(fpr, tpr)
  print("Area under the ROC curve : %f" % roc_auc)

  # Plot ROC curve
  pl.clf()
  pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
  pl.plot([0, 1], [0, 1], 'k--')
  pl.xlim([0.0, 1.0])
  pl.ylim([0.0, 1.0])
  pl.xlabel('False Positive Rate')
  pl.ylabel('True Positive Rate')
  pl.title('Receiver operating characteristic example')
  pl.legend(loc="lower right")
  pl.show()
Example #30
def test_hashing_vectorizer():
    v = HashingVectorizer()
    X = v.transform(ALL_FOOD_DOCS)
    token_nnz = X.nnz
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # By default the hashed values receive a random sign and l2 normalization
    # makes the feature values bounded
    assert_true(np.min(X.data) > -1)
    assert_true(np.min(X.data) < 0)
    assert_true(np.max(X.data) > 0)
    assert_true(np.max(X.data) < 1)

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[i].data, 2), 1.0)

    # Check vectorization with some non-default parameters
    v = HashingVectorizer(ngram_range=(1, 2), non_negative=True, norm='l1')
    X = v.transform(ALL_FOOD_DOCS)
    assert_equal(X.shape, (len(ALL_FOOD_DOCS), v.n_features))
    assert_equal(X.dtype, v.dtype)

    # ngrams generate more non zeros
    ngrams_nnz = X.nnz
    assert_true(ngrams_nnz > token_nnz)
    assert_true(ngrams_nnz < 2 * token_nnz)

    # makes the feature values bounded
    assert_true(np.min(X.data) > 0)
    assert_true(np.max(X.data) < 1)

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[i].data, 1), 1.0)
Example #31
def model_train():
    sys.path.insert(0, '../notebooks/')

    from helper import load_data, token
    from datetime import datetime
    import humanfriendly
    import pandas as pd
    import numpy as np

    from sklearn.externals import joblib
    from sklearn.model_selection import train_test_split
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.linear_model import SGDClassifier

    stops = joblib.load('../stops.pkl')
    df = load_data('../sentiment_data', balanced=True)

    hash_para = dict(decode_error='ignore',
                     n_features=2**18,
                     tokenizer=token,
                     stop_words=stops,
                     ngram_range=(1, 3),
                     alternate_sign=False)
    clf_prep = HashingVectorizer(**hash_para)

    clf = SGDClassifier(loss='log', random_state=1, max_iter=1)

    u = datetime.now()
    clf.partial_fit(clf_prep.transform(df['features']),
                    df['y'],
                    classes=np.unique(df['y']))
    v = datetime.now()
    delta = v - u
    print('Training took: {}'.format(
        humanfriendly.format_timespan(delta.seconds)))

    joblib.dump(clf_prep, '../HashVectorizer.pkl')
    joblib.dump(clf, '../SGDclassifier.pkl')
Example #32
def main():
    vec = HashingVectorizer(tokenizer=preprocess,
                            ngram_range=(3, 3),
                            analyzer='word')
    clu = Birch(n_clusters=3)
    #clu = MiniBatchKMeans(n_clusters=2)

    config = configparser.ConfigParser()
    config.read('cfg.ini')
    config = config['DEFAULT']
    api = twitter.Api(consumer_key=config['consumer_key'],
                      consumer_secret=config['consumer_secret'],
                      access_token_key=config['access_token_key'],
                      access_token_secret=config['access_token_secret'])
    queue = deque(maxlen=50)
    for n, line in enumerate(
            api.GetStreamFilter(track=[
                'pokemon', 'dark souls', 'darksouls', 'sonic', 'hedgehog'
            ],
                                languages=['en'])):
        if n > 1000000:
            break
        elif len(queue) != 50:
            try:
                queue.append(line['text'])
                logging.warning("%s", line['text'])
            except KeyError:
                pass
        else:
            try:
                v = vec.transform(queue)
                clu = clu.partial_fit(v)
                logging.warning('TESTING\n.\n.\n.\n.')
                logging.warning("%s, %s, %s", n, clu.predict(v[-1]), queue[-1])
            except KeyError:
                pass
            queue.clear()

    pickle.dump(clu, open('cluster_model.pkl', 'w'))
Example #33
    def CreateRpeFeature(self, look, test=False, verbose=False):
        if not test:
            vectorizer = HashingVectorizer(n_features=2**8, ngram_range=(1, 2))
            vectorizer.fit(self.fulldata_words['rpe'].values)
            self.rpe_vectorizer = vectorizer

        def create_rpe_features(g):
            rpe = g[((g["word_num"] - g["target_word_num"]).abs() <= look)
                    & ~(g["word_num"] == g["target_word_num"])]['rpe'].values
            return " ".join(rpe)

        rpe_sentences = self.fulldata_words.groupby("sentence_num").apply(
            create_rpe_features)

        if test:
            return rpe_sentences.apply(lambda x: pd.Series(
                data=self.rpe_vectorizer.transform([x]).toarray()[0],
                index=[f"rpe_hash_{k}" for k in range(vectorizer.n_features)]))
        else:
            return rpe_sentences.apply(lambda x: pd.Series(
                data=vectorizer.transform([x]).toarray()[0],
                index=[f"rpe_hash_{k}" for k in range(vectorizer.n_features)]))
Example #34
def batch_train(clf,
                fnames,
                labels,
                iterations=25,
                batchsize=1000,
                random_seed=1):
    vec = HashingVectorizer(encoding='latin-1')
    idx = np.arange(labels.shape[0])
    c_clf = clone(clf)
    rng = np.random.RandomState(seed=random_seed)

    for i in range(iterations):
        rnd_idx = rng.choice(idx, size=batchsize)
        documents = []
        for i in rnd_idx:
            with open(fnames[i], 'r', encoding='latin-1') as f:
                documents.append(f.read())
        X_batch = vec.transform(documents)
        batch_labels = labels[rnd_idx]
        c_clf.partial_fit(X=X_batch, y=batch_labels, classes=[0, 1])

    return c_clf
Example #35
def batch_train(clf, fnames, labels, iterations=1,
                batchsize=1000, random_seed=1):
    vec = HashingVectorizer(encoding='latin-1')
    idx = np.arange(labels.shape[0])
    c_clf = clone(clf)
    rng = np.random.RandomState(seed=random_seed)
    shuffled_idx = rng.permutation(range(len(fnames)))
    fnames_ary = np.asarray(fnames)

    for _ in range(iterations):
        for batch in np.split(shuffled_idx, len(fnames) // 1000):
            documents = []
            for fn in fnames_ary[batch]:
                with open(fn, 'r') as f:
                    documents.append(f.read())
            X_batch = vec.transform(documents)
            batch_labels = labels[batch]
            c_clf.partial_fit(X=X_batch,
                              y=batch_labels,
                              classes=[0, 1])

    return c_clf
Example #36
def _vectorize_chunk(dsid_dir, k, pars, pretend=False):
    """ Extract features on a chunk of files """
    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.externals import joblib

    filenames = pars['filenames_abs']
    chunk_size = pars['chunk_size']
    n_samples = pars['n_samples']

    mslice = slice(k*chunk_size, min((k+1)*chunk_size, n_samples))

    hash_opts = {key: vals for key, vals in pars.items()
                 if key in ['stop_words', 'n_features',
                            'analyser', 'ngram_range']}
    hash_opts['alternate_sign'] = False
    fe = HashingVectorizer(input='content', norm=None, **hash_opts)
    if pretend:
        return fe
    fset_new = fe.transform(_read_file(fname) for fname in filenames[mslice])

    fset_new.eliminate_zeros()

    joblib.dump(fset_new, str(dsid_dir / 'features-{:05}'.format(k)))
def get_kmeans_prototypes(X, n_prototypes, hashing_dim=128,
                          ngram_range=(2, 4), sparse=False,
                          sample_weight=None, random_state=None):
    """
    Computes prototypes based on:
      - dimensionality reduction (via hashing n-grams)
      - k-means clustering
      - nearest neighbor
    """
    vectorizer = HashingVectorizer(analyzer='char', norm=None,
                                   alternate_sign=False,
                                   ngram_range=ngram_range,
                                   n_features=hashing_dim)
    projected = vectorizer.transform(X)
    if not sparse:
        projected = projected.toarray()
    kmeans = KMeans(n_clusters=n_prototypes, random_state=random_state)
    kmeans.fit(projected, sample_weight=sample_weight)
    centers = kmeans.cluster_centers_
    neighbors = NearestNeighbors()
    neighbors.fit(projected)
    indexes_prototypes = np.unique(neighbors.kneighbors(centers, 1)[-1])
    return np.sort(X[indexes_prototypes])
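# A short usage sketch for get_kmeans_prototypes above on toy category strings,
# assuming numpy, KMeans, NearestNeighbors and HashingVectorizer are imported in
# this module as the function expects.
import numpy as np

job_titles = np.array(['manager', 'senior manager', 'engineer',
                       'software engineer', 'accountant', 'senior accountant'])
print(get_kmeans_prototypes(job_titles, n_prototypes=3, random_state=0))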
Example #38
def test_explain_hashing_vectorizer(newsgroups_train_binary):
    # test that we can pass InvertableHashingVectorizer explicitly
    vec = HashingVectorizer(n_features=1000)
    ivec = InvertableHashingVectorizer(vec)
    clf = LogisticRegression(random_state=42)
    docs, y, target_names = newsgroups_train_binary
    ivec.fit([docs[0]])
    X = vec.fit_transform(docs)
    clf.fit(X, y)

    get_res = lambda **kwargs: explain_prediction(
        clf, docs[0], vec=ivec, target_names=target_names, top=20, **kwargs)
    res = get_res()
    check_explain_linear_binary(res, clf)
    assert res == get_res()
    res_vectorized = explain_prediction(
        clf, vec.transform([docs[0]])[0], vec=ivec, target_names=target_names,
        top=20, vectorized=True)
    pprint(res_vectorized)
    assert res_vectorized == _without_weighted_spans(res)

    assert res == get_res(
        feature_names=ivec.get_feature_names(always_signed=False))
Example #39
def feature_engineering(raw_data, output_file="features.csv", feature_num=250):
    print("Input song data from the lyrics_datafile...")
    with open(raw_data, 'rb') as f:
        reader = csv.reader(f)
        data_list = list(reader)

    data_list = np.array(data_list)
    lyrics = data_list[1:, 7]
    tag = data_list[1:, 5]

    print("Processing the input lyrics...")
    hv = HashingVectorizer(n_features=feature_num)
    trans = hv.transform(lyrics)
    # convert to dense matrix
    dense = trans.todense()
    dense = dense.tolist()
    for i in range(len(dense)):
        dense[i].append(tag[i])
    print("Saving feature results...")
    with open(output_file, "wb") as f:
        writer = csv.writer(f)
        writer.writerows(dense)
    print("-----Feature engineering DONE-----")
Example #40
def prepareTrainData():
    # preparing the data

    data_examples = filterDataWithNoEngDesc(
        getTokenizeCleanData(mypath, filename, trainpagename))
    y_examples = data_examples['segment']
    data_examples.data = data_examples['desc_tokens']
    data_examples_size_kb = size_kb(data_examples)

    print("%d documents - %0.3fKB (examples set)" %
          (len(data_examples.data), data_examples_size_kb))

    argv = ["--report"]
    op = getOptionParser()
    opts = getOpts(op, argv)
    print(
        "Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english',
                                       alternate_sign=False,
                                       n_features=opts.n_features)
        X_examples = vectorizer.transform(data_examples.data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     max_df=0.5,
                                     stop_words='english')
        X_examples = vectorizer.fit_transform(data_examples.data)
    duration = time() - t0
    print("done in %fs at %0.3fkB/s" %
          (duration, data_examples_size_kb / duration))
    print("n_samples: %d, n_features: %d" % X_examples.shape)
    print()

    # print(type(X_examples))
    # print(type(X_examples.todense()))
    return [X_examples, y_examples, vectorizer]
Example #41
def _hashing_trick(x_train,
                   x_test,
                   n_features,
                   binary=True,
                   ngram_range=(1, 1)):
    df_train = pd.DataFrame(x_train.astype('str'))
    df_test = pd.DataFrame(x_test.astype('str'))

    for col_i in range(df_train.shape[1]):
        df_train.iloc[:, col_i] = '{}='.format(col_i) + df_train.iloc[:, col_i]
        df_test.iloc[:, col_i] = '{}='.format(col_i) + df_test.iloc[:, col_i]

    texts_train = df_train.apply(lambda row: ' '.join(row), axis=1).values
    texts_test = df_test.apply(lambda row: ' '.join(row), axis=1).values

    hv = HashingVectorizer(n_features=n_features,
                           binary=binary,
                           ngram_range=ngram_range)
    hashed_train = hv.fit_transform(texts_train)
    hashed_test = hv.transform(texts_test)

    hashed_train, hashed_test = np.array(hashed_train.todense()), np.array(
        hashed_test.todense())
    return hashed_train, hashed_test
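# A toy usage sketch for _hashing_trick above: two small categorical arrays are
# turned into 'column=value' style texts and hashed into a shared feature space
# (tiny shapes chosen purely for illustration; numpy and pandas are assumed
# imported in this module, as the helper already requires).
import numpy as np

x_tr = np.array([['red', 'small'], ['blue', 'large'], ['red', 'large']])
x_te = np.array([['blue', 'small']])
hashed_tr, hashed_te = _hashing_trick(x_tr, x_te, n_features=16)
print(hashed_tr.shape, hashed_te.shape)  # (3, 16) (1, 16)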
def model_fit(input_song, model, feature_num=150):
    print("Process the input song...")
    with open(input_song, 'rb') as f:
        fa = LyricsToSentences(f.read())
    delete_table = "\xc3\xa2\xc2\x80\xc2\x99"
    lyrics = "".join(c for c in fa if c not in delete_table)
    lyrics = [lyrics]

    hv = HashingVectorizer(n_features=feature_num)
    trans = hv.transform(lyrics)
    # convert to dense matrix
    dense = trans.todense()
    dense = dense.tolist()
    res = []
    print("Running Model...")
    rank = zip(model.classes_, model.predict_proba(dense)[0])
    print("-----Prediction Done-----")
    print("")
    print("The prediction results are:")
    for i in range(3):
        tag, prob = sorted(rank, key=lambda x: -x[1])[i]
        print tag
        res.append(tag)
    return res
Example #43
def model_education():
    data_train = pandas.read_csv('storage_1/data_base_semantica.csv',
                                 header=None)
    gen_text = read_row(data_train)
    data_to_learn = clean_text(get_minibatch_1(gen_text, size))
    k = 0
    cls_list = list()
    while list(data_to_learn.index):
        vectorize = HashingVectorizer(decode_error='ignore', n_features=2**21)
        classifier = SGDClassifier(loss='log',
                                   warm_start=True,
                                   n_jobs=-1,
                                   max_iter=5)
        #        cls_list.append(classifier.fit(vectorize.transform(data_to_learn[1]), data_to_learn[0]))
        classifier.fit(vectorize.transform(data_to_learn[1]), data_to_learn[0])
        _ = joblib.dump(classifier, str(k), compress=9)
        cls_list.append(str(k))
        k += size
        print('Rows trained:', k)
        try:
            data_to_learn = clean_text(get_minibatch_1(gen_text, size))
        except TypeError:
            break
    return cls_list, _
Example #44
        except StopIteration:
            break
    return text, label


from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# converting texts to an occurrence matrix
vect = HashingVectorizer(n_features=2**21,
                         decode_error='ignore',
                         preprocessor=None,
                         tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs('movie-data.csv')

import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    x_train, y_train = get_minibatch(doc_stream, 1000)
    if not x_train:
        break
    x_train = vect.transform(x_train)
    clf.partial_fit(x_train, y_train, classes=classes)
    pbar.update()
x_test, y_test = get_minibatch(doc_stream, 5000)
x_test = vect.transform(x_test)
print('accuracy: ', clf.score(x_test, y_test))
y_pred = clf.predict_proba(vect.transform(np.array(['it was too long'])))
    X_text, y = get_minibatch(doc_iter, minibatch_size)
    while len(X_text):
        yield X_text, y
        X_text, y = get_minibatch(doc_iter, minibatch_size)


# test data statistics
test_stats = {"n_test": 0, "n_test_pos": 0}

# First we hold out a number of examples to estimate accuracy
n_test_documents = 1000
tick = time.time()
X_test_text, y_test = get_minibatch(data_stream, 1000)
parsing_time = time.time() - tick
tick = time.time()
X_test = vectorizer.transform(X_test_text)
vectorizing_time = time.time() - tick
test_stats["n_test"] += len(y_test)
test_stats["n_test_pos"] += sum(y_test)
print("Test set is %d documents (%d positive)" % (len(y_test), sum(y_test)))


def progress(cls_name, stats):
    """Report progress information, return a string."""
    duration = time.time() - stats["t0"]
    s = "%20s classifier : \t" % cls_name
    s += "%(n_train)6d train docs (%(n_train_pos)6d positive) " % stats
    s += "%(n_test)6d test docs (%(n_test_pos)6d positive) " % test_stats
    s += "accuracy: %(accuracy).3f " % stats
    s += "in %.2fs (%5d docs/s)" % (duration, stats["n_train"] / duration)
    return s
Example #46
print("Training :: %d documents - " % (len(data_train.data)))
print("Testing :: %d documents - " % (len(data_test.data)))
print("%d categories" % len(categories))

# ## split a training set and a test set
y_train, y_test = data_train.target, data_test.target

print(
    "For training : Extracting features from the training data using a sparse vectorizer"
)
t0 = time()
if False:  # or opts.use_hashing:
    vectorizer = HashingVectorizer(stop_words='english',
                                   non_negative=True,
                                   n_features=opts.n_features)
    X_train = vectorizer.transform(data_train.data)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.5,
                                 ngram_range=tuple([1, 3]),
                                 stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
duration = time() - t0
print("done in %fs" % (duration))  #, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print(
    "For testing : Extracting features from the test data using the same vectorizer"
)
t0 = time()
Example #47
df_validation.USER_IP = df_validation.USER_IP.apply(create_ip_string)
validation_data += df_validation.USER_IP
#processing left features
validation_data += 'revision_id=' + df_validation.REVISION_SESSION_ID + ' '\
 + 'country_code=' + df_validation.USER_COUNTRY_CODE + ' '\
 + 'continent_code=' + df_validation.USER_CONTINENT_CODE + ' '\
 + 'region_code=' + df_validation.USER_REGION_CODE + ' '\
 + 'city_name=' + df_validation.USER_CITY_NAME + ' '\
 + 'county_name=' + df_validation.USER_COUNTY_NAME

#feature data

vec = HashingVectorizer(token_pattern="\\S+",n_features=10000000, norm=None,\
 binary=True, dtype=np.uint16, lowercase=False)

X_train = vec.transform(train_data)
X_validation = vec.transform(validation_data)

#label data
df_train.ROLLBACK_REVERTED = df_train.ROLLBACK_REVERTED.replace(['F'], 0)
df_train.ROLLBACK_REVERTED = df_train.ROLLBACK_REVERTED.replace(['T'], 1)
y_train = df_train.ROLLBACK_REVERTED.values

df_validation.ROLLBACK_REVERTED = df_validation.ROLLBACK_REVERTED.replace(
    ['F'], 0)
df_validation.ROLLBACK_REVERTED = df_validation.ROLLBACK_REVERTED.replace(
    ['T'], 1)
y_validation = df_validation.ROLLBACK_REVERTED.values

t = time()
#train
Example #48
ngram = HashingVectorizer(strip_accents='unicode', binary=True, ngram_range=(1,4), stop_words=None, lowercase=True,  tokenizer=tokenizer.tokenize, n_features=10000) #N-gram feature vectorizer
character_gram = HashingVectorizer(strip_accents='unicode', binary=True, ngram_range=(4,5), stop_words=None, lowercase=True, analyzer='char', tokenizer=tokenizer.tokenize, n_features=22000) #Char-gram feature vectorizer

n_power = float(sys.argv[1]) #parameter of the n_power transformation, I used 0.9 for submission

#Linguistic, POS, sentiment disctionaries etc.
pos1, pos_features1, different_pos_tags1, pos_text1 = get_pos_tags_and_hashtags(tweetText+tweetTest) #Get POS of everything
pos, pos_features, different_pos_tags, pos_text =  pos1[:len(categories)], pos_features1[:len(categories)], different_pos_tags1, pos_text1[:len(categories)] #Split train-test again
pos_test, pos_features_test, different_pos_tags_test, pos_text_test = pos1[len(categories):], pos_features1[len(categories):], different_pos_tags1, pos_text1[len(categories):] #Split train-test again

ngram_features = ngram.fit_transform(tweetText) #Get n-gram features
character_gram_features = character_gram.fit_transform(tweetText) #Get char-gram features
ngram_features.data **= n_power #a-power transformation
character_gram_features.data **= n_power #a-power transformation

ngram_features_test = ngram.transform(tweetTest)
character_gram_features_test = character_gram.transform(tweetTest)
ngram_features_test.data **= n_power
character_gram_features_test.data **= n_power

x_train, y_train = createDataMatrix(ngram_features, character_gram_features, tweetText, pos, pos_features, different_pos_tags, pos_text, voca_clusters, categories) #Combine all  features (train)
x_test, y_test = createDataMatrix(ngram_features_test, character_gram_features_test, tweetTest, pos_test, pos_features_test, different_pos_tags_test, pos_text_test, voca_clusters, categories_test)# Combine feat test


print "SVMs crammer singer"
for c in np.logspace(-3,4,8): #used 100 for submission
    clf = svm.LinearSVC(C=c, loss='squared_hinge', penalty='l2', class_weight='balanced', multi_class='crammer_singer', max_iter=4000, dual=True, tol=1e-6)
    clf.fit(x_train, y_train)
    print "Hold-out",  showMyKLD(y_test, clf.predict(x_test), yo), c

print(vectorizer.vocabulary_)

# Transform text to vector
vector = vectorizer.transform(text)
print(vector.shape)
print(type(vector))
print(vector.toarray())

# As we can see here, only the word 'can' occurs two times
print(vector)
"""TF-IDF"""
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_tdidf = TfidfVectorizer()
vectorizer_tdidf.fit(text)

print(vectorizer_tdidf.vocabulary_)
print(vectorizer_tdidf.idf_)

vector_tdidf = vectorizer_tdidf.transform([text[0]])

print(vector_tdidf.shape)
print(vector_tdidf.toarray())
"""Hashing Vectoring"""
from sklearn.feature_extraction.text import HashingVectorizer

vectorizer_hash = HashingVectorizer(n_features=20)
vector_hash = vectorizer_hash.transform(text)

print(vector_hash.shape)
print(vector_hash.toarray())
Example #50
__author__ = 'pratapdangeti'

# from sklearn.feature_extraction.text import CountVectorizer
# corpus = ['The dog ate a sandwich, the wizard transfigured a sandwich, and I ate a sandwich']
# vectorizer = CountVectorizer(stop_words='english')
# print(vectorizer.fit_transform(corpus).todense())
# print(vectorizer.vocabulary_)

# TF-IDF

# from sklearn.feature_extraction.text import TfidfVectorizer
# corpus = [
#     'The dog ate a sandwich and I ate a sandwich',
#     'The wizard transfigured a sandwich'
# ]
#
# vectorizer = TfidfVectorizer(stop_words='english')
# print(vectorizer.fit_transform(corpus).todense())

#Using hashing trick

from sklearn.feature_extraction.text import HashingVectorizer
corpus = ['the', 'ate', 'bacon', 'cat']
vectorizer = HashingVectorizer(n_features=6)
print(vectorizer.transform(corpus).todense())
Example #51
import json,sys
from sklearn.svm import LinearSVC
#from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import HashingVectorizer
if sys.version_info[0]>=3: raw_input=input
transformer=HashingVectorizer(stop_words='english')

_train=[]
train_label=[]
f=open('training.json')
for i in range(int(f.readline())):
	h=json.loads(f.readline())
	_train.append(h['question']+"\r\n"+h['excerpt'])
	train_label.append(h['topic'])
f.close()
train = transformer.fit_transform(_train)
svm=LinearSVC()
svm.fit(train,train_label)

_test=[]
for i in range(int(raw_input())):
	h=json.loads(raw_input())
	_test.append(h['question']+"\r\n"+h['excerpt'])
test = transformer.transform(_test)
test_label=svm.predict(test)
for e in test_label: print(e)
def ordering(mdb, mycursor, myresult, custid, from_time, skip, limit):
    print(skip, limit)
    client_103 = MongoClient(
        'mongodb://*****:*****@49.156.128.103:27017/')
    mdb_103 = client_103["way2"]
    #user_rating_collection="user_data_new"
    #user_keyword_collection="newsreg-user-keywords-data"
    posts = []
    breaking = []
    push_posts = []
    predict_posts = []
    seen_posts = []
    #    t1=time.time()-t0
    #    print(t1)
    # t2=time.time()
    x = datetime.datetime.now()
    today_date = str(x).split(" ")[0]
    print('track_posts_cat_' + str(today_date.split("-")[2]).replace("0", "") +
          "_" + str(today_date.split("-")[1]).replace("0", "") + "_" +
          today_date.split("-")[0])
    seen_docs = mdb_103['track_posts_cat_' +
                        str(today_date.split("-")[2]).replace("0", "") + "_" +
                        str(today_date.split("-")[1]).replace("0", "") + "_" +
                        today_date.split("-")[0]].find({"custid": int(custid)})
    #seen_docs=mdb[user_rating_collection].find({"custid" : int(custid),"date" :{'$in':[today_date]}},{"_id":0,"postid":1})
    sp = []
    for pid in seen_docs:
        #print(pid)
        sp.append(pid["postid"])
    client_103.close()

    current_loop_date = str(
        datetime.datetime.fromtimestamp(from_time)).split(" ")[0]
    push_query = "SELECT post_id FROM push_notifications_queue WHERE lang_id=1 AND  push_date='" + current_loop_date + "'"
    mycursor.execute(push_query)
    push_result = mycursor.fetchall()
    push_ids = []
    for doc in push_result:
        #print("pushhhh",doc)
        push_ids.append(doc["post_id"])

    for row in myresult:
        #if(row['postid'])==1958817 and row["is_breaking"]==1 and str(row["publishdate"])==today_date:
        #print(row)
        if "is_breaking" in row and row["is_breaking"] == 1 and str(
                row["publishdate"]) == current_loop_date:
            if row['postid'] in sp:
                seen_posts.append(row)
            else:
                breaking.append(row)
        elif "news_type" in row and row["news_type"] == "breaking" and str(
                row["publishdate"]) == current_loop_date:
            if row['postid'] in sp:
                seen_posts.append(row)
            else:
                breaking.append(row)
        elif row['postid'] in push_ids:
            if row['postid'] in sp:
                seen_posts.append(row)
            else:
                push_posts.append(row)
        else:
            post_doc = row
            key_query = "SELECT post_id,lower(tag_name) as tag_name FROM way2app.mag_post_mechine_tags WHERE post_id={0}".format(
                row['postid'])
            mycursor.execute(key_query)
            key_result = mycursor.fetchall()
            post_doc['keywords'] = [x['tag_name'] for x in key_result]
            s = ''
            if "category_name" in post_doc and post_doc["category_name"]:
                s = s + cat[post_doc["category_name"]] if post_doc[
                    "category_name"] in cat else post_doc["category_name"]
            if "keywords" in post_doc and post_doc["keywords"]:
                s = s + " " + " ".join(post_doc["keywords"])
            posts.append(s)
            predict_posts.append(row)
    b_ids = [d['postid'] for d in breaking]
    p_ids = [d['postid'] for d in push_posts]
    print(b_ids, p_ids)

    break_plus_push = []
    break_plus_push.extend(breaking)
    break_plus_push.extend(push_posts)
    break_plus_push = sorted(break_plus_push,
                             key=itemgetter('post_gmt'),
                             reverse=True)
    #print(posts)
    #  t3=time.time()-t2
    # print(t3)
    # t4=time.time()
    vectorizer = HashingVectorizer()
    #vectorizer=joblib.load("../models/vect_"+str(custid))
    vect_posts = vectorizer.transform(posts)
    try:
        f = open("../models/" + str(custid), 'wb+')
        model = joblib.load(f)
        f.close()
        print("old user  ", custid)
        pred = model.predict(vect_posts)
        # t5=time.time()-t4
        # print(t5)
        #test_pred=model.predict(vectorizer.transform(["News chandrababu"]))
        #print("testtt  ",test_pred)

        ins_array = []
        for i in range(len(pred)):
            post_doc = predict_posts[i]
            #del post_doc['post_date']
            #postid=post_doc['postid']
            #post_doc['postid']=int(postid)
            post_doc['custid'] = custid
            post_doc['prediction'] = int(pred[i])

            #print(post_doc)
            ins_array.append(post_doc)
        #print(ins_array)
        #ins_array = sorted(ins_array, key=itemgetter('prediction','post_gmt'), reverse=True)
        ins_array = sorted(ins_array,
                           key=itemgetter('prediction'),
                           reverse=True)
        #print(ins_array)

        #print(type(custid),type(today_date))

        # =============================================================================
        #         cat_list=list(mdb[user_keyword_collection].find({"custid" : int(custid)},{"_id":0,"category":1}))
        #         #print(cat_list)
        #         cat_list=cat_list[0]["category"]
        #         cat_list=dict(sorted(cat_list.items(),key=itemgetter(1),reverse=True))
        #         #print(cat_list)
        #         cat_list=list(cat_list.keys())
        #         try:
        #             cat_list.remove("News")
        #             cat_list.remove("undefined")
        #             cat_list.append("News")
        #             cat_list.append("undefined")
        #         except:
        #             pass
        #         #print(cat_list)
        # =============================================================================

        unseen_rated_posts = []
        unseen_unrated_posts = []
        c = []
        for p in ins_array:
            if "category_name" in p and p["category_name"]:
                if p["category_name"] in cat:
                    if cat[p["category_name"]] not in c:
                        c.append(cat[p["category_name"]])
                    p["category_name"] = cat[p["category_name"]]
                else:
                    if p["category_name"] not in c:
                        c.append(p["category_name"])
            if p["postid"] in sp:
                seen_posts.append(p)
            else:
                if p["prediction"] > 0:
                    unseen_rated_posts.append(p)
                else:
                    unseen_unrated_posts.append(p)


# =============================================================================
#         for e in cat_list:
#             try:
#                 c.remove(e)
#             except:
#                 pass
#         cat_list=list(cat_list)
#         cat_list.extend(c)
#         cat_list.append(None)
#         print(cat_list)
#         #pprint(unseen_unrated_posts)
#         srt = {b: i for i, b in enumerate(cat_list)}
#         unseen_unrated_posts=sorted(unseen_unrated_posts, key=lambda x: srt[x["category_name"]])
#
# =============================================================================
        final_array = []
        #final_array.extend(breaking)
        #final_array.extend(push_posts)
        final_array.extend(break_plus_push)
        final_array.extend(unseen_rated_posts)
        final_array.extend(unseen_unrated_posts)
        final_array.extend(seen_posts)
        s_ids = [d['postid'] for d in seen_posts]
        print(s_ids)
        #  t6=time.time()-t0
        # print(t6)

        zeros = [
            "daysdiff", "categoryid", "show_button", "postid", "btn_text_lang",
            "writer_custid", "is_ad", "whatsapp_share_count", "fb_share_count",
            "imgs_count", "sourceid", "lang", "post_parent"
        ]
        for post_doc in final_array:
            for key in post_doc:

                if key in post_doc and post_doc[key] is not None:
                    post_doc[key] = str(post_doc[key])
                if key in zeros and not post_doc[key]:
                    post_doc[key] = str(0)
        f_ids = [d['postid'] for d in final_array]
        print(f_ids)
        unseen_length = len(break_plus_push) + len(unseen_rated_posts) + len(
            unseen_unrated_posts)
        print("old user", "unseen::", unseen_length, "seen", len(seen_posts),
              "skip::", skip, "limit::", limit)
        # =============================================================================
        #         try:
        #             if skip and limit:
        #                 if unseen_length>0 and len(seen_posts)>0:
        #                     r_ids=  [d['postid'] for d in final_array[:limit]]
        #                     print("c1",r_ids)
        #                     return final_array[:limit]
        #                 else:
        #                     r_ids=  [d['postid'] for d in final_array[skip:skip+limit]]
        #                     print("c2",r_ids)
        #                     return final_array[skip:skip+limit]
        #             elif skip and not limit:
        #                 if unseen_length>0 and len(seen_posts)>0:
        #                     r_ids=  [d['postid'] for d in final_array[:len(final_array)-len(seen_posts)]]
        #                     print("c3",r_ids)
        #                     return final_array[:len(final_array)-len(seen_posts)]
        #                 else:
        #                     r_ids=  [d['postid'] for d in final_array[skip:]]
        #                     print("c4",r_ids)
        #                     return final_array[skip:]
        #             elif not skip and limit:
        #                 r_ids=  [d['postid'] for d in final_array[:limit]]
        #                 print("c5",r_ids)
        #                 return final_array[:limit]
        #         except Exception as e:
        #             print("2c8",e)
        #             return []
        # =============================================================================
        try:
            if skip and limit:
                if unseen_length > 0 and len(seen_posts) > 0:
                    res = []
                    new = len(final_array) - len(seen_posts)
                    res.extend(final_array[:new])
                    if len(res) >= limit:
                        r_ids = [d['postid'] for d in res]
                        print("1c1", r_ids)
                        return res
                    else:
                        res.extend(final_array[new + skip:new + skip + limit])
                        r_ids = [d['postid'] for d in res]
                        print("1c2", r_ids)
                        return res
                else:
                    r_ids = [
                        d['postid'] for d in final_array[skip:skip + limit]
                    ]
                    print("1c3", r_ids)
                    return final_array[skip:skip + limit]
            elif skip and not limit:
                if unseen_length > 0 and len(seen_posts) > 0:
                    res = []
                    new = len(final_array) - len(seen_posts)
                    res.extend(final_array[:new])
                    if len(res) >= limit:
                        r_ids = [d['postid'] for d in res]
                        print("1c4", r_ids)
                        return res
                    else:
                        res.extend(final_array[new + skip:])
                        r_ids = [d['postid'] for d in res]
                        print("1c5", r_ids)
                        return res
                else:
                    r_ids = [d['postid'] for d in final_array[skip:]]
                    print("1c6", r_ids)
                    return final_array[skip:]
            elif not skip and limit:
                r_ids = [d['postid'] for d in final_array[:limit]]
                print("1c7", r_ids)
                return final_array[:limit]
        except Exception as e:
            print("1c8", e)
            return []
    except:
        print("new user  ", custid)
        unseen_posts = []
        for p in predict_posts:
            if p["postid"] in sp:
                seen_posts.append(p)
            else:
                unseen_posts.append(p)

        final_array = []
        #final_array.extend(breaking)
        #final_array.extend(push_posts)
        final_array.extend(break_plus_push)
        final_array.extend(unseen_posts)
        final_array.extend(seen_posts)

        zeros = [
            "daysdiff", "categoryid", "show_button", "postid", "btn_text_lang",
            "writer_custid", "is_ad", "whatsapp_share_count", "fb_share_count",
            "imgs_count", "sourceid", "lang", "post_parent"
        ]
        for post_doc in final_array:
            for key in post_doc:

                if key in post_doc and post_doc[key] is not None:
                    post_doc[key] = str(post_doc[key])
                if key in zeros and not post_doc[key]:
                    post_doc[key] = str(0)
        f_ids = [d['postid'] for d in final_array]
        print(f_ids)
        unseen_length = len(break_plus_push) + len(unseen_posts)

        # =============================================================================
        #         try:
        #             if len(unseen_length)>0 and len(seen_posts)>0:
        #                 r_ids=  [d['postid'] for d in final_array[len(final_array)-len(seen_posts)]]
        #                 print(r_ids)
        #                 return final_array[len(final_array)-len(seen_posts)]
        #             else:
        #                 r_ids=  [d['postid'] for d in final_array[skip:]]
        #                 print(r_ids)
        #                 return final_array[skip:]
        #         except Exception as e:
        #             print("exception occured",e)
        #             return []
        # =============================================================================
        print("new user", "unseen::", unseen_length, "seen", len(seen_posts),
              "skip::", skip, "limit::", limit)
        try:
            if skip and limit:
                if unseen_length > 0 and len(seen_posts) > 0:
                    res = []
                    new = len(final_array) - len(seen_posts)
                    res.extend(final_array[:new])
                    if len(res) >= limit:
                        r_ids = [d['postid'] for d in res]
                        print("2c1", r_ids)
                        return res
                    else:
                        res.extend(final_array[new + skip:new + skip + limit])
                        r_ids = [d['postid'] for d in res]
                        print("2c2", r_ids)
                        return res
                else:
                    r_ids = [
                        d['postid'] for d in final_array[skip:skip + limit]
                    ]
                    print("2c3", r_ids)
                    return final_array[skip:skip + limit]
            elif skip and not limit:
                if unseen_length > 0 and len(seen_posts) > 0:
                    res = []
                    new = len(final_array) - len(seen_posts)
                    res.extend(final_array[:new])
                    if len(res) >= limit:
                        r_ids = [d['postid'] for d in res]
                        print("2c4", r_ids)
                        return res
                    else:
                        res.extend(final_array[new + skip:])
                        r_ids = [d['postid'] for d in res]
                        print("2c5", r_ids)
                        return res
                else:
                    r_ids = [d['postid'] for d in final_array[skip:]]
                    print("2c6", r_ids)
                    return final_array[skip:]
            elif not skip and limit:
                r_ids = [d['postid'] for d in final_array[:limit]]
                print("2c7", r_ids)
                return final_array[:limit]
        except Exception as e:
            print("2c8", e)
            return []
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

clf = SGDClassifier(loss='log', random_state=1)

df = pd.read_csv('./movie_data_small.csv', encoding='utf-8')

#df.loc[:100, :].to_csv('./movie_data_small.csv', index=None)

X_train = df['review'].values
y_train = df['sentiment'].values

X_train = vect.transform(X_train)
clf.fit(X_train, y_train)

pickle.dump(stop, open('stopwords.pkl', 'wb'), protocol=4)

pickle.dump(clf, open('classifier.pkl', 'wb'), protocol=4)
예제 #54
0

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) + \
        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

file_path = parenr_dir_path + '/classifier.pkl'

clf = pickle.load(open(file_path, 'rb'))

label = {0: 'negative', 1: 'positive'}
example = ['this movie is fun.']

X = vect.transform(example)

print(clf.predict(X))
print(clf.predict_proba(X))

print('Prediction: %s\nProbability: %.2f%%' %
      (label[clf.predict(X)[0]], np.max(clf.predict_proba(X)) * 100))
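# Note (added): predict_proba is available here because the pickled SGDClassifier was
# trained with loss='log' (logistic regression) in the training script above; with the
# default hinge loss it would raise an AttributeError instead.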
#Splitting the data into Test and Train
X_train, X_test, y_train, y_test = train_test_split(sdf['name_full'],
                                                    y,
                                                    test_size=0.2,
                                                    random_state=21,
                                                    stratify=y)
#Learning the parameters using HashingVectorizer
vect = HashingVectorizer(
    analyzer='char', n_features=325000, ngram_range=(2, 4), lowercase=False
).fit(X_train)
# Other settings tried: min_df=30, max_df=0.3, max_features=9000, stop_words=stopWords,
# sublinear_tf=True, norm='l2'; the character (2, 4)-gram range is giving the best accuracy right now.
#Learning the parameters using TfidfVectorizer
#vect = TfidfVectorizer(analyzer='char',min_df=30, max_df=0.3,norm = 'l2',ngram_range=(2,4),lowercase=False).fit(X_train)
#Transforming Training Data
X_train_transform = vect.transform(X_train)

#Transforming Test Data
X_test_transform = vect.transform(X_test)

#Model Fitting
model = OneVsRestClassifier(LinearSVC(random_state=0)).fit(
    X_train_transform, y_train)
#Model Predictions on Test Data
svcPredictions = model.predict(X_test_transform)

svcAccuracy = accuracy_score(y_test, svcPredictions)
print("SVM Accuracy using HashingVectorizer:", svcAccuracy)

#Plotting Confusion matrix
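# The plotting code itself is not part of this snippet; a minimal sketch using sklearn's
# confusion_matrix and matplotlib (an assumption, not the author's original cell) could be:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

cm = confusion_matrix(y_test, svcPredictions)
ConfusionMatrixDisplay(confusion_matrix=cm).plot()
plt.title("LinearSVC with HashingVectorizer")
plt.show()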
class RCTRobot:
    def __init__(self):
        self.svm_clf = MiniClassifier(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/rct_svm_weights.npz'))

        cnn_weight_files = glob.glob(
            os.path.join(robotreviewer.DATA_ROOT, 'rct/*.h5'))
        json_filename = os.path.join(robotreviewer.DATA_ROOT,
                                     'rct/rct_cnn_structure.json')
        self.cnn_clfs = [
            get_model(json_filename, cnn_weight_file)
            for cnn_weight_file in cnn_weight_files
        ]
        self.svm_vectorizer = HashingVectorizer(binary=False,
                                                ngram_range=(1, 1),
                                                stop_words='english')
        self.cnn_vectorizer = KerasVectorizer(vocab_map_file=os.path.join(
            robotreviewer.DATA_ROOT, 'rct/rct_cnn_vocab_map.pck'))

        self.scale_constants = {
            'cnn': {
                'mean': 0.15592811611054261,
                'std': 0.22405916984696986,
                'weight': 1.6666666666666667
            },
            'ptyp': {
                'mean': 0.055155532891381948,
                'std': 0.22828359573751594
            },
            'svm': {
                'mean': -0.75481403525485891,
                'std': 0.7812955939364481,
                'weight': 10.0
            }
        }  # weights used in the mean: only 1 SVM model is used (it gives near-identical results to binning 10) and 6 CNN models (they run faster, and additional models give no further reduction in variance)

        self.thresholds = {
            'cnn': {
                'precise': 2.1340457758193034,
                'sensitive': -0.076709540491855063
            },
            'cnn_ptyp': {
                'precise': 3.529609848417909,
                'sensitive': 0.083502632442633312
            },
            'svm': {
                'precise': 1.9185522606237164,
                'sensitive': 0.093273630980694439
            },
            'svm_cnn': {
                'precise': 1.8749128673557529,
                'sensitive': 0.064481902000491614
            },
            'svm_cnn_ptyp': {
                'precise': 3.7674045603568755,
                'sensitive': 0.1952449060483534
            },
            'svm_ptyp': {
                'precise': 3.7358855328111837,
                'sensitive': 0.42992224964656178
            }
        }  # All precise models have been calibrated to 97.6% sensitivity
        # All sensitive models have been calibrated to 99.1% sensitivity

    def annotate(self, data):

        # use the best performing models from the validation paper (in draft...)
        filter_class = "svm_cnn_ptyp"
        threshold_class = "precise"

        if data.get("abstract") is not None and data.get("title") is not None:
            ti = data["title"]
            ab = data["abstract"]
        elif data.get("parsed_text") is not None:
            # then just use the start of the document
            TI_LEN = 30
            AB_LEN = 500
            # best guesses based on sample of RCT abstracts + aiming for 95% centile
            ti = data['parsed_text'][:TI_LEN].text
            ab = data['parsed_text'][:AB_LEN].text
        else:
            # else can't proceed
            return data

        if "pubmed" in data.data:
            ptyp = 1.0
        else:
            ptyp = 0.0

        X_ti_str = [ti]
        X_ab_str = ['{}\n\n{}'.format(ti, ab)]

        if "svm" in filter_class:

            X_ti = lil_matrix(self.svm_vectorizer.transform(X_ti_str))
            X_ab = lil_matrix(self.svm_vectorizer.transform(X_ab_str))

            svm_preds = self.svm_clf.decision_function(hstack([X_ti, X_ab]))
            svm_scale = (svm_preds - self.scale_constants['svm']['mean']
                         ) / self.scale_constants['svm']['std']

        if "ptyp" in filter_class:
            ptyp = np.array([ptyp])
            ptyp_scale = (ptyp - self.scale_constants['ptyp']['mean']
                          ) / self.scale_constants['ptyp']['std']

        if "cnn" in filter_class:
            X_cnn = self.cnn_vectorizer.transform(X_ab_str)
            cnn_preds = [clf.predict(X_cnn).T[0] for clf in self.cnn_clfs]
            cnn_preds = np.vstack(cnn_preds)
            cnn_scale = (cnn_preds - self.scale_constants['cnn']['mean']
                         ) / self.scale_constants['cnn']['std']

        if filter_class == "svm":
            y_preds = svm_scale
        elif filter_class == "svm_ptyp":
            y_preds = svm_scale + ptyp_scale
        elif filter_class == "ptyp":
            y_preds = ptyp_scale
        elif filter_class == "svm_cnn_ptyp":
            weights = [self.scale_constants['svm']['weight']] + (
                [self.scale_constants['cnn']['weight']] * len(self.cnn_clfs))
            y_preds = np.average(np.vstack([cnn_scale, svm_scale]),
                                 axis=0,
                                 weights=weights) + ptyp_scale

        structured_data = {
            "is_rct":
            bool(y_preds[0] > self.thresholds[filter_class][threshold_class]),
            "decision_score":
            y_preds[0],
            "model_class":
            filter_class
        }

        data.ml["rct"] = structured_data
        return data

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        marginalia = [{
            "type": "Trial Design",
            "title": "Is an RCT?",
            "annotations": [],
            "description": "{0} (Decision score={1:0.2f} using {2} model)".format(
                data["rct"]["is_rct"], data["rct"]["decision_score"],
                data["rct"]["model_class"])
        }]
        return marginalia
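# Illustrative sketch (added, not from RobotReviewer): how the ensemble described above
# combines scores -- each model's raw output is z-scaled with the stored mean/std, the
# scaled scores are averaged with the SVM weighted more heavily, the publication-type
# term is added, and the total is compared against the chosen threshold. All scores
# below are hypothetical; the means/stds/weights/thresholds are the ones quoted above.
import numpy as np

def z_scale(score, mean, std):
    return (score - mean) / std

svm_z = z_scale(-0.2, -0.7548140352548589, 0.7812955939364481)            # one SVM score
cnn_z = z_scale(np.array([0.20, 0.30, 0.25]), 0.15592811611054261,
                0.22405916984696986)                                      # a few CNN scores
ptyp_z = z_scale(1.0, 0.055155532891381948, 0.22828359573751594)          # "pubmed" flag

weights = [10.0] + [1.6666666666666667] * len(cnn_z)
score = np.average(np.hstack([[svm_z], cnn_z]), weights=weights) + ptyp_z
print(score > 3.7674045603568755)  # 'precise' threshold for the svm_cnn_ptyp filter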
예제 #57
0
            xml_content = xml_processor(xml_file.read())
            assert type(xml_content) == str
            yield xml_content
            print "sent file {0}, named \n {1} to processing".format(i, paths[i])
            i += 1


# First try producing features with Hashing Vectorizer,
# which returns a scipy sparse matrix with shape
# (n_samples, 2 ** 20). This has some downsides and
# may not be usable for training.
if op.vectorizer == "hashing":
    # first use simple word tokens (whitespace separated?)
    word_hasher = HashingVectorizer()
    hashed_sparse_mat = word_hasher.transform(
        generate_xml_paths(train_paths, test_paths)
    )

    print(hashed_sparse_mat)
    print(type(hashed_sparse_mat))
    # Save the matrix as follows
    io.mmwrite("../data/features/naive_word_hashed_full_features.mtx",
               hashed_sparse_mat)

elif op.vectorizer == "hash_4gram_tfidf":
    # pipe vectorizer with ngrams and tfidf
    pipe = make_pipeline(
        HashingVectorizer(ngram_range=(1, 4)),
        TfidfTransformer()
    )
    hashed_sparse_mat = pipe.fit_transform(
예제 #58
0
tanonymous = np.array(tanonymous)
tpromotedto = np.array(tpromotedto)
tnumanswers = np.array(tnumanswers)
tnotopics = np.array(tnotopics)
tcontextfollowers = np.array(tcontextfollowers)
ttopicsfollowers = np.array(ttopicsfollowers)

print "extracting features"
quevectorizerTfid = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')  
topvectorizerTfid = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')
quevectorizerHash = HashingVectorizer(stop_words='english',non_negative=True, n_features=1000)
topvectorizerHash = HashingVectorizer(stop_words='english',non_negative=True, n_features=1000)

#quesparseHash = quevectorizerHash.transform(question)
topsparseHash = topvectorizerHash.transform(topics)
#tquesparseHash = quevectorizerHash.transform(tquestion)
ttopsparseHash = topvectorizerHash.transform(ttopics)

cfscaler = preprocessing.StandardScaler().fit(contextfollowers)
tfscaler = preprocessing.StandardScaler().fit(topicsfollowers)

cfscaled = cfscaler.transform(contextfollowers)
tfscaled = tfscaler.transform(topicsfollowers)
tcfscaled = cfscaler.transform(tcontextfollowers)
ttfscaled = tfscaler.transform(ttopicsfollowers)

def benchmark(clf, trainx, trainy, test, dataset):
    print(80 * '_')
    print("Training...")
    print(clf)
예제 #59
0
            train_data.append(list[1])
            train_target.append(labeldict.get(list[0]))

print(cnt)
categories = dict.keys()
#print(train_data)
# split a training set and a test set
y_train = train_target

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    vectorizer = HashingVectorizer(stop_words='english',
                                   non_negative=True,
                                   n_features=opts.n_features)
    X_train = vectorizer.transform(train_data)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(train_data)
duration = time() - t0

print("Extracting features from the test data using the same vectorizer")
t0 = time()

duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, duration))
print()

if opts.use_hashing:
예제 #60
0
H_df = pd.read_csv(full_data_set, usecols=[5, 8])
"""### Counting the Full Data"""

# counting the number of ratings in the full set
rating_counts_full = H_df.groupby('Rating')['Rating'].count()
rating_counts_full.head()

# getting the ratios of the ratings
rating_counts_full / len(H_df)

H_df.describe()
"""##Creating the HashingVectorizer"""

# creates the HashingVectorizer that will be used with the full data
H_vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False)
H_X = H_vectorizer.transform(H_df['Text'])
H_y = H_df['Rating']
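# Note (added): alternate_sign=False keeps every hashed feature value non-negative, which
# matters below because MultinomialNB rejects negative feature values.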

# splits the data 80/20 with random state 55
H_X_train, H_X_test, H_y_train, H_y_test = train_test_split(H_X,
                                                            H_y,
                                                            test_size=0.2,
                                                            random_state=55)
"""##NB Classifier with HashingVectorizer"""

H_modelNB = MultinomialNB()
H_modelNB.fit(H_X_train, H_y_train)
H_y_predNB = H_modelNB.predict(H_X_test)

results_function(H_modelNB, H_X_test, H_y_test, H_y_predNB)
"""##kNN Classifier with HashingVectorizer and K = 3"""