Example #1
def get_hash_vectorizer(fname, n=20):
    dfx = pd.read_csv(fname)
    sentence_list = [sent for sent in dfx['clean_text'].values]
    hasher = HashingVectorizer(n_features=n)
    hasher.fit(sentence_list)
    return hasher
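A minimal usage sketch for the helper above; the CSV path is hypothetical and the file is assumed to contain the 'clean_text' column the function reads.

# Hypothetical usage of get_hash_vectorizer; 'reviews.csv' is a made-up file
# assumed to contain a 'clean_text' column.
hasher = get_hash_vectorizer('reviews.csv', n=32)
X = hasher.transform(["an unseen sentence to hash"])
print(X.shape)  # (1, 32) -- one row, n_features columns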
Example #2
    def vectorize(self, X_text):
        news_df = X_text

        hash_text = HashingVectorizer(ngram_range=(3, 7),
                                      analyzer="char",
                                      alternate_sign=False)
        hash_title = HashingVectorizer(ngram_range=(3, 7),
                                       analyzer="char",
                                       alternate_sign=False)
        hash_author = HashingVectorizer(ngram_range=(3, 7),
                                        analyzer="char",
                                        alternate_sign=False)

        X_text = news_df['text']

        # HashingVectorizer is stateless, so a single fit_transform call is enough
        text_vector = hash_text.fit_transform(X_text.values.astype('U'))

        self.text_vector = text_vector
        X_title_text = news_df['title']

        print(text_vector[:1])

        title_vector = hash_title.fit_transform(
            X_title_text.values.astype('U'))
        self.title_vector = title_vector

        X_author = news_df['author']

        author_vector = hash_author.fit_transform(X_author.values.astype('U'))
        self.author_vector = author_vector

        return author_vector
Example #3
    def vectorize(self, X_text):
        # Method takes X_text array as an argument with title, author, and text in that order.
        news = X_text

        hash_text = HashingVectorizer(ngram_range=(3, 7),
                                      analyzer="char",
                                      alternate_sign=False)
        hash_title = HashingVectorizer(ngram_range=(3, 7),
                                       analyzer="char",
                                       alternate_sign=False)
        hash_author = HashingVectorizer(ngram_range=(3, 7),
                                        analyzer="char",
                                        alternate_sign=False)

        X_text = news[:, 2]
        print('X_text')
        print(X_text[:5])
        # fit_transform alone is sufficient; HashingVectorizer keeps no state
        text_vector = hash_text.fit_transform(X_text)

        self.text_vector = text_vector
        X_title_text = news[:, 0]

        print(text_vector[:1])

        title_vector = hash_title.fit_transform(X_title_text)
        self.title_vector = title_vector

        X_author = news[:, 1]

        author_vector = hash_author.fit_transform(X_author)
        self.author_vector = author_vector

        return author_vector
Example #4
class FeatureExtractor(object):
    def __init__(self, count=True, hashing=False):
        self.count = count
        self.hashing = hashing
        self.was_fit = False

    def fit(self, X_text):
        if self.count:
            self.cv = CountVectorizer()
            self.cv.fit(X_text)
        if self.hashing:
            self.hv = HashingVectorizer(ngram_range=(1,2), norm=None, alternate_sign=False, binary=True)
            self.hv.fit(X_text)

        self.was_fit = True
        return

    def transform(self, X_text):
        assert self.was_fit
        if self.count:
            X_count = self.cv.transform(X_text)
            X = X_count
        if self.hashing:
            X_hashing = self.hv.transform(X_text)
            X = X_hashing
        if self.hashing and self.count:
            # both blocks are scipy sparse matrices, so stack them with
            # scipy.sparse.hstack (assumes `from scipy import sparse`);
            # np.hstack would not concatenate them correctly
            X = sparse.hstack([X_count, X_hashing])
        return X
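Since both vectorizers emit scipy sparse matrices, combining them requires scipy.sparse.hstack; a small standalone sketch with a made-up corpus:

from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

docs = ["the cat sat", "the dog barked"]                        # made-up sample corpus
X_count = CountVectorizer().fit_transform(docs)                 # sparse, vocabulary-sized
X_hash = HashingVectorizer(n_features=16).fit_transform(docs)   # sparse, 16 columns
X = sparse.hstack([X_count, X_hash])                            # np.hstack would not do this
print(X.shape)                                                  # (2, vocabulary_size + 16)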
Example #5
    def train(cls, trn_corpus, config=None, dtype=np.float32):
        """Train on a corpus.

        Args:
            trn_corpus (list): Training corpus in the form of a list of strings.
            config (dict): Dict with keyword arguments to pass to sklearn's HashingVectorizer.
            dtype (type, optional): Data type. Default is `numpy.float32`.

        Returns:
            Hashing: Trained vectorizer.

        Raises:
            Exception: If `config` contains keyword arguments that the hashing vectorizer does not accept.
        """

        defaults = {
            "encoding": "utf-8",
            "strip_accents": "unicode",
            "stop_words": None,
            "ngram_range": (1, 2),
            "lowercase": True,
            "norm": "l2",
            "dtype": dtype,
            "n_features": 1048576,  # default number in HashingVectorizer
        }
        config = {} if config is None else config
        try:
            model = HashingVectorizer(**{**defaults, **config})
        except TypeError:
            raise Exception(
                f"vectorizer config {config} contains unexpected keyword arguments for HashingVectorizer"
            )
        model.fit(trn_corpus)
        return cls(model)
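A hypothetical call to the `train` classmethod above, assuming it lives on a class named `Hashing` as the docstring's return type suggests; the corpus and config override are illustrative only.

# Illustrative only: 'Hashing' is assumed from the docstring, and the
# corpus/config values are made up.
corpus = ["first training document", "second training document"]
vectorizer = Hashing.train(corpus, config={"ngram_range": (1, 1), "n_features": 2**16})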
Example #6
def train_hash_vectorizer(train, test, question):
    hash_vectorizer = HashingVectorizer(ngram_range=(1, 1))
    tfidf_txt = pd.Series(train[question + '1'].tolist() +
                          train[question + '2'].tolist() +
                          test[question + '1'].tolist() +
                          test[question + '2'].tolist()).astype(str)
    hash_vectorizer.fit(tfidf_txt)
    return hash_vectorizer
Example #7
def BOW():
    from sklearn.feature_extraction.text import HashingVectorizer

    vectorizer = HashingVectorizer(ngram_range=(2, 2))
    vectorizer.fit(training_tweets)

    train_sequences = vectorizer.transform(training_tweets)
    val_sequences = vectorizer.transform(val_tweets)
    test_sequences = vectorizer.transform(test_tweets)
    return train_sequences, val_sequences, test_sequences
Example #8
def test_hashing_vectorizer():
    for norm in ["l1", "l2", None]:
        vec = HashingVectorizer(n_features=2**8, norm=norm)
        vec.fit(X)
        vec_ = convert_estimator(vec)
        X_t = vec.transform(X)
        X_t_ = vec_.transform(X)
        assert np.allclose(X_t.toarray(), X_t_.todense())
Example #9
    def __wordhash_features(self, data, vect=None, num_features=3000):
        '''
        extracts word ngram features from the provided data
        '''
        if vect is None:
            vect = HashingVectorizer(n_features=num_features,
                                    analyzer="word", stop_words='english',
                                    strip_accents='unicode',
                                    ngram_range=(1, 4))
            vect.fit(data)

        features = vect.transform(data)

        return features, vect
Example #10
def tfidf_process_ci(data,train_data,test_data):

    y = train_data['Score']  
    tf1 = TfidfVectorizer(ngram_range=(1, 6), token_pattern=r'\w+', analyzer='word')
    tf1.fit(data['cutted_Dis'])
    data1=tf1.transform(train_data['cutted_Dis'])
    test1 = tf1.transform(test_data['cutted_Dis'])
    print(data1.shape)  
    tf2 = HashingVectorizer(ngram_range=(1,2),lowercase=False)
    tf2.fit(data['cutted_Dis'])
    data2 = tf2.transform(train_data['cutted_Dis'])
    test2 = tf2.transform(test_data['cutted_Dis'])
    print(data2.shape)   
    train = hstack((data1,data2)).tocsr() 
    test = hstack((test1,test2)).tocsr()
    return train,test,y
Example #11
def hash_feature(data, feature, max_f):
    vectorizer = HashingVectorizer(n_features=max_f,
                                   stop_words='english',
                                   alternate_sign=False,
                                   norm='l1',
                                   dtype=np.float32)
    # testing make_pipeline
    # vectorizer = make_pipeline(hasher, TfidfTransformer())

    if type(data) == np.ndarray:
        vectorizer.fit(data[:, feature])
        features_vec = vectorizer.transform(data[:, feature])
    else:
        vectorizer.fit(data[feature])
        features_vec = vectorizer.transform(data[feature])
    return vectorizer, features_vec
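A small usage sketch for `hash_feature`; the DataFrame and its 'text' column are hypothetical.

import pandas as pd

# Hypothetical input frame with a 'text' column
frame = pd.DataFrame({"text": ["hashing trick example", "another short document"]})
vectorizer, features_vec = hash_feature(frame, "text", max_f=2**10)
print(features_vec.shape)  # (2, 1024)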
Example #12
def fit_vectorizer(data, embedding_size, max_len, PAD):
    """用数据训练一个向量化器"""
    vectorizer = HashingVectorizer(n_features=embedding_size,
                                   analyzer='char',
                                   lowercase=False)
    words = [PAD]
    for sentences in data:
        t = list(sentences)
        if len(t) > max_len:
            t = t[:max_len]
        pad_size = max_len - len(t)
        if pad_size > 0:
            t = t + [PAD] * pad_size
        words += t
    vectorizer.fit(words)
    return vectorizer
Example #13
def build_vectors():
    text = []
    print("Cleaning data")
    for idx, row in df.iterrows():
        cleaned = pipeline(row['title'] + ' ' + row['abstract'])
        text.append(cleaned)

    print("Building vectors")
    hv = HashingVectorizer(n_features=2**10)
    hv.fit(text)
    X = hv.transform(text)

    print("Saving")
    save_npz(VECTORS_F, X)
    with open(MODEL_F, 'wb') as f:
        pickle.dump(hv, f)

    return X, hv
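For completeness, a sketch of loading back the artifacts that `build_vectors` saves; it reuses the snippet's own `VECTORS_F` and `MODEL_F` constants and the scipy/pickle imports the snippet already relies on.

# Reload the saved matrix and vectorizer (paths reuse the snippet's constants).
from scipy.sparse import load_npz
import pickle

X = load_npz(VECTORS_F)
with open(MODEL_F, 'rb') as f:
    hv = pickle.load(f)
new_vec = hv.transform(["a new title and abstract to vectorize"])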
Example #14
def hash_vector(features, ngram=(1, 1), n_features=1048576, **kwargs):
    vectorizer = HashingVectorizer(analyzer='word',
                                   ngram_range=ngram,
                                   stop_words='english',
                                   norm='l2',
                                   alternate_sign=False,  # non_negative=True was removed from scikit-learn
                                   lowercase=True,
                                   n_features=n_features)
    fitted = vectorizer.fit(features)
    return fitted.transform(features), fitted
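In recent scikit-learn releases the old `non_negative=True` option no longer exists, and `alternate_sign=False` is its closest equivalent: with the sign flipping disabled, hashed counts stay non-negative. A standalone sketch with a made-up corpus:

from sklearn.feature_extraction.text import HashingVectorizer

docs = ["hash collisions cancel out", "unless alternate_sign is disabled"]  # made-up corpus
hv = HashingVectorizer(n_features=8, alternate_sign=False, norm=None)
X = hv.fit_transform(docs)
print((X.toarray() >= 0).all())  # True: all hashed counts stay non-negative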
Example #15
def tfidf_process_zi(data, train_data, test_data):
    data = cut_zi(data)
    train_data = cut_zi(train_data)
    test_data = cut_zi(test_data)

    y = train_data['Score']
    tf1 = TfidfVectorizer(ngram_range=(1, 6), analyzer='char')
    tf1.fit(data['cut_zi'])
    data1 = tf1.transform(train_data['cut_zi'])
    test1 = tf1.transform(test_data['cut_zi'])
    print(data1.shape)
    tf2 = HashingVectorizer(ngram_range=(1, 2), lowercase=False)
    tf2.fit(data['cut_zi'])
    data2 = tf2.transform(train_data['cut_zi'])
    test2 = tf2.transform(test_data['cut_zi'])
    print(data2.shape)
    train = hstack((data1, data2)).tocsr()
    test = hstack((test1, test2)).tocsr()
    return train, test, y
Example #16
def setup():
    print("Configuring the logger")
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    print("Connecting to Mongo")
    client = MongoClient(config.DB_HOST, config.DB_PORT)
    db = client[config.DB_NAME]

    print("Loading the models")
    doc2vec = models.Doc2Vec.load(
        "../models/tweet_model_doc2vec_v2_300_new.bin")

    twitterCollection = db["tweet_leisure"]
    dictionaryCollection = db["dictionary"]

    vectorizer = HashingVectorizer(stop_words='english', ngram_range=(1, 1))

    documents = list(twitterCollection.find())
    documents = list(map(lambda x: (' '.join(x["tokens"])), documents))
    vectorizer.fit(documents)

    return dictionaryCollection, twitterCollection, doc2vec, vectorizer
Example #17
    def CreateRpeFeature(self, look, test=False, verbose=False):
        if not test:
            vectorizer = HashingVectorizer(n_features=2**8, ngram_range=(1, 2))
            vectorizer.fit(self.fulldata_words['rpe'].values)
            self.rpe_vectorizer = vectorizer

        def create_rpe_features(g):
            rpe = g[((g["word_num"] - g["target_word_num"]).abs() <= look)
                    & ~(g["word_num"] == g["target_word_num"])]['rpe'].values
            return " ".join(rpe)

        rpe_sentences = self.fulldata_words.groupby("sentence_num").apply(
            create_rpe_features)

        if test:
            return rpe_sentences.apply(lambda x: pd.Series(
                data=self.rpe_vectorizer.transform([x]).toarray()[0],
                index=[f"rpe_hash_{k}"
                       for k in range(self.rpe_vectorizer.n_features)]))
        else:
            return rpe_sentences.apply(lambda x: pd.Series(
                data=vectorizer.transform([x]).toarray()[0],
                index=[f"rpe_hash_{k}" for k in range(vectorizer.n_features)]))
Example #18
def train_model(X_train):
    news_df = X_train
    hash_author = HashingVectorizer(ngram_range=(3, 7),
                                    analyzer="char",
                                    alternate_sign=False)
    hash_title = HashingVectorizer(ngram_range=(3, 7),
                                   analyzer="char",
                                   alternate_sign=False)
    hash_text = HashingVectorizer(ngram_range=(3, 7),
                                  analyzer="word",
                                  alternate_sign=False)

    X_text = news_df[:, 2]

    # a single fit_transform suffices; HashingVectorizer is stateless
    X = hash_text.fit_transform(X_text)
    X_title_text = news_df[:, 0]

    X2 = hash_title.fit_transform(X_title_text)

    X_author = news_df[:, 1]

    X3 = hash_author.fit_transform(X_author)
    print('vectorized')

    pickle_path1 = os.path.join("resources", "X_text_matrix.pkl")
    pickle_path2 = os.path.join("resources", "X_title_matrix.pkl")
    pickle_path3 = os.path.join("resources", "X_author_matrix.pkl")
    with open(pickle_path1, "wb") as output_file:
        pickle.dump(X, output_file)

    with open(pickle_path2, "wb") as output_file2:
        pickle.dump(X2, output_file2)

    with open(pickle_path3, "wb") as output_file3:
        pickle.dump(X3, output_file3)

    return
Example #19
def save_fit_result(texts_ql_qr):
    marisa_count1 = MarisaCountVectorizer()
    marisa_tfidf1 = MarisaTfidfVectorizer()
    hashing1_18 = HashingVectorizer(n_features=2**20)
    hashing1_20 = HashingVectorizer(n_features=2**24)
    hashing2_18 = HashingVectorizer(ngram_range=(1,2),n_features=2**20)
    hashing2_20 = HashingVectorizer(ngram_range=(1,2),n_features=2**24)
    
    ql_qr_tfidf1 = marisa_tfidf1.fit(texts_ql_qr)
    with open('./data/yao_saver/ql_qr_tfidf1.pkl','wb') as a:
        pickle.dump(ql_qr_tfidf1, a)
    print('ql_qr_tfidf1 is ok')
    
    ql_qr_count1 = marisa_count1.fit(texts_ql_qr)
    with open('./data/yao_saver/ql_qr_count1.pkl','wb') as a:
        pickle.dump(ql_qr_count1, a)
    print('ql_qr_count1 is ok')
    
    ql_qr_hash1_18 = hashing1_18.fit(texts_ql_qr)
    with open('./data/yao_saver/ql_qr_hash1_18.pkl','wb') as a:
        pickle.dump(ql_qr_hash1_18, a)
    print('ql_qr_hash1_18 is ok')
    
    ql_qr_hash2_18 = hashing2_18.fit(texts_ql_qr)
    with open('./data/yao_saver/ql_qr_hash2_18.pkl','wb') as a:
        pickle.dump(ql_qr_hash2_18, a)
    print('ql_qr_hash2_18 is ok')
    
    ql_qr_hash1_20 = hashing1_20.fit(texts_ql_qr)
    with open('./data/yao_saver/ql_qr_hash1_20.pkl','wb') as a:
        pickle.dump(ql_qr_hash1_20, a)
    print('ql_qr_hash1_20 is ok')
    
    ql_qr_hash2_20 = hashing2_20.fit(texts_ql_qr)
    with open('./data/yao_saver/ql_qr_hash2_20.pkl','wb') as a:
        pickle.dump(ql_qr_hash2_20, a)
    print('ql_qr_hash2_20 is ok')
Example #20
def hash_vector(features, ngram=(1, 1), n_features=1048576, **kwargs):
    vectorizer = HashingVectorizer(analyzer='word',
                                   ngram_range=ngram,
                                   stop_words='english',
                                   norm='l2',
                                   alternate_sign=False,  # non_negative=True was removed from scikit-learn
                                   lowercase=True,
                                   n_features=n_features)
    fitted = vectorizer.fit(features)
    return fitted.transform(features), fitted
Example #21
def Hashvect(train_int, test_int=None, Ngram_min=1, Ngram_max=1):
    import pandas as pd
    from sklearn.feature_extraction.text import HashingVectorizer

    def toktotxt(txt_int):
        if isinstance(txt_int[0], list):
            text = txt_int.apply(lambda x: " ".join(str(i) for i in x))
        else:
            text = txt_int
        return text

    train_txt = toktotxt(train_int)
    vectorizer = HashingVectorizer(ngram_range=(Ngram_min, Ngram_max))
    vectorizer.fit(train_txt)
    X = vectorizer.transform(train_txt)
    train = X.toarray()
    if test_int is None:
        out = train
    else:
        test_txt = toktotxt(test_int)
        Y = vectorizer.transform(test_txt)
        test = Y.toarray()
        out = train, test
    return out
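Note that `Hashvect` calls `.toarray()` on the hashed output, which densifies a matrix with the default 2**20 columns per row. A sketch of the memory-friendlier alternative (sample token lists are made up): keep the result sparse and shrink `n_features` only if a dense array is truly needed.

# Keep the hashed features sparse instead of densifying 2**20 columns per row.
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer

train_int = pd.Series([["10", "42", "7"], ["3", "3", "99"]])   # made-up tokenized input
train_txt = train_int.apply(lambda x: " ".join(str(i) for i in x))
vectorizer = HashingVectorizer(ngram_range=(1, 1), n_features=2**12)
X = vectorizer.transform(train_txt)   # stays a scipy.sparse matrix
print(X.shape, X.nnz)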
Example #22
def vectorize_data(data, n_features=50000):
    """Vectorize a list of tweets.

    :param data: a list of tweets
    :param n_features: number of hashing features
    :return: a sparse document-term matrix
    """
    vectorizer = HashingVectorizer(alternate_sign=False,  # non_negative=True was removed from scikit-learn
                                   binary=False,
                                   norm=None,
                                   ngram_range=(1, 2),
                                   analyzer='word',
                                   n_features=n_features)
    # HashingVectorizer is stateless, so transform alone yields the matrix
    data_vec = vectorizer.transform(data)
    return data_vec
Example #23
plotly.offline.init_notebook_mode(connected=True)

import itertools
import matplotlib.pyplot as plt

labels = labels()
review_tokens = all_reviews()

# vectorizer for organizing training/testing data
count_vect = CountVectorizer()
tf_vect = TfidfVectorizer()
hash_vect = HashingVectorizer()
# tokens must be a list of full reviews
count_vect.fit(review_tokens)
tf_vect.fit(review_tokens)
hash_vect.fit(review_tokens)

# split data into training and test set with train_test_split function - setting shuffle to True because, we
# have pos reviews on the 1st half and neg on 2nd half of 'tokens' array

x_train, x_test, y_train, y_test = train_test_split(review_tokens,
                                                    labels,
                                                    test_size=.5,
                                                    random_state=1234,
                                                    shuffle=True)

# create svm classifier and train it
count_lsvm = LinearSVC()
tf_lsvm = LinearSVC()
hash_lsvm = LinearSVC()
Example #24
File: data.py | Project: LimingDeng/scnn
def parse_wikirfa_edges_as_search(n_features = 100):
    '''Parse wiki-rfa for an edge sentiment analysis experiment.'''

    from sklearn.feature_extraction.text import HashingVectorizer

    path = "%s/data/wikirfa/" % (current_dir,)

    n_nodes = 10926
    n_edges = 176963  # number of +/- edges

    n_edges_read = 189004  # includes neutral edges

    # username -> index map constructed on the fly
    usermap = {}

    A = np.zeros((n_features + 1, n_nodes, n_nodes), dtype='float32')
    B = np.zeros((n_edges, n_nodes), dtype='float32')
    X = np.ones((n_nodes, 1) , dtype='float32')
    Y = np.zeros((n_edges, 2), dtype='int32')

    # build comment vectorizer
    with open(path + 'all_comments.txt', 'r') as f:
        vectorizer = HashingVectorizer(input='content', n_features=n_features)
        vectorizer.fit(f)  # iterate the file directly; xreadlines() no longer exists in Python 3

    u = 0
    v = 0

    resmap  = {-1: 0, 1: 1}
    votemap = {-1: 0, 1: 1}

    with open(path + 'wiki-RfA.txt','r') as f:
        for i in range(n_edges_read):
            # Read in the entry
            # SRC:Guettarda
            tail = f.readline().strip()[4:]

            # TGT:Lord Roem
            head = f.readline().strip()[4:]

            # VOT:1
            vote = int(f.readline().strip()[4:])

            # RES:1
            res = int(f.readline().strip()[4:])

            # YEA:2013
            year = int(f.readline().strip()[4:])

            #DAT:19:53, 25 January 2013
            date = f.readline().strip()[4:]

            # TXT:'''Support''' per [[WP:DEAL]]: clueful, and unlikely to break Wikipedia.
            txt = f.readline().strip()[4:]

            # kill blank line
            f.readline()

            # Process the entry
            # index users
            if tail not in usermap:
                usermap[tail] = u
                u += 1
            if head not in usermap:
                usermap[head] = u
                u += 1

            if vote != 0:
                # add vote edge
                A[0, usermap[tail], usermap[head]] = vote

                # add to incidence matrix

                B[v, usermap[tail]] = 1
                B[v, usermap[head]] = 1

                # vectorize text using the hashing trick
                #try:
                #    commentcount[(tail,head)] += 1
                #except KeyError:
                #    commentcount[(tail,head)] = 1
                #features = np.zeros(n_features)
                #for token in txt.strip().split(' '):
                #    features[hash(token) % n_features] += 1
                #A[1:, usermap[tail], usermap[head]] = features

                A[1:, usermap[tail], usermap[head]] = np.asarray(vectorizer.transform([txt]).todense())[0]

                X[usermap[head],0] = resmap[res]

                # treat the result as the node class
                Y[v, votemap[vote]] = 1
                v += 1

    # normalize features
    #for edge in commentcount:
    #    A[1:, edge[0], edge[1]] /= commentcount[edge]

    print(u)
    print(v)

    assert len(usermap) == n_nodes
    assert u == n_nodes
    assert v == n_edges

    return A, B, X, Y
Example #25
enc = OneHotEncoder()
for feature in one_hot_feature:
    enc.fit(data[feature].values.reshape(-1, 1))
    train_a = enc.transform(train[feature].values.reshape(-1, 1))
    test_a = enc.transform(test[feature].values.reshape(-1, 1))
    train_x = sparse.hstack((train_x, train_a))
    test_x = sparse.hstack((test_x, test_a))
print('one-hot prepared !')

print(train_x.shape)
print(test_x.shape)

cv = HashingVectorizer(n_features=1000)
#cv=CountVectorizer()
for feature in vector_feature:
    cv.fit(data[feature])
    train_a = cv.transform(train[feature])
    test_a = cv.transform(test[feature])
    train_x = sparse.hstack((train_x, train_a))
    test_x = sparse.hstack((test_x, test_a))
print('cv prepared !')

print(train_x.shape)
print(test_x.shape)
del train, test


def LGB_test(train_x, train_y, test_x, test_y):
    print("LGB test")
    clf = lgb.LGBMClassifier(boosting_type='gbdt',
                             num_leaves=31,
Example #26
def clean_text(text):
    word_tokens = word_tokenize(text)
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    string = ' '.join(filtered_sentence)
    return string



# Function to predict if an input string is likely to be in top journal
# note: copy/pasted from Econ_machineLearn.ipynb
hash_vectorizer = HashingVectorizer(analyzer='word', ngram_range=(1, 2))
hash_vectorizer.fit(X_train)


def model_predict(s):
    string = []
    string.append(s)
    vectorized = hash_vectorizer.transform(string)
    probab = round(max(clf2.predict_proba(vectorized)[0]) * 100, 2)
    prediction = clf2.predict(vectorized)[0]
    if prediction == 1:
        result = "Predicted to be in the top 20 Economics journals"
    else:
        result = "Predicted to NOT be in the top 20 Economics journals"
    return result + " with a probability of " + str(probab) + "%."

Example #27
                    if word not in set(stopwords.words('english')))
    return text


df['Sentence'] = df['Sentence'].apply(clean_text)

X_core = df['Sentence'].values
midway = int(X_core.shape[0] / 2)

######################

from sklearn.feature_extraction.text import HashingVectorizer

hashing_vect = HashingVectorizer(n_features=15000)

fitted_vect = hashing_vect.fit(X_core[:midway])
with open('hashing_fitted_vect.pickle', 'wb') as fin:
    pickle.dump(fitted_vect, fin)

X_hash = fitted_vect.transform(X_core[:midway]).toarray()

X_train, X_test, y_train, y_test = train_test_split(
    X_hash,
    pd.get_dummies(df['Risk_Factor'][:midway]).values,
    test_size=0.2,
    random_state=42)

RandomProjection = random_projection.GaussianRandomProjection(
    n_components=4000)
X_train = RandomProjection.fit_transform(X_train)
X_test = RandomProjection.transform(X_test)
Example #28
N_FEATURES = [5000, 10000]
stopwords = nltk.corpus.stopwords
STOPWORDS = stopwords.words('english') + list(string.punctuation) + [
    '``', '...', '--', ',', ':', 'br', '\'s', '\'', 'n\'t', '\'\''
]
DATA_FOLDER = './processed/'
CORPUS = pd.read_pickle(DATA_FOLDER + 'pd.DF.train_both.pickle')['text']

for n_gram in N_GRAM:
    for n_features in N_FEATURES:

        # create the hashing vectorizer
        hv = HashingVectorizer(stop_words=STOPWORDS,
                               n_features=n_features,
                               ngram_range=n_gram)
        hv.fit(CORPUS)

        # create a pipeline for tfidf calculations
        hv2 = HashingVectorizer(stop_words=STOPWORDS,
                                n_features=n_features,
                                norm=None,
                                ngram_range=n_gram)
        tf = TfidfTransformer()
        tfidf = Pipeline([("hash", hv2), ("tf", tf)])

        tfidf.fit(CORPUS)

        # transform all the data we have
        for dataset in ['train', 'test']:
            neg_corpus = pd.read_pickle(DATA_FOLDER + 'pd.DF.' + dataset +
                                        '_neg.pickle')['text']
Example #29
stop_words = nltk.corpus.stopwords.words('english') + list(string.punctuation)
vectorizer = HashingVectorizer(norm='l1') #this worked best
table = str.maketrans('', '', string.punctuation)
training_words = get_text_features(train_data)
test_words = get_text_features(test_data)
all_words = training_words + test_words

train_text = list()
test_text = list()
for item in training_words:
    text = " ".join(item)
    train_text.append(text)
for item in test_words:
    text = " ".join(item)
    test_text.append(text)
vectorizer.fit(train_text +test_text)
X_train_text_features = vectorizer.transform(train_text)
X_test_text_features= vectorizer.transform(test_text)

print ("Concatenating all features.....")


X_train_final = sp.hstack([X_train_categorical,X_train_text_features.astype(float)])
X_test_final = sp.hstack([X_test_categorical,X_test_text_features.astype(float)])


#print (X_train_final.shape)



Example #30
File: data.py | Project: LimingDeng/scnn
def parse_wikirfa(n_features = 100):
    '''Parse wiki-rfa for an edge sentiment analysis experiment.'''

    #from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import HashingVectorizer

    path = "%s/data/wikirfa/" % (current_dir,)

    n_nodes = 10926
    n_edges = 176963  # number of +/- edges

    n_edges_read = 189004  # includes neutral edges

    # username -> index map constructed on the fly
    usermap = {}

    A = np.zeros((n_nodes, n_nodes), dtype='float32')
    B = np.zeros((n_edges, n_nodes), dtype='float32')
    X_N = np.ones((n_nodes, 1) , dtype='float32')
    X_E = np.ones((n_edges, n_features) , dtype='float32')
    Y = np.zeros((n_edges, 2), dtype='int32')

    # build comment vectorizer
    with open(path + 'all_comments.txt', 'r') as f:
        #vectorizer = CountVectorizer('content', max_features=n_features)
        vectorizer = HashingVectorizer(input='content', n_features=n_features)
        vectorizer.fit(f)  # iterate the file directly; xreadlines() no longer exists in Python 3

    u = 0
    v = 0

    resmap  = {-1: 0, 1: 1}
    votemap = {-1: 0, 1: 1}

    with open(path + 'wiki-RfA.txt','r') as f:
        for i in range(n_edges_read):
            # Read in the entry
            # SRC:Guettarda
            tail = f.readline().strip()[4:]

            # TGT:Lord Roem
            head = f.readline().strip()[4:]

            # VOT:1
            vote = int(f.readline().strip()[4:])

            # RES:1
            res = int(f.readline().strip()[4:])

            # YEA:2013
            year = int(f.readline().strip()[4:])

            #DAT:19:53, 25 January 2013
            date = f.readline().strip()[4:]

            # TXT:'''Support''' per [[WP:DEAL]]: clueful, and unlikely to break Wikipedia.
            txt = f.readline().strip()[4:]

            # kill blank line
            f.readline()

            # Process the entry
            # index users
            if tail not in usermap:
                usermap[tail] = u
                u += 1
            if head not in usermap:
                usermap[head] = u
                u += 1

            if vote != 0:
                # add vote edge
                A[usermap[tail], usermap[head]] = vote
                A[usermap[head], usermap[tail]] = vote

                # add to incidence matrix
                B[v, usermap[tail]] = 1
                B[v, usermap[head]] = 1

                X_N[usermap[head],0] = resmap[res]
                X_E[v, :] = np.asarray(vectorizer.transform([txt]).todense())[0]

                Y[v, votemap[vote]] = 1
                v += 1

    print(u)
    print(v)

    assert len(usermap) == n_nodes
    assert u == n_nodes
    assert v == n_edges

    return A, B, X_N, X_E, Y
Example #31
    # efficient in case of large datasets
    vectorizer = HashingVectorizer(stop_words='english')

    # With HashingVectorizer there is no need to fit on the data;
    # transform alone works because the vectorizer is stateless.
    X_train = vectorizer.transform(data_train.data)
    X_test = vectorizer.transform(data_test.data)
elif feature_extractor_type == "count":
    # The other vectorizer we can use is CountVectorizer with
    # binary=True. But for CountVectorizer we need to fit
    # transform over both training and test data as it
    # requires the complete vocabulary to create the matrix
    vectorizer = CountVectorizer(stop_words='english', binary=True)

# First fit the data
vectorizer.fit(data_train.data + data_test.data)

# Then transform it
X_train = vectorizer.transform(data_train.data)
X_test = vectorizer.transform(data_test.data)

# alpha is additive (Laplace/Lidstone) smoothing parameter (0 for
# no smoothing).
clf = BernoulliNB(alpha=.01)

# Training the classifier
clf.fit(X_train, y_train)

# Predicting results
y_predicted = clf.predict(X_test)
score = metrics.accuracy_score(y_test, y_predicted)
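A compact illustration of the point the comments above make: HashingVectorizer is stateless and can transform unseen text directly, while CountVectorizer must first see the full vocabulary. The toy documents are made up.

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

train_docs = ["spam offer now", "meeting at noon"]   # made-up documents
test_docs = ["completely new words here"]

hv = HashingVectorizer(stop_words='english', n_features=2**10)
X_hash = hv.transform(test_docs)                     # no fit needed; unseen words still hash

cv = CountVectorizer(stop_words='english', binary=True)
cv.fit(train_docs + test_docs)                       # vocabulary must cover both splits
X_count = cv.transform(test_docs)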
Example #32
del artist

hotness_mean = train.song_hotttnesss.mean()  # compute once, before any values are overwritten
train.loc[train.song_hotttnesss > hotness_mean, 'song_hotttnesss'] = 1
train.loc[train.song_hotttnesss < hotness_mean, 'song_hotttnesss'] = 0

train.year = (train.year // 10) * 10
test.year = (test.year // 10) * 10

CategoricalFeatures = train[['artist_id', 'title', 'audio_md5']]

from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer

vectorizer = HashingVectorizer(n_features=750)
#vectorizer = TfidfVectorizer(min_df  = 0.0002)

TfidfVectorizerObject = vectorizer.fit(pd.concat([train.title, test.title]))
CountVectorizerTrainData = TfidfVectorizerObject.transform(train["title"])
CountVectorizerTestData = TfidfVectorizerObject.transform(test["title"])

DropFeatures = [
    'song_id', 'artist_id', 'title', 'audio_md5', 'analysis_sample_rate',
    'key_confidence', 'audio_md5', 'year', 'end_of_fade_in', 'duration',
    'time_signature_confidence', 'artist_latitude', 'artist_longitude'
]

trainSongId = train[['song_id']]
train = train.drop(DropFeatures, axis=1)
song_id = test['song_id']
test = test.drop(DropFeatures, axis=1)
train = pd.concat(
    [train, pd.DataFrame(CountVectorizerTrainData.toarray())], axis=1)
Example #33
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# initialize TFIDF vectorizer and Hashing Vectorizer
tfidf = TfidfVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                        ngram_range=(1, 2),
                        max_df=1.0,
                        min_df=10,
                        stop_words='english')

hsv = HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                        stop_words='english')

# fit tfidf and hashing vectorizer to train data
print("Feature Extraction")
tfidf.fit(x_)
hsv.fit(x_)

# transform
X_tfidf = tfidf.transform(x_)
X_test_tfidf = tfidf.transform(test_x_)

X_hsv = hsv.transform(x_)
X_test_hsv = hsv.transform(test_x_)

# combine
X = sparse.hstack((X_hsv, X_tfidf))
X_test = sparse.hstack((X_test_hsv, X_test_tfidf))

######################################################
# Using SGDClassifier to build 27 models. Each class
# (9 total) will have 3 models - varying some of the
Example #34
# (output) repr of the fitted CountVectorizer `cvec` with its default parameters:
#   CountVectorizer(dtype=numpy.int64, encoding='utf-8', input='content',
#                   lowercase=True, max_df=1.0, max_features=None, min_df=1,
#                   ngram_range=(1, 1), preprocessor=None, stop_words=None,
#                   strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
#                   tokenizer=None, vocabulary=None)

# Creates a df that lists numbers of uniques and the unique 'string'
df = pd.DataFrame(cvec.transform([spam]).todense(),
                  columns=cvec.get_feature_names_out())  # get_feature_names() was removed in scikit-learn 1.2

df.transpose().sort_values(0, ascending=False).head(10).transpose()

# Hash Vectorizer, more for big data
from sklearn.feature_extraction.text import HashingVectorizer
hvec = HashingVectorizer()
hvec.fit([spam])

df  = pd.DataFrame(hvec.transform([spam]).todense())
df.transpose().sort_values(0, ascending=False).head(10).transpose()

# Breaks up sentences and puts them into an array
from nltk.tokenize import PunktSentenceTokenizer
easy_text = "I went to the zoo today. What do you think of that? I bet you hate it! Or maybe you don't"
sent_detector = PunktSentenceTokenizer()
sent_detector.sentences_from_text(easy_text)

"""
Out[6]: 
['I went to the zoo today.',
 'What do you think of that?',
 'I bet you hate it!',