Example #1
def jiebaCounter(max_features=5000, prefix="extraction-", begin=1, end=1, dictionary=""):
    # get stopwords
    sf = open('chi_,.txt', 'r')
    stopwords = [x.strip().decode('utf-8') for x in sf.read().split(',')]
    if dictionary == "":
        vectorizer = cv(max_features=max_features, stop_words=stopwords)  #tokenizer=tokenizer)
    else:
        vocabulary = open(dictionary, 'r').read().split("\n")
        vectorizer = cv(vocabulary=vocabulary, max_features=max_features, stop_words=stopwords)  #tokenizer=tokenizer)
    d = {}
    st = time.time()
    d, txt = getText(prefix=prefix, begin=begin, end=end)
    getdatatime = time.time()
    print getdatatime - st
    corpus = {}
    for i in range(len(txt)):  #d.items():
        #corpus.append(" ".join(jieba.cut(line.split(',')[0],cut_all=False)))
        corpus[i] = ' '.join(jieba.cut(txt[i], cut_all=False))
    vect = vectorizer.fit_transform(corpus.values()).toarray()
    print vect.shape
    voc = vectorizer.get_feature_names()
    wordssum = vect.sum(axis=0)
    index = range(len(voc))
    index = [index for (y, x, index) in sorted(zip(wordssum, voc, index), reverse=True)
             if x not in stopwords]
    print time.time() - st
    voc_sorted = [voc[i] for i in index]
    print time.time() - getdatatime
    return vect, voc, txt
def version2():  # Data cleaning in NLP Model
    corpus = []

    stops = set(sw.words('english'))  # build the stopword set once, not per review
    stemmer = ps()                    # one stemmer instance is enough
    for i in range(0, 527383):
        # Keep letters only, lowercase, drop stopwords, then stem each review
        review = re.sub('[^a-zA-Z]', ' ', df.iloc[i, 1])
        review = [
            stemmer.stem(word) for word in review.lower().split()
            if word not in stops
        ]
        corpus.append(" ".join(review))

    labels = df.iloc[:, -1]

    # split the raw corpus; the bare 100 is assumed to be the intended random_state
    features_train, features_test, labels_train, labels_test = train_test_split(
        corpus, labels, random_state=100)

    # fit one vectorizer on the training split only, then reuse it for the test split
    vectorizer = cv()
    features_train_vectorized = vectorizer.fit_transform(features_train)
    features_test_vectorized = vectorizer.transform(features_test)

    model = lr().fit(features_train_vectorized, labels_train)
    predictions = model.predict(features_test_vectorized)
    ras(labels_test, predictions)
    cm(labels_test, predictions)

    return model
def version1():  # Logistic Regression Model
    train_test_split(df["reviewText"], df["Positivity"], 100)

    features_train_vectorized = cv().fit_transform(features_train)
    features_test_vectorized = cv().transform(features_test)

    model = lr().fit(features_train_vectorized,
                     labels_train)  # Model creation for logistic regression
    predictions = model.predict(features_test_vectorized)

    ras(labels_test, predictions)  # Generating prediction score
    cm(labels_test, predictions)

    return model
Example #4
def news_iterator_raw(input_size,batchsize):
    # Load some categories from the training set
    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]
    print("Loading 20 newsgroups dataset for 20 categories.")
    #print(categories)
    traindata = fetch_20newsgroups(subset='train', remove=('headers','footers','quotes'),categories=None)
    count_vectorizer = cv(max_df=0.95, min_df=2, max_features=input_size,
                          stop_words='english')
    train = count_vectorizer.fit_transform(traindata.data)

    # bag of words
    vocabulary = count_vectorizer.get_feature_names()  # feature names
    x = train.astype(np.float64).toarray()

    train_x = x / (np.sum(x, axis=1)[:, None] + 1e-10)
    train_y = traindata.target

    testdata = fetch_20newsgroups(subset='test', categories=None, remove=('headers','footers','quotes'))
    test_features = count_vectorizer.transform(testdata.data)

    x = test_features.astype(np.float64).toarray()
    test_x = x / (np.sum(x, axis=1)[:, None] + 1e-10)
    test_y = testdata.target
    return (train_x, train_y, test_x, test_y, vocabulary)
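
# Illustrative call (not part of the original file): fetch_20newsgroups downloads
# the corpus on first use, so this is just a sketch of the intended usage.
train_x, train_y, test_x, test_y, vocabulary = news_iterator_raw(2000, 100)
print(train_x.shape, test_x.shape, len(vocabulary))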
Example #5
def main():

	traindata = (p.read_table('train.tsv'))
	tr_title, tr_body, tr_url = convert_text(traindata)

	testdata = list(np.array(p.read_table('test.tsv'))[:,2])
	y = np.array(p.read_table('train.tsv'))[:,-1]

	wordCount = cv(stop_words = 'english', encoding='latin-1')
	wordTFIDF = tfidf(stop_words = 'english', encoding='latin-1')

	corpus = tr_body

	bag = wordCount.fit_transform(corpus)
	tfdif = wordTFIDF.fit_transform(corpus)

	tfdif = tfdif.toarray()

	kmeans_soln.getDender(bag, tr_title)

	titles = np.array(tr_title)

	vocab = wordCount.get_feature_names()
	vocabTF = wordTFIDF.get_feature_names()

	topWords(centers, vocab)
Example #6
def main():

    traindata = (p.read_table('train.tsv'))
    tr_title, tr_body, tr_url = convert_text(traindata)

    testdata = list(np.array(p.read_table('test.tsv'))[:, 2])
    y = np.array(p.read_table('train.tsv'))[:, -1]

    wordCount = cv(stop_words='english', encoding='latin-1')
    wordTFIDF = tfidf(stop_words='english', encoding='latin-1')

    corpus = tr_body

    bag = wordCount.fit_transform(corpus)
    tfdif = wordTFIDF.fit_transform(corpus)

    tfdif = tfdif.toarray()

    kmeans_soln.getDender(bag, tr_title)

    titles = np.array(tr_title)

    vocab = wordCount.get_feature_names()
    vocabTF = wordTFIDF.get_feature_names()

    topWords(centers, vocab)
    def train(self):
        """
        Trains the model based on movies; title, genres, tag
        """
        for attribute in ['title', 'genres', 'actorName', 'directorName']:
            self.movies[attribute] = self.movies[attribute]
        self.movies['merged'] = self.movies.apply(self.merge, axis=1)
        count_vectorized = cv()
        cs = cosine_similarity(
            count_vectorized.fit_transform(self.movies['merged']))
        recommended_movies = list(
            enumerate(cs[self.get_movie_id(self.watched_movie)]))

        if recommended_movies:
            predicted = self.get_highest(recommended_movies)
            for i, row in self.movies.iterrows():

                if predicted[0] == i:
                    print('\nSince you\'ve liked', self.watched_movie,
                          'We recommend: ', row['title'], 'genres:',
                          row['genres'])
                    print('Accuracy', predicted[1])
                if i == 999:
                    print(self.watched_movie, 'movie\'s genre:', row['genres'])
                    print()
        else:
            print('Something went wrong with the analysis')
Example #8
def news_iterator(input_size,batchsize=100,alldata=True,label="y"):
    # Load some categories from the training set
    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]
    # load train data
    if(alldata):
        print("Loading 20 newsgroups dataset for 20 categories.")
        traindata = fetch_20newsgroups(subset='train',  remove=('headers','footers','quotes'),categories=None)
    else:
        print("Loading 20 newsgroups dataset for 2 categories.")
        traindata = fetch_20newsgroups(subset='train',  remove=('headers','footers','quotes'),categories=categories)
    # preprocessing
    words=traindata.data
    train_words=[]
    for i in range(0,len(traindata.data)):
        train_words.append( raw_to_words(traindata.data[i]) )

    # train iterator
    vectorizer=cv(analyzer="word",max_features=input_size, stop_words='english')
    train_features=vectorizer.fit_transform(train_words).toarray()
    vocabulary=vectorizer.get_feature_names() # feature name
    x = train_features.astype(np.float64)
    X_normalized = x / (np.max(x, axis=1)[:, None] + 1e-10)
    #ss = pp.StandardScaler(with_mean=False).fit(x)
    #X_normalized = ss.transform(x)
    y = traindata.target
    if(label=="x"):
        train_dataiter = mx.io.NDArrayIter(data=X_normalized, label = X_normalized, batch_size=batchsize, shuffle = True)
    else:
        train_dataiter = mx.io.NDArrayIter(data=X_normalized, label = y, batch_size=batchsize, shuffle = True)
    
    # load test data
    if alldata :
        testdata = fetch_20newsgroups(subset='test',  categories=None, remove=('headers','footers','quotes'))
    else:
        testdata = fetch_20newsgroups(subset='test',  categories=categories, remove=('headers','footers','quotes'))
    test_words=[]
    for i in range(0,len(testdata.data)):
        test_words.append( raw_to_words(testdata.data[i]) )

    # test iterator
    test_features=vectorizer.transform(test_words).toarray()
    x = test_features.astype(np.float64)
    #X_normalized = ss.transform(x)
    X_normalized = x / (np.max(x, axis=1)[:, None] + 1e-10)
    y = testdata.target
    if(label=="y"):
        val_dataiter = mx.io.NDArrayIter(data=X_normalized, label = y,  batch_size=batchsize, shuffle = True)
    else:
        val_dataiter = mx.io.NDArrayIter(data=X_normalized, label = X_normalized,  batch_size=batchsize, shuffle = True)

    return (train_dataiter, val_dataiter,vocabulary)
Example #9
def transformer_array_vsm(tweet_list):
    # Term-document matrix under the vector space model
    corpus = []
    for tweet in tweet_list:
        corpus.append(' '.join(tweet.get_word_list()))
    vectorizer = cv()
    transformer = tt()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    array = tfidf.toarray()
    word_list = vectorizer.get_feature_names()
    return array, word_list
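
# Illustrative usage (not in the original file) with a hypothetical stand-in for
# this project's tweet objects; all transformer_array_vsm relies on is a
# get_word_list() method that returns a list of tokens.
class _FakeTweet(object):
    def __init__(self, words):
        self.words = words

    def get_word_list(self):
        return self.words


array, word_list = transformer_array_vsm(
    [_FakeTweet(["hello", "world"]), _FakeTweet(["hello", "there"])])
print(array.shape)   # (2, 3): two documents, three distinct terms
print(word_list)     # ['hello', 'there', 'world']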
Example #10
def jiebaCounter(max_features=5000,
                 prefix="extraction-",
                 begin=1,
                 end=1,
                 dictionary=""):
    # get stopwords
    sf = open('chi_,.txt', 'r')
    stopwords = [x.strip().decode('utf-8') for x in sf.read().split(',')]
    if dictionary == "":
        vectorizer = cv(max_features=max_features,
                        stop_words=stopwords)  #tokenizer=tokenizer)
    else:
        vocabulary = open(dictionary, 'r').read().split("\n")
        vectorizer = cv(vocabulary=vocabulary,
                        max_features=max_features,
                        stop_words=stopwords)  #tokenizer=tokenizer)
    d = {}
    st = time.time()
    d, txt = getText(prefix=prefix, begin=begin, end=end)
    getdatatime = time.time()
    print getdatatime - st
    corpus = {}
    for i in range(len(txt)):  #d.items():
        #corpus.append(" ".join(jieba.cut(line.split(',')[0],cut_all=False)))
        corpus[i] = (' '.join(jieba.cut(txt[i], cut_all=False)))
    vect = vectorizer.fit_transform(corpus.values()).toarray()
    print vect.shape
    voc = vectorizer.get_feature_names()
    wordssum = vect.sum(axis=0)
    index = range(len(voc))
    index = [
        index
        for (y, x, index) in sorted(zip(wordssum, voc, index), reverse=True)
        if x not in stopwords
    ]
    print time.time() - st
    voc_sorted = [voc[i] for i in index]
    print time.time() - getdatatime
    return vect, voc, txt
Example #11
def cossim(doc1, doc2):
    from sklearn.metrics.pairwise import cosine_similarity as cs
    from sklearn.feature_extraction.text import CountVectorizer as cv

    x = [doc1, doc2]
    vectorizer = cv().fit_transform(x)
    vectors = vectorizer.toarray()

    a = vectors[0].reshape(1, -1)
    b = vectors[1].reshape(1, -1)

    similarity_score = cs(a, b)

    return similarity_score
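
# Quick usage sketch (the two sentences are made-up examples); cs() returns a
# 1x1 array, so index [0][0] for the scalar cosine similarity.
doc_a = "the cat sat on the mat"
doc_b = "the cat lay on the rug"
print(cossim(doc_a, doc_b)[0][0])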
Example #12
def bayes(frame, size, occmin):

    # Filtering dataset to occupations occurring more than n times

    unfiltq = getqcols(frame)
    firstq = frame.columns.get_loc(unfiltq[0])
    tofilter = frame.iloc[:, firstq:]
    overmin = tofilter.iloc[:, (
        frame.iloc[:, firstq:].sum() > occmin).values].columns
    frame = frame.loc[:, frame.columns[:firstq].append(overmin)]

    filtq = getqcols(frame)
    firstq = frame.columns.get_loc(filtq[0])

    # Text cleaning
    frame['text'] = frame['text'].apply(nostops).apply(
        lambda i: ' '.join(i)).apply(cleaner)
    frame.insert(
        frame.columns.get_loc(filtq[0]), 'sets',
        frame['text'].apply(lambda i: set(re.findall("[a-z]{3,}", i))))
    frame['sets'] = [([a for a in x if a not in stoplist])
                     for x in frame['sets']]
    frame['sets'] = list(map(set, frame['sets']))
    frame['sets'] = frame.sets.apply(lambda i: ' '.join(i))

    # Vectorizer, fit, df data
    vec = cv(stop_words='english', max_features=30000)
    vec.fit(frame.sets)
    X = vec.transform(frame.text).toarray()
    acc = []
    rec = []
    pre = []
    cts = []
    totalcts = []
    cm = []
    label_used = []
    auc = []

    # Bayes loops
    for i in filtq:
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            frame.loc[:, i],
                                                            test_size=size,
                                                            random_state=0)
        gb = GaussianNB()
        gb.fit(X_train, y_train)
        pred = gb.predict(X_test)
        try:
            output = metrics.classification_report(y_test,
                                                   pred,
                                                   output_dict=True,
                                                   zero_division=0)['0']
            auc_score = metrics.roc_auc_score(y_test, pred)
            a = 0
        except:
            output = metrics.classification_report(y_test,
                                                   pred,
                                                   output_dict=True,
                                                   zero_division=0)['1']
            a = 1
            auc_score = 'N/A'
        label_used.append(str(a))
        acc.append("{:0.3}".format(metrics.accuracy_score(y_test, pred)))
        pre.append("{:0.3}".format(output['precision']))
        rec.append("{:0.3}".format(output['recall']))
        cts.append(y_test.sum())
        cm.append(metrics.confusion_matrix(y_test, pred))
        auc.append(auc_score)
        totalcts.append(sum(frame.loc[:, i]))

    d = {
        'label': label_used,
        'acc': acc,
        'pre': pre,
        'rec': rec,
        'auc': auc,
        'cts': cts,
        'totalcts': totalcts,
        'conf': cm
    }
    df = pd.DataFrame(data=d, index=filtq)
    return df
Example #13
"""  bag of words vectorizor """
from sklearn.feature_extraction.text import CountVectorizer as cv

string1 = 'Hey Brian, go get me sone water best, Tony'
string2 = 'Dearest Tony, go suck a c**k. Bye, Brian'
string3 = 'Morning Brian, you little bitch, I am gonna report you. From, Tony'

#you need to put the emails in a list to vectorize them
emails = [string1, string2, string3]

#creates the vectorizer
vectorize = cv()

#fit learns the vocabulary from the emails; transform maps them onto it
vectorize.fit(emails)
bag_of_words = vectorize.transform(emails)

print bag_of_words
"""
How to read bag of words

(2,20)  2
(string3, word number 20)  #numer of occurance

"""

# this prints the feature (column) number assigned to a word
print vectorize.vocabulary_.get('you')
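
# For a dense view of the counts and of the word-to-column mapping (standard
# CountVectorizer methods; shown here as an illustrative addition):
print bag_of_words.toarray()
print vectorize.get_feature_names()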


Example #14
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer as cv
from sklearn.metrics.pairwise import cosine_similarity as cs
cv = cv()
# a = np.array(4000)
import random


def find_title(index):
    try:
        return df[df.index == index]["title"].values[0]
    except:
        pass


def find_ref(title):
    try:
        return df[df.title == title]["index"].values[0]
    except:
        return 1932


df = pd.read_csv("movie.csv")
columns = df.columns
for column in columns:
    df[column] = df[column].fillna('')
    df[column] = df[column].dropna()


def recommend_by_feature(row):
Example #15
findElbow(x_iris)


# In[45]:

from sklearn.feature_extraction.text import CountVectorizer as cv
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
from sklearn.metrics.pairwise import linear_kernel

df = pd.read_pickle('articles.pkl')
df.head()


# In[104]:

wordVector = cv(stop_words = 'english', encoding='latin-1')
wordWeights = tfidf(stop_words = 'english', encoding='latin-1')

corpus = df[df['section_name'] == 'Sports']['content']
corpus = corpus.append(df[df['section_name'] == 'Arts']['content'])
corpus = corpus.append(df[df['section_name'] == 'Business Day']['content'])

bag = wordVector.fit_transform(corpus)
weightybags = wordWeights.fit_transform(corpus)


# In[105]:

weightybags = weightybags.toarray()

Example #16
def para2words(para):
    para_text = bs(para).get_text()
    para_lettersonly = re.sub("[^a-zA-Z]", " ", para_text)  # replace non-letters with spaces so words stay separated
    para_words = para_lettersonly.lower().split()
    stops = set(sw.words("english"))
    para_meaning = [n for n in para_words if not n in stops]
    return (" ".join(para_meaning))


num_reviews = train["review"].size
clean_train = []
for i in range(0, num_reviews):
    clean_train.append(para2words(train["review"][i]))

vector = cv(analyzer="word",
            tokenizer=None,
            preprocessor=None,
            max_features=5000)

train_df = vector.fit_transform(clean_train)
train_df = train_df.toarray()

forest = rf(n_estimators=150)
forest = forest.fit(train_df, train["sentiment"])

numofrev = len(test["review"])
cleanreview = []
for i in range(0, numofrev):
    cleanreview.append(para2words(test["review"][i]))

test_df = vector.transform(cleanreview)
test_df = test_df.toarray()
Example #17
    res = f1_score(y_test, clf.predict(X_test), pos_label=None, average='macro')
    print 'f1 macro:', res
    print
    # color = cm(1. * i / NUM_COLORS)  # color will now be an RGBA tuple
    # cm = plt.get_cmap('gist_rainbow')
    # fig = plt.figure(figsize=(8.0, 5.0))
    # ax = fig.add_subplot(111)
    # # ax.set_color_cycle([cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)])
    # ax.plot(range(len(scores)), scores, label=str(threshold))
    # ax.text(len(scores) - 1, scores[len(scores) - 1], threshold, fontsize='smaller')
    # plt.show()
    print name
    return res


vec_list = [tf(), cv()]
clf_list = [svc(), lr()]
threshold_list = np.arange(0.5, 3, 0.5)
print len(threshold_list)
# results_size = (len(vec_list), len(clf_list),len(threshold_list))
# results = np.zeros(results_size, dtype = np.float)
# a, b, c = range(3), range(3), range(3)
# def my_func(x, y, z):
#     return (x + y + z) / 3.0, x * y * z, max(x, y, z)

grids = np.vectorize(run)(*np.ix_(threshold_list, vec_list, clf_list))
# mean_grid, product_grid, max_grid = grids
print len(grids)
try:
    print grids.shape
except:
Example #18

findElbow(x_iris)

# In[45]:

from sklearn.feature_extraction.text import CountVectorizer as cv
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
from sklearn.metrics.pairwise import linear_kernel

df = pd.read_pickle('articles.pkl')
df.head()

# In[104]:

wordVector = cv(stop_words='english', encoding='latin-1')
wordWeights = tfidf(stop_words='english', encoding='latin-1')

corpus = df[df['section_name'] == 'Sports']['content']
corpus = corpus.append(df[df['section_name'] == 'Arts']['content'])
corpus = corpus.append(df[df['section_name'] == 'Business Day']['content'])

bag = wordVector.fit_transform(corpus)
weightybags = wordWeights.fit_transform(corpus)

# In[105]:

weightybags = weightybags.toarray()

# In[106]:
Example #19
        avg = np.mean(new_vec, axis=0)
        max = np.max(new_vec, axis=0)
        min = np.min(new_vec, axis=0)
        all = np.concatenate((avg, max, min))
        new_data.append(all)
    return new_data


if __name__ == "__main__":
    datafolder = '/Users/claire/Dropbox/PycharmProjects/Thesis/Scripts/Data/'
    trainfile = datafolder + 'twitter/twitter.train'
    testfile = datafolder + 'twitter/twitter.dev'
    tr_data, tr_target = load_twitter_2class(trainfile)
    te_data, te_target = load_twitter_2class(testfile)

    vec1 = cv(analyzer='word', ngram_range=(1, 4))
    vec2 = cv(analyzer='char_wb', ngram_range=(1, 4))

    combined_features = FeatureUnion([("word", vec1), ("char", vec2)])
    print combined_features

    # Use combined features to transform dataset:
    print 'Fit transform data'
    X_train = combined_features.fit_transform(tr_data)
    print X_train.shape
    X_test = combined_features.transform(te_data)
    # X_train = vec2.fit_transform(tr_data)
    # X_test = vec2.transform(te_data)

    print 'TRANSFORMED'
    for i in [log, svm]:
Example #20
def reviews2words(raw_review):
    review_text = bs(raw_review).get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    wordlist = letters_only.lower().split()
    words = [w for w in wordlist if not w in stop]
    return " ".join(words)

if __name__ == "__main__":
    train = pd.read_csv(".\data\labeledTrainData.tsv", header = 0, delimiter = "\t", quoting=3)
    clean_train = []
    print 'Start cleaning reviews...\n'
    for i in xrange(0, train['review'].size):
        clean_train.append(reviews2words(train['review'][i]))
        
    print 'Creating the bag of words...\n'
    vectorizer = cv(analyzer = 'word', tokenizer = None, preprocessor = None, stop_words = None, max_features = 5000)
    train_data_features = vectorizer.fit_transform(clean_train)
    train_data_features = train_data_features.toarray()
    # print train_data_features.shape
     
    vocab = vectorizer.get_feature_names()
    # print vocab
    
    forest = rf(n_estimators = 100)
    forest = forest.fit(train_data_features, train['sentiment'])

    test = pd.read_csv(".\data\\testData.tsv", header=0, delimiter="\t", quoting=3)
    # Create an empty list and append the clean reviews one by one

    num_reviews = len(test["review"])
    clean_test_reviews = [] 
Example #21
    def fit(self, x):  # Fit the encoder/scaler
        self.n = x.shape[0]
        self.p = x.shape[1]
        dt1 = pd.Series([type(x.iloc[0][kk]).__name__ for kk in range(self.p)])
        dt2 = x.dtypes.astype(str).reset_index(drop=True)
        self.dt = pd.Series(
            np.where(
                dt1.isin(['int64', 'float64'])
                & dt2.isin(['int64', 'float64']), 'float', 'str'))
        if not all(self.dt.values == 'float'):
            self.dt[~(self.dt.values == 'float')] = \
                np.where(x.loc[:, ~(self.dt.values == 'float')].apply(lambda x: x.str.contains('\\|', na=False).any()),
                 'lst',self.dt[~(self.dt.values == 'float')])
        self.cn = np.array(x.columns)
        stopifnot(all(self.dt.isin(['float', 'lst', 'str'])))
        self.cidx = np.where(self.dt == 'str')[0]
        self.nidx = np.where(self.dt == 'float')[0]
        self.tidx = np.where(self.dt == 'lst')[0]
        stopifnot(
            all(
                np.sort(reduce(np.union1d, [self.cidx, self.nidx, self.tidx]))
                == np.arange(self.p)))
        self.iter = {'cenc': True, 'nenc': True, 'tenc': True}
        self.all_enc = {}

        #############################################################
        # --- Encoder (i): Categorical/ordinal integer features --- #

        if len(self.cidx) > 0:
            self.cenc = ohe(sparse=self.sparse,
                            dtype=self.dtype,
                            handle_unknown='ignore',
                            drop=None)
            self.cenc.categories_ = [
                np.unique(x.iloc[:, kk]) for kk in self.cidx
            ]
            self.cmode = [x.iloc[:, kk].mode()[0] for kk in self.cidx]
            cmode_idx = np.array([
                np.where(vec == mm)[0][0]
                for vec, mm in zip(self.cenc.categories_, self.cmode)
            ])
            cum_idx = np.append([0],
                                np.cumsum(
                                    [len(z) for z in self.cenc.categories_]))
            self.cenc.drop_idx = []
            self.cenc.drop_idx_ = None
            self.cenc.p = cum_idx.max() - len(
                self.cenc.drop_idx
            )  # How many features after dropping most common
            self.cenc.cn = list(
                np.delete(self.cenc.get_feature_names(self.cn[self.cidx]),
                          self.cenc.drop_idx))
            self.all_enc['cenc'] = self.cenc
        else:
            self.iter['cenc'] = False

        ###############################################
        # --- Encoder (ii): Continuous numerical ---- #

        if len(self.nidx) > 0:
            if self.quantize:
                u_nidx = np.array(
                    [len(x.iloc[:, kk].unique()) for kk in self.nidx])
                self.nidx1 = self.nidx[u_nidx > 31]  # quantize
                self.nidx2 = self.nidx[u_nidx <= 31]  # one-hot-encode
                self.nenc = {'enc': {}, 'cn': {}}
                if len(self.nidx1) > 0:
                    self.nenc1 = KD(n_bins=self.nbins, strategy='quantile')
                    if not self.sparse:
                        self.nenc1.encode = 'onehot-dense'
                    self.nenc1.fit(x.iloc[:, self.nidx1])
                    self.nenc1.cn = ljoin([
                        cn + '_q' + pd.Series(qq).astype(str)
                        for cn, qq in zip(self.cn[self.nidx1], [
                            np.arange(len(z) - 1) + 1
                            for z in self.nenc1.bin_edges_
                        ])
                    ])
                    self.nenc['enc']['nenc1'] = self.nenc1
                    self.nenc['cn']['nenc1'] = self.nenc1.cn
                if len(self.nidx2) > 0:
                    self.nenc2 = ohe(sparse=self.sparse,
                                     handle_unknown='ignore',
                                     drop=None)
                    self.nenc2.fit(x.iloc[:, self.nidx2])
                    self.nenc2.cn = self.nenc2.get_feature_names(
                        self.cn[self.nidx2])
                    self.nenc['enc']['nenc2'] = self.nenc2
                    self.nenc['cn']['nenc2'] = self.nenc2.cn
                self.nenc['cn'] = ljoin(list(self.nenc['cn'].values()))
                self.all_enc['nenc'] = self.nenc
            else:
                self.nenc = ss(copy=False)
                self.nenc.mean_ = x.iloc[:, self.nidx].mean(axis=0).values
                self.nenc.scale_ = x.iloc[:, self.nidx].std(axis=0).values
                self.nenc.n_features_in_ = self.nidx.shape[0]
                self.nenc.p = self.nidx.shape[0]
                self.nenc.cn = list(self.cn[self.nidx])
                self.all_enc['nenc'] = self.nenc
        else:
            self.iter['nenc'] = False

        ################################################
        # --- Encoder (iii): Tokenize text blocks ---- #

        if len(self.tidx) > 0:
            self.tenc = dict(
                zip(self.cn[self.tidx], [
                    cv(tokenizer=lambda x: tok_fun(x),
                       lowercase=False,
                       token_pattern=None,
                       binary=True) for z in range(self.tidx.shape[0])
                ]))
            self.tenc = {'cv': self.tenc}
            for kk, jj in enumerate(self.cn[self.tidx]):
                self.tenc['cv'][jj].fit(x.loc[:, jj].astype('U'))
            self.tenc['p'] = sum(
                [len(z.vocabulary_) for z in self.tenc['cv'].values()])
            self.tenc['cn'] = ljoin([
                l + '_' + pd.Series(list(z.vocabulary_.keys())) for z, l in
                zip(self.tenc['cv'].values(), self.tenc['cv'].keys())
            ])
            self.all_enc['tenc'] = self.tenc
        else:
            self.iter['tenc'] = False

        # Store everything in a dictionary so we can iterate over self.iter
        self.enc_transform = {
            'cenc': self.cenc_transform,
            'nenc': self.nenc_transform,
            'tenc': self.tenc_transform
        }
        # Get the valid categories
        self.tt = np.array(list(self.iter.keys()))[np.where(
            list(self.iter.values()))[0]]
        # Get full feature names
        cn = []
        for ee in self.tt:
            if hasattr(self.all_enc[ee], 'cn'):
                cn.append(self.all_enc[ee].cn)
            else:
                cn.append(self.all_enc[ee]['cn'])
        cn = ljoin(cn)
        self.cn_transform = cn
Example #22
''' This file implements the functions needed for the SVM classifier. 
Most of them are fairly straightforward.
'''
from sklearn.feature_extraction.text import CountVectorizer as cv
from sklearn import svm

''' Globals. For some reason this is the only way to get the classifiers to work.
'''
#    Vectorizes the words into a format suitable for processing;
#    the ngram range gives the minimum and maximum n-gram sizes taken into account.
vectorizer = cv(ngram_range=(1,2),token_pattern=r'\b\w+\b', min_df = 1)
svm = svm.LinearSVC()


def train(train_data, labels):
    global svm    
    svm.fit(train_data, labels)
    return
    
''' Returns the accuracy on a given dataset'''
def score(test_data, test_labels):
    global svm
    return svm.score(test_data, test_labels)
''' Predicts the class of the tweets passed as input.'''
def predict(test_data):
    return svm.predict(test_data)
    
''' Turns a set of tweets into its corresponding vector. For more about vectorization see:
   http://en.wikipedia.org/wiki/Bag-of-words_model
'''
def vectorize_tweets(training_set,train = True):
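    # The body is cut off in this listing. A minimal sketch of what it plausibly
    # does, given the docstring and the global `vectorizer` above: learn the
    # vocabulary from the training set, then only transform afterwards.
    if train:
        return vectorizer.fit_transform(training_set)
    return vectorizer.transform(training_set)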
Example #23
import seaborn as sns 
%matplotlib inline 
import string 
from nltk.corpus import stopwords 

# Removing punctuation and stopwords
def function_before(mess):
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    stops = set(stopwords.words('english'))  # build the stopword set once
    clean = [word.lower() for word in nopunc.split() if word.lower() not in stops]
    return clean
    
from sklearn.pipeline import Pipeline  
from sklearn.naive_bayes import MultinomialNB 
from sklearn.feature_extraction.text import CountVectorizer as cv 
from sklearn.feature_extraction.text import TfidfTransformer

#Using the naive bayes classifier we created an NLP model 
pipeline = Pipeline([('bow', cv(analyzer=function_before)), 
                    ('tfidf', TfidfTransformer()), 
                    ('classifier', MultinomialNB()), 
                    ]) 
pipeline.fit(data, emotional_class) 
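
# A usage sketch (not in the original): assuming `data`/`emotional_class` above
# are the training texts and labels, the fitted pipeline classifies raw text directly.
sample = ["I absolutely loved this, what a wonderful day"]
print(pipeline.predict(sample))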
Example #24
File: driver.py  Project: patrickdamery/nlp
                                            '').replace('""', '').replace(
                                                '"', '').replace('|', '')
            word = cleansed_w.lower()
            if word not in stop_words:
                revised_rev.append(word + ' ')
        test_text.append(''.join(revised_rev))

    train_text = list()
    labels = list()
    n = 0
    for data in train_data:
        train_text.append(data[1])
        labels.append(data[2])

    vectorizer = cv(encoding='utf-8',
                    strip_accents='unicode',
                    ngram_range=(1, 1),
                    decode_error='replace')
    vector_data = vectorizer.fit_transform(train_text)

    model_selector = model_selection
    X_train, X_test, y_train, y_test = model_selector.train_test_split(
        vector_data, labels, stratify=labels, test_size=0.2)

    classifier = sgd(loss='hinge', penalty='l1')
    classifier.fit(X_train, y_train)

    train_scores = classifier.score(X_train, y_train)
    print('Unigram Results')
    print('Train Scores')
    print(train_scores)
    print("Accuracy: %0.2f (+/- %0.2f)" %
Example #25
    def __init__(self, revisoes, ngram_range=(1, 1)):
        self.vetorizador = cv(ngram_range=ngram_range)
        self.revisoes = revisoes
        self.vetorizar()
        print("\nCountVectorizer finished the vectorization for %s." %
              str(ngram_range))
Example #26
                    header=0,
                    delimiter='\t',
                    quoting=3)

tokens, reviews = [], []

# preprocessing data => converting reviews to token list of words
for rev in range(0, train.shape[0]):
    t, r = review_to_words(train["review"][rev])
    tokens.append(t)  # token list
    reviews.append(r)  # seperate reviews

vocabulary = 5000  # max features
vectorizer = cv(analyzer="word",
                tokenizer=None,
                preprocessor=None,
                stop_words=None,
                max_features=vocabulary)

X = vectorizer.fit_transform(reviews).toarray()
Y = train["sentiment"]

validation_size = 0.20
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size)

#classifier = DecisionTreeClassifier()		#DTC
#classifier = SVC()				#SVM
#classifier = KNeighborsClassifier()		#KNN
classifier = NB(alpha=2)  # alpha sets the Laplace smoothing strength (alpha=0 would mean no smoothing)
classifier.fit(X_train, np.array(Y_train))
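
# The listing stops here; a natural next step (sketch, not in the original)
# is to score the held-out validation split created above.
print(classifier.score(X_validation, Y_validation))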