class TF_Transformer(base.BaseEstimator, base.TransformerMixin):
	def __init__(self):

		self.cv_bi = CountVectorizer(min_df=2,max_df=0.7,ngram_range=(1,2))
		self.tfidf_trans = TfidfTransformer()
		self.SVD_trans = TruncatedSVD(n_components=300)

    # X is a list of Fit_Review named tuples; y is None
	def fit(self, X, y=None):

		texts = [review.text for review in X]

		counts = self.cv_bi.fit_transform(texts)
		counts_tfidf = self.tfidf_trans.fit_transform(counts)
		self.SVD_trans.fit(counts_tfidf)

		return self

    # X is a list of either Fit_Review or Prod_Corpus named tuples
	def transform(self, X):

		texts = [review.text for review in X]

		counts = self.cv_bi.transform(texts)
		counts_tfidf = self.tfidf_trans.transform(counts)
		counts_trunc = self.SVD_trans.transform(counts_tfidf)

		return counts_trunc
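
A minimal usage sketch for the transformer above (not part of the original project): it assumes a hypothetical load_reviews() helper returning Fit_Review-style named tuples with .text and .label fields, and a corpus large enough to support the 300-component TruncatedSVD.

# Hedged sketch: load_reviews() and the .label field are assumptions for illustration.
from collections import namedtuple
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

Fit_Review = namedtuple('Fit_Review', ['text', 'label'])

reviews = load_reviews()                      # hypothetical loader
labels = [review.label for review in reviews]

model = Pipeline([
    ('features', TF_Transformer()),           # counts -> tf-idf -> 300-dim SVD
    ('clf', LogisticRegression()),
])
model.fit(reviews, labels)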
Example #2
def check_webshell(clf,dir):
    all=0
    all_php=0
    webshell=0

    webshell_files_list = load_files_re(webshell_dir)
    CV = CountVectorizer(ngram_range=(3, 3), decode_error="ignore", max_features=max_features,
                         token_pattern=r'\b\w+\b', min_df=1, max_df=1.0)
    x = CV.fit_transform(webshell_files_list).toarray()

    transformer = TfidfTransformer(smooth_idf=False)
    transformer.fit(x)  # only the fitted IDF statistics are reused when scanning below


    g = os.walk(dir)
    for path, d, filelist in g:
        for filename in filelist:
            fulepath=os.path.join(path, filename)
            t = load_file(fulepath)
            t_list=[]
            t_list.append(t)
            x2 = CV.transform(t_list).toarray()
            x2 = transformer.transform(x2).toarray()
            y_pred = clf.predict(x2)
            all+=1
            if filename.endswith('.php'):
                all_php+=1
            if y_pred[0] == 1:
                print "%s is webshell" % fulepath
                webshell+=1

    print "Scan %d files(%d php files),%d files is webshell" %(all,all_php,webshell)
Example #3
def text_sentiment(docs_new):
   docs_new=[docs_new]
   twenty_train= load_files('./Sentiment')  #the complete data is in this directory; like comp.graphics etc
   count_vect = CountVectorizer()
   X_train_counts = count_vect.fit_transform(twenty_train.data)
   tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
   X_train_tf = tf_transformer.transform(X_train_counts)
   tfidf_transformer = TfidfTransformer()
   X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

   # Fit a classifier on the training set
   #clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
   #f = open('my_classifier.pickle', 'wb')
   #pickle.dump(clf, f)
   #f = open('my_classifier.pickle',)
   #clf = pickle.load(f)
   #f.close()
   # save the classifier
   #with open('my_sentiment.pkl', 'wb') as fid:
      #cPickle.dump(clf, fid)    

   # load it again
   with open('my_sentiment.pkl', 'rb') as fid:
      clf = cPickle.load(fid)
   X_new_counts = count_vect.transform(docs_new)
   X_new_tfidf = tfidf_transformer.transform(X_new_counts)

   predicted = clf.predict(X_new_tfidf)
   return twenty_train.target_names[predicted]
Example #4
def handle_doc(word_set,rs_path):
    doc_dir = os.listdir(rs_path)
    doc_matrix = []
    doc_cat = []
    for docs in doc_dir:
        files = os.listdir(rs_path+docs)
        print "start to handle the -->  "+docs
        for file_d in files:
            d_path = rs_path+docs+'/'+file_d
            #get the single file path
            with open(d_path,'rb') as text_file:
                str_tmp = ''
                file_lines = text_file.readlines()
                for line in file_lines:
                    pattern = r'''[a-zA-Z]+'''
                    tokens = nltk.regexp_tokenize(line,pattern)
                    for t in tokens:
                        if t.lower() in word_set:
                            str_tmp += t.lower()
                            str_tmp += ' '
                doc_matrix.append(str_tmp)
                doc_cat.append(cat_dic[docs])
            text_file.close()
    # Append a pseudo-document containing every word in word_set so the vectorizer's
    # vocabulary covers the full word set; this extra row is dropped before returning.
    str_tmp = ''
    for sw in word_set:
        str_tmp += sw
        str_tmp += ' '
    doc_matrix.append(str_tmp)
    doc_cat.append('NAN')
    vectorizer = CountVectorizer()
    doc_num = vectorizer.fit_transform(doc_matrix)
    tfidf = TfidfTransformer()
    doc_tfidf = tfidf.fit_transform(doc_num)
    return doc_tfidf[:-1,:],doc_cat[:-1]
def bayes_tfidf(prefix, sufix, dic_fn):
    """
    prefix example: ./data/single_label_sen/sen_spanish_protest
    sufix example: pop_cat
    """

    train_file = prefix + "_train.txt.tok"
    test_file = prefix + "_test.txt.tok"

    train_y_file = prefix + "_train." + sufix
    test_y_file = prefix + "_test." + sufix
    
    dic_cn = {k.strip(): i for i, k in enumerate(open(dic_fn))}


    word_train_set = [l.strip().lower() for l in open(train_file)]
    word_test_set = [l.strip().lower() for l in open(test_file)]

    train_y = [dic_cn[l.strip()] for l in open(train_y_file)]
    test_y = [dic_cn[l.strip()] for l in open(test_y_file)]

    # construct the word count matrix
    count_vect = CountVectorizer()
    train_set_count = count_vect.fit_transform(word_train_set)
    test_set_count = count_vect.transform(word_test_set)

    # construct tfidf matrix
    tfidf_transformer = TfidfTransformer()
    train_set_x = tfidf_transformer.fit_transform(train_set_count)
    test_set_x = tfidf_transformer.transform(test_set_count)

    print "start the model"
    test_score = bayes_experiment([train_set_x, train_y], [test_set_x, test_y])
    return test_score
Example #6
def get_feature_by_bag_tfidf():
    global white_count
    global black_count
    global max_features
    print "max_features=%d" % max_features
    x=[]
    y=[]

    webshell_files_list = load_files_re(webshell_dir)
    y1=[1]*len(webshell_files_list)
    black_count=len(webshell_files_list)

    wp_files_list =load_files_re(whitefile_dir)
    y2=[0]*len(wp_files_list)

    white_count=len(wp_files_list)


    x=webshell_files_list+wp_files_list
    y=y1+y2

    CV = CountVectorizer(ngram_range=(2, 4), decode_error="ignore",max_features=max_features,
                                       token_pattern = r'\b\w+\b',min_df=1, max_df=1.0)
    x=CV.fit_transform(x).toarray()

    transformer = TfidfTransformer(smooth_idf=False)
    x_tfidf = transformer.fit_transform(x)
    x = x_tfidf.toarray()

    return x,y
Example #7
def race_tfidf(data, can_be_noun_arg, stop_words):
    print 
    data = data.groupby('race')['last']
    data = dict(list(data))
    docs = []
    for k in data:
        docs.append(' '.join(data[k]))
    count_vectorizer = CountVectorizer(stop_words='english')
    counts = count_vectorizer.fit_transform(docs)
    #print counts.todense().shape
    tfidf = TfidfTransformer(norm="l2", sublinear_tf=True)
    tfidf.fit(counts)
    #print "IDF:", tfidf.idf_.shape
    tf_idf_matrix = tfidf.transform(counts)
    freqs = {}
    sorted_voc = sorted(count_vectorizer.vocabulary_.iteritems(), key=operator.itemgetter(1))
    terms,_ = zip(*sorted_voc)
    for i,k in enumerate(data.keys()):
        # make list
        row = np.array(tf_idf_matrix.todense()[i,:])[0].tolist()
        freq = zip(terms, row)
        freqs[k] = sorted(freq, reverse=True, key=lambda x: x[1])
        print freqs[k][:5]
    #print tf_idf_matrix.todense().shape
    return freqs
def work_with_simple_bag_of_words():
    count = CountVectorizer()
    docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining and the weather is sweet',
    ])
    bag = count.fit_transform(docs)
    print(count.vocabulary_)
    print(bag.toarray())

    np.set_printoptions(precision=2)
    tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
    print(tfidf.fit_transform(bag).toarray())

    # Manually recompute the smoothed tf-idf of the term 'is' in the third document:
    # it occurs twice there (tf = 2) and appears in all 3 documents (df = 3), so
    # idf = ln((1 + n_docs) / (1 + df)) = ln(4 / 4) = 0 and tf-idf = tf * (idf + 1) = 2.
    tf_is = 2
    n_docs = 3
    idf_is = np.log((n_docs+1) / (3+1))
    tfidf_is = tf_is * (idf_is + 1)
    print("tf-idf of term 'is' = %.2f" % tfidf_is)

    tfidf = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)
    raw_tfidf = tfidf.fit_transform(bag).toarray()[-1]
    print(raw_tfidf)

    l2_tfidf = raw_tfidf / np.sqrt(np.sum(raw_tfidf**2))
    print(l2_tfidf)
Example #9
 def make_training_xy(self, data):
     X = self.vectorizer.fit_transform(data.body)
     if self.tfidf:
         X = TfidfTransformer().fit_transform(X)
     X = X.tocsc()
     Y = normalize_scores(data.net, data.subreddit[0])
     return X,Y
Example #10
def test_tfidf_no_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # the lack of smoothing makes IDF fragile in the presence of features with
    # only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')

    clean_warning_registry()
    with warnings.catch_warnings(record=True) as w:
        1. / np.array([0.])
        numpy_provides_div0_warning = len(w) == 1

    in_warning_message = 'divide by zero'
    tfidf = assert_warns_message(RuntimeWarning, in_warning_message,
                                 tr.fit_transform, X).toarray()
    if not numpy_provides_div0_warning:
        raise SkipTest("Numpy does not provide div 0 warnings.")
Example #11
File: tfidf.py  Project: zhouleian/jieba
def tfidf(fileList):
    segPath = sys.path[0] + '/seg_result'
    corpus = []  # holds the word-segmentation result of each document
    for eachFile in fileList:
        fileName = segPath + '/' + eachFile
        f = open(fileName,'r+')
        content = f.read()
        corpus.append(content)
    vectorizer = CountVectorizer()  # converts the documents into a term-frequency matrix; element a[i][j] is the count of word j in document i
    transformer = TfidfTransformer()  # computes the tf-idf weight of every term
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))  # the inner fit_transform builds the count matrix, the outer one computes tf-idf
    word = vectorizer.get_feature_names()  # all terms in the bag-of-words vocabulary
    weight = tfidf.toarray()  # dense tf-idf matrix; a[i][j] is the tf-idf weight of word j in document i
    # create the tfidf folder and store the tf-idf results there
    tfidfFilePath = os.getcwd() + '/tfidfFile'
    if not os.path.exists(tfidfFilePath):
        os.mkdir(tfidfFilePath)
    for i in range(len(weight)):
        print u"--------Writing all the tf-idf in the", i, u" file into ", tfidfFilePath + '/' + str(i) + '.txt', "--------"
        name = tfidfFilePath + '/' + string.zfill(i, 5) + '.txt'
        f = open(name,'w+')
        for j in range(len(word)):
            #f.write(word[j] + "    " + str(weight[i][j]) + "\n")
            #f.write(str(weight[i][j]) + "\n")
            f.write(word[j] + "\n")
        f.close()
Example #12
def test_pickling_transformer():
    X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
    orig = TfidfTransformer().fit(X)
    s = pickle.dumps(orig)
    copy = pickle.loads(s)
    assert_equal(type(copy), orig.__class__)
    assert_array_equal(copy.fit_transform(X).toarray(), orig.fit_transform(X).toarray())
Example #13
def test_tfidf_no_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # the lack of smoothing makes IDF fragile in the presence of features with
    # only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')

    with warnings.catch_warnings(record=True) as w:
        tfidf = tr.fit_transform(X).toarray()
        assert_equal(len(w), 1)
        # For Python 3 compatibility
        if hasattr(w[0].message, 'args'):
            assert_true("divide by zero" in
                        w[0].message.args[0])
        else:
            assert_true("divide by zero" in
                        w[0].message)
Example #14
class VectorModel(object):
    
    def __init__(self , list_of_comments=None):
        self.__list_of_comments = list_of_comments
        self.__vectorizer = []
        self.__corpus_simple_vector = []
        self.__transformer = []
        self.__corpus_tf_idf = []
        #self.prepare_models()
    
    def prepare_models(self):
        self.__vectorizer = CountVectorizer()
        vector = self.__vectorizer.fit_transform(self.__list_of_comments)
        self.__corpus_simple_vector = vector.toarray()
        self.__transformer = TfidfTransformer()
        tfidf = self.__transformer.fit_transform(self.__corpus_simple_vector)
        self.__corpus_tf_idf = tfidf.toarray()
        return [self.__vectorizer , self.__corpus_simple_vector , self.__transformer , self.__corpus_tf_idf]
    
    def set_models(self , vectorizer , transformer):    
        self.__vectorizer = vectorizer
        self.__transformer = transformer
        
    
    def get_comment_frequency_vector(self , comments):
        vec_comments = []
        for i in comments:
            vec_comments.append(i)
        vectores = self.__vectorizer.transform(vec_comments).toarray()
        return vectores
    
    def get_comment_tf_idf_vector(self , comments):
        vector = self.get_comment_frequency_vector(comments)
        result = self.__transformer.transform(vector).toarray()
        return result
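
A brief usage sketch for VectorModel with toy comments (illustrative only; not from the original project):

comments = ["great product, works well", "terrible, broke after a day", "works as described"]
model = VectorModel(list_of_comments=comments)
vectorizer, corpus_vectors, transformer, corpus_tfidf = model.prepare_models()

# vectorize unseen comments with the already-fitted models
new_tfidf = model.get_comment_tf_idf_vector(["works great"])
print(new_tfidf.shape)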
Example #15
class UnitClassifier(Trainer):
    def __init__(self, x, y, train_ratio):
        super(UnitClassifier, self).__init__(x, y, train_ratio)
        self._count_vec = CountVectorizer()
        self._tfidf_transformer = TfidfTransformer()

    def Fit(self):
        x_count = self._count_vec.fit_transform(self._x_train)
        self._tfidf_transformer.fit(x_count)

    def Preprocess(self, x):
        return self._tfidf_transformer.transform(self._count_vec.transform(x))

    def Learn(self, x_train, y_train):
        LOG.info('x_train.shape = %s', str(x_train.shape))
        LOG.info('len(y_train) = %d', len(y_train))

        clf = RandomForestClassifier(verbose=0, n_jobs=-1, n_estimators=20)
        LOG.info('Training...')
        clf.fit(x_train, y_train)
        LOG.info('Done...')
        return clf

    def Eval(self):
        LOG.info('Eval ...')
        y_pred = self.Predict(self._x_test)
        return {
            'misclass': np.mean(y_pred != self._y_test),
            'report': classification_report(self._y_test, y_pred,
                                            target_names=self._model.classes_)
        }
Example #16
class CaloriesRegressor(Trainer):
    def __init__(self, x, y, train_ratio):
        super(CaloriesRegressor, self).__init__(x, y, train_ratio)
        self._count_vec = CountVectorizer()
        self._tfidf_transformer = TfidfTransformer()

    def Fit(self):
        x_count = self._count_vec.fit_transform(self._x_train)
        self._tfidf_transformer.fit(x_count)

    def Preprocess(self, x):
        return self._tfidf_transformer.transform(self._count_vec.transform(x))

    def Learn(self, x_train, y_train):
        LOG.info('x_train.shape = %s', str(x_train.shape))
        LOG.info('len(y_train) = %d', len(y_train))

        clf = RandomForestRegressor(verbose=0, n_jobs=-1, n_estimators=100)
        LOG.info('Training...')
        clf.fit(x_train, y_train)
        LOG.info('Done...')
        return clf

    def Eval(self):
        LOG.info('Eval ...')
        y_pred = self.Predict(self._x_test)
        return {
            'median_absolute_error':
            median_absolute_error(self._y_test, y_pred),
            'mean_squared_error': mean_squared_error(self._y_test, y_pred),
            'explained_variance_score':
            explained_variance_score(self._y_test, y_pred),
        }
Example #17
def load_dataset(prefix, sufix, dic_fn, vocab_fn='./data/english_review.trn-100000.vocab'):
    train_file = prefix + "_train.txt.tok"
    test_file = prefix + "_test.txt.tok"

    train_y_file = prefix + "_train." + sufix
    test_y_file = prefix + "_test." + sufix

    dic_cn = {k.strip(): i for i, k in enumerate(open(dic_fn))}
    word_train_set = [l.strip().lower() for l in open(train_file)]
    word_test_set = [l.strip().lower() for l in open(test_file)]

    train_y = [dic_cn[l.strip()] for l in open(train_y_file)]
    test_y = [dic_cn[l.strip()] for l in open(test_y_file)]
    
    vocab = [l.strip().lower().split("\t")[0] for l in open(vocab_fn)]
    count_vect = CountVectorizer(vocabulary=vocab)
    train_set_count = count_vect.fit_transform(word_train_set)
    test_set_count = count_vect.transform(word_test_set)
    tfidf_transformer = TfidfTransformer()
    train_set_x = tfidf_transformer.fit_transform(train_set_count).toarray()
    test_set_x = tfidf_transformer.transform(test_set_count).toarray()

    train_shared_x, train_shared_y = shared_dataset([train_set_x, train_y]) 
    test_shared_x, test_shared_y = shared_dataset([test_set_x, test_y]) 
    return [(train_shared_x, train_shared_y), (test_shared_x, test_shared_y)]
Example #18
def tfidf_score(train_set, test_set):

    stopwords = nltk.corpus.stopwords.words('english')
    vectorizer = TfidfVectorizer(min_df=1, stop_words=set(stopwords))
    #Remove all the None Types from the input datasets
    train_set = filter(None, train_set)
    test_set = filter(None, test_set)
    vectorizer.fit_transform(train_set)
    #print "Word Index is {0} \n".format(vectorizer.vocabulary_)
    smatrix = vectorizer.transform(test_set)
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(smatrix)
    #print "IDF scores:", tfidf.idf_
    tf_idf_matrix = tfidf.transform(smatrix)
    pairwise_similarity = tf_idf_matrix * tf_idf_matrix.T
    msum = tf_idf_matrix.sum(axis=1)
    cos_sum = pairwise_similarity.sum(axis=1)
    mlist = msum.tolist()
    cos_sim = cos_sum.tolist()
    count = 0
    tfidfscores = {}
    for s in train_set:
        tfidfscores[s] = []
        tfidfscores[s].append(mlist[count][0])
        tfidfscores[s].append(cos_sim[count][0])
        count += 1
    return tfidfscores
Example #19
 def getContextFeature(self):
     import time
     print 'start to get Context Feature'
     start = time.time()
     
     from sklearn.feature_extraction.text import TfidfTransformer
     from sklearn.feature_extraction.text import CountVectorizer
     # for a large corpus, pass an iterator rather than a full in-memory list
     corpus = self.getIterText()
     # turn the text into a word-frequency matrix
     vectorizer = CountVectorizer()
     transformer = TfidfTransformer()
     tfidf=transformer.fit_transform(vectorizer.fit_transform(corpus))
     
     print 'get word'
     word=vectorizer.get_feature_names()
     print 'get weight'
     weight=tfidf
     
     print 'weight type:', type(weight)
     #print weight
     end = time.time()
     
     print 'total time: \t', end-start
     return weight,word
def getTfidfData(dataTrain, dataTest, dataHold):
    print dataTrain.target_names
    
    count_vect = CountVectorizer(strip_accents='ascii', stop_words='english', max_features=len(dataTrain.target) * 2)
    tfidf_transformer = TfidfTransformer(sublinear_tf=True)
    X_counts = count_vect.fit_transform(dataTrain.data)
    X_tfidf = tfidf_transformer.fit_transform(X_counts)
    print X_tfidf.shape
    
    Y_counts = count_vect.transform(dataTest.data)
    Y_tfidf = tfidf_transformer.transform(Y_counts)
    print Y_tfidf.shape
    
    H_counts = count_vect.transform(dataHold.data)
    H_tfidf = tfidf_transformer.transform(H_counts)
    
    print 'feature selection using chi square test', len(dataTrain.target)
    feature_names = count_vect.get_feature_names()
    
    ch2 = SelectKBest(chi2, k='all')
    X_tfidf = ch2.fit_transform(X_tfidf, dataTrain.target)
    Y_tfidf = ch2.transform(Y_tfidf)
    H_tfidf = ch2.transform(H_tfidf)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
        
    if feature_names:
        feature_names = numpy.asarray(feature_names)
        print 'important features'
        print feature_names[:10]
    return X_tfidf, Y_tfidf, H_tfidf
Example #21
def tf_idf(seg_files):
    seg_path = './segfile/'
    corpus = []
    for file in seg_files:
        fname = seg_path + file
        f = open(fname, 'r+')
        content = f.read()
        f.close()
        corpus.append(content)

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    word = vectorizer.get_feature_names()
    weight = tfidf.toarray()

    save_path = './tfidffile'
    if not os.path.exists(save_path):
        os.mkdir(save_path)

    for i in range(len(weight)):
        print('-------- Writing the tf-idf of document', i, 'into', save_path + '/' + string.zfill(i, 5) + '.txt', '--------')
        f = open(save_path + '/' + string.zfill(i, 5) + '.txt', 'w+')
        for j in range(len(word)):
            f.write(word[j] + ' ' + str(weight[i][j]) + '\r\n')
        f.close()
Example #22
def test_classifiers():
    print "running bayes classifier.."
    # train_bayesian_classifier_from_scratch()

    dataset = get_thing_from_file("training_dataset.txt")
    print dataset.target_names
    bayes = get_thing_from_file("bayes.txt")
    bayes_model = bayes.fit(dataset.data, dataset.target)
    bayes_model = get_thing_from_file("bayes_model.txt")

    results = []
    count = 0
    url_arr = []

    bayes_predicted = bayes_model.predict(dataset.data)

    # for url in get_test_articles():
    #     url_arr.append(url)

    article_arr = get_article_array(url_arr)

    docs_new = ['God is love', 'OpenGL on the GPU is fast']

    # the vectorizer and transformer must be fitted before they can transform new documents
    count_vect = CountVectorizer()
    tfidf_trans = TfidfTransformer()
    tfidf_trans.fit(count_vect.fit_transform(dataset.data))
    x_new_counts = count_vect.transform(docs_new)
    x_new_horse = tfidf_trans.transform(x_new_counts)

    predicted = bayes_model.predict(x_new_horse)

    for doc, category in zip(docs_new, predicted):
        print('%r => %s' % (doc, dataset.target_names[category]))
Example #23
def tfidf(corpus, word_category, file_to_write):
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    weight = tfidf.toarray()
    sum_weight = np.sum(weight, axis=0)
    word = vectorizer.get_feature_names()
    word_and_weight = []
    for i in range(len(sum_weight)):
        word_and_weight.append([word[i], sum_weight[i]])
    word_and_weight.sort(key=lambda key: key[1], reverse=True)
    f = open(file_to_write, "w+")
    result = []
    for j in range(len(word_and_weight)):
        try:
            f.write(
                word_and_weight[j][0]
                + " "
                + str(word_and_weight[j][1])
                + " "
                + word_category[word_and_weight[j][0]]
                + "\n"
            )
            result.append([word_and_weight[j][0], word_and_weight[j][1], word_category[word_and_weight[j][0]]])
        except:
            continue
    f.close()
    return result
def extract_text_features(train_data, test_data):
    """
    Returns one type of training and test data features:
        1) Term Frequency times Inverse Document Frequency (tf-idf): X_train_tfidf, X_test_tfidf

    Parameters
    ----------
    train_data : List[str]
        Training data in list. Will only take 30000 reviews for efficiency purposes
    test_data : List[str]
        Test data in list

    Returns
    -------
    Tuple(scipy.sparse.csr.csr_matrix,.., list)
        Returns X_train_tfidf, X_test_tfidf, vocab as a tuple.
    """
    
    # set up a count vectorizer that removes english stopwords when building a term-doc matrix
    count_vect = CountVectorizer(stop_words=set(stopwords.words('english')))
    # build the term frequency per document matrix from a random sublist of 30,000 documents
    train_counts = count_vect.fit_transform(random.sample(train_data, 30000))
    test_counts = count_vect.transform(test_data)
    tfidf_transformer = TfidfTransformer()

    train_tfidf = tfidf_transformer.fit_transform(train_counts)
    test_tfidf = tfidf_transformer.transform(test_counts)
    
    vocab = count_vect.get_feature_names()
    
    return (train_tfidf, test_tfidf, vocab)
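
An illustrative call for the helper above (hedged: train_reviews and test_reviews are hypothetical lists of review strings, and train_reviews must hold at least 30,000 items because the helper samples 30,000 documents internally):

train_tfidf, test_tfidf, vocab = extract_text_features(train_reviews, test_reviews)
print(train_tfidf.shape, test_tfidf.shape, len(vocab))
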
def cal_product_description_tfidf():
    #PART II compute the tf-idf for product description
    global AllSet  # the merge below rebinds AllSet, so it must be declared global
    print "\nBegin computing the tf-idf for product description ..."
    product_description_data = pd.read_csv('product_descriptions.csv')

    print "\nMerge the product description into the dataset..."
    AllSet = pd.merge(AllSet, product_description_data, how='left', on='product_uid')

    print "\nStemming the product description ..."
    AllSet['product_description'] = AllSet['product_description'].map(lambda x: stem_process(x))
    product_description=AllSet['product_description']

    print "\nGet the (product description vocabulary)-(search term) frequency matrix..."
    search_vect_descrip = CountVectorizer(stop_words='english', binary=True)# use binary value to indicate the frequency
    search_vect_descrip.fit(product_description)#learn the vocabulary
    search_descrip_fq_matrix = search_vect_descrip.transform(search_term) #get the (product description vocabulary)-(search term) frequency matrix

    print "\nGet the (product description vocabulary)-(product_description) frequency matrix..."
    description_vect = CountVectorizer(stop_words ='english')
    description_vect.fit_transform(product_description)#learn the vocabulary
    description_fq_matrix=description_vect.transform(product_description) #get the (product description vocabulary)-(product_description) frequency matrix

    print "\nGet the idf matrix..."
    tfidf_transformer = TfidfTransformer(norm="l2",smooth_idf=True)
    tfidf_transformer.fit(description_fq_matrix) # get idf for each vocabulary
    tf_idf_descrip_matrix  = tfidf_transformer.transform(description_fq_matrix) #get the idf matrix


    print "\nCompute the result of tf-idf for product description ..."
    tf_idf_descrip_result=[]#compute the result of tf-idf for product title
    for index in range(tf_idf_descrip_matrix.shape[0]):
        tf_idf_descrip_result.append((np.multiply(tf_idf_descrip_matrix[index], search_descrip_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id":AllSet['id'],"product_description_tfidf": tf_idf_descrip_result}).to_csv('product_description_tfidf.csv', index=False)
Example #26
def LR_modeling(file_name, k, AUC=True, weight=False):
    raw_data = pd.read_csv(file_name)
    raw_data = raw_data.drop(['issue', 'field'], axis=1)
    X = raw_data.drop('panelvote', axis=1)
    y = raw_data['panelvote']
    tfidf = TfidfTransformer(norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
    X = tfidf.fit_transform(X.values)
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.8, random_state=42)
    lr = LogisticRegression(C=1)
    lr.fit(X_train, y_train)
    auc = np.mean(cross_validation.cross_val_score(lr, X, y, scoring="roc_auc"))
    if AUC == True:
        print "AUC for %s on the test data = %.3f" % (file_name, auc)
    if weight == False:
        top_positive, top_negative = get_top_k_nocoeff(lr.coef_[0], k)
        return raw_data.columns[top_positive], raw_data.columns[top_negative]
    else:
        top_positive, top_negative = get_top_k(lr.coef_[0], k)
        final_pos = {}
        final_neg = {}
        for i in top_positive.keys():
            final_pos[raw_data.columns[i]] = top_positive[i]
        for j in top_negative.keys():
            final_neg[raw_data.columns[j]] = top_negative[j]
        pos = sorted(final_pos.items(), key=operator.itemgetter(1), reverse=True)
        neg = sorted(final_neg.items(), key=operator.itemgetter(1))
        return pos, neg
def cal_product_title_tfidf():

    #PART I compute the tf-idf for product title
    print "\nBegins,compute the tf-idf for product title ..."


    print "\nStemming product_title..."
    AllSet['product_title'] = AllSet['product_title'].map(lambda x : stem_process(x))
    product_title = AllSet['product_title']

    print "\nGet the (product title vocabulary)-(search term) frequency matrix..."
    search_vect_tittle = CountVectorizer(stop_words='english', binary=True)# use binary value to indicate the frequency
    search_vect_tittle.fit(product_title)#learn the vocabulary
    search_tittle_fq_matrix = search_vect_tittle.transform(search_term) #get the (product title vocabulary)-(search term) frequency matrix

    print "\nGet the (product title vocabulary)-(product_title) frequency matrix"
    title_vect = CountVectorizer(stop_words='english')
    title_vect.fit_transform(product_title)#learn the vocabulary
    title_fq_matrix = title_vect.transform(product_title) #get the (product title vocabulary)-(product_title) frequency matrix

    print "\nGet the idf matrix"
    tfidf_transformer = TfidfTransformer(norm="l2", smooth_idf=True)
    tfidf_transformer.fit(title_fq_matrix) # get idf for each vocabulary
    tf_idf_title_matrix = tfidf_transformer.transform(title_fq_matrix) #get the idf matrix

    print "\nCompute the result of tf-idf for product title ..."
    tf_idf_title_result = [] #compute the result of tf-idf for product title
    for index in range(tf_idf_title_matrix.shape[0]):
        tf_idf_title_result.append((np.multiply(tf_idf_title_matrix[index], search_tittle_fq_matrix[index].transpose()))[0, 0])

    pd.DataFrame({"id": AllSet['id'],"product_title_tfidf": tf_idf_title_result}).to_csv('product_title_tfidf.csv', index=False)

    return 0
Example #28
    def __init__(self, **kwargs):
        pickle_loader = PickleLoader()
        saved_pickles_path = kwargs.get("saved_pickles_path")
        self.pickle_paths = PicklePaths(parent_dir=saved_pickles_path)

        if self.pickle_paths.pickles_exist():
            self.title_count_vectorizer = pickle_loader.load_pickle(
                self.pickle_paths.TITLE_COUNT_VECTORIZER_PICKLE_PATH
            )
            self.title_tfidf_transformer = pickle_loader.load_pickle(
                self.pickle_paths.TITLE_TFIDF_TRANSFORMER_PICKLE_PATH
            )
            self.title_classifier = pickle_loader.load_pickle(self.pickle_paths.TITLE_CLASSIFIER_PICKLE_PATH)
            self.desc_count_vectorizer = pickle_loader.load_pickle(self.pickle_paths.DESC_COUNT_VECTORIZER_PICKLE_PATH)
            self.desc_tfidf_transformer = pickle_loader.load_pickle(
                self.pickle_paths.DESC_TFIDF_TRANSFORMER_PICKLE_PATH
            )
            self.desc_classifier = pickle_loader.load_pickle(self.pickle_paths.DESC_CLASSIFIER_PICKLE_PATH)
        else:
            # todo: don't forget lemmatization, stopwords. etc
            self.title_count_vectorizer = CountVectorizer()
            self.title_tfidf_transformer = TfidfTransformer()
            self.desc_count_vectorizer = CountVectorizer()
            self.desc_tfidf_transformer = TfidfTransformer()
            self.title_classifier = None
            self.desc_classifier = None
Example #29
def estimation(file='song_text.txt', separator=u'--text--'):
    arr = text_split_line(file, u'--text--')
    dvect = data_vector(arr)
    target = dvect[0]
    text = dvect[1]
    dic = dvect[2]      # for converting target integer to artist name
#    print (target)
#    print (dic)
    count_vect = CountVectorizer()
    word_vect = count_vect.fit_transform(text)
    tfidf_transformer = TfidfTransformer()
    vect_tfidf = tfidf_transformer.fit_transform(word_vect)
    machine = svm.SVC(probability=True) # one of the best for text, see tutorial working with text
    machine.fit(vect_tfidf, target)
    print (machine.score(vect_tfidf, target))
    prediction = machine.predict(vect_tfidf)        # accuracy test (tutorial)
    print (u'model predictive accuracy:  {:.1%}'
           .format(np.mean(prediction == target)))
    new_texts = [text[500], text[2345], text[-2], text[0], text[5893]]
    new_data = count_vect.transform(new_texts)
    new_tfidf = tfidf_transformer.transform(new_data)
    prediction = machine.predict(new_tfidf)
    for i in range(len(new_texts)):
        print (u'{}\t=> {}'.format(new_texts[i].splitlines()[:2],
                                  dic[prediction[i]]))
    return
Example #30
    def use_pipeline_with_fs(self):

        #####################
        #Build a vectorizer / classifier pipeline that filters out tokens that are too rare or too frequent
        #####################

        pipeline = Pipeline([
            ('vect',
             TfidfVectorizer(stop_words=stopwords, min_df=3, max_df=0.90)),
            ("selector", SelectPercentile()),
            ('clf', RandomForestClassifier()),
        ])

        # Build a grid search to find the best parameter
        # Fit the pipeline on the training set using grid search for the parameters
        parameters = {
            'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
            'vect__use_idf': (True, False),
            'clf__n_estimators': (10, 50, 100),
            'clf__criterion': ("gini", "entropy"),
            'clf__max_depth': (None, 2, 4),
            'clf__min_samples_split': (2, 4, 6),
            'selector__score_func': (chi2, f_classif),
            'selector__percentile': (85, 95, 100),
        }

        #################
        # Exhaustive search over specified parameter values for an estimator, use cv to generate data to be used
        # implements the usual estimator API: when “fitting” it on a dataset all the possible combinations of parameter values are evaluated and the best combination is retained.
        #################

        cv = StratifiedShuffleSplit(y_train,
                                    n_iter=5,
                                    test_size=0.2,
                                    random_state=42)
        grid_search = GridSearchCV(pipeline,
                                   param_grid=parameters,
                                   cv=cv,
                                   n_jobs=-1)
        clf_gs = grid_search.fit(docs_train, y_train)

        ###############
        # print the cross-validated scores for each parameter set explored by the grid search
        ###############

        best_parameters, score, _ = max(clf_gs.grid_scores_,
                                        key=lambda x: x[1])
        for param_name in sorted(parameters.keys()):
            print("%s: %r" % (param_name, best_parameters[param_name]))

        print("Score for gridsearch is %0.2f" % score)

        #y_predicted = clf_gs.predict(docs_test)

        ###############
        # run the classifier again with the best parameters
        # in order to get 'clf' for get_important_feature function!
        ###############

        ngram_range = best_parameters['vect__ngram_range']
        use_idf = best_parameters['vect__use_idf']
        score_func = best_parameters['selector__score_func']
        percentile = best_parameters['selector__percentile']

        # vectorisation

        count_vect = CountVectorizer(stop_words=stopwords,
                                     min_df=3,
                                     max_df=0.90,
                                     ngram_range=ngram_range)
        X_CV = count_vect.fit_transform(docs_train)

        # print number of unique words (n_features)
        print("Shape of train data is " + str(X_CV.shape))

        # tfidf transformation

        tfidf_transformer = TfidfTransformer(use_idf=use_idf)
        X_tfidf = tfidf_transformer.fit_transform(X_CV)

        #################
        # feature selection
        #################

        selector = SelectPercentile(score_func=score_func,
                                    percentile=percentile)

        combined_features = Pipeline([("vect", count_vect),
                                      ("tfidf", tfidf_transformer),
                                      ("feat_select", selector)])

        X_features = combined_features.fit_transform(docs_train, y_train)
        X_test_features = combined_features.transform(docs_test)

        print("Shape of train data after feature selection is " +
              str(X_features.shape))
        print("Shape of test data after feature selection is " +
              str(X_test_features.shape))

        # run classifier on selected features

        clf = RandomForestClassifier().fit(X_features, y_train)

        # get the features which are selected and write to file

        feature_boolean = selector.get_support(indices=False)

        f = open(path_to_store_feature_selection_boolean_file, 'w')

        for fb in feature_boolean:
            f.write(str(fb) + '\n')

        f.close()

        ##################
        # get cross validation score
        ##################

        scores = cross_val_score(clf,
                                 X_features,
                                 y_train,
                                 cv=10,
                                 scoring='f1_weighted')
        print("Cross validation score: " + str(scores))

        # Get average performance of classifier on training data using 10-fold CV, along with standard deviation

        print("Cross validation accuracy: %0.2f (+/- %0.2f)" %
              (scores.mean(), scores.std() * 2))

        #################
        # run classifier on test data
        #################

        y_predicted = clf.predict(X_test_features)

        # print the mean accuracy on the given test data and labels

        print("Classifier score on test data is: %0.2f " %
              clf.score(X_test_features, y_test))

        # Print and plot the confusion matrix

        print(metrics.classification_report(y_test, y_predicted))
        cm = metrics.confusion_matrix(y_test, y_predicted)
        print(cm)

        # import matplotlib.pyplot as plt
        # plt.matshow(cm)
        # plt.show()

        return clf, count_vect
Example #31

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)


def LemNormalize(text):
    return LemTokens(
        nltk.word_tokenize(text.lower().translate(remove_punct_dict)))


LemVectorizer = CountVectorizer(tokenizer=LemNormalize, stop_words='english')
LemVectorizer.fit_transform(array)

print(LemVectorizer.vocabulary_)
tf_matrix = LemVectorizer.transform(array).toarray()
tfidfTran = TfidfTransformer(norm="l2")
tfidfTran.fit(tf_matrix)
print tfidfTran.idf_
tfidf_matrix = tfidfTran.transform(tf_matrix)
print("")
print(
    "*************** printing tfidf_martrix ****************************************"
)
print("")
print tfidf_matrix.toarray()
cos_similarity_matrix = tfidf_matrix * tfidf_matrix.T
print("")
print(
    "*************** printing  cos_similarity_matrix *******************************"
)
print("")
Example #32
    def train_classifier(self):

        # Get list of features
        count_vect = CountVectorizer(stop_words=stopwords,
                                     min_df=3,
                                     max_df=0.90,
                                     ngram_range=_ngram_range)
        X_CV = count_vect.fit_transform(docs_train)

        # print number of unique words (n_features)
        print("Shape of train data is " + str(X_CV.shape))

        # tfidf transformation###

        tfidf_transformer = TfidfTransformer(use_idf=_use_idf)
        X_tfidf = tfidf_transformer.fit_transform(X_CV)

        # train the classifier

        print("Fitting data ...")
        clf = RandomForestClassifier(n_estimators=_n_estimators,
                                     criterion=_criterion,
                                     max_depth=_max_depth,
                                     min_samples_split=_min_samples_split).fit(
                                         X_tfidf, y_train)

        ##################
        # get cross validation score
        ##################

        scores = cross_val_score(clf,
                                 X_tfidf,
                                 y_train,
                                 cv=10,
                                 scoring='f1_weighted')
        print("Cross validation score: " + str(scores))

        # Get average performance of classifier on training data using 10-fold CV, along with standard deviation

        print("Cross validation accuracy: %0.2f (+/- %0.2f)" %
              (scores.mean(), scores.std() * 2))

        ##################
        # run classifier on test data
        ##################

        X_test_CV = count_vect.transform(docs_test)

        print("Shape of test data is " + str(X_test_CV.shape))

        X_test_tfidf = tfidf_transformer.transform(X_test_CV)

        y_predicted = clf.predict(X_test_tfidf)

        # print the mean accuracy on the given test data and labels

        print("Classifier score on test data is: %0.2f " %
              clf.score(X_test_tfidf, y_test))

        print(metrics.classification_report(y_test, y_predicted))
        cm = metrics.confusion_matrix(y_test, y_predicted)
        print(cm)

        return clf, count_vect
Example #33
def make_reuters_data(data_dir):
    np.random.seed(1234)
    from sklearn.feature_extraction.text import CountVectorizer
    from os.path import join
    did_to_cat = {}
    cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
    with open(join(data_dir, 'rcv1-v2.topics.qrels')) as fin:
        for line in fin.readlines():
            line = line.strip().split(' ')
            cat = line[0]
            did = int(line[1])
            if cat in cat_list:
                did_to_cat[did] = did_to_cat.get(did, []) + [cat]
        for did in list(did_to_cat.keys()):
            if len(did_to_cat[did]) > 1:
                del did_to_cat[did]

    dat_list = [
        'lyrl2004_tokens_test_pt0.dat', 'lyrl2004_tokens_test_pt1.dat',
        'lyrl2004_tokens_test_pt2.dat', 'lyrl2004_tokens_test_pt3.dat',
        'lyrl2004_tokens_train.dat'
    ]
    data = []
    target = []
    cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}
    del did  # clear the binding left over from the loop above so the "'did' in locals()" check starts fresh
    for dat in dat_list:
        with open(join(data_dir, dat)) as fin:
            for line in fin.readlines():
                if line.startswith('.I'):
                    if 'did' in locals():
                        assert doc != ''
                        if did in did_to_cat:
                            data.append(doc)
                            target.append(cat_to_cid[did_to_cat[did][0]])
                    did = int(line.strip().split(' ')[1])
                    doc = ''
                elif line.startswith('.W'):
                    assert doc == ''
                else:
                    doc += line

    assert len(data) == len(did_to_cat)

    x = CountVectorizer(dtype=np.float64,
                        max_features=2000).fit_transform(data)
    y = np.asarray(target)

    from sklearn.feature_extraction.text import TfidfTransformer
    x = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(x)
    x = x[:10000]
    y = y[:10000]
    x = np.asarray(x.todense()) * np.sqrt(x.shape[1])
    print('todense succeed')

    p = np.random.permutation(x.shape[0])
    x = x[p]
    y = y[p]
    print('permutation finished')

    assert x.shape[0] == y.shape[0]
    x = x.reshape((x.shape[0], x.size // x.shape[0]))  # integer division keeps the shape integral
    np.save(join(data_dir, 'reutersidf10k.npy'), {'data': x, 'label': y})
Example #34
def loaddata(data):
    data1 = jieba.cut(data)
    data11 = ""
    for item in data1:
        data11 += item + " "
    return data11


dataf = pda.read_sql(sql, conn)
dataf2 = dataf.T
title = dataf2.values[0]
content = dataf2.values[1]
train_text = []
for i in content:
    thisdata = loaddata(str(i))
    pat1 = "<[^>]*?>"
    thisdata = re.sub(pat1, "", thisdata)
    thisdata = thisdata.replace("\n", "").replace("\t", "")
    train_text.append(thisdata)
count_vect = CountVectorizer()
train_x_counts = count_vect.fit_transform(train_text)
# tf-idf model
from sklearn.feature_extraction.text import TfidfTransformer
tf_ts = TfidfTransformer(use_idf=True).fit(train_x_counts)
train_x_tf = tf_ts.transform(train_x_counts)
from sklearn.cluster import KMeans
kms = KMeans(n_clusters=3)
y = kms.fit_predict(train_x_tf)
dataf["type"] = y
dataf.to_csv("./01聚类归类法结果.csv")
Example #35
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import LabelPowerset, ClassifierChain
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import svm

# LabelPowerset allows for multi-label classification
# Build a pipeline for multinomial naive bayes classification
text_clf = Pipeline([
    ('vect',
     CountVectorizer(stop_words="english", lowercase=True,
                     ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', LabelPowerset(svm.LinearSVC())),
])

voting_clf = Pipeline([
    ('vect',
     CountVectorizer(stop_words="english", lowercase=True,
                     ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', ClassifierChain(svm.LinearSVC())),
])

text_clf = text_clf.fit(X_train, y_train)

voting_clf = voting_clf.fit(X_train, y_train)
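
A hedged follow-up sketch (assumes X_test holds raw documents and y_test a binary label-indicator matrix, matching the training data above):

from sklearn.metrics import accuracy_score

y_pred = text_clf.predict(X_test)
print("LabelPowerset subset accuracy:", accuracy_score(y_test, y_pred))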
Example #36
              HashingVectorizer(ngram_range=(1, 3),
                                n_features=2**27,
                                dtype=np.float32,
                                norm='l2',
                                lowercase=False,
                                stop_words=stopwords)),
             ('drop_cols', DropColumnsByDf(min_df=2)),
         ]))
    ],
                              n_jobs=1)

    sparse_merge = vectorizer.fit_transform(merge)
    print(f'[{time() - start_time}] Merge vectorized')
    print(sparse_merge.shape)

    tfidf_transformer = TfidfTransformer()

    X = tfidf_transformer.fit_transform(sparse_merge)
    print(f'[{time() - start_time}] TF/IDF completed')

    X_train = X[:nrow_train]
    print(X_train.shape)

    X_test = X[nrow_train:]
    del merge
    del sparse_merge
    del vectorizer
    del tfidf_transformer
    gc.collect()

    X_train, X_test = intersect_drop_columns(X_train, X_test, min_df=1)
Example #37
y = dataemp["label"]

# In[ ]:

# Vectorizing the text data

# In[40]:

cv = CountVectorizer()
X = cv.fit_transform(X)

# Feeding the output of vectors into TFIDF transformer

# In[41]:

tfidf = TfidfTransformer()
tfidf.fit_transform(X)

# Splitting the training and test data in the ratio of 70% training data and 30% test data

# In[42]:

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=101)

# Naive Bayes Classification model

# In[43]:
Example #38
print len(sent_list_old)
print sent_list_old

print sent_list_old[3]
for sentence in sent_list_old:
    sent_list.append(clean_str(sentence))

print sent_list[3]

print len(sent_list)

vectorizer = CountVectorizer(min_df=1,
                             stop_words='english',
                             strip_accents='ascii')
count_vectorizer = vectorizer.fit_transform(sent_list)
transformer = TfidfTransformer(smooth_idf=True)
tfidf = transformer.fit_transform(count_vectorizer)
print count_vectorizer.shape

mat = tfidf.toarray()

km = SphericalKMeans(10)
clusters = km.fit(mat)
centroids = km.cluster_centers_
labels = km.labels_
summa_index = []
for c in range(10):
    simi_list = []
    max_simi = -1 * float("inf")
    max_idx = -1
    centroid = centroids[c, :]
Example #39
@file: tf-idf-liiuxuejiang.py
@time: 18-3-28 6:43 PM
@contact: [email protected]
'''
import jieba
import jieba.posseg as pseg
import os
import sys
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

if __name__ == "__main__":
    corpus = [
        "我 来到 北京 清华大学",  # segmented text of document 1; tokens are separated by spaces
        "他 来到 了 网易 杭研 大厦",  # segmented text of document 2
        "小明 硕士 毕业 与 中国 科学院",  # segmented text of document 3
        "我 爱 北京 天安门"  # segmented text of document 4
    ]
    vectorizer = CountVectorizer()  # converts the documents into a term-frequency matrix; element a[i][j] is the count of word j in document i
    transformer = TfidfTransformer()  # computes the tf-idf weight of every term
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))  # the inner fit_transform builds the count matrix, the outer one computes tf-idf
    word = vectorizer.get_feature_names()  # all terms in the bag-of-words vocabulary
    weight = tfidf.toarray()  # dense tf-idf matrix; a[i][j] is the tf-idf weight of word j in document i
    for i in range(len(weight)):  # print the tf-idf weights per document: the outer loop walks the documents, the inner loop the vocabulary
        print(u"------- tf-idf term weights for document", i, u"-------")
        for j in range(len(word)):
            print(word[j], weight[i][j])
Example #40
    def predict_posts(self):

        docs_train, docs_test, y_train, y_test = train_test_split(
            X, y, test_size=0.01, random_state=42)

        print("Number of data point is " + str(len(y)))

        ###############
        # uncomment either one of the below
        # predict unlabelled tweet OR test classifier on gold standard
        ###############

        # dataset_topredict = pd.read_csv(path_to_file_to_be_predicted, header=0, names=['tweets'])
        dataset_topredict = pd.read_csv(path_to_gold_standard_file,
                                        header=0,
                                        names=['tweets', 'class'])

        X_topredict = dataset_topredict['tweets']
        y_goldstandard = dataset_topredict['class']

        ###############
        # train classifier
        ###############

        # Get list of features
        count_vect = CountVectorizer(stop_words=stopwords,
                                     min_df=3,
                                     max_df=0.90,
                                     ngram_range=_ngram_range)
        X_CV = count_vect.fit_transform(docs_train)

        # print number of unique words (n_features)
        print("Shape of train data is " + str(X_CV.shape))

        # tfidf transformation###

        tfidf_transformer = TfidfTransformer(use_idf=_use_idf)
        X_tfidf = tfidf_transformer.fit_transform(X_CV)

        # train the classifier

        print("Fitting data ...")
        clf = RandomForestClassifier().fit(X_tfidf, y_train)

        ##################
        # get cross validation score
        ##################

        scores = cross_val_score(clf,
                                 X_tfidf,
                                 y_train,
                                 cv=10,
                                 scoring='f1_weighted')
        print("Cross validation score: " + str(scores))

        # Get average performance of classifier on training data using 10-fold CV, along with standard deviation
        # the factor two is to signify 2 sigma, which is 95% confidence level

        print("Cross validation accuracy: %0.2f (+/- %0.2f)" %
              (scores.mean(), scores.std() * 2))

        ##################
        # run classifier to predict tweets
        ##################

        X_test_CV = count_vect.transform(X_topredict)

        print("Shape of test data is " + str(X_test_CV.shape))

        X_test_tfidf = tfidf_transformer.transform(X_test_CV)

        y_predicted = clf.predict(X_test_tfidf)

        ##################
        # run classifier on gold standard (tweets that were labelled by twitter insight)
        ##################

        # print the mean accuracy on the given test data and labels

        print("Classifier score on test data is: %0.2f " %
              clf.score(X_test_tfidf, y_goldstandard))

        print(metrics.classification_report(y_goldstandard, y_predicted))
        cm = metrics.confusion_matrix(y_goldstandard, y_predicted)
        print(cm)

        ##################
        # write prediction results to file
        ##################

        f = open(path_to_store_predicted_results, 'w')

        for yp in y_predicted:
            f.write(yp + '\n')

        f.close()
Example #41
from sklearn.datasets import fetch_20newsgroups

Training_Data = fetch_20newsgroups(subset='train', shuffle=True)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

classificationText = Pipeline([('vect', CountVectorizer()),
                               ('tfidf', TfidfTransformer()),
                               ('clf', MultinomialNB())])

classificationText = classificationText.fit(Training_Data.data,
                                            Training_Data.target)

# Performance measurement of NB Classifier
import numpy as np
Testing_Data = fetch_20newsgroups(subset='test', shuffle=True)
prediction_target = classificationText.predict(Testing_Data.data)

print("Accuracy in Categorization in percentage : ",
      (np.mean(prediction_target == Testing_Data.target)) * 100)
Example #42
    def train_classifier_use_feature_selection(self):

        # Get list of features
        count_vect = CountVectorizer(stop_words=stopwords,
                                     min_df=3,
                                     max_df=0.90,
                                     ngram_range=_ngram_range)
        X_CV = count_vect.fit_transform(docs_train)

        # print number of unique words (n_features)
        print("Shape of train data is " + str(X_CV.shape))

        # tfidf transformation###

        tfidf_transformer = TfidfTransformer(use_idf=_use_idf)
        X_tfidf = tfidf_transformer.fit_transform(X_CV)

        #################
        # feature selection
        #################

        selector = SelectPercentile(score_func=_score_func,
                                    percentile=_percentile)

        print("Fitting data with feature selection ...")
        selector.fit(X_tfidf, y_train)

        # get how many features are left after feature selection
        X_features = selector.transform(X_tfidf)

        print("Shape of array after feature selection is " +
              str(X_features.shape))

        clf = RandomForestClassifier(n_estimators=_n_estimators,
                                     criterion=_criterion,
                                     max_depth=_max_depth,
                                     min_samples_split=_min_samples_split).fit(
                                         X_features, y_train)

        # get the features which are selected and write to file

        feature_boolean = selector.get_support(indices=False)

        f = open(path_to_store_feature_selection_boolean_file, 'w')

        for fb in feature_boolean:
            f.write(str(fb) + '\n')

        f.close()

        ##################
        # get cross validation score
        ##################

        scores = cross_val_score(clf,
                                 X_features,
                                 y_train,
                                 cv=10,
                                 scoring='f1_weighted')
        print("Cross validation score: " + str(scores))

        # Get average performance of classifier on training data using 10-fold CV, along with standard deviation

        print("Cross validation accuracy: %0.2f (+/- %0.2f)" %
              (scores.mean(), scores.std() * 2))

        ####################
        #test clf on test data
        ####################

        X_test_CV = count_vect.transform(docs_test)

        print("Shape of test data is " + str(X_test_CV.shape))

        X_test_tfidf = tfidf_transformer.transform(X_test_CV)

        # apply feature selection on test data too
        X_test_selector = selector.transform(X_test_tfidf)
        print("Shape of array for test data after feature selection is " +
              str(X_test_selector.shape))

        y_predicted = clf.predict(X_test_selector)

        # print the mean accuracy on the given test data and labels

        print("Classifier score on test data is: %0.2f " %
              clf.score(X_test_selector, y_test))

        print(metrics.classification_report(y_test, y_predicted))
        cm = metrics.confusion_matrix(y_test, y_predicted)
        print(cm)

        return clf, count_vect
pickle.dump(nb, f)
f.close()

# importing classification report and confussion matrix
from sklearn.metrics import confusion_matrix,classification_report

print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))

from sklearn.feature_extraction.text import  TfidfTransformer
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer()),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

X = text['review']
y = df['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=101)

pipeline.fit(X_train,y_train)

predictions = pipeline.predict(X_test)
print("Actual Ratings(rating): ",end = "")
display(y_test[:15])
print("Predicted Ratings: ",end = "")
# print(predictions[:15])
Example #44
0
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
print("lenth of the twenty_train--------->", len(twenty_train))
#print(twenty_train.target_names)  #prints all the categories

print("***First Line of the First Data File***")
#print("\n".join(twenty_train.data[0].split("\n")[:5]))#prints first line of the first data file

#2 Extracting features from text files
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print('dim=', X_train_counts.shape)

#3 TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# Machine Learning
#4 Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

# Building a pipeline: We can write less code and do all of the above by building a pipeline as follows:
# The names ‘vect’ , ‘tfidf’ and ‘clf’ are arbitrary but will be used later.
# We will be using the 'text_clf' going forward.
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
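
# A minimal usage sketch for the fitted pipeline above (the two sample documents are
# illustrative assumptions, not part of the original tutorial):
docs_new = ['OpenGL on the GPU is fast', 'God is love']
predicted_new = text_clf.predict(docs_new)
for doc, category in zip(docs_new, predicted_new):
    print('%r => %s' % (doc, twenty_train.target_names[category]))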
def tfidf_transformer(bow_matrix):
    transformer = TfidfTransformer(norm='l2', smooth_idf=True, use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix
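
# A small, hypothetical usage sketch for the tfidf_transformer helper above; the toy
# corpus and variable names here are assumptions for illustration only.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

example_corpus = ['the sky is blue', 'the sun is bright', 'the sun in the sky is bright']
bow_vectorizer = CountVectorizer()
example_bow = bow_vectorizer.fit_transform(example_corpus)
example_transformer, example_tfidf = tfidf_transformer(example_bow)
print(example_tfidf.shape)  # (3 documents, vocabulary size)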
Example #46
0
def load_imdb(feature_type='tfidf'):
    """
	Load IMDB data in several formats.

	:param feature_type: feature type, default is 'tfidf', others are 'origin', 'tfidf-seq'
	:return:
	"""
    data_dir = "../DeepForestTF_Data/"
    if (os.path.exists(data_dir + "imdb_x_train.npy")
            and os.path.exists(data_dir + "imdb_x_test.npy")
            and os.path.exists(data_dir + "imdb_y_train.npy")
            and os.path.exists(data_dir + "imdb_y_test.npy")):
        x_train = np.load(data_dir + "imdb_x_train.npy")
        x_test = np.load(data_dir + "imdb_x_test.npy")
        y_train = np.load(data_dir + "imdb_y_train.npy")
        y_test = np.load(data_dir + "imdb_y_test.npy")
        x_train = x_train.reshape((x_train.shape[0], -1))
        x_test = x_test.reshape((x_test.shape[0], -1))
        return x_train, x_test, y_train, y_test

    max_features = 0
    if feature_type.startswith('tfidf'):
        max_features = 5000
        (x_train, y_train), (x_test,
                             y_test) = imdb.load_data(num_words=max_features)
    else:
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=None)
    if feature_type == 'origin':
        max_len = 400
        x_train = sequence.pad_sequences(x_train, maxlen=max_len)
        x_test = sequence.pad_sequences(x_test, maxlen=max_len)
    elif feature_type == 'tfidf':
        from sklearn.feature_extraction.text import TfidfTransformer
        transformer = TfidfTransformer(smooth_idf=True)
        x_train_bin = np.zeros((len(x_train), max_features), dtype=np.int16)
        x_test_bin = np.zeros((len(x_test), max_features), dtype=np.int16)
        for i, x_i in enumerate(x_train):
            x_train_bin[i, :] = np.bincount(x_i, minlength=max_features)
        for i, x_i in enumerate(x_test):
            x_test_bin[i, :] = np.bincount(x_i, minlength=max_features)
        transformer.fit_transform(x_train_bin)
        x_train = transformer.transform(x_train_bin)
        x_test = transformer.transform(x_test_bin)
        x_train = np.asarray(x_train.todense())
        x_test = np.asarray(x_test.todense())
    elif feature_type == 'tfidf-seq':
        from sklearn.feature_extraction.text import TfidfTransformer
        transformer = TfidfTransformer(smooth_idf=True)
        transformer2 = TfidfTransformer(smooth_idf=True)
        max_len = 400
        n_train = len(x_train)
        n_test = len(x_test)
        x_train_bin = np.zeros((n_train, max_features), dtype=np.int16)
        x_test_bin = np.zeros((n_test, max_features), dtype=np.int16)
        for i, x_i in enumerate(x_train):
            x_train_bin_i = np.bincount(x_i)
            x_train_bin[i, :len(x_train_bin_i)] = x_train_bin_i
        for i, x_i in enumerate(x_test):
            x_test_bin_i = np.bincount(x_i)
            x_test_bin[i, :len(x_test_bin_i)] = x_test_bin_i
        x_train_tfidf = transformer.fit_transform(x_train_bin)
        x_test_tfidf = transformer2.fit_transform(x_test_bin)
        x_train_tfidf = np.asarray(x_train_tfidf.todense())
        x_test_tfidf = np.asarray(x_test_tfidf.todense())
        x_train_id = sequence.pad_sequences(x_train, maxlen=max_len)
        x_test_id = sequence.pad_sequences(x_test, maxlen=max_len)
        x_train = np.zeros(x_train_id.shape, dtype=np.float32)
        x_test = np.zeros(x_test_id.shape, dtype=np.float32)
        for i in range(n_train):
            x_train[i, :] = x_train_tfidf[i][x_train_id[i]]
        for i in range(n_test):
            x_test[i, :] = x_test_tfidf[i][x_test_id[i]]
    else:
        raise ValueError('Unknown feature type: {}'.format(feature_type))

    x_train = x_train[:, np.newaxis, :, np.newaxis].astype('float32')
    x_test = x_test[:, np.newaxis, :, np.newaxis].astype('float32')
    return x_train, x_test, y_train.astype('int8'), y_test.astype('int8')
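
# A brief, hedged usage sketch for load_imdb above (assumes the Keras IMDB dataset is
# available and that the optional .npy cache in data_dir may be absent):
if __name__ == '__main__':
    x_train, x_test, y_train, y_test = load_imdb(feature_type='tfidf')
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)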
Example #47
0
    file = os.path.join(args.train, "processed_data.csv")

    # Loading Data
    df = pd.read_csv(file, engine="python")
    # isolating the target column (label)
    y = df['LABEL']
    X = df.drop(['LABEL'], axis=1)

    # Splitting into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=42)

    nb_clf = Pipeline([('vect', CountVectorizer()),
                       ('tfidf', TfidfTransformer()), ('nb', MultinomialNB())])
    parameters_ = {
        'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
        'tfidf__use_idf': (True, False),
        'tfidf__norm': ('l1', 'l2'),
        'nb__alpha': [1, 1e-1, 1e-2]
    }
    #cls_naivebayes_ = MultinomialNB()
    clf = GridSearchCV(nb_clf, param_grid=parameters_, cv=5)
    clf.fit(X_train["PROCESSED_REVIEW"].values, y_train)

    # saving the model using joblib
    joblib.dump(clf, os.path.join(args.model_dir, 'model.joblib'))
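
    # A small, hedged addition (not in the original script): after fitting, GridSearchCV
    # exposes the winning hyper-parameters and the mean cross-validated score, which can
    # be logged alongside the persisted model.
    print("Best parameters: ", clf.best_params_)
    print("Best CV score: %.4f" % clf.best_score_)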


def input_fn(input_data, content_type='application/json'):
tf    = cv.fit_transform(comments)
terms = cv.get_feature_names()
term_sums = tf.sum(axis=0)
term_counts = []
for i in range(len(terms)):
    term_counts.append([terms[i], term_sums[0,i]])
def sortSecond(e):
    return e[1]
term_counts.sort(key=sortSecond, reverse=True)
print("\nTerms with Highest Frequency:")
for i in range(50):
    print('{:<15s}{:>5d}'.format(term_counts[i][0], term_counts[i][1]))
print("")
# Convert tf (the raw term-frequency matrix) into a TF-IDF weighted matrix
print("Building the term-frequency matrix weighted by TF-IDF")
tfidf_vect = TfidfTransformer(norm=None, use_idf=True) #set norm=None
tf         = tfidf_vect.fit_transform(tf)

term_idf_sums = tf.sum(axis=0)
term_idf_scores = []
for i in range(len(terms)):
    term_idf_scores.append([terms[i], term_idf_sums[0,i]])
term_idf_scores.sort(key=sortSecond, reverse=True)

# In sklearn, TruncatedSVD is the usual way to perform LSA (Latent Semantic Analysis);
# the code below fits an LDA (Latent Dirichlet Allocation) topic model instead.
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter,
                                learning_method=learning_method,
                                learning_offset=learning_offset,
                                random_state=12345)
lda.fit_transform(tf)
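
# For comparison, a minimal sketch of the TruncatedSVD/LSA route mentioned in the comment
# above, applied to the same TF-IDF matrix (n_topics is assumed to be the value already
# defined for the LDA call):
from sklearn.decomposition import TruncatedSVD

lsa = TruncatedSVD(n_components=n_topics, random_state=12345)
lsa_topics = lsa.fit_transform(tf)
print("Variance explained by the LSA components: %.3f" % lsa.explained_variance_ratio_.sum())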
Example #49
0
File: P2.py  Project: frederik9530/P2
# I want to find articles related to the demonstrations in Hong Kong
# So I originally wanted to just make my query 'Hong Kong', but I landed on:
# 'hong', 'protests' and 'extradition'

terms = ['hong', 'protests', 'extradition']
# After several tries with different capitalizations and "hong kong", I just settled for
# 'hong'. It's not like 'hong' and 'kong' are separate words in English anyway

term_idxs = [model_vect.vocabulary_.get(term) for term in terms]
term_counts = [counts[idx] for idx in term_idxs]
print(term_counts)

#Here we get the term counts for each of the three words in our query

from sklearn.feature_extraction.text import TfidfTransformer
model_tfidf = TfidfTransformer()
data_tfidf = model_tfidf.fit_transform(data_vect)
data_tfidf
# And then here I transform the counts into a tf-idf representation


idfs = model_tfidf.idf_
term_idfs = [idfs[idx] for idx in term_idxs]
term_idfs
# And I get the individual weights for my selected terms

df = pd.DataFrame(columns=['count', 'idf'], index=terms, data=list(zip(term_counts, term_idfs)))
df
# Here I use pandas to make a dataframe that lets me compare my term counts with
# my term weights. The one with the largest difference is 'extradition', which I
# guess means that it has a higher semantic value compared to how often it appears
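
# As a small, hedged extension of that comparison (not in the original notebook), the
# total tf-idf weight each query term carries across the corpus can be placed next to
# its raw count and idf:
term_tfidf_totals = [data_tfidf[:, idx].sum() for idx in term_idxs]
df['tfidf_total'] = term_tfidf_totals
df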
Example #50
0
    neg_word2vec_scores[i] += neg_min
#a_min = -min(diff)
#diff = [i + a_min for i in diff]

print X[0]

#pos_word2vec_scores, neg_word2vec_scores = np.array(pos_word2vec_scores), np.array(neg_word2vec_scores)
#diff = np.array(diff)
sentiment_prob_list = np.array(sentiment_prob_list)
X = np.array(X)
Y = np.array(Y)

print X.shape
print Y.shape
vectorizer = CountVectorizer(stop_words=stop)
transformer = TfidfTransformer()
#tfv = TfidfVectorizer(min_df=3,  max_features=None, strip_accents='unicode',  \
#	analyzer='word',token_pattern=r'\w{1,}',ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1)

trainVectorizerArray = vectorizer.fit_transform(X).toarray()
transformer.fit(trainVectorizerArray)
L = transformer.transform(trainVectorizerArray).toarray()
print L.shape

#tfv.fit(X)
#Z = tfv.transform(X)

#svd = TruncSVD(n_components = 25)
#totalsvd = svd.fit(Z)
#totalsvd = svd.fit_transform(Z)
#totalsvd = totalsvd[:,:] + 1
Example #51
0
	file_obj.close()
	return bunch
# write the bunch object to file
def writebunchobj(path,bunchobj):
	file_obj = open(path, "wb")
	pickle.dump(bunchobj,file_obj) 
	file_obj.close()	

# 1. 读取停用词表	
stopword_path = "train_word_bag/hlt_stop_words.txt"
stpwrdlst = readfile(stopword_path).splitlines()

# 2. 导入分词后的词向量bunch对象
path = "train_word_bag/train_set.dat"        # 词向量空间保存路径
bunch	= readbunchobj(path)

# 3. Build the tf-idf word-vector space object
tfidfspace = Bunch(target_name=bunch.target_name,label=bunch.label,filenames=bunch.filenames,tdm=[],vocabulary={})

# 4. Initialize the vector space model with TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
transformer = TfidfTransformer()  # this class computes the tf-idf weight of every term
# convert the texts into a term-frequency matrix; the vocabulary is saved separately
tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
tfidfspace.vocabulary = vectorizer.vocabulary_

# persist the bag-of-words space
space_path = "train_word_bag/tfdifspace.dat"        # path where the tf-idf space is saved
writebunchobj(space_path, tfidfspace)

print "tf-idf word-vector space created successfully!!!"
Example #52
0
twenty_test = fetch_20newsgroups(subset='test',
                                 categories=categories,
                                 shuffle=True)
#twenty_train=fetch_20newsgroups(data_home='./scikit_learn_data',subset='train',shuffle=True)
#print(twenty_train)
#twenty_test=fetch_20newsgroups(data_home='./scikit_learn_data',subset='test',shuffle=True)
#print(twenty_train)
print("Number of Training Examples: ", len(twenty_train.data))
print("Number of Test Examples: ", len(twenty_test.data))
print(twenty_train.target_names)

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_tf = count_vect.fit_transform(twenty_train.data)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_tf)
X_train_tfidf.shape

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn import metrics
mod = MultinomialNB()
mod.fit(X_train_tfidf, twenty_train.target)
X_test_tf = count_vect.transform(twenty_test.data)
X_test_tfidf = tfidf_transformer.transform(X_test_tf)
predicted = mod.predict(X_test_tfidf)

print("Accuracy: ", accuracy_score(twenty_test.target, predicted))
print(
    metrics.classification_report(twenty_test.target, predicted))
# load data.npy and target.npy
data = sp.load('data.npy')
target = sp.load('target.npy')

# vectorize method step1: count vectorizer (bag of words)
from sklearn.feature_extraction.text import CountVectorizer

countVector = CountVectorizer(stop_words=stopWords, decode_error='ignore')
trainCounts = countVector.fit_transform(data)
# .shape output format: (sample number, dict size)
# print(trainCounts.shape)

# vectorize method step2: TF transformer (use_idf=False, i.e. normalized term frequencies only)
from sklearn.feature_extraction.text import TfidfTransformer
tfTransformer = TfidfTransformer(use_idf=False).fit(trainCounts)
trainTF = tfTransformer.transform(trainCounts)
# print(trainTF.shape)

# build naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
# 1) use bag-of-words vector
naiveBayesClassifier = MultinomialNB().fit(trainCounts, target)
# 2) use TF vector
# naiveBayesClassifier = MultinomialNB().fit(trainTF, target)

# print test results; the data must be transformed the same way the classifier was
# trained (raw counts for option 1, tfTransformer.transform(...) for option 2)
predicted = naiveBayesClassifier.predict(countVector.transform(data))

from sklearn import metrics
# print(metrics.classification_report(target, predicted))
Example #54
0
File: eucprecise.py  Project: ybhy/ESSAY
import numpy as np
from numpy import *
from numpy import arange
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

corpus = []
for line in open('text3000.txt', 'r').readlines():
    line = line.split('\t')
    corpus.append(line[1].strip())
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
word = vectorizer.get_feature_names()
weight = tfidf.toarray()
print type(weight)
print len(weight)
print len(weight[0])
size = 3000
length = 200
preciseAll = 0

# resName = "euc60.txt"
# result = codecs.open(resName, 'w', 'utf-8')
for i in xrange(0, len(weight)):
    d = dict()
    for j in xrange(0, len(weight)):
Example #55
0
    #    f.readline()
    #    print(f)
    #    np1 = np.loadtxt(f, dtype = 'str', delimiter = ',')
    #    np1 = np.loadtxt(filecp, dtype = 'str', delimiter = ',',skiprows=5)
    df1 = pd.read_csv(filename, header=None, encoding="utf8")
    np1 = np.array(df1)
    data = np1[:, 0:-1]
    label = np1[:, -1]
    return data, label.astype('float32')


if __name__ == "__main__":
    data, label = read_data("../input/data_clean.csv")
    data_combine = np.hstack((data[:, 0], data[:, 1]))
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    label_up = np.hstack((label, label)).astype("float32")
    similarity = np.zeros((data.shape[0], 1))
    tfidf = transformer.fit_transform(vectorizer.fit_transform(data_combine))
    question = csr_matrix(tfidf[0:data.shape[0]])
    answer = csr_matrix(tfidf[data.shape[0]:])
    data_sparse = hstack([question, answer])
    plt.figure()
    for model in ['logistic', 'linear', 'GBDT', 'NN']:
        mse_train = []
        mse_test = []
        print(model)
        for i in range(1, 100):
            data_sparse_one = SelectKBest(chi2, k=i * 100).fit_transform(
                data_sparse, label)
            #        print("precess finished")
Example #56
0
for i in range(len(data['body'])):
    if i in selected_indices:
        bodyCommentsTest.append(data['body'][i])
        scoreTest.append(data['score'][i])
    else:
        bodyCommentsTraining.append(data['body'][i])
        scoreTraining.append(data['score'][i])

cv = CountVectorizer(stop_words='english',
                     strip_accents='ascii',
                     max_df=0.8,
                     ngram_range=(1, 5))  #, ngram_range=(1, 2)
bodyCommentsTraining = cv.fit_transform(bodyCommentsTraining)
bodyCommentsTest = cv.transform(bodyCommentsTest)

transformer = TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True)
bodyCommentsTraining = transformer.fit_transform(bodyCommentsTraining)
bodyCommentsTest = transformer.transform(bodyCommentsTest)

y_pred = KMeans().fit_predict(bodyCommentsTraining)

print y_pred

values = set()

for x in y_pred:
    values.add(x)

temp = collections.Counter(y_pred)

maxValue = 0
Example #57
0
class BotServer:
    def __init__(self, file_path):
        """
        Initialize corpus, bag-of-words, and TFIDF from CSV file at argument
        file_path.
        """
        processing = Processing()
        # Read in FAQ data
        self.faq = pd.read_csv(file_path, keep_default_na=False)
        self.corpus = self.faq.question + ' ' + self.faq.answer

        # Create BOW transformer fitted on faq.question
        self.bow_transformer = CountVectorizer(
            analyzer=processing.text_process).fit(self.faq.question)
        # Transform faq.question itself into BOW
        self.corpus_bow = self.bow_transformer.transform(self.faq.question)

        # Create TFIDF transformer based on faq.question's BOW
        self.tfidf_transformer = TfidfTransformer().fit(self.corpus_bow)
        # Transform faq.question's BOW into TFIDF
        self.corpus_tfidf = self.tfidf_transformer.transform(self.corpus_bow)

        # Initialize search module
        encoder, decoder, decoder_n_layers, self.voc = buildModels()
        self.searcher = GreedySearchDecoder(encoder, decoder, decoder_n_layers)

        # Set upload folder and output records folder
        self.UPLOAD_FOLDER = '/app/records/in'
        self.REC_RES_FOLDER = '/app/records/out'

        # Set allowed extensions
        self.ALLOWED_EXTENSIONS = {'wav'}

    def tfidf_similarity(self, query):
        """
        Returns (index, similarity value) of string argument query's most similar
        match in FAQ, determined by cosine similarity.
        """
        # Transform test question into BOW using BOW transformer
        query_bow = self.bow_transformer.transform([query])
        # Transform test question's BOW into TFIDF
        query_tfidf = self.tfidf_transformer.transform(query_bow)

        # Calculate cosine similarity and return maximum value with accompanying index
        similarities = np.transpose(
            cosine_similarity(query_tfidf, self.corpus_tfidf))
        max_similarity = similarities.max()
        max_index = np.argmax(similarities)

        return max_index, max_similarity

    def match_query(self, query):
        """
        Prints most similar match in FAQ to user query.
        """
        index, similarity = self.tfidf_similarity(query)

        if similarity > 0.5:
            response = self.faq.answer.iloc[index]
            print(similarity)
        else:
            query = normalizeString(query)
            output_words = evaluate(self.searcher, self.voc, query)
            output_words[:] = [
                x for x in output_words if not (x == 'EOS' or x == 'PAD')
            ]
            response = ' '.join(output_words)
        return response

    def allowed_file(self, filename):
        return '.' in filename and filename.rsplit(
            '.', 1)[1].lower() in self.ALLOWED_EXTENSIONS

    def get_duration(self, audio_name_only):
        fname = os.path.join(self.REC_RES_FOLDER, audio_name_only)
        audio = MP3(fname)
        return round(audio.info.length)

    def bot_dialog(self, request):
        """
        Given the argument POST request, parse it according to form data,
        and return a json response based on sklearn matching
        within the FAQ.
        """

        # Handle webhook request
        req = request.form
        msg_type = req.get('type')

        if msg_type == "Text":
            message = req.get('message')
            response_text = self.match_query(message)
            # Return json file as webhook response
            messages = [{
                "type": "Text",
                "message": msg,
                "fromBot": True
            } for msg in response_text.split("\n\n")]

        elif msg_type == "Audio":
            respfilename = ''
            record = request.files['record']
            if record and self.allowed_file(record.filename):
                filename = secure_filename(record.filename)
                record.save(os.path.join(self.UPLOAD_FOLDER, filename))

            list_records = []
            durations = []

            try:
                r = spechrec.Recognizer()
                with spechrec.AudioFile(
                        os.path.join(self.UPLOAD_FOLDER, filename)) as source:
                    # listen for the data (load audio to memory)
                    audio_data = r.record(source)
                    # recognize (convert from speech to text)
                    input_sentence = r.recognize_google(audio_data)
                #searcher = GreedySearchDecoder(encoder, decoder,decoder_n_layers)
                response_text = self.match_query(input_sentence)
                for msg in response_text.split("\n\n"):
                    now = datetime.now()
                    respfilename = now.strftime("%d-%m-%Y-%H:%M:%S") + ".mp3"
                    engine = gTTS(msg, lang='en')  # synthesize this chunk rather than the whole response
                    engine.save(os.path.join(self.REC_RES_FOLDER,
                                             respfilename))
                    list_records.append(respfilename)
                    durations.append(self.get_duration(respfilename))

            except:
                erreur = random.choice([
                    "Sorry, I did not understand you. Please change the way you say it.",
                    "Please keep it simple in your phrasing, I am not a human.",
                    "Sorry, keep in mind that you are only talking with a computer."
                ])
                print("" + erreur)
                now = datetime.now()
                respfilename = now.strftime("%d-%m-%Y-%H:%M:%S") + ".mp3"
                engine = gTTS('' + erreur, lang='en')
                engine.save(os.path.join(self.REC_RES_FOLDER, respfilename))
                list_records.append(respfilename)
                durations.append(self.get_duration(respfilename))

            #Return json file as webhook response
            messages = [{
                "type":
                "Audio",
                "path":
                "https://coronafaqsbot.herokuapp.com/records/" +
                list_records[i],
                "isLocal":
                False,
                "duration":
                durations[i],
                "fromBot":
                True
            } for i in range(len(list_records))]
        return jsonify({"messages": messages})
Example #58
0
# use all 25K words. Higher accuracy
movieVzer = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize)

# fit and transform using the training text
docs_train_counts = movieVzer.fit_transform(docs_train)

# 'screen' is found in the corpus, mapped to index 2290
print(movieVzer.vocabulary_.get('screen'))

# Likewise, Mr. Steven Seagal is present...
print(movieVzer.vocabulary_.get('seagal'))

print(docs_train_counts.shape)

# Convert raw frequency counts into TF-IDF values
movieTfmer = TfidfTransformer()
docs_train_tfidf = movieTfmer.fit_transform(docs_train_counts)

# Using the fitted vectorizer and transformer, transform the test data
docs_test_counts = movieVzer.transform(docs_test)
docs_test_tfidf = movieTfmer.transform(docs_test_counts)

# Now ready to build a classifier.
# We will use Multinomial Naive Bayes as our model

# Train a Multinomial Naive Bayes classifier. Again, we call it "fitting"
clf = MultinomialNB()
clf.fit(docs_train_tfidf, y_train)

# Predict the Test set results, find accuracy
y_pred = clf.predict(docs_test_tfidf)
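
# Completing the "find accuracy" step noted above with a short, hedged sketch (y_test is
# assumed to hold the held-out labels from the earlier train/test split in this example):
from sklearn.metrics import accuracy_score

print("Accuracy on the test set: %.4f" % accuracy_score(y_test, y_pred))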
Example #59
0
    def train(self, method = "xgboost"):
        # x_train, x_test, y_train, y_test = self.read_data()
        corpus, label, numclass = self.feature(feature=self.feature_)

        print "label: ", Counter(label)
        print "numclass: ", numclass
        print "------------------------------------------------"

        x_train, x_test, y_train, y_test = train_test_split(corpus, label, test_size=0.2)
        print "train num: ", len(y_train)
        print "test num: ", len(y_test)
        print "------------------------------------------------"

        vectorizer = CountVectorizer()
        tfidftransformer = TfidfTransformer()
        tfidf = tfidftransformer.fit_transform(vectorizer.fit_transform(x_train))
        weight = tfidf.toarray()
        print "weight.shape: ", weight.shape
        test_tfidf = tfidftransformer.transform(vectorizer.transform(x_test))
        test_weight = test_tfidf.toarray()
        print "test_weight.shape: ", test_weight.shape
        print "------------------------------------------------\n"

        # city count feature
        x_train_count = np.array([self.city_count_feature(x) for x in x_train])
        x_test_count = np.array([self.city_count_feature(x) for x in x_test])
        assert len(weight) == len(x_train_count)
        assert len(test_weight) == len(x_test_count)
        weight = np.concatenate((weight, x_train_count), axis=1)
        test_weight = np.concatenate((test_weight, x_test_count), axis=1)
        # for i in range(len(weight)):
        #     weight[i] = np.append(weight[i], x_train_count[i])
        #     test_weight[i] = np.append(test_weight[i], x_test_count[i])
        print "weight.shape: ", weight.shape
        print "test_weight.shape: ", test_weight.shape
        # sys.exit(0)

        print "---------------------- train --------------------------"
        xgbtrain = xgb.DMatrix(weight, label=y_train)
        xgbtest = xgb.DMatrix(test_weight, label=y_test)
        param = {'max_depth': 6, 'eta': 0.05, 'eval_metric': 'merror', 'silent': 1, 'objective': 'multi:softmax',
                 'num_class': numclass}  # training parameters
        evallist = [(xgbtrain, 'train'), (xgbtest, 'test')]
        num_round = 100  # number of boosting rounds
        bst = xgb.train(param, xgbtrain, num_round, evallist)
        # save model
        bst.save_model("./../model/TC{}.model".format(self.feature_))
        preds = bst.predict(xgbtest)
        print "------------------------------------------------"

        count = 0
        for i in range(len(y_test)):
            if preds[i] == y_test[i]:
                count += 1
        print "precision: ", count * 1.0 / len(y_test)

        good_cases = []
        bad_cases = []
        valid_data = pd.read_csv(InputDataPath)
        label_mapping_convert = self.caseAnalysis()
        print "------------------ case analysis ------------------"
        for i in range(len(y_test)):
            # good case
            if y_test[i] == preds[i]:
                goodcase = {"case":"good"}
                index = 0
                while index < 399:
                    if valid_data["extract"][index].decode("utf-8") == x_test[i]:
                        break
                    index += 1
                goodcase["index"] = index
                for key, value in valid_data.iloc[index].items():
                    goodcase[key] = value
                goodcase["label"] = y_test[i]
                goodcase["predict"] = int(preds[i])
                goodcase["predict_text"] = label_mapping_convert[preds[i]]
                good_cases.append(goodcase)
            # bad case
            else:
                badcase = {"case": "bad"}
                index = 0
                while index < 399:
                    if valid_data["extract"][index].decode("utf-8") == x_test[i]:
                        break
                    index += 1
                badcase["index"] = index
                for key, value in valid_data.iloc[index].items():
                    badcase[key] = value
                badcase["label"] = y_test[i]
                badcase["predict"] = int(preds[i])
                badcase["predict_text"] = label_mapping_convert[preds[i]]
                bad_cases.append(badcase)
        print "------------------ good case  ------------------"
        print json.dumps(good_cases, ensure_ascii=False, encoding="utf-8", indent=4)
        print "------------------ bad case  ------------------"
        print json.dumps(bad_cases, ensure_ascii=False, encoding="utf-8", indent=4)

        print "------------------------------------------------"
        print "precision: ", count * 1.0 / len(y_test)
Example #60
-1
def test_tfidf_no_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # the lack of smoothing makes IDF fragile in the presence of features with
    # only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')

    # First we need to verify that numpy here provides div 0 warnings
    with warnings.catch_warnings(record=True) as w:
        1. / np.array([0.])
        numpy_provides_div0_warning = len(w) == 1

    with warnings.catch_warnings(record=True) as w:
        tfidf = tr.fit_transform(X).toarray()
        if not numpy_provides_div0_warning:
            raise SkipTest("Numpy does not provide div 0 warnings.")
        assert_equal(len(w), 1)
        # For Python 3 compatibility
        if hasattr(w[0].message, 'args'):
            assert_true("divide by zero" in w[0].message.args[0])
        else:
            assert_true("divide by zero" in w[0].message)