Example No. 1
    def TFIDFEmbedding(self, text_list):
        print('embedding words with the TF-IDF model ...')
        train_data = []
        for item in text_list:
            train_data.append(' '.join(item))  # join tokens with spaces; ''.join would fuse the words together
        tfidf_word = TFIDF(min_df=0,
                           max_features=None,
                           analyzer='word',
                           ngram_range=(1, 3),
                           use_idf=1,
                           smooth_idf=1,
                           sublinear_tf=False,
                           stop_words='english')
        tfidf_char = TFIDF(min_df=0,
                           max_features=None,
                           strip_accents='unicode',
                           analyzer='char',  # was 'word', contradicting this vectorizer's name
                           ngram_range=(1, 3),
                           use_idf=1,
                           smooth_idf=1,
                           sublinear_tf=False,
                           stop_words='english')
        tfidf_word.fit(train_data)

        vector_list = tfidf_word.transform(train_data)
        return vector_list
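The tfidf_char vectorizer above is constructed but never fitted or used. A minimal sketch, assuming the intent was to combine word-level and character-level views (the toy corpus is hypothetical), fits both and stacks them side by side:

from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

docs = ["first toy document", "second toy document"]  # hypothetical corpus
tfidf_word = TFIDF(analyzer='word', ngram_range=(1, 3))
tfidf_char = TFIDF(analyzer='char', ngram_range=(2, 4))
# one row per document: word n-gram features followed by char n-gram features
combined = hstack([tfidf_word.fit_transform(docs),
                   tfidf_char.fit_transform(docs)]).tocsr()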
Example No. 2
def train_tfidf():
    skills, data = read()
    corpus = [' '.join(turnToWordList(d['description'])) for d in data]
    features = 5000
    tfidf = TFIDF(min_df=2, max_features=features, strip_accents="unicode",
                  analyzer="word", token_pattern=r"\w{1,}", ngram_range=(1, 3),
                  use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words="english")
    tfidf.fit(corpus)
    joblib.dump(tfidf, '../../model/tfidf.pkl')
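Once persisted with joblib.dump, the fitted vectorizer can be reloaded elsewhere and applied to unseen text. A minimal sketch, assuming the pickle written above exists at the same relative path:

import joblib

tfidf = joblib.load('../../model/tfidf.pkl')
# transform() reuses the vocabulary and idf weights learned during fit()
vectors = tfidf.transform(['some new job description'])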
Example No. 3
def train_tfidf(train_data, dim=5000):  # dim was undefined in the original; assume it caps the vocabulary
    tfidf = TFIDF(min_df=5,
                  max_features=dim,
                  ngram_range=(1, 3),
                  use_idf=1,
                  smooth_idf=1,
                  sublinear_tf=True)
    X = tfidf.fit_transform(train_data)  # one fit_transform; the separate fit() was redundant
    word_dict = {}
    name = tfidf.get_feature_names()
    with open('name.txt', 'w') as fw:
        for i, s in enumerate(name):
            s = s.replace(" ", "_")  # join the words of an n-gram with underscores
            word_dict[i] = s
            fw.write(s)
            fw.write('\n')

    raw_text = []

    for line in X.A:
        # collect the names of the features active in this document;
        # iterating the dense row directly yields tf-idf values, not indices
        s = ""
        for i in line.nonzero()[0]:
            s += " " + word_dict[i]
        raw_text.append(s)
    return raw_text
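A hypothetical call of the function above: each returned string lists the n-grams active in the corresponding document, with the words of an n-gram joined by underscores. The corpus must be large enough to satisfy min_df=5:

pseudo_docs = train_tfidf(["the quick brown fox"] * 6)
print(pseudo_docs[0])  # e.g. " brown brown_fox fox quick quick_brown ..."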
Example No. 4
    def vectorize2(self, f, feature):
        data = self.preprocess(f)
        tfidf = TFIDF(vocabulary=feature)
        fit_t = tfidf.fit_transform(data["content"])
        weight = pd.DataFrame(fit_t.toarray())

        return weight.values, data["label"].values
Example No. 5
def tf_idf(train_data, test_data):
    """Build TF-IDF vectors."""
    tfidf = TFIDF(
        min_df=2,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 3),  # unigrams through trigrams
        use_idf=1,
        smooth_idf=1,
        sublinear_tf=1,
        stop_words='english')  # drop English stop words

    # Merge the training and test sets so one TF-IDF vocabulary covers both
    data_all = train_data + test_data
    len_train = len(train_data)

    tfidf.fit(data_all)
    data_all = tfidf.transform(data_all)
    # Split back into the training and test portions
    train_x = data_all[:len_train]
    test_x = data_all[len_train:]
    print("train: \n", np.shape(train_x[0]))
    print("test: \n", np.shape(test_x[0]))
    return train_x, test_x
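A hypothetical call of tf_idf; the inputs are plain lists of whitespace-tokenized strings (the function also assumes numpy is imported as np), and the returned matrices stay sparse:

train_x, test_x = tf_idf(["good movie", "bad movie", "good plot"],
                         ["good movie bad plot"])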
Example No. 6
def train_tfidf(train_data):
    tfidf = TFIDF(min_df=5,
                  max_features=5000,
                  ngram_range=(1, 3),
                  use_idf=1,
                  smooth_idf=1)
    tfidf.fit(train_data)
    return tfidf
Example No. 7
    def train_tfidf(self, facts):
        """ train the TFIDF vectorizer model """
        tfidf = TFIDF(min_df=5, max_features=DIM, ngram_range=(1, 3))
        tfidf.fit(facts)

        if util.DEBUG:
            print("DEBUG: TF-IDF model learnt.")
        return tfidf
Example No. 8
    def vectorize(self, data):
        tfidf = TFIDF()
        fit_t = tfidf.fit_transform(data["content"])
        weight = pd.DataFrame(fit_t.toarray())

        word = tfidf.get_feature_names()
        # print(weight.shape)
        return word, weight, data["label"].values
Example No. 9
def train_tfidf(train_data, dim=5000):  # dim was undefined in the original; assume it caps the vocabulary
    tfidf = TFIDF(min_df=5,
                  max_features=dim,
                  ngram_range=(1, 2),
                  use_idf=1,
                  smooth_idf=1,
                  sublinear_tf=True)
    tfidf.fit(train_data)

    return tfidf
Example No. 10
def tf_idf(train_data, test_data):
    len_train = len(train_data)
    tokenizer = TweetTokenizer()
    vectorizer = TFIDF(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
    full_text = list(train_data['Phrase'].values) + list(
        test_data['Phrase'].values)
    vectorizer.fit(full_text)
    data_all = vectorizer.transform(full_text)
    # Split back into the training and test portions
    train_x = data_all[:len_train]
    test_x = data_all[len_train:]
    return train_x, test_x
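The point of passing TweetTokenizer is that it keeps social-media tokens that the default regex tokenizer would split apart. A small self-contained check, assuming nltk is installed:

from nltk.tokenize import TweetTokenizer

tk = TweetTokenizer()
# hashtags and emoticons survive as single tokens
print(tk.tokenize("This movie was #awesome :-)"))
# ['This', 'movie', 'was', '#awesome', ':-)']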
Example No. 11
def to_matrix(docs):  # renamed from `all`, which shadows a Python builtin
    tfidf = TFIDF(
        min_df=3,  # minimum document frequency of 3
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 4),  # unigrams through 4-grams
        use_idf=1,
        smooth_idf=1,
        sublinear_tf=1,
        stop_words='english')  # drop English stop words
    tfidf.fit(docs)
    data_all = tfidf.transform(docs)
    return data_all
Example No. 12
def train_tfidf(train_data, dim=5000, ngram=3, min_df=5):
    ngram_range = (1, 3)
    if ngram == 1:
        ngram_range = (1, 1)
    elif ngram == 2:
        ngram_range = (1, 2)

    tfidf = TFIDF(min_df=min_df,
                  max_features=dim,
                  ngram_range=ngram_range,
                  use_idf=1,
                  smooth_idf=1)
    tfidf.fit(train_data)

    return tfidf
Example No. 13
def tfidf_extraction(texts,
                     vectorizer=None,
                     max_features=2000,
                     ngram_range=(1, 1)):
    if vectorizer is None:
        vectorizer = TFIDF(tokenizer=LemmaTokenizer(),
                           binary=True,  # was the string "True"; the flag expects a boolean
                           strip_accents="unicode",
                           ngram_range=ngram_range,
                           analyzer='word',
                           stop_words='english',
                           max_features=max_features)
        tfidf = vectorizer.fit_transform(texts)
    else:
        tfidf = vectorizer.transform(texts)
    return tfidf, vectorizer
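Typical two-pass usage of tfidf_extraction: the first call fits a fresh vectorizer, the second reuses it so the test matrix shares the training vocabulary (train_texts and test_texts are hypothetical):

train_vecs, vec = tfidf_extraction(train_texts)               # fit a new vectorizer
test_vecs, _ = tfidf_extraction(test_texts, vectorizer=vec)   # transform only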
Example No. 14
def preprocessing():
    """Data preprocessing."""
    df = load_dataset()
    train_X, valid_X, train_y, valid_y = \
        train_test_split(df['content'], df['content_type'],
                         test_size=0.2, random_state=42)
    model_tfidf = TFIDF(min_df=5,
                        max_features=5000,
                        ngram_range=(1, 3),
                        use_idf=1,
                        smooth_idf=1)
    # Learn the IDF vector
    model_tfidf.fit(train_X)
    # Transform the documents into a matrix (counts of each feature term):
    # rows are documents, columns are feature terms
    train_vec = model_tfidf.transform(train_X)
    valid_vec = model_tfidf.transform(valid_X)
    return train_X, valid_X, train_y, valid_y, train_vec, valid_vec
Example No. 15
def tfidf_count(train_data, test_data):
    begintime = datetime.datetime.now()
    # Reference: http://blog.csdn.net/longxinchen_ml/article/details/50629613
    tfidf = TFIDF(
        min_df=2,  # minimum document frequency of 2
        max_features=None,
        strip_accents='unicode',  # strip accents during preprocessing:
        # 'ascii' is a fast method that only works for characters with a
        # direct ASCII mapping; 'unicode' is slightly slower but works for
        # any character; None (the default) does nothing
        analyzer='word',  # whether a feature is a word or a character n-gram
        token_pattern=r'\w{1,}',  # regex defining what counts as a "token";
        # only used when analyzer == 'word'. The default regex selects tokens
        # of 2 or more alphanumeric characters (punctuation is always ignored
        # and treated as a token separator)
        ngram_range=(1, 3),  # unigrams through trigrams
        use_idf=1,  # use inverse document frequency
        smooth_idf=1,  # smooth idf weights by adding one to document
        # frequencies, preventing zero divisions
        sublinear_tf=1,  # replace tf with 1 + log(tf)
        stop_words=None)  # no stop-word removal; if a string, it is passed
    # to _check_stop_list and the matching stop list is returned ('english'
    # is currently the only supported string value)

    # Merge the training and test sets so one TF-IDF vocabulary covers both
    data_all = train_data + test_data
    len_train = len(train_data)

    tfidf.fit(data_all)  # learn vocabulary and idf from the merged corpus

    data_all = tfidf.transform(
        data_all)  # transform documents to a document-term matrix

    #print(data_all)

    # Split back into the training and test portions
    train_tfidf = data_all[:len_train]
    test_tfidf = data_all[len_train:]
    print('TF-IDF over.')

    endtime = datetime.datetime.now()
    tfidftime = (endtime - begintime).seconds * 1000 + (
        endtime - begintime).microseconds / 1000  # microseconds to milliseconds

    return train_tfidf, test_tfidf, tfidftime
Example No. 16
def tfidf_tsne():
    indx_sent, word2idx, idx2word = Sentences().limit_vocab()

    word_sent_counts = np.zeros((len(word2idx) + 1, len(indx_sent) + 1))
    for j, sentence in enumerate(indx_sent):
        for idx in sentence:
            word_sent_counts[idx, j] += 1

    # note: this step assumes TFIDF aliases TfidfTransformer, since the input
    # is a precomputed count matrix rather than raw text
    word_sent_tfidf = TFIDF().fit_transform(word_sent_counts).toarray()
    word_sent_tsne = TSNE().fit_transform(word_sent_tfidf)

    plt.scatter(word_sent_tsne[:, 0], word_sent_tsne[:, 1])
    for label in range(len(word2idx)):
        try:
            # annotate the point with the word itself; encoding to bytes
            # would render as b'...' under Python 3
            plt.annotate(idx2word[label],
                         xy=(word_sent_tsne[label, 0], word_sent_tsne[label, 1]))
        except KeyError:
            pass
    plt.show()
Example No. 17
    def train_tfidf(self, facts):
        tfidf = TFIDF(min_df=5, max_features=DIM, ngram_range=(1, 3))
        tfidf.fit(facts)
        return tfidf  # the fitted vectorizer was never returned
Example No. 18
print(all_data[0])
print(all_data[1999])
print(type(all_data[0]))
print(data_all[0])
print(data_all[1999])
print(type(data_all[0]))


# In[4]:

from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
tfidf = TFIDF(min_df=5,  # minimum document frequency of 5
              max_features=None,
              strip_accents='unicode',
              analyzer='word',
              token_pattern=r'\w{1,}',
              ngram_range=(1, 1),  # unigram model
              use_idf=1,
              smooth_idf=1,
              sublinear_tf=1)


# In[5]:

tfidf.fit(all_data)
all_data = tfidf.transform(all_data)
print(type(all_data))


# In[6]:
Example No. 19
def train_tfidf(train_data):
    tfidf = TFIDF(token_pattern=r"(?u)\b\w+\b")  # 0.85030136
    tfidf.fit(train_data)
    return tfidf
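The custom token_pattern matters because the default pattern, r"(?u)\b\w\w+\b", drops single-character tokens. A quick self-contained comparison using build_analyzer():

from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

default = TFIDF().build_analyzer()
custom = TFIDF(token_pattern=r"(?u)\b\w+\b").build_analyzer()
print(default("a b ab"))  # ['ab'] -- one-letter tokens are dropped
print(custom("a b ab"))   # ['a', 'b', 'ab']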
Example No. 20
train_data = []
for i in range(len(train['review'])):
    train_data.append(' '.join(review_to_wordlist(train['review'][i])))
test_data = []
for i in range(len(test['review'])):
    test_data.append(' '.join(review_to_wordlist(test['review'][i])))

# Preview the data
#print (train_data[0], '\n')
#print( test_data[0])

from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

tfidf = TFIDF(min_df=2,  # minimum document frequency of 2
              max_features=None,
              strip_accents='unicode',
              analyzer='word',
              token_pattern=r'\w{1,}',
              ngram_range=(1, 3),  # unigrams through trigrams
              use_idf=1,
              smooth_idf=1,
              sublinear_tf=1,
              stop_words='english')  # drop English stop words

# Merge the training and test sets so one TF-IDF vocabulary covers both
data_all = train_data + test_data
len_train = len(train_data)

tfidf.fit(data_all)
data_all = tfidf.transform(data_all)
# Split back into the training and test portions
train_x = data_all[:len_train]
test_x = data_all[len_train:]
print('TF-IDF over.')
Example No. 21
# print("*************")
# Is there a class-imbalance problem?
# for i in [0, 1, 2, 3]:
#     print(i, (train.target == i).sum()/len(train.target))
'''
0 0.26052974381241856
1 0.25749023013460703
2 0.23708206686930092
3 0.24489795918367346
'''
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
Xtrain = train.data
Xtest = test.data
Ytrain = train.target
Ytest = test.target
tfidf = TFIDF().fit(Xtrain)
Xtrain_ = tfidf.transform(Xtrain)
Xtest_ = tfidf.transform(Xtest)
# print(Xtrain_)
tosee = pd.DataFrame(Xtrain_.toarray(), columns=tfidf.get_feature_names())
# print(tosee.shape) # (2303, 40725)
# print(tosee.head())

# Modeling
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.metrics import brier_score_loss as BS
name = ["Multinomial", "Complement", "Bernoulli"]
# Note: Gaussian naive Bayes does not accept sparse matrices
models = [MultinomialNB(), ComplementNB(), BernoulliNB()]
for name, clf in zip(name, models):
    clf.fit(Xtrain_, Ytrain)
Example No. 22
    # ,"talk.politics.guns" #政治 - 枪支问题
    ,
    "talk.politics.mideast"
]  #政治 - 中东问题

train = fetch_20newsgroups(subset="train", categories=categories)
test = fetch_20newsgroups(subset="test", categories=categories)

xtrain = train.data
xtest = test.data
print("train", len(data))
# print("text",data[0])
ytrain = train.target
ytest = test.target

tfidf = TFIDF().fit(xtrain)
xtrain_ = tfidf.transform(xtrain)
xtest_ = tfidf.transform(xtest)
print("xtrain_", xtrain_.shape)

from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.metrics import brier_score_loss as BS

name = ["Multinomial", "Complement", "Bournulli"]
models = [MultinomialNB(), ComplementNB(), BernoulliNB()]

# for name,clf in zip(name,models):
#     clf.fit(xtrain_,ytrain)
#     y_pred=clf.predict(xtest_)
#     proba=clf.predict_proba(xtest_)
#     score=clf.score(xtest_,ytest)
Example No. 23
train['api_return'] = train.return_value.map(str)
api_train = train.groupby(by='file_id').apply(lambda x: ' '.join(x.api_return))
api_train = pd.DataFrame(api_train)
api_train.rename(columns={0:'return'},inplace=True)

test['api_return'] = test.return_value.map(str)
api_test = test.groupby(by='file_id').apply(lambda x:' '.join(x.api_return))
api_test = pd.DataFrame(api_test)
api_test.rename(columns={0:'return'},inplace=True)

import networkx as nx
apiSet = list(set(train.api) | set(test.api))

# TF-IDF features
print('tfidf starts')
tfidf = False
if tfidf:
    vec = TFIDF(ngram_range=(1, 4), max_features=300000)
    tfidf_train = vec.fit_transform(df_train['text'])
    tfidf_test = vec.transform(df_test['text'])
    print(tfidf_train.shape, tfidf_test.shape, time.time() - start0)

    sparse.save_npz('./virus_set/tfidf_train.npz', tfidf_train)  # save
    sparse.save_npz('./virus_set/tfidf_test.npz', tfidf_test)  # save

    # TF-IDF features
    # vec = TFIDF(ngram_range=(1, 2), min_df=3, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1)
    # tfidf_train_api_return = vec.fit_transform(df_train['parallel_api'])
    # tfidf_test_api_return = vec.transform(df_test['parallel_api'])
    # print(tfidf_train_api_return.shape, tfidf_test_api_return.shape,time.time()-start0)

    # sparse.save_npz('./virus_set/tfidf_train_api_return.npz', tfidf_train_api_return)  # save
    # sparse.save_npz('./virus_set/tfidf_test_api_return.npz', tfidf_test_api_return)    # save
Example No. 24
			# process the context
			# filter out punctuation
			temp_context = client.annotate(raw_context)
			parsed_context = []
			for s in temp_context.sentence:
				this_sentence = []
				for token in s.token: 
					ts = token.lemma.lower()
					if ts not in PUNCTUATIONS:
						this_sentence.append(ts)
				parsed_sentence = " ".join(this_sentence)
				parsed_context.append(parsed_sentence)

			# train a TFIDF model on the processed context content, without stop words
			unigram_model = TFIDF(input=parsed_context, analyzer='word', dtype=np.float32, stop_words=STOP_WORDS)			
			#process each question in the question & answers set
			for q in set_qass:
				raw_question = q['question']
				qid = q['id']
				
			# process a single question, generating a corresponding questionSpan;
			# additionally, analyze the type of the question:
			# if the question type is WDT, WHAT, or WHICH, we should get the part for substitution
				temp_question = client.annotate(raw_question)
				this_question = []
				size_tokens = len(temp_question.sentence[0].token)
				list_tokens = temp_question.sentence[0].token
				IDENTFY = False
				qtype = 8
				QSpan = questionSpan()
Example No. 25
testData = []
for i in range(len(test['review'])):
    testData.append(' '.join(turnToWordList(test['review'][i])))
print(len(testData))

#print(train.head())
#print(test.head())

from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
print('training TFIDF')
tfidf = TFIDF(min_df=2,
              max_features=1000,
              strip_accents="unicode",
              analyzer="word",
              token_pattern=r"\w{1,}",
              ngram_range=(1, 3),
              use_idf=1,
              smooth_idf=1,
              sublinear_tf=1,
              stop_words="english")

allData = trainData + testData
lentrain = len(trainData)

tfidf.fit(allData)
allData = tfidf.transform(allData)

train_x = allData[:lentrain]
test_x = allData[lentrain:]

print("TF-IDF处理结束")
Example No. 26
from sklearn.model_selection import train_test_split

# Shuffle the data
from sklearn.utils import shuffle
X_shuf, Y_shuf = shuffle(content, data['rumorType'])

# Split the labeled dataset into training and test sets
train_X, test_X, train_y, test_y = train_test_split(X_shuf,
                                                    Y_shuf,
                                                    test_size=0.2,
                                                    random_state=42)

# Build the model
model_tfidf = TFIDF(min_df=5,
                    max_features=5000,
                    ngram_range=(1, 3),
                    use_idf=1,
                    smooth_idf=1)
# Learn the IDF vector
model_tfidf.fit(train_X)
# Transform the documents into a matrix (counts of each feature term):
# rows are documents, columns are feature terms
train_vec = model_tfidf.transform(train_X)

# Train the model
model_SVC = LinearSVC()
clf = CalibratedClassifierCV(model_SVC)
clf.fit(train_vec, train_y)

# Transform the test documents into a matrix
test_vec = model_tfidf.transform(test_X)
# Validation
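A plausible continuation of the validation step, assuming the names above: CalibratedClassifierCV exposes both hard predictions and calibrated probabilities.

print(clf.score(test_vec, test_y))   # mean accuracy on the held-out split
probs = clf.predict_proba(test_vec)  # calibrated per-class probabilities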
Example No. 27
def main():
    sw = list(stopwords.words("english"))
    data_src = "D:\\Reference\\aclImdb"
    tags = ['neg', 'pos']

    gbm_param_grid = {
        'n_estimators': range(5, 20),
        'max_depth': range(6, 20),
        'learning_rate': [.4, .45, .5, .55, .6],
        'colsample_bytree': [.6, .7, .8, .9, 1],
        'min_child_weight': range(1, 6, 2)
    }

    # Training
    x_train = []
    y_train = []
    for tag in tags:
        for aFile in os.listdir(f"{data_src}\\train\\{tag}"):
            with open(f"{data_src}\\train\\{tag}\\{aFile}",
                      "r",
                      encoding="utf-8") as f:
                x_train.append(f.read().strip())
                y_train.append(tags.index(tag))

    tfidf = TFIDF(stop_words=sw)
    x_train_tfidf = tfidf.fit_transform(x_train)

    xgb = XGBClassifier()
    xgb_random = CV(param_distributions=gbm_param_grid,
                    estimator=xgb,
                    scoring="accuracy",
                    verbose=1,
                    n_iter=50,
                    cv=5,
                    n_jobs=-1)
    xgb_random.fit(x_train_tfidf, y_train)

    print("Search log: ", xgb_random.cv_results_)
    print("Best parameters found: ", xgb_random.best_params_)
    print("Best accuracy found: ", xgb_random.best_score_)

    # Testing
    x_test = []
    y_test = []
    for tag in tags:
        for aFile in os.listdir(f"{data_src}\\test\\{tag}"):
            with open(f"{data_src}\\test\\{tag}\\{aFile}",
                      "r",
                      encoding="utf-8") as f:
                x_test.append(f.read().strip())
                y_test.append(tags.index(tag))
    x_test_tfidf = tfidf.transform(x_test)
    y_pred = xgb_random.predict(x_test_tfidf)

    print("Acc:", accuracy_score(y_test, y_pred))
    print("Rec:", recall_score(y_test, y_pred))
    print("Pre:", precision_score(y_test, y_pred))
    print("F1:", f1_score(y_test, y_pred))

    # Save The Model
    dump(tfidf, "model/tfidf.pkl")
    dump(xgb_random, "model/xgb.pkl")
Example No. 28
for i in range(len(data_all)):  # the loop header was missing from this excerpt
    data_all[i] = remove_number(data_all[i])
    data_all[i] = remove_link(data_all[i])
    print(i)

# In[4]:

print(data_all[100])

# In[5]:

from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
tfidf = TFIDF(
    min_df=5,  # minimum document frequency of 5
    max_features=None,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 1),  # character unigrams
    use_idf=1,
    smooth_idf=1,
    sublinear_tf=1)

# In[7]:

all_data = data_all
tfidf.fit(all_data)
all_data = tfidf.transform(all_data)
print(type(all_data))

# In[8]:

print(tfidf.vocabulary_)
Example No. 29
print(train_data[0], '\n')
print(test_data[0])

# ## 1.3 Feature Extraction and Vectorization of text

# In[44]:

from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF

tfidf = TFIDF(
    min_df=2,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),  # bigram model (unigrams and bigrams)
    #ngram_range=(1, 3),  # trigram model
    use_idf=1,
    smooth_idf=1,
    sublinear_tf=1,
    stop_words='english')  # Remove English stop words

# Combine training and test sets for TF-IDF vectorization
data_all = train_data + test_data
len_train = len(train_data)

tfidf.fit(data_all)
data_all = tfidf.transform(data_all)
# Restore to training set and testing set sections
train_x = data_all[:len_train]
test_x = data_all[len_train:]
Example No. 30
sample = [
    "Machine learning is fascinating, it is wonderful",
    "Machine learning is a sensational techonology",
    "Elsa is a popular character"
]

from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
x = vec.fit_transform(sample)

print("features", vec.get_feature_names())

# print("x",x.shape,type(x),x)
# print("x",x.toarray())
# import pandas as pd
# vcresult=pd.DataFrame(x.toarray(),columns=vec.get_feature_names())

from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
vec = TFIDF()
x = vec.fit_transform(sample)
print("x", x)