# bm25vetorizer.py
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 21 14:42:53 2017
@author: yangyonghuan
"""
import numpy as np
import scipy.sparse as sp
import numbers
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.fixes import bincount
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.pairwise import cosine_similarity
class BM25Vectorizer(TransformerMixin, BaseEstimator):
    """Turn raw documents into a matrix of BM25-weighted term features.

    Raw term counts are produced by a ``CountVectorizer`` (exposed as
    ``tf_vectorizer``) and then re-weighted, for every term ``t`` that occurs
    in document ``d``, as::

        idf(t) * tf(t, d) * (k1 + 1) / (tf(t, d) + k1 * (1 - b + b * |d| / avgdl))

    where ``idf(t) = log(n_docs / df(t)) + 1``, ``|d|`` is the summed term
    count of ``d`` and ``avgdl`` is the mean ``|d|`` over the fitted corpus.

    - k1: scales the document-length term in the denominator above.
    - b: weight of the ``|d| / avgdl`` ratio inside that term.
    - norm: optional row normalisation passed to
      ``sklearn.preprocessing.normalize``; falsy means no normalisation.
    - every other argument is forwarded unchanged to ``CountVectorizer``.
    """

    # NOTE: the mixin is listed before BaseEstimator so BaseEstimator stays
    # last in the MRO, as scikit-learn expects for its estimators.

    def __init__(self, k1=1.5, b=0.75, norm=None, input='content',
                 encoding='utf-8', decode_error='strict', strip_accents=None,
                 lowercase=True, preprocessor=None, tokenizer=None,
                 analyzer='word', stop_words=None,
                 token_pattern=r"(?u)\b\w\w+\b", ngram_range=(1, 1),
                 max_df=1.0, min_df=1, max_features=None, vocabulary=None,
                 binary=False, dtype=np.int64):
        # Every constructor argument is stored under its own name because
        # BaseEstimator.get_params() looks each one up with getattr().
        self.k1 = k1
        self.b = b
        self.norm = norm
        self.input = input
        self.encoding = encoding
        self.decode_error = decode_error
        self.strip_accents = strip_accents
        self.lowercase = lowercase
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer
        self.analyzer = analyzer
        self.stop_words = stop_words
        self.token_pattern = token_pattern
        self.ngram_range = ngram_range
        self.max_df = max_df
        self.min_df = min_df
        self.max_features = max_features
        self.vocabulary = vocabulary
        self.binary = binary
        self.dtype = dtype
        self.tf_vectorizer = self._build_tf_vectorizer()

    def _build_tf_vectorizer(self):
        """Create the CountVectorizer described by the current parameters."""
        return CountVectorizer(input=self.input,
                               encoding=self.encoding,
                               decode_error=self.decode_error,
                               strip_accents=self.strip_accents,
                               lowercase=self.lowercase,
                               preprocessor=self.preprocessor,
                               tokenizer=self.tokenizer,
                               stop_words=self.stop_words,
                               token_pattern=self.token_pattern,
                               ngram_range=self.ngram_range,
                               analyzer=self.analyzer,
                               max_df=self.max_df,
                               min_df=self.min_df,
                               max_features=self.max_features,
                               vocabulary=self.vocabulary,
                               binary=self.binary,
                               dtype=self.dtype)

    def _learn_statistics(self, counts):
        """Store the idf weights and length statistics of a count matrix."""
        counts = sp.csr_matrix(counts)
        n_samples, n_features = counts.shape

        # Document frequency: number of stored entries in each column.
        df = np.bincount(counts.indices, minlength=n_features)
        # A term of a fixed vocabulary may never occur in the corpus; clamp
        # its df to 1 so the idf stays finite instead of dividing by zero.
        idf = np.log(float(n_samples) / np.maximum(df, 1)) + 1.0
        self._idf_diag = sp.spdiags(idf, diags=0, m=n_features,
                                    n=n_features, format='csr')

        # Length of every training document and the corpus average.
        doc_len = np.asarray(counts.sum(axis=1), dtype=np.float64)
        self._doc_len = doc_len.reshape(n_samples, 1)
        # float() keeps this a true division on Python 2 as well.
        self._avgdl = float(doc_len.sum()) / n_samples

    def _score(self, counts):
        """Apply the learned BM25 weighting to a count matrix."""
        n_features = counts.shape[1]
        expected_n_features = self._idf_diag.shape[0]
        if n_features != expected_n_features:
            raise ValueError("Input has n_features=%d while the model"
                             " has been trained with n_features=%d"
                             % (n_features, expected_n_features))

        # Preserve a float dtype; promote counts / binary flags to float64.
        if np.issubdtype(counts.dtype, np.floating):
            dtype = counts.dtype
        else:
            dtype = np.float64
        # Work on a copy so the caller's count matrix is left untouched.
        tf = sp.csr_matrix(counts, dtype=dtype, copy=True)

        # Lengths come from the documents being scored, not the training set.
        doc_len = np.asarray(tf.sum(axis=1), dtype=np.float64).ravel()
        # An all-empty training corpus has avgdl == 0; avoid dividing by it.
        avgdl = self._avgdl if self._avgdl > 0 else 1.0
        length_norm = self.k1 * (1.0 - self.b + self.b * doc_len / avgdl)

        # Zero counts score zero, so only the stored entries need updating:
        # repeat each row's normaliser once per stored entry of that row.
        per_entry_norm = np.repeat(length_norm, np.diff(tf.indptr))
        weighted = tf.data * (self.k1 + 1.0) / (tf.data + per_entry_norm)
        tf.data = weighted.astype(dtype, copy=False)

        # Multiplying by the diagonal idf matrix scales every column.
        score = tf.dot(self._idf_diag)
        if self.norm:
            score = normalize(score, norm=self.norm, copy=False)
        return score

    def fit(self, X, y=None):
        """
        - Learn the vocabulary, the idf weights and the average document
          length from the corpus X (an iterable of raw documents)
        - y is ignored; it is accepted so the class can be used in a
          scikit-learn pipeline
        - Return self
        """
        # Rebuild the counter so parameters changed after construction
        # (e.g. through set_params) are honoured.
        self.tf_vectorizer = self._build_tf_vectorizer()
        counts = self.tf_vectorizer.fit_transform(X)
        self._learn_statistics(counts)
        return self

    def transform(self, X):
        """
        - Convert the corpus X into its BM25 vector-space representation
          using the vocabulary and statistics learned by 'fit'
        - Return a sparse CSR matrix where each row is one document and
          each column is one feature
        - Raise ValueError if the number of features differs from the
          fitted model
        """
        counts = self.tf_vectorizer.transform(X)
        return self._score(counts)

    def fit_transform(self, X, y=None):
        """
        - Fit the BM25 model on the corpus X and return its vector-space
          representation; the corpus is vectorized only once
        - y is ignored; it is accepted so the class can be used in a
          scikit-learn pipeline
        - Return a sparse CSR matrix where each row is one document and
          each column is one feature
        """
        self.tf_vectorizer = self._build_tf_vectorizer()
        counts = self.tf_vectorizer.fit_transform(X)
        # Statistics are learned before scoring; _score works on a copy.
        self._learn_statistics(counts)
        return self._score(counts)

    def get_vocabulary(self):
        """
        - Return the list of feature names (unique terms) learned from the
          corpus during 'fit', ordered by column index
        """
        # get_feature_names() was removed from recent scikit-learn releases;
        # prefer its replacement and fall back for old installations.
        names_out = getattr(self.tf_vectorizer, 'get_feature_names_out', None)
        if names_out is not None:
            return list(names_out())
        return self.tf_vectorizer.get_feature_names()
if __name__ == '__main__':
    # Demo: BM25-vectorize the training split of four 20-newsgroups topics.
    newsgroup_names = ['alt.atheism', 'talk.religion.misc',
                       'comp.graphics', 'sci.space']
    stripped_parts = ('headers', 'footers', 'quotes')

    print("Loading 20 newsgroups dataset for categories:")
    print(newsgroup_names or "all")

    # Both splits are fetched with identical options; only 'subset' differs.
    fetch_options = dict(categories=newsgroup_names, shuffle=True,
                         random_state=42, remove=stripped_parts)
    data_train = fetch_20newsgroups(subset='train', **fetch_options)
    # The test split is loaded but not used further in this demo.
    data_test = fetch_20newsgroups(subset='test', **fetch_options)
    print('data loaded')

    # The targets (data_train.target / data_test.target) are not needed here.
    vectorizer = BM25Vectorizer()
    print(vectorizer.fit_transform(data_train.data))