def load_data():
    """Load the Reuters RCV1-v2 corpus as a dense TF-IDF feature matrix.

    Reads the topic assignments ('rcv1-v2.topics.qrels') and the LYRL2004
    tokenized document files from the hard-coded ``dataset/reuters/``
    directory, keeps only documents assigned to exactly one of the four
    top-level categories (CCAT/GCAT/MCAT/ECAT), vectorizes them with a
    2000-feature TF-IDF representation, rescales, and truncates to the
    first 685000 rows.

    Returns:
        X: float32 ndarray of shape (n_docs, 2000); each row is
           L2-normalized then scaled by 200.
        Y: int ndarray of category ids (CCAT=0, GCAT=1, MCAT=2, ECAT=3).

    Relies on module-level ``copy``, ``np`` (numpy) and ``preprocessing``
    (sklearn) being imported elsewhere in this file.
    """
    print('loading data....')
    from sklearn.feature_extraction.text import CountVectorizer
    # Map document id -> list of top-level categories it was assigned to.
    did_to_cat = {}
    cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
    with open('dataset/reuters/rcv1-v2.topics.qrels') as fin:
        for line in fin.readlines():
            # assumes each qrels line is "<category> <doc-id> ..." — TODO confirm
            line = line.strip().split(' ')
            cat = line[0]
            did = int(line[1])
            if cat in cat_list:
                did_to_cat[did] = did_to_cat.get(did, []) + [cat]
    # Drop documents that belong to more than one category so every kept
    # document has a single unambiguous label.  Iterate over a shallow
    # copy because entries are deleted from did_to_cat during iteration.
    copy_dc = copy.copy(did_to_cat)
    for did in copy_dc.keys():
        if len(did_to_cat[did]) > 1:
            del did_to_cat[did]
    dat_list = [
        'lyrl2004_tokens_test_pt0.dat',
        'lyrl2004_tokens_test_pt1.dat',
        'lyrl2004_tokens_test_pt2.dat',
        'lyrl2004_tokens_test_pt3.dat',
        'lyrl2004_tokens_train.dat'
    ]
    data = []
    target = []
    cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}
    # Unbind `did` (left over from the loops above) so the
    # `'did' in locals()` sentinel check below is False until the first
    # '.I' header line has been seen.  Fragile but intentional.
    del did
    for dat in dat_list:
        print(dat + '....')
        with open('dataset/reuters/' + dat) as fin:
            for line in fin.readlines():
                if line.startswith('.I'):
                    # A new document header: flush the previously
                    # accumulated document (if any) before resetting.
                    if 'did' in locals():
                        assert doc != ''
                        if did in did_to_cat.keys():
                            data.append(doc)
                            target.append(cat_to_cid[did_to_cat[did][0]])
                    did = int(line.strip().split(' ')[1])
                    doc = ''
                elif line.startswith('.W'):
                    # '.W' marks the start of the document body; nothing
                    # should have been accumulated yet for this doc.
                    assert doc == ''
                else:
                    doc += line
    # NOTE(review): the very last document of the final file is never
    # flushed (flushing only happens on the next '.I' line).  The assert
    # below therefore relies on that last document not being a kept
    # single-category document — confirm against the dataset files.
    assert len(data) == len(did_to_cat)
    X = CountVectorizer(dtype=np.float64, max_features=2000).fit_transform(data)
    Y = np.asarray(target)
    from sklearn.feature_extraction.text import TfidfTransformer
    X = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(X)
    # Densify and rescale: multiply by sqrt(n_features), re-normalize each
    # row to unit L2 norm, then scale by 200.
    X = np.asarray(X.todense()) * np.sqrt(X.shape[1])
    X = preprocessing.normalize(X, norm='l2') * 200
    X = X.astype('float32')
    X = X[:685000]  # for 100 minibatch training
    Y = Y[:685000]
    return X, Y
def test_model_tfidf_transform(self):
    """Check ONNX conversion of scikit-learn's TfidfTransformer.

    Builds a small bag-of-words matrix from a five-document corpus,
    then sweeps the TfidfTransformer hyper-parameters (norm, use_idf,
    smooth_idf), converting each fitted model to ONNX and dumping the
    model together with the input data for runtime comparison.

    Fix over the previous version: removed the unused local
    ``dt = data.copy()`` (assigned but never read).
    """
    corpus = numpy.array([
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
        "Troisième document en français",
    ]).reshape((5, 1))
    data = CountVectorizer(ngram_range=(1, 1)).fit_transform(
        corpus.ravel()).todense()
    data = data.astype(numpy.float32)
    for sublinear_tf in (False, True):
        if sublinear_tf:
            # scikit-learn applies a log on a matrix
            # but only on strictly positive coefficients,
            # so sublinear_tf=True is skipped.  `break` works here only
            # because True is the last value of the tuple.
            break
        for norm in (None, 'l1', 'l2'):
            for smooth_idf in (False, True):
                for use_idf in (False, True):
                    model = TfidfTransformer(norm=norm,
                                             use_idf=use_idf,
                                             smooth_idf=smooth_idf,
                                             sublinear_tf=sublinear_tf)
                    model.fit(data)
                    model_onnx = convert_sklearn(
                        model, 'TfidfTransformer',
                        [('input', FloatTensorType([1, data.shape[1]]))])
                    self.assertTrue(model_onnx is not None)
                    # Encode the parameter combination into the dump
                    # basename so each case is distinguishable on disk.
                    suffix = norm.upper() if norm else ''
                    suffix += 'Sub' if sublinear_tf else ''
                    suffix += 'Idf' if use_idf else ''
                    suffix += 'Smooth' if smooth_idf else ''
                    dump_data_and_model(
                        data, model, model_onnx,
                        basename="SklearnTfidfTransform" + suffix,
                        # Operator mul is not implemented in onnxruntime
                        allow_failure=
                        "StrictVersion(onnx.__version__) < StrictVersion('1.2')"
                    )
def make_reuters_data(data_dir):
    """Build a shuffled Reuters RCV1-v2 TF-IDF dataset and save it to disk.

    Reads 'rcv1-v2.topics.qrels' and the 'lyrl2004_tokens_*.dat' files
    under *data_dir*, keeps only documents assigned to exactly one of the
    four top-level categories (CCAT/GCAT/MCAT/ECAT), vectorizes them with
    a 2000-feature TF-IDF representation, shuffles rows with a fixed
    seed, and saves ``{'data': x, 'label': y}`` to
    ``<data_dir>/reutersidf10k.npy``.

    Args:
        data_dir: directory containing the qrels and token files.

    Relies on module-level ``np`` (numpy) being imported elsewhere.
    """
    np.random.seed(1234)  # fixed seed so the saved permutation is reproducible
    from sklearn.feature_extraction.text import CountVectorizer
    from os.path import join
    # Map document id -> list of top-level categories it was assigned to.
    did_to_cat = {}
    cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
    with open(join(data_dir, 'rcv1-v2.topics.qrels')) as fin:
        for line in fin.readlines():
            # assumes each qrels line is "<category> <doc-id> ..." — TODO confirm
            line = line.strip().split(' ')
            cat = line[0]
            did = int(line[1])
            if cat in cat_list:
                did_to_cat[did] = did_to_cat.get(did, []) + [cat]
    # Drop documents with more than one category so every kept document
    # has a single unambiguous label; iterate over a snapshot of the keys
    # because entries are deleted during iteration.
    for did in list(did_to_cat.keys()):
        if len(did_to_cat[did]) > 1:
            del did_to_cat[did]

    dat_list = ['lyrl2004_tokens_test_pt0.dat',
                'lyrl2004_tokens_test_pt1.dat',
                'lyrl2004_tokens_test_pt2.dat',
                'lyrl2004_tokens_test_pt3.dat',
                'lyrl2004_tokens_train.dat']
    data = []
    target = []
    cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}
    # Unbind `did` (left over from the loop above) so the
    # `'did' in locals()` sentinel check below is False until the first
    # '.I' header line has been seen.  Fragile but intentional.
    del did
    for dat in dat_list:
        with open(join(data_dir, dat)) as fin:
            for line in fin.readlines():
                if line.startswith('.I'):
                    # A new document header: flush the previously
                    # accumulated document (if any) before resetting.
                    if 'did' in locals():
                        assert doc != ''
                        if did in did_to_cat:
                            data.append(doc)
                            target.append(cat_to_cid[did_to_cat[did][0]])
                    did = int(line.strip().split(' ')[1])
                    doc = ''
                elif line.startswith('.W'):
                    # '.W' marks the start of the document body; nothing
                    # should have been accumulated yet for this doc.
                    assert doc == ''
                else:
                    doc += line
    # NOTE(review): the very last document of the final file is never
    # flushed (flushing only happens on the next '.I' line); the assert
    # below relies on that document not being a kept single-category
    # document — confirm against the dataset files.
    print((len(data), 'and', len(did_to_cat)))
    assert len(data) == len(did_to_cat)

    x = CountVectorizer(dtype=np.float64, max_features=2000).fit_transform(data)
    y = np.asarray(target)

    from sklearn.feature_extraction.text import TfidfTransformer
    x = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(x)
    x = x.astype(np.float32)
    print(x.dtype, x.size)
    y = y  # NOTE(review): no-op self-assignment, kept as-is from original
    # Densify and rescale each row by sqrt(n_features).
    x = np.asarray(x.todense()) * np.sqrt(x.shape[1])
    print('todense succeed')

    # Shuffle documents and labels with the same permutation.
    p = np.random.permutation(x.shape[0])
    x = x[p]
    y = y[p]
    print('permutation finished')

    assert x.shape[0] == y.shape[0]
    x = x.reshape((x.shape[0], -1))
    # np.save stores the dict as a 0-d object array; load with
    # np.load(..., allow_pickle=True).item().
    np.save(join(data_dir, 'reutersidf10k.npy'), {'data': x, 'label': y})