# Example 1
def load_data():
    """Load the single-label RCV1-v2 Reuters subset as ``(X, Y)``.

    Reads topic assignments from ``dataset/reuters/rcv1-v2.topics.qrels``,
    keeps only documents labelled with exactly one of the four top-level
    categories, then builds scaled TF-IDF features (2000 terms) from the
    LYRL2004 token files.

    Returns:
        X: float32 ndarray of TF-IDF features, truncated to 685000 rows
           (so the row count divides evenly into 100-sized minibatches).
        Y: int ndarray of category ids (CCAT=0, GCAT=1, MCAT=2, ECAT=3).
    """
    print('loading data....')
    from sklearn.feature_extraction.text import CountVectorizer
    did_to_cat = {}
    cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
    with open('dataset/reuters/rcv1-v2.topics.qrels') as fin:
        for line in fin.readlines():
            line = line.strip().split(' ')
            cat = line[0]
            did = int(line[1])
            if cat in cat_list:
                did_to_cat.setdefault(did, []).append(cat)
        # Keep only single-label documents. Iterate over a snapshot of the
        # keys so deleting during iteration is safe (no copy.copy needed).
        for did in list(did_to_cat.keys()):
            if len(did_to_cat[did]) > 1:
                del did_to_cat[did]

    dat_list = [
        'lyrl2004_tokens_test_pt0.dat', 'lyrl2004_tokens_test_pt1.dat',
        'lyrl2004_tokens_test_pt2.dat', 'lyrl2004_tokens_test_pt3.dat',
        'lyrl2004_tokens_train.dat'
    ]
    data = []
    target = []
    cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}

    def flush(did, doc):
        # Record the finished document if it belongs to the kept subset.
        if did in did_to_cat:
            assert doc != ''
            data.append(doc)
            target.append(cat_to_cid[did_to_cat[did][0]])

    did = None  # sentinel: no document currently being accumulated
    doc = ''
    for dat in dat_list:
        print(dat + '....')
        with open('dataset/reuters/' + dat) as fin:
            for line in fin.readlines():
                if line.startswith('.I'):
                    # A new document starts: flush the previous one, if any.
                    if did is not None:
                        flush(did, doc)
                    did = int(line.strip().split(' ')[1])
                    doc = ''
                elif line.startswith('.W'):
                    assert doc == ''
                else:
                    doc += line
    # Flush the final document of the last file, which has no trailing
    # '.I' marker to trigger it (the original code silently dropped it).
    if did is not None:
        flush(did, doc)

    assert len(data) == len(did_to_cat)

    X = CountVectorizer(dtype=np.float64,
                        max_features=2000).fit_transform(data)
    Y = np.asarray(target)

    from sklearn.feature_extraction.text import TfidfTransformer
    X = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(X)
    # Rescale so each row has norm 200 after l2 normalization.
    X = np.asarray(X.todense()) * np.sqrt(X.shape[1])
    X = preprocessing.normalize(X, norm='l2') * 200
    X = X.astype('float32')
    X = X[:685000]  # for 100 minibatch training
    Y = Y[:685000]
    return X, Y
# Example 2
    def test_model_tfidf_transform(self):
        """Convert a fitted TfidfTransformer to ONNX and compare outputs.

        Sweeps the (norm, smooth_idf, use_idf) parameter grid on a tiny
        5-document corpus; sublinear_tf=True is deliberately skipped (see
        the comment on the break below).
        """
        corpus = numpy.array([
            'This is the first document.',
            'This document is the second document.',
            'And this is the third one.',
            'Is this the first document?',
            "Troisième document en français",
        ]).reshape((5, 1))
        data = CountVectorizer(ngram_range=(1, 1)).fit_transform(
            corpus.ravel()).todense()
        data = data.astype(numpy.float32)

        for sublinear_tf in (False, True):
            if sublinear_tf:
                # scikit-learn applies a log on a matrix
                # but only on strictly positive coefficients, which the
                # converter cannot reproduce; skip the True case entirely.
                break
            for norm in (None, 'l1', 'l2'):
                for smooth_idf in (False, True):
                    for use_idf in (False, True):
                        model = TfidfTransformer(norm=norm,
                                                 use_idf=use_idf,
                                                 smooth_idf=smooth_idf,
                                                 sublinear_tf=sublinear_tf)
                        model.fit(data)
                        model_onnx = convert_sklearn(
                            model, 'TfidfTransformer',
                            [('input', FloatTensorType([1, data.shape[1]]))])
                        self.assertTrue(model_onnx is not None)
                        # Encode the parameter combination in the dump name
                        # so each grid point produces a distinct artifact.
                        suffix = norm.upper() if norm else ''
                        suffix += 'Sub' if sublinear_tf else ''
                        suffix += 'Idf' if use_idf else ''
                        suffix += 'Smooth' if smooth_idf else ''
                        dump_data_and_model(
                            data,
                            model,
                            model_onnx,
                            basename="SklearnTfidfTransform" + suffix,
                            # Operator mul is not implemented in onnxruntime
                            allow_failure=
                            "StrictVersion(onnx.__version__) < StrictVersion('1.2')"
                        )
# Example 3
def make_reuters_data(data_dir):
    """Build and save a shuffled TF-IDF Reuters dataset as a .npy file.

    Reads ``rcv1-v2.topics.qrels`` and the LYRL2004 token files from
    ``data_dir``, keeps only documents with exactly one of the four
    top-level categories, extracts 2000-term TF-IDF features, shuffles
    with a fixed seed, and writes ``reutersidf10k.npy`` containing a dict
    with 'data' (float32 features) and 'label' (int category ids).

    Args:
        data_dir: directory containing the Reuters source files.
    """
    np.random.seed(1234)
    from sklearn.feature_extraction.text import CountVectorizer
    from os.path import join
    did_to_cat = {}
    cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
    with open(join(data_dir, 'rcv1-v2.topics.qrels')) as fin:
        for line in fin.readlines():
            line = line.strip().split(' ')
            cat = line[0]
            did = int(line[1])
            if cat in cat_list:
                did_to_cat.setdefault(did, []).append(cat)
        # Keep only single-label documents; snapshot the keys so deletion
        # during iteration is safe.
        for did in list(did_to_cat.keys()):
            if len(did_to_cat[did]) > 1:
                del did_to_cat[did]

    dat_list = ['lyrl2004_tokens_test_pt0.dat',
                'lyrl2004_tokens_test_pt1.dat',
                'lyrl2004_tokens_test_pt2.dat',
                'lyrl2004_tokens_test_pt3.dat',
                'lyrl2004_tokens_train.dat']
    data = []
    target = []
    cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}

    def flush(did, doc):
        # Record the finished document if it belongs to the kept subset.
        if did in did_to_cat:
            assert doc != ''
            data.append(doc)
            target.append(cat_to_cid[did_to_cat[did][0]])

    did = None  # sentinel: no document currently being accumulated
    doc = ''
    for dat in dat_list:
        with open(join(data_dir, dat)) as fin:
            for line in fin.readlines():
                if line.startswith('.I'):
                    # A new document starts: flush the previous one, if any.
                    if did is not None:
                        flush(did, doc)
                    did = int(line.strip().split(' ')[1])
                    doc = ''
                elif line.startswith('.W'):
                    assert doc == ''
                else:
                    doc += line
    # Flush the final document of the last file, which has no trailing
    # '.I' marker to trigger it (the original code silently dropped it).
    if did is not None:
        flush(did, doc)

    print((len(data), 'and', len(did_to_cat)))
    assert len(data) == len(did_to_cat)

    x = CountVectorizer(dtype=np.float64, max_features=2000).fit_transform(data)
    y = np.asarray(target)

    from sklearn.feature_extraction.text import TfidfTransformer
    x = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(x)
    x = x.astype(np.float32)
    print(x.dtype, x.size)
    x = np.asarray(x.todense()) * np.sqrt(x.shape[1])
    print('todense succeed')

    # Shuffle features and labels together with a fixed-seed permutation.
    p = np.random.permutation(x.shape[0])
    x = x[p]
    y = y[p]
    print('permutation finished')

    assert x.shape[0] == y.shape[0]
    x = x.reshape((x.shape[0], -1))
    np.save(join(data_dir, 'reutersidf10k.npy'), {'data': x, 'label': y})