def test_small_doc(self):
    """Trivial documents yield no features and empty extracted content."""
    # An empty document produces no feature matrix and no blocks.
    self.assertEqual((None, []), kohlschuetter.make_features('<html></html>'))
    self.assertEqual('', kohlschuetter.analyze('<html></html>'))

    # A two-block document is below the minimum size for features,
    # but the block text is still extracted in order.
    doc = '<html> <p>a</p> <div>b</div> </html>'
    feats, doc_blocks = kohlschuetter.make_features(doc)
    self.assertTrue(feats is None)
    self.block_output_tokens(doc_blocks, [['a'], ['b']])
    self.assertEqual('a b', kohlschuetter.analyze(doc))
def test_dragnet_model(self):
    """End-to-end model output matches a manual feature-normalize + predict pipeline.

    NOTE(review): this method name is re-defined later in the class, so only
    the last definition is actually collected by the test runner — consider
    renaming or removing the duplicates.
    """
    params = {'b': 0.2, 'w': [0.4, -0.2, 0.9, 0.8, -0.3, -0.5]}
    block_model = LogisticRegression.load_model(params)
    mean_std = {
        'mean': [0.0, 0.1, 0.2, 0.5, 0.0, 0.3],
        'std': [1.0, 2.0, 0.5, 1.2, 0.75, 1.3]
    }
    koh_features = NormalizedFeature(kohlschuetter_features, mean_std)
    dm = ContentExtractionModel(Blockifier, [koh_features], block_model,
                                threshold=0.5)
    content = dm.analyze(big_html_doc)

    # Make the prediction from individual components: normalize the raw
    # kohlschuetter features by hand, then apply the logistic regression.
    features, blocks = kohlschuetter.make_features(big_html_doc)
    nblocks = len(blocks)
    features_normalized = np.zeros(features.shape)
    for k in range(6):
        features_normalized[:, k] = (
            features[:, k] - mean_std['mean'][k]) / mean_std['std'][k]
    blocks_keep_indices = np.arange(nblocks)[
        block_model.predict(features_normalized) > 0.5]
    actual_content = ' '.join(
        [blocks[index].text for index in blocks_keep_indices])

    # Check that the tokens are the same!
    # Use raw strings for the regex: '\s' is an invalid escape sequence
    # in an ordinary string literal on modern Python.
    self.assertEqual(re.split(r'\s+', actual_content.strip()),
                     re.split(r'\s+', content.strip()))
def test_dragnet_model(self):
    """Model output matches a manual pipeline and the legacy wrapper class.

    NOTE(review): this method name appears three times in the class; only the
    final definition is run — the duplicates should be renamed or merged.
    """
    params = {'b': 0.2, 'w': [0.4, -0.2, 0.9, 0.8, -0.3, -0.5]}
    block_model = LogisticRegression.load_model(params)
    mean_std = {
        'mean': [0.0, 0.1, 0.2, 0.5, 0.0, 0.3],
        'std': [1.0, 2.0, 0.5, 1.2, 0.75, 1.3]
    }
    koh_features = NormalizedFeature(kohlschuetter_features, mean_std)
    dm = ContentExtractionModel(Blockifier, [koh_features], block_model,
                                threshold=0.5)
    content = dm.analyze(big_html_doc)

    # Make the prediction from individual components: normalize the raw
    # kohlschuetter features by hand, then apply the logistic regression.
    features, blocks = kohlschuetter.make_features(big_html_doc)
    nblocks = len(blocks)
    features_normalized = np.zeros(features.shape)
    # range, not xrange: xrange does not exist on Python 3 (NameError).
    for k in range(6):
        features_normalized[:, k] = (
            features[:, k] - mean_std['mean'][k]) / mean_std['std'][k]
    blocks_keep_indices = np.arange(nblocks)[
        block_model.predict(features_normalized) > 0.5]
    actual_content = ' '.join(
        [blocks[index].text for index in blocks_keep_indices])

    # Check that the tokens are the same!  Raw strings for the regex:
    # '\s' is an invalid escape sequence in an ordinary literal.
    self.assertEqual(re.split(r'\s+', actual_content.strip()),
                     re.split(r'\s+', content.strip()))

    # Check that we maintain backward compatibility with the old wrapper.
    from dragnet import DragnetModelKohlschuetterFeatures
    dmkf = DragnetModelKohlschuetterFeatures(block_model, mean_std)
    content_dragnetmodelkohschuetterfeatures = dmkf.analyze(big_html_doc)
    self.assertEqual(
        re.split(r'\s+', actual_content.strip()),
        re.split(r'\s+', content_dragnetmodelkohschuetterfeatures.strip()))
def test_make_features(self):
    """Feature rows hold (link, text) densities of previous/current/next block."""
    s = '<html> <p>first </p> <div> <p>second block with <a href=' '>anchor</a> </p> <p>the third block</p> </div> </html>'
    features, blocks = kohlschuetter.make_features(s)

    # Three blocks, with the anchor text attributed to the middle block.
    self.block_output_tokens(
        blocks,
        [['first'],
         ['second', 'block', 'with', 'anchor'],
         ['the', 'third', 'block']])
    self.link_output_tokens(blocks, [[], ['anchor'], []])

    # Expected per-block densities; a feature row is the concatenation of
    # (link_density, text_density) for previous, current and next block,
    # zero-padded at the document edges.
    text_density = [1.0, 4.0, 3.0]
    link_density = [1.0, 0.25, 1.0 / 3.0]
    expected_rows = [
        [0.0, 0.0,
         link_density[0], text_density[0],
         link_density[1], text_density[1]],
        [link_density[0], text_density[0],
         link_density[1], text_density[1],
         link_density[2], text_density[2]],
        [link_density[1], text_density[1],
         link_density[2], text_density[2],
         0.0, 0.0],
    ]
    for row, expected in enumerate(expected_rows):
        self.assertTrue(np.allclose(features[row, :], expected))
def test_make_features(self):
    """Each feature row is prev/current/next (link, text) density, edge-padded.

    NOTE(review): this duplicates the method name defined earlier in the
    class, so the earlier definition is shadowed.
    """
    s = '<html> <p>first </p> <div> <p>second block with <a href=''>anchor</a> </p> <p>the third block</p> </div> </html>'
    features, blocks = kohlschuetter.make_features(s)

    self.block_output_tokens(
        blocks,
        [['first'],
         ['second', 'block', 'with', 'anchor'],
         ['the', 'third', 'block']])
    self.link_output_tokens(blocks, [[], ['anchor'], []])

    # Per-block densities used to assemble the expected feature rows below.
    td = [1.0, 4.0, 3.0]
    ld = [1.0, 0.25, 1.0 / 3.0]

    # First block: no predecessor, so the leading pair is zero.
    self.assertTrue(np.allclose(
        features[0, :], [0.0, 0.0, ld[0], td[0], ld[1], td[1]]))
    # Middle block: both neighbours present.
    self.assertTrue(np.allclose(
        features[1, :], [ld[0], td[0], ld[1], td[1], ld[2], td[2]]))
    # Last block: no successor, so the trailing pair is zero.
    self.assertTrue(np.allclose(
        features[2, :], [ld[1], td[1], ld[2], td[2], 0.0, 0.0]))
def test_dragnet_model(self):
    """End-to-end model output matches a manual feature-normalize + predict pipeline.

    NOTE(review): this is the third definition of this method name in the
    class; it shadows the earlier two — consider renaming or merging them.
    """
    params = {"b": 0.2, "w": [0.4, -0.2, 0.9, 0.8, -0.3, -0.5]}
    block_model = LogisticRegression.load_model(params)
    mean_std = {"mean": [0.0, 0.1, 0.2, 0.5, 0.0, 0.3],
                "std": [1.0, 2.0, 0.5, 1.2, 0.75, 1.3]}
    koh_features = NormalizedFeature(kohlschuetter_features, mean_std)
    dm = ContentExtractionModel(Blockifier, [koh_features], block_model,
                                threshold=0.5)
    content = dm.analyze(big_html_doc)

    # Make the prediction from individual components: normalize the raw
    # kohlschuetter features by hand, then apply the logistic regression.
    features, blocks = kohlschuetter.make_features(big_html_doc)
    nblocks = len(blocks)
    features_normalized = np.zeros(features.shape)
    # range, not xrange: xrange does not exist on Python 3 (NameError).
    for k in range(6):
        features_normalized[:, k] = (
            features[:, k] - mean_std["mean"][k]) / mean_std["std"][k]
    blocks_keep_indices = np.arange(nblocks)[
        block_model.predict(features_normalized) > 0.5]
    actual_content = " ".join(
        [blocks[index].text for index in blocks_keep_indices])

    # Check that the tokens are the same!  Raw strings for the regex:
    # "\s" is an invalid escape sequence in an ordinary literal.
    self.assertEqual(re.split(r"\s+", actual_content.strip()),
                     re.split(r"\s+", content.strip()))