Exemplo n.º 1
0
 def test_iter_commands(self):
     """Check the commands extracted from every test document.

     Each ``(line, cmd)`` pair yielded by ``CommandExtractor.iter_commands``
     is compared with the entry at the same position in
     ``COMMANDS[doc.url.url]``.
     """
     extractor = CommandExtractor()
     for doc in iter_html_docs(TEST_DATA_DIR):
         # NOTE(review): zip() stops at the shorter input, so a missing or
         # surplus command is not detected here -- confirm that is intended.
         for (line, cmd), correct in zip(extractor.iter_commands(doc),
                                         COMMANDS[doc.url.url]):
             self.assertEqual((line, cmd), correct)
 def test_iter_texts(self):
     """Check how many text lines are yielded for every test document.

     The number of items produced by ``CommandExtractor.iter_text_lines``
     must equal ``NR_TEXTS[doc.url.url]``.
     """
     extractor = CommandExtractor()
     for doc in iter_html_docs(TEST_DATA_DIR):
         # Only the count matters; the yielded items are discarded.
         text_count = sum(1 for _ in extractor.iter_text_lines(doc))
         self.assertEqual(NR_TEXTS[doc.url.url], text_count)
    def test_extract_commands(self):
        """Exercise ``extract_commands`` on the test documents.

        Covers four calls: all documents without a second argument, all
        documents with ``'xargs'`` as second argument, a single document
        with ``'xargs'``, and a document whose ``body`` is ``None``.
        """
        # All documents, no second argument.
        merged = extract_commands(iter_html_docs(TEST_DATA_DIR))
        self.assertEqual(set(merged.commands.keys()), MERGED_COMMANDS)

        # All documents, 'xargs' passed as second argument.
        expected_xargs = {
            'find /tmp -name "*.tmp" | xargs rm',
            u'find ./music -name "*.mp3" -print0 | xargs -0 ls',
            'find . -name "*.sh" | xargs grep "ksh"',
            'find /tmp -name "*.tmp" -print0 | xargs -0 rm',
            'find . -name "*.sh" -print0 | xargs -0 -I {} mv {} ~/back.scripts',
            u'find ./work -print | xargs grep "profit"',
        }
        with_xargs = extract_commands(iter_html_docs(TEST_DATA_DIR), 'xargs')
        self.assertEqual(set(with_xargs.commands.keys()), expected_xargs)

        # A single document for which no command is expected with 'xargs'.
        single_doc = get_html_doc(TEST_DATA_DIR, 'stackoverflow.com')
        from_single = extract_commands(single_doc, 'xargs')
        self.assertEqual(from_single.commands, {})

        # A document whose body was reset to None must not be counted.
        bodyless = HtmlDocument('http://stackoverflow.com', b'')
        bodyless.body = None
        from_bodyless = extract_commands(bodyless)
        self.assertEqual(from_bodyless.nr_docs, 0)
Exemplo n.º 4
0
    def test_cache(self):
        """Store an object with ``cache.store`` and read it back.

        The stored object is a dict holding one document and a list of
        documents; the value read back must have the same keys and the same
        URL for the single document. A lookup with different keyword
        arguments must return ``None``.
        """
        single_doc = get_html_doc('search_engines', 'google.com')
        all_docs = list(iter_html_docs('search_engines'))
        payload = {'single': single_doc, 'multi': all_docs}

        cache.store(payload, testing=True)
        restored = cache.get(testing=True)

        self.assertEqual(restored.keys(), payload.keys())
        self.assertEqual(payload['single'].url.url,
                         restored['single'].url.url)

        # Nothing was stored under these keyword arguments.
        self.assertIsNone(cache.get(does_not_exist=True))
Exemplo n.º 5
0
    def test_cache(self):
        """Round-trip a dict of documents through the cache.

        After ``cache.store(..., testing=True)``, ``cache.get(testing=True)``
        must return a mapping with the same keys whose ``'single'`` entry
        has the same URL; ``cache.get(does_not_exist=True)`` must be ``None``.
        """
        stored = {
            'single': get_html_doc('search_engines', 'google.com'),
            'multi': list(iter_html_docs('search_engines')),
        }

        cache.store(stored, testing=True)
        loaded = cache.get(testing=True)

        self.assertEqual(loaded.keys(), stored.keys())
        stored_url = stored['single'].url.url
        loaded_url = loaded['single'].url.url
        self.assertEqual(stored_url, loaded_url)

        # A lookup with other keyword arguments yields no cached value.
        self.assertIsNone(cache.get(does_not_exist=True))
Exemplo n.º 6
0
    def test_iter_get(self):
        """Fetch mocked documents through ``download.iter_get``.

        First pass: mocks are registered for every request except the last
        one, and the URLs of the ``HtmlDocument`` results must equal the URLs
        of the mocked requests. Second pass: the last request is mocked too
        and passed on its own to ``iter_get``.
        """
        fixtures = list(iter_html_docs('cmdextract'))
        all_reqs = [download.Request(fx.url.url) for fx in fixtures]
        mocked_reqs = all_reqs[:-1]
        with requests_mock.mock() as m:
            # Register a mocked response for all but the last request.
            for req, fx in zip(mocked_reqs, fixtures[:-1]):
                m.get(req.url, content=fx.body)
            responses = list(download.iter_get(all_reqs))
            fetched_urls = {
                resp.url.url.strip('/') for resp in responses
                if isinstance(resp, download.HtmlDocument)
            }
            # Surface decoding problems instead of hiding them behind a
            # set mismatch.
            for resp in responses:
                if (isinstance(resp, download.DownloadError)
                        and isinstance(resp.err, UnicodeDecodeError)):
                    raise resp.err
            self.assertEqual(fetched_urls, {req.url for req in mocked_reqs})

            # Now mock the last request as well and fetch it on its own.
            last_req = all_reqs[-1]
            m.get(last_req.url, content=fixtures[-1].body)
            responses = list(download.iter_get(last_req))
            self.assertEqual(
                [resp.url.url.strip('/') for resp in responses],
                [last_req.url])
    def test_iter_get(self):
        """Check ``download.iter_get`` against mocked HTTP responses.

        All requests but the last are mocked for the first call, so only
        their URLs may show up among the ``HtmlDocument`` results. The last
        request is then mocked and fetched in a second, single-request call.
        """
        documents = list(iter_html_docs('cmdextract'))
        requests_list = [download.Request(d.url.url) for d in documents]
        with requests_mock.mock() as m:
            # Every request except the final one gets a mocked body.
            for request, document in zip(requests_list[:-1], documents[:-1]):
                m.get(request.url, content=document.body)

            results = list(download.iter_get(requests_list))
            ok_urls = set()
            for result in results:
                if isinstance(result, download.HtmlDocument):
                    ok_urls.add(result.url.url.strip('/'))

            # Re-raise decoding errors so they fail the test directly.
            for result in results:
                if not isinstance(result, download.DownloadError):
                    continue
                if isinstance(result.err, UnicodeDecodeError):
                    raise result.err

            wanted_urls = {request.url for request in requests_list[:-1]}
            self.assertEqual(ok_urls, wanted_urls)

            # Second call: a single request, now mocked as well.
            final_request = requests_list[-1]
            m.get(final_request.url, content=documents[-1].body)
            results = list(download.iter_get(final_request))
            result_urls = [result.url.url.strip('/') for result in results]
            self.assertEqual(result_urls, [final_request.url])
Exemplo n.º 8
0
    def test_html_document(self):
        """Check ``HtmlDocument`` trees and the ``to_dict``/``from_dict`` pair.

        Every document yielded by ``iter_html_docs('cmdextract')`` must have
        a ``tree`` whose ``tag`` is truthy. The last document is converted
        with ``to_dict`` and rebuilt with ``from_dict``; the copy must keep
        the original URL.
        """
        docs = list(iter_html_docs('cmdextract'))
        # Fail with a clear assertion, not a NameError on an unbound loop
        # variable, when no documents are found.
        self.assertTrue(docs, 'no test documents found')
        for doc in docs:
            self.assertTrue(doc.tree.tag)

        # NOTE(review): only the last document is round-tripped -- confirm
        # whether every document should be.
        last_doc = docs[-1]
        doc_cp = download.HtmlDocument.from_dict(last_doc.to_dict())
        self.assertEqual(doc_cp.url.url, last_doc.url.url)
 def test_iter_commands(self):
     """Compare extracted commands with the expected ones, document by document.

     Pairs yielded by ``CommandExtractor.iter_commands`` are matched by
     position against ``COMMANDS[doc.url.url]``.
     """
     extractor = CommandExtractor()
     for doc in iter_html_docs(TEST_DATA_DIR):
         found = extractor.iter_commands(doc)
         expected = COMMANDS[doc.url.url]
         # zip() pairs entries by position and stops at the shorter input.
         for (src_line, command), wanted in zip(found, expected):
             self.assertEqual((src_line, command), wanted)
    def test_html_document(self):
        """Check ``HtmlDocument`` trees and the ``to_dict``/``from_dict`` pair.

        Every document yielded by ``iter_html_docs('cmdextract')`` must have
        a ``tree`` whose ``tag`` is truthy. The last document is converted
        with ``to_dict`` and rebuilt with ``from_dict``; the copy must keep
        the original URL.
        """
        docs = list(iter_html_docs('cmdextract'))
        # Fail with a clear assertion, not a NameError on an unbound loop
        # variable, when no documents are found.
        self.assertTrue(docs, 'no test documents found')
        for doc in docs:
            self.assertTrue(doc.tree.tag)

        # NOTE(review): only the last document is round-tripped -- confirm
        # whether every document should be.
        last_doc = docs[-1]
        doc_cp = download.HtmlDocument.from_dict(last_doc.to_dict())
        self.assertEqual(doc_cp.url.url, last_doc.url.url)