def test_iter_commands(self):
    """Check the commands extracted from each test document.

    Every ``(line, cmd)`` pair yielded by ``CommandExtractor.iter_commands``
    is compared, in order, with the expected entries stored in ``COMMANDS``
    under the document's URL.
    """
    extractor = CommandExtractor()
    for doc in iter_html_docs(TEST_DATA_DIR):
        url = doc.url.url
        # NOTE(review): zip() stops at the shorter input, so a document that
        # yields fewer (or more) commands than COMMANDS lists still passes.
        # Consider also asserting on the number of commands.
        pairs = zip(extractor.iter_commands(doc), COMMANDS[url])
        for (line, cmd), correct in pairs:
            # Name the document in the failure message instead of printing
            # every URL on each run.
            self.assertEqual((line, cmd), correct, msg=url)
def test_iter_texts(self):
    """Check the number of text lines found in each test document.

    Counts the items yielded by ``CommandExtractor.iter_text_lines`` for
    every document under ``TEST_DATA_DIR`` and compares the total with the
    value stored in ``NR_TEXTS`` under the document's URL.
    """
    text_extractor = CommandExtractor()
    for html_doc in iter_html_docs(TEST_DATA_DIR):
        # Only the count matters; the yielded lines themselves are ignored.
        line_count = sum(1 for _ in text_extractor.iter_text_lines(html_doc))
        self.assertEqual(NR_TEXTS[html_doc.url.url], line_count)
def test_extract_commands(self):
    """Exercise ``extract_commands`` in four scenarios.

    1. All test documents, no second argument: the command keys must equal
       ``MERGED_COMMANDS``.
    2. All test documents with ``'xargs'`` as second argument: only the
       listed commands are expected.
    3. A single document with ``'xargs'``: no commands are expected.
    4. A document whose ``body`` is ``None``: ``nr_docs`` must be 0.
    """
    # 1. Everything merged across the test documents.
    merged = extract_commands(iter_html_docs(TEST_DATA_DIR))
    self.assertEqual(set(merged.commands.keys()), MERGED_COMMANDS)

    # 2. Same documents, restricted via the second argument.
    xargs_only = extract_commands(iter_html_docs(TEST_DATA_DIR), 'xargs')
    expected_xargs = {
        'find /tmp -name "*.tmp" | xargs rm',
        u'find ./music -name "*.mp3" -print0 | xargs -0 ls',
        'find . -name "*.sh" | xargs grep "ksh"',
        'find /tmp -name "*.tmp" -print0 | xargs -0 rm',
        'find . -name "*.sh" -print0 | xargs -0 -I {} mv {} ~/back.scripts',
        u'find ./work -print | xargs grep "profit"',
    }
    self.assertEqual(set(xargs_only.commands.keys()), expected_xargs)

    # 3. A single document (not an iterable of documents) is accepted too.
    single_doc = get_html_doc(TEST_DATA_DIR, 'stackoverflow.com')
    from_single = extract_commands(single_doc, 'xargs')
    self.assertEqual(from_single.commands, {})

    # 4. A document without a body must not be counted.
    bodyless = HtmlDocument('http://stackoverflow.com', b'')
    bodyless.body = None
    from_bodyless = extract_commands(bodyless)
    self.assertEqual(from_bodyless.nr_docs, 0)
def test_cache(self):
    """Round-trip a dict of documents through ``cache``.

    Stores a dict holding one document and a list of documents with the
    keyword ``testing=True``, reads it back with the same keyword, and
    checks that the keys and the single document's URL survived. A lookup
    with a keyword that was never stored must return ``None``.
    """
    doc = get_html_doc('search_engines', 'google.com')
    docs = list(iter_html_docs('search_engines'))
    obj = {'single': doc, 'multi': docs}

    cache.store(obj, testing=True)
    from_cache = cache.get(testing=True)

    # Compare the keys as sets: where ``keys()`` returns a list (Python 2),
    # a direct comparison would depend on dict ordering.
    self.assertEqual(set(from_cache.keys()), set(obj.keys()))
    self.assertEqual(obj['single'].url.url, from_cache['single'].url.url)
    # NOTE(review): the contents of from_cache['multi'] are not verified.

    self.assertIsNone(cache.get(does_not_exist=True))
def test_iter_get(self):
    """Fetch mocked documents through ``download.iter_get``.

    First pass: every request except the last has a mocked response, and
    the URLs of the returned ``HtmlDocument`` objects must equal the URLs
    of exactly those mocked requests. Second pass: the last request is
    mocked as well and passed to ``iter_get`` on its own.
    """
    stored_docs = list(iter_html_docs('cmdextract'))
    all_reqs = [download.Request(stored.url.url) for stored in stored_docs]

    with requests_mock.mock() as mocker:
        # Register a response for every request but the last one.
        for request, stored in zip(all_reqs[:-1], stored_docs[:-1]):
            mocker.get(request.url, content=stored.body)

        responses = list(download.iter_get(all_reqs))
        fetched = [
            item for item in responses
            if isinstance(item, download.HtmlDocument)
        ]
        fetched_urls = {item.url.url.strip('/') for item in fetched}

        # Re-raise decoding problems so they are not hidden inside a
        # DownloadError.
        for item in responses:
            is_error = isinstance(item, download.DownloadError)
            if is_error and isinstance(item.err, UnicodeDecodeError):
                raise item.err

        self.assertEqual(
            fetched_urls, {request.url for request in all_reqs[:-1]})

        # A single request (not a list) must be accepted as well.
        last_req, last_doc = all_reqs[-1], stored_docs[-1]
        mocker.get(last_req.url, content=last_doc.body)
        responses = list(download.iter_get(last_req))
        self.assertEqual(
            [item.url.url.strip('/') for item in responses],
            [last_req.url])
def test_iter_get(self):
    """Check ``download.iter_get`` against mocked HTTP responses.

    Responses are registered for all requests but the last, so only those
    are expected to come back as ``HtmlDocument`` instances. Afterwards the
    last request is registered too and downloaded by itself.
    """
    def bare_url(document):
        # Drop slashes at both ends before comparing with request URLs.
        return document.url.url.strip('/')

    documents = list(iter_html_docs('cmdextract'))
    requests_ = [download.Request(d.url.url) for d in documents]
    mocked_requests = requests_[:-1]
    mocked_documents = documents[:-1]

    with requests_mock.mock() as http_mock:
        for mocked_req, mocked_doc in zip(mocked_requests, mocked_documents):
            http_mock.get(mocked_req.url, content=mocked_doc.body)

        results = list(download.iter_get(requests_))
        succeeded = [
            bare_url(result) for result in results
            if isinstance(result, download.HtmlDocument)
        ]

        # Surface UnicodeDecodeErrors wrapped in a DownloadError; other
        # download errors are tolerated here.
        for result in results:
            if not isinstance(result, download.DownloadError):
                continue
            if isinstance(result.err, UnicodeDecodeError):
                raise result.err

        expected_urls = set(req.url for req in mocked_requests)
        self.assertEqual(set(succeeded), expected_urls)

        # Now mock the remaining request and fetch it on its own.
        final_request = requests_[-1]
        http_mock.get(final_request.url, content=documents[-1].body)
        results = list(download.iter_get(final_request))
        self.assertEqual(
            [bare_url(result) for result in results], [final_request.url])
def test_html_document(self):
    """Check parsing and dict round-tripping of ``HtmlDocument``.

    For every document in ``'cmdextract'`` the parsed tree must have a
    truthy root ``tag``, and rebuilding the document from its ``to_dict()``
    output must preserve the URL.
    """
    for original in iter_html_docs('cmdextract'):
        self.assertTrue(original.tree.tag)

        as_dict = original.to_dict()
        restored = download.HtmlDocument.from_dict(as_dict)
        self.assertEqual(restored.url.url, original.url.url)
def test_iter_commands(self):
    """Compare extracted commands with the expected ones per document.

    Pairs each ``(line, cmd)`` yielded by ``CommandExtractor.iter_commands``
    with the entry at the same position in ``COMMANDS`` for the document's
    URL and asserts that they are equal.
    """
    command_extractor = CommandExtractor()
    for html_doc in iter_html_docs(TEST_DATA_DIR):
        found = command_extractor.iter_commands(html_doc)
        expected = COMMANDS[html_doc.url.url]
        # NOTE(review): zip() stops at the shorter input, so a difference in
        # the number of commands is not detected by this loop.
        for (line, cmd), wanted in zip(found, expected):
            self.assertEqual((line, cmd), wanted)