def test_parses_clubweb09(self):
    """Parse the ClueWeb09 English sample dump and spot-check its records.

    Verifies the position reported by ``tell()`` before the first record
    and after each of the first two records, the URI of each of those
    records, how their raw bodies start and end, and that the dump yields
    100 records in total.
    """
    # Open the fixture in a ``with`` block so the handle is closed even
    # when an assertion fails part-way through the test.
    with open(fixtures.dumppath('ClueWeb09_English_Sample')) as dump:
        warc = WARCParser(dump)
        self.assertEqual(0, warc.tell())

        d1 = warc.next()
        self.assertEqual(21894, warc.tell())  # TODO: check 21894
        self.assertEqual('http://www.smartwebby.com/DreamweaverTemplates/templates/business_general_template59.asp', d1.uri)
        self.assertTrue(d1.raw.startswith('<!DOCTYPE HTML PUBLIC'))
        self.assertTrue(d1.raw.endswith('<!-- InstanceEnd --></html>'))

        d2 = warc.next()
        self.assertEqual(43359, warc.tell())  # TODO: check 43359
        self.assertEqual('http://www.smartwebby.com/DreamweaverTemplates/templates/business_telecom_template71.asp', d2.uri)
        self.assertTrue(d2.raw.startswith('<!DOCTYPE HTML PUBLIC'))
        self.assertTrue(d2.raw.endswith('<!-- InstanceEnd --></html>'))

        # Total of 100 docs, but we already iterated over 2.
        self.assertEqual(100, len(list(warc)) + 2)
def setUpClass(klass):
    """Build the shared fixtures for an integration test class.

    Does nothing when ``klass`` is named ``IntegrationTestCase``.
    Otherwise it stores a ``Spec``, a ``FreequeryClient`` and a ``Docset``
    on the class, calls ``klass.clean_up()``, adds every dump listed in
    ``klass.dumps`` to the docset and saves it, then runs the index and
    rank steps when ``klass.index`` / ``klass.rank`` are truthy.

    NOTE(review): presumably bound as a classmethod; the decorator is not
    visible in this excerpt -- confirm.
    """
    if klass.__name__ == 'IntegrationTestCase':
        return

    spec = Spec(klass.__name__)
    klass.spec = spec
    klass.fqclient = FreequeryClient(spec)

    # docset
    docset = Docset(spec.docset_name)
    klass.docset = docset
    klass.clean_up()
    for name in klass.dumps:
        docset.add_dump(name, dumppath(name))
    docset.save()

    # index
    if klass.index:
        klass.fqclient.index()

    # rank
    if not klass.rank:
        return
    klass.fqclient.linkparse()
    # Classes may set ``niter``; 2 is used when they do not.
    klass.fqclient.rank(niter=getattr(klass, 'niter', 2))
import os
import unittest
import StringIO

from freequery.graph.links import LinkFile, LinkFileOutputStream
from freequery.formats.warc import WARCParser
from freequery.test import fixtures

# Contents of the "small1-links" fixture, read once at import time and
# shared by both test cases below.
with open(fixtures.dumppath("small1-links"), "rb") as lf:
    small1_links = lf.read()


class TestLinkFile(unittest.TestCase):
    """Tests for reading a link file with ``LinkFile``."""

    def test_parses_file1(self):
        """The parsed link file matches the links of the "small1" docs."""
        # keepends=True so each line handed to LinkFile keeps its newline.
        linkfile = LinkFile(small1_links.splitlines(True))
        doclinks = dict((doc.uri, list(doc.link_uris)) for doc in linkfile)
        exp_doclinks = dict(
            (uri, list(doc.link_uris))
            for (uri, doc) in fixtures.dumpdocs("small1").items())
        self.assertEqual(exp_doclinks, doclinks)


class TestLinkFileOutputStream(unittest.TestCase):
    """Tests for writing a link file with ``LinkFileOutputStream``."""

    def test_writes_file1(self):
        """Adding the three "small1" docs reproduces the fixture exactly."""
        out = StringIO.StringIO()
        writer = LinkFileOutputStream(out)
        docs = fixtures.dumpdocs("small1")
        # The order of add() calls is fixed because the output is compared
        # against the fixture contents as a whole.
        for uri in ("http://example.com/",
                    "http://example.com/about",
                    "http://example.com/contact"):
            writer.add(uri, docs[uri].link_uris)
        self.assertEqual(small1_links, out.getvalue())