示例#1
0
 def test_parser_sample(self):
     """Parse a header plus one warcinfo record and check every field.

     The record has to round-trip through ``tostring`` unchanged, and
     each of its eleven columns is compared against the matching
     attribute of the parsed entry.
     """
     header = ' CDX N b a m s k r M S V g'
     record = (
         'warcinfo:/wikipedia.warc.gz/'
         'archive-commons.0.0.1-SNAPSHOT-201202102659-python '
         '20131109194250 '
         'warcinfo:/wikipedia.warc.gz/'
         'archive-commons.0.0.1-SNAPSHOT-20120112102659-python '
         'warc-info - 2IGTQCWS2K2D3QYFZZZUCMIHHVSXMYGU - - 338 0 '
         'wikipedia.warc.gz'
     )
     # The two URL columns differ only in the digits after "SNAPSHOT-".
     massaged = ('warcinfo:/wikipedia.warc.gz/'
                 'archive-commons.0.0.1-SNAPSHOT-201202102659-python')
     original = ('warcinfo:/wikipedia.warc.gz/'
                 'archive-commons.0.0.1-SNAPSHOT-20120112102659-python')

     reader = cdx_reader()
     for line in (header, record):
         reader.lineReceived(line)

     entry = reader.cdx_entries[0]
     self.assertIsInstance(entry, cdx_entry)
     self.assertEqual(entry.tostring(reader.field_order), record)

     # Column-by-column checks, in the order the header lists them.
     self.assertEqual(entry.massaged_url, massaged)
     self.assertEqual(entry.date, '20131109194250')
     self.assertEqual(entry.original_url, original)
     self.assertEqual(entry.mime_type, 'warc-info')
     self.assertEqual(entry.response_code, '-')
     self.assertEqual(entry.new_style_checksum,
                      '2IGTQCWS2K2D3QYFZZZUCMIHHVSXMYGU')
     self.assertEqual(entry.redirect, '-')
     self.assertEqual(entry.meta_tags, '-')
     self.assertEqual(entry.compressed_record_size, '338')
     self.assertEqual(entry.compressed_arc_file_offset, '0')
     self.assertEqual(entry.file_name, 'wikipedia.warc.gz')
示例#2
0
 def test_parser_long_field_order(self):
     """A header with more fields than the record pads output with ``-``.

     The header declares eleven fields but the record supplies only nine
     values; the rendered entry is expected to end in two ``-`` columns.
     """
     header = ' CDX N b a m s k r M S V g'
     short_record = '1 2 3 4 5 6 7 8 9'

     reader = cdx_reader()
     reader.lineReceived(header)
     self.assertEqual(reader.field_order, 'N b a m s k r M S V g'.split())

     reader.lineReceived(short_record)
     rendered = reader.cdx_entries[0].tostring(reader.field_order)
     self.assertEqual(rendered, '1 2 3 4 5 6 7 8 9 - -')
示例#3
0
 def test_parser_short_field_order(self):
     """A header with fewer fields than the record drops the extras.

     The header declares six fields while the record carries eleven
     values; only the first six are expected in the rendered entry.
     """
     header = ' CDX N b a m s k'
     long_record = '1 2 3 4 5 6 7 8 9 10 11'

     reader = cdx_reader()
     reader.lineReceived(header)
     self.assertEqual(reader.field_order, 'N b a m s k'.split())

     reader.lineReceived(long_record)
     rendered = reader.cdx_entries[0].tostring(reader.field_order)
     self.assertEqual(rendered, '1 2 3 4 5 6')
示例#4
0
 def test_parser_duplicate(self):
     """Repeated header letters all render the last value given for them.

     ``N``, ``M`` and ``V`` each appear twice in the header; every
     occurrence is expected to show the value from the later column
     (11, 8 and 10 respectively).
     """
     header = ' CDX N b a m s M r M V V N'
     record = '1 2 3 4 5 6 7 8 9 10 11'

     reader = cdx_reader()
     reader.lineReceived(header)
     self.assertEqual(reader.field_order, 'N b a m s M r M V V N'.split())

     reader.lineReceived(record)
     rendered = reader.cdx_entries[0].tostring(reader.field_order)
     self.assertEqual(rendered, '11 2 3 4 5 8 7 8 10 10 11')
示例#5
0
 def test_parser_good(self):
     """A record matching the header's eleven fields round-trips as-is."""
     header = ' CDX N b a m s k r M S V g'
     record = '1 2 3 4 5 6 7 8 9 10 11'

     reader = cdx_reader()
     reader.lineReceived(header)
     self.assertEqual(reader.field_order, 'N b a m s k r M S V g'.split())

     reader.lineReceived(record)
     rendered = reader.cdx_entries[0].tostring(reader.field_order)
     self.assertEqual(rendered, record)
示例#6
0
 def test_parser_fake_fields(self):
     """Columns under the letters ``z``, ``q``, ``X`` and ``x`` render as ``-``.

     The header keeps all eleven letters in ``field_order``, but the
     values in those four positions are expected to be replaced by ``-``
     on output (presumably because the parser does not know the letters).
     """
     header = ' CDX N b z m s q r X S V x'
     record = '1 2 3 4 5 6 7 8 9 10 11'

     reader = cdx_reader()
     reader.lineReceived(header)
     self.assertEqual(reader.field_order, 'N b z m s q r X S V x'.split())

     reader.lineReceived(record)
     rendered = reader.cdx_entries[0].tostring(reader.field_order)
     self.assertEqual(rendered, '1 2 - 4 5 - 7 - 9 10 -')