def test_csviter_encoding(self):
    """Rows from latin1- and cp852-declared responses must match the expected text."""
    latin1_body = get_testdata('feeds', 'feed-sample4.csv')
    cp852_body = get_testdata('feeds', 'feed-sample5.csv')

    # feed-sample4.csv is wrapped in a response declared as latin1.
    latin1_response = TextResponse(
        url="http://example.com/", body=latin1_body, encoding='latin1')
    latin1_rows = list(csviter(latin1_response))
    self.assertEqual(latin1_rows, [
        {u'id': u'1', u'name': u'latin1', u'value': u'test'},
        {u'id': u'2', u'name': u'something', u'value': u'\xf1\xe1\xe9\xf3'},
    ])

    # feed-sample5.csv is wrapped in a response declared as cp852.
    cp852_response = TextResponse(
        url="http://example.com/", body=cp852_body, encoding='cp852')
    cp852_rows = list(csviter(cp852_response))
    self.assertEqual(cp852_rows, [
        {u'id': u'1', u'name': u'cp852', u'value': u'test'},
        {u'id': u'2', u'name': u'something',
         u'value': u'\u255a\u2569\u2569\u2569\u2550\u2550\u2557'},
    ])
def test_csviter_wrong_quotechar(self):
    """With no quotechar passed, feed-sample6.csv keeps its single quotes in keys and values."""
    response = TextResponse(
        url="http://example.com/",
        body=get_testdata('feeds', 'feed-sample6.csv'),
    )
    parsed_rows = list(csviter(response))

    # The expected data still carries the literal single quotes, and the
    # third row's value is cut off at u"'foo".
    expected_rows = [
        {u"'id'": u"1", u"'name'": u"'alpha'", u"'value'": u"'foobar'"},
        {u"'id'": u"2", u"'name'": u"'unicode'",
         u"'value'": u"'\xfan\xedc\xf3d\xe9\u203d'"},
        {u"'id'": u"'3'", u"'name'": u"'multi'", u"'value'": u"'foo"},
        {u"'id'": u"4", u"'name'": u"'empty'", u"'value'": u""},
    ]
    self.assertEqual(parsed_rows, expected_rows)
def test_csviter_delimiter_binary_response_assume_utf8_encoding(self):
    """A plain ``Response`` built without an encoding must still yield the expected text rows."""
    comma_separated = get_testdata('feeds', 'feed-sample3.csv')
    tab_separated = comma_separated.replace(b',', b'\t')
    # Response (not TextResponse) is used on purpose: no encoding is passed.
    binary_response = Response(url="http://example.com/", body=tab_separated)

    expected_rows = [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ]
    self.assertEqual(list(csviter(binary_response, delimiter='\t')), expected_rows)
def test_csviter_exception(self):
    """After four rows have been consumed, the iterator must raise ``StopIteration``."""
    response = TextResponse(
        url="http://example.com/",
        body=get_testdata('feeds', 'feed-sample3.csv'),
    )
    row_iterator = csviter(response)

    # Consume the four rows one at a time.
    for _ in range(4):
        next(row_iterator)

    # A fifth call has nothing left to return.
    self.assertRaises(StopIteration, next, row_iterator)
def parse_rows(self, response):
    """Yield the processed results for every CSV row found in *response*.

    Rows are produced by ``csviter`` using this spider's ``delimiter``,
    ``headers`` and ``quotechar``; each row is a dict with one key per
    provided (or detected) header. Every row is handed to ``parse_row``,
    the outcome is wrapped with ``iterate_spider_output`` and then fed to
    ``process_results``, whose items are yielded one by one.

    Pre- and post-processing can be customised by overriding
    ``adapt_response`` and ``process_results`` (``adapt_response`` is not
    called from this method).
    """
    csv_rows = csviter(response, self.delimiter, self.headers, self.quotechar)
    for csv_row in csv_rows:
        parsed = self.parse_row(response, csv_row)
        row_output = iterate_spider_output(parsed)
        processed = self.process_results(response, row_output)
        for item in processed:
            yield item
def test_csviter_exception(self):
    """After four rows have been consumed, the iterator must raise ``StopIteration``."""
    body = get_testdata("feeds", "feed-sample3.csv")
    response = TextResponse(url="http://example.com/", body=body)
    # Named ``rows`` rather than ``iter`` so the built-in is not shadowed.
    rows = csviter(response)

    # Use the built-in next(): it works on Python 2.6+ and Python 3, whereas
    # the ``.next()`` method exists only on Python 2 iterators.
    for _ in range(4):
        next(rows)

    self.assertRaises(StopIteration, next, rows)
def test_csviter_quotechar(self):
    """Passing ``quotechar="'"`` must give clean rows, with ',' and with '|' as delimiter."""
    comma_body = get_testdata('feeds', 'feed-sample6.csv')
    pipe_body = get_testdata('feeds', 'feed-sample6.csv').replace(b',', b'|')

    # Both variants are expected to produce exactly the same rows.
    expected_rows = [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ]

    comma_response = TextResponse(url="http://example.com/", body=comma_body)
    comma_rows = list(csviter(comma_response, quotechar="'"))
    self.assertEqual(comma_rows, expected_rows)

    pipe_response = TextResponse(url="http://example.com/", body=pipe_body)
    pipe_rows = list(csviter(pipe_response, delimiter="|", quotechar="'"))
    self.assertEqual(pipe_rows, expected_rows)
def test_csviter_headers(self):
    """Headers passed explicitly must be used as keys for a body that has no header line."""
    lines = get_testdata('feeds', 'feed-sample3.csv').splitlines()
    # The first line holds the column names; the rest is sent as the body.
    header_line = lines[0]
    data_lines = lines[1:]
    raw_headers = header_line.split(b',')
    body = b'\n'.join(data_lines)

    response = TextResponse(url="http://example.com/", body=body)
    decoded_headers = [name.decode('utf-8') for name in raw_headers]
    parsed_rows = list(csviter(response, headers=decoded_headers))

    self.assertEqual(parsed_rows, [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': u'foo\nbar'},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ])
def test_csviter_falserow(self):
    """The two extra lines appended to the feed must not appear among the parsed rows."""
    original_body = get_testdata('feeds', 'feed-sample3.csv')
    # Append one line with two fields and one with four fields.
    extra_lines = (b'a,b', b'a,b,c,d')
    body = b'\n'.join((original_body,) + extra_lines)

    response = TextResponse(url="http://example.com/", body=body)
    parsed_rows = list(csviter(response))

    expected_rows = [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ]
    self.assertEqual(parsed_rows, expected_rows)
def test_csviter_encoding(self):
    """Rows from latin1- and cp852-declared responses must match the expected text."""
    # Both fixtures are loaded before any response is built.
    latin1_body = get_testdata("feeds", "feed-sample4.csv")
    cp852_body = get_testdata("feeds", "feed-sample5.csv")

    # (body, declared encoding, rows expected back)
    cases = (
        (
            latin1_body,
            "latin1",
            [
                {u"id": u"1", u"name": u"latin1", u"value": u"test"},
                {u"id": u"2", u"name": u"something", u"value": u"\xf1\xe1\xe9\xf3"},
            ],
        ),
        (
            cp852_body,
            "cp852",
            [
                {u"id": u"1", u"name": u"cp852", u"value": u"test"},
                {u"id": u"2", u"name": u"something", u"value": u"\u255a\u2569\u2569\u2569\u2550\u2550\u2557"},
            ],
        ),
    )
    for body, encoding, expected_rows in cases:
        response = TextResponse(url="http://example.com/", body=body, encoding=encoding)
        self.assertEqual(list(csviter(response)), expected_rows)
def test_csviter_encoding(self):
    """Rows from latin1- and cp852-declared responses must match the expected text."""
    latin1_body = get_testdata('feeds', 'feed-sample4.csv')
    cp852_body = get_testdata('feeds', 'feed-sample5.csv')

    # First feed: response declared as latin1.
    latin1_response = TextResponse(
        url="http://example.com/", body=latin1_body, encoding='latin1')
    expected_latin1 = [
        {'id': '1', 'name': 'latin1', 'value': 'test'},
        {'id': '2', 'name': 'something', 'value': '\xf1\xe1\xe9\xf3'},
    ]
    self.assertEqual(list(csviter(latin1_response)), expected_latin1)

    # Second feed: response declared as cp852.
    cp852_response = TextResponse(
        url="http://example.com/", body=cp852_body, encoding='cp852')
    expected_cp852 = [
        {'id': '1', 'name': 'cp852', 'value': 'test'},
        {'id': '2', 'name': 'something', 'value': '\u255a\u2569\u2569\u2569\u2550\u2550\u2557'},
    ]
    self.assertEqual(list(csviter(cp852_response)), expected_cp852)
def test_csviter_delimiter_binary_response_assume_utf8_encoding(self):
    """A plain ``Response`` built without an encoding must still yield the expected text rows."""
    # Bytes literals are used for the replacement: they behave the same on
    # Python 2 (where b"" is str) and are required on Python 3 when the test
    # data is bytes -- text arguments to bytes.replace raise TypeError there.
    body = get_testdata("feeds", "feed-sample3.csv").replace(b",", b"\t")
    # Response (not TextResponse) is used on purpose: no encoding is passed.
    response = Response(url="http://example.com/", body=body)
    csv = csviter(response, delimiter="\t")

    self.assertEqual(
        list(csv),
        [
            {u"id": u"1", u"name": u"alpha", u"value": u"foobar"},
            {u"id": u"2", u"name": u"unicode", u"value": u"\xfan\xedc\xf3d\xe9\u203d"},
            {u"id": u"3", u"name": u"multi", u"value": u"foo\nbar"},
            {u"id": u"4", u"name": u"empty", u"value": u""},
        ],
    )
def parse_rows(self, response):
    """Yield the processed results for every CSV row found in *response*.

    Rows are produced by ``csviter`` using this spider's ``delimiter`` and
    ``headers``; each row is a dict with one key per provided (or detected)
    header. Every row goes through ``parse_row``; a single ``BaseItem`` or
    ``Request`` result is wrapped in a list, and the list/tuple is then fed
    to ``process_results``, whose items are yielded one by one.

    Pre- and post-processing can be customised by overriding
    ``adapt_response`` and ``process_results`` (``adapt_response`` is not
    called from this method).

    Raises:
        TypeError: if ``parse_row`` returns something that is neither a
            ``BaseItem``/``Request`` nor a list or tuple.
    """
    for csv_row in csviter(response, self.delimiter, self.headers):
        parsed = self.parse_row(response, csv_row)
        if isinstance(parsed, (BaseItem, Request)):
            # A lone item/request is treated as a one-element result list.
            parsed = [parsed]
        elif not isinstance(parsed, (list, tuple)):
            raise TypeError('You cannot return an "%s" object from a spider' % type(parsed).__name__)

        for item in self.process_results(response, parsed):
            yield item
def test_csviter_defaults(self):
    """With default arguments, feed-sample3.csv must parse into four all-text rows."""
    response = TextResponse(
        url="http://example.com/",
        body=get_testdata('feeds', 'feed-sample3.csv'),
    )
    parsed_rows = list(csviter(response))

    expected_rows = [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ]
    self.assertEqual(parsed_rows, expected_rows)

    # Equality alone would not catch implicit type coercion, so every key
    # and every value is also checked to be a text string.
    for parsed_row in parsed_rows:
        self.assertTrue(all(isinstance(key, six.text_type) for key in parsed_row))
        self.assertTrue(all(isinstance(val, six.text_type) for val in parsed_row.values()))
def test_csviter_falserow(self):
    """The two extra lines appended to the feed must not appear among the parsed rows."""
    body = get_testdata("feeds", "feed-sample3.csv")
    # Bytes literals are used for the join: they behave the same on Python 2
    # (where b"" is str) and are required on Python 3 when the test data is
    # bytes -- joining bytes with a text separator raises TypeError there.
    # The appended lines have two and four fields respectively.
    body = b"\n".join((body, b"a,b", b"a,b,c,d"))
    response = TextResponse(url="http://example.com/", body=body)
    csv = csviter(response)

    self.assertEqual(
        list(csv),
        [
            {u"id": u"1", u"name": u"alpha", u"value": u"foobar"},
            {u"id": u"2", u"name": u"unicode", u"value": u"\xfan\xedc\xf3d\xe9\u203d"},
            {u"id": u"3", u"name": u"multi", u"value": u"foo\nbar"},
            {u"id": u"4", u"name": u"empty", u"value": u""},
        ],
    )
def test_csviter_headers(self):
    """Headers passed explicitly must be used as keys for a body that has no header line."""
    sample = get_testdata("feeds", "feed-sample3.csv").splitlines()
    # Split and join with bytes literals: identical on Python 2 (b"" is str)
    # and required on Python 3 when the test data is bytes.
    headers, body = sample[0].split(b","), b"\n".join(sample[1:])
    response = TextResponse(url="http://example.com/", body=body)
    # The expected row keys below are text, so the header names are decoded
    # before being handed to csviter.
    csv = csviter(response, headers=[h.decode("utf-8") for h in headers])

    self.assertEqual(
        list(csv),
        [
            {u"id": u"1", u"name": u"alpha", u"value": u"foobar"},
            {u"id": u"2", u"name": u"unicode", u"value": u"\xfan\xedc\xf3d\xe9\u203d"},
            {u"id": u"3", u"name": u"multi", u"value": u"foo\nbar"},
            {u"id": u"4", u"name": u"empty", u"value": u""},
        ],
    )
def parse_rows(self, response):
    """Run ``parse_row`` over each CSV row of *response* and yield what ``process_results`` returns.

    ``csviter`` is called with this spider's ``delimiter`` and ``headers``
    and produces one dict per row, keyed by the provided (or detected)
    headers. ``parse_row`` may return a single ``BaseItem``/``Request``
    (wrapped in a list here) or a list/tuple; anything else is rejected.

    Pre- and post-processing can be customised by overriding
    ``adapt_response`` and ``process_results`` (``adapt_response`` is not
    called from this method).

    Raises:
        TypeError: if ``parse_row`` returns an unsupported type.
    """
    single_result_types = (BaseItem, Request)
    sequence_types = (list, tuple)

    for record in csviter(response, self.delimiter, self.headers):
        outcome = self.parse_row(response, record)
        if isinstance(outcome, single_result_types):
            outcome = [outcome]
        if not isinstance(outcome, sequence_types):
            type_name = type(outcome).__name__
            raise TypeError('You cannot return an "%s" object from a spider' % type_name)
        for processed_item in self.process_results(response, outcome):
            yield processed_item
def test_csviter_defaults(self):
    """With default arguments, feed-sample3.csv must parse into four all-text rows."""
    body = get_testdata("feeds", "feed-sample3.csv")
    response = TextResponse(url="http://example.com/", body=body)
    result = list(csviter(response))

    self.assertEqual(
        result,
        [
            {u"id": u"1", u"name": u"alpha", u"value": u"foobar"},
            {u"id": u"2", u"name": u"unicode", u"value": u"\xfan\xedc\xf3d\xe9\u203d"},
            {u"id": u"3", u"name": u"multi", u"value": u"foo\nbar"},
            {u"id": u"4", u"name": u"empty", u"value": u""},
        ],
    )

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3, so the
    # check works on both without naming the Python 2-only ``unicode``.
    text_type = type(u"")

    # Equality alone would not catch implicit type coercion, so every key
    # and every value is also checked to be a text string. assertTrue is
    # used instead of the deprecated ``assert_`` alias.
    for result_row in result:
        self.assertTrue(all(isinstance(k, text_type) for k in result_row.keys()))
        self.assertTrue(all(isinstance(v, text_type) for v in result_row.values()))
def test_csviter_defaults(self):
    """With default arguments, feed-sample3.csv must parse into four all-``str`` rows."""
    response = TextResponse(
        url="http://example.com/",
        body=get_testdata("feeds", "feed-sample3.csv"),
    )
    parsed_rows = list(csviter(response))

    expected_rows = [
        {"id": "1", "name": "alpha", "value": "foobar"},
        {"id": "2", "name": "unicode", "value": "\xfan\xedc\xf3d\xe9\u203d"},
        {"id": "3", "name": "multi", "value": FOOBAR_NL},
        {"id": "4", "name": "empty", "value": ""},
    ]
    self.assertEqual(parsed_rows, expected_rows)

    # Equality alone would not catch implicit type coercion, so every key
    # and every value is also checked to be a str.
    for parsed_row in parsed_rows:
        self.assertTrue(all(isinstance(key, str) for key in parsed_row))
        self.assertTrue(all(isinstance(val, str) for val in parsed_row.values()))
def test_csviter_delimiter_binary_response_assume_utf8_encoding(self):
    """A plain ``Response`` built without an encoding must still yield the expected text rows."""
    tab_body = get_testdata('feeds', 'feed-sample3.csv').replace(b',', b'\t')
    # Response (not TextResponse) is used on purpose: no encoding is passed.
    raw_response = Response(url="http://example.com/", body=tab_body)
    parsed_rows = list(csviter(raw_response, delimiter='\t'))

    expected_rows = [
        {u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
        {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
        {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
        {u'id': u'4', u'name': u'empty', u'value': u''},
    ]
    self.assertEqual(parsed_rows, expected_rows)
def test_csviter_wrong_quotechar(self):
    """With no quotechar passed, feed-sample6.csv keeps its single quotes in keys and values."""
    sample_body = get_testdata('feeds', 'feed-sample6.csv')
    response = TextResponse(url="http://example.com/", body=sample_body)

    # Each expected row is spelled out separately; note the literal single
    # quotes and the third row's value, which is cut off at u"'foo".
    alpha_row = {u"'id'": u"1", u"'name'": u"'alpha'", u"'value'": u"'foobar'"}
    unicode_row = {
        u"'id'": u"2",
        u"'name'": u"'unicode'",
        u"'value'": u"'\xfan\xedc\xf3d\xe9\u203d'",
    }
    multi_row = {u"'id'": u"'3'", u"'name'": u"'multi'", u"'value'": u"'foo"}
    empty_row = {u"'id'": u"4", u"'name'": u"'empty'", u"'value'": u""}

    self.assertEqual(
        list(csviter(response)),
        [alpha_row, unicode_row, multi_row, empty_row],
    )
def test_csviter_delimiter(self):
    """A tab-separated copy of feed-sample3.csv must parse when ``delimiter='\\t'`` is given."""
    comma_body = get_testdata('feeds', 'feed-sample3.csv')
    tab_body = comma_body.replace(b',', b'\t')
    response = TextResponse(url="http://example.com/", body=tab_body)
    parsed_rows = list(csviter(response, delimiter='\t'))

    expected_rows = [
        {'id': '1', 'name': 'alpha', 'value': 'foobar'},
        {'id': '2', 'name': 'unicode', 'value': '\xfan\xedc\xf3d\xe9\u203d'},
        {'id': '3', 'name': 'multi', 'value': "foo\nbar"},
        {'id': '4', 'name': 'empty', 'value': ''},
    ]
    self.assertEqual(parsed_rows, expected_rows)
def parse_rows(self, response):
    """Yield the processed results for every CSV row found in *response*.

    Rows come from ``csviter`` called with this spider's ``delimiter``,
    ``headers`` and ``quotechar``. Each row is passed to ``parse_row``, the
    outcome is wrapped with ``iterate_spider_output`` and handed to
    ``process_results``, whose items are yielded one by one.
    """
    row_source = csviter(response, self.delimiter, self.headers, self.quotechar)
    for record in row_source:
        row_output = iterate_spider_output(self.parse_row(response, record))
        for processed_item in self.process_results(response, row_output):
            yield processed_item
def test_csviter_quotechar(self):
    """Passing ``quotechar="'"`` must give clean rows, with ',' and with '|' as delimiter."""
    comma_body = get_testdata("feeds", "feed-sample6.csv")
    pipe_body = get_testdata("feeds", "feed-sample6.csv").replace(b",", b"|")

    # Both variants are expected to produce exactly the same rows.
    expected_rows = [
        {"id": "1", "name": "alpha", "value": "foobar"},
        {"id": "2", "name": "unicode", "value": "\xfan\xedc\xf3d\xe9\u203d"},
        {"id": "3", "name": "multi", "value": FOOBAR_NL},
        {"id": "4", "name": "empty", "value": ""},
    ]

    # (body, keyword arguments handed to csviter)
    variants = (
        (comma_body, {"quotechar": "'"}),
        (pipe_body, {"delimiter": "|", "quotechar": "'"}),
    )
    for body, csviter_kwargs in variants:
        response = TextResponse(url="http://example.com/", body=body)
        self.assertEqual(list(csviter(response, **csviter_kwargs)), expected_rows)