def crawlFeedOnePage(feed, queue, crawlDateTime, latestWarc, warcDateTime,
                     latestDateTime, tempdir):
    """Fetch, archive and link-scan the next queued page of a feed.

    Takes one URL from ``queue``, downloads it with ``httpc.get``, parses
    the body with ``feedparser``, appends the page to ``latestWarc`` via
    ``addToWarc`` and passes the parsed feed to ``parseLinks`` together
    with ``feed['url']`` and ``queue``. Sleeps for
    ``config['default_sleep_seconds']`` before returning.

    ``crawlDateTime``, ``warcDateTime`` and ``tempdir`` are kept in the
    signature for callers but are not used at the moment: the
    date-comparison and Solr steps they served are switched off.

    Returns the feed's ``updated`` timestamp as a naive ``datetime``. If
    the feed carries no parseable ``updated`` value, ``latestDateTime`` is
    returned unchanged instead of raising.
    """
    url = queue.get()
    print("<- %s fetching %s for domain %s" % (time.asctime(), url, feed['domain']))
    data = httpc.get(url, headers={"User-Agent": "Internet Archive OPDS Crawler +http://bookserver.archive.org"})
    print("-> %s fetched %s for domain %s" % (time.asctime(), url, feed['domain']))

    f = feedparser.parse(data)

    # The updated date may be absent (attribute missing) or unparseable
    # (None); in both cases the page is still archived but the caller's
    # latestDateTime is left untouched.
    t = getattr(f.feed, 'updated_parsed', None)
    dt = None
    if t is not None:
        dt = datetime.datetime(*t[:6])

    #TODO: make new warc if our warc file is too big
    #TODO: only add to warc if not already there.
    # The "only archive when the feed is newer than the warc"
    # (warcDateTime < dt) check is currently disabled, so every fetched
    # page is archived unconditionally.
    addToWarc(latestWarc, url, data, f, 'application/atom+xml')
    if dt is not None:
        latestDateTime = dt

    # just archive, no longer feed solr from this script
    parseLinks(f, feed['url'], queue)
    time.sleep(config['default_sleep_seconds'])

    return latestDateTime
def test_delete_(self):
    # delete_() hands back a (status, msg, body) triple; deleting a
    # resource that was just PUT must report 204, and a later GET of the
    # same URL must raise NotFound.
    url = self.base_url() + 'killme'
    httpc.put(url, data='killme')
    status, msg, body = httpc.delete_(url)
    self.assertEqual(status, 204)
    self.assertRaises(httpc.NotFound, httpc.get, url)
def test_put_01_create(self):
    # put_() returns (status, msg, body). Creating a new resource is
    # expected to answer 201 with an 'x-put: hello' header and an empty
    # body, and the stored data must be readable back with get().
    url = self.base_url() + 'goodbye'
    data = 'goodbye world'
    status, msg, body = httpc.put_(url, data=data)
    self.assertEqual(status, 201)
    self.assertEqual(msg.dict['x-put'], 'hello')
    self.assertEqual(body, '')
    self.assertEqual(httpc.get(url), data)
def test_get(self):
    # A GET of the base URL is expected to raise InternalServerError; the
    # response body must be available on the exception's params and must
    # appear in both its str() and repr().
    data = 'screw you world'
    try:
        httpc.get(self.base_url())
    except httpc.InternalServerError as e:
        self.assertEqual(e.params.response_body, data)
        self.assertTrue(data in str(e))
        self.assertTrue(data in repr(e))
    else:
        self.fail('expected httpc.InternalServerError')
def test_ssl_proxy_redirects(self):
    # make sure that if the proxy returns a redirect, that httpc
    # successfully follows it (this was broken at one point)
    import os

    def answer_one(sock, expected_request, reply):
        # Accept a single connection, verify its request line, swallow
        # the request headers and send back the canned reply.
        conn, addr = sock.accept()
        fd = conn.makefile()
        try:
            line = request = fd.readline()
            self.assertEqual(request, expected_request)
            while line.strip():  # eat request headers
                line = fd.readline()
            fd.write(reply)
        finally:
            fd.close()
            conn.close()

    def ssl_proxy(sock):
        try:
            # we're not going to actually proxy to localhost:1234,
            # we're just going to return a response on its behalf
            answer_one(
                sock,
                'GET https://localhost:1234 HTTP/1.1\r\n',
                "HTTP/1.0 302 Found\r\nLocation: https://localhost:1234/2\r\n\r\n")
            # second request, for /2 target
            answer_one(
                sock,
                'GET https://localhost:1234/2 HTTP/1.1\r\n',
                "HTTP/1.0 200 OK\r\n\r\n")
        finally:
            # release the listening port even if an assertion above failed
            sock.close()

    server = api.tcp_listener(('0.0.0.0', 5505))
    api.spawn(ssl_proxy, server)

    # Point httpc at the fake proxy only for the duration of the request,
    # so the setting does not leak into other tests.
    previous = os.environ.get('ALL_PROXY')
    os.environ['ALL_PROXY'] = 'localhost:5505'
    try:
        httpc.get('https://localhost:1234', use_proxy=True, max_retries=1)
    finally:
        if previous is None:
            del os.environ['ALL_PROXY']
        else:
            os.environ['ALL_PROXY'] = previous
def test_013_empty_return(self):
    # A WSGI app that returns a single empty string, served over an SSL
    # listener on port 4202, must come back from httpc.get as ''.
    from eventlet import httpc

    def wsgi_app(environ, start_response):
        start_response("200 OK", [])
        return [""]

    # certificate and key live next to this test module
    here = os.path.dirname(__file__)
    certificate_file = os.path.join(here, 'test_server.crt')
    private_key_file = os.path.join(here, 'test_server.key')

    sock = api.ssl_listener(('', 4202), certificate_file, private_key_file)
    api.spawn(wsgi.server, sock, wsgi_app)

    res = httpc.get("https://localhost:4202/foo")
    self.assertEqual(res, '')
def test_ssl_proxy(self):
    # An https GET issued with use_proxy=True must reach the proxy named
    # in ALL_PROXY as a request for the absolute URL.
    import os

    def ssl_proxy(sock):
        try:
            conn, addr = sock.accept()
            fd = conn.makefile()
            try:
                line = request = fd.readline()
                self.assertEqual(request, 'GET https://localhost:1234 HTTP/1.1\r\n')
                while line.strip():  # eat request headers
                    line = fd.readline()

                # we're not going to actually proxy to localhost:1234,
                # we're just going to return a response on its behalf
                fd.write("HTTP/1.0 200 OK\r\n\r\n")
            finally:
                fd.close()
                conn.close()
        finally:
            # free the listening port once the single request is served
            sock.close()

    server = api.tcp_listener(('0.0.0.0', 5505))
    api.spawn(ssl_proxy, server)

    # Set the proxy only for the duration of the request so the setting
    # does not leak into other tests.
    previous = os.environ.get('ALL_PROXY')
    os.environ['ALL_PROXY'] = 'localhost:5505'
    try:
        httpc.get('https://localhost:1234', ok=[200], use_proxy=True)
    finally:
        if previous is None:
            del os.environ['ALL_PROXY']
        else:
            os.environ['ALL_PROXY'] = previous
class TestHttpc302(TestBase, TestCase):
    # Runs the httpc client against the Site302 test site.
    site_class = Site302

    def test_get_expired(self):
        # With max_retries=0 the redirect must surface as httpc.Found,
        # and retrying from the exception must return the body; with
        # max_retries=1 the same GET must return the body directly.
        url = self.base_url() + 'expired/hello'
        try:
            httpc.get(url, max_retries=0)
        except httpc.Found as err:
            response = err.retry()
        else:
            self.fail('expected httpc.Found')
        self.assertEqual(response, 'hello world')
        self.assertEqual(httpc.get(url, max_retries=1), 'hello world')
def test_ssl_proxy_redirects(self):
    # make sure that if the proxy returns a redirect, that httpc
    # successfully follows it (this was broken at one point)
    import os

    def serve_request(sock, request_line, response):
        # Handle exactly one proxied request: check the request line,
        # drain the headers, then write the prepared response.
        conn, addr = sock.accept()
        fd = conn.makefile()
        try:
            line = request = fd.readline()
            self.assertEqual(request, request_line)
            while line.strip():  # eat request headers
                line = fd.readline()
            fd.write(response)
        finally:
            fd.close()
            conn.close()

    def ssl_proxy(sock):
        try:
            # we're not going to actually proxy to localhost:1234,
            # we're just going to return a response on its behalf
            serve_request(
                sock,
                'GET https://localhost:1234 HTTP/1.1\r\n',
                "HTTP/1.0 302 Found\r\nLocation: https://localhost:1234/2\r\n\r\n")
            # second request, for /2 target
            serve_request(
                sock,
                'GET https://localhost:1234/2 HTTP/1.1\r\n',
                "HTTP/1.0 200 OK\r\n\r\n")
        finally:
            # always give the port back, even after a failed assertion
            sock.close()

    server = api.tcp_listener(('0.0.0.0', 5505))
    api.spawn(ssl_proxy, server)

    # Restore ALL_PROXY afterwards so other tests are not routed through
    # this fake proxy.
    saved_proxy = os.environ.get('ALL_PROXY')
    os.environ['ALL_PROXY'] = 'localhost:5505'
    try:
        httpc.get('https://localhost:1234', use_proxy=True, max_retries=1)
    finally:
        if saved_proxy is None:
            del os.environ['ALL_PROXY']
        else:
            os.environ['ALL_PROXY'] = saved_proxy
class TestHttpc301(TestBase, TestCase):
    # Runs the httpc client against the Site301 test site.
    site_class = Site301

    def base_url(self):
        return 'http://localhost:31337/redirect/'

    def test_get(self):
        # With max_retries=0 the redirect must surface as
        # httpc.MovedPermanently, and retrying from the exception must
        # return the body; with max_retries=1 the same GET must return
        # the body directly.
        url = self.base_url() + 'hello'
        try:
            httpc.get(url, max_retries=0)
        except httpc.MovedPermanently as err:
            response = err.retry()
        else:
            self.fail('expected httpc.MovedPermanently')
        self.assertEqual(response, 'hello world')
        self.assertEqual(httpc.get(url, max_retries=1), 'hello world')
def test_delete(self):
    # delete() of a resource that was just PUT must return an empty
    # body, and a later GET of the same URL must raise NotFound.
    url = self.base_url() + 'killme'
    httpc.put(url, data='killme')
    self.assertEqual(httpc.delete(url), '')
    self.assertRaises(httpc.NotFound, httpc.get, url)
def test_delete(self):
    # Store a resource, delete it (expecting an empty response body),
    # then confirm that fetching it again raises NotFound.
    target = self.base_url() + 'killme'
    httpc.put(target, data='killme')
    self.assertEqual(httpc.delete(target), '')
    self.assertRaises(httpc.NotFound, httpc.get, target)
def test_put_02_modify(self):
    # Re-runs the create test so the resource exists, then overwrites
    # it: the second PUT is expected to answer 204 and the new data must
    # be what get() returns.
    self.test_put_01_create()
    url = self.base_url() + 'goodbye'
    data = 'i really mean goodbye'
    status = httpc.put_(url, data=data)[0]
    self.assertEqual(status, 204)
    self.assertEqual(httpc.get(url), data)
def test_delete_(self):
    # Store a resource, remove it with delete_() (which returns status,
    # msg and body), expect status 204, then confirm that fetching it
    # again raises NotFound.
    target = self.base_url() + 'killme'
    httpc.put(target, data='killme')
    status, msg, body = httpc.delete_(target)
    self.assertEqual(status, 204)
    self.assertRaises(httpc.NotFound, httpc.get, target)
def test_put_nonempty(self):
    # Data stored with put() must be returned unchanged by get().
    url = self.base_url() + 'nonempty'
    data = 'nonempty'
    httpc.put(url, data=data)
    self.assertEqual(httpc.get(url), data)
def test_get(self):
    # A plain GET of 'hello' is expected to return 'hello world'.
    response = httpc.get(self.base_url() + 'hello')
    self.assertEqual(response, 'hello world')
def test_get_expires(self):
    # With max_retries=0 the redirect must surface as httpc.Found;
    # retrying from the exception must then return the body.
    url = self.base_url() + 'expires/hello'
    try:
        httpc.get(url, max_retries=0)
    except httpc.Found as err:
        response = err.retry()
    else:
        self.fail('expected httpc.Found')
    # check the retried response instead of discarding it
    self.assertEqual(response, 'hello world')
def test_get(self):
    # With max_retries=0 the redirect must surface as
    # httpc.MovedPermanently; retrying from the exception must then
    # return the body.
    url = self.base_url() + 'hello'
    try:
        httpc.get(url, max_retries=0)
    except httpc.MovedPermanently as err:
        response = err.retry()
    else:
        self.fail('expected httpc.MovedPermanently')
    # check the retried response instead of discarding it
    self.assertEqual(response, 'hello world')
except httpc.Found, err: response = err.retry() self.assertEquals(response, 'hello world') self.assertEquals( httpc.get(self.base_url() + 'expired/hello', max_retries=1), 'hello world') def test_get_expires(self): try: httpc.get(self.base_url() + 'expires/hello', max_retries=0) self.assert_(False) except httpc.Found, err: response = err.retry() self.assertEquals(response, 'hello world') self.assertEquals( httpc.get(self.base_url() + 'expires/hello', max_retries=1), 'hello world') class TestHttpc303(TestBase, TestCase): site_class = Site303 def base_url(self): return 'http://localhost:31337/redirect/' def test_post(self): data = 'hello world' try: response = httpc.post(self.base_url() + 'hello', data=data) self.assert_(False) except httpc.SeeOther, err:
try: httpc.get(self.base_url() + 'expired/hello', max_retries=0) self.assert_(False) except httpc.Found, err: response = err.retry() self.assertEquals(response, 'hello world') self.assertEquals(httpc.get(self.base_url() + 'expired/hello', max_retries=1), 'hello world') def test_get_expires(self): try: httpc.get(self.base_url() + 'expires/hello', max_retries=0) self.assert_(False) except httpc.Found, err: response = err.retry() self.assertEquals(response, 'hello world') self.assertEquals(httpc.get(self.base_url() + 'expires/hello', max_retries=1), 'hello world') class TestHttpc303(TestBase, TestCase): site_class = Site303 def base_url(self): return 'http://localhost:31337/redirect/' def test_post(self): data = 'hello world' try: response = httpc.post(self.base_url() + 'hello', data=data) self.assert_(False) except httpc.SeeOther, err: response = err.retry()
def test_get_query(self):
    # A GET with a repeated query parameter is expected to return
    # 'hello world' followed by one 'key=value' line per parameter, in
    # request order.
    response = httpc.get(self.base_url() + 'hello?foo=bar&foo=quux')
    self.assertEqual(response, 'hello worldfoo=bar\nfoo=quux\n')
def test_put_empty(self):
    # Storing an empty payload must round-trip as an empty string.
    url = self.base_url() + 'empty'
    httpc.put(url, data='')
    self.assertEqual(httpc.get(url), '')
def test_get_bad_uri(self):
    # Requesting the 'b0gu5' path is expected to raise httpc.NotFound.
    def fetch_bogus():
        return httpc.get(self.base_url() + 'b0gu5')

    self.assertRaises(httpc.NotFound, fetch_bogus)