示例#1
0
def crawlFeedOnePage(feed, queue, crawlDateTime, latestWarc, warcDateTime, latestDateTime, tempdir):
    """Fetch the next queued feed page, archive it, and hand it to parseLinks.

    One URL is taken from ``queue``, downloaded with ``httpc.get``, parsed
    with ``feedparser`` and passed to ``addToWarc``.  ``parseLinks`` is then
    called with the parsed feed, ``feed['url']`` and ``queue``.  The function
    sleeps for ``config['default_sleep_seconds']`` before returning.

    ``crawlDateTime``, ``warcDateTime`` and ``tempdir`` are kept in the
    signature for callers but are not used while the date guard and the Solr
    export are switched off.

    Returns the feed's ``updated`` timestamp as a naive ``datetime``, or the
    incoming ``latestDateTime`` unchanged when the feed carries no parsed
    ``updated`` date.
    """
    url = queue.get()
    userAgent = "Internet Archive OPDS Crawler +http://bookserver.archive.org"

    # A single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("<- %s fetching %s for domain %s" % (time.asctime(), url, feed['domain']))
    data = httpc.get(url, headers={"User-Agent": userAgent})
    print("-> %s fetched %s for domain %s" % (time.asctime(), url, feed['domain']))

    f = feedparser.parse(data)

    #TODO: make new warc if our warc file is too big
    #TODO: only add to warc if not already there.
    # NOTE: the "warcDateTime < dt" guard is turned off while debugging, so
    # every fetched page is archived unconditionally.
    addToWarc(latestWarc, url, data, f, 'application/atom+xml')

    # The updated date may be absent or unparseable; in that case keep the
    # caller's latestDateTime instead of failing on the attribute access.
    t = f.feed.get('updated_parsed')
    if t is not None:
        # First six fields are year, month, day, hour, minute, second.
        latestDateTime = datetime.datetime(*t[:6])

    # just archive, no longer feed solr from this script
    parseLinks(f, feed['url'], queue)

    time.sleep(config['default_sleep_seconds'])

    return latestDateTime
示例#2
0
 def test_delete_(self):
     """PUT a resource, remove it with ``delete_`` and check it is gone.

     Expects status 204 from ``delete_`` and ``httpc.NotFound`` from a
     follow-up GET of the same URL.
     """
     target = self.base_url() + 'killme'
     httpc.put(target, data='killme')
     code, _msg, _body = httpc.delete_(target)
     self.assertEqual(code, 204)
     self.assertRaises(httpc.NotFound, httpc.get, target)
示例#3
0
 def test_put_01_create(self):
     """Create a resource with ``put_`` and read it back.

     Expects status 201, an ``x-put`` entry of ``'hello'`` in the returned
     message, an empty body, and a GET that returns the stored data.
     """
     target = self.base_url() + 'goodbye'
     payload = 'goodbye world'
     code, headers, reply = httpc.put_(target, data=payload)
     self.assertEqual(code, 201)
     self.assertEqual(headers.dict['x-put'], 'hello')
     self.assertEqual(reply, '')
     self.assertEqual(httpc.get(target), payload)
示例#4
0
 def test_put_01_create(self):
     """Create a resource with ``put_`` and read it back.

     Expects status 201, an ``x-put`` entry of ``'hello'`` in the returned
     message, an empty body, and a GET that returns the stored data.
     """
     target = self.base_url() + 'goodbye'
     payload = 'goodbye world'
     code, headers, reply = httpc.put_(target, data=payload)
     self.assertEqual(code, 201)
     self.assertEqual(headers.dict['x-put'], 'hello')
     self.assertEqual(reply, '')
     self.assertEqual(httpc.get(target), payload)
示例#5
0
 def test_get(self):
     data = 'screw you world'
     try:
         response = httpc.get(self.base_url())
         self.fail()
     except httpc.InternalServerError, e:
         self.assertEquals(e.params.response_body, data)
         self.assert_(str(e).count(data))
         self.assert_(repr(e).count(data))
示例#6
0
 def test_get(self):
     data = 'screw you world'
     try:
         response = httpc.get(self.base_url())
         self.fail()
     except httpc.InternalServerError, e:
         self.assertEquals(e.params.response_body, data)
         self.assert_(str(e).count(data))
         self.assert_(repr(e).count(data))
示例#7
0
    def test_ssl_proxy_redirects(self):
        """Check that httpc follows a redirect returned by a proxy.

        A fake proxy listening on port 5505 answers the first request with
        a 302 pointing at ``https://localhost:1234/2`` and the second with a
        200.  The request lines are asserted inside the proxy coroutine.
        (This was broken at one point.)
        """
        def ssl_proxy(sock):
            def answer(expected_request, response):
                # Accept one connection, check its request line, skip the
                # remaining headers and send a canned response.
                conn, addr = sock.accept()
                fd = conn.makefile()
                try:
                    line = request = fd.readline()
                    self.assertEqual(request, expected_request)
                    while line.strip():  # eat request headers
                        line = fd.readline()
                    fd.write(response)
                finally:
                    fd.close()
                    conn.close()

            try:
                # we're not going to actually proxy to localhost:1234,
                # we're just going to return a response on its behalf
                answer('GET https://localhost:1234 HTTP/1.1\r\n',
                       "HTTP/1.0 302 Found\r\n"
                       "Location: https://localhost:1234/2\r\n\r\n")
                # second request, for /2 target
                answer('GET https://localhost:1234/2 HTTP/1.1\r\n',
                       "HTTP/1.0 200 OK\r\n\r\n")
            finally:
                # close the listener even when an assertion above fails
                sock.close()

        server = api.tcp_listener(('0.0.0.0', 5505))
        api.spawn(ssl_proxy, server)
        import os
        previous = os.environ.get('ALL_PROXY')
        os.environ['ALL_PROXY'] = 'localhost:5505'
        try:
            httpc.get('https://localhost:1234', use_proxy=True, max_retries=1)
        finally:
            # os.environ is process-wide; put ALL_PROXY back so the setting
            # does not leak into whatever runs next.
            if previous is None:
                del os.environ['ALL_PROXY']
            else:
                os.environ['ALL_PROXY'] = previous
示例#8
0
    def test_013_empty_return(self):
        """Serve an empty WSGI body over SSL and expect ``''`` from a GET.

        The listener on port 4202 is created with ``test_server.crt`` and
        ``test_server.key`` located next to this file.
        """
        from eventlet import httpc

        def empty_app(environ, start_response):
            start_response("200 OK", [])
            return [""]

        here = os.path.dirname(__file__)
        cert_path = os.path.join(here, 'test_server.crt')
        key_path = os.path.join(here, 'test_server.key')
        listener = api.ssl_listener(('', 4202), cert_path, key_path)
        api.spawn(wsgi.server, listener, empty_app)

        self.assertEqual(httpc.get("https://localhost:4202/foo"), '')
示例#9
0
    def test_ssl_proxy(self):
        """Send an HTTPS GET through a fake proxy listening on port 5505.

        The proxy coroutine asserts the request line it receives and replies
        with a bare 200 on behalf of ``localhost:1234``.
        """
        def ssl_proxy(sock):
            try:
                conn, addr = sock.accept()
                fd = conn.makefile()
                try:
                    line = request = fd.readline()
                    self.assertEqual(
                        request, 'GET https://localhost:1234 HTTP/1.1\r\n')
                    while line.strip():  # eat request headers
                        line = fd.readline()

                    # we're not going to actually proxy to localhost:1234,
                    # we're just going to return a response on its behalf
                    fd.write("HTTP/1.0 200 OK\r\n\r\n")
                finally:
                    fd.close()
                    conn.close()
            finally:
                # Only one connection is served; close the listener so
                # port 5505 is released for later use.
                sock.close()

        server = api.tcp_listener(('0.0.0.0', 5505))
        api.spawn(ssl_proxy, server)
        import os
        previous = os.environ.get('ALL_PROXY')
        os.environ['ALL_PROXY'] = 'localhost:5505'
        try:
            httpc.get('https://localhost:1234', ok=[200], use_proxy=True)
        finally:
            # os.environ is process-wide; put ALL_PROXY back so the setting
            # does not leak into whatever runs next.
            if previous is None:
                del os.environ['ALL_PROXY']
            else:
                os.environ['ALL_PROXY'] = previous
示例#10
0
class TestHttpc302(TestBase, TestCase):
    site_class = Site302

    def test_get_expired(self):
        try:
            httpc.get(self.base_url() + 'expired/hello', max_retries=0)
            self.assert_(False)
        except httpc.Found, err:
            response = err.retry()
        self.assertEquals(response, 'hello world')
        self.assertEquals(
            httpc.get(self.base_url() + 'expired/hello', max_retries=1),
            'hello world')
示例#11
0
    def test_ssl_proxy_redirects(self):
        """Check that httpc follows a redirect returned by a proxy.

        A fake proxy listening on port 5505 answers the first request with
        a 302 pointing at ``https://localhost:1234/2`` and the second with a
        200.  The request lines are asserted inside the proxy coroutine.
        (This was broken at one point.)
        """
        def ssl_proxy(sock):
            def answer(expected_request, response):
                # Accept one connection, check its request line, skip the
                # remaining headers and send a canned response.
                conn, addr = sock.accept()
                fd = conn.makefile()
                try:
                    line = request = fd.readline()
                    self.assertEqual(request, expected_request)
                    while line.strip():  # eat request headers
                        line = fd.readline()
                    fd.write(response)
                finally:
                    fd.close()
                    conn.close()

            try:
                # we're not going to actually proxy to localhost:1234,
                # we're just going to return a response on its behalf
                answer('GET https://localhost:1234 HTTP/1.1\r\n',
                       "HTTP/1.0 302 Found\r\n"
                       "Location: https://localhost:1234/2\r\n\r\n")
                # second request, for /2 target
                answer('GET https://localhost:1234/2 HTTP/1.1\r\n',
                       "HTTP/1.0 200 OK\r\n\r\n")
            finally:
                # close the listener even when an assertion above fails
                sock.close()

        server = api.tcp_listener(('0.0.0.0', 5505))
        api.spawn(ssl_proxy, server)
        import os
        previous = os.environ.get('ALL_PROXY')
        os.environ['ALL_PROXY'] = 'localhost:5505'
        try:
            httpc.get('https://localhost:1234', use_proxy=True, max_retries=1)
        finally:
            # os.environ is process-wide; put ALL_PROXY back so the setting
            # does not leak into whatever runs next.
            if previous is None:
                del os.environ['ALL_PROXY']
            else:
                os.environ['ALL_PROXY'] = previous
示例#12
0
    def test_ssl_proxy(self):
        """Send an HTTPS GET through a fake proxy listening on port 5505.

        The proxy coroutine asserts the request line it receives and replies
        with a bare 200 on behalf of ``localhost:1234``.
        """
        def ssl_proxy(sock):
            try:
                conn, addr = sock.accept()
                fd = conn.makefile()
                try:
                    line = request = fd.readline()
                    self.assertEqual(
                        request, 'GET https://localhost:1234 HTTP/1.1\r\n')
                    while line.strip():  # eat request headers
                        line = fd.readline()

                    # we're not going to actually proxy to localhost:1234,
                    # we're just going to return a response on its behalf
                    fd.write("HTTP/1.0 200 OK\r\n\r\n")
                finally:
                    fd.close()
                    conn.close()
            finally:
                # Only one connection is served; close the listener so
                # port 5505 is released for later use.
                sock.close()

        server = api.tcp_listener(('0.0.0.0', 5505))
        api.spawn(ssl_proxy, server)
        import os
        previous = os.environ.get('ALL_PROXY')
        os.environ['ALL_PROXY'] = 'localhost:5505'
        try:
            httpc.get('https://localhost:1234', ok=[200], use_proxy=True)
        finally:
            # os.environ is process-wide; put ALL_PROXY back so the setting
            # does not leak into whatever runs next.
            if previous is None:
                del os.environ['ALL_PROXY']
            else:
                os.environ['ALL_PROXY'] = previous
示例#13
0
class TestHttpc301(TestBase, TestCase):
    site_class = Site301

    def base_url(self):
        return 'http://localhost:31337/redirect/'

    def test_get(self):
        try:
            httpc.get(self.base_url() + 'hello', max_retries=0)
            self.assert_(False)
        except httpc.MovedPermanently, err:
            response = err.retry()
        self.assertEquals(response, 'hello world')
        self.assertEquals(httpc.get(self.base_url() + 'hello', max_retries=1),
                          'hello world')
示例#14
0
    def test_013_empty_return(self):
        """Serve an empty WSGI body over SSL and expect ``''`` from a GET.

        The listener on port 4202 is created with ``test_server.crt`` and
        ``test_server.key`` located next to this file.
        """
        from eventlet import httpc

        def empty_app(environ, start_response):
            start_response("200 OK", [])
            return [""]

        here = os.path.dirname(__file__)
        cert_path = os.path.join(here, 'test_server.crt')
        key_path = os.path.join(here, 'test_server.key')
        listener = api.ssl_listener(('', 4202), cert_path, key_path)
        api.spawn(wsgi.server, listener, empty_app)

        self.assertEqual(httpc.get("https://localhost:4202/foo"), '')
示例#15
0
 def test_delete(self):
     """PUT a resource, DELETE it and check that it is gone.

     ``httpc.delete`` is expected to return ``''`` and a follow-up GET of
     the same URL to raise ``httpc.NotFound``.
     """
     target = self.base_url() + 'killme'
     httpc.put(target, data='killme')
     self.assertEqual(httpc.delete(target), '')
     self.assertRaises(httpc.NotFound, httpc.get, target)
示例#16
0
 def test_delete(self):
     """PUT a resource, DELETE it and check that it is gone.

     ``httpc.delete`` is expected to return ``''`` and a follow-up GET of
     the same URL to raise ``httpc.NotFound``.
     """
     target = self.base_url() + 'killme'
     httpc.put(target, data='killme')
     self.assertEqual(httpc.delete(target), '')
     self.assertRaises(httpc.NotFound, httpc.get, target)
示例#17
0
 def test_put_02_modify(self):
     """Overwrite the resource created by ``test_put_01_create``.

     Expects status 204 from the second ``put_`` and a GET that returns
     the replacement data.
     """
     self.test_put_01_create()
     target = self.base_url() + 'goodbye'
     replacement = 'i really mean goodbye'
     result = httpc.put_(target, data=replacement)
     self.assertEqual(result[0], 204)
     self.assertEqual(httpc.get(target), replacement)
示例#18
0
 def test_delete_(self):
     """PUT a resource, remove it with ``delete_`` and check it is gone.

     Expects status 204 from ``delete_`` and ``httpc.NotFound`` from a
     follow-up GET of the same URL.
     """
     target = self.base_url() + 'killme'
     httpc.put(target, data='killme')
     code, _msg, _body = httpc.delete_(target)
     self.assertEqual(code, 204)
     self.assertRaises(httpc.NotFound, httpc.get, target)
示例#19
0
 def test_put_nonempty(self):
     """PUT a non-empty body and expect a GET to return the same text."""
     target = self.base_url() + 'nonempty'
     payload = 'nonempty'
     httpc.put(target, data=payload)
     self.assertEqual(httpc.get(target), payload)
示例#20
0
 def test_get(self):
     """GET ``hello`` under the base URL and expect ``'hello world'``."""
     self.assertEqual(httpc.get(self.base_url() + 'hello'), 'hello world')
示例#21
0
 def test_put_nonempty(self):
     """PUT a non-empty body and expect a GET to return the same text."""
     target = self.base_url() + 'nonempty'
     payload = 'nonempty'
     httpc.put(target, data=payload)
     self.assertEqual(httpc.get(target), payload)
示例#22
0
 def test_get_expires(self):
     try:
         httpc.get(self.base_url() + 'expires/hello', max_retries=0)
         self.assert_(False)
     except httpc.Found, err:
         response = err.retry()
示例#23
0
 def test_get_expires(self):
     try:
         httpc.get(self.base_url() + 'expires/hello', max_retries=0)
         self.assert_(False)
     except httpc.Found, err:
         response = err.retry()
示例#24
0
 def test_get(self):
     try:
         httpc.get(self.base_url() + 'hello', max_retries=0)
         self.assert_(False)
     except httpc.MovedPermanently, err:
         response = err.retry()
示例#25
0
        except httpc.Found, err:
            response = err.retry()
        self.assertEquals(response, 'hello world')
        self.assertEquals(
            httpc.get(self.base_url() + 'expired/hello', max_retries=1),
            'hello world')

    def test_get_expires(self):
        try:
            httpc.get(self.base_url() + 'expires/hello', max_retries=0)
            self.assert_(False)
        except httpc.Found, err:
            response = err.retry()
        self.assertEquals(response, 'hello world')
        self.assertEquals(
            httpc.get(self.base_url() + 'expires/hello', max_retries=1),
            'hello world')


class TestHttpc303(TestBase, TestCase):
    site_class = Site303

    def base_url(self):
        return 'http://localhost:31337/redirect/'

    def test_post(self):
        data = 'hello world'
        try:
            response = httpc.post(self.base_url() + 'hello', data=data)
            self.assert_(False)
        except httpc.SeeOther, err:
示例#26
0
        try:
            httpc.get(self.base_url() + 'expired/hello', max_retries=0)
            self.assert_(False)
        except httpc.Found, err:
            response = err.retry()
        self.assertEquals(response, 'hello world')
        self.assertEquals(httpc.get(self.base_url() + 'expired/hello', max_retries=1), 'hello world')

    def test_get_expires(self):
        try:
            httpc.get(self.base_url() + 'expires/hello', max_retries=0)
            self.assert_(False)
        except httpc.Found, err:
            response = err.retry()
        self.assertEquals(response, 'hello world')
        self.assertEquals(httpc.get(self.base_url() + 'expires/hello', max_retries=1), 'hello world')


class TestHttpc303(TestBase, TestCase):
    site_class = Site303

    def base_url(self):
        return 'http://localhost:31337/redirect/'

    def test_post(self):
        data = 'hello world'
        try:
            response = httpc.post(self.base_url() + 'hello', data=data)
            self.assert_(False)
        except httpc.SeeOther, err:
            response = err.retry()
示例#27
0
 def test_get_query(self):
     """GET ``hello`` with a repeated ``foo`` query parameter.

     The expected body is ``hello world`` followed by one ``foo=...`` line
     per value.
     """
     url = self.base_url() + 'hello?foo=bar&foo=quux'
     expected = 'hello worldfoo=bar\nfoo=quux\n'
     self.assertEqual(httpc.get(url), expected)
示例#28
0
 def test_get(self):
     try:
         httpc.get(self.base_url() + 'hello', max_retries=0)
         self.assert_(False)
     except httpc.MovedPermanently, err:
         response = err.retry()
示例#29
0
 def test_put_empty(self):
     """PUT an empty body and expect a GET to return ``''``."""
     target = self.base_url() + 'empty'
     httpc.put(target, data='')
     self.assertEqual(httpc.get(target), '')
示例#30
0
 def test_get_bad_uri(self):
     """GET of the ``b0gu5`` path must raise ``httpc.NotFound``."""
     missing = self.base_url() + 'b0gu5'
     self.assertRaises(httpc.NotFound, httpc.get, missing)
示例#31
0
 def test_put_empty(self):
     """PUT an empty body and expect a GET to return ``''``."""
     target = self.base_url() + 'empty'
     httpc.put(target, data='')
     self.assertEqual(httpc.get(target), '')
示例#32
0
 def test_get_bad_uri(self):
     """GET of the ``b0gu5`` path must raise ``httpc.NotFound``."""
     missing = self.base_url() + 'b0gu5'
     self.assertRaises(httpc.NotFound, httpc.get, missing)
示例#33
0
 def test_get(self):
     """GET ``hello`` under the base URL and expect ``'hello world'``."""
     self.assertEqual(httpc.get(self.base_url() + 'hello'), 'hello world')
示例#34
0
 def test_get_query(self):
     """GET ``hello`` with a repeated ``foo`` query parameter.

     The expected body is ``hello world`` followed by one ``foo=...`` line
     per value.
     """
     url = self.base_url() + 'hello?foo=bar&foo=quux'
     expected = 'hello worldfoo=bar\nfoo=quux\n'
     self.assertEqual(httpc.get(url), expected)
示例#35
0
 def test_put_02_modify(self):
     """Overwrite the resource created by ``test_put_01_create``.

     Expects status 204 from the second ``put_`` and a GET that returns
     the replacement data.
     """
     self.test_put_01_create()
     target = self.base_url() + 'goodbye'
     replacement = 'i really mean goodbye'
     result = httpc.put_(target, data=replacement)
     self.assertEqual(result[0], 204)
     self.assertEqual(httpc.get(target), replacement)