Exemplo n.º 1
0
def crawlFeedOnePage(feed, queue, crawlDateTime, latestWarc, warcDateTime, latestDateTime, tempdir):
    """Fetch one queued feed page, archive it, and queue the links it contains.

    Takes the next URL from ``queue``, downloads it with ``httpc.get``,
    parses the body with ``feedparser``, passes the page to ``addToWarc``
    and then calls ``parseLinks`` with the parsed feed, ``feed['url']`` and
    ``queue``.  Sleeps for ``config['default_sleep_seconds']`` before
    returning.

    ``crawlDateTime``, ``warcDateTime`` and ``tempdir`` are kept in the
    signature for existing callers but are not read by this function.
    Because the freshness check is currently disabled, the incoming
    ``latestDateTime`` is always replaced.

    Returns a ``datetime.datetime`` built from the feed-level
    ``updated_parsed`` value of the page that was fetched.
    """
    url = queue.get()
    # Single-argument print(...) produces the same output on Python 2 and
    # also parses on Python 3.
    print("<- %s fetching %s for domain %s" % (time.asctime(), url, feed['domain']))
    data = httpc.get(url, headers = {"User-Agent": "Internet Archive OPDS Crawler +http://bookserver.archive.org",})
    print("-> %s fetched %s for domain %s" % (time.asctime(), url, feed['domain']))

    f = feedparser.parse(data)
    # NOTE(review): a feed without a feed-level updated date will fail on
    # the next lines -- confirm whether such feeds need to be supported.
    t = f.feed.updated_parsed
    # First six struct_time fields: year, month, day, hour, minute, second.
    dt = datetime.datetime(*t[:6])

    #TODO: make new warc if our warc file is too big
    #TODO: only add to warc if not already there.

    # The "only archive when warcDateTime < dt" check is switched off while
    # debugging, so every fetched page is archived unconditionally.
    addToWarc(latestWarc, url, data, f, 'application/atom+xml')
    latestDateTime = dt

    # Archive only; Solr is no longer fed from this script.

    parseLinks(f, feed['url'], queue)

    time.sleep(config['default_sleep_seconds'])

    return latestDateTime
Exemplo n.º 2
0
 def test_delete_(self):
     # Store a resource, remove it with delete_ (status 204 expected), then
     # check that a GET of the same URL raises httpc.NotFound.
     httpc.put(self.base_url() + 'killme', data='killme')
     status, msg, body = httpc.delete_(self.base_url() + 'killme')
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(status, 204)
     self.assertRaises(
         httpc.NotFound,
         lambda: httpc.get(self.base_url() + 'killme'))
Exemplo n.º 3
0
 def test_put_01_create(self):
     # put_ to the 'goodbye' URL: expects status 201, an 'x-put' header equal
     # to 'hello', an empty response body, and a GET that returns the data.
     data = 'goodbye world'
     status, msg, body = httpc.put_(self.base_url() + 'goodbye', data=data)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(status, 201)
     self.assertEqual(msg.dict['x-put'], 'hello')
     self.assertEqual(body, '')
     self.assertEqual(httpc.get(self.base_url() + 'goodbye'), data)
Exemplo n.º 4
0
 def test_put_01_create(self):
     # put_ to the 'goodbye' URL: expects status 201, an 'x-put' header equal
     # to 'hello', an empty response body, and a GET that returns the data.
     data = 'goodbye world'
     status, msg, body = httpc.put_(self.base_url() + 'goodbye',
                                    data=data)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(status, 201)
     self.assertEqual(msg.dict['x-put'], 'hello')
     self.assertEqual(body, '')
     self.assertEqual(httpc.get(self.base_url() + 'goodbye'), data)
Exemplo n.º 5
0
 def test_get(self):
     # A GET of base_url() is expected to raise InternalServerError; the
     # response body must be carried on the exception and appear in both
     # its str() and repr().
     data = 'screw you world'
     try:
         httpc.get(self.base_url())
         self.fail()
     except httpc.InternalServerError as e:
         # "except ... as" parses on Python 2.6+ and 3.x; the non-deprecated
         # assertion names replace assertEquals / assert_.
         self.assertEqual(e.params.response_body, data)
         self.assertTrue(str(e).count(data))
         self.assertTrue(repr(e).count(data))
Exemplo n.º 6
0
 def test_get(self):
     # A GET of base_url() is expected to raise InternalServerError; the
     # response body must be carried on the exception and appear in both
     # its str() and repr().
     data = 'screw you world'
     try:
         httpc.get(self.base_url())
         self.fail()
     except httpc.InternalServerError as e:
         # "except ... as" parses on Python 2.6+ and 3.x; the non-deprecated
         # assertion names replace assertEquals / assert_.
         self.assertEqual(e.params.response_body, data)
         self.assertTrue(str(e).count(data))
         self.assertTrue(repr(e).count(data))
Exemplo n.º 7
0
    def test_ssl_proxy_redirects(self):
        # make sure that if the proxy returns a redirect, that httpc
        # successfully follows it (this was broken at one point)
        def answer_one(sock, expected_request, response):
            # Accept a single connection, check its request line, skip the
            # remaining headers and reply with the canned response.
            conn, addr = sock.accept()
            fd = conn.makefile()
            try:
                line = request = fd.readline()
                self.assertEqual(request, expected_request)
                while line.strip():  # eat request headers
                    line = fd.readline()
                fd.write(response)
            finally:
                fd.close()
                conn.close()

        def ssl_proxy(sock):
            # we're not going to actually proxy to localhost:1234,
            # we're just going to return a response on its behalf
            answer_one(
                sock,
                'GET https://localhost:1234 HTTP/1.1\r\n',
                "HTTP/1.0 302 Found\r\nLocation: https://localhost:1234/2\r\n\r\n")

            # second request, for /2 target
            answer_one(
                sock,
                'GET https://localhost:1234/2 HTTP/1.1\r\n',
                "HTTP/1.0 200 OK\r\n\r\n")
            sock.close()

        server = api.tcp_listener(('0.0.0.0', 5505))
        api.spawn(ssl_proxy, server)
        import os
        # Remember the previous ALL_PROXY value so the override below does
        # not leak into tests that run after this one.
        saved_proxy = os.environ.get('ALL_PROXY')
        os.environ['ALL_PROXY'] = 'localhost:5505'
        try:
            httpc.get('https://localhost:1234', use_proxy=True, max_retries=1)
        finally:
            if saved_proxy is None:
                del os.environ['ALL_PROXY']
            else:
                os.environ['ALL_PROXY'] = saved_proxy
Exemplo n.º 8
0
    def test_013_empty_return(self):
        # Serve a WSGI app that returns a single empty string over an SSL
        # listener on port 4202 and check that httpc.get yields ''.
        from eventlet import httpc

        def wsgi_app(environ, start_response):
            start_response("200 OK", [])
            return [""]

        # The certificate and key are looked up next to this test module.
        here = os.path.dirname(__file__)
        certificate_file = os.path.join(here, 'test_server.crt')
        private_key_file = os.path.join(here, 'test_server.key')
        sock = api.ssl_listener(('', 4202), certificate_file, private_key_file)
        api.spawn(wsgi.server, sock, wsgi_app)

        res = httpc.get("https://localhost:4202/foo")
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(res, '')
Exemplo n.º 9
0
    def test_ssl_proxy(self):
        # Run a one-shot fake proxy on port 5505 and check that, with
        # use_proxy=True, httpc puts the full https URL in the request line.
        def ssl_proxy(sock):
            conn, addr = sock.accept()
            fd = conn.makefile()
            try:
                line = request = fd.readline()
                self.assertEqual(request, 'GET https://localhost:1234 HTTP/1.1\r\n')
                while line.strip():  # eat request headers
                    line = fd.readline()

                # we're not going to actually proxy to localhost:1234,
                # we're just going to return a response on its behalf
                fd.write("HTTP/1.0 200 OK\r\n\r\n")
            finally:
                fd.close()
                conn.close()

        # NOTE(review): the listening socket is never closed here, unlike in
        # the redirect variant of this test -- confirm whether it should be.
        server = api.tcp_listener(('0.0.0.0', 5505))
        api.spawn(ssl_proxy, server)
        import os
        # Remember the previous ALL_PROXY value so the override below does
        # not leak into tests that run after this one.
        saved_proxy = os.environ.get('ALL_PROXY')
        os.environ['ALL_PROXY'] = 'localhost:5505'
        try:
            httpc.get('https://localhost:1234', ok=[200], use_proxy=True)
        finally:
            if saved_proxy is None:
                del os.environ['ALL_PROXY']
            else:
                os.environ['ALL_PROXY'] = saved_proxy
Exemplo n.º 10
0
class TestHttpc302(TestBase, TestCase):
    # Exercises httpc's handling of httpc.Found using Site302 as the site.
    site_class = Site302

    def test_get_expired(self):
        # With max_retries=0 the GET must raise httpc.Found; retrying the
        # caught error, or allowing one retry up front, yields the body.
        try:
            httpc.get(self.base_url() + 'expired/hello', max_retries=0)
            self.fail()
        except httpc.Found as err:
            # "except ... as" parses on Python 2.6+ and 3.x.
            response = err.retry()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(response, 'hello world')
        self.assertEqual(
            httpc.get(self.base_url() + 'expired/hello', max_retries=1),
            'hello world')
Exemplo n.º 11
0
    def test_ssl_proxy_redirects(self):
        # make sure that if the proxy returns a redirect, that httpc
        # successfully follows it (this was broken at one point)
        def answer_one(sock, expected_request, response):
            # Accept a single connection, check its request line, skip the
            # remaining headers and reply with the canned response.
            conn, addr = sock.accept()
            fd = conn.makefile()
            try:
                line = request = fd.readline()
                self.assertEqual(request, expected_request)
                while line.strip():  # eat request headers
                    line = fd.readline()
                fd.write(response)
            finally:
                fd.close()
                conn.close()

        def ssl_proxy(sock):
            # we're not going to actually proxy to localhost:1234,
            # we're just going to return a response on its behalf
            answer_one(
                sock,
                'GET https://localhost:1234 HTTP/1.1\r\n',
                "HTTP/1.0 302 Found\r\nLocation: https://localhost:1234/2\r\n\r\n")

            # second request, for /2 target
            answer_one(
                sock,
                'GET https://localhost:1234/2 HTTP/1.1\r\n',
                "HTTP/1.0 200 OK\r\n\r\n")
            sock.close()

        server = api.tcp_listener(('0.0.0.0', 5505))
        api.spawn(ssl_proxy, server)
        import os
        # Remember the previous ALL_PROXY value so the override below does
        # not leak into tests that run after this one.
        saved_proxy = os.environ.get('ALL_PROXY')
        os.environ['ALL_PROXY'] = 'localhost:5505'
        try:
            httpc.get('https://localhost:1234', use_proxy=True, max_retries=1)
        finally:
            if saved_proxy is None:
                del os.environ['ALL_PROXY']
            else:
                os.environ['ALL_PROXY'] = saved_proxy
Exemplo n.º 12
0
    def test_ssl_proxy(self):
        # Run a one-shot fake proxy on port 5505 and check that, with
        # use_proxy=True, httpc puts the full https URL in the request line.
        def ssl_proxy(sock):
            conn, addr = sock.accept()
            fd = conn.makefile()
            try:
                line = request = fd.readline()
                self.assertEqual(request,
                                 'GET https://localhost:1234 HTTP/1.1\r\n')
                while line.strip():  # eat request headers
                    line = fd.readline()

                # we're not going to actually proxy to localhost:1234,
                # we're just going to return a response on its behalf
                fd.write("HTTP/1.0 200 OK\r\n\r\n")
            finally:
                fd.close()
                conn.close()

        # NOTE(review): the listening socket is never closed here, unlike in
        # the redirect variant of this test -- confirm whether it should be.
        server = api.tcp_listener(('0.0.0.0', 5505))
        api.spawn(ssl_proxy, server)
        import os
        # Remember the previous ALL_PROXY value so the override below does
        # not leak into tests that run after this one.
        saved_proxy = os.environ.get('ALL_PROXY')
        os.environ['ALL_PROXY'] = 'localhost:5505'
        try:
            httpc.get('https://localhost:1234', ok=[200], use_proxy=True)
        finally:
            if saved_proxy is None:
                del os.environ['ALL_PROXY']
            else:
                os.environ['ALL_PROXY'] = saved_proxy
Exemplo n.º 13
0
class TestHttpc301(TestBase, TestCase):
    # Exercises httpc's handling of httpc.MovedPermanently using Site301.
    site_class = Site301

    def base_url(self):
        # All requests in this class go to the '/redirect/' prefix.
        return 'http://localhost:31337/redirect/'

    def test_get(self):
        # With max_retries=0 the GET must raise MovedPermanently; retrying
        # the caught error, or allowing one retry up front, yields the body.
        try:
            httpc.get(self.base_url() + 'hello', max_retries=0)
            self.fail()
        except httpc.MovedPermanently as err:
            # "except ... as" parses on Python 2.6+ and 3.x.
            response = err.retry()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(response, 'hello world')
        self.assertEqual(httpc.get(self.base_url() + 'hello', max_retries=1),
                         'hello world')
Exemplo n.º 14
0
    def test_013_empty_return(self):
        # Serve a WSGI app that returns a single empty string over an SSL
        # listener on port 4202 and check that httpc.get yields ''.
        from eventlet import httpc

        def wsgi_app(environ, start_response):
            start_response("200 OK", [])
            return [""]

        # The certificate and key are looked up next to this test module.
        here = os.path.dirname(__file__)
        certificate_file = os.path.join(here, 'test_server.crt')
        private_key_file = os.path.join(here, 'test_server.key')
        sock = api.ssl_listener(('', 4202), certificate_file, private_key_file)
        api.spawn(wsgi.server, sock, wsgi_app)

        res = httpc.get("https://localhost:4202/foo")
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(res, '')
Exemplo n.º 15
0
 def test_delete(self):
     # Store a resource, check that delete returns an empty body, then
     # check that a GET of the same URL raises httpc.NotFound.
     httpc.put(self.base_url() + 'killme', data='killme')
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(httpc.delete(self.base_url() + 'killme'), '')
     self.assertRaises(httpc.NotFound,
                       lambda: httpc.get(self.base_url() + 'killme'))
Exemplo n.º 16
0
 def test_delete(self):
     # Store a resource, check that delete returns an empty body, then
     # check that a GET of the same URL raises httpc.NotFound.
     httpc.put(self.base_url() + 'killme', data='killme')
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(httpc.delete(self.base_url() + 'killme'), '')
     self.assertRaises(
         httpc.NotFound,
         lambda: httpc.get(self.base_url() + 'killme'))
Exemplo n.º 17
0
 def test_put_02_modify(self):
     # Runs test_put_01_create first (presumably so that 'goodbye' already
     # exists), then overwrites it: put_ should report 204 and a GET should
     # return the new data.
     self.test_put_01_create()
     data = 'i really mean goodbye'
     status = httpc.put_(self.base_url() + 'goodbye', data=data)[0]
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(status, 204)
     self.assertEqual(httpc.get(self.base_url() + 'goodbye'), data)
Exemplo n.º 18
0
 def test_delete_(self):
     # Store a resource, remove it with delete_ (status 204 expected), then
     # check that a GET of the same URL raises httpc.NotFound.
     httpc.put(self.base_url() + 'killme', data='killme')
     status, msg, body = httpc.delete_(self.base_url() + 'killme')
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(status, 204)
     self.assertRaises(httpc.NotFound,
                       lambda: httpc.get(self.base_url() + 'killme'))
Exemplo n.º 19
0
 def test_put_nonempty(self):
     # A non-empty payload stored with put must come back unchanged on GET.
     data = 'nonempty'
     httpc.put(self.base_url() + 'nonempty', data=data)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(httpc.get(self.base_url() + 'nonempty'), data)
Exemplo n.º 20
0
 def test_get(self):
     # A GET of base_url() + 'hello' is expected to return 'hello world'.
     response = httpc.get(self.base_url() + 'hello')
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(response, 'hello world')
Exemplo n.º 21
0
 def test_put_nonempty(self):
     # A non-empty payload stored with put must come back unchanged on GET.
     data = 'nonempty'
     httpc.put(self.base_url() + 'nonempty', data=data)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(httpc.get(self.base_url() + 'nonempty'), data)
Exemplo n.º 22
0
 def test_get_expires(self):
     # With max_retries=0 the GET is expected to raise httpc.Found; the
     # caught error is then retried explicitly.
     try:
         httpc.get(self.base_url() + 'expires/hello', max_retries=0)
         # self.fail() replaces the deprecated assert_(False) spelling.
         self.fail()
     except httpc.Found as err:
         # "except ... as" parses on Python 2.6+ and 3.x.
         # NOTE(review): the retried response is not checked in this method.
         response = err.retry()
Exemplo n.º 23
0
 def test_get_expires(self):
     # With max_retries=0 the GET is expected to raise httpc.Found; the
     # caught error is then retried explicitly.
     try:
         httpc.get(self.base_url() + 'expires/hello', max_retries=0)
         # self.fail() replaces the deprecated assert_(False) spelling.
         self.fail()
     except httpc.Found as err:
         # "except ... as" parses on Python 2.6+ and 3.x.
         # NOTE(review): the retried response is not checked in this method.
         response = err.retry()
Exemplo n.º 24
0
 def test_get(self):
     # With max_retries=0 the GET is expected to raise MovedPermanently;
     # the caught error is then retried explicitly.
     try:
         httpc.get(self.base_url() + 'hello', max_retries=0)
         # self.fail() replaces the deprecated assert_(False) spelling.
         self.fail()
     except httpc.MovedPermanently as err:
         # "except ... as" parses on Python 2.6+ and 3.x.
         # NOTE(review): the retried response is not checked in this method.
         response = err.retry()
Exemplo n.º 25
0
        except httpc.Found, err:
            response = err.retry()
        self.assertEquals(response, 'hello world')
        self.assertEquals(
            httpc.get(self.base_url() + 'expired/hello', max_retries=1),
            'hello world')

    def test_get_expires(self):
        # With max_retries=0 the GET must raise httpc.Found; retrying the
        # caught error, or allowing one retry up front, yields the body.
        try:
            httpc.get(self.base_url() + 'expires/hello', max_retries=0)
            self.fail()
        except httpc.Found as err:
            # "except ... as" parses on Python 2.6+ and 3.x.
            response = err.retry()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(response, 'hello world')
        self.assertEqual(
            httpc.get(self.base_url() + 'expires/hello', max_retries=1),
            'hello world')


class TestHttpc303(TestBase, TestCase):
    site_class = Site303

    def base_url(self):
        return 'http://localhost:31337/redirect/'

    def test_post(self):
        data = 'hello world'
        try:
            response = httpc.post(self.base_url() + 'hello', data=data)
            self.assert_(False)
        except httpc.SeeOther, err:
Exemplo n.º 26
0
        try:
            httpc.get(self.base_url() + 'expired/hello', max_retries=0)
            self.assert_(False)
        except httpc.Found, err:
            response = err.retry()
        self.assertEquals(response, 'hello world')
        self.assertEquals(httpc.get(self.base_url() + 'expired/hello', max_retries=1), 'hello world')

    def test_get_expires(self):
        # With max_retries=0 the GET must raise httpc.Found; retrying the
        # caught error, or allowing one retry up front, yields the body.
        try:
            httpc.get(self.base_url() + 'expires/hello', max_retries=0)
            self.fail()
        except httpc.Found as err:
            # "except ... as" parses on Python 2.6+ and 3.x.
            response = err.retry()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(response, 'hello world')
        self.assertEqual(httpc.get(self.base_url() + 'expires/hello', max_retries=1), 'hello world')


class TestHttpc303(TestBase, TestCase):
    # Exercises httpc's handling of httpc.SeeOther using Site303.
    site_class = Site303

    def base_url(self):
        # All requests in this class go to the '/redirect/' prefix.
        return 'http://localhost:31337/redirect/'

    def test_post(self):
        # The POST is expected to raise httpc.SeeOther; the caught error is
        # then retried explicitly.
        data = 'hello world'
        try:
            httpc.post(self.base_url() + 'hello', data=data)
            # self.fail() replaces the deprecated assert_(False) spelling.
            self.fail()
        except httpc.SeeOther as err:
            # "except ... as" parses on Python 2.6+ and 3.x.
            # NOTE(review): the retried response is not checked here.
            response = err.retry()
Exemplo n.º 27
0
 def test_get_query(self):
     # A GET with a repeated 'foo' query parameter is expected to return
     # 'hello world' followed by one 'foo=<value>' line per occurrence.
     response = httpc.get(self.base_url() + 'hello?foo=bar&foo=quux')
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(response, 'hello worldfoo=bar\nfoo=quux\n')
Exemplo n.º 28
0
 def test_get(self):
     # With max_retries=0 the GET is expected to raise MovedPermanently;
     # the caught error is then retried explicitly.
     try:
         httpc.get(self.base_url() + 'hello', max_retries=0)
         # self.fail() replaces the deprecated assert_(False) spelling.
         self.fail()
     except httpc.MovedPermanently as err:
         # "except ... as" parses on Python 2.6+ and 3.x.
         # NOTE(review): the retried response is not checked in this method.
         response = err.retry()
Exemplo n.º 29
0
 def test_put_empty(self):
     # An empty payload stored with put must come back as '' on GET.
     httpc.put(self.base_url() + 'empty', data='')
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(httpc.get(self.base_url() + 'empty'), '')
Exemplo n.º 30
0
 def test_get_bad_uri(self):
     # Fetching base_url() + 'b0gu5' is expected to raise httpc.NotFound.
     def fetch_bogus():
         return httpc.get(self.base_url() + 'b0gu5')

     self.assertRaises(httpc.NotFound, fetch_bogus)
Exemplo n.º 31
0
 def test_put_empty(self):
     # An empty payload stored with put must come back as '' on GET.
     httpc.put(self.base_url() + 'empty', data='')
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(httpc.get(self.base_url() + 'empty'), '')
Exemplo n.º 32
0
 def test_get_bad_uri(self):
     # Fetching base_url() + 'b0gu5' is expected to raise httpc.NotFound.
     def fetch_bogus():
         return httpc.get(self.base_url() + 'b0gu5')

     self.assertRaises(httpc.NotFound, fetch_bogus)
Exemplo n.º 33
0
 def test_get(self):
     # A GET of base_url() + 'hello' is expected to return 'hello world'.
     response = httpc.get(self.base_url() + 'hello')
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(response, 'hello world')
Exemplo n.º 34
0
 def test_get_query(self):
     # A GET with a repeated 'foo' query parameter is expected to return
     # 'hello world' followed by one 'foo=<value>' line per occurrence.
     response = httpc.get(self.base_url() + 'hello?foo=bar&foo=quux')
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(response, 'hello worldfoo=bar\nfoo=quux\n')
Exemplo n.º 35
0
 def test_put_02_modify(self):
     # Runs test_put_01_create first (presumably so that 'goodbye' already
     # exists), then overwrites it: put_ should report 204 and a GET should
     # return the new data.
     self.test_put_01_create()
     data = 'i really mean goodbye'
     status = httpc.put_(self.base_url() + 'goodbye', data=data)[0]
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(status, 204)
     self.assertEqual(httpc.get(self.base_url() + 'goodbye'), data)