Пример #1
0
    def test_image_url(self):
        """Ensure an image url is only tagged, never stored."""
        image_link = 'http://www.ndftz.com/nickelanddime.png'
        result = ReadUrl.parse(image_link)

        # Image fetches still report success, but carry no readable body.
        ok_(result.status == 200, "The status is 200: " + str(result.status))
        ok_(result.content is None, "Content should be none: ")
Пример #2
0
    def test_image_url(self):
        """Check that image urls are tagged rather than persisted."""
        target = 'http://www.ndftz.com/nickelanddime.png'
        parsed_page = ReadUrl.parse(target)

        ok_(parsed_page.status == 200,
            "The status is 200: " + str(parsed_page.status))
        ok_(parsed_page.content is None, "Content should be none: ")
Пример #3
0
    def test_404_url(self):
        """A missing url must surface the proper error state."""
        missing = 'http://lococast.net/archives/001'
        result = ReadUrl.parse(missing)

        # A 404 should come back with no image flag and no stored body.
        ok_(result.status == 404, "The status is 404: " + str(result.status))
        ok_(not result.is_image(), "The content is not an image")
        ok_(result.content is None, "Content should be none")
Пример #4
0
    def test_non_net_url(self):
        """Bookmarking an internal host bookie cannot reach."""
        unreachable = "http://r2"
        result = ReadUrl.parse(unreachable)

        # 901 is the project's "could not reach host" status.
        ok_(result.status == 901, "The status is 901: " + str(result.status))
        ok_(not result.is_image(), "The content is not an image")
        ok_(result.content is None,
            "Content should be none: " + str(result.content))
Пример #5
0
    def test_404_url(self):
        """Verify the error fields when a url does not exist."""
        gone_url = 'http://lococast.net/archives/001'
        fetched = ReadUrl.parse(gone_url)

        ok_(fetched.status == 404,
            "The status is 404: " + str(fetched.status))
        ok_(not fetched.is_image(), "The content is not an image")
        ok_(fetched.content is None, "Content should be none")
Пример #6
0
def fetch_content(i, q):
    """Our threaded worker to fetch the url contents"""
    while True:
        hash_id, url = q.get()
        print 'Q' + str(i) + ' getting content for ' + hash_id + ' ' + url
        read = ReadUrl.parse(url)
        parsed[hash_id] = read
        q.task_done()
Пример #7
0
def fetch_content(i, q):
    """Our threaded worker to fetch the url contents"""
    while True:
        hash_id, url = q.get()
        print 'Q' + str(i) + ' getting content for ' + hash_id + ' ' + url
        read = ReadUrl.parse(url)
        parsed[hash_id] = read
        q.task_done()
Пример #8
0
    def test_non_net_url(self):
        """An internal-only hostname should fail with the 901 status."""
        internal_url = "http://r2"
        fetched = ReadUrl.parse(internal_url)

        ok_(fetched.status == 901, "The status is 901: " + str(fetched.status))
        ok_(not fetched.is_image(), "The content is not an image")
        ok_(fetched.content is None,
            "Content should be none: " + str(fetched.content))
Пример #9
0
    def test_url_content(self):
        """Test that we set the correct status.

        A good fetch returns status 200 and a non-empty readable body
        containing the page text.
        """
        url = 'http://lococast.net/archives/475'
        read = ReadUrl.parse(url)

        # Fix: add the ": " separator so the actual status value is not
        # jammed against the message text (matches the sibling tests).
        ok_(read.status == 200, "The status is 200: " + str(read.status))
        ok_(not read.is_image(), "The content is not an image")
        ok_(read.content is not None, "Content should not be none")
        ok_('Lococast' in read.content,
            "The word Lococast is in the content: " + str(read.content))
Пример #10
0
    def test_url_content(self):
        """Test that we set the correct status.

        Verifies status, the non-image flag, and that the fetched body
        contains the expected page text.
        """
        url = 'http://lococast.net/archives/475'
        read = ReadUrl.parse(url)

        # Fix: the failure message was missing the ": " separator used by
        # every other assertion in this suite.
        ok_(read.status == 200, "The status is 200: " + str(read.status))
        ok_(not read.is_image(), "The content is not an image")
        ok_(read.content is not None, "Content should not be none")
        ok_('Lococast' in read.content,
            "The word Lococast is in the content: " + str(read.content))
Пример #11
0
    def test_nonworking_url(self):
        """Testing some urls we know we had issues with initially"""
        problem_urls = {
            'CouchSurfing': 'http://allthatiswrong.wordpress.com/2010/01/24/a-criticism-of-couchsurfing-and-review-of-alternatives/#problems',
            'Electronic': 'https://www.fbo.gov/index?s=opportunity&mode=form&tab=core&id=dd11f27254c796f80f2aadcbe4158407',
            'Will Fuqua': 'http://twitter.com/#!/wafuqua',
        }

        for label, url in problem_urls.iteritems():
            LOG.debug(url)
            read = ReadUrl.parse(url)
            LOG.debug(read)

            ok_(read.status == 200, "The status is 200: " + str(read.status))
            ok_(read.content is not None, "Content should not be none: ")
Пример #12
0
    def test_nonworking_url(self):
        """Testing some urls we know we had issues with initially"""
        tricky_urls = {
            'CouchSurfing': ('http://allthatiswrong.wordpress.com/2010/01'
                             '/24/a-criticism-of-couchsurfing-and-review-o'
                             'f-alternatives/#problems'),
            # The fbo.gov opportunity url stays disabled here:
            # 'Electronic': ('https://www.fbo.gov/index?s=opportunity&mode='
            #                'form&tab=core&id=dd11f27254c796f80f2aadcbe415'
            #                '8407'),
        }

        for label, url in tricky_urls.iteritems():
            fetched = ReadUrl.parse(url)

            self.assertTrue(
                fetched.status == 200,
                "The status is 200: " + str(fetched.status))
            self.assertTrue(
                fetched.content is not None, "Content should not be none: ")
Пример #13
0
def fetch_bmark_content(bid):
    """Given a bookmark, fetch its content and index it."""
    trans = transaction.begin()
    logger = get_task_logger('fetch_bmark_content')

    if not bid:
        raise Exception('missing bookmark id')
    bmark = Bmark.query.get(bid)
    if not bmark:
        raise Exception('Bookmark not found: ' + str(bid))
    hashed = bmark.hashed

    try:
        read = ReadUrl.parse(hashed.url)
    except ValueError, exc:
        # We hit this where urllib2 choked trying to get the protocol type of
        # this url to fetch it.
        logger.error('Could not parse url: ' + hashed.url)
        logger.error('exc')
        read = None
Пример #14
0
def fetch_bmark_content(bid):
    """Given a bookmark, fetch its content and index it."""
    trans = transaction.begin()
    logger = get_task_logger('fetch_bmark_content')

    if not bid:
        raise Exception('missing bookmark id')
    bmark = Bmark.query.get(bid)
    if not bmark:
        raise Exception('Bookmark not found: ' + str(bid))
    hashed = bmark.hashed

    try:
        read = ReadUrl.parse(hashed.url)
    except ValueError, exc:
        # We hit this where urllib2 choked trying to get the protocol type of
        # this url to fetch it.
        logger.error('Could not parse url: ' + hashed.url)
        logger.error('exc')
        read = None
Пример #15
0
def fetch_content(i, q):
    """Our threaded worker to fetch the url contents"""
    while True:
        hash_id, url = q.get()
        print 'Q' + str(i) + ' getting content for ' + hash_id + ' ' + url
        read = ReadUrl.parse(url)
        parsed[hash_id] = read
        q.task_done()


if __name__ == "__main__":
    args = parse_args()

    if args.test_url:
        # then we only want to test this one url and not process full lists
        read = ReadUrl.parse(args.test_url)

        print "META"
        print "*" * 30

        print read.content_type
        print read.status
        print read.status_message

        print "\n\n"

        if not read.is_image():
            print read.content
        else:
            print "Url is an image"
Пример #16
0
            # but if we don't we'll just keep getting errors and never end
            # Refetch pass: only urls whose previous readable fetch failed
            # (status_code != 200).
            url_list = Hashed.query.outerjoin(Readable).\
                        filter(Readable.status_code != 200).all()

        else:
            # Normal pass: walk the hashed urls one page (PER_TRANS) at a
            # time using offset ct.
            url_list = Hashed.query.limit(PER_TRANS).offset(ct).all()

        # A short page means we've reached the end of the table.
        if len(url_list) < PER_TRANS:
            all = True

        ct = ct + len(url_list)

        for hashed in url_list:
            print hashed.url

            read = ReadUrl.parse(hashed.url)
            if not read.is_image():
                if not hashed.readable:
                    hashed.readable = Readable()

                # Store the fetched body for readable (non-image) urls.
                hashed.readable.content = read.content
            else:
                if not hashed.readable:
                    hashed.readable = Readable()

                # Image urls are tagged but their bytes are not stored.
                hashed.readable.content = None

            # set some of the extra metadata
            hashed.readable.content_type = read.content_type
            hashed.readable.status_code = read.status
            hashed.readable.status_message = read.status_message
Пример #17
0
def fetch_content(i, q):
    """Our threaded worker to fetch the url contents"""
    while True:
        hash_id, url = q.get()
        print 'Q' + str(i) + ' getting content for ' + hash_id + ' ' + url
        read = ReadUrl.parse(url)
        parsed[hash_id] = read
        q.task_done()


if __name__ == "__main__":
    args = parse_args()

    if args.test_url:
        # then we only want to test this one url and not process full lists
            read = ReadUrl.parse(args.test_url)

            print "META"
            print "*" * 30

            print read.content_type
            print read.status
            print read.status_message

            print "\n\n"

            if not read.is_image():
                print read.content
            else:
                print "Url is an image"
Пример #18
0
def fetch_bmark_content(bid):
    """Given a bookmark, fetch its content and index it."""
    trans = transaction.begin()

    if not bid:
        raise Exception('missing bookmark id')
    bmark = Bmark.query.get(bid)
    if not bmark:
        raise Exception('Bookmark not found: ' + str(bid))
    hashed = bmark.hashed

    try:
        read = ReadUrl.parse(hashed.url)
    except ValueError:
        # We hit this where urllib2 choked trying to get the protocol type of
        # this url to fetch it.
        logger.error('Could not parse url: ' + hashed.url)
        logger.error('exc')
        read = None

    if read:
        logger.debug(read)
        logger.debug(read.content)

        logger.debug("%s: %s %d %s %s" % (
            hashed.hash_id,
            read.url,
            len(read.content) if read.content else -1,
            read.is_error(),
            read.status_message))

        if not read.is_image():
            if not bmark.readable:
                bmark.readable = Readable()

            bmark.readable.content = read.content
        else:
            if not bmark.readable:
                bmark.readable = Readable()
            bmark.readable.content = None

        # set some of the extra metadata
        bmark.readable.content_type = read.content_type
        bmark.readable.status_code = read.status
        bmark.readable.status_message = read.status_message
        trans.commit()
        fulltext_index_bookmark.delay(
            bid,
            read.content if read else None)
    else:
        logger.error(
            'No readable record for bookmark: ',
            str(bid, bmark.hashed.url))

        # There was a failure reading the thing.
        bmark.readable = Readable()
        bmark.readable.status = '900'
        bmark.readable.status_message = (
            'No readable record '
            'during existing processing')
        trans.commit()
Пример #19
0
    # Optional single-url mode; see the __main__ block's args.test_url check.
    parser.add_argument('--test-url', dest="test_url",
                        action="store",
                        default=False,
                        help="Run the parser on the url provided and test things out")

    # NOTE(review): default=False means args.test_url is falsy unless the
    # flag is supplied; callers rely on a plain truthiness check.
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parse_args()

    if args.test_url:
        # then we only want to test this one url and not process full lists
            read = ReadUrl.parse(args.test_url)

            print "META"
            print "*" * 30

            print read.content_type
            print read.status
            print read.status_message

            print "\n\n"

            if not read.is_image():
                print read.content
            else:
                print "Url is an image"