def test_image_url(self):
    """Verify we don't store, but just tag, an image url"""
    img_url = 'http://www.ndftz.com/nickelanddime.png'
    read = ReadUrl.parse(img_url)
    ok_(read.status == 200, "The status is 200: " + str(read.status))
    ok_(read.content is None, "Content should be none")

def test_404_url(self):
    """Test that we get the proper errors on a missing url"""
    url = 'http://lococast.net/archives/001'
    read = ReadUrl.parse(url)
    ok_(read.status == 404, "The status is 404: " + str(read.status))
    ok_(not read.is_image(), "The content is not an image")
    ok_(read.content is None, "Content should be none")

def test_non_net_url(self):
    """I might be bookmarking something internal bookie can't access"""
    test_url = "http://r2"
    read = ReadUrl.parse(test_url)
    # 901 is the app-level status ReadUrl assigns to urls it could not reach.
    ok_(read.status == 901, "The status is 901: " + str(read.status))
    ok_(not read.is_image(), "The content is not an image")
    ok_(read.content is None, "Content should be none: " + str(read.content))

def fetch_content(i, q):
    """Our threaded worker to fetch the url contents"""
    while True:
        hash_id, url = q.get()
        print 'Q' + str(i) + ' getting content for ' + hash_id + ' ' + url
        read = ReadUrl.parse(url)
        parsed[hash_id] = read
        q.task_done()

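# A minimal sketch of how this worker could be wired up, assuming the
# module-level `parsed` dict the worker writes into and the stdlib Queue.
# The thread count and the `urls_to_fetch` iterable of (hash_id, url)
# pairs are illustrative, not from the source.
from Queue import Queue
from threading import Thread

parsed = {}
q = Queue()

for i in range(4):
    worker = Thread(target=fetch_content, args=(i, q))
    worker.setDaemon(True)
    worker.start()

for hash_id, url in urls_to_fetch:
    q.put((hash_id, url))

# block until every queued url has been marked task_done by a worker
q.join()
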
def test_url_content(self):
    """Test that we set the correct status"""
    url = 'http://lococast.net/archives/475'
    read = ReadUrl.parse(url)
    ok_(read.status == 200, "The status is 200: " + str(read.status))
    ok_(not read.is_image(), "The content is not an image")
    ok_(read.content is not None, "Content should not be none")
    ok_('Lococast' in read.content,
        "The word Lococast is in the content: " + str(read.content))

def test_nonworking_url(self):
    """Testing some urls we know we had issues with initially"""
    urls = {
        'CouchSurfing': 'http://allthatiswrong.wordpress.com/2010/01/24/a-criticism-of-couchsurfing-and-review-of-alternatives/#problems',
        'Electronic': 'https://www.fbo.gov/index?s=opportunity&mode=form&tab=core&id=dd11f27254c796f80f2aadcbe4158407',
        'Will Fuqua': 'http://twitter.com/#!/wafuqua',
    }

    for key, url in urls.iteritems():
        LOG.debug(url)
        read = ReadUrl.parse(url)
        LOG.debug(read)
        ok_(read.status == 200, "The status is 200: " + str(read.status))
        ok_(read.content is not None, "Content should not be none")

def test_nonworking_url(self):
    """Testing some urls we know we had issues with initially"""
    urls = {
        'CouchSurfing': ('http://allthatiswrong.wordpress.com/2010/01'
                         '/24/a-criticism-of-couchsurfing-and-review-o'
                         'f-alternatives/#problems'),
        # 'Electronic': ('https://www.fbo.gov/index?s=opportunity&mode='
        #                'form&tab=core&id=dd11f27254c796f80f2aadcbe415'
        #                '8407'),
    }

    for key, url in urls.iteritems():
        read = ReadUrl.parse(url)
        self.assertTrue(
            read.status == 200,
            "The status is 200: " + str(read.status))
        self.assertTrue(
            read.content is not None,
            "Content should not be none")

def fetch_bmark_content(bid):
    """Given a bookmark, fetch its content and index it."""
    trans = transaction.begin()
    logger = get_task_logger('fetch_bmark_content')

    if not bid:
        raise Exception('missing bookmark id')
    bmark = Bmark.query.get(bid)
    if not bmark:
        raise Exception('Bookmark not found: ' + str(bid))
    hashed = bmark.hashed

    try:
        read = ReadUrl.parse(hashed.url)
    except ValueError, exc:
        # We hit this where urllib2 choked trying to get the protocol type
        # of this url to fetch it.
        logger.error('Could not parse url: ' + hashed.url)
        logger.error(exc)
        read = None

    # but if we don't we'll just keep getting errors and never end
    url_list = Hashed.query.outerjoin(Readable).\
        filter(Readable.status_code != 200).all()
else:
    url_list = Hashed.query.limit(PER_TRANS).offset(ct).all()
    if len(url_list) < PER_TRANS:
        all = True

ct = ct + len(url_list)

for hashed in url_list:
    print hashed.url
    read = ReadUrl.parse(hashed.url)

    if not read.is_image():
        if not hashed.readable:
            hashed.readable = Readable()
        hashed.readable.content = read.content
    else:
        if not hashed.readable:
            hashed.readable = Readable()
        hashed.readable.content = None

    # set some of the extra metadata
    hashed.readable.content_type = read.content_type
    hashed.readable.status_code = read.status
    hashed.readable.status_message = read.status_message

def fetch_bmark_content(bid):
    """Given a bookmark, fetch its content and index it."""
    trans = transaction.begin()
    logger = get_task_logger('fetch_bmark_content')

    if not bid:
        raise Exception('missing bookmark id')
    bmark = Bmark.query.get(bid)
    if not bmark:
        raise Exception('Bookmark not found: ' + str(bid))
    hashed = bmark.hashed

    try:
        read = ReadUrl.parse(hashed.url)
    except ValueError, exc:
        # We hit this where urllib2 choked trying to get the protocol type
        # of this url to fetch it.
        logger.error('Could not parse url: ' + hashed.url)
        logger.error(exc)
        read = None

    if read:
        logger.debug(read)
        logger.debug(read.content)
        logger.debug("%s: %s %d %s %s" % (
            hashed.hash_id,
            read.url,
            len(read.content) if read.content else -1,
            read.is_error(),
            read.status_message))

        if not read.is_image():
            if not bmark.readable:
                bmark.readable = Readable()
            bmark.readable.content = read.content
        else:
            if not bmark.readable:
                bmark.readable = Readable()
            bmark.readable.content = None

        # set some of the extra metadata
        bmark.readable.content_type = read.content_type
        bmark.readable.status_code = read.status
        bmark.readable.status_message = read.status_message
        trans.commit()
        fulltext_index_bookmark.delay(bid, read.content)
    else:
        logger.error(
            'No readable record for bookmark: %s %s',
            bid, bmark.hashed.url)

        # There was a failure reading the thing.
        bmark.readable = Readable()
        bmark.readable.status_code = 900
        bmark.readable.status_message = (
            'No readable record '
            'during existing processing')
        trans.commit()

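# A minimal usage sketch, assuming fetch_bmark_content is registered as a
# celery task, mirroring how fulltext_index_bookmark is dispatched above;
# `some_bookmark_id` is an illustrative placeholder for an existing bid.
fetch_bmark_content.delay(some_bookmark_id)
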
def parse_args():
    """Handle the command line options for the script."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--test-url',
        dest="test_url",
        action="store",
        default=False,
        help="Run the parser on the url provided and test things out")
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parse_args()

    if args.test_url:
        # then we only want to test this one url and not process full lists
        read = ReadUrl.parse(args.test_url)

        print "META"
        print "*" * 30
        print read.content_type
        print read.status
        print read.status_message
        print "\n\n"

        if not read.is_image():
            print read.content
        else:
            print "Url is an image"