def post(self):
    """Task handler: fetch a Url entity's target and record its status.

    Reads the task's 'id' parameter, fetches the URL with urlfetch,
    maps known fetch errors to short status codes ('DE', 'RTL', 'IUE',
    'UE'), stores the HTTP status code on success, and adjusts the
    validity counter: 200 resets it to 2, anything else decrements it
    down to a floor of -5. Always stamps last_check and saves.
    """
    id = self.request.get('id')
    if id:
        url = Url.get_by_id(int(id))
        if url:
            result = None
            try:
                result = urlfetch.fetch(url.url, allow_truncated=True)
            except urlfetch.DownloadError:
                url.status = 'DE'
                logging.info('DownloadError, url: %s' % (url.url))
            except urlfetch.ResponseTooLargeError:
                url.status = 'RTL'
                logging.info('ResponseTooLargeError, url: %s' % (url.url))
            except urlfetch.InvalidURLError:
                url.status = 'IUE'
                logging.info('InvalidURLError, url: %s' % (url.url))
            except Exception:
                # Catch-all so an unknown failure still marks the URL and
                # ends the task (was a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt).
                url.status = 'UE'
                # Fixed: the format string contained a stray leading '"'.
                logging.error('Unexpected error: %s, url: %s' %
                              (sys.exc_info()[0], url.url))
            if result:
                if result.content_was_truncated:
                    logging.debug('truncated')
                if result.status_code:
                    url.status = str(result.status_code)
            if result and result.status_code and result.status_code == 200:
                url.valid = 2
            else:
                if url.valid > -5:
                    url.valid = url.valid - 1
                else:
                    logging.info('Broken url: %s' % (url.url))
            url.last_check = datetime.datetime.now()
            url.put()
def view_master(request, urlid):
    """Render the master-URL page: every public channel's data for one Url.

    Looks up the Url by id, then for each ChannelUrl pointing at it
    (skipping private channels) collects extras, rates and the computed
    rating, and renders 'masterurl.html' with the accumulated rows.
    """
    data = []
    logging.debug('View (Master)Url %s' % (urlid))
    try:
        # urlid arrives as a URL path component (string); coerce to int.
        urlid = int(urlid)
    except:
        pass
    url = Url.get_by_id(urlid)
    if url:
        channelurls = ChannelUrl.query(ChannelUrl.url == url.key)
        for channelurl in channelurls:
            channel = channelurl.channel.get()
            if channel.private == False:
                extras = Extra.query(Extra.channelurl == channelurl.key)
                rates = Rate.query(Rate.channelurl == channelurl.key)
                rating = channelurl.rating()
                #data.append({'channel':channel,'post':post,'url':url,'extras': extras})
                # NOTE(review): `post` is not defined anywhere in this
                # function — unless it is a module-level name, this line
                # raises NameError at runtime. The commented-out append
                # above suggests a per-channelurl post lookup was removed;
                # confirm intended value and fix.
                data.append({
                    'channel': channel,
                    'channelurl': channelurl,
                    'post': post,
                    'url': url,
                    'extras': extras,
                    'rates': rates,
                    'rating': rating
                })
    template_values = {
        'data': data,
        'user': users.get_current_user(),
    }
    return render_to_response('masterurl.html', template_values)
def get(self):
    """Schedule url-validity tasks.

    With ?id=: enqueue a single /tasks/valid check for that Url.
    Without: enqueue routine checks for the 50 stalest URLs, plus
    repair tasks (document rebuild + validity check) for 50 URLs that
    have no status yet. Optionally redirects afterwards (?redirect=).
    """

    def _enqueue(specs, log_name):
        """Add the given (name, queue, url, params) tasks as one unit.

        Duplicate/tombstoned names are expected on re-runs and only
        logged. A failure on an earlier spec skips the later ones,
        matching the original single-try behavior.
        """
        try:
            for task_name, queue_name, task_url, params in specs:
                taskqueue.add(name=task_name,
                              queue_name=queue_name,
                              url=task_url,
                              params=params)
        except taskqueue.TombstonedTaskError:
            logging.warning('TombstonedTaskError %s' % log_name)
        except taskqueue.TaskAlreadyExistsError:
            logging.warning('TaskAlreadyExistsError %s' % log_name)

    id = self.request.get('id')
    # Loop-invariant: the prefix does not change between iterations.
    prefix = self.request.get('prefix', '')
    if id:
        url = Url.get_by_id(int(id))
        if url:
            name = str(prefix) + str(id)
            _enqueue([(name, 'urlfetch', '/tasks/valid', {'id': id})], name)
    else:
        # Routine check: 50 URLs ordered by last check time.
        url_keys = Url.query().order(Url.last_check,
                                     Url.status).fetch(50, keys_only=True)
        for key in url_keys:
            id = key.id()
            name = str(prefix) + str(id) + '_rc'
            _enqueue([(name, 'urlfetch', '/tasks/valid', {'id': id})], name)
        # Fix <missing> fields: document_date, last_check (valid)
        url_keys = Url.query(Url.status == None).order(Url.idate).fetch(
            50, keys_only=True)
        for key in url_keys:
            id = key.id()
            name = str(prefix) + str(id) + '_fix'
            _enqueue([
                (name + '_dd', 'document', '/tasks/update_document',
                 {'doc_id': id}),
                (name + '_lc', 'urlfetch', '/tasks/valid', {'id': id}),
            ], name)
    redirect = self.request.get('redirect')
    if redirect:
        return self.redirect(redirect)
def post(self):
    """Task handler: fetch a Url's HTML page and store its <title>.

    Fetches the page with browser-like headers, parses it with lxml
    using the response charset (falling back to utf-8 on LookupError),
    and stores a whitespace-normalized title on the Url entity.
    """
    # TODO: wrap the whole fetch in error handling so a dead host does
    # not make the task retry forever.
    id = self.request.get('id')
    if not id:
        return
    url = Url.get_by_id(int(id))
    if not url:
        return
    req = urllib2.Request(url.url)
    req.add_header(
        'User-agent',
        'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
    )
    req.add_header(
        'Accept',
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    )
    req.add_header('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3')
    req.add_header('Accept-Encoding', 'none')
    req.add_header('Accept-Language', 'en-US,en;q=0.8')
    req.add_header('Connection', 'keep-alive')
    res = urllib2.urlopen(req)
    try:
        doc = res.read()
        encoding = res.headers.getparam('charset')
    finally:
        # Release the connection even if read() fails (was leaked before).
        res.close()
    logging.debug('encoding %s' % (encoding))
    try:
        tree = etree.fromstring(doc, etree.HTMLParser(encoding=encoding))
    except LookupError:
        # Unknown/garbage charset from the server: retry as utf-8.
        tree = etree.fromstring(doc, etree.HTMLParser(encoding='utf-8'))
    title_element = tree.find(".//title")
    # Fixed: pages without a <title> element made .text raise
    # AttributeError on None; now the title is simply left unset.
    if title_element is not None and title_element.text:
        title = title_element.text
        logging.debug('title %s' % (title))
        url.title = smart_unicode(re.sub(r'\s+', ' ', title).strip())
    else:
        logging.debug('TitleTask: no title found for %s' % (url.url))
    url.put()
def get(self):
    """Schedule a /tasks/title fetch for the Url given by ?id=.

    The task name is built from an optional ?prefix= plus the id so a
    repeat request cannot enqueue a duplicate; already-existing or
    tombstoned names are only logged. Optionally redirects (?redirect=).
    """
    url_id = self.request.get('id')
    if not url_id:
        logging.info('No id')
    elif not Url.get_by_id(int(url_id)):
        logging.info('No URL')
    else:
        task_name = str(self.request.get('prefix', '')) + str(url_id)
        try:
            taskqueue.add(name=task_name,
                          queue_name='urlfetch',
                          url='/tasks/title',
                          params={'id': url_id})
        except taskqueue.TombstonedTaskError:
            logging.warning('TombstonedTaskError %s' % url_id)
        except taskqueue.TaskAlreadyExistsError:
            logging.warning('TaskAlreadyExistsError %s' % url_id)
    redirect_target = self.request.get('redirect')
    if redirect_target:
        return self.redirect(redirect_target)
doc_date=None for field in scored_document.fields: if field.name=='url': doc_url=field.value if field.name=='user': doc_user=field.value if field.name=='channel': doc_channel=field.value if field.name=='date': doc_date=field.value #logging.debug('Search result: %s' % (scored_document)) urlinstance=Url.get_by_id(int(doc_id)) if channel == '*': channelurlquery=ChannelUrl.query(ChannelUrl.url==urlinstance.key) else: channelinstance=Channel.query(Channel.name==channel).get() channelurlquery=ChannelUrl.query(ChannelUrl.url==urlinstance.key, ChannelUrl.channel==channelinstance.key) channelurls=channelurlquery.fetch(3) for channelurl in channelurls: retval.append({'id':channelurl.key.id(),'url': urlinstance.url,'posts':channelurl.posts()}) except search.Error: logging.exception('Search failed') #logging.debug('retval %s' % (retval))
def post(self):
    """Task handler: (re)build the full-text search document for one Url.

    For an invalid Url (valid < 0) the document is deleted from the
    'url' search index. Otherwise the channels, posting users, latest
    post date, comments, tags and summed rating are aggregated across
    every ChannelUrl pointing at the Url, packed into a search.Document
    and indexed; on success document_date is stamped on the entity.
    """
    doc_id = self.request.get('doc_id', '')
    if not doc_id:
        return
    #logging.debug('doc_id: %s' % (doc_id))
    urlinstance = Url.get_by_id(int(doc_id))
    if not urlinstance:
        logging.debug('No urlinstance for doc_id: %s' % (doc_id))
        return
    if urlinstance.valid < 0:
        # If not valid url, delete from index
        doc_index = search.Index(name='url')
        logging.info(
            'Delete invalid (%s) url (ID %s) from document index \'url\' (%s)'
            % (str(urlinstance.valid), doc_id, doc_index))
        doc_index.delete(doc_id)
        return
    url = urlinstance.url
    title = urlinstance.title
    channels = []
    # Renamed from `users` so the loop no longer shadows the GAE users
    # module used elsewhere in this file.
    usernames = []
    comments = []
    tags = []
    rate = 0
    # Epoch default keeps DateField valid even when there are no posts.
    date = datetime.datetime.fromtimestamp(0)
    channelurlquery = ChannelUrl.query(ChannelUrl.url == urlinstance.key)
    for channelurlinstance in channelurlquery:
        channelinstance = channelurlinstance.channel.get()
        if channelinstance.name not in channels:
            channels.append(channelinstance.name)
        postquery = Post.query(Post.channelurl == channelurlinstance.key)
        for postinstance in postquery:
            if postinstance.user not in usernames:
                usernames.append(postinstance.user)
            # `date` starts truthy, so the old `if date:` branch and the
            # trailing `if not date:` re-initialization were dead code;
            # keeping only the max-date update.
            if date < postinstance.date:
                date = postinstance.date
        extraquery = Extra.query(Extra.channelurl == channelurlinstance.key)
        for extrainstance in extraquery:
            if extrainstance.tag and extrainstance.tag not in tags:
                tags.append(extrainstance.tag)
            if extrainstance.comment and extrainstance.comment not in comments:
                comments.append(extrainstance.comment)
        ratequery = Rate.query(Rate.channelurl == channelurlinstance.key)
        for rateinstance in ratequery:
            rate += rateinstance.value
    # lists to strings (empty tag/comment become None so the TextFields
    # are created with no value)
    channel = ' '.join(channels)
    # NOTE(review): this separator looks like a redaction artifact in the
    # source — confirm the intended user separator (probably ' ').
    user = '******'.join(usernames)
    tag = ' '.join(tags) or None
    # NOTE(review): separator reconstructed from a mangled literal in the
    # source (a quote split across lines) — confirm '\n' is intended.
    comment = '\n'.join(comments) or None
    logging.debug(
        'doc; channel=%s, user=%s, url=%s, date=%s, title=%s, comment=%s, tag=%s, rate=%s'
        % (channel, user, url, date, title, comment, tag, rate))
    try:
        doc = search.Document(
            doc_id=str(doc_id),
            fields=[
                search.TextField(name='channel', value=channel),
                search.TextField(name='user', value=user),
                search.TextField(name='url', value=url),
                search.DateField(name='date', value=date),
                search.TextField(name='title', value=title),
                search.TextField(name='comment', value=comment,
                                 language='fi'),
                search.TextField(name='tag', value=tag, language='fi'),
                search.NumberField(name='rate', value=rate)
            ],
            language='en')
    except Exception as e:
        # `except Exception, e` modernized to the `as` form (valid on
        # Python 2.6+ and 3.x).
        logging.error('doc_id: %s, error %s' % (str(doc_id), e))
        doc = None
    try:
        if doc:
            search.Index(name='url').put(doc)
            urlinstance.document_date = datetime.datetime.now()
            urlinstance.put()
        else:
            logging.error('Doc missing.')
    except search.Error:
        logging.error('Create Document failed.')