def get(self): id = self.request.get('id') if id: url = Url.get_by_id(int(id)) if url: try: prefix = self.request.get('prefix', '') name = str(prefix) + str(id) taskqueue.add(name=name, queue_name='urlfetch', url='/tasks/valid', params={'id': id}) except taskqueue.TombstonedTaskError: logging.warning('TombstonedTaskError %s' % name) except taskqueue.TaskAlreadyExistsError: logging.warning('TaskAlreadyExistsError %s' % name) else: # Routine check url_keys = Url.query().order(Url.last_check, Url.status).fetch(50, keys_only=True) for key in url_keys: id = key.id() try: prefix = self.request.get('prefix', '') name = str(prefix) + str(id) + '_rc' taskqueue.add(name=name, queue_name='urlfetch', url='/tasks/valid', params={'id': id}) except taskqueue.TombstonedTaskError: logging.warning('TombstonedTaskError %s' % name) except taskqueue.TaskAlreadyExistsError: logging.warning('TaskAlreadyExistsError %s' % name) # Fix <missing> fields: document_date, last_check (valid) url_keys = Url.query(Url.status == None).order(Url.idate).fetch( 50, keys_only=True) for key in url_keys: id = key.id() try: prefix = self.request.get('prefix', '') name = str(prefix) + str(id) + '_fix' taskqueue.add(name=name + '_dd', queue_name='document', url='/tasks/update_document', params={'doc_id': id}) taskqueue.add(name=name + '_lc', queue_name='urlfetch', url='/tasks/valid', params={'id': id}) except taskqueue.TombstonedTaskError: logging.warning('TombstonedTaskError %s' % name) except taskqueue.TaskAlreadyExistsError: logging.warning('TaskAlreadyExistsError %s' % name) redirect = self.request.get('redirect') if redirect: return self.redirect(redirect)
def view_master(request, urlid): data = [] logging.debug('View (Master)Url %s' % (urlid)) try: urlid = int(urlid) except: pass url = Url.get_by_id(urlid) if url: channelurls = ChannelUrl.query(ChannelUrl.url == url.key) for channelurl in channelurls: channel = channelurl.channel.get() if channel.private == False: extras = Extra.query(Extra.channelurl == channelurl.key) rates = Rate.query(Rate.channelurl == channelurl.key) rating = channelurl.rating() #data.append({'channel':channel,'post':post,'url':url,'extras': extras}) data.append({ 'channel': channel, 'channelurl': channelurl, 'post': post, 'url': url, 'extras': extras, 'rates': rates, 'rating': rating }) template_values = { 'data': data, 'user': users.get_current_user(), } return render_to_response('masterurl.html', template_values)
def get(self): doc_id = self.request.get('doc_id', '') if doc_id: try: prefix = self.request.get('prefix', '') name = str(prefix) + str(doc_id) taskqueue.add(name=name, queue_name='document', url='/tasks/update_document', params={'doc_id': doc_id}) except taskqueue.TombstonedTaskError: logging.warning('TombstonedTaskError %s' % (name)) except taskqueue.TaskAlreadyExistsError: logging.warning('TaskAlreadyExistsError %s' % (name)) else: doc_ids = [] url_keys = Url.query().order(Url.document_date).fetch( 100, keys_only=True) for key in url_keys: doc_id = key.id() try: prefix = self.request.get('prefix', '') name = str(prefix) + str(doc_id) taskqueue.add(name=str(doc_id), queue_name='document', url='/tasks/update_document', params={'doc_id': str(doc_id)}) except taskqueue.TombstonedTaskError: logging.warning('TombstonedTaskError %s' % (str(name))) except taskqueue.TaskAlreadyExistsError: logging.warning('TaskAlreadyExistsError %s' % (str(name)))
def post(self): id = self.request.get('id') if id: url = Url.get_by_id(int(id)) if url: result = None try: result = urlfetch.fetch(url.url, allow_truncated=True) except urlfetch.DownloadError: url.status = 'DE' logging.info('DownloadError, url: %s' % (url.url)) except urlfetch.ResponseTooLargeError: url.status = 'RTL' logging.info('ResponseTooLargeError, url: %s' % (url.url)) except urlfetch.InvalidURLError: url.status = 'IUE' logging.info('InvalidURLError, url: %s' % (url.url)) except: url.status = 'UE' logging.error('"Unexpected error: %s, url: %s' % (sys.exc_info()[0], url.url)) if result: if result.content_was_truncated: logging.debug('truncated') if result.status_code: url.status = str(result.status_code) if result and result.status_code and result.status_code == 200: url.valid = 2 else: if url.valid > -5: url.valid = url.valid - 1 else: logging.info('Broken url: %s' % (url.url)) url.last_check = datetime.datetime.now() url.put()
def post(self): post_channel = self.request.get('post_channel', '') post_user = self.request.get('post_user', '') post_url = self.request.get('post_url', '') post_extra = self.request.get('extra', '') # Related/tag if post_extra.startwith('#'): # TODO: check if post_extra[1:].isdigit(): type = 'related' else: type = 'tag' else: type = 'comment' url = Url.all().filter('url =', post_url).get() channel = Channel.all().filter('name =', post_channel).get() channelurl = ChannelUrl.all().filter('channel =', channel).filter('url =', url).get() if channelurl: extra = Extra() extra.channelurl = channelurl extra.user = post_user setattr(extra, type, post_extra) extra.put() else: logging.warning('ChannelUrl not found: %s %s' % (post_channel, post_url))
def post(self): id = self.request.get('id') if id: url = Url.get_by_id(int(id)) if url: # TODO: fetch title #try: req = urllib2.Request(url.url) #logging.debug('req %s' % (req)) #req.add_header('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11') req.add_header( 'User-agent', 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' ) req.add_header( 'Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' ) req.add_header('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3') req.add_header('Accept-Encoding', 'none') req.add_header('Accept-Language', 'en-US,en;q=0.8') req.add_header('Connection', 'keep-alive') res = urllib2.urlopen(req) #logging.debug('res %s' % (res)) doc = res.read() #logging.debug('doc %s' % (doc)) encoding = res.headers.getparam('charset') logging.debug('encoding %s' % (encoding)) try: tree = etree.fromstring( doc, etree.HTMLParser(encoding=encoding)) except LookupError: tree = etree.fromstring(doc, etree.HTMLParser(encoding='utf-8')) title = tree.find(".//title").text logging.debug('title %s' % (title)) url.title = smart_unicode(re.sub(r'\s+', ' ', title).strip()) #except: # logging.debug('TitleTask: title not fetched %s' % (post_url)) #url.title = post_url url.put()
def post(self): post_channel = self.request.get('post_channel', '') post_user = self.request.get('post_user', '') post_url = self.request.get('post_url', '') type = self.request.get('type', '') url = Url.all().filter('url =', post_url).get() channel = Channel.all().filter('name =', post_channel).get() channelurl = ChannelUrl.all().filter('channel =', channel).filter('url =', url).get() if channelurl: rate = Rate() rate.channelurl = channelurl rate.user = post_user rate.type = type rate.put() else: logging.warning('ChannelUrl not found: %s %s' % (post_channel, post_url))
def __unicode__(self): if self.link: if self.link_text: return '%s: <a target="_blank" href="%s">%s</a>' % ( self.content, self.link, self.link_text) else: text = '' comments = [] url = Url.query(Url.url == self.link).get(keys_only=True) channel_url = ChannelUrl.query(ChannelUrl.url == url).get( keys_only=True) extra = Extra.query(Extra.channelurl == channel_url).get( keys_only=False) if extra and extra.comment: text = extra.comment if not text: text = ''.join(self.link.split('/')[-1:]) return '%s: <a target="_blank" href="%s">%s</a>' % ( self.content, self.link, text) return self.content
def get(self): id = self.request.get('id') if id: url = Url.get_by_id(int(id)) if url: try: prefix = self.request.get('prefix', '') name = str(prefix) + str(id) taskqueue.add(name=name, queue_name='urlfetch', url='/tasks/title', params={'id': id}) except taskqueue.TombstonedTaskError: logging.warning('TombstonedTaskError %s' % id) except taskqueue.TaskAlreadyExistsError: logging.warning('TaskAlreadyExistsError %s' % id) else: logging.info('No URL') else: logging.info('No id') redirect = self.request.get('redirect') if redirect: return self.redirect(redirect)
# Add http:// when needed if not url.lower().startswith('http'): url='http://'+url #logging.info('Url/API/Post: Channel=%s User=%s Url=%s' % (channel,user,url)) # Fetch url (async): # a) check statuscode LATER # b) get title LATER rpc = urlfetch.create_rpc() urlfetch.make_fetch_call(rpc, url,allow_truncated=True) # Get url from DB: # a) already exists # b) ChannelCheck # 1. tarkista onko olemassa jo ko. Url, lisää jos ei, muuten päivitä (udate, valid?): valid-juttu joo ehkä jos tarpeen, ei muuten urlquery=Url.query(Url.url==url) urlinstance=urlquery.get() if not urlinstance: urlinstance=Url(url=url) urlinstance.put() #logging.debug('New url %s' % (url)) else: logging.info('Old url %s' % (url)) # 2. tarkista onko olemassa jo ko. Channel, lisää jos ei channelquery=Channel.query(Channel.name==channel) channelinstance=channelquery.get() if not channelinstance: if channel.startswith('#'): private=False else:
def post(self): doc_id = self.request.get('doc_id', '') if doc_id: #logging.debug('doc_id: %s' % (doc_id)) urlinstance = Url.get_by_id(int(doc_id)) if urlinstance: # If not valid url, delete from index if urlinstance.valid < 0: doc_index = search.Index(name='url') logging.info( 'Delete invalid (%s) url (ID %s) from document index \'url\' (%s)' % (str(urlinstance.valid), doc_id, doc_index)) doc_index.delete(doc_id) else: url = urlinstance.url title = urlinstance.title #logging.debug('url: %s, title: %s' % (url, title)) channels = [] channel = None users = [] user = None date = datetime.datetime.fromtimestamp(0) comments = [] comment = None tags = [] tag = None rate = 0 channelurlquery = ChannelUrl.query( ChannelUrl.url == urlinstance.key) for channelurlinstance in channelurlquery: channelinstance = channelurlinstance.channel.get() if channelinstance.name not in channels: channels.append(channelinstance.name) #logging.info('Adding channel %s' % (channelinstance.name)) postquery = Post.query( Post.channelurl == channelurlinstance.key) for postinstance in postquery: if postinstance.user not in users: users.append(postinstance.user) if date: if date < postinstance.date: date = postinstance.date else: date = postinstance.date extraquery = Extra.query( Extra.channelurl == channelurlinstance.key) for extrainstance in extraquery: if extrainstance.tag: if extrainstance.tag not in tags: tags.append(extrainstance.tag) #logging.info('Adding tag %s' % (extrainstance.tag)) if extrainstance.comment: if extrainstance.comment not in comments: comments.append(extrainstance.comment) #logging.info('Adding comment %s' % (extrainstance.comment)) ratequery = Rate.query( Rate.channelurl == channelurlinstance.key) for rateinstance in ratequery: rate += rateinstance.value #logging.debug('rate %s' % (rate)) if not date: date = datetime.datetime.fromtimestamp(0) # lists to strings channel = ' '.join(channels) user = '******'.join(users) tag = ' '.join(tags) if not tag: tag = None comment = ' '.join(comments) if not comment: comment = None logging.debug( 'doc; channel=%s, user=%s, url=%s, date=%s, title=%s, comment=%s, tag=%s, rate=%s' % (channel, user, url, date, title, comment, tag, rate)) try: doc = search.Document( doc_id=str(doc_id), fields=[ search.TextField(name='channel', value=channel), search.TextField(name='user', value=user), search.TextField(name='url', value=url), search.DateField(name='date', value=date), search.TextField(name='title', value=title), search.TextField(name='comment', value=comment, language='fi'), search.TextField(name='tag', value=tag, language='fi'), search.NumberField(name='rate', value=rate) ], language='en') except Exception, e: logging.error('doc_id: %s, error %s' % (str(doc_id), e)) doc = None try: if doc: search.Index(name='url').put(doc) urlinstance.document_date = datetime.datetime.now() urlinstance.put() else: logging.error('Doc missing.') except search.Error: logging.error('Create Document failed.') else: logging.debug('No urlinstance for doc_id: %s' % (doc_id))
def post(self): if users.get_current_user(): node.author = users.get_current_user() post_channel = self.request.get('post_channel', '') post_user = self.request.get('post_user', '') post_url = self.request.get('post_url', '') # Add http:// when needed if not post_url.startswith('http'): post_url = 'http://' + post_url logging.debug('Post: C=%s U=%s P=%s' % (post_channel, post_user, post_url)) # 1. tarkista onko olemassa jo ko. Url, lisää jos ei, muuten päivitä (udate, valid?): valid-juttu joo ehkä jos tarpeen, ei muuten url = Url.all().filter('url =', post_url).get() if not url: url = Url() url.url = post_url url.put() # Title name = ''.join( re.findall('[a-zA-Z0-9_-]', post_channel + '_' + post_url))[:500] try: taskqueue.add(name=name, queue_name='urlfetch', url='/tasks/title', params={'post_url': post_url}) except taskqueue.TombstonedTaskError: logging.warning('TombstonedError %s' % post_url) except taskqueue.TaskAlreadyExistsError: logging.warning('TaskAlredyExists: %s' % post_url) # 2. tarkista onko olemassa jo ko. Channel, lisää jos ei channel = Channel.all().filter('name =', post_channel).get() if not channel: channel = Channel() channel.name = post_channel if post_channel.startswith('!'): channel.private = True channel.put() # 3. tarkista onko url jo olemassa channel-tasolla channelurl = ChannelUrl.all().filter('url =', url).filter('channel =', channel).get() if not channelurl: channelurl = ChannelUrl() channelurl.channel = channel channelurl.url = url #channelurl.user=post_user channelurl.put() else: logging.info('OLDIE! %s %s' % (channelurl.channel.name, channelurl.url.url)) # 4. Lisätään postaus post = Post() post.channelurl = channelurl post.user = post_user post.put()