def post(self):
    url = self.request.get('url', None)
    if url is None:
        logging.info('no url, no recommendations')
        return
    url = url.encode('utf-8')
    logging.info('getting url hash %s' % url)
    url_hash = LinkUtils.getUrlHash(url)
    if url_hash is None:
        logging.error("can't determine url hash %s" % url)
        return
    try:
        l = Links.gql('WHERE url_hash = :1', url_hash).get()
        if l is None:
            l = Links.gql('WHERE url = :1', url).get()
    except Exception:
        l = None
    if l is None:
        logging.info('no link saved with url %s' % url)
        l = Links()
        l.url = url
        l.url_hash = url_hash
        l.put()
    api_call = 'http://api.zemanta.com/services/rest/0.0/'
    args = {'method': 'zemanta.suggest',
            'api_key': self.z_key,
            'text': url,
            'return_categories': 'dmoz',
            'format': 'json'}
    args_enc = urllib.urlencode(args)
    json = None
    result = None
    try:
        result = urlfetch.fetch(url=api_call,
                                payload=args_enc,
                                method=urlfetch.POST,
                                headers={'Content-Type': 'application/x-www-form-urlencoded'})
        json = simplejson.loads(result.content)
    except Exception:
        logging.info('bad json data from zemanta: %s' % result)
    if json is None or json['status'] != 'ok':
        logging.info('error while fetching recommendations')
        return
    articles = json['articles']
    # TODO: apply DMOZ categories
    categories = json['categories']
    #relevant_articles = [ (c["title"], c["url"]) for c in articles if c["confidence"] > 0.01 ]
    relevant_articles = [(c["title"], c["url"]) for c in articles]
    l.recommendation = str(simplejson.dumps(relevant_articles[0:4]))
    if l.url_hash is None:
        l.url_hash = url_hash
    l.put()
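# A minimal, self-contained sketch of the Zemanta call made above, assuming
# the same Python 2 / App Engine environment. The helper name
# fetch_zemanta_articles and its api_key parameter are hypothetical; the
# endpoint, POST parameters, and response shape ('status', 'articles')
# mirror the handler code above.
import urllib
import simplejson
from google.appengine.api import urlfetch

def fetch_zemanta_articles(api_key, page_url):
    """Return [(title, url), ...] suggested by zemanta.suggest, or None."""
    payload = urllib.urlencode({'method': 'zemanta.suggest',
                                'api_key': api_key,
                                'text': page_url,
                                'return_categories': 'dmoz',
                                'format': 'json'})
    try:
        result = urlfetch.fetch(url='http://api.zemanta.com/services/rest/0.0/',
                                payload=payload,
                                method=urlfetch.POST,
                                headers={'Content-Type': 'application/x-www-form-urlencoded'})
        data = simplejson.loads(result.content)
    except Exception:
        return None
    if data.get('status') != 'ok':
        return None
    return [(a['title'], a['url']) for a in data.get('articles', [])]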
def post(self):
    user = self.request.get('user', None)
    user = urllib.unquote(user)
    url = self.request.get('url', None)
    domain = self.request.get('domain', None)
    title = self.request.get('title', None)
    share_mode = self.request.get('share_mode', None)
    if not RequestUtils.checkUrl(url):
        logging.info('skipping since url is not good!')
        return
    lu = LinkUtils()
    link_info = lu.getLinkInfo(url)
    description = link_info["d"]
    embeded = link_info["e"]
    logging.info('got post title %s' % title)
    title_new = link_info["t"]
    if title is None and title_new is not None and len(title_new) > 0:
        title = title_new
    if title is None or title == 'None' or title == 'null':
        title = LinkUtils.getLinkTitle(url)
    if title is not None:
        title = title[:199]
    logging.info('final link title %s' % title)
    logging.info("link info desc: %s embedded: %s" % (description, embeded))
    version = self.request.get('version', None)
    client = self.request.get('client', None)
    selection = self.request.get('selection', None)
    user_agent = self.request.get('user_agent', None)
    UserScoreUtility.updateLinkScore(user, url)
    UserScoreUtility.updateDomainScore(user, domain)
    taskqueue.add(url='/user/badge/task', queue_name='badge-queue',
                  params={'url': url, 'domain': domain, 'user': user,
                          'version': version, 'client': client})
    taskqueue.add(url='/link/traction/task', queue_name='link-queue',
                  params={'url': url, 'user': user, 'title': title})
    taskqueue.add(url='/link/recommendation/task', queue_name='default',
                  params={'url': url})
    name = "url"
    generic_counter.increment(name)
    url_cnt = generic_counter.get_count(name)
    logging.info("total url count %s" % url_cnt)
    e = EncodeUtils()
    enbased = e.encode(url_cnt)
    url_encode26 = e.enbase(enbased)
    logging.info("url encode: %s and enbase: %s" % (enbased, url_encode26))
    url_hash = LinkUtils.getUrlHash(url)
    today = datetime.datetime.now().date()
    model = SessionModel.gql('WHERE instaright_account = :1 and url_hash = :2 and date > :3',
                             user, url_hash, today).get()
    new_entity = False
    if model is None:
        logging.info('did not find save defined by: %s %s for date %s', user, url, str(today))
        model = SessionModel()
        new_entity = True
    else:
        logging.info('existing url (key %s), updating certain params' % str(model.key()))
        logging.info('link: %s title: %s' % (url, title))
    try:
        # remove for local testing
        model.ip = self.request.remote_addr
        model.instaright_account = user
        model.date = datetime.datetime.now()
        if new_entity:
            model.url = url
            model.url_hash = url_hash
            model.url_counter_id = url_cnt
            model.url_encode26 = url_encode26
        model.title = title
        model.user_agent = user_agent
        model.domain = domain
        model.short_link = None
        model.feed_link = None
        model.version = version
        model.client = client
        model.selection = selection
        model.embeded = embeded
        # retry the datastore write with exponential backoff; the initial
        # timeout must be set before the loop so the wait actually doubles
        timeout_ms = 100
        while True:
            try:
                model.put()
                break
            except datastore_errors.Timeout:
                logging.info('model save timeout, retrying in %s ms' % timeout_ms)
                time.sleep(timeout_ms / 1000.0)  # sleep takes seconds
                timeout_ms *= 2
        logging.info('send link: url_hash %s title %s user_id %s updated %s client: %s'
                     % (model.url_hash, model.title, str(model.key()), str(model.date), model.client))
    except (BadValueError, apiproxy_errors.DeadlineExceededError):
        e0, e1 = sys.exc_info()[0], sys.exc_info()[1]
        logging.error('error while saving url %s (%s, %s)' % (url, e0, e1))
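# The put-with-backoff loop above is repeated inline in several handlers;
# a small reusable sketch of the same pattern, assuming the App Engine
# datastore exceptions used above. The helper name retry_put and the
# max_ms cap are hypothetical.
import time
from google.appengine.api import datastore_errors

def retry_put(entity, initial_ms=100, max_ms=1600):
    """Retry entity.put() on datastore timeouts, doubling the wait each time."""
    timeout_ms = initial_ms
    while True:
        try:
            entity.put()
            return
        except datastore_errors.Timeout:
            if timeout_ms > max_ms:
                raise  # out of retries; let the caller log and handle it
            time.sleep(timeout_ms / 1000.0)
            timeout_ms *= 2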
def update_link(self, url, link):
    existingLink = None
    url_hash = LinkUtils.getUrlHash(url)
    link.url_hash = url_hash
    # qfix for title TODO: find proper solution
    if link.title is not None:
        link.title = link.title.strip()[:199]
    try:
        existingLink = Links.gql("WHERE url_hash = :1", url_hash).get()
        if existingLink is None:
            existingLink = Links.gql("WHERE url = :1", url).get()
    except Exception:
        logging.info("bad value for url %s" % url)
    if existingLink is not None:
        existingLink.date_updated = link.date_updated
        existingLink.influence_score = link.influence_score
        existingLink.instapaper_count = link.instapaper_count
        existingLink.instaright_count = link.instaright_count
        existingLink.redditups = link.redditups
        existingLink.redditdowns = link.redditdowns
        existingLink.tweets = link.tweets
        existingLink.diggs = link.diggs
        existingLink.excerpt = link.excerpt
        existingLink.categories = link.categories
        existingLink.delicious_count = link.delicious_count
        existingLink.facebook_like = link.facebook_like
        existingLink.domain = link.domain
        if existingLink.url_hash is None:
            existingLink.url_hash = url_hash
        if link.title is not None:
            existingLink.title = link.title.strip()[:199]
        # mark as unshared again if the score increased by at least 20%
        # (float() avoids Python 2 integer division)
        if (existingLink.overall_score is None
                or existingLink.overall_score == 0
                or link.overall_score / float(existingLink.overall_score) >= 1.2):
            existingLink.shared = False
        existingLink.overall_score = link.overall_score
        existingLink.put()
    else:
        # greater probability of db timeout for new links, so retry with backoff
        try:
            timeout_ms = 100
            while True:
                try:
                    link.put()
                    break
                except datastore_errors.Timeout:
                    time.sleep(timeout_ms / 1000.0)
                    timeout_ms *= 2
        except apiproxy_errors.DeadlineExceededError:
            logging.info("ran out of retries for writing to db")
    logging.info("url %s : influence_score %s, instapaper_count %s, redditups %s, "
                 "redditdowns %s, tweets %s, diggs %s, delicious count %s facebook like %s"
                 % (url, link.influence_score, link.instapaper_count, link.redditups,
                    link.redditdowns, link.tweets, link.diggs, link.delicious_count,
                    link.facebook_like))
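# The re-share rule above clears the shared flag when a link's score grows
# by 20% or more. A standalone sketch of that predicate (score_jumped is a
# hypothetical name): e.g. old score 100 and new score 125 gives
# 125 / 100.0 = 1.25 >= 1.2, so the link becomes eligible for sharing again.
def score_jumped(old_score, new_score, threshold=1.2):
    """True when new_score is at least `threshold` times old_score."""
    if old_score is None or old_score == 0:
        return True
    return new_score / float(old_score) >= threshold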
def post(self):
    url = self.request.get('url', None)
    url_hash = LinkUtils.getUrlHash(url)
    if url is None:
        logging.info('no link in request. skipping')
        return
    category_api = ('http://access.alchemyapi.com/calls/url/URLGetCategory'
                    '?apikey=%s&url=%s&outputMode=json'
                    % (self.alchemy_key, urllib2.quote(url.encode('utf-8'))))
    logging.info('trying to fetch shared count info %s' % category_api)
    link = None
    language = None
    category = None
    try:
        link = Links.gql('WHERE url_hash = :1', url_hash).get()
        if link is None:
            link = Links.gql('WHERE url = :1', url).get()
    except BadValueError:
        logging.info('url property too long')
    if link is None:
        link = Links()
    else:
        link.date_updated = datetime.datetime.now().date()
    json = LinkUtils.getJsonFromApi(category_api)
    if json is None:
        logging.info('alchemy api returned no category. skipping')
        return
    try:
        language = json['language']
        category = json['category']
        score = Cast.toFloat(json['score'], 0)
        if score is not None and score > 0.5 and category is not None:
            logging.info('category %s score %s' % (category, score))
            cats = category.split("_")
            if cats is None:
                logging.info('no categories. exit')
                return
            memcache_key = url_hash + '_category'
            current_categories = memcache.get(memcache_key)
            merge_cat = []
            if current_categories is not None:
                logging.info('merging with existing cats %s' % current_categories)
                merge_cat.extend(current_categories)
                merge_cat.extend(cats)
            else:
                merge_cat = cats
            model = None
            try:
                # query filters on url_hash, so bind url_hash (not url)
                model = SessionModel.gql('WHERE url_hash = :1 order by date desc', url_hash).get()
                if model is None:
                    model = SessionModel.gql('WHERE url = :1 order by date desc', url).get()
            except BadValueError:
                logging.info('url too long ... %s' % url)
            if model is None:
                logging.info('model not defined ... skipping')
                return
            linkDetail = Links.gql('WHERE url_hash = :1', url_hash).get()
            if linkDetail is None:
                linkDetail = Links.gql('WHERE url = :1', url).get()
            if linkDetail is not None and linkDetail.categories is not None:
                logging.info('category found from link details %s' % linkDetail.categories)
                # categories is stored as the repr of a list
                delic_cats = eval(linkDetail.categories)
                d_cats = [c for c in delic_cats]
                merge_cat.extend(d_cats)
            merge_cat = set(merge_cat)
            logging.info('caching cats %s for url %s' % (merge_cat, url))
            memcache.set(memcache_key, list(merge_cat)[:4])
            for c in merge_cat:
                taskqueue.add(queue_name='message-broadcast-queue', url='/category/stream',
                              params={'category': c, 'url': url_hash})
                existingLinkCat = LinkCategory.gql('WHERE url_hash = :1 and category = :2',
                                                   url_hash, c).get()
                if existingLinkCat is None:
                    existingLinkCat = LinkCategory.gql('WHERE url = :1 and category = :2',
                                                       url, c).get()
                if existingLinkCat is not None:
                    existingLinkCat.updated = datetime.datetime.now()
                    if existingLinkCat.url_hash is None:
                        existingLinkCat.url_hash = url_hash
                    existingLinkCat.put()
                    logging.info('updated existing url(%s) category(%s) update time %s'
                                 % (url, c, existingLinkCat.updated))
                else:
                    logging.info('new pair: url(%s) category(%s)' % (url, c))
                    linkCategory = LinkCategory()
                    linkCategory.url = url
                    linkCategory.url_hash = url_hash
                    linkCategory.category = c
                    if model is not None:
                        linkCategory.model_details = model.key()
                    linkCategory.put()
        if language is not None:
            link.language = language
        link.url = url
        link.url_hash = url_hash
        link.put()
    except KeyError:
        e0, e1 = sys.exc_info()[0], sys.exc_info()[1]
        logging.info('key error [[%s, %s]] in %s' % (e0, e1, json))
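# The handler above merges three category sources (memcached categories,
# the Alchemy "_"-separated category, and the stored delicious categories)
# into one deduplicated, capped set. A sketch of that merge in isolation;
# merge_categories is a hypothetical name, and ast.literal_eval stands in
# for the eval() used above, which is safer for parsing a stored repr string.
import ast

def merge_categories(cached_cats, alchemy_category, stored_repr, cap=4):
    """Combine category sources into a deduplicated list of at most `cap` items."""
    merged = list(cached_cats or [])
    merged.extend(alchemy_category.split('_'))
    if stored_repr:
        merged.extend(ast.literal_eval(stored_repr))  # e.g. "['python', 'gae']"
    return list(set(merged))[:cap]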
def post(self): url = self.request.get("url", None) url_hash = LinkUtils.getUrlHash(url) user = self.request.get("user", None) title = self.request.get("title", None) if url is None: logging.info("no url detected. skipping...") return count = 1 url = urllib2.unquote(url) domain = RequestUtils.getDomain(url) if not domain or len(domain) == 0: self.response.out.write("not url: %s skipping!\n" % url) return if domain in self.skip_domains: logging.info("filering out %s" % url) return lu = LinkUtils() link = lu.getAllData(url, count) logging.info("link overall score: %s" % link.overall_score) existingLink = None try: existingLink = Links.gql("WHERE url_hash = :1", url_hash).get() if existingLink is None: existingLink = Links.gql("WHERE url = :1", url).get() except BadValueError: logging.info("bad value url %s" % url) klout_score = UserUtils.getKloutScore(user, self.klout_api_key) share_margin = self.tw_margin if klout_score is not None: link.overall_score = link.overall_score * int(klout_score) logging.info("adjusted overall score %s" % link.overall_score) share_margin = share_margin * self.klout_correction logging.info("adjusting twit margin: %s" % share_margin) logging.info("link score %s tweet margin %s ( existing %s )" % (link.overall_score, share_margin, existingLink)) if link.overall_score > share_margin and (existingLink is None or not existingLink.shared): t = Twit() t.generate_content(link, title, "") # skip tweets is text emtpy and for root domains if t.text is None or LinkUtils.isRootDomain(link.url): logging.info("twit with no body. aborting") return execute_time = TaskUtil.execution_time() logging.info("scheduling tweet for %s" % str(execute_time)) mail.send_mail( sender="*****@*****.**", to="*****@*****.**", subject="Twit to queue!", html="Twitt: %s <br> score: %s" % (t.text, link.overall_score), body="Twitt: %s <br> score: %s" % (t.text[:500], link.overall_score), ) # taskqueue.add(url='/util/twitter/twit/task', eta=execute_time, queue_name='twit-queue', params={'twit':t.text}) taskqueue.add(url="/util/twitter/twit/task", queue_name="twit-queue", params={"twit": t.text}) # update article shared status if existingLink is not None: existingLink.shared = True existingLink.put() logging.info("updated link share status") else: logging.info("not scheduled for tweeting") lh = LinkHandler() lh.update_link(url, link)