def post(self):
    url = self.request.get('url', None)
    if url is None:
        logging.info('no url, no recommendations')
        return
    url = url.encode('utf-8')
    logging.info('getting url hash %s' % url)
    url_hash = LinkUtils.getUrlHash(url)
    if url_hash is None:
        logging.error("can't determine url hash %s" % url)
        return
    try:
        l = Links.gql('WHERE url_hash = :1', url_hash).get()
        if l is None:
            l = Links.gql('WHERE url = :1', url).get()
    except Exception:
        l = None
    if l is None:
        logging.info('no link saved with url %s' % url)
        l = Links()
        l.url = url
        l.url_hash = url_hash
        l.put()
    api_call = 'http://api.zemanta.com/services/rest/0.0/'
    args = {'method': 'zemanta.suggest',
            'api_key': self.z_key,
            'text': url,
            'return_categories': 'dmoz',
            'format': 'json'}
    args_enc = urllib.urlencode(args)
    json = None
    result = None
    try:
        result = urlfetch.fetch(url=api_call,
                                payload=args_enc,
                                method=urlfetch.POST,
                                headers={'Content-Type': 'application/x-www-form-urlencoded'})
        json = simplejson.loads(result.content)
    except Exception:
        logging.info('bad json data from zemanta: %s' % result)
    if json is None or json['status'] != 'ok':
        logging.info('error while fetching recommendations')
        return
    articles = json['articles']
    # TODO: apply DMOZ categories
    categories = json['categories']
    #relevant_articles = [ (c["title"], c["url"]) for c in articles if c["confidence"] > 0.01 ]
    relevant_articles = [(c["title"], c["url"]) for c in articles]
    l.recommendation = str(simplejson.dumps(relevant_articles[0:4]))
    if l.url_hash is None:
        l.url_hash = url_hash
    l.put()
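# A minimal, self-contained sketch of the Zemanta call made above, assuming
# the same Python 2 / App Engine environment. The helper name
# fetch_zemanta_articles and its api_key parameter are hypothetical; the
# endpoint, POST parameters, and response shape ('status', 'articles')
# mirror the handler code above.
import urllib
import simplejson
from google.appengine.api import urlfetch

def fetch_zemanta_articles(api_key, page_url):
    """Return [(title, url), ...] suggested by zemanta.suggest, or None."""
    payload = urllib.urlencode({'method': 'zemanta.suggest',
                                'api_key': api_key,
                                'text': page_url,
                                'return_categories': 'dmoz',
                                'format': 'json'})
    try:
        result = urlfetch.fetch(url='http://api.zemanta.com/services/rest/0.0/',
                                payload=payload,
                                method=urlfetch.POST,
                                headers={'Content-Type': 'application/x-www-form-urlencoded'})
        data = simplejson.loads(result.content)
    except Exception:
        return None
    if data.get('status') != 'ok':
        return None
    return [(a['title'], a['url']) for a in data.get('articles', [])]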
def post(self):
    user = self.request.get('user', None)
    user = urllib.unquote(user)
    url = self.request.get('url', None)
    domain = self.request.get('domain', None)
    title = self.request.get('title', None)
    share_mode = self.request.get('share_mode', None)
    if not RequestUtils.checkUrl(url):
        logging.info('skipping since url is not good!')
        return
    lu = LinkUtils()
    link_info = lu.getLinkInfo(url)
    description = link_info["d"]
    embeded = link_info["e"]
    logging.info('got post title %s' % title)
    title_new = link_info["t"]
    if title is None and title_new is not None and len(title_new) > 0:
        title = title_new
    if title is None or title == 'None' or title == 'null':
        title = LinkUtils.getLinkTitle(url)
    if title is not None:
        title = title[:199]
    logging.info('final link title %s' % title)
    logging.info("link info desc: %s embedded: %s" % (description, embeded))
    version = self.request.get('version', None)
    client = self.request.get('client', None)
    selection = self.request.get('selection', None)
    user_agent = self.request.get('user_agent', None)
    UserScoreUtility.updateLinkScore(user, url)
    UserScoreUtility.updateDomainScore(user, domain)
    taskqueue.add(url='/user/badge/task', queue_name='badge-queue',
                  params={'url': url, 'domain': domain, 'user': user,
                          'version': version, 'client': client})
    taskqueue.add(url='/link/traction/task', queue_name='link-queue',
                  params={'url': url, 'user': user, 'title': title})
    taskqueue.add(url='/link/recommendation/task', queue_name='default',
                  params={'url': url})
    name = "url"
    generic_counter.increment(name)
    url_cnt = generic_counter.get_count(name)
    logging.info("total url count %s" % url_cnt)
    e = EncodeUtils()
    enbased = e.encode(url_cnt)
    url_encode26 = e.enbase(enbased)
    logging.info("url encode: %s and enbase: %s" % (enbased, url_encode26))
    url_hash = LinkUtils.getUrlHash(url)
    today = datetime.datetime.now().date()
    model = SessionModel.gql('WHERE instaright_account = :1 and url_hash = :2 and date > :3',
                             user, url_hash, today).get()
    new_entity = False
    if model is None:
        logging.info('did not find save defined by: %s %s for date %s', user, url, str(today))
        model = SessionModel()
        new_entity = True
    else:
        logging.info('existing url (key %s), updating certain params' % str(model.key()))
        logging.info('link: %s title: %s' % (url, title))
    try:
        # remove for local testing
        model.ip = self.request.remote_addr
        model.instaright_account = user
        model.date = datetime.datetime.now()
        if new_entity:
            model.url = url
            model.url_hash = url_hash
            model.url_counter_id = url_cnt
            model.url_encode26 = url_encode26
        model.title = title
        model.user_agent = user_agent
        model.domain = domain
        model.short_link = None
        model.feed_link = None
        model.version = version
        model.client = client
        model.selection = selection
        model.embeded = embeded
        # retry the datastore write with exponential backoff; the initial
        # timeout must be set before the loop so the wait actually doubles
        timeout_ms = 100
        while True:
            try:
                model.put()
                break
            except datastore_errors.Timeout:
                logging.info('model save timeout, retrying in %s ms' % timeout_ms)
                time.sleep(timeout_ms / 1000.0)  # sleep takes seconds
                timeout_ms *= 2
        logging.info('send link: url_hash %s title %s user_id %s updated %s client: %s'
                     % (model.url_hash, model.title, str(model.key()), str(model.date), model.client))
    except (BadValueError, apiproxy_errors.DeadlineExceededError):
        e0, e1 = sys.exc_info()[0], sys.exc_info()[1]
        logging.error('error while saving url %s (%s, %s)' % (url, e0, e1))
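# The put-with-backoff loop above is repeated inline in several handlers;
# a small reusable sketch of the same pattern, assuming the App Engine
# datastore exceptions used above. The helper name retry_put and the
# max_ms cap are hypothetical.
import time
from google.appengine.api import datastore_errors

def retry_put(entity, initial_ms=100, max_ms=1600):
    """Retry entity.put() on datastore timeouts, doubling the wait each time."""
    timeout_ms = initial_ms
    while True:
        try:
            entity.put()
            return
        except datastore_errors.Timeout:
            if timeout_ms > max_ms:
                raise  # out of retries; let the caller log and handle it
            time.sleep(timeout_ms / 1000.0)
            timeout_ms *= 2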
def update_link(self, url, link):
    existingLink = None
    url_hash = LinkUtils.getUrlHash(url)
    link.url_hash = url_hash
    # qfix for title TODO: find proper solution
    if link.title is not None:
        link.title = link.title.strip()[:199]
    try:
        existingLink = Links.gql("WHERE url_hash = :1", url_hash).get()
        if existingLink is None:
            existingLink = Links.gql("WHERE url = :1", url).get()
    except Exception:
        logging.info("bad value for url %s" % url)
    if existingLink is not None:
        existingLink.date_updated = link.date_updated
        existingLink.influence_score = link.influence_score
        existingLink.instapaper_count = link.instapaper_count
        existingLink.instaright_count = link.instaright_count
        existingLink.redditups = link.redditups
        existingLink.redditdowns = link.redditdowns
        existingLink.tweets = link.tweets
        existingLink.diggs = link.diggs
        existingLink.excerpt = link.excerpt
        existingLink.categories = link.categories
        existingLink.delicious_count = link.delicious_count
        existingLink.facebook_like = link.facebook_like
        existingLink.domain = link.domain
        if existingLink.url_hash is None:
            existingLink.url_hash = url_hash
        if link.title is not None:
            existingLink.title = link.title.strip()[:199]
        # mark as unshared again if the score increased by at least 20%
        # (float() avoids Python 2 integer division)
        if (existingLink.overall_score is None
                or existingLink.overall_score == 0
                or link.overall_score / float(existingLink.overall_score) >= 1.2):
            existingLink.shared = False
        existingLink.overall_score = link.overall_score
        existingLink.put()
    else:
        # greater probability of db timeout for new links, so retry with backoff
        try:
            timeout_ms = 100
            while True:
                try:
                    link.put()
                    break
                except datastore_errors.Timeout:
                    time.sleep(timeout_ms / 1000.0)
                    timeout_ms *= 2
        except apiproxy_errors.DeadlineExceededError:
            logging.info("ran out of retries for writing to db")
    logging.info("url %s : influence_score %s, instapaper_count %s, redditups %s, "
                 "redditdowns %s, tweets %s, diggs %s, delicious count %s facebook like %s"
                 % (url, link.influence_score, link.instapaper_count, link.redditups,
                    link.redditdowns, link.tweets, link.diggs, link.delicious_count,
                    link.facebook_like))
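# The re-share rule above clears the shared flag when a link's score grows
# by 20% or more. A standalone sketch of that predicate (score_jumped is a
# hypothetical name): e.g. old score 100 and new score 125 gives
# 125 / 100.0 = 1.25 >= 1.2, so the link becomes eligible for sharing again.
def score_jumped(old_score, new_score, threshold=1.2):
    """True when new_score is at least `threshold` times old_score."""
    if old_score is None or old_score == 0:
        return True
    return new_score / float(old_score) >= threshold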
def post(self):
    url = self.request.get('url', None)
    url_hash = LinkUtils.getUrlHash(url)
    if url is None:
        logging.info('no link in request. skipping')
        return
    category_api = ('http://access.alchemyapi.com/calls/url/URLGetCategory'
                    '?apikey=%s&url=%s&outputMode=json'
                    % (self.alchemy_key, urllib2.quote(url.encode('utf-8'))))
    logging.info('trying to fetch shared count info %s' % category_api)
    link = None
    language = None
    category = None
    try:
        link = Links.gql('WHERE url_hash = :1', url_hash).get()
        if link is None:
            link = Links.gql('WHERE url = :1', url).get()
    except BadValueError:
        logging.info('url property too long')
    if link is None:
        link = Links()
    else:
        link.date_updated = datetime.datetime.now().date()
    json = LinkUtils.getJsonFromApi(category_api)
    if json is None:
        logging.info('alchemy api returned no category. skipping')
        return
    try:
        language = json['language']
        category = json['category']
        score = Cast.toFloat(json['score'], 0)
        if score is not None and score > 0.5 and category is not None:
            logging.info('category %s score %s' % (category, score))
            cats = category.split("_")
            if cats is None:
                logging.info('no categories. exit')
                return
            memcache_key = url_hash + '_category'
            current_categories = memcache.get(memcache_key)
            merge_cat = []
            if current_categories is not None:
                logging.info('merging with existing cats %s' % current_categories)
                merge_cat.extend(current_categories)
                merge_cat.extend(cats)
            else:
                merge_cat = cats
            model = None
            try:
                # query filters on url_hash, so bind url_hash (not url)
                model = SessionModel.gql('WHERE url_hash = :1 order by date desc', url_hash).get()
                if model is None:
                    model = SessionModel.gql('WHERE url = :1 order by date desc', url).get()
            except BadValueError:
                logging.info('url too long ... %s' % url)
            if model is None:
                logging.info('model not defined ... skipping')
                return
            linkDetail = Links.gql('WHERE url_hash = :1', url_hash).get()
            if linkDetail is None:
                linkDetail = Links.gql('WHERE url = :1', url).get()
            if linkDetail is not None and linkDetail.categories is not None:
                logging.info('category found from link details %s' % linkDetail.categories)
                # categories is stored as the repr of a list
                delic_cats = eval(linkDetail.categories)
                d_cats = [c for c in delic_cats]
                merge_cat.extend(d_cats)
            merge_cat = set(merge_cat)
            logging.info('caching cats %s for url %s' % (merge_cat, url))
            memcache.set(memcache_key, list(merge_cat)[:4])
            for c in merge_cat:
                taskqueue.add(queue_name='message-broadcast-queue', url='/category/stream',
                              params={'category': c, 'url': url_hash})
                existingLinkCat = LinkCategory.gql('WHERE url_hash = :1 and category = :2',
                                                   url_hash, c).get()
                if existingLinkCat is None:
                    existingLinkCat = LinkCategory.gql('WHERE url = :1 and category = :2',
                                                       url, c).get()
                if existingLinkCat is not None:
                    existingLinkCat.updated = datetime.datetime.now()
                    if existingLinkCat.url_hash is None:
                        existingLinkCat.url_hash = url_hash
                    existingLinkCat.put()
                    logging.info('updated existing url(%s) category(%s) update time %s'
                                 % (url, c, existingLinkCat.updated))
                else:
                    logging.info('new pair: url(%s) category(%s)' % (url, c))
                    linkCategory = LinkCategory()
                    linkCategory.url = url
                    linkCategory.url_hash = url_hash
                    linkCategory.category = c
                    if model is not None:
                        linkCategory.model_details = model.key()
                    linkCategory.put()
        if language is not None:
            link.language = language
        link.url = url
        link.url_hash = url_hash
        link.put()
    except KeyError:
        e0, e1 = sys.exc_info()[0], sys.exc_info()[1]
        logging.info('key error [[%s, %s]] in %s' % (e0, e1, json))
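# The handler above merges three category sources (memcached categories,
# the Alchemy "_"-separated category, and the stored delicious categories)
# into one deduplicated, capped set. A sketch of that merge in isolation;
# merge_categories is a hypothetical name, and ast.literal_eval stands in
# for the eval() used above, which is safer for parsing a stored repr string.
import ast

def merge_categories(cached_cats, alchemy_category, stored_repr, cap=4):
    """Combine category sources into a deduplicated list of at most `cap` items."""
    merged = list(cached_cats or [])
    merged.extend(alchemy_category.split('_'))
    if stored_repr:
        merged.extend(ast.literal_eval(stored_repr))  # e.g. "['python', 'gae']"
    return list(set(merged))[:cap]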
def post(self): url = self.request.get("url", None) url_hash = LinkUtils.getUrlHash(url) user = self.request.get("user", None) title = self.request.get("title", None) if url is None: logging.info("no url detected. skipping...") return count = 1 url = urllib2.unquote(url) domain = RequestUtils.getDomain(url) if not domain or len(domain) == 0: self.response.out.write("not url: %s skipping!\n" % url) return if domain in self.skip_domains: logging.info("filering out %s" % url) return lu = LinkUtils() link = lu.getAllData(url, count) logging.info("link overall score: %s" % link.overall_score) existingLink = None try: existingLink = Links.gql("WHERE url_hash = :1", url_hash).get() if existingLink is None: existingLink = Links.gql("WHERE url = :1", url).get() except BadValueError: logging.info("bad value url %s" % url) klout_score = UserUtils.getKloutScore(user, self.klout_api_key) share_margin = self.tw_margin if klout_score is not None: link.overall_score = link.overall_score * int(klout_score) logging.info("adjusted overall score %s" % link.overall_score) share_margin = share_margin * self.klout_correction logging.info("adjusting twit margin: %s" % share_margin) logging.info("link score %s tweet margin %s ( existing %s )" % (link.overall_score, share_margin, existingLink)) if link.overall_score > share_margin and (existingLink is None or not existingLink.shared): t = Twit() t.generate_content(link, title, "") # skip tweets is text emtpy and for root domains if t.text is None or LinkUtils.isRootDomain(link.url): logging.info("twit with no body. aborting") return execute_time = TaskUtil.execution_time() logging.info("scheduling tweet for %s" % str(execute_time)) mail.send_mail( sender="*****@*****.**", to="*****@*****.**", subject="Twit to queue!", html="Twitt: %s <br> score: %s" % (t.text, link.overall_score), body="Twitt: %s <br> score: %s" % (t.text[:500], link.overall_score), ) # taskqueue.add(url='/util/twitter/twit/task', eta=execute_time, queue_name='twit-queue', params={'twit':t.text}) taskqueue.add(url="/util/twitter/twit/task", queue_name="twit-queue", params={"twit": t.text}) # update article shared status if existingLink is not None: existingLink.shared = True existingLink.put() logging.info("updated link share status") else: logging.info("not scheduled for tweeting") lh = LinkHandler() lh.update_link(url, link)