def doCron(videos):
    """Fetch YouTube's "on the web" standard feed and save any entries not
    already present in *videos*.

    Entries with no title, no watch link, or a missing thumbnail (either
    size) are skipped.
    """
    raw_data = http.getHttp("https://gdata.youtube.com/feeds/api/standardfeeds/on_the_web")
    # 'category' must be declared self-closing or BeautifulSoup nests the
    # following siblings inside it.
    soup = BeautifulSoup(raw_data, selfClosingTags=['category'])
    for entry in soup.findAll('entry'):
        # Fix: the original tested len(entry('title'))>0 twice (once in the
        # if, once in a redundant conditional expression), re-querying the
        # soup each time. Look the titles up once and guard early.
        titles = entry('title')
        if not titles:
            continue
        mykey = titles[0].text
        if not mykey or getVideo(videos, mykey):
            continue  # untitled entry, or already stored
        video = Video()
        video.title = titles[0].text
        video.mykey = mykey
        contents = entry('content')
        video.text = contents[0].text if contents else ''
        # BS3 exposes tag.attrs as a list of (name, value) pairs; the watch
        # URL is expected as the third attribute of the <link> element.
        links = entry(lambda tag: tag.name == 'link'
                      and tag.attrs[2][0] == 'href'
                      and '/watch?' in tag.attrs[2][1])
        if not links:
            continue
        video.link = links[0].attrs[2][1]
        imgs = entry('media:thumbnail', height="90", width="120")
        if not imgs:
            continue
        video.img = imgs[0].attrs[0][1]
        imgsBig = entry('media:thumbnail', height='360', width='480')
        if not imgsBig:
            continue
        video.imgBig = imgsBig[0].attrs[0][1]
        video.tags = getTags(entry)
        video.categories = getCategories(entry)
        video.save()
def doCron(stories):
    """Scrape Google's hot-trends page and build a story per trend phrase."""
    markup = http.getHttp("http://www.google.com/trends/hottrends?sa=X")
    scraper = HotTrendsScraper()
    scraper.feed(markup)
    # Each scraped trend string becomes (at most) one story.
    for trend in scraper.trends:
        buildStoryFromString(trend, stories)
def buildStoryFromString(data, stories):
    """Create and persist a Story for the phrase *data*, unless one already
    exists in *stories*.

    Googles the phrase, takes the first anchor pointing at an absolute,
    non-google URL (i.e. the first real search result), and builds the
    story's title/link/text from it. Download failures are logged and
    swallowed.
    """
    story = findStory(data, stories)
    if not story:
        url = "http://www.google.com/search?q=" + data.replace(' ', '+')
        logging.info(url)
        try:
            raw_data = http.getHttp(url)
            soup = BeautifulSoup(raw_data)
            story = None
            # BS3 stores tag.attrs as a list of (name, value) pairs, so
            # attrs[0] is assumed to be the href — TODO confirm this holds
            # for all anchors in the result markup.
            a = soup.find(lambda tag: tag.name == 'a'
                          and tag.attrs[0][0] == 'href'
                          and not tag.attrs[0][1].startswith('/')
                          and not 'google' in tag.attrs[0][1])
            if a and a.text:
                story = Story()
                story.deleteFlag = False
                story.mykey = data
                # Flatten the anchor's mixed Tag/text children into the title.
                story.title = ''
                for c in a.contents:
                    if type(c) == Tag:
                        story.title += c.text
                    else:
                        story.title += c
                story.link = a.attrs[0][1]
                # The snippet is taken from the 5th child of the anchor's
                # parent — presumably the result description; verify against
                # current Google result markup.
                story.text = ''
                for c in a.parent.contents[4].contents:
                    if type(c) == Tag:
                        story.text += c.text
                    else:
                        story.text += c
                story.put()
        except DownloadError:  #@UndefinedVariable
            logging.error(url + ' failed to load')
    # Dead experiment left by the author (string statement, no effect).
    '''
    scraper=SearchScraper()
    scraper.feed(raw_data)
    return scraper.story
    '''