def test__find_urls(self):
    # Fetch a local test page, then check that the spider extracts
    # both absolute and relative links from the response.
    s = Silk(self.io_loop, allowed_domains=['www.dmoz.org'], fail_silent=False)
    s.get(LOCAL_URL % (LOCAL_PORT, 'index.html'), self.stop)
    response = self.wait()

    spider = Spider()
    spider._find_urls(response, self.stop)
    links = self.wait()
    self.assertIn('http://www.google.com', links)
    self.assertIn('page1.html', links)
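# The test above exercises Spider._find_urls, which is not shown here. A
# minimal sketch of what it might look like follows, assuming the response
# body is HTML and the callback receives the list of extracted href values;
# the regex and the callback signature are assumptions, not the library's
# actual implementation.
import re

HREF_RE = re.compile(r'href=[\'"]?([^\'" >]+)', re.I)

def _find_urls(self, response, callback):
    # Pull href targets out of the response body and hand the list of
    # links to the supplied callback, matching the async style in the test.
    body = response.body
    if isinstance(body, bytes):
        body = body.decode('utf-8', 'ignore')
    callback(HREF_RE.findall(body))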
def get(self):
    today = date.today()

    # Get the spider, or create it if we lost it
    spider = get_spider()
    if not spider:
        spider = Spider()
        spider.put()

    # The list of twitpics the spider has already found
    twitpic_spider_list = spider.twitpics

    # Query the Twitter search API for twitpic links tagged #movember or
    # #mowars inside the spider's current date window.
    twitter_search = twapi.Twitter(domain="search.twitter.com")
    twitter_search.encoded_args = ('q=&ands=&phrase=twitpic&ors=%23movember+%23mowars'
                                   '&since=' + str(spider.last_since) +
                                   '&rpp=100&until=' + str(spider.last_until))
    tw_search_results = twitter_search.search()

    # Find twitpic links in the tweet text
    reg = re.compile(r'http://(www\.)?twitpic\.com/(\S+)', re.I)
    results = []
    for twt in tw_search_results['results']:
        # Crudely skip retweets so only the original tweeter is credited
        message = twt['text']
        if 'RT' not in message[0:8]:
            # Find all twitpic links in the tweet
            res = reg.findall(message)
            for url_groups in res:
                # The second group is just the twitpic link slug
                twitpic_url = url_groups[1]
                # Make a tache
                tache = Moustache(name=twt['from_user'], tweet=twt['text'],
                                  twitpic=twitpic_url)
                # Don't re-grab a twitpic an earlier run has already stored
                if twitpic_url not in twitpic_spider_list:
                    try:
                        tache_image = images.resize(
                            get_twitpic_image(twitpic_url), 340, 340)
                        tache.image = db.Blob(tache_image)
                        tache.put()
                        twitpic_spider_list.append(twitpic_url)
                        # Record what was grabbed for the debug output below
                        results.append({'name': twt['from_user'],
                                        'twitpic': twitpic_url})
                    except Exception:
                        # Skip pictures that cannot be fetched or resized
                        pass

    # Advance the search window a day at a time, or just keep it as
    # today and yesterday
    one_day = timedelta(days=1)
    spider.last_since = spider.last_until
    new_until = spider.last_until + one_day
    if new_until <= today:
        spider.last_until = new_until
    else:
        spider.last_until = today
        spider.last_since = today - one_day

    spider.twitpics = twitpic_spider_list
    spider.put()

    self.response.out.write(spider.last_until)
    self.response.out.write(spider.last_since)
    self.response.out.write('\n\n<br><br>')
    self.response.out.write(results)
    self.response.out.write('\n\n<br><br>')
    self.response.out.write(tw_search_results)
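# The handler above depends on a get_twitpic_image() helper that is not
# shown. A minimal sketch on App Engine might look like this; the
# /show/large/<slug> image URL is an assumption about Twitpic's old
# endpoint, not something the handler itself confirms.
from google.appengine.api import urlfetch

def get_twitpic_image(slug):
    # Fetch the raw image bytes for a twitpic slug so images.resize()
    # can operate on them.
    result = urlfetch.fetch('http://twitpic.com/show/large/%s' % slug)
    if result.status_code != 200:
        raise ValueError('could not fetch twitpic %s' % slug)
    return result.content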
@gen.coroutine
def post(self):
    project_name = self.request.arguments['project'][0]
    version = self.request.arguments['version'][0]
    eggfile = self.request.files['egg'][0]
    eggf = StringIO(eggfile['body'])

    # Build the egg in an isolated workspace and discover its spiders.
    workspace = ProjectWorkspace(project_name)
    try:
        yield workspace.init()
        spiders = yield workspace.test_egg(eggf)
        workspace.put_egg(eggf, version)
    except InvalidProjectEgg as e:
        logger.error('Error when uploading project, %s %s' % (e.message, e.detail))
        self.set_status(400, reason=e.message)
        self.finish("<html><title>%(code)d: %(message)s</title>"
                    "<body><pre>%(output)s</pre></body></html>" % {
                        "code": 400,
                        "message": e.message,
                        "output": e.detail,
                    })
        return
    except ProcessFailed as e:
        logger.error('Error when uploading project, %s, %s' % (e.message, e.std_output))
        self.set_status(400, reason=e.message)
        self.finish("<html><title>%(code)d: %(message)s</title>"
                    "<body><pre>%(output)s</pre></body></html>" % {
                        "code": 400,
                        "message": e.message,
                        "output": e.std_output,
                    })
        return
    finally:
        workspace.clearup()

    logger.debug('spiders: %s' % spiders)

    # Create or update the project record, then register each spider.
    with session_scope() as session:
        project = session.query(Project).filter_by(name=project_name).first()
        if project is None:
            project = Project()
            project.name = project_name
        project.version = version
        session.add(project)
        session.commit()
        session.refresh(project)

        for spider_name in spiders:
            spider = session.query(Spider).filter_by(
                project_id=project.id, name=spider_name).first()
            if spider is None:
                spider = Spider()
                spider.name = spider_name
                spider.project_id = project.id
                session.add(spider)
                session.commit()

    if self.request.path.endswith('.json'):
        self.write(json.dumps({'status': 'ok', 'spiders': len(spiders)}))
    else:
        loader = get_template_loader()
        self.write(loader.load("uploadproject.html").generate())
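# post() wraps its database work in session_scope(), which is not defined in
# this snippet. The canonical SQLAlchemy "transactional scope" pattern looks
# like the sketch below; the Session sessionmaker() factory name is an
# assumption about how the project configures SQLAlchemy.
from contextlib import contextmanager

@contextmanager
def session_scope():
    # Provide a transactional scope around a series of operations:
    # commit on success, roll back on error, always close the session.
    session = Session()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()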