def __init__(self):
    rpc = urlfetch.create_rpc(deadline=60)
    urlfetch.make_fetch_call(rpc, self.url)
    rpcs = []
    try:
        result = rpc.get_result()
        if result.status_code == 200:
            content = EncodingHelper.getEncodedContent(result)
            soup = BeautifulSoup(content)
            events = soup.find(id='hp-articles').findChildren('a')
            for event in events:
                structuredEvent = {}
                structuredEvent['source'] = self.url
                structuredEvent['url'] = self.url + event.get('href')
                structuredEvent['title'] = event.findChild('h2').string
                structuredEvent['img'] = event.findChild('img').get('src')
                structuredEvent['place'] = event.findChild('div', attrs={"class": "hp-article-title"}).string
                self.structuredEvents.append(structuredEvent)
                # Fetch each event's detail page asynchronously; the callback
                # fills in the remaining fields (see handleDetails below).
                innerRpc = urlfetch.create_rpc(deadline=60)
                innerRpc.callback = self.create_callback(innerRpc)
                urlfetch.make_fetch_call(innerRpc, structuredEvent['url'], follow_redirects=False)
                rpcs.append(innerRpc)
    except urlfetch.DownloadError:
        self.response.write("chyba stahovani")  # Czech: "download error"
    # Wait for all detail-page RPCs so their callbacks get to run.
    for irpc in rpcs:
        irpc.wait()
def handleDetails(self, rpc):
    try:
        result = rpc.get_result()
        # Recover the request URL from the raw RPC request line so this
        # response can be matched back to the event it belongs to.
        url = str(rpc.request).splitlines()[1].split(' ')[1][1:-1]
        for event in self.structuredEvents:
            if event['url'] == url:
                soup = BeautifulSoup(EncodingHelper.getEncodedContent(result))
                baseElement = soup.find('div', attrs={"class": "node-inner odd"})
                event['text'] = baseElement.findChild('div', attrs={"class": "detail clearfix"})
                date = str(baseElement.findChild('div', attrs={"class": "submitted"}).contents[3]).strip()
                event['date'] = datetime.strptime(date, "%d.%m.%Y").date()
                return 0
    except urlfetch.DownloadError:
        self.response.write("chyba stahovani")  # Czech: "download error"
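# Both scrapers above lean on EncodingHelper.getEncodedContent and on
# self.create_callback, neither of which is included here. A minimal sketch
# of plausible implementations, assuming `result` is a urlfetch response
# and that handleDetails is the per-RPC handler; these are assumptions,
# not the original code:
class EncodingHelper(object):

    @staticmethod
    def getEncodedContent(result):
        # Pick the charset out of the Content-Type header (falling back to
        # UTF-8) and decode to unicode so BeautifulSoup gets clean input.
        content_type = result.headers.get('content-type', '')
        charset = 'utf-8'
        if 'charset=' in content_type:
            charset = content_type.split('charset=')[-1].strip()
        return result.content.decode(charset, 'replace')


def create_callback(self, rpc):  # a method on the scraper class (assumed)
    # Bind the rpc into a closure so urlfetch can hand the finished
    # response to handleDetails when rpc.wait() completes.
    return lambda: self.handleDetails(rpc)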
@ndb.tasklet
def CreateChickenPlace(self, id, place_info):
    ctx = ndb.get_context()
    menu_page = yield ctx.urlfetch(HOST + place_info["identifier"])
    if menu_page.status_code != 200:
        raise ndb.Return(None)
    parser = BeautifulSoup.BeautifulSoup(menu_page.content)
    address_1 = parser.find(
        id="ctl00_ContentPlaceHolder1_RestInfo_lblRestAddress").text
    address_2 = parser.find(
        id="ctl00_ContentPlaceHolder1_RestInfo_lblRestZip").text
    address = "%s, %s" % (address_1, " ".join(address_2.split()))

    place = ChickenPlace()
    place.key = ndb.Key(ChickenPlace, id, namespace=self.NAME)
    place.has_chicken = False

    # Check if they actually serve chicken:
    for tag in parser.findAll("h2", attrs={"class": "H2MC"}):
        if "chicken" in tag.text.lower():
            place.has_chicken = True

    if place.has_chicken:
        # If they don't serve chicken then don't save any of their info.
        place.identifier = place_info["identifier"]
        place.title = place_info["title"]
        place.address = address

    raise ndb.Return(place)
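# ChickenPlace itself is defined elsewhere. A plausible ndb model covering
# the fields and the MenuIsFresh() helper used by CreateChickenPlace,
# GetMenu and GetAvailablePlaces in this file; the schema and the same-day
# freshness policy are assumptions, not the original definition:
import datetime

from google.appengine.ext import ndb


class ChickenPlace(ndb.Model):
    identifier = ndb.StringProperty()
    title = ndb.StringProperty()
    address = ndb.StringProperty()
    has_chicken = ndb.BooleanProperty(default=False)
    location = ndb.GeoPtProperty()
    menu = ndb.StringProperty(repeated=True)  # "title|price" strings
    menu_freshness = ndb.DateProperty()

    def MenuIsFresh(self):
        # Treat a menu scraped today as fresh, anything older as stale.
        return self.menu_freshness == datetime.date.today()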
def make_entry(rec):
    """Build an Entry from a raw record dict, rendering markdown and
    deriving the slug, excerpt and tags when they are missing."""
    body = rec.get('body')
    body_html = markdown2.markdown(body)
    rec.update({'body_html': body_html})
    slug = rec.get('slug')
    title = rec.get('title')
    excerpt = rec.get('excerpt')
    markdown = rec.get('markdown') or 'markdown'
    tags = rec.get('tags') or []
    if len(tags) == 0:
        tags = ['general']
    tags = [db.Category(utils.slugify(tag)) for tag in tags if tag]
    static = rec.get('static')
    if not slug:
        slug = utils.slugify(title)
    if not excerpt:
        # Fall back to the first paragraph of the rendered body.
        soup = BeautifulSoup.BeautifulSoup(body_html)
        paras = soup.findAll('p')
        if paras:
            excerpt = paras[0].string
    return Entry(
        author=users.get_current_user(),
        title=title,
        slug=slug,
        body=body,
        body_html=body_html,
        markdown=markdown,
        excerpt=excerpt,
        tags=tags,
        static=static,
    )
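# utils.slugify is used by make_entry (and by the post handler further
# down) but isn't included here. A common implementation, offered as an
# assumption rather than the project's actual helper:
import re


def slugify(value):
    # Drop punctuation, lower-case, and collapse runs of whitespace or
    # hyphens into single hyphens: "Hello, World!" -> "hello-world".
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    return re.sub(r'[-\s]+', '-', value)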
def __init__(self, url, options, site, report):
    if url[:4] != "http":
        self.url = "http://" + url
    else:
        self.url = url
    self.options = options
    self.site = site
    self.soup = BeautifulSoup.BeautifulSoup(
        self.getHTML(self.url, None, None))
    self.report = report
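# getHTML is not included in this snippet, and its two extra arguments are
# unclear from the call site; this sketch simply treats them as optional
# headers and POST data (a pure assumption about the signature):
import urllib2


def getHTML(self, url, headers=None, data=None):
    # Fetch the page body, returning an empty string on network errors so
    # BeautifulSoup still gets a parseable input.
    request = urllib2.Request(url, data=data, headers=headers or {})
    try:
        return urllib2.urlopen(request).read()
    except urllib2.URLError:
        return ""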
def __init__(self):
    rpc = urlfetch.create_rpc(deadline=60)
    urlfetch.make_fetch_call(rpc, self.url)
    rpcs = []
    try:
        result = rpc.get_result()
        if result.status_code == 200:
            content = EncodingHelper.getEncodedContent(result)
            soup = BeautifulSoup(content)
            events = soup.find(id='nextPages').findChildren('div', attrs={"class": "media"})
            for event in events:
                mediaBody = event.findChild('div', attrs={"class": "media-body"})
                structuredEvent = {}
                structuredEvent['source'] = self.url
                structuredEvent['url'] = self.url + mediaBody.findChild('h4').findChild('a').get('href')
                structuredEvent['title'] = mediaBody.findChild('h4').findChild('a').string
                structuredEvent['img'] = self.url + event.findChild('img').get('src')
                # str(None) yields 'None' when the date span is missing.
                datestr = str(mediaBody.findChild('span', attrs={"class": "stream-event-meta"}).string).strip().translate(None, ' ')
                if datestr != 'None':
                    structuredEvent['date'] = datetime.strptime(datestr, "%d.%m.%Y").date()
                else:
                    structuredEvent['date'] = None
                structuredEvent['text'] = mediaBody.findChild('p').string.strip()
                structuredEvent['place'] = ''
                self.structuredEvents.append(structuredEvent)
                '''
                innerRpc = urlfetch.create_rpc(deadline=60)
                innerRpc.callback = self.create_callback(innerRpc)
                urlfetch.make_fetch_call(innerRpc, structuredEvent['url'], follow_redirects=False)
                rpcs.append(innerRpc)
                '''
    except urlfetch.DownloadError:
        self.response.write("chyba stahovani")  # Czech: "download error"
    for irpc in rpcs:
        irpc.wait()
def generatePayloads(self, data, flag):
    soup = BeautifulSoup.BeautifulSoup(data)
    e = []
    self.study(soup, keyword=flag, entries=e)
    payloads = []
    for element in e:
        payload = ""
        if element['type'] == "attrval":
            # The tracer landed in an attribute value: close the quote and
            # the enclosing tag before injecting.
            i0 = data.find(flag)
            try:
                i1 = data[:i0].rfind(element['name'])
            except UnicodeDecodeError:
                continue
            start = data[i1:i0].replace(" ", "")[len(element['name']):]
            if start.startswith("='"):
                payload = "'"
            if start.startswith('="'):
                payload = '"'
            if element['tag'].lower() == "img":
                payload += "/>"
            else:
                payload += "></" + element['tag'] + ">"
            for xss in self.payloads:
                payloads.append(payload + xss.replace("__XSS__", flag))
        elif element['type'] == "attrname":
            # The tracer landed in an attribute name: close the tag first.
            if flag == element['name']:
                for xss in self.payloads:
                    payloads.append('>' + xss.replace("__XSS__", flag))
        elif element['type'] == "tag":
            if element['value'].startswith(flag):
                for xss in self.payloads:
                    payloads.append(xss.replace("__XSS__", flag)[1:])
            else:
                for xss in self.payloads:
                    payloads.append("/>" + xss.replace("__XSS__", flag))
        elif element['type'] == "text":
            # Text node: only <title> needs closing before the payload.
            payload = ""
            if element['parent'] == "title":
                payload = "</title>"
            for xss in self.payloads:
                payloads.append(payload + xss.replace("__XSS__", flag))
            return payloads
        # Consume this occurrence of the tracer before the next iteration.
        data = data.replace(flag, "none", 1)
    return payloads
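# generatePayloads depends on self.study(), which walks the parsed DOM and
# records every context in which the tracer string (flag) appears. A
# minimal sketch of that walk against BeautifulSoup 3 nodes; the entry
# keys mirror the ones consumed above, but the body is an assumption, not
# the original implementation:
def study(self, node, keyword="", entries=None):
    if hasattr(node, 'contents'):  # a Tag
        if keyword in node.name:
            entries.append({'type': 'tag', 'value': node.name})
        for name, value in node.attrs:  # BS3 attrs is a list of pairs
            if keyword in name:
                entries.append({'type': 'attrname', 'name': name,
                                'tag': node.name})
            elif keyword in value:
                entries.append({'type': 'attrval', 'name': name,
                                'tag': node.name})
        for child in node.contents:
            self.study(child, keyword=keyword, entries=entries)
    elif keyword in unicode(node):  # a NavigableString
        entries.append({'type': 'text', 'parent': node.parent.name})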
def post(self, slug=None):
    title = self.request.get("title")
    body = self.request.get("body")
    markdown = self.request.get("markup")
    static = self.request.get("static") == '1'
    comments = self.request.get("comments") == '1'
    tags = self.request.get("tags").split(' ')
    if len(tags) == 0:
        tags = ['general']
    tags = [db.Category(utils.slugify(tag)) for tag in tags if tag]
    body_html = to_html(body, markdown)
    # Use the first paragraph of the rendered body as the excerpt.
    soup = BeautifulSoup.BeautifulSoup(body_html)
    paras = soup.findAll('p')
    if paras:
        excerpt = paras[0].string
    else:
        excerpt = ''
    # Update the entry if the slug already exists, otherwise create it.
    entry = db.Query(Entry).filter("slug =", slug).get()
    if not entry:
        entry = Entry(
            author=users.get_current_user(),
            title=title,
            slug=utils.slugify(title),
            body=body,
            body_html=body_html,
            markdown=markdown,
            excerpt=excerpt,
            tags=tags,
            static=static,
            comments=comments,
        )
    else:
        entry.title = title
        entry.body = body
        entry.body_html = body_html
        entry.excerpt = excerpt
        entry.static = static
        entry.tags = tags
        entry.comments = comments
    entry.put()
    self.redirect(entry.url())
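# to_html is the markup dispatcher the post handler calls; it isn't shown
# here. A sketch that matches the markdown2 usage in make_entry above,
# with a raw-HTML passthrough for any other markup value (an assumption):
import markdown2


def to_html(body, markup):
    if markup == 'markdown':
        return markdown2.markdown(body)
    return body  # anything else is treated as already-rendered HTML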
def validXSS(self, response, flag):
    # True if the reflected payload survived in the page as an executable
    # <script> element, either as an inline body or an external src.
    if response == "" or response is None:
        return False
    soup = BeautifulSoup.BeautifulSoup(response)
    for element in soup.findAll("script"):
        if element.string is not None and element.string in [
                t.replace("__XSS__", flag) for t in self.scriptOk]:
            return True
        elif element.has_key("src"):
            if element["src"] == "http://__XSS__/x.js".replace("__XSS__", flag):
                return True
    return False
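# self.scriptOk is a whitelist of inline script bodies that prove the
# injection executed, with __XSS__ standing in for the per-request flag.
# Illustrative values only; the actual shipped payload list may differ:
scriptOk = ["alert('__XSS__')", "prompt('__XSS__')"]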
def GetMenu(self, place):
    if len(place.menu) and place.MenuIsFresh():
        return place.menu

    # Fetch the menu
    result = urlfetch.fetch(HOST + place.identifier,
                            headers={"User-Agent": IOS_USER_AGENT})
    parser = BeautifulSoup.BeautifulSoup(result.content)
    for category in parser.findAll("li", attrs={"class": "cat"}):
        header = category.find("h2").text
        if "fried chicken" in header.lower():
            for product in category.findAll("li"):
                title = product.find("h3").text
                price = product.find("span").text
                place.menu.append("%s|%s" % (title, price))
                logging.info(title)
                logging.info(price)
            break

    place.menu_freshness = datetime.date.today()
    place.put()
    return place.menu
def crawl(self, depth=2, timeout=3):
    """Crawl the web!"""
    seen = set()
    while len(self._url_queue):
        url, depth_ = self._url_queue.pop()

        # skip this url; it's too deep
        if depth_ > depth:
            continue

        self.db.addPage(url)

        # we've already seen this document
        if url in seen:
            continue

        # !!! change this to page url
        seen.add(url)  # mark this document as visited

        socket = None
        try:
            socket = urllib2.urlopen(url, timeout=timeout)
            soup = BeautifulSoup(socket.read())
            self._curr_depth = depth_ + 1
            self._curr_url = url
            self._font_size = 0
            self._index_document(soup)  # index page
            print " url=" + repr(self._curr_url)
        except Exception as e:
            print e
            import traceback
            traceback.print_exc()
        finally:
            if socket:
                socket.close()
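# _index_document is where the crawler actually indexes a page and grows
# the frontier. The queue entries are (url, depth) tuples, as the pop at
# the top of crawl() shows; everything else here (the self.db.addWord call
# and the link extraction) is an assumed sketch, not the original code:
from urlparse import urljoin


def _index_document(self, soup):
    # Record the page's words for the index.
    for text in soup.findAll(text=True):
        for word in text.split():
            self.db.addWord(self._curr_url, word.lower())
    # Enqueue outgoing links at the next depth.
    for anchor in soup.findAll('a', href=True):
        self._url_queue.append((urljoin(self._curr_url, anchor['href']),
                                self._curr_depth))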
def GetAvailablePlaces(self, postcode=None, location=None):
    places = memcache.get(postcode, namespace=self.NAME)
    if places is None:
        result = urlfetch.fetch(BASE_URL.format(postcode),
                                headers={"User-Agent": IOS_USER_AGENT})
        parser = BeautifulSoup.BeautifulSoup(result.content)
        open_places_tag = parser.find(id="OpenRestaurants")
        places = {}
        for place_root_tag in open_places_tag.findAll("li"):
            place = {"title": place_root_tag.find("h2").text}
            place["identifier"] = place_root_tag.find("a")["href"]
            places[place_root_tag["data-restaurantid"]] = place

        # Cache for 20 minutes if we have some places, else
        # cache an empty result for 5 minutes.
        memcache.set(postcode, places, namespace=self.NAME,
                     time=(60 * 5, 60 * 20)[len(places) != 0])

    database_places = self.getPlacesFromDataStore(places.keys())
    database_place_ids = set(database_places.keys())
    places_that_dont_exist = set(places.keys()).difference(database_place_ids)

    if places_that_dont_exist:
        # To fetch the location we don't use the iOS user-agent, as the
        # address isn't included in that response.
        created_places = {}
        futures = {id: self.CreateChickenPlace(id, places[id])
                   for id in places_that_dont_exist}
        for id in futures:
            try:
                created_places[id] = futures[id].get_result()
                # Run the geo lookup as soon as the chicken place is created
                created_places[id]._loc_future = geocode.run_lookup(
                    created_places[id].address)
            except Exception:
                logging.exception("could not get ID")

        for res in created_places:
            # Loop through all geo responses and set the location
            geo_lookup_response = created_places[res]._loc_future.get_result()
            created_places[res].location = geocode.parse_response(
                geo_lookup_response)

        """for id,geopt in geocode.address_to_geopoint({id:item.address for id,item in created_places.items() }).items():
            created_places[id].location = geopt"""

        ndb.put_multi(created_places.values())
        database_places.update({id: created_places[id]
                                for id in created_places
                                if created_places[id].has_chicken})

    return database_places.values()
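# geocode.run_lookup and geocode.parse_response aren't shown. The usage
# above implies run_lookup returns a future and parse_response turns the
# finished response into something assignable to location. A sketch built
# on ndb's async urlfetch and the Google Geocoding API response shape;
# the endpoint, key handling and field names are assumptions:
import json
import urllib

from google.appengine.ext import ndb

GEOCODE_URL = "https://maps.googleapis.com/maps/api/geocode/json?address=%s"


def run_lookup(address):
    # Kick off the HTTP call without blocking; the caller holds the future.
    ctx = ndb.get_context()
    return ctx.urlfetch(GEOCODE_URL % urllib.quote(address))


def parse_response(result):
    data = json.loads(result.content)
    if data.get("results"):
        loc = data["results"][0]["geometry"]["location"]
        return ndb.GeoPt(loc["lat"], loc["lng"])
    return None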
def find_word_count(content):
    soup = BeautifulSoup(content)
    content = soup.getText()
    return len(split_words(content))
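# split_words isn't defined in this snippet; a minimal tokenizer that
# would satisfy find_word_count, assuming plain whitespace/punctuation
# word boundaries:
import re


def split_words(text):
    return [w for w in re.split(r'\W+', text) if w]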