def __init__(self):
    rpc = urlfetch.create_rpc(deadline=60)
    urlfetch.make_fetch_call(rpc, self.url)
    rpcs = []
    try:
        result = rpc.get_result()
        if result.status_code == 200:
            content = EncodingHelper.getEncodedContent(result)
            soup = BeautifulSoup(content)
            events = soup.find(id='hp-articles').findChildren('a')
            for event in events:
                structuredEvent = {}
                structuredEvent['source'] = self.url
                structuredEvent['url'] = self.url + event.get('href')
                structuredEvent['title'] = event.findChild('h2').string
                structuredEvent['img'] = event.findChild('img').get('src')
                structuredEvent['place'] = event.findChild('div', attrs={"class": "hp-article-title"}).string
                self.structuredEvents.append(structuredEvent)
                # Fetch each event's detail page asynchronously; the callback
                # fills in the remaining fields (see handleDetails below).
                innerRpc = urlfetch.create_rpc(deadline=60)
                innerRpc.callback = self.create_callback(innerRpc)
                urlfetch.make_fetch_call(innerRpc, structuredEvent['url'], follow_redirects=False)
                rpcs.append(innerRpc)
    except urlfetch.DownloadError:
        self.response.write("chyba stahovani")  # Czech: "download error"
    # Wait for all detail-page RPCs so their callbacks get to run.
    for irpc in rpcs:
        irpc.wait()
def handleDetails(self, rpc):
    try:
        result = rpc.get_result()
        # Recover the request URL from the raw RPC request line so this
        # response can be matched back to the event it belongs to.
        url = str(rpc.request).splitlines()[1].split(' ')[1][1:-1]
        for event in self.structuredEvents:
            if event['url'] == url:
                soup = BeautifulSoup(EncodingHelper.getEncodedContent(result))
                baseElement = soup.find('div', attrs={"class": "node-inner odd"})
                event['text'] = baseElement.findChild('div', attrs={"class": "detail clearfix"})
                date = str(baseElement.findChild('div', attrs={"class": "submitted"}).contents[3]).strip()
                event['date'] = datetime.strptime(date, "%d.%m.%Y").date()
                return 0
    except urlfetch.DownloadError:
        self.response.write("chyba stahovani")  # Czech: "download error"
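# Both scrapers above lean on EncodingHelper.getEncodedContent and on
# self.create_callback, neither of which is included here. A minimal sketch
# of plausible implementations, assuming `result` is a urlfetch response
# and that handleDetails is the per-RPC handler; these are assumptions,
# not the original code:
class EncodingHelper(object):

    @staticmethod
    def getEncodedContent(result):
        # Pick the charset out of the Content-Type header (falling back to
        # UTF-8) and decode to unicode so BeautifulSoup gets clean input.
        content_type = result.headers.get('content-type', '')
        charset = 'utf-8'
        if 'charset=' in content_type:
            charset = content_type.split('charset=')[-1].strip()
        return result.content.decode(charset, 'replace')


def create_callback(self, rpc):  # a method on the scraper class (assumed)
    # Bind the rpc into a closure so urlfetch can hand the finished
    # response to handleDetails when rpc.wait() completes.
    return lambda: self.handleDetails(rpc)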
@ndb.tasklet
def CreateChickenPlace(self, id, place_info):
    ctx = ndb.get_context()
    menu_page = yield ctx.urlfetch(HOST + place_info["identifier"])
    if menu_page.status_code != 200:
        raise ndb.Return(None)
    parser = BeautifulSoup.BeautifulSoup(menu_page.content)
    address_1 = parser.find(
        id="ctl00_ContentPlaceHolder1_RestInfo_lblRestAddress").text
    address_2 = parser.find(
        id="ctl00_ContentPlaceHolder1_RestInfo_lblRestZip").text
    address = "%s, %s" % (address_1, " ".join(address_2.split()))

    place = ChickenPlace()
    place.key = ndb.Key(ChickenPlace, id, namespace=self.NAME)
    place.has_chicken = False

    # Check if they actually serve chicken:
    for tag in parser.findAll("h2", attrs={"class": "H2MC"}):
        if "chicken" in tag.text.lower():
            place.has_chicken = True

    if place.has_chicken:
        # If they don't serve chicken then don't save any of their info.
        place.identifier = place_info["identifier"]
        place.title = place_info["title"]
        place.address = address

    raise ndb.Return(place)
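# ChickenPlace itself is defined elsewhere. A plausible ndb model covering
# the fields and the MenuIsFresh() helper used by CreateChickenPlace,
# GetMenu and GetAvailablePlaces in this file; the schema and the same-day
# freshness policy are assumptions, not the original definition:
import datetime

from google.appengine.ext import ndb


class ChickenPlace(ndb.Model):
    identifier = ndb.StringProperty()
    title = ndb.StringProperty()
    address = ndb.StringProperty()
    has_chicken = ndb.BooleanProperty(default=False)
    location = ndb.GeoPtProperty()
    menu = ndb.StringProperty(repeated=True)  # "title|price" strings
    menu_freshness = ndb.DateProperty()

    def MenuIsFresh(self):
        # Treat a menu scraped today as fresh, anything older as stale.
        return self.menu_freshness == datetime.date.today()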
def make_entry(rec):
    """Build an Entry from a raw record dict, rendering markdown and
    deriving the slug, excerpt and tags when they are missing."""
    body = rec.get('body')
    body_html = markdown2.markdown(body)
    rec.update({'body_html': body_html})
    slug = rec.get('slug')
    title = rec.get('title')
    excerpt = rec.get('excerpt')
    markdown = rec.get('markdown') or 'markdown'
    tags = rec.get('tags') or []
    if len(tags) == 0:
        tags = ['general']
    tags = [db.Category(utils.slugify(tag)) for tag in tags if tag]
    static = rec.get('static')
    if not slug:
        slug = utils.slugify(title)
    if not excerpt:
        # Fall back to the first paragraph of the rendered body.
        soup = BeautifulSoup.BeautifulSoup(body_html)
        paras = soup.findAll('p')
        if paras:
            excerpt = paras[0].string
    return Entry(
        author=users.get_current_user(),
        title=title,
        slug=slug,
        body=body,
        body_html=body_html,
        markdown=markdown,
        excerpt=excerpt,
        tags=tags,
        static=static,
    )
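# utils.slugify is used by make_entry (and by the post handler further
# down) but isn't included here. A common implementation, offered as an
# assumption rather than the project's actual helper:
import re


def slugify(value):
    # Drop punctuation, lower-case, and collapse runs of whitespace or
    # hyphens into single hyphens: "Hello, World!" -> "hello-world".
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    return re.sub(r'[-\s]+', '-', value)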
def __init__(self, url, options, site, report):
    if url[:4] != "http":
        self.url = "http://" + url
    else:
        self.url = url
    self.options = options
    self.site = site
    self.soup = BeautifulSoup.BeautifulSoup(
        self.getHTML(self.url, None, None))
    self.report = report
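# getHTML is not included in this snippet, and its two extra arguments are
# unclear from the call site; this sketch simply treats them as optional
# headers and POST data (a pure assumption about the signature):
import urllib2


def getHTML(self, url, headers=None, data=None):
    # Fetch the page body, returning an empty string on network errors so
    # BeautifulSoup still gets a parseable input.
    request = urllib2.Request(url, data=data, headers=headers or {})
    try:
        return urllib2.urlopen(request).read()
    except urllib2.URLError:
        return ""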
def __init__(self):
    rpc = urlfetch.create_rpc(deadline=60)
    urlfetch.make_fetch_call(rpc, self.url)
    rpcs = []
    try:
        result = rpc.get_result()
        if result.status_code == 200:
            content = EncodingHelper.getEncodedContent(result)
            soup = BeautifulSoup(content)
            events = soup.find(id='nextPages').findChildren('div', attrs={"class": "media"})
            for event in events:
                mediaBody = event.findChild('div', attrs={"class": "media-body"})
                structuredEvent = {}
                structuredEvent['source'] = self.url
                structuredEvent['url'] = self.url + mediaBody.findChild('h4').findChild('a').get('href')
                structuredEvent['title'] = mediaBody.findChild('h4').findChild('a').string
                structuredEvent['img'] = self.url + event.findChild('img').get('src')
                # str(None) yields 'None' when the date span is missing.
                datestr = str(mediaBody.findChild('span', attrs={"class": "stream-event-meta"}).string).strip().translate(None, ' ')
                if datestr != 'None':
                    structuredEvent['date'] = datetime.strptime(datestr, "%d.%m.%Y").date()
                else:
                    structuredEvent['date'] = None
                structuredEvent['text'] = mediaBody.findChild('p').string.strip()
                structuredEvent['place'] = ''
                self.structuredEvents.append(structuredEvent)
                '''
                innerRpc = urlfetch.create_rpc(deadline=60)
                innerRpc.callback = self.create_callback(innerRpc)
                urlfetch.make_fetch_call(innerRpc, structuredEvent['url'], follow_redirects=False)
                rpcs.append(innerRpc)
                '''
    except urlfetch.DownloadError:
        self.response.write("chyba stahovani")  # Czech: "download error"
    for irpc in rpcs:
        irpc.wait()
def generatePayloads(self, data, flag):
    soup = BeautifulSoup.BeautifulSoup(data)
    e = []
    self.study(soup, keyword=flag, entries=e)
    payloads = []
    for element in e:
        payload = ""
        if element['type'] == "attrval":
            # The tracer landed in an attribute value: close the quote and
            # the enclosing tag before injecting.
            i0 = data.find(flag)
            try:
                i1 = data[:i0].rfind(element['name'])
            except UnicodeDecodeError:
                continue
            start = data[i1:i0].replace(" ", "")[len(element['name']):]
            if start.startswith("='"):
                payload = "'"
            if start.startswith('="'):
                payload = '"'
            if element['tag'].lower() == "img":
                payload += "/>"
            else:
                payload += "></" + element['tag'] + ">"
            for xss in self.payloads:
                payloads.append(payload + xss.replace("__XSS__", flag))
        elif element['type'] == "attrname":
            # The tracer landed in an attribute name: close the tag first.
            if flag == element['name']:
                for xss in self.payloads:
                    payloads.append('>' + xss.replace("__XSS__", flag))
        elif element['type'] == "tag":
            if element['value'].startswith(flag):
                for xss in self.payloads:
                    payloads.append(xss.replace("__XSS__", flag)[1:])
            else:
                for xss in self.payloads:
                    payloads.append("/>" + xss.replace("__XSS__", flag))
        elif element['type'] == "text":
            # Text node: only <title> needs closing before the payload.
            payload = ""
            if element['parent'] == "title":
                payload = "</title>"
            for xss in self.payloads:
                payloads.append(payload + xss.replace("__XSS__", flag))
            return payloads
        # Consume this occurrence of the tracer before the next iteration.
        data = data.replace(flag, "none", 1)
    return payloads
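# generatePayloads depends on self.study(), which walks the parsed DOM and
# records every context in which the tracer string (flag) appears. A
# minimal sketch of that walk against BeautifulSoup 3 nodes; the entry
# keys mirror the ones consumed above, but the body is an assumption, not
# the original implementation:
def study(self, node, keyword="", entries=None):
    if hasattr(node, 'contents'):  # a Tag
        if keyword in node.name:
            entries.append({'type': 'tag', 'value': node.name})
        for name, value in node.attrs:  # BS3 attrs is a list of pairs
            if keyword in name:
                entries.append({'type': 'attrname', 'name': name,
                                'tag': node.name})
            elif keyword in value:
                entries.append({'type': 'attrval', 'name': name,
                                'tag': node.name})
        for child in node.contents:
            self.study(child, keyword=keyword, entries=entries)
    elif keyword in unicode(node):  # a NavigableString
        entries.append({'type': 'text', 'parent': node.parent.name})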
def post(self, slug=None):
    title = self.request.get("title")
    body = self.request.get("body")
    markdown = self.request.get("markup")
    static = self.request.get("static") == '1'
    comments = self.request.get("comments") == '1'
    tags = self.request.get("tags").split(' ')
    if len(tags) == 0:
        tags = ['general']
    tags = [db.Category(utils.slugify(tag)) for tag in tags if tag]
    body_html = to_html(body, markdown)
    # Use the first paragraph of the rendered body as the excerpt.
    soup = BeautifulSoup.BeautifulSoup(body_html)
    paras = soup.findAll('p')
    if paras:
        excerpt = paras[0].string
    else:
        excerpt = ''
    # Update the entry if the slug already exists, otherwise create it.
    entry = db.Query(Entry).filter("slug =", slug).get()
    if not entry:
        entry = Entry(
            author=users.get_current_user(),
            title=title,
            slug=utils.slugify(title),
            body=body,
            body_html=body_html,
            markdown=markdown,
            excerpt=excerpt,
            tags=tags,
            static=static,
            comments=comments,
        )
    else:
        entry.title = title
        entry.body = body
        entry.body_html = body_html
        entry.excerpt = excerpt
        entry.static = static
        entry.tags = tags
        entry.comments = comments
    entry.put()
    self.redirect(entry.url())
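# to_html is the markup dispatcher the post handler calls; it isn't shown
# here. A sketch that matches the markdown2 usage in make_entry above,
# with a raw-HTML passthrough for any other markup value (an assumption):
import markdown2


def to_html(body, markup):
    if markup == 'markdown':
        return markdown2.markdown(body)
    return body  # anything else is treated as already-rendered HTML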
def validXSS(self, response, flag):
    # True if the reflected payload survived in the page as an executable
    # <script> element, either as an inline body or an external src.
    if response == "" or response is None:
        return False
    soup = BeautifulSoup.BeautifulSoup(response)
    for element in soup.findAll("script"):
        if element.string is not None and element.string in [
                t.replace("__XSS__", flag) for t in self.scriptOk]:
            return True
        elif element.has_key("src"):
            if element["src"] == "http://__XSS__/x.js".replace("__XSS__", flag):
                return True
    return False
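# self.scriptOk is a whitelist of inline script bodies that prove the
# injection executed, with __XSS__ standing in for the per-request flag.
# Illustrative values only; the actual shipped payload list may differ:
scriptOk = ["alert('__XSS__')", "prompt('__XSS__')"]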
def GetMenu(self, place):
    if len(place.menu) and place.MenuIsFresh():
        return place.menu

    # Fetch the menu
    result = urlfetch.fetch(HOST + place.identifier,
                            headers={"User-Agent": IOS_USER_AGENT})
    parser = BeautifulSoup.BeautifulSoup(result.content)
    for category in parser.findAll("li", attrs={"class": "cat"}):
        header = category.find("h2").text
        if "fried chicken" in header.lower():
            for product in category.findAll("li"):
                title = product.find("h3").text
                price = product.find("span").text
                place.menu.append("%s|%s" % (title, price))
                logging.info(title)
                logging.info(price)
            break

    place.menu_freshness = datetime.date.today()
    place.put()
    return place.menu
def crawl(self, depth=2, timeout=3):
    """Crawl the web!"""
    seen = set()
    while len(self._url_queue):
        url, depth_ = self._url_queue.pop()

        # skip this url; it's too deep
        if depth_ > depth:
            continue

        self.db.addPage(url)

        # we've already seen this document
        if url in seen:
            continue

        # !!! change this to page url
        seen.add(url)  # mark this document as visited

        socket = None
        try:
            socket = urllib2.urlopen(url, timeout=timeout)
            soup = BeautifulSoup(socket.read())
            self._curr_depth = depth_ + 1
            self._curr_url = url
            self._font_size = 0
            self._index_document(soup)  # index page
            print " url=" + repr(self._curr_url)
        except Exception as e:
            print e
            import traceback
            traceback.print_exc()
        finally:
            if socket:
                socket.close()
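# _index_document is where the crawler actually indexes a page and grows
# the frontier. The queue entries are (url, depth) tuples, as the pop at
# the top of crawl() shows; everything else here (the self.db.addWord call
# and the link extraction) is an assumed sketch, not the original code:
from urlparse import urljoin


def _index_document(self, soup):
    # Record the page's words for the index.
    for text in soup.findAll(text=True):
        for word in text.split():
            self.db.addWord(self._curr_url, word.lower())
    # Enqueue outgoing links at the next depth.
    for anchor in soup.findAll('a', href=True):
        self._url_queue.append((urljoin(self._curr_url, anchor['href']),
                                self._curr_depth))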
def GetAvailablePlaces(self, postcode=None, location=None):
    places = memcache.get(postcode, namespace=self.NAME)
    if places is None:
        result = urlfetch.fetch(BASE_URL.format(postcode),
                                headers={"User-Agent": IOS_USER_AGENT})
        parser = BeautifulSoup.BeautifulSoup(result.content)
        open_places_tag = parser.find(id="OpenRestaurants")
        places = {}
        for place_root_tag in open_places_tag.findAll("li"):
            place = {"title": place_root_tag.find("h2").text}
            place["identifier"] = place_root_tag.find("a")["href"]
            places[place_root_tag["data-restaurantid"]] = place

        # Cache for 20 minutes if we have some places, else
        # cache an empty result for 5 minutes.
        memcache.set(postcode, places, namespace=self.NAME,
                     time=(60 * 5, 60 * 20)[len(places) != 0])

    database_places = self.getPlacesFromDataStore(places.keys())
    database_place_ids = set(database_places.keys())
    places_that_dont_exist = set(places.keys()).difference(database_place_ids)

    if places_that_dont_exist:
        # To fetch the location we don't use the iOS user-agent, as the
        # address isn't included in that response.
        created_places = {}
        futures = {id: self.CreateChickenPlace(id, places[id])
                   for id in places_that_dont_exist}
        for id in futures:
            try:
                created_places[id] = futures[id].get_result()
                # Run the geo lookup as soon as the chicken place is created
                created_places[id]._loc_future = geocode.run_lookup(
                    created_places[id].address)
            except Exception:
                logging.exception("could not get ID")

        for res in created_places:
            # Loop through all geo responses and set the location
            geo_lookup_response = created_places[res]._loc_future.get_result()
            created_places[res].location = geocode.parse_response(
                geo_lookup_response)

        """for id,geopt in geocode.address_to_geopoint({id:item.address for id,item in created_places.items() }).items():
            created_places[id].location = geopt"""

        ndb.put_multi(created_places.values())
        database_places.update({id: created_places[id]
                                for id in created_places
                                if created_places[id].has_chicken})

    return database_places.values()
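# geocode.run_lookup and geocode.parse_response aren't shown. The usage
# above implies run_lookup returns a future and parse_response turns the
# finished response into something assignable to location. A sketch built
# on ndb's async urlfetch and the Google Geocoding API response shape;
# the endpoint, key handling and field names are assumptions:
import json
import urllib

from google.appengine.ext import ndb

GEOCODE_URL = "https://maps.googleapis.com/maps/api/geocode/json?address=%s"


def run_lookup(address):
    # Kick off the HTTP call without blocking; the caller holds the future.
    ctx = ndb.get_context()
    return ctx.urlfetch(GEOCODE_URL % urllib.quote(address))


def parse_response(result):
    data = json.loads(result.content)
    if data.get("results"):
        loc = data["results"][0]["geometry"]["location"]
        return ndb.GeoPt(loc["lat"], loc["lng"])
    return None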
def find_word_count(content):
    soup = BeautifulSoup(content)
    content = soup.getText()
    return len(split_words(content))
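# split_words isn't defined in this snippet; a minimal tokenizer that
# would satisfy find_word_count, assuming plain whitespace/punctuation
# word boundaries:
import re


def split_words(text):
    return [w for w in re.split(r'\W+', text) if w]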