def get_featured(self):
    url = 'http://www.lds.org/media-library/video?lang=eng'
    soup = BeautifulSoup(make_request(url), convertEntities=BeautifulSoup.HTML_ENTITIES)
    for i in soup.find('div', {'class': 'feature-box'}).find('ul', {'class': "feature-preview"})('li'):
        fc = i.find('div', {'class': 'feature-control'})
        name = fc.findNext('h3').getText().encode('utf8')
        desc = fc.p.getText().encode('utf8')
        u = fc.findNext('a')['href']
        thumb = "https://www.lds.org" + urllib.quote(i.findNext('img')['src'])
        if 'media-library/video/categories' in u:
            mode = 2
        else:
            mode = 4
        self.add_dir(thumb, {'Title': name, 'Plot': desc}, {'name': name, 'url': u, 'mode': mode}, thumb)
    for i in soup.find('ul', {'class': 'media-list'})('li'):
        name = i.findNext('h4').a.getText().encode('utf8')
        desc = i.findNext('p').getText().encode('utf8')
        u = i.find('a', {'class': 'video-thumb-play'})['href']
        thumb = i.findNext('img')['src']
        try:
            soup2 = BeautifulSoup(make_request(u), convertEntities=BeautifulSoup.HTML_ENTITIES)
            for j in soup2.find('div', {'class': 'galleryMeta'})('p'):
                try:
                    if "for downloads" in j.a.getText():
                        u = j.a['href']
                        break
                except:
                    continue
            else:
                continue
        except:
            print "Couldn't get video link for %s. %s" % (name, traceback.format_exc().splitlines()[-1])
            continue
        if 'media-library/video/categories' in u:
            mode = 2
        else:
            mode = 4
        self.add_dir(thumb, {'Title': name, 'Plot': desc}, {'name': name, 'url': u, 'mode': mode}, thumb)
def getpresentationdetails(sender, **kwargs):
    print "Pre Save!"
    #print sender
    model = kwargs['instance']
    # fetch the presentation url
    try:
        import urllib
        from BeautifulSoup import BeautifulSoup as BS
        html = urllib.urlopen(kwargs['instance'].url).read()
        bs = BS(html)
        # find the media url
        presurl = bs.find('link', rel='media:presentation')
        print "* Presentation: " + presurl['href']
        # and the thumbnail
        thumburl = bs.find('link', rel='image_src')
        print "* Thumbnail: " + thumburl['href']
        # and the author name
        creator = bs.find('meta', property='dc:creator')
        print "* Creator: " + creator['content']
        title = bs.find('meta', property="media:title")
        print "* Content: " + title['content']
    except Exception, e:
        raise e
def fetch_page(link_id):
    link = Link.objects.get(pk=link_id)
    url = link.url
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0'}
    req = urllib2.Request(url, None, headers)
    try:
        html = urllib2.urlopen(req).read()
        soup = BeautifulSoup(html)
        link.title = soup.find('title').text
        favicon = soup.find('link', rel='shortcut icon')
        if favicon and favicon['href']:
            link.favicon = urljoin(url, favicon['href'])
        for item in soup.findAll('meta'):
            if item.get('name', '').lower() in ('description', 'og:description') and item.get('content', ''):
                link.description = item.get('content', '')
    except Exception as e:
        link.is_error = 1
        link.error_text = e.reason.__str__()
    link.save()
def _retrieve_product(cls, url):
    browser = mechanize.Browser()
    soup = BeautifulSoup(browser.open(url))
    result = {}
    container = soup.find('div', 'detalle_tienda')
    result['name'] = container.find('h1').string
    category = soup.find('div', 'selectcion_cat').find('div', 'txt').string
    result['category'] = category
    store_data = container.findAll('li')
    result['level'] = store_data[2].string
    result['local'] = store_data[0].string
    phone = store_data[1].string.split(u'Teléfono ')[1]
    if not phone:
        phone = None
    result['phone'] = phone
    result['picture'] = cls.base_url + container.find('img')['src']
    store_url = None
    link_tag = store_data[3].find('a')
    if link_tag.string:
        store_url = link_tag['href']
    result['store_url'] = store_url
    return result, {}
def _parse(self, html):
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES, fromEncoding='utf-8')
    try:
        p_tags = soup.find('div', attrs={'itemprop': 'articleBody'}).findAll('p')
    except AttributeError:
        self.real_article = False
        return
    main_body = '\n'.join([p.getText() for p in p_tags])
    self.body = main_body
    self.meta = soup.findAll('meta')
    self.title = soup.find('meta', attrs={'property': 'og:title'}).get('content')
    author = soup.find('p', attrs={'itemprop': 'name'})
    if author:
        self.byline = author.getText()
    else:
        self.byline = ''
    datestr = soup.find('time', attrs={'itemprop': 'datePublished'}).get('datetime')
    new_dt = datestr[:19]
    datet = datetime.strptime(new_dt, '%Y-%m-%dT%H:%M:%S')
    self.date = datet.strftime(DATE_FORMAT)
class GoogleCodeProjectExtractor(object):
    RE_REPO_TYPE = re.compile(r'(svn|hg|git)')
    PAGE_MAP = {
        'project_info': 'http://code.google.com/p/%s/',
        'source_browse': 'http://code.google.com/p/%s/source/browse/',
    }
    LICENSE_MAP = defaultdict(lambda: 'Other/Proprietary License', {
        'Apache License 2.0': 'Apache Software License',
        'Artistic License/GPL': 'Artistic License',
        'Eclipse Public License 1.0': 'Eclipse Public License',
        'GNU GPL v2': 'GNU General Public License (GPL)',
        'GNU GPL v3': 'GNU General Public License (GPL)',
        'GNU Lesser GPL': 'GNU Library or Lesser General Public License (LGPL)',
        'MIT License': 'MIT License',
        'Mozilla Public License 1.1': 'Mozilla Public License 1.1 (MPL 1.1)',
        'New BSD License': 'BSD License',
        'Other Open Source': 'Other/Proprietary License',
    })
    DEFAULT_ICON = 'http://www.gstatic.com/codesite/ph/images/defaultlogo.png'

    def __init__(self, project, page='project_info'):
        gc_project_name = project.get_tool_data('google-code', 'project_name')
        self.url = self.PAGE_MAP[page] % urllib.quote(gc_project_name)
        self.project = project
        self.page = BeautifulSoup(urllib2.urlopen(self.url))

    def get_short_description(self):
        self.project.short_description = self.page.find(itemprop='description').string.strip()

    def get_icon(self):
        icon_url = urljoin(self.url, self.page.find(itemprop='image').attrMap['src'])
        if icon_url == self.DEFAULT_ICON:
            return
        icon_name = urllib.unquote(urlparse(icon_url).path).split('/')[-1]
        fp_ish = urllib2.urlopen(icon_url)
        fp = StringIO(fp_ish.read())
        M.ProjectFile.save_image(
            icon_name, fp,
            fp_ish.info()['content-type'].split(';')[0],  # strip off charset=x extra param
            square=True,
            thumbnail_size=(48, 48),
            thumbnail_meta={'project_id': self.project._id, 'category': 'icon'})

    def get_license(self):
        license = self.page.find(text='Code license').findNext().find('a').string.strip()
        trove = M.TroveCategory.query.get(fullname=self.LICENSE_MAP[license])
        self.project.trove_license.append(trove._id)

    def get_repo_type(self):
        repo_type = self.page.find(id="crumb_root")
        if not repo_type:
            raise Exception("Couldn't detect repo type: no #crumb_root in "
                            "{0}".format(self.url))
        re_match = self.RE_REPO_TYPE.match(repo_type.text.lower())
        if re_match:
            return re_match.group(0)
        else:
            raise Exception("Unknown repo type: {0}".format(repo_type.text))
def _on_page(self, page):
    if not page:
        import ipdb
        ipdb.set_trace()
    soup = BeautifulSoup(page)
    if not soup.find('a', text='Log in'):
        event = soup.find('b', text='Something has happened!')
        if event:
            cell = event.findParent('table').findAll('td')[2]
            text = ''.join([x.text if hasattr(x, 'text') else x
                            for x in cell.childGenerator()])
            self._logger.info("Something has happened: %s", text)
        try:
            self._neopoints = get_np(soup)
        except NoNpInPage:
            pass
        return soup
    self._logger.info('Need to login. Using account %s', self._username)
    data = dict(username=self._username,
                password=self._password,
                destination=soup.find('input', attrs=dict(name='destination'))['value'])
    d = self._browser.post('http://www.neopets.com/login.phtml', data)
    d.addCallback(self._on_login)
    return d
def transcripts(self, organism, gene_id):
    """Retrieve a list of (transcript, protein) ids for the given gene_id.
    """
    txs = []
    ps = []
    valid_gene_starts = ["EN", "FB", "AA", "AG"]
    with self._get_open_handle("Gene", "Summary", organism, gene_id) as in_handle:
        soup = BeautifulSoup(in_handle)
        tx_info = soup.find("table", {"id": "transcripts"})
        if tx_info is None:
            tx_info = soup.find(True, {"id": "transcripts_text"})
        #print tx_info
        tx_links = tx_info.findAll("a", href=re.compile("Transcript/Summary"))
        for tx_link in tx_links:
            if tx_link.string and tx_link.string[:2] in valid_gene_starts:
                txs.append(tx_link.string)
        p_links = tx_info.findAll("a", href=re.compile("Transcript/ProteinSummary"))
        for p_link in p_links:
            if p_link.string:
                ps.append(p_link.string)
    assert len(txs) == len(ps), (organism, gene_id, txs, ps)
    return zip(txs, ps)
def check_login():
    login = __settings__.getSetting("Login")
    password = __settings__.getSetting("Password")
    if len(login) > 0:
        http = GET(httpSiteUrl, httpSiteUrl)
        if http == None:
            return None
        beautifulSoup = BeautifulSoup(http)
        userPanel = beautifulSoup.find('a', {"id": "loginlink"})
        if userPanel == None:
            os.remove(cookiepath)
            loginResponse = GET(httpSiteUrl, httpSiteUrl, {
                'login': '******',
                'login_name': login,
                'login_password': password,
                'submit': 'Вход'
            })
            loginSoup = BeautifulSoup(loginResponse)
            userPanel = loginSoup.find('a', {"id": "loginlink"})
            if userPanel == None:
                showMessage('Login', 'Check login and password', 3000)
            else:
                return userPanel.text.encode('utf-8', 'cp1251')
        else:
            return userPanel.text.encode('utf-8', 'cp1251')
    return None
def urlVoid(s):
    """
    API info: http://blog.urlvoid.com/urlvoid-api-v2-0/
    Restrictions: < 1,000 per day
    * if "-1" is returned it means the domain has not been scanned yet
    """
    print(header("URLvoid"))
    api_key = ""
    if not api_key:
        print "[!] You must configure your URLvoid API key"
    else:
        url = "http://api.urlvoid.com/index/exec/"
        parameters = {"domains": s, "api": api_key, "go": 'Check'}
        data = urllib.urlencode(parameters)
        try:
            page = urllib2.urlopen(url, data)
            soup = BeautifulSoup(page)
            new_date = datetime.fromtimestamp(int(soup.find("details")['last_scan'])).strftime("%b %d %Y")
            print "Last Scan :", new_date
            detect_cnt = soup.find("details")['detected']
            if detect_cnt == "-1":
                print "Not scanned yet"
            else:
                print "Detected :", detect_cnt
                if detect_cnt > "0":
                    print "Detections :", soup.find("details")['lists_detected']
        except Exception, msg:
            print msg
def placeFromMap(listing):
    soup = BeautifulSoup(listing)
    geo_location_div = soup.find('div', {'id': 'map'})
    if geo_location_div:
        latitude = geo_location_div['data-latitude']
        longitude = geo_location_div['data-longitude']
        geolocation = (float(latitude), float(longitude))
    else:
        geolocation = None
    address_div = soup.find('div', {'class': 'mapaddress'})
    if address_div:
        address = address_div.text
    else:
        address = None
    posting_title_div = soup.find('h2', {'class': 'postingtitle'})
    if posting_title_div:
        match = re.search(r'.*\((.+?)\)$', posting_title_div.text.strip())
        if match:
            end_label = match.group(1)
        else:
            end_label = None
    else:
        end_label = None
    return address, geolocation, end_label
def ParsePage(self, id):
    resp = urllib.urlopen(self.main_url % id)
    soup = BeautifulSoup(resp.read(), fromEncoding="euc-kr")
    self.meta.m_id = id
    self.meta.m_title = soup.find('h2').string
    self.meta.m_artist = [soup.find("dt", id="artistName").span.a.string]
    strain = SoupStrainer("div", {"class": "album_info"})
    sect = soup.find(strain)
    self.meta.m_thumb = sect.find("img", {"id": "albumBigThumb"})['src']
    self.meta.m_genres = sect.find("img", alt=u"장르").parent.nextSibling.nextSibling.next.string.strip().split('/')
    self.meta.m_release = sect.find("img", alt=u"발매일").parent.nextSibling.nextSibling.next.string.strip()
    self.meta.m_rating = float(sect.find("span", {"class": "text_point"}).string)
    self.meta.m_review = ''.join(soup.find("div", id="albumDesc").findAll(text=True)).strip()
    self.meta.m_review = self.meta.m_review.replace("&amp;", "&")
    self.meta.m_review = self.meta.m_review.replace("&#39;", "'").replace("–", "-")
    self.meta.m_review = unicode(self.meta.m_review, 'utf-8')
    self.meta.m_tracks = []
    for item in soup.findAll("td", {"class": "num"}):
        pos = int(item.string)
        track = item.findNextSiblings('td')[1].a.string
        self.meta.m_tracks.append((pos, track))
    return self.meta
def ParseSeriesCastPage(self, id):
    resp = urllib.urlopen(self.cast_url % id)
    soup = BeautifulSoup(resp.read(), fromEncoding="utf-8")
    pt = soup.find("h5", text=re.compile(u"^\s*출연\s*$"))
    if pt:
        for item in pt.parent.parent.findAll("dl"):
            name = item.find('img')['alt'].strip()
            role = item.find('span', {"class": "etcs"}).string.strip()
            if role.rfind(u" 역") >= 0:
                role = role[:role.rfind(u" 역")]
            else:
                role = ''
            self.meta.s_actors.append((name, role))
    pt = soup.find("h5", text=re.compile(u"^\s*제작진\s*$"))
    if pt:
        for item in pt.parent.parent.findAll('li'):
            if item.contents[0].string.startswith(u"극본"):
                for person in item.contents[1:]:
                    name = person.string.strip()
                    if name:
                        self.meta.s_writers.append(name)
            elif item.contents[0].string.startswith(u"연출"):
                for person in item.contents[1:]:
                    name = person.string.strip()
                    if name:
                        self.meta.s_directors.append(name)
def makeHTMLQuestion(fn, htmldata):
    soup = BeautifulSoup(htmldata)
    # add JS
    soup.find('body')['onload'] = "populateAssignmentID('myAssignmentId')"
    soup.find('head').insert(0, SUBMIT_JS)
    # replace forms
    forms = soup.findAll('form')
    if forms:
        for form in forms:
            if not form.has_key('method'):
                form['method'] = 'POST'
            if not form.has_key('action'):
                if testmode:
                    form['action'] = 'http://workersandbox.mturk.com/mturk/externalSubmit'
                else:
                    form['action'] = 'http://www.mturk.com/mturk/externalSubmit'
            if not form.has_key('onSubmit'):
                form['onSubmit'] = "return verifyTurkSubmit('myAssignmentId');"
            inputtag = Tag(soup, 'input')
            inputtag['type'] = 'hidden'
            inputtag['name'] = 'assignmentId'
            inputtag['id'] = 'myAssignmentId'
            inputtag['value'] = ''
            form.insert(0, inputtag)
    mainurl = uploadfile(fn, str(soup))
    for sub in soup.findAll('img'):
        # TODO
        fn = dirname(fn) + '/' + sub['src']
        uploadfile(fn)
    return ExternalQuestion(escape(mainurl), frame_height)
def _parse_cdphotothread_image_partial(cls, html):
    """
    Input: the HTML from the thread page
    ex: http://www.chiefdelphi.com/media/photos/38464
    Returns the url of the image in the thread
    ex: http://www.chiefdelphi.com/media/img/3f5/3f5db241521ae5f2636ff8460f277997_l.jpg
    """
    html = html.decode("utf-8", "replace")
    # parse html for the image url
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    # 2014-07-15: CD doesn't properly escape the photo title, which breaks the find() for the cdmLargePic element below.
    # Fix by removing all instances of the photo title from the HTML.
    photo_title = soup.find('div', {'id': 'cdm_single_photo_title'}).text
    cleaned_soup = BeautifulSoup(html.replace(photo_title, ''),
                                 convertEntities=BeautifulSoup.HTML_ENTITIES)
    element = cleaned_soup.find('a', {'target': 'cdmLargePic'})
    if element is not None:
        partial_url = element['href']
    else:
        return None
    # partial_url looks something like: "/media/img/774/774d98c80dcf656f2431b2e9186f161a_l.jpg"
    # we want "774/774d98c80dcf656f2431b2e9186f161a_l.jpg"
    image_partial = re.match(r'\/media\/img\/(.*)', partial_url)
    if image_partial is not None:
        return image_partial.group(1)
    else:
        return None
def getUrl(self, url, downloadtype=None):
    if not downloadtype:
        downloadtype = self.downloadtype
    self.errors = []
    html = self.downloadHtml(url)
    if not html:
        return False
    soup = BeautifulSoup(html)
    manga = soup.find("object", {u"id": u"showmedia_videoplayer_object"})
    if manga:
        manga = manga.find("embed", {u"type": u"application/x-shockwave-flash"}).get("flashvars").split("=")
        manga_titulo = soup.find(u"span", {u"itemprop": u"title"}).text
        #nopermitido = ["\\","/","?",":","*","\"","<",">","|"]
        #for i in nopermitido: manga_titulo = manga_titulo.replace(i,' ')
        manga_titulo = self.checkStr(manga_titulo)
        self.manga_titulo = manga_titulo
        n = len(manga) - 1
        serie_id = manga[1][:manga[1].find('&chapterNumber')]
        chapterNumber = self.numCap(manga[2][:manga[2].find('&server')])
        if chapterNumber:
            self.chapterNumber = chapterNumber[0]
        sesion_id = manga[n]
        url_serie = "http://api-manga.crunchyroll.com/chapters?series_id=" + serie_id
        if downloadtype == "Chapter":
            return self.Chapter(sesion_id, url_serie, manga_titulo, chapterNumber)
        elif downloadtype == "Volume":
            return self.Volume(soup)
        elif downloadtype == "Complete":
            pass
        else:
            self.addError("Error: Invalid download type.")
def _parse(self, html):
    soup = bs4.BeautifulSoup(html)
    print_link = soup.findAll('a', text='Print')[0].get('href')
    html2 = grab_url(print_link)
    logger.debug('got html 2')
    # Now we have to switch back to bs3. Hilarious.
    # and the labeled encoding is wrong, so force utf-8.
    soup = BeautifulSoup(html2, convertEntities=BeautifulSoup.HTML_ENTITIES, fromEncoding='utf-8')
    self.meta = soup.findAll('meta')
    p_tags = soup.findAll('p')[1:]
    real_p_tags = [p for p in p_tags
                   if not p.findAll(attrs={'class': "twitter-follow-button"})]
    self.title = soup.find('strong').getText()
    entity = soup.find('span', attrs={'class': 'author'})
    children = list(entity.childGenerator())
    try:
        self.byline = 'By ' + children[1].getText()
    except IndexError:
        self.byline = ''
    self.date = children[-1].strip()
    self.body = '\n' + '\n\n'.join([p.getText() for p in real_p_tags])
def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org" + articleUrl)
    soup = BeautifulSoup(html)
    title = soup.find("h1").text
    content = soup.find('div', {'id': 'mw-content-text'}).find('p').text
    store(title, content)
    return soup.find('div', {'id': 'bodyContent'}).findAll('a', href=re.compile('^(\/wiki\/)((?!:).)*$'))
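# A minimal driver for getLinks, sketched under the assumption that urlopen,
# BeautifulSoup, re, and the store() helper used above are already available.
# The random-walk loop and the starting article are illustrative, not part of
# the original snippet.
import random

links = getLinks("/wiki/Kevin_Bacon")
while len(links) > 0:
    newArticle = random.choice(links)["href"]
    print newArticle
    links = getLinks(newArticle)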
def login(self, username='', passwd='', app_user_nick=None, target=None, use_taobaoid=False):
    if use_taobaoid:
        systime = SysTime()
        params = {
            'app_key': self.API_KEY,
            'timestamp': systime.get(),
            'sign_method': self.SIGN_METHOD,
        }
        if app_user_nick != None:
            params['app_user_nick'] = app_user_nick
        if target != None:
            params['target'] = target
        src = self.APP_SECRET + ''.join(["%s%s" % (k, v) for k, v in sorted(params.iteritems())]) + self.APP_SECRET
        params['sign'] = md5(src).hexdigest().upper()
        form_data = urllib.urlencode(params)
        rsp = requests.get('%s?%s' % (self.TaobaoID_URL, form_data))
        print rsp.content
    else:
        rsp = requests.get('%s%s' % (self.LOGIN_URL, self.API_KEY))
        soup = BeautifulSoup(rsp.content)
        iframe_src = soup.find('iframe')['src']
        rsp = requests.get(iframe_src)
        print rsp.url
        #s = requests.session()
        login_url = 'https://login.taobao.com/member/login.jhtml'
        soup = BeautifulSoup(rsp.content)
        login_url = soup.find('form')['action']
        #inputs = soup.findAll('input')
        forms = self.extract_form_fields(soup)
        forms['TPL_username'] = username
        forms['TPL_password'] = passwd
        rsp = requests.post(login_url, data=forms)
        print rsp.url
        print rsp.content
def fslink_get_video_list(url, count):
    soup = BeautifulSoup(make_request(url), convertEntities=BeautifulSoup.HTML_ENTITIES)
    # items = soup.findAll('a', {'class': 'title t'})
    # for item in items:
    #     print item
    #     print item.a
    #     print item.nextSibling().img('src')
    #     add_dir(item.text.encode("utf-8").replace(' ', ' '), FSLINK + item['href'], 8, icon)
    items = soup.find("div", {"class": "featured-view"})
    for item in items.findAll("a"):
        try:
            add_dir(item.img["alt"], FSLINK + item["href"], 8, FSLINK + item.img["src"])
        except:
            pass
    items = soup.find("div", {"class": "vm-pagination"})
    for item in items.findAll("a"):
        try:
            if item.string == ">":
                if count < 0:
                    count = count + 1
                    fslink_get_video_list(FSLINK + item["href"], count)
            else:
                add_dir(item.string, FSLINK + item["href"], 7, icon)
        except:
            pass
def play(url=common.args.url):
    swfUrl = 'http://admin.brightcove.com/viewer/us20110809.1526/federatedVideoUI/BrightcovePlayer.swf'
    exp_id = common.args.exp_id
    data = common.getURL(url)
    tree = BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)
    key = tree.find('param', attrs={'name': 'playerKey'})['value']
    content_id = tree.find('param', attrs={'name': '@videoPlayer'})['value']
    #key = re.compile('<param name="playerKey" value="(.+?)" />').findall(data)[0]
    #content_id = re.compile('<param name="@videoPlayer" value="(.+?)" />').findall(data)[0]
    #exp_id = re.compile('<param name="playerID" value="(.+?)" />').findall(data)[0]
    renditions = get_episode_info(key, content_id, url, exp_id)['programmedContent']['videoPlayer']['mediaDTO']['renditions']
    rtmp = ''
    hi_res = 0
    selected_video = None
    for video in renditions:
        if int(video['size']) > hi_res:
            selected_video = video
            hi_res = int(video['size'])
    link = selected_video['defaultURL']
    item = xbmcgui.ListItem(path=link)
    return xbmcplugin.setResolvedUrl(pluginhandle, True, item)
def get(self):
    content = self.request.content
    soup = BeautifulSoup(''.join(content))
    # all text has already been converted to unicode automatically; re-encode with encode(xxx) if needed
    title = soup.html.body.h1
    if not title:
        return
    title = title.text
    subtitle = soup.findAll(attrs={'class': 'f_cy f_s16b'})[0].string
    description = soup.find(attrs={'class': 'f_cy f_s14 pt20'})
    description = description.text if description else ''
    smooth_index = soup.findAll(attrs={'class': 'pt20'})[0]
    smooth_index = smooth_index.text if smooth_index else ''
    information = soup.findAll(attrs={'class': 'pt20'})[1]
    information = information.text if information else ''
    tips = soup.find(attrs={'class': 'f_s14 pt20'})
    tips = tips.text + tips.nextSibling.nextSibling.text if tips else ''
    # pics = soup.findAll('a', href=re.compile(r'pic\d'))
    pics = soup.findAll(attrs={'class': 'pic1'})
    if pics:
        imageList = []
        for pic in pics:
            img = pic.find('img')['src']
            imageList.append(img)
            spider.put(HTTP % img)
        self.page.append((self.request.url, title, subtitle, description,
                          smooth_index, information, tips, imageList))
def convertToSQL(line):
    try:
        soup = BeautifulSoup(line)
        name = soup.contents[0].a.string
        link = soup.find("a")["href"]
        try:
            target = soup.find("a")["target"]
        except:
            target = ''
        try:
            rel = soup.find("a")["rel"]
        except:
            rel = ''
        try:
            title = soup.find("a")["title"]
        except:
            title = ''
        sqlLine = 'INSERT into %s.wp_links (link_url, link_name, link_target, link_description, link_rel)' % database
        sqlLine += """ values ('%s','%s','%s','%s','%s');""" % (link, name, target, title, rel)
        print sqlLine
    except:
        pass
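# A small usage sketch for convertToSQL; the module-level `database` value and
# the sample bookmark line below are assumptions for illustration only.
database = 'wordpress'
convertToSQL('<li><a href="http://example.com/" target="_blank" title="Example site" rel="friend">Example</a></li>')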
def index(url, name):
    if not name == 'All Videos':
        if not re.search('page=', url):
            addPlaylist('Play Featured Videos', url, 4, '')
    req = urllib2.Request(url)
    req.addheaders = [('Referer', 'http://www.nfl.com/'),
                      ('Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3 ( .NET CLR 3.5.30729)')]
    response = urllib2.urlopen(req)
    link = response.read()
    response.close()
    soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
    videos = soup.find('ul', attrs={'id': "video-list-items"})('li')
    for video in videos:
        name = video('h3')[0]('a')[0].string
        link = video('h3')[0]('a')[0]['href'].split('/')[3]
        thumb = video('img')[0]['src'].replace('_video_thumbnail_80_60.jpg', '_video_rhr_280_210.jpg')
        try:
            desc = video('p')[1].string + ' \n ' + video('p')[0].string
        except:
            desc = video('p')[0].string
        duration = video('div')[-1].string.replace('\n', '').replace('\t', '')
        addLink(name, link, thumb, duration, desc, 3)
    try:
        page = soup.find('div', attrs={'id': "video-list-pagination"})('a')[-1]['href']
        if not page == '?page=3':
            addDir('Next Page', url.split('?')[0] + page, 1, next)
        else:
            addDir('Next Page', 'http://www.nfl.com/ajax/videos/v2?batchNum=1&channelId=' + url.split('/')[-1].split('?')[0], 5, next)
    except:
        pass
def folderDescription(self, folderUrl):
    #return 'description here'
    http = self.GET(folderUrl, httpSiteUrl)
    fullSoup = BeautifulSoup(http)
    itemInfo = fullSoup.find('div', 'item-info')
    if None == itemInfo:
        return ''
    plot = fullSoup.find('meta', attrs={'name': 'description'})
    try:
        if plot != None:
            plot = plot['content']
        if plot == None:
            plot = ''
    except:
        plot = ''
    detailsString = ''
    try:
        for pair in itemInfo.findAll('tr'):
            right = ''
            for r in pair.findAll('a'):
                right += r.string + ','
            right = right.rstrip(',')
            detailsString += pair.find('td').string.strip() + " " + right + "\n"
    except:
        detailsString = ''
    description = detailsString + '\n\n' + plot
    return description.encode('utf-8')
def load_pol_pics():
    for pol in Politician.objects.exclude(parlpage='').filter(models.Q(headshot__isnull=True) | models.Q(headshot='')):
        print "#%d: %s" % (pol.id, pol)
        print pol.parlpage
        soup = BeautifulSoup(urllib2.urlopen(pol.parlpage))
        img = soup.find('img', id='MasterPage_MasterPage_BodyContent_PageContent_Content_TombstoneContent_TombstoneContent_ucHeaderMP_imgPhoto')
        if not img:
            img = soup.find('img', id="ctl00_cphContent_imgParliamentarianPicture")
        if not img:
            raise Exception("Didn't work for %s" % pol.parlpage)
        imgurl = img['src']
        if '?' not in imgurl:  # no query string
            imgurl = urllib.quote(imgurl.encode('utf8'))  # but there might be accents!
        imgurl = urlparse.urljoin(pol.parlpage, imgurl)
        try:
            test = urllib2.urlopen(imgurl)
            content = urllib.urlretrieve(imgurl)
        except Exception, e:
            print "ERROR ON %s" % pol
            print e
            print imgurl
            continue
        #filename = urlparse.urlparse(imgurl).path.split('/')[-1]
        pol.headshot.save(str(pol.id) + ".jpg", File(open(content[0])), save=True)
        pol.save()
def scrape_wretmans(url, day=None):
    page = urlopen(url)
    soup = BeautifulSoup(page)
    page.close()
    if day == None:
        day = date.today().weekday()
    # No lunch on Saturday or Sunday
    if day == 5 or day == 6:
        return daily_specials
    # Modify all the strange <span class="SpellE"> and insert a space before the text
    for s in soup.findAll("span", {"class": "SpellE"}):
        soup.find(text=s.text).replaceWith(" " + s.text)
    day = [u"Måndag", u"Tisdag", u"Onsdag", u"Torsdag", u"Fredag"][day]
    anchor = soup.find(lambda tag: tag.name == "p" and re.match(tag.text, day))
    siblings = anchor.findNextSiblings("p", limit=2)
    specials = []
    for i, s in enumerate([s.text for s in siblings]):
        s = re.sub("\n", " ", s)
        if s[0:6] == "&nbsp;":
            specials.append(s[6:])
        else:
            specials.append(s)
    return filter(len, specials)
def parse(self, response):
    resp = response.body
    soup = BeautifulSoup(resp)
    try:
        post_id = int(re.search(r'topicid="(.*?)";', str(soup)).group(1))
    except:
        raise UnknownResponseError
    headcode_span = soup.find("span", {"id": "stockheadercode"})
    stock_id = headcode_span.find("a").string
    content = soup.find('div', {'class': 'stockcodec'}).text
    title = soup.find('div', {'id': 'zwconttbt'}).text
    releaseTimePara = re.search(r'发表于 (.*?) (.*?) ', str(soup.find('div', {'class': 'zwfbtime'})))
    part1 = releaseTimePara.group(1).decode('utf-8')
    part2 = releaseTimePara.group(2).decode('utf-8')
    releaseTime = part1 + ' ' + part2
    lastReplyTime = None
    zwlitxb_divs = soup.findAll('div', {'class': 'zwlitime'})
    if len(zwlitxb_divs):
        lastReplyTime = re.search(r'发表于 (.*?)<', str(zwlitxb_divs[0])).group(1).decode('utf-8').replace('&nbsp;', ' ')
    item_dict = {'post_id': post_id, 'content': content, 'releaseTime': releaseTime,
                 'lastReplyTime': lastReplyTime, 'stock_id': stock_id, 'title': title}
    item = GubaPostDetailItem()
    for key in GubaPostDetailItem.RESP_ITER_KEYS:
        item[key] = item_dict[key]
    return item
def _product_urls_and_types(cls, product_types):
    product_links = []
    if 'Store' not in product_types:
        return []
    browser = mechanize.Browser()
    main_urls = [
        cls.base_url + '/tiendas/',
        cls.base_url + '/mirador-del-alto/',
    ]
    for main_url in main_urls:
        soup = BeautifulSoup(browser.open(main_url))
        categories = soup.find('ul', 'optciones_cat').findAll('li')
        for category_element in categories:
            category_url = cls.base_url + category_element.find('a')['href']
            category_soup = BeautifulSoup(browser.open(category_url))
            store_options = category_soup.find('ul', 'optciones_sto').findAll('li')
            for option in store_options:
                url = cls.base_url + option.find('a')['href']
                product_links.append([url, 'Store'])
    return product_links
def scrape_councillor(url, record):
    record["URL"] = "http://www.winnipeg.ca/council/" + url
    soup = BeautifulSoup(scraperwiki.scrape(record["URL"]))
    # strip all HTML comments from the page.
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    [comment.extract() for comment in comments]
    tables = soup.find("div", {"id": "content"}).findAll("table")
    img = soup.find("img", {"class": "bio_pic"})
    if img:
        record["Image"] = "http://www.winnipeg.ca" + img["src"]
    name = soup.find("span", {"class": "bg90B"})
    record["Name"] = name.text.replace('Councillor', '').strip()
    # table = soup.find(text="Ward Information").findParent('table').find("table", {"width": "100%"})
    table = tables[0]
    key = ''
    value = ''
    # Could be improved to add spaces within the addresses.
    for row in table.findAll("tr"):
        cols = row.findAll("td")
        k = cols[0].text.strip().replace(':', '')
        if len(k) > 0:
            add_record_value(record, key, value)
            key = k
            value = ''
        if len(cols) > 1:
            value += cols[1].text.strip() + '\n'
    add_record_value(record, key, value)
def mobileUA(content):
    soup = BeautifulSoup(content, convertEntities=BeautifulSoup.HTML_ENTITIES)
    res = soup.find('html')
    res = res.get('class', '') if res else ''
    return True if 'a-mobile' in res or 'a-tablet' in res else False
import mechanize
import re
import scraperwiki
from BeautifulSoup import BeautifulSoup

br = mechanize.Browser()
br.open("http://www.business.detini.gov.uk/iva_register/IVASearch.aspx")
br.select_form(name="aspnetForm")
# br['ctl00$ContentPlaceHolder1$Surname'] = "McMullan"  # comment to search all
response = br.submit()
#scraperwiki.sqlite.save('data_columns', ['Surname', 'Forename', 'DOB', 'Address', 'Postcode', 'latlng'])
soup = BeautifulSoup(response)
tds = soup.find("table", {"id": "ctl00_ContentPlaceHolder1_GridView1"})
rows = tds.findAll("tr")
for row in rows:
    record = {}
    table_cells = row.findAll("td")
    if table_cells:
        surname = re.sub("(\(.*)|( nee.*)", '',
                         (re.sub("(\s[a|A]ka.*)|(\sformerly.*)", '', table_cells[0].text)))
        surname = re.sub("\s", '', surname)
        record['Surname'] = surname
        record['Forename'] = table_cells[1].text
        dob = re.sub('&nbsp;', "Unknown", table_cells[2].text)
        record['DOB'] = dob
        address = table_cells[3].text
import time
import scraperwiki
import re
from BeautifulSoup import BeautifulSoup

page = 4300
while page <= 9500:
    url = 'http://www.w4mp.org/html/personnel/jobs/disp_job_text.asp?ref=%s' % page
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    table = soup.find('table').find('table')

    def soup_strip_html(st):
        return ''.join([e.strip() for e in st.recursiveChildGenerator()
                        if isinstance(e, unicode)])

    def strip_html(st):
        tags = re.compile(r'<.*?>')
        return tags.sub('', st)

    if table:
        data = {}
        data['url'] = url
        row_count = 0
        for row in table.findAll('tr'):
            cells = row.findAll('td')
            if row_count == 1:
                key = 'title'
                data[key] = soup_strip_html(cells[0])
            else:
                if cells[0].string != '&nbsp;' and cells[1].string != '&nbsp;':
from mechanize import Browser
from BeautifulSoup import BeautifulSoup
import scraperwiki
from scraperwiki import sqlite

mech = Browser()
url = 'http://www.gpupdate.net/en/standings/192/2013-moto3-standings/'
page = mech.open(url)
html = page.read()
soup = BeautifulSoup(html)
resContainer = soup.find("div", {"id": "middle_container"})
rownumber = 0
table = soup.find("table")
for row in table.findAll('tr')[1:40]:
    col = row.findAll('td')
    pos = int(col[0].string.replace(".", ""))
    driver = col[1].a.string
    tempTD = col[1]
    team = tempTD.findAll('span')
    team = team[1].string
    points = col[2].string
    country = tempTD.findAll('img')
    country = country[0]['alt'].upper()
re_rating = re.compile("reviewer-rating")

""" Url settings for scraping """
product_id = 10002735
review_max = 10

""" Scraping """
# generating first review's url
base_url = "http://www.cosme.net/product/product_id/%d/reviews" % product_id
soup = BeautifulSoup(scraperwiki.scrape(base_url).decode('sjis', "ignore"))
div = soup.find(attrs={"class": "review-sec"})
a = soup.find('a', attrs={"class": 'cmn-viewmore'})
review_url = str(a.attrMap['href'])

# getting reviews iteratively
review_count = 1
while 1:
    # scraping review's data
    print "getting review: %d, url: %s" % (review_count, review_url)
    review_page_soup = BeautifulSoup(
        scraperwiki.scrape(review_url).decode('sjis', "ignore"))
    div = review_page_soup.find(attrs={"class": "review-sec"})
    data = {}
    data['product_id'] = product_id

        " value=%s, labels=%s" % (str(item), [label.text for label in item.get_labels()]))
print "\n".join(r)
print br.title()
br.select_form(name='provider_search_1')
br.form['searchType'] = ['43']  # 43 = Ysgolion cynradd (primary schools)
# value=56, labels=['Ysgolion uwchradd'] (secondary schools)
# value=57, labels=['Ysgolion arbennig'] (special schools)
# value=25, labels=['Meithrinfeydd a gynhelir'] (maintained nurseries)
br.response
base_url = 'http://www.estyn.gov.uk/cymraeg/gwybodaeth-am-arolygiadau/adroddiadau-arolygu/'
print br.submit()
soup = BeautifulSoup(br.response().read())
tud_nesa = soup.find('a', {"class": "next"})
next_link = tud_nesa['href']
next_url = base_url + next_link
#atags = soup.findAll('a')
#print atags
#for atag_inst in atags:
#    atag = atag_inst.find(text=re.compile("Next"))
#    if atag:
#        next_link = atag_inst['href']
print next_link
# if next_link:
#     next_url = base_url + next_link['href']
print next_url
# scrape_and_look_for_next_link(next_url)
#soup = BeautifulSoup(br.response().read())
import requests
from BeautifulSoup import BeautifulSoup
import shutil
import os, sys, errno
import urlparse
from simplejson import loads, dumps

print "starting..."
url = 'http://archillect.com'
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html)
container = soup.find('div', attrs={'id': 'container'})

try:
    state = loads(
        open(os.path.join(os.path.dirname(__file__), 'scrape.state'), 'r').read())
except IOError:
    state = {}

if 'last_image' not in state:
    state['last_image'] = ''

# create the output images folder if it doesn't exist
def ensure_dir(directory):
    if not os.path.exists(directory):
import scraperwiki
from BeautifulSoup import BeautifulSoup
import re

# retrieve a page
starting_url = 'http://finance.yahoo.com/q/ks?s=GE+Key+Statistics'
html = scraperwiki.scrape(starting_url)
soup = BeautifulSoup(html)
#mytable = soup.findAll(id="yfncsumtab")
#mysubtable = mytable.findAll('table')
#print mysubtable

# Find the phrase "Forward P/E" and return the value in the next cell
ForwardPEValue = soup.find(text=re.compile("Forward P/E")).findNext('td').text
# or alternatively:
#MarketCap = soup.find(text=re.compile("Market Cap"))
#MarketCapTag = MarketCap.findNext('td').text

record = {soup.find(text=re.compile("Forward P/E")): ForwardPEValue}
scraperwiki.datastore.save([soup.find(text=re.compile("Forward P/E"))], record)

#mytable = soup('table', limit=10)[9]  # Open the 9th table on the page
#tds = mytable.findAll('td')
#for td in tds:
#    print td
#print mytable.prettify()
#print mytable('tr', limit=3)[2].prettify()
def process(folder):
    indexfile = open(os.path.join(folder, 'index.html'), 'rb')
    try:
        soup = BS(indexfile.read())
    finally:
        indexfile.close()
    styles = [x['href'] for x in soup.findAll('link')]
    soup.find('head').contents = BS(head(styles))
    try:
        soup.find('h1').contents = BS('{{=response.title or request.application}}')
        soup.find('h2').contents = BS("{{=response.subtitle or '=response.subtitle'}}")
    except:
        pass
    for match in (soup.find('div', id='menu'),
                  soup.find('div', {'class': 'menu'}),
                  soup.find('div', id='nav'),
                  soup.find('div', {'class': 'nav'})):
        if match:
            match.contents = BS('{{=MENU(response.menu)}}')
            break
    done = False
    for match in (soup.find('div', id='content'),
                  soup.find('div', {'class': 'content'}),
                  soup.find('div', id='main'),
                  soup.find('div', {'class': 'main'})):
        if match:
            match.contents = BS(content())
            done = True
            break
    if done:
        page = soup.prettify()
        page = re.compile("\s*\{\{=response\.flash or ''\}\}\s*", re.MULTILINE)\
            .sub("{{=response.flash or ''}}", page)
        print page
    else:
        raise Exception("Unable to convert")
filename = asciichars(oddchars(title)) + ".ogg"  # upload filename
# Quick check of filename in use - these should be unique
if nameused(filename):
    print Fore.RED + 'Filename found', Fore.YELLOW + "http://commons.wikimedia.org/wiki/File:" + re.sub(
        "%20", "_", urllib.quote(filename)), Fore.WHITE
    continue
localfile = workingdir + ref + ".mp3"  # source mp3 file
localenc = workingdir + ref
print Fore.GREEN + filename + Fore.WHITE
source = r['file']
artist = r['rec']
gallery = r['url']
url = urltry(gallery)
html = htmltry(url, gallery)
soup = BeautifulSoup(html)
rd = str(soup.find('section', {'id': 'recording-data'}).find('tbody'))
date = rd.split(">Date<")[1].split('<td>')[1].split('<')[0]
dtime = ""
if re.search(">Time<", rd):
    dtime = rd.split(">Time<")[1].split('<td>')[1].split('<')[0]
if len(dtime) > 2:
    date += " " + dtime
elevation = ''
if re.search('>Elevation<', rd):
    elevation = rd.split(">Elevation<")[1].split('<td>')[1].split('<')[0]
background = ''
if re.search('>Background<', rd):
    background = rd.split(">Background")[1].split('<td')[1].split('>')[1].split('<')[0]
    if background == "none":
        background = ''
def download_url(filename):
    """"""
    words = {}
    request = httplib2.Http()
    if not config.url_pattern:
        raise config.ConfigException
    url = config.url_pattern % (filename.replace(' ', ''))
    try:
        response, content = request.request(url)
    except exception.WebException, e:
        raise
    if response.status == 200:
        try:
            soup = BeautifulSoup(content)
            song_list = soup.find(monkey="song-list")
            if song_list:
                html_tags = song_list.findAll("span", {"class": "song-title"})
                html_tags.extend(song_list.findAll("span", {"class": "singer"}))
                words = [tag.text for tag in html_tags]
        except Exception, e:
            raise
    return words


def parse(filename):
    """"""
    wordcount = {}
    try:
def main():
    # (make, result pages, rows per page) for every PowerSearch results page to fetch
    make_pages = [
        ('Buick', [1], 200), ('Acura', [1], 200), ('Chrysler', [1, 2], 200),
        ('Dodge', [1, 2, 3], 200), ('GMC', [1, 2, 3], 200), ('Jeep', [1, 2], 200),
        ('Jaguar', [1, 2], 200), ('Land+Rover', [1], 200), ('Lexus', [1], 200),
        ('Lincoln', [1], 200), ('Mazda', [1, 2], 200), ('Mercury', [1], 200),
        ('Saab', [1], 200), ('Subaru', [1], 200), ('Suzuki', [1], 200),
        ('BMW', [1, 2, 3, 4, 5, 6], 200), ('Cadillac', [1, 2], 200),
        ('Chevrolet', [1, 2, 4, 5, 6, 7], 200), ('Ford', [1, 2, 3, 4, 5], 200),
        ('Honda', [1, 2], 200), ('Hyundai', [1, 2, 3], 200), ('Infiniti', [1, 2], 200),
        ('Kia', [1, 2], 200), ('Mercedes-Benz', [1, 2, 3, 4, 5], 200),
        ('MINI', [1, 2], 200), ('Mitsubishi', [1, 2], 100),
        ('Nissan', [1, 2, 3, 4], 200), ('Scion', [1], 200), ('Smart', [1], 200),
        ('Toyota', [1, 2, 3, 4], 200), ('Volvo', [1, 2], 200),
    ]
    url_template = ('http://www.fueleconomy.gov/feg/PowerSearch.do?action=noform&path=4'
                    '&year1=2005&year2=2017&make=%s&srchtyp=newMake&pageno=%d'
                    '&sortBy=Comb&tabView=0&rowLimit=%d')
    urls = [url_template % (make, page, row_limit)
            for make, pages, row_limit in make_pages
            for page in pages]
    tablehead = 'CarModel,ImgUrl,MPG,UserEco\n'  # EngineDisplacement,Transmission,FuelType,
    while len(urls) > 0:
        try:
            htmltext = ''
            htmltext = urllib.urlopen(urls[0]).read()
        except:
            print urls[0]
        soup = BeautifulSoup(htmltext)
        urls.pop(0)
        table = soup.find('table', attrs={'class': 'cars display responsive stickyHeader'})
        for table_body in table.findAll('tbody'):
            rowNum = 0
            if table_body:
                rows = table_body.findAll('tr')
                for row in rows:
                    if rowNum == 0:
                        model = row.find('td').find('a', href=True)
                        #detail = row.find('span', attrs={'class': 'config'})
                        if model:
                            tablehead = tablehead + ' ' + model.text  # + ' ' + detail.text
                    if row.find('td', attrs={'class': 'mpg-epa'}):
                        imgurl = row.find('td', attrs={'class': 'vphoto'}).find(
                            'img', attrs={'class': 'img-thumbnail img-responsive veh-photo'})
                        fuelecoTable = ''
                        try:
                            fuelecoTable = row.find('td', attrs={'class': 'mpg-epa'}).find(
                                'div', attrs={'class': 'panel panel-default mpg-border'}).find(
                                'div', attrs={'class': 'panel-body'}).find(
                                'table', attrs={'class': 'results'}).findAll('tbody')
                            fueleco = ''
                            for fuelecorow in fuelecoTable:
                                fueleco = fuelecorow.find('td', attrs={'class': 'mpg-comb'})
                                break
                        except:
                            pass
                        finally:
                            if fuelecoTable != '':
                                usereco = row.find('td', attrs={'class': 'mpg-user'})
                                tablehead = tablehead + ',' + str(imgurl['src'])
                                tablehead = tablehead + ',' + str(fueleco.text)
                                tablehead = tablehead + ',' + str(usereco.text)
                                tablehead = tablehead + '\n'
                    rowNum += 1
        #tablehead = tablehead + '\n'
        tablehead = tablehead + '\n'
    f = open('Output.csv', 'w')
    f.write(str(tablehead))
    print tablehead
def get_answers(self, question):
    r = self.session.get('http://%s/questions/%s' % (self.site, question))
    q = BeautifulSoup(r.text).find(attrs={'class': 'question'})
    a = BeautifulSoup(r.text).find(attrs={'id': 'answers'})
    qd = {}
    ad = []
    qd['title'] = BeautifulSoup(r.text).find(attrs={'id': 'question-header'}).find('h1').text
    qd['votes'] = q.find(attrs={'class': 'vote-count-post '}).text
    qd['favos'] = q.find(attrs={'class': 'favoritecount'}).text
    qd['text'] = html2md(str(BeautifulSoup(r.text).find(attrs={'class': 'post-text'}).extract()))
    qd['tags'] = [t.text for t in q.findAll(attrs={'class': 'post-tag'})]
    qd['users'] = []
    u = q.find(attrs={'class': 'post-signature owner'})
    for u in u.findAll(attrs={'class': 'user-info '}):
        avatar = u.find(attrs={'class': 'user-gravatar32'}).find('img')
        username = u.find(attrs={'class': 'user-details'}).find('a')
        reputation = u.find(attrs={'class': 'reputation-score'})
        qd['users'].append({
            'owner': True,
            'last_edit': u.find(attrs={'class': 'user-action-time'}).find('span')['title'],
            'avatar': avatar['src'] if avatar else "",
            'username': username.text if username else "",
            'reputation': reputation.text if reputation else ""
        })
    for u in q.findAll(attrs={'class': 'post-signature'}):
        for u in u.findAll(attrs={'class': 'user-info '}):
            avatar = u.find(attrs={'class': 'user-gravatar32'}).find('img')
            username = u.find(attrs={'class': 'user-details'}).find('a')
            reputation = u.find(attrs={'class': 'reputation-score'})
            qd['users'].append({
                'last_edit': u.find(attrs={'class': 'user-action-time'}).find('span')['title'],
                'avatar': avatar['src'] if avatar else "",
                'username': username.text if username else "",
                'reputation': reputation.text if reputation else ""
            })
    qd['comments'] = []
    for c in q.findAll(attrs={'class': 'comment'}):
        qd['comments'].append({
            'id': c['id'].split('-')[-1],
            'score': c.find(attrs={'class': 'comment-score'}).text,
            'text': c.find(attrs={'class': 'comment-copy'}).text,
            'user': c.find(attrs={'class': 'comment-user'}).text,
            'last_edit': c.find(attrs={'class': 'comment-date'}).find('span')['title'],
        })
    for aa in a.findAll(attrs={'class': re.compile(r'^answer( accepted-answer)?$')}):
        aad = {
            'id': aa['id'].split('-')[-1],
            'accepted': 'accepted-answer' in aa['class'],
            'votes': aa.find(attrs={'class': 'vote-count-post '}).text,
            'text': html2md(str(aa.find(attrs={'class': 'post-text'}))),
            'users': [],
            'comments': []
        }
        for u in aa.findAll(attrs={'class': 'post-signature'}):
            avatar = u.find(attrs={'class': 'user-gravatar32'}).find('img')
            username = u.find(attrs={'class': 'user-details'}).find('a')
            reputation = u.find(attrs={'class': 'reputation-score'})
            aad['users'].append({
                'last_edit': u.find(attrs={'class': 'user-action-time'}).find('span')['title'],
                'avatar': avatar['src'] if avatar else "",
                'username': username.text if username else "",
                'reputation': reputation.text if reputation else ""
            })
        for c in aa.findAll(attrs={'class': 'comment'}):
            user = c.find(attrs={'class': 'comment-user'})
            aad['comments'].append({
                'id': c['id'].split('-')[-1],
                'score': c.find(attrs={'class': 'comment-score'}).text,
                'text': c.find(attrs={'class': 'comment-copy'}).text,
                'user': user.text if user else "",
                'last_edit': c.find(attrs={'class': 'comment-date'}).find('span')['title'],
            })
        ad.append(aad)
    return dict(question=qd, answers=ad)
###############################################################################
# Dun Laoghaire Harbour - CURRENT HARBOUR WEATHER
###############################################################################
import scraperwiki
import mechanize
from BeautifulSoup import BeautifulSoup

# retrieve a page
starting_url = 'http://www.dlharbour.ie/weather/index.php'
html = scraperwiki.scrape(starting_url)
#print html
soup = BeautifulSoup(html)
wdata = soup.find('div', {'class': 'wdata'})
print wdata

record = {}
lis = wdata.findAll('li')
for li in lis:
    val = li.text.split(':', 1)
    if len(val) > 1:
        record[val[0]] = val[1]
print record

# save records to the datastore
scraperwiki.sqlite.save(['Date', 'Time'], record)
def _get_content(self, div_id):
    soup = BeautifulSoup("".join(self.htmlsourse))
    self.data = str(soup.find("div", {"id": div_id}))
def poll_for_polyphen2_results(sid):
    """
    Polls PolyPhen2's GGI web interface for updates on the progress of the job.
    Once the job has completed, the full result file is returned.
    """
    curr_step = -1
    max_tries = 10
    tries = 0
    wait_msg = "Waiting for PolyPhen2 results => %s"
    done_msg = " => Done.\n"
    while True:
        params = urllib.urlencode({
            '_ggi_project': 'PPHWeb2',
            '_ggi_origin': 'manage',
            '_ggi_target_manage': 'Refresh',
            'sid': sid
        })
        doc = None
        while doc is None:
            try:
                response = urllib2.urlopen(pph2_url, params)
                doc = response.read()
            except (socket.timeout, IOError):
                pass
        soup = BeautifulSoup(doc)
        status_td = soup.find('td', text=re.compile(r'^Batch \d+:'))
        if status_td is None:
            # We might be done; make sure this page is not an error page
            if soup.find('b', text=re.compile(r'^Service Name:')):
                break
            else:
                tries += 1
                if tries >= max_tries:
                    raise RemoteException(
                        'PolyPhen won\'t let us check the status right now.')
                spin(15)
                continue
        pos_td = status_td.parent.parent.findAll('td')[1]
        try:
            pos = int(pos_td.string)
        except ValueError:
            pos = 0
        shortened = re.sub(r'^Batch \d+:\s+', '', str(status_td))
        this_step = steps.index(shortened)
        if curr_step != this_step:
            if curr_step != -1:
                write_status((wait_msg + done_msg) % steps[curr_step], True)
            curr_step += 1
            while curr_step < this_step:
                # Write out steps that were completed between refreshes.
                write_status((wait_msg + done_msg) % steps[curr_step])
                curr_step += 1
            maxpos = pos
        write_status(wait_msg % shortened, maxpos - pos, maxpos)
        spin(15)
    if curr_step != -1:
        write_status((wait_msg + done_msg) % steps[curr_step], True)
        curr_step += 1
    while curr_step < len(steps):
        # Write out steps that were completed before the last refresh.
        write_status((wait_msg + done_msg) % steps[curr_step])
        curr_step += 1
    result_url = pph2_result_url % sid
    while True:
        error = False
        try:
            write_status("Waiting for PolyPhen2 results => Waiting for download", True)
            response = urllib2.urlopen(result_url)
            result = response.read()
            if result:
                break
        except (socket.timeout, IOError):
            spin(15)
        if error:
            raise RemoteException(error.split("\n")[0])
    write_status(True)
    return result
def getDiscurso(url):
    html = unicode(scraperwiki.scrape(url), 'utf-8', 'ignore')
    soup = BeautifulSoup(html, fromEncoding='utf-8')
    soup = soup.find('div', {'id': 'content'})
    soup = soup.find('p', {'align': 'justify'})
    return soup.renderContents()
def main(bot, args):
    '''Reply to a listener. Parameters: <user_id> <message>
    If an exclamation mark is given as user_id, the message is shown as an
    announcement.
    ? user_id -- blacklist user_id; their messages will stop reaching the DJ.
    ?? -- show the blacklist.
    ?! -- clear the blacklist.'''
    syl = {
        '0': 'be', '1': 'sa', '2': 'ko', '3': 'pa', '4': 're',
        '5': 'du', '6': 'ma', '7': 'ne', '8': 'wa', '9': 'si',
        'a': 'to', 'b': 'za', 'c': 'mi', 'd': 'ka', 'e': 'ga', 'f': 'no'
    }
    salt = bot.settings["ans_salt"]
    message_limit = 250
    userpost = ""
    if len(args) == 1 and args[0] != "??" and args[0] != "?!" or not len(args):
        return
    blacklisting = False
    if args[0] != "!":
        if args[0] == "??":
            return _("blacklist:\n%s") % "\n".join(bot.blacklist)
        if args[0] == "?!":
            bot.blacklist = []
            return _("blacklist cleared.")
        if args[0] == "?":
            blacklisting = True
            del args[0]
        if len(args[0]) != 12:
            return _("incorrect name entered, should be 12 symbols.")
        check = md5()
        check.update(args[0][:8].encode('utf-8') + salt)
        if check.hexdigest()[:4] != args[0][8:12]:
            return _("incorrect name entered (checksum invalid).")
        if blacklisting:
            bot.blacklist.append(args[0])
            return _("%s was added to blacklist.") % args[0]
        to = ">>" + args[0]
        if args[0] in bot.usersposts:
            userpost = "<span class=\"userpost\">> " + escape(bot.usersposts[args[0]]) + "</span><br/>"
    else:
        to = "!"
    message = " ".join(args[1:])
    if len(message) > message_limit:
        return _("too long answer, should be less than %d symbols, you entered %d symbols.") % (message_limit, len(message))
    soup = BeautifulSoup(open(bot.settings["ans_file"], "r"))
    posts = soup.findAll('p')
    new_post = Tag(soup, 'p')
    user_id = Tag(soup, 'span', [('id', 'user_id')])
    if to != "!":
        user_id.insert(0, escape(to))
    else:
        user_id.insert(0, "<b>>>ОБЪЯВЛЕНИЕ<<</b>")
    new_post.insert(0, '[' + datetime.datetime.strftime(datetime.datetime.now(), "%H:%M:%S") + ']')
    new_post.insert(1, user_id)
    # [url] becomes a plain link, {url} becomes an inline image link.
    message = re.sub(
        r'\[([^]]*)\]',
        lambda x: '<a href="' + x.group(1).replace("&amp;", "&") + '" target="_blank">' + x.group(1) + '</a>',
        escape(message))
    message = re.sub(
        r'\{([^}]*)\}',
        lambda x: '<a href="' + x.group(1).replace("&amp;", "&") +
        '" target="_blank"><img style="max-width: 200px; max-height: 200px;display: inline;" src="' +
        x.group(1).replace("&amp;", "&") + '"/></a>',
        message)
    new_post.insert(2, userpost + message)
    if len(posts) > 0:
        posts[0].parent.insert(2, new_post)
    else:
        soup.find('h1').parent.insert(1, new_post)
    if len(posts) > 9:
        posts[len(posts) - 1].extract()
    f = open(bot.settings["ans_file"], "w")
    f.write(soup.prettify())
    f.close()
    return _("sent.")
                '[Errors when replying to ads?]')[2].partition(
                    'START CLTAGS')[0].partition(
                        '<!-- imgList = new Array')[0]
        # TODO get address

        # Finally, throw the listing into data[]
        data.append(listing)

        # Deal with CL's occasional timeouts
        if i % 10 == 0:
            print "sleeping for 10 seconds..."
            sleep(10)

    # Retrieve the URL for the next page
    nexturl = soup.find(text=re.compile("next 100 postings"))

    # Set next crawl URL
    if nexturl is not None:
        crawlurl = rooturl + nexturl.parent['href']
    else:
        crawlurl = ""

print data

# Write data to database
scraperwiki.sqlite.execute("""
DROP TABLE IF EXISTS `cl_sc_commercial_space`
""")
scraperwiki.sqlite.execute("""
print "not found" continue a_url = u_base + href time.sleep(random.random()) result = requests.get(a_url) if result.status_code != 200: print author, response.reason break soup2 = BS(result.content) data = [] if not soup2: print "no content found" continue table = soup2.find('table', attrs={'id': 'gsc_rsb_st'}) if not table: print "content not found" continue table_body = table.find('tbody') rows = table_body.findAll('tr') for row in rows: cols = row.findAll('td') cols = [ele.text.strip() for ele in cols] data.append([ele for ele in cols if ele]) write_row += "{},{},{},{},".format(data[1][1], data[1][2], data[2][1], data[2][2]) domains = [] for text in soup2.findAll(attrs={ 'class': 'gsc_prf_il',
def getEventDetails(self, eventID):
    """
    Return event details for a given event ID from sherdog.com's fightfinder.

    Arguments:
    eventID -- A String containing the event's numeric event ID from sherdog.com

    Returns:
    eventDetails -- A dictionary containing the event's details as scraped from sherdog.com.

    eventDetails keys:
    ID -- Event's ID
    title -- Event's full title
    promotion -- Promotion which ran the event
    date -- Date of event (YYYY-MM-DD)
    venue -- Event's venue
    city -- City in which event took place
    fights -- A list containing dictionaries (fightDetails[]) with the details of each fight on the event

    fightDetails keys:
    ID -- Fight's ID
    fighter1 -- Sherdog ID for the first fighter
    fighter2 -- Sherdog ID for the second fighter
    winner -- Sherdog ID for the winning fighter
    result -- Method of victory/Type of decision
    referee -- Referee that presided over the fight
    round -- Round in which fight ended
    time -- Time at which final round ended
    """
    # initialise empty dict to store event details
    eventDetails = {}
    # store event ID in dict
    eventDetails['ID'] = eventID
    # generate event url
    url = self.__eventURL__ % eventID
    # retrieve html and initialise beautifulsoup object for parsing
    soup = BeautifulSoup(self.getHtml(url))
    pageTitle = soup.html.head.title.string
    pageTitleArr = pageTitle.split(' - ', 1)
    # find and store event title in dict
    eventDetails['title'] = pageTitle
    # find and store promotion name in dict
    eventDetails['promotion'] = pageTitleArr[0]
    # find and store event date in dict
    tempDate = soup.find("div", {"class": "authors_info"}).find("span", {"class": "date"}).string
    eventDetails['date'] = datetime.datetime.strptime(tempDate, '%b %d, %Y')
    eventTemp = ''
    try:
        # find and store venue in dict
        eventTemp = soup.find("span", {"class": "author"}).findAll(text=True)[0].split("\r\n")
        eventDetails['venue'] = eventTemp[0].lstrip().rstrip(",")
    except:
        # store blank string if no venue listed
        eventDetails['venue'] = ''
    try:
        # find and store city in dict
        eventDetails['city'] = eventTemp[1].lstrip().rstrip()
    except:
        # store blank string if no city listed
        eventDetails['city'] = ''
    # find list of fights for event
    table = soup.find("div", {"class": "module_fight_card"})
    # initialise empty list to store fightDetails dicts
    eventDetails['fights'] = []
    fightDetails = {}
    fights = []
    # the main event is rendered separately from the rest of the card
    fightDetails['fighter1'] = soup.find("div", {"class": "fighter left_side"}).a['href'].rsplit("-", 1)[1]
    fightDetails['fighter2'] = soup.find("div", {"class": "fighter right_side"}).a['href'].rsplit("-", 1)[1]
    leftResult = soup.find("div", {"class": "fighter left_side"}).find("span", {"class": "final_result win"})
    rightResult = soup.find("div", {"class": "fighter right_side"}).find("span", {"class": "final_result win"})
    if leftResult is not None and leftResult.string == 'win':
        fightDetails['winner'] = fightDetails["fighter1"]
    if rightResult is not None and rightResult.string == 'win':
        fightDetails['winner'] = fightDetails["fighter2"]
    tempCells = soup.find("table", {"class": "resume"}).findAll("td")
    fightDetails['ID'] = int(tempCells[0].findAll(text=True)[1].strip())
    fightDetails['result'] = tempCells[1].findAll(text=True)[1].strip()
    fightDetails['referee'] = tempCells[2].findAll(text=True)[1].strip()
    fightDetails['round'] = tempCells[3].findAll(text=True)[1].strip()
    fightDetails['time'] = tempCells[4].findAll(text=True)[1].strip()
    fights.append(fightDetails)
    # find all rows in the fights table
    rows = soup.find("div", {"class": "content table"}).findAll("tr")
    # set rowcount to 0
    rowcount = 0
    # loop through all rows in fights table
    for row in rows:
        # ignore first row in table
        if not rowcount == 0:
            # find all columns in table
            cols = row.findAll('td')
            # initialise empty dict to store fight details
            fightDetails = {}
            # find and store fight ID
            fightDetails['ID'] = int(cols[0].string)
            # find and store ID for fighter1
            fightDetails['fighter1'] = cols[1].a['href'].rsplit('-', 1)[1]
            # find and store ID for fighter2
            fightDetails['fighter2'] = cols[5].a['href'].rsplit('-', 1)[1]
            # check that fight was not a draw
            win = cols[1].find("span").find(text=True)
            if win == 'win':
                # find and store winner ID
                fightDetails['winner'] = fightDetails['fighter1']
            else:
                # store blank string if no winner
                fightDetails['winner'] = ''
            # find and store result
            fightDetails['result'] = cols[6].find(text=True).string
            # find and store referee
            fightDetails['referee'] = cols[6].find("span").string
            # find and store round in which fight ended
            fightDetails['round'] = cols[7].string
            # find and store end time of fight
            fightDetails['time'] = cols[8].string
            # add fightDetails dict to fights list
            fights.append(fightDetails)
        # increase rowcount by 1
        rowcount = rowcount + 1
    # sort fights by ID and store them in the event dict
    sort_on = "ID"
    sortFights = [(dict_[sort_on], dict_) for dict_ in fights]
    sortFights.sort()
    eventDetails['fights'] = [dict_ for (key, dict_) in sortFights]
    # return the scraped details
    return eventDetails
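# The decorate-sort-undecorate idiom at the end of getEventDetails() could be
# written more directly with sorted() and a key function; an equivalent sketch
# (not part of the original class):
# eventDetails['fights'] = sorted(fights, key=lambda fight: fight['ID'])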
for s in symbols:
    # First form URL
    urltext = 'http://www.bloomberg.com/quote/' + s
    # print urltext

    # Open URL
    ##########
    url = urllib2.urlopen(urltext)
    soup = BeautifulSoup(url)

    # 52w high and low
    ##################
    table = soup.find('table', 'snapshot_table')
    row = table.find('tr', 'bottom')
    cells = row.findChildren(['th', 'td'])
    numbers = cells[3].text
    wlow, trail = numbers.split(" ", 1)
    whigh = re.search(r'[\d,.]+$', trail).group(0)

    # Symbol lookup
    ###############
    tag = soup.find('h3')

    # Price lookup
    ###############
    tagprice_currency = soup.find('span', {'class': ' price'}).text
    # print tagprice_currency
def zamandaily(url):
    with open(url) as f:
        content = f.readlines()
    a = ""
    for line in content:
        a += line + " "
    soup = BeautifulSoup(a)
    output = "{\"source\": \"" + "Zaman Daily" + "\",\n"
    output += "\"url\": \"" + "http://www.todayszaman.com/diplomacy_ihh-icc-finds-israel-guilty-of-war-crimes-in-mavi-marmara-raid_363650.html" + "\",\n"
    output += "\"title\": "
    a = soup.find("title")
    output += "\"" + a.text + "\",\n"
    b = soup.find("div", {"class": "topDate"})
    # e.g. "November 30, 2014, Sunday"
    date = b.text
    arr = date.split(" ")
    month = arr[0]
    day = arr[1].strip(",")
    year = arr[2].strip(",")[2:]
    if month == "January":
        m = 1
    elif month == "February":
        m = 2
    elif month == "March":
        m = 3
    elif month == "April":
        m = 4
    elif month == "May":
        m = 5
    elif month == "June":
        m = 6
    elif month == "July":
        m = 7
    elif month == "August":
        m = 8
    elif month == "September":
        m = 9
    elif month == "October":
        m = 10
    elif month == "November":
        m = 11
    elif month == "December":
        m = 12
    d = str(m) + "/" + day + "/" + year
    output += "\"date\": \"" + d + "\",\n"
    output += "\"article_text\": \""
    c = soup.findAll("p")
    for paragraph in c:
        try:
            output += paragraph.text.decode("utf-8").replace("\"", "'").replace("^M", "").replace("\n", "").replace("\\", "").replace(r'\r', '').replace("\r", "").replace("\n", "") + " "
        except UnicodeEncodeError:
            pass
    output += "\"}\n"
    # output = output.replace("\"","")
    return ''.join([i if ord(i) < 128 else ' ' for i in output])
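# The long if/elif chain above maps month names to numbers; a more compact
# alternative (a sketch, not part of the original scraper) uses the standard
# calendar module:
import calendar

MONTH_NUMBERS = {name: idx for idx, name in enumerate(calendar.month_name) if name}

def month_number(name):
    # e.g. month_number("November") == 11
    return MONTH_NUMBERS[name]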
# get total number of rows
print("Total no. of rows: %d" % (csvreader.line_num))

for row in rows[:]:
    # link
    print(len(row))
    links.append(row[0])
    address.append(row[1])
    name.append(row[2])
    driver.get(row[0])
    content = driver.page_source
    soup = BeautifulSoup(content)
    a = soup.find('div', attrs={'class': 'rv_highlights__section pr10'})
    if a is not None:
        f = a.find('div', attrs={'class': 'fontsize13 ln18'})
        # print(f)
        food.append(f.text)
    else:
        food.append("")

df = pd.DataFrame({
    'Link': links,
    'Address': address,
    'Name': name,
    'Food Name': food
})
df.to_csv('Fooddataset.csv', index=False, encoding='utf-8')
def parse_html(db, cursor):
    # create table for film data
    print "creating nyt_data"
    cursor.execute('''drop table if exists nyt_data''')
    cursor.execute('''
        create table nyt_data
        (path varchar(255), movie_title varchar(255), text_body text,
         people_names text, years text)''')

    # initialize final dataframe
    nyt_data = pd.DataFrame()

    # i want to print progress, so we need to know how many files we're iterating over
    # (inefficient/redundant but low cost)
    ind = 1
    total_files = 0
    for root, dirs, files in os.walk("./nyt"):
        for file in files:
            if file.endswith('.html'):
                total_files += 1

    print "processing nyt data"
    # loop through the files in the directory
    for root, dirs, files in os.walk("./nyt"):
        for name in files:
            if name.endswith(".html"):
                stdout.write("\r%d/%d" % (ind, total_files))
                stdout.flush()
                df = {}
                # initialize beautifulsoup stuff
                path = root + "/" + name
                article = open(path, 'r')
                soup = BeautifulSoup(article)
                df['path'] = path

                # get the movie title first. in almost all cases it's either in a div with
                # id = movieTitle or in an itemprop tag
                div_title = soup.find('div', {'id': 'movieTitle'})
                itemprop_title = soup.find(itemprop="name")

                # best source seems to be itemprop_title. however it's only available after 2008.
                # if 2007/2008, use the div_title
                # this method seems to correctly get 496/500 titles, the missing ones seem to be one-off
                # changes in formatting. is there a better way to get these remaining 4? hmmm
                if re.match(".*/200[78]/.*", path):
                    if div_title:
                        df['movie_title'] = re.sub("\(.*", "", div_title.text).strip()
                    else:
                        df['movie_title'] = None
                else:
                    if itemprop_title:
                        df['movie_title'] = itemprop_title.text
                    else:
                        df['movie_title'] = None

                # now look for names of people
                # grab the body of the story, which can be in two places depending when the article was written
                if re.match(".*/200[78]/.*", path):
                    text_body = soup.findAll('div', attrs={'class': 'articleBody'})
                    if len(text_body) > 1:
                        text_body = [x.text for x in text_body]
                        text_body = ' '.join(text_body)
                        df['text_body'] = text_body
                    elif len(text_body) == 1:
                        text_body = text_body[0].text
                        df['text_body'] = text_body
                    else:
                        df['text_body'] = None
                else:
                    text_body = soup.findAll('div', {'id': 'story-body'})
                    if len(text_body) > 1:
                        text_body = [x.text for x in text_body]
                        text_body = ' '.join(text_body)
                        df['text_body'] = text_body
                    elif len(text_body) == 1:
                        text_body = text_body[0].text
                        df['text_body'] = text_body
                    else:
                        df['text_body'] = None

                # use nltk to pull words that look like names
                # method borrowed from:
                # http://timmcnamara.co.nz/post/2650550090/extracting-names-with-6-lines-of-python-code
                # i chose this method over others (there are lots of ways to do this!) because it is
                # relatively short/simple and seemed to produce good results (not too much non-name stuff,
                # caught most of the names)
                director_names = ""
                if text_body:
                    for sent in nltk.sent_tokenize(text_body):
                        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                            if hasattr(chunk, 'node'):
                                if chunk.node == "PERSON":
                                    director_names = director_names + " " + ' '.join(c[0] for c in chunk.leaves())
                    df['people_names'] = director_names
                else:
                    df['people_names'] = None

                # pull out things that look like years
                # also include year that article was published
                if text_body:
                    years = re.findall("\d{4}", text_body)
                    years_string = " ".join(years) + " " + re.search("\d{4}", path).group(0)
                    df['years'] = years_string
                else:
                    df['years'] = None

                # put data for this page in our main dataframe
                nyt_data = nyt_data.append(df, ignore_index=True)
                ind += 1

    # put the main dataframe in the db
    stdout.write("\n")
    print "inserting data to db"
    nyt_data.to_sql(con=db, name='nyt_data', flavor='mysql', index=False, if_exists='append')
    db.commit()
def getFighterDetails(self, fighterID):
    """
    Return fighter details for a given fighter ID from sherdog.com's fightfinder.

    Arguments:
    fighterID -- A String containing the fighter's numeric ID from sherdog.com

    Returns:
    fighterDetails -- A dictionary containing the fighter's details as scraped from sherdog.com

    fighterDetails keys:
    ID -- Fighter's ID
    name -- Fighter's full name
    nickName -- Fighter's current nickname
    association -- Fighter's current camp/association
    height -- Fighter's height
    weight -- Fighter's weight (in lbs)
    birthDate -- Fighter's date of birth
    city -- Fighter's city of birth
    country -- Fighter's country of birth
    thumbUrl -- URL of fighter image
    """
    # initialise dict to store fighter details, with all keys set to empty values
    fighterDetails = {}
    fighterDetails['ID'] = ''
    fighterDetails['name'] = ''
    fighterDetails['nickName'] = ''
    fighterDetails['association'] = ''
    fighterDetails['height'] = ''
    fighterDetails['weight'] = ''
    fighterDetails['birthDate'] = ''
    fighterDetails['city'] = ''
    fighterDetails['country'] = ''
    # store fighter ID in dict
    fighterDetails['ID'] = fighterID
    # generate fighter url
    url = self.__fighterURL__ % fighterID
    # retrieve html and initialise beautifulsoup object for parsing
    soup = BeautifulSoup(self.getHtml(url))
    bio = soup.find("div", {"class": "module bio_fighter"})
    fighterDetails['name'] = bio.h1.find(text=True)
    try:
        fighterDetails['nickName'] = bio.find("span", {"class": "nickname"}).em.string
    except Exception:
        fighterDetails['nickName'] = ''
    try:
        fighterDetails['association'] = bio.find("span", {"class": "item association"}).strong.string
    except Exception:
        fighterDetails['association'] = ''
    # height and weight are listed as "<strong>value</strong> unit"
    heightTemp = bio.find("span", {"class": "item height"})
    fighterDetails['height'] = ("%s %s" % (heightTemp.strong.string, heightTemp.findAll(text=True)[3].string)).rstrip()
    weightTemp = bio.find("span", {"class": "item weight"})
    fighterDetails['weight'] = ("%s %s" % (weightTemp.strong.string, weightTemp.findAll(text=True)[3].string)).rstrip()
    fighterDetails['birthDate'] = bio.find("span", {"class": "item birthday"}).findAll(text=True)[0].rsplit(":")[1].strip()
    try:
        birthpTemp = bio.find("span", {"class": "item birthplace"})
        fighterDetails['city'] = birthpTemp.findAll(text=True)[1].strip()
        fighterDetails['country'] = birthpTemp.strong.string
    except Exception:
        fighterDetails['city'] = ''
        fighterDetails['country'] = ''
    """ Commented
    # check if row contains 'city' and store to fighterDetails dict
    elif infoItem[0].string.rstrip(' ').rstrip('\n') == 'City':
        fighterDetails['city'] = infoItem[1].string.rstrip(' ').rstrip('\n')
    # check if row contains 'country' and store to fighterDetails dict
    elif infoItem[0].string.rstrip(' ').rstrip('\n') == 'Country':
        fighterDetails['country'] = infoItem[1].string.rstrip(' ').rstrip('\n')
    # find and store url for fighter image
    fighterDetails['thumbUrl'] = soup.find("span", {"id" : "fighter_picture"}).img['src']
    """
    # return scraped details
    return fighterDetails
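# A minimal usage sketch for the Sherdog fightfinder methods getEventDetails()
# (earlier) and getFighterDetails() above. The class name "SherdogScraper" and
# the event ID are hypothetical; the real class must define __eventURL__,
# __fighterURL__ and getHtml().
# scraper = SherdogScraper()
# event = scraper.getEventDetails('12345')
# for fight in event['fights']:
#     fighter = scraper.getFighterDetails(fight['fighter1'])
#     print fighter['name'], fighter['weight']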
RANGE = 1000000

def strip_tags(s):
    start = s.find("<")
    if start == -1:
        return s  # No tags
    end = s.find(">", start) + 1
    return s[:start] + strip_tags(s[end:])

for i in xrange(RANGE):
    url = "http://www.ukrlp.co.uk/ukrlp/ukrlp_provider.page_pls_provDetails?x=&pn_p_id=1%07d&pv_status=VERIFIED" % (i)
    html = scraperwiki.scrape(url)
    bs = BeautifulSoup(html)
    data = {}
    content = bs.find("div", {"class": "pod_main_body"})
    title = content.findAll("div", {"class": "provhead"})
    data["ukprn"] = int(title[0].string.split()[1])
    data["name"] = strip_tags(str(title[1])).strip()
    raw = str(content)
    start = raw.find('<div class="assoc">Legal address</div>')
    end = raw.find('<div class="assoc">Primary contact address</div>')
    address = ""
    for line in raw[start:end].split("<br />")[1:]:
        l = line.strip()
        if l.startswith("<strong>"):
            s = l.find(">") + 1
            e = l.find("</strong>")
            data["legal_" + l[s:e].strip()[:-1]] = l[e+9:].strip()
        else:
            if l == "<":
                continue
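# Quick illustration of strip_tags() above (the sample markup is hypothetical,
# not taken from a UKRLP page):
# strip_tags('<div class="provhead">Example <b>Provider</b></div>')
# -> 'Example Provider'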
def enhance(request):
    check_login(request)
    session = request.session
    google_resource_id = ""
    slideshare_id = ""
    embed_google = False
    embed_slideshare = False
    not_converted = True
    show_iframe = False
    form = Form(request, schema=QuestionAnswerSchema)
    validate_form = form.validate()
    print form.all_errors()
    if session.has_key('google-resource-id'):
        google_resource_id = session['google-resource-id']
    if session.has_key('slideshare_id'):
        slideshare_id = session['slideshare_id']
        if fetch_slideshow_status(slideshare_id) == "2":
            not_converted = False
            show_iframe = True
    if google_resource_id != "":
        embed_google = True
    if slideshare_id != "":
        embed_slideshare = True
    templatePath = "templates/google_ss_preview.pt"
    if validate_form:
        introductory_paragraphs = request.POST.get('introductory_paragraphs')
        question_count = 0
        cnxml = session["cnxml"] + """<content><section id="intro-section-title"><title id="introtitle">Introduction</title><para id="introduction-1">""" + introductory_paragraphs + """</para></section><section id="slides-embed"><title id="slide-embed-title">View the slides</title><figure id="ss-embed-figure"><media id="slideshare-embed" alt="slideshare-embed"><iframe src="http://www.slideshare.net/slideshow/embed_code/""" + slideshare_id + """" width="425" height="355" /></media></figure></section>"""
        for i in range(1, 6):
            form_question = request.POST.get('question-' + str(i))
            if form_question:
                # this gives us something like 'answer-1-1', so our solution is this
                form_radio_answer = request.POST.get('radio-' + str(i))
                question_count += 1
                if question_count == 1:
                    cnxml += """<section id="test-section"><title>Test your knowledge</title>"""
                itemlist = ""
                for j in range(1, 10):
                    try:
                        form_all_answers = request.POST.get('answer-' + str(i) + '-' + str(j))
                        if form_all_answers:
                            itemlist += "<item>" + form_all_answers + "</item>"
                    except:
                        print "No element found"
                if form_radio_answer:
                    solution = request.POST.get(form_radio_answer)
                    cnxml += """<exercise id="exercise-""" + str(i) + """"><problem id="problem-""" + str(i) + """"><para id="para-""" + str(i) + """">""" + str(form_question) + """<list id="option-list-""" + str(i) + """" list-type="enumerated" number-style="lower-alpha">""" + str(itemlist) + """</list></para></problem>"""
                else:
                    print "ELSE CONDITION OF radio"
                    solution = request.POST.get('answer-' + str(i) + '-1')
                    cnxml += """<exercise id="exercise-""" + str(i) + """"><problem id="problem-""" + str(i) + """"><para id="para-""" + str(i) + """">""" + str(form_question) + """</para></problem>"""
                print "FORM RADIO ANSWER", form_radio_answer
                print "SOLUTION", solution
                cnxml += """ <solution id="solution-""" + str(i) + """"> <para id="solution-para-""" + str(i) + """">""" + solution + """</para></solution></exercise>"""
                """form_solution = request.POST.get('solution-'+str(i))
                all_post_data = {"data":{"options":form_options,"solution":form_solution,"question":form_question}}
                for question in all_post_data:
                    options = all_post_data[question]['options']
                    solution = all_post_data[question]['solution']
                    asked_question = all_post_data[question]['question']
                    optionlist=""
                    for option in options:
                        optionlist+="<item>"+option+"</item>"""
                #cnxml+="""<exercise id="exercise-"""+str(j)+""""><problem id="problem-"""+str(j)+""""><para id="para-"""+str(j)+"""">"""+str(asked_question)+"""<list id="option-list-"""+str(j)+"""" list-type="enumerated" number-style="lower-alpha">"""+str(optionlist)+"""</list></para></problem>"""
                #cnxml+=""" <solution id="solution-"""+str(j)+""""> <para id="solution-para-"""+str(j)+"""">"""+solution+"""</para></solution></exercise>"""
                #j+=1
        metadata = session['metadata']
        if question_count >= 1:
            cnxml += "</section></content></document>"
        else:
            cnxml += "</content></document>"
        workspaces = [(i['href'], i['title']) for i in session['login'].collections]
        metadata_entry = sword2cnx.MetaData(metadata)
        zipped_filepath = session['userfilepath']
        zip_archive = zipfile.ZipFile(zipped_filepath, 'w')
        zip_archive.writestr("index.cnxml", cnxml)
        zip_archive.close()
        conn = sword2cnx.Connection("http://cnx.org/sword/servicedocument",
                                    user_name=session['login'].username,
                                    user_pass=session['login'].password,
                                    always_authenticate=True,
                                    download_service_document=True)
        collections = [{'title': i.title, 'href': i.href} for i in sword2cnx.get_workspaces(conn)]
        session['login'].collections = collections
        workspaces = [(i['href'], i['title']) for i in session['login'].collections]
        session['workspaces'] = workspaces
        with open(zipped_filepath, 'rb') as zip_file:
            deposit_receipt = conn.create(
                col_iri=workspaces[0][0],
                metadata_entry=metadata_entry,
                payload=zip_file,
                filename='upload.zip',
                mimetype='application/zip',
                packaging='http://purl.org/net/sword/package/SimpleZip',
                in_progress=True)
        session['dr'] = deposit_receipt
        session['deposit_receipt'] = deposit_receipt.to_xml()
        soup = BeautifulSoup(deposit_receipt.to_xml())
        data = soup.find("link", rel="edit")
        edit_iri = data['href']
        session['edit_iri'] = edit_iri
        creator = soup.find('dcterms:creator')
        username = session['login'].username
        email = creator["oerdc:email"]
        url = "http://connexions-oerpub.appspot.com/"
        post_values = {
            "username": username,
            "email": email,
            "slideshow_id": slideshare_id
        }
        data = urllib.urlencode(post_values)
        google_req = urllib2.Request(url, data)
        google_response = urllib2.urlopen(google_req)
        now_string = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
        temp_dir_name = '%s-%s' % (request.session['login'].username, now_string)
        save_dir = os.path.join(request.registry.settings['transform_dir'], temp_dir_name)
        os.mkdir(save_dir)
        request.session['upload_dir'] = temp_dir_name
        cnxml = clean_cnxml(cnxml)
        save_cnxml(save_dir, cnxml, [])
        return HTTPFound(location=request.route_url('metadata'))
        # return HTTPFound(location=request.route_url('updatecnx'))
    response = {
        'form': FormRenderer(form),
        "slideshare_id": slideshare_id,
        "google_resource_id": google_resource_id,
        "embed_google": embed_google,
        "embed_slideshare": embed_slideshare,
        "not_converted": not_converted,
        "show_iframe": show_iframe
    }
    return render_to_response(templatePath, response, request=request)
##################
# Note from creator (Yomal Mudalige) - I have tried to change the column order,
# but it is still not finalized, so I added the prefixes 'A', 'B', etc. Thanks.
# Reference - Scraperwiki Tutorial 3
##################
import scraperwiki
from BeautifulSoup import BeautifulSoup

print "Premier League Football 2011/2011 Points Tables"

html = scraperwiki.scrape('http://www.guardian.co.uk/football/premierleague')
soup = BeautifulSoup(html)

scraperwiki.metadata.save('data_columns', ['Team', 'Pld', 'GD', 'Pts'])

data_table = soup.find("table", {"class": "full"})
rows = data_table.findAll("tr")

m = 0
for row in rows:
    print m, row
    if m < 0:
        m = m + 1
        continue
    else:
        record = {}
        table_cells = row.findAll("td")
        if table_cells:
            record['A- Team'] = table_cells[0].text
            record['B-Matches Played'] = table_cells[1].text
            record['C-Goal Difference'] = table_cells[2].text
            record['D-Points'] = table_cells[3].text
            print record, '------------'
            scraperwiki.datastore.save(["A- Team"], record)
    ['http://www.motogp.com/en/Results+Statistics/2011/AUS/MotoGP', 'Phillip Island', '2011'],
    #['http://www.motogp.com/en/Results+Statistics/2011/MAL/MotoGP','Sepang','2011']]
    ['http://www.motogp.com/en/Results+Statistics/2011/VAL/MotoGP', 'Valencia', '2011']
]

for entry in url:
    #print (entry[1])
    page = mech.open(entry[0])
    html = page.read()
    soup = BeautifulSoup(html)
    table = soup.find("table", {"class": "width100 marginbot10"})
    col = table.findAll("tr")
    tds = col[2]("td")
    track = entry[1]
    season = entry[2]
    date = tds[0].text
    rider = tds[1].text
    time = tds[2].text
    speed = re.search('(.*)[^ Km/h]', tds[3].text).group(0)
    #print(lap)
    #print(track,season,rider,date,time,speed)
    scraperwiki.sqlite.save(unique_keys=["circuit", "season"],
                            data={
                                "circuit": track,
                                "season": season,