class TreeBuilder(_base.TreeBuilder): def documentClass(self): self.soup = BeautifulSoup("") return Element(self.soup, self.soup) def insertDoctype(self, name, publicId, systemId): self.soup.insert(0, Declaration(name)) def elementClass(self, name): return Element(Tag(self.soup, name), self.soup) def commentClass(self, data): return TextNode(Comment(data), self.soup) def fragmentClass(self): self.soup = BeautifulSoup("") self.soup.name = "[document_fragment]" return Element(self.soup, self.soup) def appendChild(self, node): self.soup.insert(len(self.soup.contents), node.element) def testSerializer(self, element): return testSerializer(element) def getDocument(self): return self.soup def getFragment(self): return _base.TreeBuilder.getFragment(self).element
def extractLinks(postSoup):
    linkSoup = BeautifulSoup()
    for tag in postSoup.findAll("a"):
        # Tag.__contains__ checks child nodes, not attributes, so test the
        # attribute explicitly instead of `"href" in tag`
        if tag.get("href") is not None:
            linkSoup.insert(len(linkSoup), tag["href"])
    return linkSoup.renderContents()
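A minimal usage sketch for the extractor above, assuming the BeautifulSoup 3 API used throughout these snippets; the sample markup and the printed result are illustrative only.

from BeautifulSoup import BeautifulSoup

post_html = '<p>See <a href="http://example.com/a">this</a> and <a name="x">that</a>.</p>'
postSoup = BeautifulSoup(post_html)
# Only the first anchor carries an href, so only that URL ends up in the result.
print extractLinks(postSoup)   # http://example.com/a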
def setup_source(self): source_path = vfs.join('special://profile/', 'sources.xml') try: soup = vfs.read_file(source_path, soup=True) except: soup = BeautifulSoup() sources_tag = Tag(soup, "sources") soup.insert(0, sources_tag) if soup.find("video") == None: sources = soup.find("sources") if not sources: return video_tag = Tag(soup, "video") sources.insert(0, video_tag) video = soup.find("video") if len(soup.findAll(text="PVR Recordings")) < 1: pvr_source_tag = Tag(soup, "source") pvr_name_tag = Tag(soup, "name") pvr_name_tag.insert(0, "PVR Recordings") PVR_PATH_tag = Tag(soup, "path") PVR_PATH_tag['pathversion'] = 1 PVR_PATH_tag.insert(0, "pvr://recordings/active/Default/") pvr_source_tag.insert(0, pvr_name_tag) pvr_source_tag.insert(1, PVR_PATH_tag) video.insert(2, pvr_source_tag) string = "" for i in soup: string = string + str(i) vfs.write_file(source_path, string)
def merge_related_elems(self): """ search through sibling for related contents """ article = Soup('<div></div>') index = 0 threshold = max(10, self.top_candidate[READABILITY] * 0.2) siblings = [elem for elem in self.top_candidate.parent.contents] for elem in siblings: append = False if elem is self.top_candidate: append = True elif _has_attr(elem, READABILITY) and elem[READABILITY] >= threshold: append = True elif is_navigable_string(elem) or elem.name == 'p': text = _inner_text(elem) text_length = len(text) link_density = get_link_density(elem) if text_length >= 80 and link_density < 0.25: append = True elif text_length < 80 and link_density < 1e-5 and re.search(r'\.( |$)', text): append = True if append: _debug("sibling found: ", _attr(elem, 'id'), ' ', _attr(elem, 'class')) article.insert(index, elem) index += 1 self.article = article
def __init__(self, hl=None): soup = BeautifulSoup() doc = Tag(soup, 'DOC') docid = Tag(soup, 'DOCID') doctype = Tag(soup, 'DOCTYPE') datetime = Tag(soup, 'DATETIME') body = Tag(soup, 'BODY') headline = Tag(soup, 'HEADLINE') text = Tag(soup, 'TEXT') soup.insert(0, doc) doc.insert(0, docid) doc.insert(1, doctype) doc.insert(2, datetime) doc.insert(3, body) body.insert(0, headline) body.insert(1, text) doctype.insert(0, NavigableString(" BLOG TEXT ")) doctype['SOURCE'] = "blog" self.soup = soup self.docid = docid self.datetime = datetime self.headline = headline self.text = text self.initialPost = True if hl: self.setHeadline(hl)
def wikimarkdown(text, include_toc=True, target=None): from r2.lib.cssfilter import legacy_s3_url def img_swap(tag): name = tag.get('src') name = custom_img_url.search(name) name = name and name.group(1) if name and c.site.images.has_key(name): url = c.site.images[name] url = legacy_s3_url(url, c.site) tag['src'] = url else: tag.extract() nofollow = True text = snudown.markdown(_force_utf8(text), nofollow, target, g.domain, renderer=snudown.RENDERER_WIKI) # TODO: We should test how much of a load this adds to the app soup = BeautifulSoup(text.decode('utf-8')) images = soup.findAll('img') if images: [img_swap(image) for image in images] if include_toc: tocdiv = generate_table_of_contents(soup, prefix="wiki") if tocdiv: soup.insert(0, tocdiv) text = str(soup) return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def content_absolute_links(content, image=None): from django.contrib.sites.models import Site current_site = Site.objects.get(pk=settings.SITE_ID) def abs_url(url): parsed = urlparse.urlparse(url) if parsed.netloc == parsed.scheme == '': url = urlparse.urljoin('http://{0}'.format(current_site.domain), url) return url soup = BeautifulSoup(content) if image: img = Tag(soup, 'img', [('src', image)]) soup.insert(0, img) for link in soup.findAll('a'): link['href'] = abs_url(link['href']) for link in soup.findAll('img'): link['src'] = abs_url(link['src']) return unicode(soup)
def __init__(self, hl=None): soup = BeautifulSoup() doc = Tag(soup, "DOC") docid = Tag(soup, "DOCID") doctype = Tag(soup, "DOCTYPE") datetime = Tag(soup, "DATETIME") body = Tag(soup, "BODY") headline = Tag(soup, "HEADLINE") text = Tag(soup, "TEXT") soup.insert(0, doc) doc.insert(0, docid) doc.insert(1, doctype) doc.insert(2, datetime) doc.insert(3, body) body.insert(0, headline) body.insert(1, text) doctype.insert(0, NavigableString(" USENET TEXT ")) doctype["SOURCE"] = "usenet" self.soup = soup self.docid = docid self.datetime = datetime self.headline = headline self.text = text self.initialPost = True if hl: self.setHeadline(hl)
def AllCategories(request):
    print 'allcat'
    x = BeautifulSoup()
    #root = Tag(x,'ul', [('class', "tree"), ( 'id', "tree")])
    #x.insert(0,root)
    AllCategories = RECategory.objects.filter(parent__isnull=True).order_by('-number')
    AllAnswered = {}
    # keep only the most recent RELog (by date) for each category and log type
    for log in RELog.objects.filter(user=request.user).order_by('-date'):
        if not log.category_id in AllAnswered:
            AllAnswered[log.category_id] = {}
        if not log.type_log in AllAnswered[log.category_id]:
            AllAnswered[log.category_id][log.type_log] = log
    for category in AllCategories:
        print category.id
        nt = Tag(x, 'li', [("id", str(category.id))])
        log = AllAnswered.get(category.id)
        rating = ''
        if log:
            log = log.get(5)
            if log:
                rating = 'Оценка: ' + str(log.rating)
        div = Tag(x, 'div')
        div.string = rating
        div["class"] = "rating"
        #div["style"] = "width: 150px; float: right;"
        nt.insert(0, div)
        if category.is_3d:
            isDDD = "Есть"
        else:
            isDDD = "Нет"
        div = Tag(x, 'div')
        div.string = isDDD
        div["class"] = "is3d"
        #div["style"] = "margin-right: 0px;width: 110px; float: right;"
        nt.insert(0, div)
        div = Tag(x, 'div')
        div["class"] = "demo"
        #div["style"] = "margin-right: 0px;width: 110px; float: right;"
        div.string = str(category.type_category)
        nt.insert(0, div)
        div = Tag(x, 'div')
        div.string = category.name
        nt.insert(0, div)
        x.insert(0, nt)
        recurseCategories(category, nt, x, AllAnswered)
    res = x.prettify()
    #print res
    print 'endallcat'
    return res
def extractCode(postSoup):
    """ extract and clean up the code from a soup-ed post string, return a set of tokens"""
    codes = BeautifulSoup()
    for tag in postSoup.findAll("code"):
        codes.insert(len(codes), tag)
        tag.hidden = True
        if tag.string:
            tag.string = tag.string + u"\n"
    return codes.renderContents()
def save(self): soup = BeautifulSoup() root_tag = Tag(soup, 'Task') soup.insert(0, root_tag) i = 0 try: job_tag = Tag(soup, 'Job') job_tag.insert(0, NavigableString('%s' % self.owner_id)) root_tag.insert(i, job_tag) i = i+1 except AttributeError: raise ValueError("You must provide job id.") try: id_tag = Tag(soup, 'TaskID') id_tag.insert(0, NavigableString('%d' % self.id)) root_tag.insert(i, id_tag) i = i+1 except AttributeError: raise ValueError("You must provide task id.") try: if self.name: label_tag = Tag(soup, 'Label') label_tag.insert(0, NavigableString(self.name)) root_tag.insert(i, label_tag) i = i+1 except AttributeError: pass try: if self.description: description_tag = Tag(soup, 'Description') description_tag.insert(0, NavigableString(self.description)) root_tag.insert(i, description_tag) i = i+1 except AttributeError: pass try: if self.estimated_minutes: estimated_minutes_tag = Tag(soup, 'EstimatedMinutes') estimated_minutes_tag.insert(0, NavigableString('%d' % self.estimated_minutes)) root_tag.insert(i, estimated_minutes_tag) i = i+1 except AttributeError: pass print soup response = rest_client.Client("").POST(self.post, str(soup)) return Task(xml=response.content)
def wikimarkdown(text, include_toc=True, target=None): from v1.lib.template_helpers import make_url_protocol_relative # this hard codes the stylesheet page for now, but should be parameterized # in the future to allow per-page images. from v1.models.wiki import ImagesByWikiPage from v1.lib.utils import UrlParser from v1.lib.template_helpers import add_sr page_images = ImagesByWikiPage.get_images(c.site, "config/stylesheet") def img_swap(tag): name = tag.get('src') name = custom_img_url.search(name) name = name and name.group(1) if name and name in page_images: url = page_images[name] url = make_url_protocol_relative(url) tag['src'] = url else: tag.extract() nofollow = True text = snudown.markdown(_force_utf8(text), nofollow, target, renderer=snudown.RENDERER_WIKI) # TODO: We should test how much of a load this adds to the app soup = BeautifulSoup(text.decode('utf-8')) images = soup.findAll('img') if images: [img_swap(image) for image in images] def add_ext_to_link(link): url = UrlParser(link.get('href')) if url.is_verbify_url(): link['href'] = add_sr(link.get('href'), sr_path=False) if c.render_style == 'compact': links = soup.findAll('a') [add_ext_to_link(a) for a in links] if include_toc: tocdiv = generate_table_of_contents(soup, prefix="wiki") if tocdiv: soup.insert(0, tocdiv) text = str(soup) return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def save(self): soup = BeautifulSoup() root_tag = Tag(soup, 'Note') soup.insert(0, root_tag) i = 0 try: job_tag = Tag(soup, 'Job') job_tag.insert(0, NavigableString('%s' % self.owner_id)) root_tag.insert(i, job_tag) i = i+1 except AttributeError: raise ValueError("You must provide job id.") try: title_tag = Tag(soup, 'Title') title_tag.insert(0, NavigableString(self.title)) root_tag.insert(i, title_tag) i = i+1 except AttributeError: raise ValueError("You must provide note's title.") try: text_tag = Tag(soup, 'Text') text_tag.insert(0, NavigableString(self.text)) root_tag.insert(i, text_tag) i = i+1 except AttributeError: raise ValueError("You must provide note's text.") try: if self.folder: folder_tag = Tag(soup, 'Folder') folder_tag.insert(0, NavigableString(self.folder)) root_tag.insert(i, folder_tag) i = i+1 except AttributeError: pass try: if self.public: public_tag = Tag(soup, 'Public') public_tag.insert(0, NavigableString(str(self.public).lower())) root_tag.insert(i, public_tag) i = i+1 except AttributeError: pass response = rest_client.Client("").POST(self.post, str(soup)) return Note(xml=response.content)
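The save() and delete() snippets in this collection (Task, Note, Client, Contact) all build their XML request bodies the same way: create an empty soup, insert a root Tag, then insert child Tags whose only content is a NavigableString. A stripped-down sketch of that pattern using BeautifulSoup 3; the helper name and field list are illustrative, not part of the original API:

from BeautifulSoup import BeautifulSoup, Tag, NavigableString

def build_request_body(root_name, fields):
    # Builds a flat payload like <Root><Key>value</Key>...</Root>
    soup = BeautifulSoup()
    root = Tag(soup, root_name)
    soup.insert(0, root)
    for position, (key, value) in enumerate(fields):
        child = Tag(soup, key)
        child.insert(0, NavigableString(unicode(value)))
        root.insert(position, child)
    return str(soup)

# build_request_body('Note', [('Title', 'hello'), ('Text', 'world')])
# -> '<Note><Title>hello</Title><Text>world</Text></Note>'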
def userlist(request):
    x = BeautifulSoup()
    root = Tag(x, 'root')
    x.insert(0, root)
    for u in models.Group.objects.get(name='Курсанты').user_set.all():
        root.insert(0, '\n')
        root.insert(0, Tag(x, 'user', [
            ('uid', str(u.id)),
            ('username', u.username),
            ('first_name', u.first_name),
            ('last_name', u.last_name),
        ]))
    return HttpResponse(x)
def wikimarkdown(text, include_toc=True, target=None): from r2.lib.template_helpers import make_url_protocol_relative # this hard codes the stylesheet page for now, but should be parameterized # in the future to allow per-page images. from r2.models.wiki import ImagesByWikiPage from r2.lib.utils import UrlParser from r2.lib.template_helpers import add_sr page_images = ImagesByWikiPage.get_images(c.site, "config/stylesheet") def img_swap(tag): name = tag.get('src') name = custom_img_url.search(name) name = name and name.group(1) if name and name in page_images: url = page_images[name] url = make_url_protocol_relative(url) tag['src'] = url else: tag.extract() nofollow = True text = snudown.markdown(_force_utf8(text), nofollow, target, renderer=snudown.RENDERER_WIKI) # TODO: We should test how much of a load this adds to the app soup = BeautifulSoup(text.decode('utf-8')) images = soup.findAll('img') if images: [img_swap(image) for image in images] def add_ext_to_link(link): url = UrlParser(link.get('href')) if url.is_reddit_url(): link['href'] = add_sr(link.get('href'), sr_path=False) if c.render_style == 'compact': links = soup.findAll('a') [add_ext_to_link(a) for a in links] if include_toc: tocdiv = generate_table_of_contents(soup, prefix="wiki") if tocdiv: soup.insert(0, tocdiv) text = str(soup) return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def delete(self): soup = BeautifulSoup() client_tag = Tag(soup, 'Client') soup.insert(0, client_tag) try: id_tag = Tag(soup, 'ID') id_tag.insert(0, NavigableString('%d' % self.id)) client_tag.insert(0, id_tag) except AttributeError: raise ValueError("You must have id for delete operation.") response = rest_client.Client("").POST(self.delete_url, str(soup)) soup = BeautifulStoneSoup(response.content) if soup.status and soup.status.contents[0].lower() == 'error': raise ResponseStatusError(soup.errordescription.contents[0])
def find_wanted_content(self, soup):
    """ Finds wanted elements. """
    assert isinstance(soup, BeautifulSoup)
    new_soup = BeautifulSoup()
    for selector in self.wanted_tags_selector:
        tag = soup.find(**selector.soup)
        self.log.info('Looking for element %s...' % selector.out())
        if tag:
            self.log.info('found')
            new_soup.insert(0, tag)
        else:
            self.log.info('NOT FOUND')
    return new_soup
def reddit(post): print post.title print post.content soup = BeautifulSoup(post.content) imgur = soup.find('a', href=re.compile('imgur')) if imgur: src = imgur['href'] else: print "No imgur" qkme = soup.find('a', href=re.compile('qkme')) if qkme: src = qkme['href'] if urlparse.urlparse(src).hostname == "qkme.me": src = 'http://i.qkme.me/'+src[15:].split('?')[0] else: print "No meme neither" tumblr = soup.find('a', href=re.compile('tumblr')) if tumblr: src = tumblr['href'] else: print "No tumblr neither" src = None if src: url = urllib.urlopen(src).getcode() if url == 404: url = urllib.urlopen(src+'.jpg').getcode() if url == 200: src += '.jpg' else: url = urllib.urlopen(src+'.gif').getcode() if url == 200: src += '.gif' if url == 200: print "Embedding..." img = Tag(soup, "img", [("src", src)]) soup.insert(0, img) thumb = soup.find('img', src=re.compile('thumbs.redditmedia.com/')) if thumb: thumb.extract() print 'remove thumbs' post.summary = soup.renderContents() post.save() print post.summary #
def body_insertion(content, insertion, end=False):
    """Insert an HTML content into the body HTML node"""
    insertion = BeautifulSoup(insertion)
    soup = BeautifulSoup(content)

    if soup.body and end:
        soup.body.append(insertion)
    elif soup.body:
        soup.body.insert(0, insertion)
    elif not soup.body and end:
        soup.append(insertion)
    elif not soup.body:
        soup.insert(0, insertion)

    if USE_PRETTIFY:
        return soup.prettify()
    else:
        return soup.renderContents()
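A quick usage sketch for body_insertion above; the markup is illustrative and the exact output form depends on the module-level USE_PRETTIFY flag:

html = '<html><body><p>existing</p></body></html>'
# Prepends inside <body>: roughly <body><div id="banner">hi</div><p>existing</p></body>
print body_insertion(html, '<div id="banner">hi</div>')
# With end=True the insertion is appended instead:
# roughly <body><p>existing</p><div id="footer">bye</div></body>
print body_insertion(html, '<div id="footer">bye</div>', end=True)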
def wikimarkdown(text, include_toc=True, target=None): from r2.lib.cssfilter import legacy_s3_url nofollow = True text = snudown.markdown(_force_utf8(text), nofollow, target, g.domain ) # TODO: We should test how much of a load this adds to the app soup = BeautifulSoup(text.decode('utf-8')) if include_toc: tocdiv = generate_table_of_contents(soup, prefix="wiki") if tocdiv: soup.insert(0, tocdiv) text = str(soup) return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
class TreeBuilder(_base.TreeBuilder): def __init__(self, namespaceHTMLElements): if namespaceHTMLElements: warnings.warn( "BeautifulSoup cannot represent elements in any namespace", DataLossWarning) _base.TreeBuilder.__init__(self, namespaceHTMLElements) def documentClass(self): self.soup = BeautifulSoup("") return Element(self.soup, self.soup, None) def insertDoctype(self, token): name = token["name"] publicId = token["publicId"] systemId = token["systemId"] if publicId: self.soup.insert( 0, Declaration("DOCTYPE %s PUBLIC \"%s\" \"%s\"" % (name, publicId, systemId or ""))) elif systemId: self.soup.insert( 0, Declaration("DOCTYPE %s SYSTEM \"%s\"" % (name, systemId))) else: self.soup.insert(0, Declaration("DOCTYPE %s" % name)) def elementClass(self, name, namespace): if namespace is not None: warnings.warn( "BeautifulSoup cannot represent elements in any namespace", DataLossWarning) return Element(Tag(self.soup, name), self.soup, namespace) def commentClass(self, data): return TextNode(Comment(data), self.soup) def fragmentClass(self): self.soup = BeautifulSoup("") self.soup.name = "[document_fragment]" return Element(self.soup, self.soup, None) def appendChild(self, node): self.soup.insert(len(self.soup.contents), node.element) def testSerializer(self, element): return testSerializer(element) def getDocument(self): return self.soup def getFragment(self): return _base.TreeBuilder.getFragment(self).element
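For context, a tree builder like the one above is not used directly; it is handed to html5lib's parser. In the older html5lib releases that still shipped a BeautifulSoup-based builder, the wiring looked roughly like this (the "beautifulsoup" builder name is an assumption about that era's API, not taken from the snippet):

import html5lib
from html5lib import treebuilders

# "beautifulsoup" was the registered name of this builder in old html5lib releases
parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
soup = parser.parse("<p>Hello <b>world</b></p>")   # returns a BeautifulSoup document
print soup.prettify()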
def wikimarkdown(text, include_toc=True, target=None): from r2.lib.template_helpers import s3_https_if_secure # this hard codes the stylesheet page for now, but should be parameterized # in the future to allow per-page images. from r2.models.wiki import ImagesByWikiPage page_images = ImagesByWikiPage.get_images(c.site, "config/stylesheet") def img_swap(tag): name = tag.get('src') name = custom_img_url.search(name) name = name and name.group(1) if name and name in page_images: url = page_images[name] url = s3_https_if_secure(url) tag['src'] = url else: tag.extract() nofollow = True text = snudown.markdown(_force_utf8(text), nofollow, target, renderer=snudown.RENDERER_WIKI) # TODO: We should test how much of a load this adds to the app soup = BeautifulSoup(text.decode('utf-8')) images = soup.findAll('img') if images: [img_swap(image) for image in images] if include_toc: tocdiv = generate_table_of_contents(soup, prefix="wiki") if tocdiv: soup.insert(0, tocdiv) text = str(soup) return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def ConvertToTestHtml(quest): types = quest.type titles = quest.text quests_ids = [quest.id] answers = RETestAnswer.objects.filter(question__id__in=quests_ids) newbs = BeautifulSoup() pNode = Tag(newbs, 'p') newbs.insert(0,pNode) if quest.img: print 'Image!!!' print quest.img.url imageNode = Tag(newbs, 'image', [('src', quest.img.url)]) newbs.insert(0,imageNode) TitleNode = Tag(newbs, 'p') TitleNode.string = titles newbs.insert(0,TitleNode) i = 0 if types != 1: for answer in answers: radioname = 'ans' + str(i) nt = Tag(newbs,'input', [('type', 'radio'), ('type', radioname), ('name', 'answerradio'), ('value', str(answer.is_correct))]) nt.string = answer.name pNode.insert(len(pNode.contents), nt) pNode.insert(len(pNode.contents), Tag(newbs, 'br')) else: for answer in answers: radioname = 'ans' + str(i) nt = Tag(newbs,'input', [('type', 'text'), ('name', 'answertext'),('ans', answer.name)]) pNode.insert(len(pNode.contents), nt) pNode.insert(len(pNode.contents), Tag(newbs, 'br')) return newbs.prettify()
def wikimarkdown(text, include_toc=True, target=None): from r2.lib.template_helpers import media_https_if_secure # this hard codes the stylesheet page for now, but should be parameterized # in the future to allow per-page images. from r2.models.wiki import ImagesByWikiPage page_images = ImagesByWikiPage.get_images(c.site, "config/stylesheet") def img_swap(tag): name = tag.get('src') name = custom_img_url.search(name) name = name and name.group(1) if name and name in page_images: url = page_images[name] url = media_https_if_secure(url) tag['src'] = url else: tag.extract() nofollow = True text = snudown.markdown(_force_utf8(text), nofollow, target, renderer=snudown.RENDERER_WIKI) # TODO: We should test how much of a load this adds to the app soup = BeautifulSoup(text.decode('utf-8')) images = soup.findAll('img') if images: [img_swap(image) for image in images] if include_toc: tocdiv = generate_table_of_contents(soup, prefix="wiki") if tocdiv: soup.insert(0, tocdiv) text = str(soup) return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def get_malott_menu(today): url = "http://www.scrippscollege.edu/students/dining-services/index.php" resp = requests.get(url) soup = BeautifulSoup(resp.content) head = BeautifulSoup("<thead><tr><td colspan=3>Malott Commons</td></tr></thead>") target = soup.find("div", {"id": "right_column_content"}) target.extract() meals = [] for meal in target.findAll("ul"): meal.extract() meals += [meal] labels = [] for title in target.findAll("p"): title.extract() labels += [title] final_table = BeautifulSoup() table = Tag(final_table, "table") final_table.insert(0, table) table.insert(0, head) table["class"] = "mealtable" for meal in meals: tr = Tag(final_table, "tr") td = Tag(final_table, "td") tr.insert(0, td) td["class"] = "mealtime" td.contents = labels[1].contents table.insert(len(table.contents) - 1, tr) labels = labels[1:] for food in meal.findAll("li"): tr = Tag(final_table, "tr") td = Tag(final_table, "td") tr.insert(0, td) td.contents = food.contents table.insert(len(table.contents) - 1, tr) return final_table.prettify()
def reddit(value): soup = BeautifulSoup(value) imgur = soup.find("a", href=re.compile("imgur")) if imgur: src = imgur["href"] else: qkme = soup.find("a", href=re.compile("qkme")) if qkme: src = qkme["href"] if urlparse.urlparse(src).hostname == "qkme.me": src = "http://i.qkme.me/" + src[15:].split("?")[0] else: tumblr = soup.find("a", href=re.compile("tumblr")) if tumblr: src = tumblr["href"] else: src = None if src: url = urllib.urlopen(src).getcode() if url == 404: url = urllib.urlopen(src + ".jpg").getcode() if url == 200: src += ".jpg" else: url = urllib.urlopen(src + ".gif").getcode() if url == 200: src += ".gif" if url == 200: img = Tag(soup, "img", [("src", src)]) soup.insert(0, img) thumb = soup.find("img", src=re.compile("thumbs.redditmedia.com/")) if thumb: thumb.extract() print "remove thumbs" return soup.renderContents()
def wikimarkdown(text, include_toc=True, target=None): from r2.lib.cssfilter import legacy_s3_url def img_swap(tag): name = tag.get('src') name = custom_img_url.search(name) name = name and name.group(1) if name and c.site.images.has_key(name): url = c.site.images[name] url = legacy_s3_url(url, c.site) tag['src'] = url else: tag.extract() nofollow = True text = snudown.markdown(_force_utf8(text), nofollow, target, renderer=snudown.RENDERER_WIKI) # TODO: We should test how much of a load this adds to the app soup = BeautifulSoup(text.decode('utf-8')) images = soup.findAll('img') if images: [img_swap(image) for image in images] if include_toc: tocdiv = generate_table_of_contents(soup, prefix="wiki") if tocdiv: soup.insert(0, tocdiv) text = str(soup) return SC_OFF + WIKI_MD_START + text + WIKI_MD_END + SC_ON
def geo_term_extract(self, desc): data = values = { 'maxRows': '1', 'fuzzy': '1', 'country': 'EE', 'featureClass': 'P', 'operator': 'OR', 'username': self.geonames_user, 'q': desc.encode('utf-8') } data = urllib.urlencode(values) link = u"http://api.geonames.org/search" xmldata = urllib.urlopen(link, data) soup = BeautifulSoup(xmldata) # print soup.prettify() lng = '0' lat = '0' if len(soup.findAll("lat")) > 0: lng = soup.findAll("lng")[0].text lat = soup.findAll("lat")[0].text lat_f = float(lat) lng_f = float(lng) lat = '%.5f' % ((lat_f * 10000 + random.uniform(1, 80)) / 10000) lng = '%.5f' % ((lng_f * 10000 + random.uniform(1, 80)) / 10000) soup2 = BeautifulSoup() tag1 = Tag(soup2, "Point") tag2 = Tag(soup2, "coordinates") soup2.insert(0, tag1) tag1.insert(0, tag2) text = NavigableString(lng + "," + lat) tag2.insert(0, text) # print soup2 result = (soup2.__str__()).encode("utf-8") return [result, lat, lng]
def process(self, items): text = "\n".join(self.publish(item, level) for (item, level) in items) soup = BeautifulSoup(text) normalizer = getUtility(IURLNormalizer).normalize stack = [{'children': [], 'level': 0}] headings = soup.findAll(('h1', 'h2', 'h3', 'h4', 'h5', 'h6')) for index, heading in enumerate(headings): level = int(heading.name[1]) hid = 'section-' + normalizer(heading.string) + '-%d' % (index + 1) title = u'' for string in heading.recursiveChildGenerator(): if isinstance(string, unicode): title += string.lstrip('123456789. ').strip() # Remove trivial headings if not title: heading.extract() continue entry = { 'title': title, 'id': hid, 'children': [], 'level': level, } i = 0 while level <= stack[-1]['level']: stack.pop() i += 1 stack[-1]['children'].append(entry) stack.append(entry) heading['id'] = hid if level == 1: heading.name = 'h2' heading['class'] = 'documentFirstHeading' # Make sure we start with a heading (default to 'own'). for child in soup.recursiveChildGenerator(): if isinstance(child, unicode): if child.strip('\n '): hid = 'section-0' title = self.context.Title().decode('utf-8') soup.insert(0, '<h2 id="%s">%s</h2>' % (hid, title)) # stack[0]['children'].insert( # 0, {'title': title, # 'id': hid, # 'children': [], # 'level': 2, # }) break elif child.name.startswith('h'): break while len(stack[0]['children']) == 1: stack[0] = stack[0]['children'].pop() return soup, stack[0]['children']
class Toc: def __init__(self, title, options): # defaults self.infile = sys.stdin self.outfile = sys.stdout self.tag_names = ['h2', 'h3'] self.toc_id = 'auto_toc' self.name_prefix = 'section' self.title = title # options for option, value in options: if option in ('-h', '--help'): usage() sys.exit() elif option in ('-t', '--tags'): self.tag_names = value.split(',') elif option in ('-i', '--infile'): self.infile = open(value, 'r') elif option in ('-o', '--outfile'): self.outfile = open(value, 'w') # process the html, create toc and print self.get_tags() if self.tag_list: self.id_tags() self.create_toc() self.output() # clean up self.infile.close() self.outfile.close() def get_tags(self): self.soup = BeautifulSoup(self.infile.read()) # check if there is an existing toc toc = self.soup.findAll(id=self.toc_id) for tag in toc: tag.extract() # check which of the mentioned tags are present tag_names = [] for tag_name in self.tag_names: tag_list = self.soup.findAll(tag_name) if tag_list: tag_names.append(tag_name) if len(tag_names) >= 2: break self.tag_names = tag_names # get tags self.tag_list = self.soup.findAll(self.tag_names) if self.tag_names else [] def id_tags(self): counts = [] self.toc_list = [] for item in self.tag_names: counts.append(0) for tag in self.tag_list: reset = False depth = 0 for index, tag_name in enumerate(self.tag_names): if reset == True: counts[index] = 0 if tag.name == tag_name: depth = index counts[index] += 1 reset = True name = self.name_prefix for count in counts: if count == 0: break name = '%(name)s_%(count)i' % { 'name': name, 'count': count } tag['id'] = name self.toc_list.append({ 'depth': depth, 'id': name, 'title': tag.text }) def create_toc(self): # lists will hold the last ol/ul elements at each depth lists = [] # setup the toc container toc = Tag(self.soup, 'div') toc['id'] = self.toc_id last_li = toc header = Tag(self.soup, 'h2') header_title = NavigableString('Contents') header.append(header_title) toc.append(header) for toc_item in self.toc_list: depth = toc_item['depth'] if len(lists) == depth: # this is the first time we're at this depth list_el = Tag(self.soup, 'ol') lists.append(list_el) last_li.append(list_el) elif depth > old_depth: # this is a new sub-tree list_el = Tag(self.soup, 'ol') lists[depth] = list_el last_li.append(list_el) old_depth = depth # set up the new item li = Tag(self.soup, 'li') a = Tag(self.soup, 'a') title = NavigableString(toc_item['title']) a.append(title) a['href'] = '#%(id)s' % { 'id': toc_item['id'] } li.append(a) lists[depth].append(li) last_li = li # insert the toc at the top of the html self.soup.insert(0, toc) def output(self): prefix = """<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> <head> <title>%(title)s</title> </head> <body> """ % { 'title': self.title } suffix = """ </body> </html> """ self.outfile.write(prefix) self.outfile.write(self.soup.prettify()) self.outfile.write(suffix)
class XMLSettings(object): """Saves Settings in XML file""" # ------------------------------------------------------------------------------------- # Attributes # Default Settings Header HEADER = """<?xml version="1.0" encoding="UTF-8"?>\n""" HEADER_SETTINGS = """<{0}> </{0}>""" PAT_FORMAT = re.compile(r">\s*<") # White spaces PAT_FORMAT_2 = re.compile( r"<([a-z0-9]*)>\s*<(/\1)>") # New lines on elements # ------------------------------------------------------------------------------------- def filepath(): doc = "The filepath property to the path" def fget(self): return self._filepath def fset(self, value): self._filepath = value return locals() filepath = property(**filepath()) # ------------------------------------------------------------------------------------- def cache(): doc = "Cache property - do not reload file on access if true" def fget(self): return self._cache def fset(self, value): self._cache = value return locals() cache = property(**cache()) # ------------------------------------------------------------------------------------- # Private # ------------------------------------------------------------------------------------- def __init__(self, filepath, root=None, cache=False): super(XMLSettings, self).__init__() self.root = root self.filepath = filepath self.cache = cache if os.path.isfile(self.filepath): self._soup = BeautifulSoup(open(self.filepath)) if self.root is None: self.root = self._soup.first().name else: if self.root is None: self.root = "settings" self.HEADER_SETTINGS = self.HEADER_SETTINGS.format(self.root) if not os.path.isfile(self.filepath): self._soup = BeautifulSoup(self.HEADER + self.HEADER_SETTINGS) # ------------------------------------------------------------------------------------- def __len__(self): """docstring for __len__""" root = self._soup.find(self.root) if root is None: return 0 return len(root.findAll(recursive=False)) # ------------------------------------------------------------------------------------- def __iter__(self): """docstring for __len__""" root = self._soup.find(self.root) if root is None: raise StopIteration for element in root.findAll(recursive=False): yield (element.name, dict(element.attrs)) # ------------------------------------------------------------------------------------- def _set_element(self, root, tagname, text=None, attr=None): """Creates if not available an element at the soup root element :return: tag object or None :rtype: Tag """ # Add Topic if not available if attr is None: if root.find(re.compile(tagname + "$", re.I)) is None: new_tag = Tag(self._soup, tagname) root.insert(0, new_tag) else: if root.find(re.compile(tagname + "$", re.I), attr) is None: new_tag = Tag(self._soup, tagname, attr.items()) root.insert(0, new_tag) settings = self._soup.find(self.root) tag = settings.find(re.compile(tagname + "$", re.I)) # Something to insert if tag is not None and text is not None: if tag.text.strip() == "": tag.insert(0, NavigableString(text)) else: tag.contents[0].replaceWith(text) return tag # ------------------------------------------------------------------------------------- def _set(self, topic, key, value, topic_attr=None): """Set key and value at topic :return: success status :rtype: bool""" # In case it is an empty document if not unicode(self._soup).strip().startswith("<?xml"): self._soup.insert(0, NavigableString(self.HEADER)) # In case settings root is not defined settings = self._soup.find(self.root) if settings is None: self._soup.insert(1, Tag(self._soup, self.root)) settings = 
self._soup.find(self.root) # Add Topic topic_tag = self._set_element(settings, topic.lower(), attr=topic_attr) if topic_tag is None: return False # Add key and value key_tag = self._set_element(topic_tag, key.lower(), escape(value)) # Add "" since XML may introduce whitespaces. #key_tag = self._set_element(topic_tag, key, '"{0}"'.format(value)) return key_tag is not None # ------------------------------------------------------------------------------------- def _get(self, topic, key, topic_attr=None): """Get key at topic :return: success status :rtype: bool""" # In case settings root is not defined settings = self._soup.find(self.root) if settings is None: return None if topic_attr is None: topic_tag = settings.find(re.compile(topic + "$", re.I)) else: topic_tag = settings.find(re.compile(topic + "$", re.I), topic_attr) if topic_tag is None: return None key_tag = topic_tag.find(re.compile(key + "$", re.I)) if key_tag is None or len(key_tag.contents) < 1: return None value = unescape(key_tag.contents[0]).strip() #if value.startswith('"') and value.endswith('"'): # value = value.strip('"') return value # ------------------------------------------------------------------------------------- def _save(self, filepath=None): """Save the File""" if filepath is None: filepath = self.filepath with open(filepath, 'w') as f: # For the newline make sure that content is escaped pretty_content = self._soup.renderContents() pretty_content = self.PAT_FORMAT.sub(">\\n<", pretty_content) pretty_content = self.PAT_FORMAT_2.sub("<\\1><\\2>", pretty_content) f.write(pretty_content) # ------------------------------------------------------------------------------------- # Public # ------------------------------------------------------------------------------------- def set(self, topic, key, value, topic_attr=None): """Set key and value at topic :return: success status :rtype: bool""" # Won't even bother if "<" in topic or ">" in topic: return False if "<" in key or ">" in key: return False ret = self._set(topic, key, value, topic_attr=topic_attr) if ret == True: self._save() return ret # ------------------------------------------------------------------------------------- def get(self, topic, key, default_value=None, create=False, topic_attr=None): """Get key at topic :return: success status :rtype: bool""" if not os.path.isfile(self.filepath): return default_value # Won't even bother if "<" in topic or ">" in topic: return default_value if "<" in key or ">" in key: return default_value # Only reload if not cached if not self.cache: self._soup = BeautifulSoup(open(self.filepath)) ret = self._get(topic, key, topic_attr=topic_attr) if ret is None: if create: self.set(topic, key, default_value, topic_attr=topic_attr) return default_value return ret # ------------------------------------------------------------------------------------- def remove(self, topic, key=None): """Remove a complete topic or key from topic""" if not os.path.isfile(self.filepath): return False if "<" in topic or ">" in topic: return False if key is not None and ("<" in key or ">" in key): return False # Only reload if not cached if not self.cache: self._soup = BeautifulSoup(open(self.filepath)) # In case settings root is not defined settings = self._soup.find(self.root) if settings is None: return False topic_tag = settings.find(re.compile(topic + "$", re.I)) if topic_tag is None: return False # Delete the whole topic if key is None: topic_tag.extract() else: # Delete only key key_tag = topic_tag.find(re.compile(key + "$", re.I)) if key_tag is 
None: return False key_tag.extract() self._save() return True # ------------------------------------------------------------------------------------- def findall(self, topic, key, attr="name"): """docstring for finall""" entries = {} for name, attrs in sorted(self): if name.lower() == topic and attr in attrs: entries[attrs[attr]] = self.get(topic, key, topic_attr={attr: attrs[attr]}) return entries
def writeFile(self, cate): count = 0 for cm in self.commentInfo: commentData = "" soupComment = BeautifulSoup(commentData) isOk = False productName = cm['title'].encode('utf-8') linkProduct = "http://www.thegioididong.com" + cm['href'] print linkProduct productData = urllib2.urlopen(linkProduct) soup = BeautifulSoup(productData.read()) dt = soup.find('div',attrs={'id':'tgddComment'}) s = BeautifulSoup(str(dt).lower()) info = s.find('div',attrs={'id':'tgddcomment'}) url = self.getUrlComment(info['cateid'], info['detailid']) rattingUrl = self.getUrlRatting(info['cateid'], info['detailid']) rattingData = urllib2.urlopen(rattingUrl).read() jsonData = urllib2.urlopen(url) data = jsonData.read() if len(str(data)) > 0: rattingData = rattingData[2:] rattingData = rattingData[:-2] data = data[1:] data = data[:-1] commentData = json.loads(data) count += 1 isOk = True else: print productName + " No Comment\n" if(isOk): path = cate + "\\" + no_accent_vietnamese(productName).replace('/','-') + ".txt" print path fileData = open(path,"w") fileData.write("<name>" + productName + "</name>") fileData.write("\n") fileData.write("<link>" + linkProduct + "</link>") fileData.write("\n") fileData.write("<ratting>" + rattingData + "</ratting>") fileData.write("\n") fileData.write("<!-- Comments -->") fileData.write("\n") pos = -1 for cmData in commentData: pos += 1 id = cmData['Id'].encode('utf-8') try: parentId = cmData['ParentId'].encode('utf-8') except Exception: parentId = "null"; content = cmData['Content'].encode('utf-8') author = cmData['UserId'].encode('utf-8') date = cmData['CreatedDate'].encode('utf-8').replace("/Date(","") date = date.replace(")/", "") try: timeComment = self.convertTime(date) except Exception: timeComment = 'null' newcomment = Comment("null",content,author, timeComment, str(cmData['LikeCounts']),"null", id, parentId); if parentId == "0" or parentId == "null": comment = newcomment.makeAComment() soupComment.insert(pos,comment) else: soupComment.insert(pos,newcomment.makeASubComment()) fucksoup = BeautifulSoup(str(soupComment)) maintag = fucksoup.findAll('tag') subtag= fucksoup.findAll('subtag') for t in maintag: t.properties['reply'] = 0 for ts in subtag: if t.properties['id'] == ts.properties['parentid']: if t.properties['reply'] == 0: subcm = "\n\t\t" + str(ts) + "\n" + "::" else: subcm = "\t" + str(ts) + "\n" + "::" t.properties['reply'] += 1 t.comment.insert(t.properties['reply'],subcm) del(ts) ff = str(t).replace('::',(' '*4)) + "\n" fileData.write(ff) fileData.close() print no_accent_vietnamese(productName) + " Done \n" print count
def sunset_embed(body, request=False): # Moved the import down here to avoid a circular import from sunset.models import image self_closing = [ 'sunset', ] if body and "<sunset" in body: body_raw = BeautifulSoup(body, selfClosingTags=self_closing) imglist = body_raw.findAll('sunset') for imgtag in imglist: err = 'Unknown error parsing Sunset embed tag' new_tag = '' img_pk = imgtag.get('id', False) cur_type = imgtag.get('type', 'icon') if img_pk: img_check = image.objects.filter(pk=int(img_pk)).filter( access_query(request)).select_related('cat') if img_check: cur_img = img_check.first() asset_check = cur_img.assets.filter(type=cur_type) if asset_check: cur_asset = asset_check.first() new_tag = BeautifulSoup(selfClosingTags=self_closing) new_a = Tag(new_tag, 'a') new_img = Tag(new_tag, 'img') new_a['class'] = 'sunset_embed sunset_%s' % cur_type new_a['href'] = cur_img.get_absolute_url() new_a['title'] = cur_img new_img['alt'] = cur_img new_img['title'] = cur_img new_img['src'] = cur_asset.get_url() new_tag.insert(0, new_a) new_a.insert(0, new_img) err = False else: err = 'Sunset image asset type specified in embed tag was not found' else: err = 'Sunset image specified in embed tag was not found' else: err = 'Invalid or missing image ID in Sunset embed tag' if err: imgtag.replaceWith( Comment('%s. Original was: %s' % (err, imgtag))) else: imgtag.replaceWith(new_tag) return unicode(body_raw) else: # Nothing to do. return body
response = br.follow_link(text_regex=r"UK") data = br.response().read() # pick out anchors that are tagged with the story class soup = BeautifulSoup(data) # tags = soup.findAll("a", "story") tags = soup.findAll("a") newSoup = BeautifulSoup() base = "http://www.bbc.co.uk" for tag in tags: # add base url if it is missing from href if tag[u'href'][0] == "/": tag[u'href'] = base + tag[u'href'] # add tag to new soup followed by a <br> newSoup.insert(0, tag) newSoup.insert(0, Tag(soup, "br")) # convert soup into a string data = str(newSoup) # save scraped info to a file try: f = open("out.html", "w") f.write(data) f.close() except IOError, e: print e # display local file in browser try:
+ str(int(value_killer)) + '%')) divtag_t4 = Tag(htmldata, "div") divtag_t4.insert( 0, NavigableString('Total percentage of NOT USEFUL data: ' + str(int(value_unc)) + '%')) divtag_t5 = Tag(htmldata, "div") divtag_t5.insert( 0, NavigableString( 'NOTE: The chart takes into account also the simple affidability criteria' )) htmldata.insert(0, htmltag) htmltag.insert(0, headtag) headtag.insert(0, titletag) htmltag.insert(1, bodytag) bodytag.insert(0, divtag_wrap) divtag_wrap.insert(0, imgtag) divtag_wrap.insert(1, divtag_t1) divtag_wrap.insert(2, divtag_t2) divtag_wrap.insert(3, divtag_t3) divtag_wrap.insert(4, divtag_t4) divtag_wrap.insert(5, divtag_t5) #print(htmldata)
def LIST_MOVIES(): if (common.addon.getSetting('enablelibraryfolder') == 'true'): SetupAmazonLibrary() elif (common.addon.getSetting('customlibraryfolder') <> ''): CreateDirectory(MOVIE_PATH) CreateDirectory(TV_SHOWS_PATH) import movies as moviesDB movies = moviesDB.loadMoviedb(favorfilter=True) for asin, movietitle, url, poster, plot, director, writer, runtime, year, premiered, studio, mpaa, actors, genres, stars, votes, TMDBbanner, TMDBposter, TMDBfanart, isprime, watched, favor, TMDB_ID in movies: CreateStreamFile(movietitle, url, MOVIE_PATH) soup = BeautifulSoup() movie = Tag(soup, "movie") soup.insert(0, movie) movie.insert(0, createElement('title', movietitle + ' (Amazon)')) if year: movie.insert(1, createElement('year', str(year))) if premiered: movie.insert(1, createElement('premiered', premiered)) if plot: movie.insert(2, createElement('plot', plot)) if runtime: movie.insert(2, createElement('runtime', runtime)) if votes: movie.insert(3, createElement('votes', str(votes))) if stars: movie.insert(4, createElement('rating', str(stars))) if director: movie.insert(5, createElement('director', director)) if studio: movie.insert(6, createElement('studio', studio)) if poster: movie.insert(7, createElement('thumb', poster)) if mpaa: movie.insert(8, createElement('mpaa', mpaa)) u = sys.argv[0] u += '?url="' + urllib.quote_plus(url) + '"' u += '&mode="play"' u += '&name="' + urllib.quote_plus(movietitle) + '"' utrailer = u + '&sitemode="PLAYTRAILER"' movie.insert(9, createElement('trailer', utrailer)) fileinfo = createElement('fileinfo', '') streamdetails = createElement('streamdetails', '') audio = createElement('audio', '') audio.insert(0, createElement('channels', '2')) audio.insert(1, createElement('codec', 'aac')) streamdetails.insert(0, audio) video = createElement('video', '') video.insert(0, createElement('codec', 'h264')) video.insert(1, createElement('height', '400')) video.insert(2, createElement('width', '720')) video.insert(4, createElement('scantype', 'Progressive')) streamdetails.insert(1, video) fileinfo.insert(0, streamdetails) movie.insert(10, fileinfo) index = 10 if genres: for genre in genres.split(','): index += 1 movie.insert(index, createElement('genre', genre)) if actors: for actor in actors.split(','): if actor <> None: index += 1 actortag = createElement('actor', '') actorname = createElement('name', actor) actortag.insert(0, actorname) movie.insert(index, actortag) movieNFO = os.path.join(MOVIE_PATH, movietitle + '.nfo') file = open(movieNFO, 'w') file.write(str(soup)) file.close()
from BeautifulSoup import BeautifulSoup, Tag, NavigableString

soup = BeautifulSoup()
tag1 = Tag(soup, "person")
tag2 = Tag(soup, "name", [("first", "John"), ("last", "Smith")])
tag3 = Tag(soup, "location", [("country", "uk")])
soup.insert(0, tag1)
tag1.insert(0, tag2)
tag1.insert(1, tag3)
print soup

text = NavigableString("John Gary Smith")
tag2.insert(0, text)
print soup.prettify()
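For reference, after the NavigableString is inserted the final print should produce roughly the following (BeautifulSoup 3's prettify() indents one space per nesting level; exact whitespace may vary between versions):

<person>
 <name first="John" last="Smith">
  John Gary Smith
 </name>
 <location country="uk">
 </location>
</person>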
def mexhelpextract(mexnames): #print 'processing mex files: ' + mexnames.__repr__() from ConfigParser import RawConfigParser as ConfigParser, Error as error for mexname in mexnames: # ConfigParser for the three elements per subfunctions written to tmpdir # [SubFunction] # usage: 'xyz' # help: 'xyz' # seealso: 'xyz' config = ConfigParser({'usage': [], 'help': [], 'seealso': []}) # assemble command line for matlab matlabcmd = 'addpath(\'%s\');%s(\'%s\',\'%s\'); exit' % \ (_tmpdir, \ os.path.splitext(os.path.basename(_mexscript))[0], \ mexname, \ _tmpdir) cmd = 'matlab -nojvm -nodisplay -r "%s" > /dev/null' % matlabcmd # and execute matlab w/ the temporary script we wrote earlier try: print 'running MATLAB for %s in %s' % (mexname, _tmpdir) stdin, stderr = os.popen4(cmd) print stderr.read() stdin.close() stderr.close() except: print 'could not dump help for %s into %s' % (mexname, _tmpdir) cfgfile = config.read(os.path.join(_tmpdir, mexname)) if cfgfile == []: print "skipping " + mexname + " (no output)" continue subfunctions = config.sections() print 'processing subfunctions: ' + subfunctions.__repr__() for subfunction in subfunctions: # read in the strings for this subfunction usage = config.get(subfunction, 'usage') help = config.get(subfunction, 'help') seealso = config.get(subfunction, 'seealso') headline = '===[[' + subfunction + ' ' + mexname + '(\'' + subfunction + '\')]]===\n' breadcrumb = "==[[Psychtoolbox]] › [[" \ + mexname + "]].{mex*,dll} subfunction==\n\n" # scrub the text for main text only body = beackern(help) docstring = '' \ + '%%(matlab;Usage)' \ + usage \ + '%%\n' \ + body \ + '\n\n' if seealso: docstring = docstring + '<<=====See also:=====\n' + seealso + '<<' text = '""' + headline \ + breadcrumb \ + docstring + '""' # retrieve old body text, to update or concatenate with synonymous subfunctions # # browse the page title = re.sub("[^\w]|_", "", subfunction) try: resp = mech.open(baseurl + title + "/edit") except HTTPError, e: sys.exit( "retrieving old text during posting of this mex function failed: %d: %s" % (e.code, e.msg)) # get text from the edit form mech.select_form(nr=1) try: oldbody = mech["body"] except: print 'No id="body" form. Figure this out first. cf. page text above.' for form in mech.forms(): print form sys.exit( "retrieving old body text failed while processing page: " + baseurl + title + '/edit') # parse embedded structuring HTML tags in the wiki text soup = BeautifulSoup(oldbody) # check if the subfunction is already present, by CSS 'class' and 'id' subfct = soup.find('div', {'class': "subfct", 'id': mexname}) if subfct: # replace the text of the container DIV subfct.contents[0].replaceWith(text) else: # contruct new DIV to hold the text subfctDIV = Tag(soup, "div") subfctDIV['class'] = 'subfct' subfctDIV['id'] = mexname subfctDIV.insert(0, NavigableString(text)) # insert the new div soup.insert(len(soup), subfctDIV) # Now scoop the good well-formed divs out of the soup divs = soup('div', {'class': "subfct"}) # and drop them into fresh yummy cheese soup cheesesoup = BeautifulSoup() # drop good divs into the soup, one by one for div in divs: # remove the unneeded style attribute, we finally # have this stuff defined in the ptbdocs.css now. del (div['style']) # escape the HTML tags for wiki parser cheesesoup.append(NavigableString('\n""')) cheesesoup.append(div) cheesesoup.append(NavigableString('""\n')) post(subfunction, cheesesoup.renderContents())
def save(self): soup = BeautifulSoup() client_tag = Tag(soup, 'Client') soup.insert(0, client_tag) i = 0 method = "POST" try: id_tag = Tag(soup, 'ID') id_tag.insert(0, NavigableString('%d' % self.id)) client_tag.insert(i, id_tag) i = i+1 method = "PUT" except AttributeError: pass try: name_tag = Tag(soup, 'Name') name_tag.insert(0, NavigableString(self.name)) client_tag.insert(i, name_tag) i = i+1 except AttributeError: raise ValueError("You must provide client's name.") try: if self.address: address_tag = Tag(soup, 'Address') address_tag.insert(0, NavigableString(self.address)) client_tag.insert(i, address_tag) i = i+1 except AttributeError: pass try: if self.postal_address: postal_address_tag = Tag(soup, 'PostalAddress') postal_address_tag.insert(0, NavigableString(self.postal_address)) client_tag.insert(i, postal_address_tag) i = i+1 except AttributeError: pass try: if self.phone: phone_tag = Tag(soup, 'Phone') phone_tag.insert(0, NavigableString(self.phone)) client_tag.insert(i, phone_tag) i = i+1 except AttributeError: pass try: if self.fax: fax_tag = Tag(soup, 'Fax') fax_tag.insert(0, NavigableString(self.fax)) client_tag.insert(i, fax_tag) i = i+1 except AttributeError: pass try: if self.website: website_tag = Tag(soup, 'WebSite') website_tag.insert(0, NavigableString(self.website)) client_tag.insert(i, website_tag) i = i+1 except AttributeError: pass try: if self.referral_source: referral_source_tag = Tag(soup, 'ReferralSource') referral_source_tag.insert(0, NavigableString(self.referral_source)) client_tag.insert(i, referral_source_tag) except AttributeError: pass if method == "PUT": response = rest_client.Client("").PUT(self.put, str(soup)) else: response = rest_client.Client("").POST(self.post, str(soup)) return Client(xml=response.content)
def save(self): soup = BeautifulSoup() contact_tag = Tag(soup, 'Contact') soup.insert(0, contact_tag) i = 0 method = "PUT" try: id_tag = Tag(soup, 'ID') id_tag.insert(0, NavigableString('%d' % self.id)) contact_tag.insert(i, id_tag) i = i+1 except AttributeError: pass try: client_tag = Tag(soup, 'Client') client_id_tag = Tag(soup, 'ID') client_id_tag.insert(0, NavigableString('%d' % self.owner_id)) client_tag.insert(0, client_id_tag) contact_tag.insert(i, client_tag) i = i+1 method = "POST" except AttributeError: pass try: name_tag = Tag(soup, 'Name') name_tag.insert(0, NavigableString(self.name)) contact_tag.insert(i, name_tag) i = i+1 except AttributeError: raise ValueError("You must provide client's name.") try: if self.mobile: mobile_tag = Tag(soup, 'Mobile') mobile_tag.insert(0, NavigableString(self.mobile)) contact_tag.insert(i, mobile_tag) i = i+1 except AttributeError: pass try: if self.email: email_tag = Tag(soup, 'Email') email_tag.insert(0, NavigableString(self.email)) contact_tag.insert(i, email_tag) i = i+1 except AttributeError: pass try: if self.phone: phone_tag = Tag(soup, 'Phone') phone_tag.insert(0, NavigableString(self.phone)) contact_tag.insert(i, phone_tag) i = i+1 except AttributeError: pass try: if self.position: position_tag = Tag(soup, 'Position') position_tag.insert(0, NavigableString(self.position)) contact_tag.insert(i, position_tag) i = i+1 except AttributeError: pass if method == "PUT": response = rest_client.Client("").PUT(self.put % self.id, str(soup)) else: response = rest_client.Client("").POST(self.post, str(soup)) return Contact(xml=response.content)