def to_representation(self, obj):
    return {
        'id': obj.pk,
        'title': clean_html(obj.title),
        'pubDate': obj.created_at.strftime("%d.%m.%y"),
        'fullText': clean_html(obj.description) if obj.description else '',
    }
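# For context, a minimal, hedged sketch of what clean_html itself does
# (not part of the original snippets; behavior may vary across lxml
# versions, and in recent releases the cleaner lives in the separate
# lxml_html_clean package):
from lxml.html.clean import clean_html

# String in, string out: <script> elements and on* event-handler
# attributes are stripped by the default Cleaner.
print(clean_html('<p onclick="x()">hi<script>evil()</script></p>'))
# roughly: '<p>hi</p>'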
def __init__(self, elemento):
    self.is_read = False
    self.is_marked_as_unread = False
    self.is_liked = False
    self.is_shared = False
    self.is_starred = False
    self.is_browsed = False
    self.is_emailed = False
    self.is_twitter = False
    self.is_readitlater = False
    # self.crawlTimeMsec = self._get(elemento, 'crawlTimeMsec')
    self.id = self._get(elemento, 'id')
    self.categories = self._get(elemento, 'categories')
    self.title = self._get(elemento, 'title')
    self.alternate = self._get(elemento, 'alternate')
    if len(self.alternate) > 0:
        self.alternate = self.alternate[0]
    self.published = self._get(elemento, 'published')
    self.updated = self._get(elemento, 'updated')
    self.summary = self._get(elemento, 'summary')
    if self.summary is not None:
        self.summary = clean_html('<div>%s</div>' % (self.summary['content']))
    self.content = self._get(elemento, 'content')
    if self.content is not None:
        self.content = clean_html('<div>%s</div>' % (self.content['content']))
    self.author = self._get(elemento, 'author')
    self.likingUsers = self._get(elemento, 'likingUsers')
    self.comments = self._get(elemento, 'comments')
    self.annotations = self._get(elemento, 'annotations')
    self.origin = self._get(elemento, 'origin')
def _cleanArticleText(self, article):
    if BLANK_RE.match(article.title):
        untitle = ''
    else:
        try:
            title = clean_html(article.title)
            untitle = lxml.html.fromstring(title).text_content()
        except Exception:
            untitle = ''
    if BLANK_RE.match(article.content):
        uncontent = ''
    else:
        try:
            content = clean_html(article.content)
            uncontent = lxml.html.fromstring(content).text_content()
        except Exception:
            uncontent = ''
    uncontent = untitle + uncontent
    if BLANK_RE.match(uncontent):
        return None, None
    else:
        termbag = []
        splitter = re.compile(u"([^\s\)\(\]\[.,\":;\-+!¡¿?\{\}]+)")
        for i in splitter.finditer(uncontent):
            term = i.groups()[0]
            # dict.has_key() is Python 2 only; `in` works in both versions
            if term not in self.stopwords:
                if re.match("^[0-9]+(?:[.,][0-9]+)*$", term) is None:
                    termbag.append(term.lower())
        return untitle, " ".join(termbag)
def test_clean_invalid_root_tag(self):
    # only testing that cleaning with invalid root tags works at all
    s = lxml.html.fromstring('parent <invalid tag>child</another>')
    self.assertEqual('parent child', clean_html(s).text_content())
    s = lxml.html.fromstring('<invalid tag>child</another>')
    self.assertEqual('child', clean_html(s).text_content())
def to_representation(self, obj):
    return {
        'id': obj.pk,
        'title': clean_html(obj.title),
        'pubDate': obj.created_at.strftime("%d.%m.%y"),
        'cover': obj.image.big if obj.image else '',
        'fullText': clean_html(obj.content) if obj.content else '',
    }
def parse_url_and_save(url):
    try:
        html_document = parse(url).getroot()
    except Exception:
        print("failed to load the url or parse the document")
        return
    page_contents = html_document.find_class("page_content")
    if len(page_contents) == 0:
        return
    page_content = page_contents[0]
    news_contents = page_content.find_class("content_c")
    if len(news_contents) == 0:
        return
    news_content = news_contents[0]
    news_details = page_content.find_class("news_detail")
    if len(news_details) == 0:
        return
    news_detail = news_details[0]
    # Article author
    authors = news_content.find_class("autor_name")
    if len(authors) == 0:
        return
    author = authors[0].find('a')
    author = clean_html(author).text_content()
    # Article title
    header = news_content.find("h1")
    header = clean_html(header).text_content()
    header = header.strip(' \t\n\r')
    if len(header) == 0:
        return
    # Article date
    created = page_content.find_class('date_time')[0].find_class('date')[0]
    created = clean_html(created).text_content()
    # Topic
    topic = news_content.find_class('rubric')[0]
    topic = clean_html(topic).text_content()
    # Source
    source = url
    # Publication year
    publ_year = created[6:]
    # Article text
    text = news_detail.find("article")
    text = clean_html(text).text_content()
    text = text.strip(' \t\n\r')
    # Save the extracted info to a file and append a row to the csv
    path = save_text_to_file(author, header, created, topic, source, text)
    add_to_csv(path, author, header, topic, created, source, publ_year)
def striphtml(content):
    """Returns ``content`` stripped of all HTML tags and of the contents
    of <style> and <script> tags. It will also remove any tabs, newline
    characters and non-breaking spaces.
    """
    if not isinstance(content, basestring):
        return u""
    content = re_script.sub(u"", content)
    doc = html.fragment_fromstring(content, create_parent=True)
    # clean_html returns a cleaned copy rather than mutating its argument,
    # so the result has to be kept
    doc = clean.clean_html(doc)
    return unicode(re_nl.sub(u"", doc.text_content()))
def to_representation(self, obj):
    return {
        'id': obj.pk,
        'name': clean_html(obj.name),
        'pubDate': obj.created_at.strftime("%d.%m.%y"),
        'currency': currency_symbol(obj.currency),
        'price': obj.cost,
        'cover': obj.image.big if obj.image else '',
        'details': clean_html(obj.description) if obj.description else ''
    }
def to_representation(self, obj):
    return {
        'id': obj.pk,
        'name': clean_html(obj.name),
        'endDate': obj.end_coupon_date.strftime("%d.%m.%y"),
        'currency': currency_symbol(obj.currency),
        'oldPrice': obj.cost,
        'cover': obj.image.big if obj.image else '',
        'details': clean_html(obj.description) if obj.description else '',
        'percent': obj.coupon_discount_percent
    }
def parse(self, response):
    page = response.url.split("/")[-2]
    filename = 'quotes-%s.html' % page
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(str(response.body)), parser)
    result = etree.tostring(tree.getroot(), pretty_print=True, method="html")
    print("_" * 70, result, "_" * 70)
    # the original mixed print() calls with a Python 2 print statement
    print(clean_html(response.body))
    with open(filename, 'wb') as f:
        f.write(result)
    self.log('Saved file %s' % filename)
def to_representation(self, obj):
    return {
        'id': obj.pk,
        'title': clean_html(obj.title),
        'pubDate': obj.created_at.strftime("%d.%m.%y"),
        'shortText': clean_html(Truncator(obj.description).words("30", html=True)) if obj.description else '',
    }
def get_url(url):
    http_pattern = '^http://'
    if re.search(http_pattern, url):
        urlfh = urllib.urlopen(url)
        content = urlfh.read()
        html_tree = lxml.html.fromstring(content)
        # clean_html returns a cleaned copy (removes crud from the html);
        # the result must be assigned, it does not clean in place
        html_tree = clean_html(html_tree)
        clean_html_string = lxml.html.tostring(html_tree, encoding=unicode,
                                               method='text')
        return io.StringIO(clean_html_string)
    else:
        raise Exception("Bad url: {}".format(url))
def to_representation(self, obj):
    return {
        'img': obj.user.profile.avatar.th if obj.user and obj.user.profile.avatar else '',
        'name': clean_html(obj.user.profile.full_name) if obj.user and obj.user.profile.full_name else '',
        'post': clean_html(obj.name) if obj.name else '',
        'phone': clean_html(obj.user.profile.mobile_number) if obj.user and obj.user.profile.mobile_number else ''
    }
def get_description(self, url):
    """Fetches the job's page and gets its description."""
    html = self.fetch_url(url)
    doc = lxh.fromstring(html)
    if 'indeed.' not in url:
        return clean_html(html)
    el = doc.find('.//span[@id="job_summary"]')
    if el is None:
        return clean_html(html)
    raw = etree.tostring(el, encoding='utf8')  # renamed from `bytes`, which shadows the builtin
    html = raw.decode()
    return self.highlight_words(html)
def test_clean_with_comments(self):
    html = """<p><span style="color: #00ffff;">Cy<!-- xx -->an</span><!-- XXX --></p>"""
    s = lxml.html.fragment_fromstring(html)
    self.assertEqual(b'<p><span>Cyan</span></p>',
                     lxml.html.tostring(clean_html(s)))
    self.assertEqual('<p><span>Cyan</span></p>',
                     clean_html(html))
    cleaner = Cleaner(comments=False)
    result = cleaner.clean_html(s)
    self.assertEqual(b'<p><span>Cy<!-- xx -->an</span><!-- XXX --></p>',
                     lxml.html.tostring(result))
    self.assertEqual('<p><span>Cy<!-- xx -->an</span><!-- XXX --></p>',
                     cleaner.clean_html(html))
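# The test above exercises the Cleaner's `comments` switch. As a hedged
# standalone sketch of that same API (exact output may vary slightly
# across lxml versions):
from lxml.html.clean import Cleaner

cleaner = Cleaner(comments=False)  # keep HTML comments; other defaults apply
print(cleaner.clean_html('<p>hi<!-- keep me --><script>evil()</script></p>'))
# roughly: '<p>hi<!-- keep me --></p>'  (scripts are still removed)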
def to_representation(self, obj):
    if obj.short_description:
        short_text = obj.short_description
    else:
        short_text = Truncator(obj.content).words("30", html=True)
    return {
        'id': obj.pk,
        'title': clean_html(obj.title),
        'pubDate': obj.created_at.strftime("%d.%m.%y"),
        'cover': obj.image.big if obj.image else '',
        'shortText': clean_html(short_text) if short_text else '',
    }
def to_representation(self, obj):
    if obj.short_description:
        short_text = obj.short_description
    else:
        short_text = Truncator(obj.description).words("30", html=True)
    return {
        'id': obj.pk,
        'name': clean_html(obj.name),
        'pubDate': obj.created_at.strftime("%d.%m.%y"),
        'currency': currency_symbol(obj.currency),
        'price': obj.cost,
        'cover': obj.image.big if obj.image else '',
        'details': clean_html(short_text) if short_text else ''
    }
def download_and_parse(self):
    if self._parsed:
        raise Exception('This article ({}) has already been parsed.'.format(self.url))
    self.download_date = datetime.utcnow()
    self.source_domain = urlparse(self.url).netloc
    self._parsed = True
    try:
        self.html = requests.get(self.url).content
    except requests.exceptions.RequestException:
        raise IOError("Could not download the article at: %s" % self.url)
    # clean_html does not alter its argument in place; it returns the
    # cleaned markup, so the result has to be kept.
    self.html = clean_html(self.html)
    doc = document_fromstring(self.html)
    parsers.parse_article(self, doc)
def extract_paragraphs(element):
    if element.tag == 'hr':
        return []
    if element.tag == 'p':
        text = clean_html(element).text_content()
        text = ' '.join(text.split())
        return [text]
    if element.tag[0] == 'h':
        text = clean_html(element).text_content()
        text = ' '.join(text.split())
        return [text]
    out = list()
    for child in get_children(element):
        out.extend(extract_paragraphs(child))
    return out
def main():
    quan = int(argv[1])
    st = int(argv[2])
    query = ' '.join(argv[3:])
    page = 1
    curr = 0
    while curr < quan:
        response = requests.get(CSN, params={'s': query, 'page': page})
        # print(response.content)
        treem = html.fromstring((clean_html(response.content)).strip())
        cntpage = len(treem.xpath("//table[@class='tbtable'][1]//tr[@title]"))
        for idx in range(cntpage):
            curr += 1
            if st > curr:
                continue
            if curr == quan:
                break
            try:
                page1 = treem.xpath("//table[@class='tbtable'][1]//tr[@title][%d]/td[2]//a[@class='musictitle']/@href" % (idx + 1))[0]
                title = treem.xpath("//table[@class='tbtable'][1]//tr[@title][%d]/td[2]//a//text()" % (idx + 1))[0]
                print('Downloading %3d of %3d : %s' % (curr, quan, title))
                response = requests.get(page1)
                tree = html.fromstring((clean_html(response.content)).strip())
                page2 = tree.xpath("//img[@src='http://data.chiasenhac.com/images/button_download.gif']/../@href")[0]
                response = requests.get(page2)
                qual = 1
                found = False
                tree2 = html.fromstring((clean_html(response.content)).strip())
                while not found:
                    try:
                        mlink = tree2.xpath("//div[@id='downloadlink2']//a[last() - %d]/@href" % (qual))[0]
                        request = urllib2.Request(mlink)
                        print(mlink)
                        request.get_method = lambda: 'HEAD'
                        response = urllib2.urlopen(request)
                        if 'http://chiasenhac.vn/' not in response.url:
                            found = True
                        else:
                            print('Reducing quality')
                            qual += 1
                    except Exception as e:
                        print('Reducing quality')
                        qual += 1
                os.system('aria2c "%s" -d ./downloads' % (mlink))
                sleep(1)
            except Exception as e:
                print(e)
        page += 1
def get_video_transcript(self, video_id):
    """
    Retrieves and formats transcripts for the passed video.

    TODO: If no captions are available, download the audio track and pass
    it into Cloud Speech-to-Text? For now we just return None, implying
    that we can't perform sentiment analysis on the video content itself.
    """
    video = YouTube('https://www.youtube.com/watch?v={}'.format(video_id))
    captions = video.captions.get_by_language_code('en')
    if not captions:
        logger.info('Unable to return transcript for video %r!', video_id)
        return
    # format captions as plaintext and strip trailing whitespace and html
    try:
        captions = ElementTree.fromstring(captions.xml_captions)
    except UnicodeEncodeError:
        xml = captions.xml_captions.encode("utf-8")
        captions = ElementTree.fromstring(xml)
    captions_list = []
    for subtitle in captions.getchildren():
        text = subtitle.text or u''
        # collapse newlines and doubled spaces
        caption = unescape(text.replace('\n', ' ').replace('  ', ' '))
        captions_list.append(u"{text} ".format(text=caption))
    transcript = clean_html(
        html.fromstring(u''.join(captions_list).strip()))
    return transcript.text_content().strip()
def dataset_comments(pkg_id):
    comment_list = []
    try:
        dbd = parse_db_config('ckan.drupal.url')
        if dbd:
            # the user/password placeholders were masked in the source; all
            # four values are interpolated from the parsed db config
            drupal_conn_string = "host='%s' dbname='%s' user='%s' password='%s'" % (
                dbd['db_host'], dbd['db_name'], dbd['db_user'], dbd['db_pass'])
            drupal_conn = pg2.connect(drupal_conn_string)
            drupal_cursor = drupal_conn.cursor()
            # add this to the SQL statement to limit comments to those that
            # are published: 'and status = 0'
            drupal_cursor.execute(
                """select c.subject, to_char(to_timestamp(c.changed), 'YYYY-MM-DD'),
                          c.name, c.thread, f.comment_body_value
                   from comment c
                   inner join field_data_comment_body f on c.cid = f.entity_id
                   inner join opendata_package o on o.pkg_node_id = c.nid
                   where o.pkg_id = %s""", (pkg_id,))
            for comment in drupal_cursor:
                comment_body = clean_html(comment[4])
                comment_list.append({'subject': comment[0],
                                     'date': comment[1],
                                     'thread': comment[3],
                                     'comment_body': comment_body,
                                     'user': comment[2]})
            drupal_cursor.close()
            drupal_conn.close()
    except KeyError:
        pass
    return comment_list
def readme(repository):
    """
    Return a rendered version of the readme for the given repository
    """
    if not repository.readme or not repository.readme.strip():
        return 'No readme :('
    readme = None
    try:
        if repository.readme_type == 'markdown':
            readme = markup.markdown(repository.readme)
        elif repository.readme_type == 'textile':
            readme = markup.textile(repository.readme)
        elif repository.readme_type == 'rest':
            readme = markup.restructuredtext(repository.readme)
    except Exception:
        pass
    if not readme:
        readme = '<pre>%s</pre>' % urlize(repository.readme)
    try:
        result = mark_safe(clean_html(readme))
    except Exception:
        result = 'Unreadable readme :('
    return result
def get_lyrics(self):
    element = self.element
    # Replace <br> tags with \n (prepend it with \n and then remove all
    # occurrences of <br>)
    for br in element.cssselect('br'):
        br.tail = '\n' + br.tail if br.tail else '\n'
    etree.strip_elements(element, 'br', with_tail=False)
    # Remove unneeded tags
    bad_tags = element.cssselect('.rtMatcher') + \
        element.cssselect('.lyricsbreak')
    for tag in bad_tags:
        tag.drop_tree()
    # Remove HTML comments
    real_string = etree.tostring(element, encoding=unicode)
    cleaned_html = clean_html(real_string)
    # -KMS Modification-
    # try/except prevents the script from crashing when run from
    # AppleScript; the original except branch repeated the try branch
    # verbatim, so on UnicodeError we fall back to replacing characters
    # that cannot be encoded.
    text = html.fragment_fromstring(cleaned_html).text_content()
    try:
        print u'{0}'.format(text).encode('utf-8').strip()
    except UnicodeError:
        print u'{0}'.format(text).encode('utf-8', 'replace').strip()
    return 0
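# The <br>-to-newline trick used above is self-contained; a minimal,
# hedged demo of just that step (the input markup is illustrative):
import lxml.html
from lxml import etree

el = lxml.html.fragment_fromstring('<p>line one<br>line two</p>')
for br in el.cssselect('br'):
    br.tail = '\n' + br.tail if br.tail else '\n'
etree.strip_elements(el, 'br', with_tail=False)
print(el.text_content())  # 'line one\nline two'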
def get_content(url):
    req = Request(url)
    req.add_header('User-Agent', USER_AGENT)
    html = urlopen(req).read()
    html = clean_html(html)
    root = fromstring(html)
    return root.getroottree()
def clean_content(self):
    """
    Do our usual HTML cleanup. Do we want to mangle the markup field
    to always be "html"?
    """
    self.cleaned_data['content'] = clean_html(self.cleaned_data['content'])
    self.cleaned_data['content'] = autolink_html(self.cleaned_data['content'])
    return self.cleaned_data['content']
def rtvslo(page):
    title_regex = r'<h1>(.+)</h1>'
    subtitle_regex = r'<div class="subtitle">(.+)</div>'
    lead_regex = r'<p class="lead">(.+)</p>'
    content_regex = r'<div class="article-body">\s*([\s\S]+?\s*</article>)'
    author_regex = r'<div class="author-name">(.+)</div>'
    date_regex = r'<div class="publish-meta">\s*(.*)\s*<br>'
    data = {
        'title': re.search(title_regex, page).group(1),
        'subtitle': re.search(subtitle_regex, page).group(1),
        'lead': re.search(lead_regex, page).group(1),
        'content': clean.clean_html(
            re.search(content_regex, page).group(1).replace('\t', '')),
        'author': re.search(author_regex, page).group(1),
        'date': re.search(date_regex, page).group(1)
    }
    return data
def _to_python(self, value, state):
    try:
        clean = clean_html(value)
    except Exception:
        msg = 'Unable to parse the provided HTML'
        raise Invalid(msg, value, state)
    return clean
def get_content(url):
    req = urllib2.Request(url, None, headers)
    html = urllib2.urlopen(req).read()
    html = clean_html(html)
    root = fromstring(html)
    return root.getroottree()
def get_lyrics(self):
    response = requests.get(self.url)
    page_html = html.document_fromstring(response.text)
    element = page_html.cssselect(self.CSS_SELECTOR)[0]
    # Replace <br> tags with \n (prepend it with \n and then remove all
    # occurrences of <br>)
    for br in element.cssselect('br'):
        br.tail = '\n' + br.tail if br.tail else '\n'
    etree.strip_elements(element, 'br', with_tail=False)
    # Remove unneeded tags
    bad_tags = element.cssselect('.rtMatcher') + \
        element.cssselect('.lyricsbreak')
    for tag in bad_tags:
        tag.drop_tree()
    # Remove HTML comments
    real_string = etree.tostring(element, encoding="UTF-8")
    cleaned_html = clean_html(real_string)
    info_output = format_song_info(self.json['artist'], self.json['song'])
    lyric_output = html.fragment_fromstring(cleaned_html).text_content()
    return u'{}{}'.format(info_output, lyric_output)
def full_body(self):
    signature = render_to_string('layout/signature.txt', {})
    return """
%s

<span style="color: #666">%s</span>
""" % (clean_html(self.body), signature.replace('\n', '<br/>\n'))
def get_sensor_status():
    """
    Parses PDU status HTML and returns sensor readings.
    """
    url = '/sensors.html'
    res = dispatch_request(url)
    if res[0] != 200:
        raise Exception('Failed to get status')
    data = res[1]
    data = clean_html(data)
    tree = parse_html(data)
    id1 = parse_value(tree, '/html/body/div/div/table[2]/tr[5]/td[2]/font')
    id2 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[2]/font')
    lab1 = parse_value(tree, '/html/body/div/div/table[2]/tr[5]/td[3]/font/b')
    lab2 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[3]/font/b')
    temp1 = parse_value(tree, '/html/body/div/div/table[2]/tr[5]/td[4]/font/b/font/b')
    temp2 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[4]/font/b/font/b')
    hum1 = parse_value(tree, '/html/body/div/div/table[2]/tr[5]/td[5]/font/b/font/b')
    hum2 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[5]/font/b/font/b')
    hum1 = hum1.replace(' %', '')
    hum2 = hum2.replace(' %', '')
    temp1 = temp1.replace(' Deg. F', '')
    temp2 = temp2.replace(' Deg. F', '')
    res = [{'id': id1, 'label': lab1, 'temp': temp1, 'hum': hum1},
           {'id': id2, 'label': lab2, 'temp': temp2, 'hum': hum2}]
    return res
def tokenize(self, granularity="max"):  # use granularity to debug!
    """
    Extracts actual text from web pages. Implements three granularity
    levels (min, mid, max):

    * min represents text in the style of a one-line string
    * mid represents text in the style closest to the format of a web page
    * max represents text in word tokens

    "+" versions (e.g., max+) do not double-check for trailing punctuation.
    """
    def maxLambda(l):
        words = l.split()  # impossible with list comprehensions
        for i, word in enumerate(words):
            if word[-1] in [',', '.', ';', '?', '\'', '"']:
                # TODO something about "word..."
                words[i] = word[:-1] + "\n" + word[-1]
        return "\n".join(words)

    strippedJS = clean.clean_html(self.raw)
    # NB: nltk.util.clean_html was removed in NLTK 3.x (it raises
    # NotImplementedError there); this code requires NLTK 2.
    strippedHTML = nltk.util.clean_html(strippedJS)
    ampersands = "&[a-zA-Z]{2,4};"  # remove html entities
    stripped = re.sub(ampersands, "", strippedHTML)  # such as &amp; &gt; etc.
    tokensFormat = (granularity == "mid") and (lambda l: l) or maxLambda
    punctuation = re.compile(r'.+[,.;?\"]{1,3}$')  # split trailing punctuation
    self.tokenized = tokensFormat(stripped)
def scraper_worker(worker_id, q, r, timeout=2, wayback=False):
    wayback_base = "http://web.archive.org/web/"
    iteration = 0
    con, cur = setup_db()
    for item in q.consume():
        if not item:
            # received sentinel
            print "%d: RECEIVED SENTINEL" % worker_id
            break
        syllabi_id, link = item
        if wayback:
            link = wayback_base + link
        try:
            req = requests.get(link, timeout=timeout)
            if req.status_code != requests.codes.ok:
                r.incr("errors")
            else:
                src = clean_html(req.text)
                if wayback:
                    src = strip_wayback(src)
                try:
                    cur.execute("INSERT INTO " + db_settings.table_name +
                                " (syllabiID, chnm_cache) VALUES (%s,%s)",
                                (syllabi_id, src))
                    con.commit()
                    r.incr("success")
                except mdb.Error, e:
                    print "---> DB insert error on worker %d on iteration %d -> %s\n\tReconnecting cursor..." % (worker_id, iteration, e)
                    r.incr("dberrors")
                    con, cur = setup_db()
        except Exception:
            r.incr("timeouts")
def mpcHc_installLatestReleaseVersion(self, version, pathname, silent=False,
                                      archive=False, compact=False,
                                      compatText=False):
    log('Identifying filename of MPC-HC download ...')
    html = clean_html(requests.get(MPCHC_DOWNLADS, headers=HEADERS_TRACKABLE).text)
    url = MPCHC_LINK_ARCHIVE if archive else MPCHC_LINK_INSTALLER
    initialUrl = re.search(url, html).group(1)
    log(' done.\n')
    retries = 0
    while True:
        log('Selecting filehost for MPC-HC download ...')
        response = requests.get(initialUrl, headers=HEADERS_SF).text
        filehostResolver = re.search('<meta[^>]*?url=(.*?)["\']', response, re.I).group(1)
        filehostName = re.search('use_mirror=([a-z\-]+)', filehostResolver).group(1)
        filehostUrl = filehostResolver[:filehostResolver.index('?')].replace(
            'downloads', filehostName + '.dl')
        log(' done: %s.\n' % filehostName)
        time.sleep(1)
        log('Downloading %s ...' % filehostUrl)
        response = requests.get(filehostUrl, headers=HEADERS_SF).content
        log(' done.\n')
        if response.strip().endswith('</html>') or len(response) < 1e6:
            retries += 1
            if retries < 10:
                log('Selected filehost is not serving MPC-HC %s, trying another filehost.\n' % version, RED)
                time.sleep(2)
            else:
                log('It appears no filehost can be found serving MPC-HC %s, aborting for now.\n' % version, RED)
                return
        else:
            break
    mpcHc_install(response, version, pathname, silent, archive, compact, compatText)
def get_lyrics(self):
    response = requests.get(self.url)
    page_html = html.document_fromstring(response.text)
    element = page_html.cssselect(self.CSS_SELECTOR)[0]
    # Replace <br> tags with \n (prepend it with \n and then remove all
    # occurrences of <br>)
    for br in element.cssselect('br'):
        br.tail = '\n' + br.tail if br.tail else '\n'
    etree.strip_elements(element, 'br', with_tail=False)
    # Remove unneeded tags
    bad_tags = element.cssselect('.rtMatcher') + \
        element.cssselect('.lyricsbreak')
    for tag in bad_tags:
        tag.drop_tree()
    # Remove HTML comments
    real_string = etree.tostring(element, encoding=unicode)
    cleaned_html = clean_html(real_string)
    info_output = format_song_info(self.json['artist'], self.json['song'])
    lyric_output = html.fragment_fromstring(cleaned_html).text_content()
    return u'{}{}'.format(info_output, lyric_output)
def clean_comment(self):
    self.cleaned_data['comment'] = clean_html(self.cleaned_data['comment'])
    self.cleaned_data['comment'] = autolink_html(self.cleaned_data['comment'])
    self.cleaned_data['comment'] = autolink_email(self.cleaned_data['comment'])
    return self.cleaned_data['comment']
def parse(url, etag=None, modified=None):
    data = feedparser.parse(url, etag=etag, modified=modified, agent=USER_AGENT)
    entries = []
    feed = data.get('feed', {})
    for entry in data.get('entries', []):
        description = entry.get('description')
        description = description and clean_html(description)
        timestamp = entry.get('date_parsed')
        timestamp = timestamp and datetime.datetime(*timestamp[:6]).isoformat()
        entry = {
            'id': create_entry_id(entry),
            'author': entry.get('author'),
            'link': entry.get('link'),
            'title': entry.get('title'),
            'description': description,
            'timestamp': timestamp,
        }
        entries.append(entry)
    return {
        'url': url,
        'entries': entries,
        'feed': {
            'title': feed.get('title'),
            'link': feed.get('link'),
        },
        'etag': data.get('etag'),
        'modified': data.get('modified'),
    }
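# Hedged usage sketch for the feed parser above (the URL is illustrative):
# feeding the returned 'etag'/'modified' validators back in makes
# feedparser issue a conditional GET, so unchanged feeds return no new
# entries on the next poll.
result = parse('https://example.com/feed.xml')
result = parse('https://example.com/feed.xml',
               etag=result['etag'], modified=result['modified'])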
def crawler(url, depth, term):
    if depth < 0:
        return
    try:
        page = urllib2.urlopen(url)
    except Exception:
        print "Error at url: %s;\n" % (url)
        return
    data = page.read()
    if term in data:
        print "found %s in %s\n" % (term, url)
        returnString = "<tr><td>%s</td><td>%s</td></tr>" % (url, depth)
        results.append(returnString)
    data = clean_html(data)
    root = fromstring(data)
    links = root.xpath('.//a/@href')
    passed.append(url)
    for link in links:
        next_url = urljoin(url, link)  # renamed from `next`, which shadows the builtin
        if next_url not in passed:
            crawler(next_url, depth - 1, term)
def strip_html_tags(html_text):
    tree = html.fromstring(html_text)
    clean_tree = clean_html(tree)
    result = clean_tree.text_content().strip()
    result = result.replace("\\n", "").replace("\\t", "").replace("\\r", "")
    return result
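# A quick, hedged check of strip_html_tags (assumes `from lxml import html`
# and `from lxml.html.clean import clean_html`, as in the other snippets;
# note the function strips literal backslash-escapes, not real newlines):
print(strip_html_tags('<div><script>x()</script><p>Hello,\\nworld</p></div>'))
# roughly: 'Hello,world'  (script content cleaned away, literal "\n" removed)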
def save(self, force_insert=False, force_update=False):
    # validate HTML content
    # Additional options at http://codespeak.net/lxml/lxmlhtml.html#cleaning-up-html
    self.content = clean_html(self.content)
    #self.content = autolink_html(self.content)
    super(Announcement, self).save(force_insert, force_update)
def handle_submit(self, converted):
    request = self.request
    context = self.context
    # *will be* modified event
    objectEventNotify(ObjectWillBeModifiedEvent(context))
    if converted.get('middle_portlets'):
        middle_portlets = split_lines(converted['middle_portlets'])
    else:
        middle_portlets = []
    if converted.get('right_portlets'):
        right_portlets = split_lines(converted['right_portlets'])
    else:
        right_portlets = []
    context.title = converted['title']
    context.address = converted['address']
    context.city = converted['city']
    context.state = converted['state']
    context.country = converted['country']
    context.zipcode = converted['zipcode']
    context.telephone = converted['telephone']
    context.navigation = clean_html(converted['navigation'])
    context.middle_portlets = middle_portlets
    context.right_portlets = right_portlets
    # *modified* event
    objectEventNotify(ObjectModifiedEvent(context))
    location = resource_url(context.__parent__['intranets'], request)
    return HTTPFound(location=location)
def get_HTML_element(xpath, url):
    """Returns a string representation of the HTML element given in
    `xpath` from `url`.

    :param xpath: xpath to element
    :type xpath: str
    :param url: URL address from which `xpath` will be downloaded
    :type url: str
    """
    response = urlopen(url)
    enc = response.headers.get('content-type', 'utf-8').split('charset=')[-1]
    tree = etree.parse(response, etree.HTMLParser())
    try:
        el = clean_html(etree.tostring(tree.xpath(xpath)[0]))
    except IndexError as e:
        # the original referenced an undefined name `n` here; report the
        # xpath that failed instead
        raise HTMLElementNotExists(
            'HTML element for %s doesn\'t exist!' % xpath) from e
    try:
        el = el.decode(enc, 'ignore')
    except LookupError:
        el = el.decode('utf-8', 'ignore')
    return _unpack(el)
def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE,
             checker=None, options=None):
    self.text = text
    self.url = url
    self.verbose = verbose
    self.maxpage = maxpage
    self.checker = checker
    self.options = options
    # The parsing of the page is done in the __init__() routine in
    # order to initialize the list of names the file contains.
    # Stored the parser in an instance variable. Passed the URL to
    # MyHTMLParser().
    size = len(self.text)
    if self.maxpage and size > self.maxpage:
        self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size * 0.001))
        self.parser = None
        return
    if options:
        text = self.reformat(text, url)
    self.checker.note(2, " Parsing %s (%d bytes)", self.url, size)
    text = clean_html(text)
    try:
        # self.parser = lxml.html.fromstring(text)
        self.parser = lxml.html.soupparser.fromstring(text)
        self.parser.resolve_base_href()
        self._html = tostring(self.parser, encoding=unicode,
                              method="html", pretty_print=True)
        return
    # `except A, B:` binds the exception to B in Python 2; a tuple is
    # what was intended here
    except (UnicodeDecodeError, HTMLParseError):
        pass
def html_cleaner(html_file):
    '''
    This function removes the tags from an HTML file, leaving only the
    content.

    TODO: instead of removing the tags, use them!
    '''
    tree = etree.parse(html_file, html.HTMLParser())
    tree = clean_html(tree)
    clean_text = tree.getroot().text_content()
    # first build a dirty version of the lists, which has newline characters
    items_list = re.findall("\n [ ]*Item [0-9][0-9. ]*", clean_text)
    items_list += re.findall("\nItem [0-9][0-9. ]*", clean_text)
    items_list += re.findall("\nITEM [0-9][0-9. ]*", clean_text)
    items_list += re.findall("\n [ ]*ITEM [0-9][0-9. ]*", clean_text)
    # exhibits are included under item 9.01
    # use the dirty version to build a list of locations in the file.
    locations = []
    for item in items_list:
        locations += [clean_text.index(item)]
    # add the ending point for Exhibits, where the About Co section starts.
    locations += [len(clean_text)]
    # clean the items list
    name_list = re.findall('item [0-9.][0-9. ]*', str(items_list).lower())
    filings = {}
    for i in range(len(name_list)):
        name = name_list[i]
        filings[name] = clean_text[locations[i]:locations[i + 1]]
    return filings
def attachment(querystr, n):
    db = Database()
    query = Query(db, querystr)
    if query.count_messages() != 1:
        redirect('/!/%s/' % querystr)
    else:
        message = next(iter(query.search_messages()))
        parts = message.get_message_parts()
        i = n - 1
        if i >= len(parts):
            redirect('/!/%s/' % querystr)
        else:
            part = parts[i]
            content_type = part.get_content_type()
            response.content_type = content_type
            # response.charset = part.get_content_charset()
            fn = part.get_filename()
            if fn is not None:
                response.headers['content-disposition'] = \
                    'filename="%s";' % unidecode(fn).replace('"', '')
            payload = message.get_part(n)
            if 'html' in content_type.lower():
                return clean_html(payload)
            else:
                return payload
def do_scrape():
    az_html = scraperwiki.scrape('http://www.lambeth.gov.uk/Services/')
    list_root = lxml.html.fromstring(az_html)
    for a in list_root.cssselect("div.AZ li a"):
        try:
            page_title = a.text
            page_link = 'http://www.lambeth.gov.uk' + a.get('href')
            print "scraping " + page_link
            page_full_html = scraperwiki.scrape(page_link)
            page_root = lxml.html.fromstring(page_full_html)
            # pull out the section details
            print page_root.cssselect('div.breadCrumb a')[2].text
            sections_csv = page_root.cssselect('div.breadCrumb a')[2].text
            # check it is a content page, not a nav page
            if page_full_html.find('cScape.Lambeth.GenericTemplates/ServiceCategory.aspx') < 0 and \
               page_full_html.find('cScape.Lambeth.GenericTemplates/DocumentSummary.aspx') < 0 and \
               page_full_html.find('cScape.Lambeth.GenericTemplates/GroupDocument.aspx') < 0:
                content_fragment = page_root.cssselect('div.page')[0]
                for toplink in content_fragment.cssselect('div.topLink'):
                    content_fragment.remove(toplink)
                content_html = lxml.html.tostring(content_fragment)
                content_html = clean_html(content_html)
                scraperwiki.sqlite.save(
                    unique_keys=["source_url"],
                    data={"source_url": page_link, "title": page_title,
                          "content": content_html, 'sections_csv': sections_csv})
            else:
                print "ignoring nav page"
        except Exception:
            print "something went wrong"
def handle_submit(self, converted):
    request = self.request
    context = self.context
    # *will be* modified event
    objectEventNotify(ObjectWillBeModifiedEvent(context))
    if converted.get('middle_portlets'):
        middle_portlets = split_lines(converted['middle_portlets'])
    else:
        middle_portlets = []
    if converted.get('right_portlets'):
        right_portlets = split_lines(converted['right_portlets'])
    else:
        right_portlets = []
    context.title = converted['title']
    context.address = converted['address']
    context.city = converted['city']
    context.state = converted['state']
    context.country = converted['country']
    context.zipcode = converted['zipcode']
    context.telephone = converted['telephone']
    context.navigation = clean_html(converted['navigation'])
    context.middle_portlets = middle_portlets
    context.right_portlets = right_portlets
    context.css = converted['css']
    # *modified* event
    objectEventNotify(ObjectModifiedEvent(context))
    location = resource_url(context.__parent__['intranets'], request)
    return HTTPFound(location=location)
def create():
    """Create a new post for the current user."""
    if request.method == 'POST':
        title = request.form['title']
        body = clean_html(request.form['body'])
        visibility = request.form['visibility']
        error = None
        if not title:
            error = 'عنوان لازم است.'  # "A title is required."
        if error is not None:
            flash(error)
        else:
            db = get_db()
            db.execute(
                'INSERT INTO post (title, body, visibility, author_id)'
                ' VALUES (?, ?, ?, ?)',
                (title, body, maybeNone(visibility), g.user['id']))
            db.commit()
            return redirect(url_for('blog.index'))
    db = get_db()
    users = db.execute('SELECT id, username'
                       ' FROM user'
                       ' ORDER BY id DESC').fetchall()
    return render_template('blog/create.html', users=users)
def build_entry_content(entry, teaser=False, teaser_size=None):
    from lxml.html.clean import clean_html
    content = clean_html(parse_entry_content(entry))
    if teaser:
        content = truncate_html(content, teaser_size)
    return content
def strip_tags(url):
    from lxml import html
    from lxml.html.clean import clean_html
    tree = html.parse(url)
    tree = clean_html(tree)
    text = tree.getroot().text_content()
    return text.split()
def _scrape_response(self, headers, response):
    """
    Scrape the html response.
    """
    # identify the responding server
    server_type = None
    server_string = headers.get('server', '')
    if server_string and 'jetty' in server_string.lower():
        server_type = 'jetty'
    if server_string and 'coyote' in server_string.lower():
        import lxml.html
        server_type = 'tomcat'
    reason = None
    full_html = ''
    dom_tree = None
    if server_type == 'tomcat':
        # Tomcat doesn't produce a valid XML response
        soup = lxml.html.fromstring(response)
        body_node = soup.find('body')
        p_nodes = body_node.cssselect('p')
        for p_node in p_nodes:
            children = p_node.getchildren()
            if len(children) >= 2 and 'message' in children[0].text.lower():
                reason = children[1].text
        if reason is None:
            from lxml.html.clean import clean_html
            full_html = clean_html(response)
    else:
        # Let's assume others do produce a valid XML response
        try:
            dom_tree = ET.fromstring(response)
            reason_node = None
            # html page might be different for every server
            if server_type == 'jetty':
                reason_node = dom_tree.find('body/pre')
            else:
                reason_node = dom_tree.find('head/title')
            if reason_node is not None:
                reason = reason_node.text
            if reason is None:
                full_html = ET.tostring(dom_tree)
        except SyntaxError as err:
            full_html = "%s" % response
    full_html = full_html.replace('\n', '')
    full_html = full_html.replace('\r', '')
    full_html = full_html.replace('<br/>', '')
    full_html = full_html.replace('<br />', '')
    full_html = full_html.strip()
    return reason, full_html
def _html_serialize(self, chunks, attributes):
    """Returns concatenated HTML code with SPAN tag.

    Args:
      chunks: The list of chunks to be processed. (ChunkList)
      attributes: If a dictionary, it should be a map of name-value pairs
        for attributes of output SPAN tags. If a string, it should be a
        class name of output SPAN tags. If an array, it should be a list
        of class names of output SPAN tags. (str or dict or list of str)

    Returns:
      The organized HTML code. (str)
    """
    doc = lxml.etree.Element('span')
    for chunk in chunks:
        if chunk.is_space():
            if doc.getchildren():
                doc.getchildren()[-1].tail = ' '
        else:
            ele = lxml.etree.Element('span')
            ele.text = chunk.word
            for k, v in attributes.items():
                ele.attrib[k] = v
            doc.append(ele)
    result = lxml.etree.tostring(
        doc, pretty_print=False, encoding='utf-8').decode('utf-8')
    result = clean_html(result)
    return result
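# For reference, a hedged sketch of the markup _html_serialize produces;
# the chunk objects and the surrounding class are not reproduced here,
# only the span-building logic from the snippet, with hand-built input:
import lxml.etree

doc = lxml.etree.Element('span')
for word in ('Hello', 'world'):
    ele = lxml.etree.Element('span')
    ele.text = word
    ele.attrib['class'] = 'chunk'  # hypothetical attribute value
    doc.append(ele)
    doc[-1].tail = ' '
print(lxml.etree.tostring(doc, encoding='utf-8').decode('utf-8'))
# roughly: <span><span class="chunk">Hello</span> <span class="chunk">world</span> </span>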