def to_representation(self, obj):
    return {
        'id': obj.pk,
        'title': clean_html(obj.title),
        'pubDate': obj.created_at.strftime("%d.%m.%y"),
        'fullText': clean_html(obj.description) if obj.description else '',
    }
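# For context, a minimal, hedged sketch of what clean_html itself does
# (not part of the original snippets; behavior may vary across lxml
# versions, and in recent releases the cleaner lives in the separate
# lxml_html_clean package):
from lxml.html.clean import clean_html

# String in, string out: <script> elements and on* event-handler
# attributes are stripped by the default Cleaner.
print(clean_html('<p onclick="x()">hi<script>evil()</script></p>'))
# roughly: '<p>hi</p>'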
def __init__(self, elemento):
    self.is_read = False
    self.is_marked_as_unread = False
    self.is_liked = False
    self.is_shared = False
    self.is_starred = False
    self.is_browsed = False
    self.is_emailed = False
    self.is_twitter = False
    self.is_readitlater = False
    # self.crawlTimeMsec = self._get(elemento, 'crawlTimeMsec')
    self.id = self._get(elemento, 'id')
    self.categories = self._get(elemento, 'categories')
    self.title = self._get(elemento, 'title')
    self.alternate = self._get(elemento, 'alternate')
    if len(self.alternate) > 0:
        self.alternate = self.alternate[0]
    self.published = self._get(elemento, 'published')
    self.updated = self._get(elemento, 'updated')
    self.summary = self._get(elemento, 'summary')
    if self.summary is not None:
        self.summary = clean_html('<div>%s</div>' % (self.summary['content']))
    self.content = self._get(elemento, 'content')
    if self.content is not None:
        self.content = clean_html('<div>%s</div>' % (self.content['content']))
    self.author = self._get(elemento, 'author')
    self.likingUsers = self._get(elemento, 'likingUsers')
    self.comments = self._get(elemento, 'comments')
    self.annotations = self._get(elemento, 'annotations')
    self.origin = self._get(elemento, 'origin')
def _cleanArticleText(self, article):
    if BLANK_RE.match(article.title):
        untitle = ''
    else:
        try:
            title = clean_html(article.title)
            untitle = lxml.html.fromstring(title).text_content()
        except Exception:
            untitle = ''
    if BLANK_RE.match(article.content):
        uncontent = ''
    else:
        try:
            content = clean_html(article.content)
            uncontent = lxml.html.fromstring(content).text_content()
        except Exception:
            uncontent = ''
    uncontent = untitle + uncontent
    if BLANK_RE.match(uncontent):
        return None, None
    else:
        termbag = []
        splitter = re.compile(u"([^\s\)\(\]\[.,\":;\-+!¡¿?\{\}]+)")
        for i in splitter.finditer(uncontent):
            term = i.groups()[0]
            # dict.has_key() is Python 2 only; `in` works in both versions
            if term not in self.stopwords:
                if re.match("^[0-9]+(?:[.,][0-9]+)*$", term) is None:
                    termbag.append(term.lower())
        return untitle, " ".join(termbag)
def test_clean_invalid_root_tag(self):
    # only testing that cleaning with invalid root tags works at all
    s = lxml.html.fromstring('parent <invalid tag>child</another>')
    self.assertEqual('parent child', clean_html(s).text_content())
    s = lxml.html.fromstring('<invalid tag>child</another>')
    self.assertEqual('child', clean_html(s).text_content())
def to_representation(self, obj):
    return {
        'id': obj.pk,
        'title': clean_html(obj.title),
        'pubDate': obj.created_at.strftime("%d.%m.%y"),
        'cover': obj.image.big if obj.image else '',
        'fullText': clean_html(obj.content) if obj.content else '',
    }
def parse_url_and_save(url):
    try:
        html_document = parse(url).getroot()
    except Exception:
        print("failed to load the url or parse the document")
        return
    page_contents = html_document.find_class("page_content")
    if len(page_contents) == 0:
        return
    page_content = page_contents[0]
    news_contents = page_content.find_class("content_c")
    if len(news_contents) == 0:
        return
    news_content = news_contents[0]
    news_details = page_content.find_class("news_detail")
    if len(news_details) == 0:
        return
    news_detail = news_details[0]
    # Article author
    authors = news_content.find_class("autor_name")
    if len(authors) == 0:
        return
    author = authors[0].find('a')
    author = clean_html(author).text_content()
    # Article title
    header = news_content.find("h1")
    header = clean_html(header).text_content()
    header = header.strip(' \t\n\r')
    if len(header) == 0:
        return
    # Article date
    created = page_content.find_class('date_time')[0].find_class('date')[0]
    created = clean_html(created).text_content()
    # Topic
    topic = news_content.find_class('rubric')[0]
    topic = clean_html(topic).text_content()
    # Source
    source = url
    # Publication year
    publ_year = created[6:]
    # Article text
    text = news_detail.find("article")
    text = clean_html(text).text_content()
    text = text.strip(' \t\n\r')
    # Save the extracted info to a file and append a row to the csv
    path = save_text_to_file(author, header, created, topic, source, text)
    add_to_csv(path, author, header, topic, created, source, publ_year)
def striphtml(content):
    """Returns ``content`` stripped of all HTML tags and of the contents
    of <style> and <script> tags. It will also remove any tabs, newline
    characters and non-breaking spaces.
    """
    if not isinstance(content, basestring):
        return u""
    content = re_script.sub(u"", content)
    doc = html.fragment_fromstring(content, create_parent=True)
    # clean_html returns a cleaned copy rather than mutating its argument,
    # so the result has to be kept
    doc = clean.clean_html(doc)
    return unicode(re_nl.sub(u"", doc.text_content()))
def to_representation(self, obj):
    return {
        'id': obj.pk,
        'name': clean_html(obj.name),
        'pubDate': obj.created_at.strftime("%d.%m.%y"),
        'currency': currency_symbol(obj.currency),
        'price': obj.cost,
        'cover': obj.image.big if obj.image else '',
        'details': clean_html(obj.description) if obj.description else ''
    }
def to_representation(self, obj):
    return {
        'id': obj.pk,
        'name': clean_html(obj.name),
        'endDate': obj.end_coupon_date.strftime("%d.%m.%y"),
        'currency': currency_symbol(obj.currency),
        'oldPrice': obj.cost,
        'cover': obj.image.big if obj.image else '',
        'details': clean_html(obj.description) if obj.description else '',
        'percent': obj.coupon_discount_percent
    }
def parse(self, response):
    page = response.url.split("/")[-2]
    filename = 'quotes-%s.html' % page
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(str(response.body)), parser)
    result = etree.tostring(tree.getroot(), pretty_print=True, method="html")
    print("_" * 70, result, "_" * 70)
    # the original mixed print() calls with a Python 2 print statement
    print(clean_html(response.body))
    with open(filename, 'wb') as f:
        f.write(result)
    self.log('Saved file %s' % filename)
def to_representation(self, obj):
    return {
        'id': obj.pk,
        'title': clean_html(obj.title),
        'pubDate': obj.created_at.strftime("%d.%m.%y"),
        'shortText': clean_html(Truncator(obj.description).words("30", html=True)) if obj.description else '',
    }
def get_url(url):
    http_pattern = '^http://'
    if re.search(http_pattern, url):
        urlfh = urllib.urlopen(url)
        content = urlfh.read()
        html_tree = lxml.html.fromstring(content)
        # clean_html returns a cleaned copy (removes crud from the html);
        # the result must be assigned, it does not clean in place
        html_tree = clean_html(html_tree)
        clean_html_string = lxml.html.tostring(html_tree, encoding=unicode,
                                               method='text')
        return io.StringIO(clean_html_string)
    else:
        raise Exception("Bad url: {}".format(url))
def to_representation(self, obj):
    return {
        'img': obj.user.profile.avatar.th if obj.user and obj.user.profile.avatar else '',
        'name': clean_html(obj.user.profile.full_name) if obj.user and obj.user.profile.full_name else '',
        'post': clean_html(obj.name) if obj.name else '',
        'phone': clean_html(obj.user.profile.mobile_number) if obj.user and obj.user.profile.mobile_number else ''
    }
def get_description(self, url):
    """Fetches the job's page and gets its description."""
    html = self.fetch_url(url)
    doc = lxh.fromstring(html)
    if 'indeed.' not in url:
        return clean_html(html)
    el = doc.find('.//span[@id="job_summary"]')
    if el is None:
        return clean_html(html)
    raw = etree.tostring(el, encoding='utf8')  # renamed from `bytes`, which shadows the builtin
    html = raw.decode()
    return self.highlight_words(html)
def test_clean_with_comments(self):
    html = """<p><span style="color: #00ffff;">Cy<!-- xx -->an</span><!-- XXX --></p>"""
    s = lxml.html.fragment_fromstring(html)
    self.assertEqual(b'<p><span>Cyan</span></p>',
                     lxml.html.tostring(clean_html(s)))
    self.assertEqual('<p><span>Cyan</span></p>',
                     clean_html(html))
    cleaner = Cleaner(comments=False)
    result = cleaner.clean_html(s)
    self.assertEqual(b'<p><span>Cy<!-- xx -->an</span><!-- XXX --></p>',
                     lxml.html.tostring(result))
    self.assertEqual('<p><span>Cy<!-- xx -->an</span><!-- XXX --></p>',
                     cleaner.clean_html(html))
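# The test above exercises the Cleaner's `comments` switch. As a hedged
# standalone sketch of that same API (exact output may vary slightly
# across lxml versions):
from lxml.html.clean import Cleaner

cleaner = Cleaner(comments=False)  # keep HTML comments; other defaults apply
print(cleaner.clean_html('<p>hi<!-- keep me --><script>evil()</script></p>'))
# roughly: '<p>hi<!-- keep me --></p>'  (scripts are still removed)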
def to_representation(self, obj):
    if obj.short_description:
        short_text = obj.short_description
    else:
        short_text = Truncator(obj.content).words("30", html=True)
    return {
        'id': obj.pk,
        'title': clean_html(obj.title),
        'pubDate': obj.created_at.strftime("%d.%m.%y"),
        'cover': obj.image.big if obj.image else '',
        'shortText': clean_html(short_text) if short_text else '',
    }
def to_representation(self, obj):
    if obj.short_description:
        short_text = obj.short_description
    else:
        short_text = Truncator(obj.description).words("30", html=True)
    return {
        'id': obj.pk,
        'name': clean_html(obj.name),
        'pubDate': obj.created_at.strftime("%d.%m.%y"),
        'currency': currency_symbol(obj.currency),
        'price': obj.cost,
        'cover': obj.image.big if obj.image else '',
        'details': clean_html(short_text) if short_text else ''
    }
def download_and_parse(self):
    if self._parsed:
        raise Exception('This article ({}) has already been parsed.'.format(self.url))
    self.download_date = datetime.utcnow()
    self.source_domain = urlparse(self.url).netloc
    self._parsed = True
    try:
        self.html = requests.get(self.url).content
    except requests.exceptions.RequestException:
        raise IOError("Could not download the article at: %s" % self.url)
    # clean_html does not alter its argument in place; it returns the
    # cleaned markup, so the result has to be kept.
    self.html = clean_html(self.html)
    doc = document_fromstring(self.html)
    parsers.parse_article(self, doc)
def extract_paragraphs(element):
    if element.tag == 'hr':
        return []
    if element.tag == 'p':
        text = clean_html(element).text_content()
        text = ' '.join(text.split())
        return [text]
    if element.tag[0] == 'h':
        text = clean_html(element).text_content()
        text = ' '.join(text.split())
        return [text]
    out = list()
    for child in get_children(element):
        out.extend(extract_paragraphs(child))
    return out
def main():
    quan = int(argv[1])
    st = int(argv[2])
    query = ' '.join(argv[3:])
    page = 1
    curr = 0
    while curr < quan:
        response = requests.get(CSN, params={'s': query, 'page': page})
        # print(response.content)
        treem = html.fromstring((clean_html(response.content)).strip())
        cntpage = len(treem.xpath("//table[@class='tbtable'][1]//tr[@title]"))
        for idx in range(cntpage):
            curr += 1
            if st > curr:
                continue
            if curr == quan:
                break
            try:
                page1 = treem.xpath("//table[@class='tbtable'][1]//tr[@title][%d]/td[2]//a[@class='musictitle']/@href" % (idx + 1))[0]
                title = treem.xpath("//table[@class='tbtable'][1]//tr[@title][%d]/td[2]//a//text()" % (idx + 1))[0]
                print('Downloading %3d of %3d : %s' % (curr, quan, title))
                response = requests.get(page1)
                tree = html.fromstring((clean_html(response.content)).strip())
                page2 = tree.xpath("//img[@src='http://data.chiasenhac.com/images/button_download.gif']/../@href")[0]
                response = requests.get(page2)
                qual = 1
                found = False
                tree2 = html.fromstring((clean_html(response.content)).strip())
                while not found:
                    try:
                        mlink = tree2.xpath("//div[@id='downloadlink2']//a[last() - %d]/@href" % (qual))[0]
                        request = urllib2.Request(mlink)
                        print(mlink)
                        request.get_method = lambda: 'HEAD'
                        response = urllib2.urlopen(request)
                        if 'http://chiasenhac.vn/' not in response.url:
                            found = True
                        else:
                            print('Reducing quality')
                            qual += 1
                    except Exception as e:
                        print('Reducing quality')
                        qual += 1
                os.system('aria2c "%s" -d ./downloads' % (mlink))
                sleep(1)
            except Exception as e:
                print(e)
        page += 1
def get_video_transcript(self, video_id):
    """
    Retrieves and formats transcripts for the passed video.

    TODO: If no captions are available, download the audio track and pass
    it into Cloud Speech-to-Text? For now we just return None, implying
    that we can't perform sentiment analysis on the video content itself.
    """
    video = YouTube('https://www.youtube.com/watch?v={}'.format(video_id))
    captions = video.captions.get_by_language_code('en')
    if not captions:
        logger.info('Unable to return transcript for video %r!', video_id)
        return
    # format captions as plaintext and strip trailing whitespace and html
    try:
        captions = ElementTree.fromstring(captions.xml_captions)
    except UnicodeEncodeError:
        xml = captions.xml_captions.encode("utf-8")
        captions = ElementTree.fromstring(xml)
    captions_list = []
    for subtitle in captions.getchildren():
        text = subtitle.text or u''
        # collapse newlines and doubled spaces
        caption = unescape(text.replace('\n', ' ').replace('  ', ' '))
        captions_list.append(u"{text} ".format(text=caption))
    transcript = clean_html(
        html.fromstring(u''.join(captions_list).strip()))
    return transcript.text_content().strip()
def dataset_comments(pkg_id):
    comment_list = []
    try:
        dbd = parse_db_config('ckan.drupal.url')
        if dbd:
            # the user/password placeholders were masked in the source; all
            # four values are interpolated from the parsed db config
            drupal_conn_string = "host='%s' dbname='%s' user='%s' password='%s'" % (
                dbd['db_host'], dbd['db_name'], dbd['db_user'], dbd['db_pass'])
            drupal_conn = pg2.connect(drupal_conn_string)
            drupal_cursor = drupal_conn.cursor()
            # add this to the SQL statement to limit comments to those that
            # are published: 'and status = 0'
            drupal_cursor.execute(
                """select c.subject, to_char(to_timestamp(c.changed), 'YYYY-MM-DD'),
                          c.name, c.thread, f.comment_body_value
                   from comment c
                   inner join field_data_comment_body f on c.cid = f.entity_id
                   inner join opendata_package o on o.pkg_node_id = c.nid
                   where o.pkg_id = %s""", (pkg_id,))
            for comment in drupal_cursor:
                comment_body = clean_html(comment[4])
                comment_list.append({'subject': comment[0],
                                     'date': comment[1],
                                     'thread': comment[3],
                                     'comment_body': comment_body,
                                     'user': comment[2]})
            drupal_cursor.close()
            drupal_conn.close()
    except KeyError:
        pass
    return comment_list
def readme(repository):
    """
    Return a rendered version of the readme for the given repository
    """
    if not repository.readme or not repository.readme.strip():
        return 'No readme :('
    readme = None
    try:
        if repository.readme_type == 'markdown':
            readme = markup.markdown(repository.readme)
        elif repository.readme_type == 'textile':
            readme = markup.textile(repository.readme)
        elif repository.readme_type == 'rest':
            readme = markup.restructuredtext(repository.readme)
    except Exception:
        pass
    if not readme:
        readme = '<pre>%s</pre>' % urlize(repository.readme)
    try:
        result = mark_safe(clean_html(readme))
    except Exception:
        result = 'Unreadable readme :('
    return result
def get_lyrics(self):
    element = self.element
    # Replace <br> tags with \n (prepend it with \n and then remove all
    # occurrences of <br>)
    for br in element.cssselect('br'):
        br.tail = '\n' + br.tail if br.tail else '\n'
    etree.strip_elements(element, 'br', with_tail=False)
    # Remove unneeded tags
    bad_tags = element.cssselect('.rtMatcher') + \
        element.cssselect('.lyricsbreak')
    for tag in bad_tags:
        tag.drop_tree()
    # Remove HTML comments
    real_string = etree.tostring(element, encoding=unicode)
    cleaned_html = clean_html(real_string)
    # -KMS Modification-
    # try/except prevents the script from crashing when run from
    # AppleScript; the original except branch repeated the try branch
    # verbatim, so on UnicodeError we fall back to replacing characters
    # that cannot be encoded.
    text = html.fragment_fromstring(cleaned_html).text_content()
    try:
        print u'{0}'.format(text).encode('utf-8').strip()
    except UnicodeError:
        print u'{0}'.format(text).encode('utf-8', 'replace').strip()
    return 0
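# The <br>-to-newline trick used above is self-contained; a minimal,
# hedged demo of just that step (the input markup is illustrative):
import lxml.html
from lxml import etree

el = lxml.html.fragment_fromstring('<p>line one<br>line two</p>')
for br in el.cssselect('br'):
    br.tail = '\n' + br.tail if br.tail else '\n'
etree.strip_elements(el, 'br', with_tail=False)
print(el.text_content())  # 'line one\nline two'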
def get_content(url):
    req = Request(url)
    req.add_header('User-Agent', USER_AGENT)
    html = urlopen(req).read()
    html = clean_html(html)
    root = fromstring(html)
    return root.getroottree()
def clean_content(self):
    """
    Do our usual HTML cleanup. Do we want to mangle the markup field
    to always be "html"?
    """
    self.cleaned_data['content'] = clean_html(self.cleaned_data['content'])
    self.cleaned_data['content'] = autolink_html(self.cleaned_data['content'])
    return self.cleaned_data['content']
def rtvslo(page):
    title_regex = r'<h1>(.+)</h1>'
    subtitle_regex = r'<div class="subtitle">(.+)</div>'
    lead_regex = r'<p class="lead">(.+)</p>'
    content_regex = r'<div class="article-body">\s*([\s\S]+?\s*</article>)'
    author_regex = r'<div class="author-name">(.+)</div>'
    date_regex = r'<div class="publish-meta">\s*(.*)\s*<br>'
    data = {
        'title': re.search(title_regex, page).group(1),
        'subtitle': re.search(subtitle_regex, page).group(1),
        'lead': re.search(lead_regex, page).group(1),
        'content': clean.clean_html(
            re.search(content_regex, page).group(1).replace('\t', '')),
        'author': re.search(author_regex, page).group(1),
        'date': re.search(date_regex, page).group(1)
    }
    return data
def _to_python(self, value, state):
    try:
        clean = clean_html(value)
    except Exception:
        msg = 'Unable to parse the provided HTML'
        raise Invalid(msg, value, state)
    return clean
def get_content(url):
    req = urllib2.Request(url, None, headers)
    html = urllib2.urlopen(req).read()
    html = clean_html(html)
    root = fromstring(html)
    return root.getroottree()
def get_lyrics(self):
    response = requests.get(self.url)
    page_html = html.document_fromstring(response.text)
    element = page_html.cssselect(self.CSS_SELECTOR)[0]
    # Replace <br> tags with \n (prepend it with \n and then remove all
    # occurrences of <br>)
    for br in element.cssselect('br'):
        br.tail = '\n' + br.tail if br.tail else '\n'
    etree.strip_elements(element, 'br', with_tail=False)
    # Remove unneeded tags
    bad_tags = element.cssselect('.rtMatcher') + \
        element.cssselect('.lyricsbreak')
    for tag in bad_tags:
        tag.drop_tree()
    # Remove HTML comments
    real_string = etree.tostring(element, encoding="UTF-8")
    cleaned_html = clean_html(real_string)
    info_output = format_song_info(self.json['artist'], self.json['song'])
    lyric_output = html.fragment_fromstring(cleaned_html).text_content()
    return u'{}{}'.format(info_output, lyric_output)
def full_body(self):
    signature = render_to_string('layout/signature.txt', {})
    return """
%s

<span style="color: #666">%s</span>
""" % (clean_html(self.body), signature.replace('\n', '<br/>\n'))
def get_sensor_status():
    """
    Parses PDU status HTML and returns sensor readings.
    """
    url = '/sensors.html'
    res = dispatch_request(url)
    if res[0] != 200:
        raise Exception('Failed to get status')
    data = res[1]
    data = clean_html(data)
    tree = parse_html(data)
    id1 = parse_value(tree, '/html/body/div/div/table[2]/tr[5]/td[2]/font')
    id2 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[2]/font')
    lab1 = parse_value(tree, '/html/body/div/div/table[2]/tr[5]/td[3]/font/b')
    lab2 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[3]/font/b')
    temp1 = parse_value(tree, '/html/body/div/div/table[2]/tr[5]/td[4]/font/b/font/b')
    temp2 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[4]/font/b/font/b')
    hum1 = parse_value(tree, '/html/body/div/div/table[2]/tr[5]/td[5]/font/b/font/b')
    hum2 = parse_value(tree, '/html/body/div/div/table[2]/tr[6]/td[5]/font/b/font/b')
    hum1 = hum1.replace(' %', '')
    hum2 = hum2.replace(' %', '')
    temp1 = temp1.replace(' Deg. F', '')
    temp2 = temp2.replace(' Deg. F', '')
    res = [{'id': id1, 'label': lab1, 'temp': temp1, 'hum': hum1},
           {'id': id2, 'label': lab2, 'temp': temp2, 'hum': hum2}]
    return res
def tokenize(self, granularity="max"):  # use granularity to debug!
    """
    Extracts actual text from web pages. Implements three granularity
    levels (min, mid, max):

    * min represents text in the style of a one-line string
    * mid represents text in the style closest to the format of a web page
    * max represents text in word tokens

    "+" versions (e.g., max+) do not double-check for trailing punctuation.
    """
    def maxLambda(l):
        words = l.split()  # impossible with list comprehensions
        for i, word in enumerate(words):
            if word[-1] in [',', '.', ';', '?', '\'', '"']:
                # TODO something about "word..."
                words[i] = word[:-1] + "\n" + word[-1]
        return "\n".join(words)

    strippedJS = clean.clean_html(self.raw)
    # NB: nltk.util.clean_html was removed in NLTK 3.x (it raises
    # NotImplementedError there); this code requires NLTK 2.
    strippedHTML = nltk.util.clean_html(strippedJS)
    ampersands = "&[a-zA-Z]{2,4};"  # remove html entities
    stripped = re.sub(ampersands, "", strippedHTML)  # such as &amp; &gt; etc.
    tokensFormat = (granularity == "mid") and (lambda l: l) or maxLambda
    punctuation = re.compile(r'.+[,.;?\"]{1,3}$')  # split trailing punctuation
    self.tokenized = tokensFormat(stripped)
def scraper_worker(worker_id, q, r, timeout=2, wayback=False):
    wayback_base = "http://web.archive.org/web/"
    iteration = 0
    con, cur = setup_db()
    for item in q.consume():
        if not item:
            # received sentinel
            print "%d: RECEIVED SENTINEL" % worker_id
            break
        syllabi_id, link = item
        if wayback:
            link = wayback_base + link
        try:
            req = requests.get(link, timeout=timeout)
            if req.status_code != requests.codes.ok:
                r.incr("errors")
            else:
                src = clean_html(req.text)
                if wayback:
                    src = strip_wayback(src)
                try:
                    cur.execute("INSERT INTO " + db_settings.table_name +
                                " (syllabiID, chnm_cache) VALUES (%s,%s)",
                                (syllabi_id, src))
                    con.commit()
                    r.incr("success")
                except mdb.Error, e:
                    print "---> DB insert error on worker %d on iteration %d -> %s\n\tReconnecting cursor..." % (worker_id, iteration, e)
                    r.incr("dberrors")
                    con, cur = setup_db()
        except Exception:
            r.incr("timeouts")
def mpcHc_installLatestReleaseVersion(self, version, pathname, silent=False,
                                      archive=False, compact=False,
                                      compatText=False):
    log('Identifying filename of MPC-HC download ...')
    html = clean_html(requests.get(MPCHC_DOWNLADS, headers=HEADERS_TRACKABLE).text)
    url = MPCHC_LINK_ARCHIVE if archive else MPCHC_LINK_INSTALLER
    initialUrl = re.search(url, html).group(1)
    log(' done.\n')
    retries = 0
    while True:
        log('Selecting filehost for MPC-HC download ...')
        response = requests.get(initialUrl, headers=HEADERS_SF).text
        filehostResolver = re.search('<meta[^>]*?url=(.*?)["\']', response, re.I).group(1)
        filehostName = re.search('use_mirror=([a-z\-]+)', filehostResolver).group(1)
        filehostUrl = filehostResolver[:filehostResolver.index('?')].replace(
            'downloads', filehostName + '.dl')
        log(' done: %s.\n' % filehostName)
        time.sleep(1)
        log('Downloading %s ...' % filehostUrl)
        response = requests.get(filehostUrl, headers=HEADERS_SF).content
        log(' done.\n')
        if response.strip().endswith('</html>') or len(response) < 1e6:
            retries += 1
            if retries < 10:
                log('Selected filehost is not serving MPC-HC %s, trying another filehost.\n' % version, RED)
                time.sleep(2)
            else:
                log('It appears no filehost can be found serving MPC-HC %s, aborting for now.\n' % version, RED)
                return
        else:
            break
    mpcHc_install(response, version, pathname, silent, archive, compact, compatText)
def get_lyrics(self):
    response = requests.get(self.url)
    page_html = html.document_fromstring(response.text)
    element = page_html.cssselect(self.CSS_SELECTOR)[0]
    # Replace <br> tags with \n (prepend it with \n and then remove all
    # occurrences of <br>)
    for br in element.cssselect('br'):
        br.tail = '\n' + br.tail if br.tail else '\n'
    etree.strip_elements(element, 'br', with_tail=False)
    # Remove unneeded tags
    bad_tags = element.cssselect('.rtMatcher') + \
        element.cssselect('.lyricsbreak')
    for tag in bad_tags:
        tag.drop_tree()
    # Remove HTML comments
    real_string = etree.tostring(element, encoding=unicode)
    cleaned_html = clean_html(real_string)
    info_output = format_song_info(self.json['artist'], self.json['song'])
    lyric_output = html.fragment_fromstring(cleaned_html).text_content()
    return u'{}{}'.format(info_output, lyric_output)
def clean_comment(self):
    self.cleaned_data['comment'] = clean_html(self.cleaned_data['comment'])
    self.cleaned_data['comment'] = autolink_html(self.cleaned_data['comment'])
    self.cleaned_data['comment'] = autolink_email(self.cleaned_data['comment'])
    return self.cleaned_data['comment']
def parse(url, etag=None, modified=None):
    data = feedparser.parse(url, etag=etag, modified=modified, agent=USER_AGENT)
    entries = []
    feed = data.get('feed', {})
    for entry in data.get('entries', []):
        description = entry.get('description')
        description = description and clean_html(description)
        timestamp = entry.get('date_parsed')
        timestamp = timestamp and datetime.datetime(*timestamp[:6]).isoformat()
        entry = {
            'id': create_entry_id(entry),
            'author': entry.get('author'),
            'link': entry.get('link'),
            'title': entry.get('title'),
            'description': description,
            'timestamp': timestamp,
        }
        entries.append(entry)
    return {
        'url': url,
        'entries': entries,
        'feed': {
            'title': feed.get('title'),
            'link': feed.get('link'),
        },
        'etag': data.get('etag'),
        'modified': data.get('modified'),
    }
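# Hedged usage sketch for the feed parser above (the URL is illustrative):
# feeding the returned 'etag'/'modified' validators back in makes
# feedparser issue a conditional GET, so unchanged feeds return no new
# entries on the next poll.
result = parse('https://example.com/feed.xml')
result = parse('https://example.com/feed.xml',
               etag=result['etag'], modified=result['modified'])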
def crawler(url, depth, term):
    if depth < 0:
        return
    try:
        page = urllib2.urlopen(url)
    except Exception:
        print "Error at url: %s;\n" % (url)
        return
    data = page.read()
    if term in data:
        print "found %s in %s\n" % (term, url)
        returnString = "<tr><td>%s</td><td>%s</td></tr>" % (url, depth)
        results.append(returnString)
    data = clean_html(data)
    root = fromstring(data)
    links = root.xpath('.//a/@href')
    passed.append(url)
    for link in links:
        next_url = urljoin(url, link)  # renamed from `next`, which shadows the builtin
        if next_url not in passed:
            crawler(next_url, depth - 1, term)
def strip_html_tags(html_text):
    tree = html.fromstring(html_text)
    clean_tree = clean_html(tree)
    result = clean_tree.text_content().strip()
    result = result.replace("\\n", "").replace("\\t", "").replace("\\r", "")
    return result
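# A quick, hedged check of strip_html_tags (assumes `from lxml import html`
# and `from lxml.html.clean import clean_html`, as in the other snippets;
# note the function strips literal backslash-escapes, not real newlines):
print(strip_html_tags('<div><script>x()</script><p>Hello,\\nworld</p></div>'))
# roughly: 'Hello,world'  (script content cleaned away, literal "\n" removed)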
def save(self, force_insert=False, force_update=False):
    # validate HTML content
    # Additional options at http://codespeak.net/lxml/lxmlhtml.html#cleaning-up-html
    self.content = clean_html(self.content)
    #self.content = autolink_html(self.content)
    super(Announcement, self).save(force_insert, force_update)
def handle_submit(self, converted):
    request = self.request
    context = self.context
    # *will be* modified event
    objectEventNotify(ObjectWillBeModifiedEvent(context))
    if converted.get('middle_portlets'):
        middle_portlets = split_lines(converted['middle_portlets'])
    else:
        middle_portlets = []
    if converted.get('right_portlets'):
        right_portlets = split_lines(converted['right_portlets'])
    else:
        right_portlets = []
    context.title = converted['title']
    context.address = converted['address']
    context.city = converted['city']
    context.state = converted['state']
    context.country = converted['country']
    context.zipcode = converted['zipcode']
    context.telephone = converted['telephone']
    context.navigation = clean_html(converted['navigation'])
    context.middle_portlets = middle_portlets
    context.right_portlets = right_portlets
    # *modified* event
    objectEventNotify(ObjectModifiedEvent(context))
    location = resource_url(context.__parent__['intranets'], request)
    return HTTPFound(location=location)
def get_HTML_element(xpath, url):
    """Returns a string representation of the HTML element given in
    `xpath` from `url`.

    :param xpath: xpath to element
    :type xpath: str
    :param url: URL address from which `xpath` will be downloaded
    :type url: str
    """
    response = urlopen(url)
    enc = response.headers.get('content-type', 'utf-8').split('charset=')[-1]
    tree = etree.parse(response, etree.HTMLParser())
    try:
        el = clean_html(etree.tostring(tree.xpath(xpath)[0]))
    except IndexError as e:
        # the original referenced an undefined name `n` here; report the
        # xpath that failed instead
        raise HTMLElementNotExists(
            'HTML element for %s doesn\'t exist!' % xpath) from e
    try:
        el = el.decode(enc, 'ignore')
    except LookupError:
        el = el.decode('utf-8', 'ignore')
    return _unpack(el)
def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE,
             checker=None, options=None):
    self.text = text
    self.url = url
    self.verbose = verbose
    self.maxpage = maxpage
    self.checker = checker
    self.options = options
    # The parsing of the page is done in the __init__() routine in
    # order to initialize the list of names the file contains.
    # Stored the parser in an instance variable. Passed the URL to
    # MyHTMLParser().
    size = len(self.text)
    if self.maxpage and size > self.maxpage:
        self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size * 0.001))
        self.parser = None
        return
    if options:
        text = self.reformat(text, url)
    self.checker.note(2, " Parsing %s (%d bytes)", self.url, size)
    text = clean_html(text)
    try:
        # self.parser = lxml.html.fromstring(text)
        self.parser = lxml.html.soupparser.fromstring(text)
        self.parser.resolve_base_href()
        self._html = tostring(self.parser, encoding=unicode,
                              method="html", pretty_print=True)
        return
    # `except A, B:` binds the exception to B in Python 2; a tuple is
    # what was intended here
    except (UnicodeDecodeError, HTMLParseError):
        pass
def html_cleaner(html_file):
    '''
    This function removes the tags from an HTML file, leaving only the
    content.

    TODO: instead of removing the tags, use them!
    '''
    tree = etree.parse(html_file, html.HTMLParser())
    tree = clean_html(tree)
    clean_text = tree.getroot().text_content()
    # first build a dirty version of the lists, which has newline characters
    items_list = re.findall("\n [ ]*Item [0-9][0-9. ]*", clean_text)
    items_list += re.findall("\nItem [0-9][0-9. ]*", clean_text)
    items_list += re.findall("\nITEM [0-9][0-9. ]*", clean_text)
    items_list += re.findall("\n [ ]*ITEM [0-9][0-9. ]*", clean_text)
    # exhibits are included under item 9.01
    # use the dirty version to build a list of locations in the file.
    locations = []
    for item in items_list:
        locations += [clean_text.index(item)]
    # add the ending point for Exhibits, where the About Co section starts.
    locations += [len(clean_text)]
    # clean the items list
    name_list = re.findall('item [0-9.][0-9. ]*', str(items_list).lower())
    filings = {}
    for i in range(len(name_list)):
        name = name_list[i]
        filings[name] = clean_text[locations[i]:locations[i + 1]]
    return filings
def attachment(querystr, n):
    db = Database()
    query = Query(db, querystr)
    if query.count_messages() != 1:
        redirect('/!/%s/' % querystr)
    else:
        message = next(iter(query.search_messages()))
        parts = message.get_message_parts()
        i = n - 1
        if i >= len(parts):
            redirect('/!/%s/' % querystr)
        else:
            part = parts[i]
            content_type = part.get_content_type()
            response.content_type = content_type
            # response.charset = part.get_content_charset()
            fn = part.get_filename()
            if fn is not None:
                response.headers['content-disposition'] = \
                    'filename="%s";' % unidecode(fn).replace('"', '')
            payload = message.get_part(n)
            if 'html' in content_type.lower():
                return clean_html(payload)
            else:
                return payload
def do_scrape():
    az_html = scraperwiki.scrape('http://www.lambeth.gov.uk/Services/')
    list_root = lxml.html.fromstring(az_html)
    for a in list_root.cssselect("div.AZ li a"):
        try:
            page_title = a.text
            page_link = 'http://www.lambeth.gov.uk' + a.get('href')
            print "scraping " + page_link
            page_full_html = scraperwiki.scrape(page_link)
            page_root = lxml.html.fromstring(page_full_html)
            # pull out the section details
            print page_root.cssselect('div.breadCrumb a')[2].text
            sections_csv = page_root.cssselect('div.breadCrumb a')[2].text
            # check it is a content page, not a nav page
            if page_full_html.find('cScape.Lambeth.GenericTemplates/ServiceCategory.aspx') < 0 and \
               page_full_html.find('cScape.Lambeth.GenericTemplates/DocumentSummary.aspx') < 0 and \
               page_full_html.find('cScape.Lambeth.GenericTemplates/GroupDocument.aspx') < 0:
                content_fragment = page_root.cssselect('div.page')[0]
                for toplink in content_fragment.cssselect('div.topLink'):
                    content_fragment.remove(toplink)
                content_html = lxml.html.tostring(content_fragment)
                content_html = clean_html(content_html)
                scraperwiki.sqlite.save(
                    unique_keys=["source_url"],
                    data={"source_url": page_link, "title": page_title,
                          "content": content_html, 'sections_csv': sections_csv})
            else:
                print "ignoring nav page"
        except Exception:
            print "something went wrong"
def handle_submit(self, converted):
    request = self.request
    context = self.context
    # *will be* modified event
    objectEventNotify(ObjectWillBeModifiedEvent(context))
    if converted.get('middle_portlets'):
        middle_portlets = split_lines(converted['middle_portlets'])
    else:
        middle_portlets = []
    if converted.get('right_portlets'):
        right_portlets = split_lines(converted['right_portlets'])
    else:
        right_portlets = []
    context.title = converted['title']
    context.address = converted['address']
    context.city = converted['city']
    context.state = converted['state']
    context.country = converted['country']
    context.zipcode = converted['zipcode']
    context.telephone = converted['telephone']
    context.navigation = clean_html(converted['navigation'])
    context.middle_portlets = middle_portlets
    context.right_portlets = right_portlets
    context.css = converted['css']
    # *modified* event
    objectEventNotify(ObjectModifiedEvent(context))
    location = resource_url(context.__parent__['intranets'], request)
    return HTTPFound(location=location)
def create():
    """Create a new post for the current user."""
    if request.method == 'POST':
        title = request.form['title']
        body = clean_html(request.form['body'])
        visibility = request.form['visibility']
        error = None
        if not title:
            error = 'عنوان لازم است.'  # "A title is required."
        if error is not None:
            flash(error)
        else:
            db = get_db()
            db.execute(
                'INSERT INTO post (title, body, visibility, author_id)'
                ' VALUES (?, ?, ?, ?)',
                (title, body, maybeNone(visibility), g.user['id']))
            db.commit()
            return redirect(url_for('blog.index'))
    db = get_db()
    users = db.execute('SELECT id, username'
                       ' FROM user'
                       ' ORDER BY id DESC').fetchall()
    return render_template('blog/create.html', users=users)
def build_entry_content(entry, teaser=False, teaser_size=None):
    from lxml.html.clean import clean_html
    content = clean_html(parse_entry_content(entry))
    if teaser:
        content = truncate_html(content, teaser_size)
    return content
def strip_tags(url):
    from lxml import html
    from lxml.html.clean import clean_html
    tree = html.parse(url)
    tree = clean_html(tree)
    text = tree.getroot().text_content()
    return text.split()
def _scrape_response(self, headers, response):
    """
    Scrape the html response.
    """
    # identify the responding server
    server_type = None
    server_string = headers.get('server', '')
    if server_string and 'jetty' in server_string.lower():
        server_type = 'jetty'
    if server_string and 'coyote' in server_string.lower():
        import lxml.html
        server_type = 'tomcat'
    reason = None
    full_html = ''
    dom_tree = None
    if server_type == 'tomcat':
        # Tomcat doesn't produce a valid XML response
        soup = lxml.html.fromstring(response)
        body_node = soup.find('body')
        p_nodes = body_node.cssselect('p')
        for p_node in p_nodes:
            children = p_node.getchildren()
            if len(children) >= 2 and 'message' in children[0].text.lower():
                reason = children[1].text
        if reason is None:
            from lxml.html.clean import clean_html
            full_html = clean_html(response)
    else:
        # Let's assume others do produce a valid XML response
        try:
            dom_tree = ET.fromstring(response)
            reason_node = None
            # html page might be different for every server
            if server_type == 'jetty':
                reason_node = dom_tree.find('body/pre')
            else:
                reason_node = dom_tree.find('head/title')
            if reason_node is not None:
                reason = reason_node.text
            if reason is None:
                full_html = ET.tostring(dom_tree)
        except SyntaxError as err:
            full_html = "%s" % response
    full_html = full_html.replace('\n', '')
    full_html = full_html.replace('\r', '')
    full_html = full_html.replace('<br/>', '')
    full_html = full_html.replace('<br />', '')
    full_html = full_html.strip()
    return reason, full_html
def _html_serialize(self, chunks, attributes):
    """Returns concatenated HTML code with SPAN tag.

    Args:
      chunks: The list of chunks to be processed. (ChunkList)
      attributes: If a dictionary, it should be a map of name-value pairs
        for attributes of output SPAN tags. If a string, it should be a
        class name of output SPAN tags. If an array, it should be a list
        of class names of output SPAN tags. (str or dict or list of str)

    Returns:
      The organized HTML code. (str)
    """
    doc = lxml.etree.Element('span')
    for chunk in chunks:
        if chunk.is_space():
            if doc.getchildren():
                doc.getchildren()[-1].tail = ' '
        else:
            ele = lxml.etree.Element('span')
            ele.text = chunk.word
            for k, v in attributes.items():
                ele.attrib[k] = v
            doc.append(ele)
    result = lxml.etree.tostring(
        doc, pretty_print=False, encoding='utf-8').decode('utf-8')
    result = clean_html(result)
    return result
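# For reference, a hedged sketch of the markup _html_serialize produces;
# the chunk objects and the surrounding class are not reproduced here,
# only the span-building logic from the snippet, with hand-built input:
import lxml.etree

doc = lxml.etree.Element('span')
for word in ('Hello', 'world'):
    ele = lxml.etree.Element('span')
    ele.text = word
    ele.attrib['class'] = 'chunk'  # hypothetical attribute value
    doc.append(ele)
    doc[-1].tail = ' '
print(lxml.etree.tostring(doc, encoding='utf-8').decode('utf-8'))
# roughly: <span><span class="chunk">Hello</span> <span class="chunk">world</span> </span>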