def normalize_entities(html):
    # turn &nbsp; and its aliases into normal spaces
    # (entity spellings below are assumed; the original literals were decoded away)
    html = html.replace(u'&nbsp;', u' ')
    html = html.replace(u'&#160;', u' ')
    html = html.replace(u'&#xa0;', u' ')
    html = html.replace(u'\xa0', u' ')
    return html
def _video_urls(self):
    if self._is_shelf():
        return None
    if self.video_count is not None:
        return self.video_urls
    self.video_count = 0
    rows = self.tree_html.xpath("//div[@id='tabItemDetails']//a/@href")
    rows = [r for r in rows if "video." in r or "/mediaroom/" in r or ("//media." in r and (".flv" in r or ".mov" in r))]
    url = "http://content.webcollage.net/sc/smart-button?ird=true&channel-product-id=%s" % self._product_id()
    html = urllib.urlopen(url).read()
    # \"src\":\"\/_cp\/products\/1374451886781\/tab-6174b48c-58f3-4d4b-8d2f-0d9bf0c90a63
    # \/552b9366-55ed-443c-b21e-02ede6dd89aa.mp4.mobile.mp4\"
    video_base_url = self._find_between(html, 'data-resources-base=\\"', '\\">').replace("\\", "") + "%s"
    m = re.findall(r'"src":"([^"]*?\.mp4)"', html.replace("\\", ""), re.DOTALL)
    for item in m:
        if ".blkbry" in item or ".mobile" in item:
            pass
        else:
            if video_base_url % item not in rows and item.count(".mp4") < 2:
                rows.append(video_base_url % item)
    m = re.findall(r'"src":"([^"]*?\.flv)"', html.replace("\\", ""), re.DOTALL)
    for item in m:
        if ".blkbry" in item or ".mobile" in item:
            pass
        else:
            if video_base_url % item not in rows and item.count(".flv") < 2:
                rows.append(video_base_url % item)
    if len(rows) < 1:
        return None
    new_rows = [r for r in rows if ("%s.flash.flv" % r) not in rows]
    self.video_urls = list(set(new_rows))
    self.video_count = len(self.video_urls)
    return self.video_urls
def get_html(self, options, line_id=None, additional_context=None):
    templates = self.get_templates()
    report = {'name': self.get_report_name(),
              'company_name': self.env.user.company_id.name}
    lines = self.with_context(
        self.set_context(options)).get_lines(options, line_id=line_id)
    rcontext = {
        'report': report,
        'lines': {'columns_header': self.get_columns_name(options), 'lines': lines},
        'options': options,
        'context': self.env.context,
        'model': self,
    }
    if additional_context and type(additional_context) == dict:
        rcontext.update(additional_context)
    render_template = templates.get(
        'main_template', 'stock_kardex.main_template')
    if line_id is not None:
        render_template = templates.get(
            'line_template', 'stock_kardex.line_template')
    html = self.env['ir.ui.view'].render_template(
        render_template,
        values=dict(rcontext),
    )
    if self.env.context.get('print_mode', False):
        for k, v in self.replace_class().items():
            html = html.replace(k, v)
        html = html.replace(
            b'<div class="js_stock_report_footnotes"></div>',
            self.get_html_footnotes(''))
    return html
def barebones(url):
    html = url
    if checkurl(url):
        html = gethtml(url)
    if not html:
        return None
    # This chops out the following tags AND all the presumably extraneous content in-between.
    for nuketagblock in ['title', 'head']:
        html = deletenode(html, nuketagblock)
    html = bodycopy(html)
    html = stripcomments(html)
    # Same as above, but a second-pass on the usual code-bloating suspects in between body tags.
    for nuketagblock in ['header', 'footer', 'nav', 'script', 'style', 'noscript', 'form',
                         'object', 'embed', 'select']:
        html = deletenode(html, nuketagblock)
    html = stripparams(html)
    html = lowercasetags(html)
    # html = striplists(html)
    html = stripemptyhtml(html)
    html = stripbr(html)
    # This strips out the following tags, but leaves the in-between content in place.
    for nuketag in ['label', 'section', 'article', 'div', 'span', 'img', 'a', 'b', 'i',
                    'param', 'table', 'td', 'tr', 'font', 'title', 'head', 'meta',
                    'strong', 'em', 'iframe']:
        html = deletetag(html, nuketag)
    html = stripwhitespace(html)
    html = stripcrlf(html)
    html = onetagoneline(html)
    html = convert_html_entities(html)
    html = lesslines(html)
    html = html.replace('\n', ' ')
    html = html.replace('  ', ' ')  # assumed: collapse doubled spaces left by the '\n' replacement
    html = html.strip()
    return html
def CreateReport(Date, dfChange, dfAdditions, dfDeletions, report_filepath, template_filepath):
    log('Creating report html...')
    # stop the dataframe from truncating cell contents. This needs to be set
    # if you want html links to work in cell contents
    pd.set_option('display.max_colwidth', -1)
    with open(template_filepath, 'r') as template:
        htmltemplate = template.read()

    additionsTable = dfAdditions.to_html(
        na_rep=" ", index=False,
        classes="table table-bordered text-left table-striped table-hover table-sm")
    changeTable = dfChange.to_html(
        na_rep=" ", index=False,
        classes="table table-bordered text-left table-striped table-hover table-sm")
    deletionsTable = dfDeletions.to_html(
        na_rep=" ", index=False,
        classes="table table-bordered text-left table-striped table-hover table-sm")

    with open(report_filepath, 'w', encoding='utf-8') as f:
        html = htmltemplate.replace('__DATE__', Date).replace(
            '__CHANGELEN__', str(len(dfChange))).replace('__DFCHANGES__', changeTable)
        if len(dfAdditions) > 0:
            html = html.replace('__ADDITIONSLEN__', str(len(dfAdditions))).replace(
                '__DFADDITIONS__', additionsTable)
        else:
            html = html.replace('__ADDITIONSLEN__', str(len(dfAdditions))).replace(
                '__DFADDITIONS__', '')
        if len(dfDeletions) > 0:
            html = html.replace('__DELETIONSLEN__', str(len(dfDeletions))).replace(
                '__DFDELETIONS__', deletionsTable)
        else:
            html = html.replace('__DELETIONSLEN__', str(len(dfDeletions))).replace(
                '__DFDELETIONS__', '')
        # undo the escaping applied by DataFrame.to_html ('&lt;'/'&gt;' spellings assumed)
        html = html.replace('&lt;', '<').replace('&gt;', '>').replace(
            '\\', '/').replace('\u2011', '-').replace('\u2015', '―').replace(
            'ī', '').replace('─', '—')
        f.write(html)
        f.close()
    pass
    print("Exported html report to..." + report_filepath)
    log("Exported html report to..." + report_filepath)
def get_html(self, options, line_id=None, additional_context=None):
    '''
    return the html value of report, or html value of unfolded line
    * if line_id is set, the template used will be the line_template
    otherwise it uses the main_template. Reason is for efficiency, when unfolding a line in the report
    we don't want to reload all lines, just get the one we unfolded.
    '''
    templates = self.get_templates()
    report_manager = self.get_report_manager(options)
    report = {'name': self.get_report_name(),
              'summary': report_manager.summary,
              'company_name': self.env.user.company_id.name,}
    ctx = self.set_context(options)
    lines = self.with_context(ctx).get_lines(options, line_id=line_id)

    if options.get('hierarchy'):
        lines = self.create_hierarchy(lines)

    footnotes_to_render = []
    if self.env.context.get('print_mode', False):
        # we are in print mode, so compute footnote number and include them in lines values,
        # otherwise, let the js compute the number correctly as we don't know all the visible lines.
        footnotes = dict([(str(f.line), f) for f in report_manager.footnotes_ids])
        number = 0
        for line in lines:
            f = footnotes.get(str(line.get('id')))
            if f:
                number += 1
                line['footnote'] = str(number)
                footnotes_to_render.append({'id': f.id, 'number': number, 'text': f.text})

    rcontext = {'report': report,
                'lines': {'columns_header': self.get_columns_name(options), 'lines': lines},
                'options': options,
                'context': self.env.context,
                'model': self,
                }
    if additional_context and type(additional_context) == dict:
        rcontext.update(additional_context)
    if ctx.get('analytic_account_ids'):
        rcontext['options']['analytic_account_ids'] = [
            {'id': acc.id, 'name': acc.name} for acc in ctx['analytic_account_ids']
        ]
    render_template = templates.get('main_template', 'account_reports.main_template')
    if line_id is not None:
        render_template = templates.get('line_template', 'account_reports.line_template')
    html = self.env['ir.ui.view'].render_template(
        render_template,
        values=dict(rcontext),
    )
    if self.env.context.get('print_mode', False):
        for k, v in self.replace_class().items():
            html = html.replace(k, v)
        # append footnote as well
        html = html.replace(b'<div class="js_account_report_footnotes"></div>',
                            self.get_html_footnotes(footnotes_to_render))
    return html
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext
    html = ustr(html)
    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            link.text = '%s [%s]' % (link.text, i)
            url_index.append(url)

    html = ustr(etree.tostring(tree, encoding=encoding))

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub('<br\s*/?>', '\n', html)
    html = re.sub('<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i + 1, url)
    return html
def convertHtml(html):
    html = html.replace('&nbsp;', ' ')  # assumed entity spelling; the literal was decoded away
    html = html.replace('<p>', '')
    html = html.replace('</p>', '')
    html = html.replace('\t', '')
    html = html.replace('/b>', '')
    html = html.replace('-', '')
    html = html.replace('<b>', '')
    html = html.replace('<br>', '')
    html = html.replace('<', '')
    return html
def to_xhtml(self, html, base_url):
    # assumed entity spellings; the original literals were decoded away
    html = html.replace('&nbsp;', ' ')
    html = html.replace('&mdash;', '—')
    try:
        xhtml = etree.fromstring(html, lxml.html.XHTMLParser(), base_url=base_url)
    except etree.ParseError as what:
        error("etree.fromstring says %s" % what)
        raise
    xhtml.make_links_absolute(base_url=base_url)
    return xhtml
def process_canvas_reference(item, html):
    """
    Replace $CANVAS_OBJECT_REFERENCE$ with edx /jump_to_id/<url_name>
    """
    object_id = urllib.parse.unquote(item).replace("$CANVAS_OBJECT_REFERENCE$/quizzes/", "/jump_to_id/")
    html = html.replace(item, object_id)
    return html
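# Usage sketch for process_canvas_reference (the item/html values below are hypothetical):
# the Canvas reference is unquoted and rewritten to an edX /jump_to_id/ link.
item = "%24CANVAS_OBJECT_REFERENCE%24/quizzes/12345"
html = '<a href="%24CANVAS_OBJECT_REFERENCE%24/quizzes/12345">Quiz</a>'
print(process_canvas_reference(item, html))
# -> <a href="/jump_to_id/12345">Quiz</a>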
def __call__(self, url, html):
    if html == '':
        print 'empty html from downloader'
        raise Exception("empty html")
        # return None
    # if url == self.seed_url:
    # if we do not decode, it sometimes fails and raises 'encoding error : input conversion
    # failed due to input error, bytes 0x84 0x31 0x95 0x33.'
    # so decode manually, and add param 'ignore'
    html = html.decode('GBK', 'ignore').encode('GBK')
    urls = []
    results = []
    queue = self.queue
    # filter for links matching our regular expression
    # and self.same_domain(link, seed_url)
    for oneUrl in (self.normalize(self.seed_url, link)
                   for link in self.get_links(html) if re.search('allvisit_', link)):
        if self.same_domain(oneUrl, self.seed_url) and (oneUrl not in queue or queue[oneUrl] != 2):
            results.append(oneUrl)
    # sometimes we need to remove the following markup
    html = html.replace('''"/> <meta property=''', '')
    tree = lxml.html.fromstring(html)
    fixText = lxml.html.tostring(tree, pretty_print=True)
    tree = lxml.html.fromstring(fixText)
    for t in tree.cssselect('ul.xbk'):
        book = []
        name = None
        for index, tag in enumerate(t.cssselect('li.tjxs > span')):
            if index == 0:
                templink = tag.cssselect('a')[0].attrib['href']
                book.append(self.normalize(self.seed_url, templink))
                name = tag.cssselect('a')[0].text_content()
                # print name
                # print tag.cssselect('a')[0].text_content()
                # print tag.cssselect('a')[0].attrib['href']
            if index == 1:
                book.append(tag.cssselect('a')[0].text_content())
                book.append(tag.cssselect('a')[0].attrib['href'])
                # print tag.cssselect('a')[0].text_content()
                # print tag.cssselect('a')[0].attrib['href']
            if index == 2:
                book.append(tag.text_content())
                # print tag.text_content()
            if index == 3:
                book.append(tag.cssselect('i')[0].text_content())
                # print tag.cssselect('i')[0].text_content()
        if name is not None:
            self.book_data[name] = book
    return results
def crawl(self):
    for p in range(self.first, self.last + 1):
        browser = requests.get(self.url.format(p))
        text = browser.text
        pattern = re.compile(r'<div.*</div>', re.DOTALL | re.MULTILINE)
        m = pattern.search(text)
        if m:
            html = m.group()
            html = html.replace('\\"', '"').replace('\\n', '')
            html = lxml.html.fromstring(html)
            divs = html.xpath('//div[@class="tabDiv"]')
            trs = divs[0].xpath('.//tr')
            for n in range(1, len(trs)):
                tds = trs[n].xpath('.//td')
                url = tds[0].xpath('./a')[0].attrib["href"]
                name = tds[0].text_content().strip()
                degree_provided = tds[3].text_content().strip()
                data = {
                    "name": name,
                    "url": self.BASE_URL + url,
                    "degree_provided": degree_provided,
                }
                print(json.dumps(data), file=self.ipin_url)
            print("Page{:>6}: [done]".format(p))
            print("Page{:>6}: [done]".format(p), file=self.ipin_url_log)
        else:
            print("Page{:>6}: [fail]".format(p))
            print("Page{:>6}: [fail]".format(p), file=self.ipin_url_log)
    self.ipin_url.close()
    self.ipin_url_log.close()
def read_wods_json(file):
    with open(file) as json_file:
        wods = json.load(json_file)
    for k, v in wods.items():
        html = '<p> </p>'.join(v)
        img_urls = re.findall('src="([^"]+)"', html)
        new = [
            f'<a href="{url.replace("avatar_thumbnail", "feed_photo")}" target="_blank"><img style="width: 100px; height: 100px;" src="{url}" alt="Media for Result"></a>'
            for url in img_urls
        ]
        old = [f'<img src="{url}" alt="Media for Result">' for url in img_urls]
        for i, j in zip(new, old):
            html = html.replace(j, i)
        wods[k] = html
    df_wod = pd.DataFrame(wods, index=['html']).T
    df_wod.index.name = 'date'
    df_wod = df_wod.reset_index()
    df_wod['date'] = pd.to_datetime(df_wod['date'])
    return wods, df_wod
def load_entry(url):
    html = scraperwiki.scrape(url)
    html = html.replace("<br/>", "\n")
    if not "ureg-utdocument2.xsl" in html:
        return False
    doc = lxml.html.fromstring(html)
    last_key = None
    base = doc.find(".//div/div/div").xpath("string()").split("\n")
    base = [b.replace(u"\xc2\xa0", "").replace(" - ", "").strip() for b in base]
    base = [b for b in base if len(b)]
    data = {"Court": base[1],
            "CompanyRegister": base[2],
            "CompanyNumber": base[3],
            "CompanyName": base[4]}
    id = data.get("Court") + data.get("CompanyRegister") + data.get("CompanyNumber")
    data['UniqueID'] = sha1(id.encode("ascii", "ignore")).hexdigest()
    for elem in doc.findall(".//div"):
        if elem.get('class') == 'col1':
            last_key = elem.xpath("string()").strip()
            last_key = last_key.replace(":", "")
            if 'Eintragsdatum' in last_key:
                last_key = 'CreationDate'
            last_key = NAME_MAP.get(last_key, last_key)
        if elem.get('class') == 'col2':
            if 'Bilanz vorhanden' in last_key:
                opts = elem.findall('.//option')
                opts = [o.text for o in opts]
                if None in opts:
                    continue
                data['BalanceDates'] = "/".join(opts)
            elif 'Anschrift' in last_key:
                data['Address'] = elem.xpath("string()")
            elif last_key == 'CreationDate':
                cd, _ = elem.xpath("string()").strip().split("(", 1)
                data[last_key] = cd.strip()
            else:
                data[last_key] = elem.xpath("string()").strip()
    scraperwiki.datastore.save(["UniqueID"], data)
    return True
def getSiteContact(self, account, username, mobile):
    HOST = "dealer.che168.com"
    # if account in config.che168VIPAccountList:
    #     HOST = "dealer.che168.com"
    # else:
    #     HOST = "dealers.che168.com"
    conn = httplib.HTTPConnection(HOST, timeout=timeout_che168)
    headers = copy.copy(self.headers)
    conn.request("GET", "/car/publish/?s=1", headers=headers)
    res = conn.getresponse()
    resHeaders = res.getheaders()
    resRead = res.read()
    html = self.decodeBody(resHeaders, resRead)
    html = html.decode('GB18030')
    html = html.replace("gb2312", "utf-8")
    dom = lxml.html.fromstring(html)
    contactItems = dom.xpath('//*[@id="sh_linkMan_div"]/a/@rel')
    conn.close()
    if len(contactItems) == 0:
        return self.createNewContact(username, mobile)
    logger.debug(str(contactItems))
    for salesid in contactItems:
        # if self.checkCurrentContact(salesid, mobile) is True:
        return salesid
    return self.createNewContact(username, mobile)
def scrape_series():
    years = scrape_years()
    data = []
    id = itertools.count(0)
    for year in years[:2]:
        url = BASE_URL + year['link']
        html = scraperwiki.scrape(url)
        root = lxml.html.fromstring(html.replace("\n", ""))
        for el1 in root.cssselect("p.ciGrndSubHead"):
            for el2 in el1.getnext().cssselect("dl.seasnResult"):
                series = el2.getchildren()[0].getchildren()[0]
                status = el2.getchildren()[1].text
                if status:
                    status = status.strip()
                data.append({
                    "id": id.next(),
                    "status": status,
                    "class": el1.text,
                    "title": series.text,
                    "link": series.attrib['href'],
                    "year": year['year']
                })
    return data
def scrape_matches(series=[]):
    data = []
    for a_series in series[:2]:
        html = scraperwiki.scrape(BASE_URL + a_series['link'])
        root = lxml.html.fromstring(html.replace("\n", ""))
        id = itertools.count(0)
        titles = root.cssselect("p.potMatchHeading")
        for title in titles:
            match = {
                'id': id.next(),
                'title': re.sub(r'\s+', " ", title.text_content()),
                'series_id': a_series['id'],
            }
            _links_iter = itertools.takewhile(lambda el: el.tag == 'p', title.itersiblings())
            for (k, el) in ((el.attrib['class'].split(' ')[1][4:], el) for el in _links_iter):
                if k == 'links':
                    links = el.cssselect("span a")
                    for link in links:
                        match[re.sub(r'\(\d+\)', "", link.text_content()) + '_link'] = link.attrib['href']
                else:
                    match[k] = re.sub(r'\s+', " ", el.text_content())
            data.append(match)
    return data
def get_ticket_count_for_current_lottery(date, html):
    html = html.replace("\r", '').replace("\n", '')
    match = re.search('<div class="row">\s*<h2>Ziehung\s\w\w\,\s*' + date +
                      '<\/h2>\s*(.*)<!--\/.row-->', html)
    if match is None:
        return 0
    else:
        return len(re.findall('/Schein-Nummer/', match[1]))
def load_mds_extractive_summaries(summaries_tar: str) -> Dict[str, Dict[int, List[List[str]]]]:
    summaries = defaultdict(lambda: defaultdict(list))
    with tarfile.open(summaries_tar, 'r') as tar:
        for member in tar.getmembers():
            if member.isfile():
                if member.name.startswith('./extracts_abstracts/d'):
                    path = member.name.split('/')
                    cluster = path[2][:-2]
                    filename = path[-1]
                    if filename in ['200e', '400e']:
                        length = int(filename[:-1])
                        html = tar.extractfile(member).read().decode()
                        # There is a typo in this particular file where the closing
                        # tag is actually an opening tag, and it messes up the parse
                        if member.name.endswith('d118if/200e'):
                            html = html.replace('of <s>', 'of </s>')
                        tree = lxml.html.document_fromstring(html)
                        labels = []
                        for node in tree.xpath('//s'):
                            doc = node.get('docid')
                            num = int(node.get('num'))
                            index = num - 1
                            labels.append((doc, index))
                        annotator = path[2][-1].upper()
                        summary = {'annotator': annotator, 'labels': labels}
                        if labels in summaries[cluster][length]:
                            print(f'Cluster {cluster} has duplicate extractive summaries of length {length}')
                        else:
                            summaries[cluster][length].append(summary)
    return summaries
def get_html(self):
    html = self.read_variable("document.documentElement.innerHTML")
    if not html:
        return ""
    for encoding in encodings:
        header = 'charset=%s' % encoding
        if header in html:
            html = html.replace(header, 'charset=utf-8')
            break
    parser = lxml.html.HTMLParser()
    tree = lxml.etree.fromstring(html, parser)
    head = tree.find('head')
    if head is not None:
        base = tree.find('head/base')
        if base is None:
            base = lxml.html.Element("base")
            head.insert(0, base)
        uri = self.get_main_frame().get_uri()
        if uri is None:
            return html
        base.attrib['href'] = os.path.dirname(uri)
    return lxml.html.tostring(tree, encoding="utf-8")
def find_ID(name):  # name is the title of the show
    try:
        url1 = 'https://movie.douban.com/j/subject_suggest?q='
        url2 = urllib.parse.quote(name)  # URLs only allow a subset of ASCII; other characters (e.g. Chinese) must be percent-encoded
        url = url1 + url2  # link for this show; the encoded name is appended to the query
        html = requests.get(url)  # fetch the html page
        html = html.content.decode()  # decode the response content as UTF-8
        html_list = html.replace('\/', '/')  # turn every \/ in the html into / (purely for readability)
        html_list = html_list.split('},{')  # split each entry on the page into its own list element

        # Regexes to pull the wanted information out of the html (find the id by title)
        str_title = '"title":"' + name + '"'  # match the show title name
        pattern_title = re.compile(str_title)
        str_id = '"id":"' + '[0-9]*'  # match the show's id value
        pattern_id = re.compile(str_id)

        # Extract the corresponding ID from each item of html_list
        id_list = []  # collected IDs
        for l in html_list:
            find_results_title = re.findall(pattern_title, l, flags=0)  # entries whose title matches name
            if find_results_title != []:  # if there is an entry with title == name, i.e. a match
                find_results_id = re.findall(pattern_id, l, flags=0)  # pull the id out of that matching item
                id_list.append(find_results_id)  # store the id in id_list

        # Several IDs may match (same title, different shows); pair the name with each produced id
        # so that the two lists line up
        name_list = [name] * len(id_list)
        # Normalize id_list into a plain list of id strings
        id_list = str(id_list).replace('[', '').replace(']', '').replace("'", '').replace('"id":"', '').replace(' ', '')
        id_list = id_list.split(',')
    except:
        # If the code above cannot run (e.g. the page cannot be fetched), report the show name
        print('ERROR:', name)
    return id_list[0]
def build_html_from_post(post):

    def entity_text(e):
        return post['text'][e['pos']:e['pos'] + e['len']]

    link_builder = lambda l: "<a href='%s'>%s</a>" % (l['url'], entity_text(l))

    # map starting position, length of entity placeholder to the replacement html
    entity_map = {}
    for entity_key, builder in [('links', link_builder)]:
        for entity in post.get('entities', {}).get(entity_key, []):
            entity_map[(entity['pos'], entity['len'])] = builder(entity)

    # replace strings with html
    html_pieces = []
    text_idx = 0  # our current place in the original text string
    for entity_start, entity_len in sorted(entity_map.keys()):
        if text_idx != entity_start:
            # if our current place isn't the start of an entity, bring in text until the next entity
            html_pieces.append(post.get('text', "")[text_idx:entity_start])

        # pull out the entity html
        entity_html = entity_map[(entity_start, entity_len)]
        html_pieces.append(entity_html)

        # move past the entity we just added
        text_idx = entity_start + entity_len

    # clean up any remaining text
    html_pieces.append(post.get('text', "")[text_idx:])
    html = ''.join(html_pieces)
    html = html.replace('\n', '<br>')
    # TODO: link to schema
    return '<span>%s</span>' % (html)
def dump_etree_html(etree, tidy=False, indent=True):
    """Renders an Element Tree (lxml.etree) as HTML (bytes)"""
    if tidy:
        return '\n'.join(i for i in walk_etree(etree, indent))
    else:
        html = lxml.etree.tostring(etree, encoding='unicode')
        return html.replace(' ', '')
def _pdf_urls(self):
    if self._is_shelf():
        return None
    if self.pdf_count is not None:
        return self.pdf_urls
    self.pdf_count = 0
    pdf_hrefs = []
    pdfs = self.tree_html.xpath("//a[contains(@href,'.pdf')]")
    for pdf in pdfs:
        try:
            pdf_hrefs.append(pdf.attrib['href'])
        except KeyError:
            pass
    pdfs = self.tree_html.xpath("//a[contains(@href,'pdfpdf')]")
    for pdf in pdfs:
        try:
            if pdf.attrib['href'] not in pdf_hrefs:
                pdf_hrefs.append(pdf.attrib['href'])
        except KeyError:
            pass
    pdfs = self.tree_html.xpath("//a[contains(@href,'pdf')]")
    for pdf in pdfs:
        try:
            if pdf.attrib['href'].endswith("pdf") and pdf.attrib['href'] not in pdf_hrefs:
                pdf_hrefs.append(pdf.attrib['href'])
        except KeyError:
            pass
    pdfs = self.tree_html.xpath("//a[contains(@onclick,'.pdf')]")
    for pdf in pdfs:
        # window.open('http://graphics.samsclub.com/images/pool-SNFRound.pdf','_blank')
        try:
            url = re.findall(r"open\('(.*?)',", pdf.attrib['onclick'])[0]
            if url not in pdf_hrefs:
                pdf_hrefs.append(url)
        except IndexError:
            pass
    pdfs = self.tree_html.xpath("//a[contains(@onclick,'pdf')]")
    for pdf in pdfs:
        # window.open('http://graphics.samsclub.com/images/pool-SNFRound.pdf','_blank')
        try:
            url = re.findall(r"open\('(.*?)',", pdf.attrib['onclick'])[0]
            if url not in pdf_hrefs and url.endswith("pdf"):
                pdf_hrefs.append(url)
        except IndexError:
            pass
    # http://content.webcollage.net/sc/smart-button?ird=true&channel-product-id=prod8570143
    url = "http://content.webcollage.net/sc/smart-button?ird=true&channel-product-id=%s" % self._product_id()
    html = urllib.urlopen(url).read()
    # \"src\":\"\/_cp\/products\/1374451886781\/tab-6174b48c-58f3-4d4b-8d2f-0d9bf0c90a63
    # \/552b9366-55ed-443c-b21e-02ede6dd89aa.mp4.mobile.mp4\"
    m = re.findall(r'wcobj="([^\"]*?\.pdf)"', html.replace("\\", ""), re.DOTALL)
    pdf_hrefs += m
    pdf_hrefs = [r for r in pdf_hrefs if "JewelryDeliveryTimeline.pdf" not in r]
    if len(pdf_hrefs) < 1:
        return None
    self.pdf_urls = pdf_hrefs
    self.pdf_count = len(self.pdf_urls)
    return pdf_hrefs
def get_ticket_count_for_current_lottery(date, html):
    html = html.replace("\r", '').replace("\n", '')
    match = re.search(
        '<h4>Ziehung\s\w\w\s*' + date + '<\/h4>(.*)<div class="tab-pane', html)
    if match is None or match[1] == '':
        return 0
    else:
        return len(re.findall('lotto_balls', match[1]))
def _clean_html(self, html):
    html = html.replace('\\', '')
    html = re.sub('[\n\t\r]', '', html)
    html = re.sub('<!--[^>]*-->', '', html)
    html = re.sub('</?(?!(ul|li|br))\w+[^>]*>', '', html)
    html = re.sub('&nbsp;', ' ', html)  # assumed entity spelling; the literal was decoded away
    html = re.sub('\s+', ' ', html)
    return re.sub('> <', '><', html).strip()
def process_external_tools_link(item, html):
    """
    Replace $CANVAS_OBJECT_REFERENCE$/external_tools/retrieve with appropriate external link
    """
    external_tool_query = urllib.parse.urlparse(item).query
    external_tool_url = urllib.parse.parse_qs(external_tool_query).get("url", [""])[0]
    html = html.replace(item, external_tool_url)
    return html
def to_html(original_rtf, fixed_rtf):
    html = None
    from sh import unrtf
    with NamedTemporaryFile() as xml:
        xml.write(fixed_rtf)
        xml.flush()
        html = bytes(unrtf(xml.name))
    for u in get_unencoded(original_rtf):
        html = html.replace(UNDECODED, u, 1)
    html = html.decode("latin-1")
    for match, correct in get_unencoded_unicode(original_rtf):
        html = html.replace(UNDECODED_UNICODE, correct, 1)
    # assumed entity spellings; the original literals were decoded away
    return html.replace("&gt;", ">").replace("&lt;", "<")
def scrape_transcript(html):
    html = html.replace('&nbsp;', ' ')  # assumed entity spelling; the literal was decoded away
    splitted = re.split(r'={30}.*', html)
    info_html = splitted[0]
    script_html = splitted[1]
    info = parse_episode_info(info_html)
    utterances = parse_script(script_html)
    return (info, utterances)
def getTodos(projects, objects):
    """ Get todos for each project """
    tags_dict = getTags(objects)
    for project in projects:
        for ref_id in project['ref_ids'].split():
            for object in objects:
                if object.attributes['id'].value == ref_id:
                    attribute_nodes = object.getElementsByTagName("attribute")
                    title = ""
                    content = ""
                    datemodified = ""
                    datecreated = ""
                    datecompleted = ""
                    tags = ""
                    for attribute_node in attribute_nodes:
                        if attribute_node.attributes['name'].value == 'title':
                            if attribute_node.childNodes:
                                title = attribute_node.childNodes[0].nodeValue.encode("utf-8")
                            break
                    # Check if todo has a note attached
                    if title:
                        for attribute_node in attribute_nodes:
                            # <attribute name="datemodified" >309306984.40529602766036987305
                            if attribute_node.attributes['name'].value == 'datemodified':
                                datemodified = convertCocoaEpoch(attribute_node.childNodes[0].
                                                                 nodeValue.encode("utf-8"))
                            # <attribute name="datecreated" >306520491.00000000000000000000
                            if attribute_node.attributes['name'].value == 'datecreated':
                                datecreated = convertCocoaEpoch(attribute_node.childNodes[0].
                                                                nodeValue.encode("utf-8"))
                            # <attribute name="datecompleted" type="date">292880221.18648099899291992188
                            if attribute_node.attributes['name'].value == 'datecompleted':
                                datecompleted = convertCocoaEpoch(attribute_node.childNodes[0].
                                                                  nodeValue.encode("utf-8"))
                            if attribute_node.attributes['name'].value == 'content':
                                content = attribute_node.childNodes[0].nodeValue
                                # .encode("utf-8")  # let's encode in writeOutline
                                # I think we need to translate all these things
                                html = content.replace('\\u3c00', '<').replace('\\u3e00', '>')
                                html = html.replace('\u2600', '&')
                                html = lxml.html.fromstring(html)
                                content = html.text_content().split('\n')
                                for l in html.iterlinks():
                                    content += [l[2]]
                        relationship_nodes = object.getElementsByTagName("relationship")
                        for relationship_node in relationship_nodes:
                            if relationship_node.attributes['name'].value == 'tags':
                                try:
                                    tags_id = relationship_node.attributes['idrefs'].value
                                    tags = [tags_dict[t_id] for t_id in tags_id.split()]
                                except:
                                    tags = ""
                        project['todos'].append([title, content, datecreated,
                                                 datemodified, datecompleted, tags])
    return projects
def _parse_book_info(html):
    """Parse Douban book info (author, press, publish date, price).

    :param html(string): raw html of the book-info section
    """
    end_flag = 'END_FLAG'
    html = html.replace('<br>', end_flag)
    html = html.replace('<br/>', end_flag)
    doc = lxml.html.fromstring(html)
    text = doc.text_content()
    pattern = r'{}[::](.*?){}'
    result = dict()
    for key, column in [('author', '作者'), ('press', '出版社'),
                        ('publish_date', '出版年'), ('price', '定价')]:
        result[key] = re.search(pattern.format(column, end_flag), text,
                                re.I | re.DOTALL).group(1).strip()
    return result
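# A minimal usage sketch for _parse_book_info with made-up Douban-style markup
# (the real page layout may differ; the parser only relies on the <br/> separators):
sample = ('<span>作者:</span> 路遥<br/>'
          '<span>出版社:</span> 人民文学出版社<br/>'
          '<span>出版年:</span> 2005-1<br/>'
          '<span>定价:</span> 64.00元<br/>')
print(_parse_book_info(sample))
# -> {'author': '路遥', 'press': '人民文学出版社', 'publish_date': '2005-1', 'price': '64.00元'}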
def fix_minor_whitespace(html):
    html = html.replace('<b>', ' <b>')
    html = html.replace('</b>', '</b> ')
    html = html.replace('<code', ' <code')
    html = html.replace('</code>', '</code> ')
    html = html.replace('<a href', ' <a href')
    html = html.replace('</a>', '</a> ')
    titleHook = '"glyphicon glyphicon-home"></span>'
    html = html.replace(titleHook, titleHook + ' ')
    html = html.replace('"/>', '"/> ')
    return html
def stripemptyhtml(url):
    html = url
    if checkurl(url):
        html = gethtml(url)
    if not html:
        return None
    for anel in ('li', 'ul', 'ol'):
        repme = "<%s></%s>" % (anel, anel)  # assumed: substitute the tag name into the empty-element pattern
        html = html.replace(repme, "")
    return html
def embed_map(m):
    from IPython.display import HTML

    m.save("index.html")
    with open("index.html") as f:
        html = f.read()

    iframe = '<iframe srcdoc="{srcdoc}" style="width: 100%; height: 750px; border: none"></iframe>'
    srcdoc = html.replace('"', "&quot;")  # escape quotes for the srcdoc attribute (entity spelling restored)
    return HTML(iframe.format(srcdoc=srcdoc))
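# A quick usage sketch for embed_map, assuming folium is installed and m is a folium.Map;
# the returned HTML object renders the saved map inline in a Jupyter notebook.
import folium

m = folium.Map(location=[48.8566, 2.3522], zoom_start=12)
embed_map(m)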
def striphtml(html):
    t = html.replace('<br>', '\n')
    try:
        dom = lxml.html.fromstring(t)
        t = dom.text_content()
    except lxml.etree.XMLSyntaxError as e:
        logger.warning(repr(e.message))
        pass
    return t
def parse_call_record(html):
    records = []
    doc_string = html.replace("<script>formateB", "<tr class='call_record'><script>formateB")
    doc = lxml.html.document_fromstring(doc_string)
    records_elements = doc.xpath("//tr[@class='call_record']")
    for record_element in records_elements:
        record = parse_record_element(record_element)
        records.append(record)
    return records
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = html_decode(html)
    return html.strip()
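# Example call for clean_html with a made-up snippet; assumes html_decode (used above)
# maps entities such as &nbsp; back to plain characters.
print(clean_html('<p>Hello&nbsp;<b>world</b><br/>second&nbsp;line</p>'))
# -> 'Hello world\nsecond line' (modulo what html_decode returns for &nbsp;)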
def html(self):
    'Returns an HTML citation.'
    s = ('{authors}, {title}, {journal}, {volissue}, {pages}, '
         '({date}). {doi}.')

    au_link = ('<a href="http://www.scopus.com/authid/detail.url'
               '?origin=AuthorProfile&authorId={0}">{1}</a>')

    if len(self.authors) > 1:
        authors = ', '.join([au_link.format(a.auid,
                                            (str(a.given_name) + ' ' + str(a.surname)))
                             for a in self.authors[0:-1]])
        authors += (' and ' +
                    au_link.format(self.authors[-1].auid,
                                   (str(self.authors[-1].given_name) + ' ' +
                                    str(self.authors[-1].surname))))
    else:
        a = self.authors[0]
        authors = au_link.format(a.auid, str(a.given_name) + ' ' + str(a.surname))

    title = '<a href="{link}">{title}</a>'.format(link=self.scopus_link, title=self.title)

    jname = self.publicationName
    sid = self.source_id
    jlink = ('<a href="http://www.scopus.com/source/sourceInfo.url'
             '?sourceId={sid}">{journal}</a>')
    journal = jlink.format(sid=sid, journal=jname)

    volume = self.volume
    issue = self.issueIdentifier
    if volume and issue:
        volissue = '<b>{0}({1})</b>'.format(volume, issue)
    elif volume:
        volissue = '<b>{0}</b>'.format(volume)
    else:
        volissue = 'no volume'

    date = self.coverDate
    if self.pageRange:
        pages = 'p. {0}'.format(self.pageRange)
    elif self.startingPage:
        pages = 'p. {self.startingPage}'.format(self=self)
    elif self.article_number:
        pages = 'Art. No. {self.article_number}, '.format(self=self)
    else:
        pages = '(no pages found)'

    doi = '<a href="http://dx.doi.org/{0}">doi:{0}</a>'.format(self.doi, self.doi)

    html = s.format(**locals())
    return html.replace('None', '')
def images(html):
    d = Download()
    r = Rehost()
    r.page_html = html
    pool = ThreadPool(3)
    for image in r.get_img_list():
        path = os.path.join('rehost', Rehost.today(), "%s.%s" % (uuid.uuid4(), Rehost.ext(image)))
        # d.download(image, os.path.join(settings.MEDIA_ROOT, path))
        pool.add_task(d.download, image, os.path.join(settings.MEDIA_ROOT, path))
        html = html.replace(image, "/media/%s" % path)
    pool.wait_completion()
    del d
    del r
    return html
def html_node(html):
    '''Returns an ``lxml.Element`` suitable for ``slice_node``.'''
    if not isinstance(html, unicode):
        html = unicode(html, 'utf-8')
    # The catch here is that lxml's HTML parser replaces *some* HTML
    # entity/char escape sequences with their proper Unicode codepoint
    # (e.g., `&amp;` -> `&` and `&quot;` -> `"`).
    # But not all such entities are replaced (e.g., `&Hat;` -> `^`).
    # We can either special case the entities that lxml does replace
    # (no thanks), or just escape every `&` in the HTML, which starts
    # every entity/char escape sequence.
    #
    # We care about this because changing `&` to `&amp;` in the original
    # HTML will throw off indexing.
    return lxml.html.fromstring(html.replace(u'&', u'&amp;'))
def to_xhtml (self, html, base_url):
    # assumed entity spellings; the original literals were decoded away
    html = html.replace (u'&nbsp;', u' ')
    html = html.replace (u'&mdash;', u'—')

    outputfilename = os.path.join (options.outputdir, options.outputfile)
    debugfilename = os.path.splitext (outputfilename)[0] + '.debug.html'

    try:
        os.remove (debugfilename)
    except OSError:
        pass

    if options.verbose > 1:
        with open (debugfilename, 'w') as fp:
            fp.write (html.encode ('utf-8'))

    try:
        xhtml = etree.fromstring (
            html,
            lxml.html.XHTMLParser (),
            base_url = base_url)
    except etree.ParseError, what:
        error ("etree.fromstring says %s" % what)
        raise
def scrape_page(page):
    logInfo("scraping page=" + str(page), iter_=page)
    res = list()
    url = URL_BASE + str(page)
    html = khgscrape(url)
    html = html.replace('"catalog-item catalog-item-odd"', '"catalog-item"')
    root = lxml.html.fromstring(html)
    divs = root.cssselect("div[class='catalog-item']")
    for div in divs:
        data = parseDiv(div)
        print data
        res.append(data)
        # data = {"scraper":scraper,"scraperc":scraperc,"user":user,"userc":userc,"language":lang,"status":stat}
        # res.append(data)
    return res
def add_spans(encoded_html):
    """Given string of encoded html, wrap each element with a span
    and class of element tag.

    e.g. <span class="my-div">&lt;div id='sample'&gt;</span><br>
    """
    # TODO: this only wraps opening element tags in spans.
    # Will need to write separate regex search to handle closing tags.

    def add_span_class(matchobj):
        return "<span class=\"my-{elem}\">&lt;{elem}".format(elem=matchobj.group(1))

    # This is the regex pattern to find the element type: &lt;([A-Z|a-z]+[0-9]*)
    # (the '&lt;'/'&gt;' spellings are assumed; the input is entity-encoded html)
    html = re.sub('&lt;([A-Z|a-z]+[0-9]*)', add_span_class, encoded_html)
    html = html.replace("&gt;", "&gt;</span><br>")
    return html
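# Hypothetical call for add_spans, matching the entity-encoded input it expects:
print(add_spans("&lt;div id='sample'&gt;&lt;p&gt;"))
# -> <span class="my-div">&lt;div id='sample'&gt;</span><br><span class="my-p">&lt;p&gt;</span><br>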
def getTitle(url):
    response = urllib.urlopen(url)
    html = response.read()
    html = html.replace(r'\"', '"')
    soup = BeautifulSoup(html.lower())
    urlTitle = soup.find('title')
    try:
        urlTitleText = urlTitle.text
    except:
        try:
            t = lxml.html.parse(url)
            urlTitleText = t.find(".//title").text
        except:
            print "title not found"
            print url
            urlTitleText = ""
    return urlTitleText.lower()
def parse_html(html, content='xml', lf_on_block=False, space_on_elements=False):
    """Parse element and return etreeElement

    <div> element is added around the HTML
    recovery is used in case of bad markup

    :param str html: HTML markup
    :param str content: use 'xml' for XHTML or non html XML, and 'html' for HTML or if you are unsure
    :param bool lf_on_block: if True, add a line feed on block elements' tail
    :param bool space_on_elements: if True, add a space on each element's tail,
        mainly used to count words with non HTML markup
    :return etree.Element: parsed element
    """
    if not isinstance(html, str):
        raise ValueError("a string is expected")
    if not html:
        return etree.Element('div')

    if content == 'xml':
        # to preserve 'carriage return' otherwise it gets stripped.
        html = html.replace('\r', '&#13;')
        parser = etree.XMLParser(recover=True, remove_blank_text=True)
        root = etree.fromstring("<div>" + html + "</div>", parser)
    elif content == 'html':
        parser = etree.HTMLParser(recover=True, remove_blank_text=True)
        root = etree.fromstring(html, parser)
        if root is None:
            root = etree.Element('div')
        else:
            root = root.find('body')
            root.tag = 'div'
    else:
        raise ValueError('invalid content: {}'.format(content))

    if lf_on_block:
        for elem in root.iterfind('.//'):
            # append \n to the tail
            if elem.tag in BLOCK_ELEMENTS:
                elem.tail = (elem.tail or '') + '\n'
            # prepend \n to the tail
            elif elem.tag in ('br',):
                elem.tail = '\n' + (elem.tail or '')
    if space_on_elements:
        for elem in root.iterfind('.//'):
            elem.tail = (elem.tail or '') + ' '
    return root
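# Usage sketch for parse_html: with content='html' the markup is wrapped in a <div>,
# and lf_on_block appends '\n' to the tail of block elements (BLOCK_ELEMENTS is the
# module-level constant referenced above).
from lxml import etree

root = parse_html('<p>one</p><p>two</p>', content='html', lf_on_block=True)
print(etree.tostring(root))
# -> b'<div><p>one</p>\n<p>two</p>\n</div>' (assuming 'p' is in BLOCK_ELEMENTS)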
def parse(db, url):
    global add
    global urls

    try:
        if not re.search('^http://', url):
            url = siteurl + "/" + url
            url = "http://" + url.replace("//", "/")

        request = urllib.request.Request(url)
        request.add_header('User-Agent', 'Flowgen/1.0 (http://floft.net)')
        page = urllib.request.urlopen(request)
        html = page.read().decode("utf-8")
        page.close()

        print("Notice: processing {}".format(url))

        # get urls
        linkOpenTag, linkCloseTag = makeHTMLTags("a")
        link = linkOpenTag + SkipTo(linkCloseTag).setResultsName("body") + linkCloseTag.suppress()
        for toks, strt, end in link.scanString(html):
            newurl = toks.startA.href
            if newurl not in urls and newurl not in visited:
                if re.search('^(/|http://' + siteurl + ')', newurl) and not \
                   re.search('(jpg|png|flac|mp3|zip|pdf)$', newurl):
                    urls.append(newurl)

        # get title
        try:
            title = re.search('<title>([^<]*)</title>', html).groups()[0]
        except:
            title = "Untitled"

        # get text
        xml = lxml.html.document_fromstring(html.replace(">", "> ").replace("<", " <"))
        text = xml.cssselect('body')[0].text_content().replace("\n", " ").strip()

        # add to database
        add.append([time(), title, url, text])
    except:
        print("Error: {} does not load".format(url))
def get_hourly_data(i):
    url = "http://www.imd.gov.in/section/nhac/aws/aws%02d.htm" % i
    html = scraperwiki.scrape(url)
    html = html.replace("&nbsp;", "")  # Lot of strings like this (entity spelling assumed)
    root = lxml.html.fromstring(html)
    date = root.cssselect("p")[0].text_content().split("/")[-1]
    observed_date = dateutil.parser.parse(date + " %02d:00" % i)
    table = root.cssselect("table")[0]
    rows = table.cssselect("tr")
    headers = rows.pop(0)
    headers = [td.text_content() for td in headers.cssselect("td")]
    for row in rows:
        cells = [td.text_content() for td in row.cssselect("td")]
        rec = dict(zip(headers, cells))
        rec["observed_date"] = observed_date
        rec["station_name"] = rec["Name"]
        del rec["Name"]
        del rec["S.No"]
        utils.save(rec)
def get_hourly_data(i):
    url = 'http://www.imd.gov.in/section/nhac/aws/aws%02d.htm' % i
    html = scraperwiki.scrape(url)
    html = html.replace('&nbsp;', '')  # Lot of strings like this (entity spelling assumed)
    root = lxml.html.fromstring(html)
    date = root.cssselect('p')[0].text_content().split('/')[-1]
    observed_date = dateutil.parser.parse(date + ' %02d:00' % i)
    table = root.cssselect('table')[0]
    rows = table.cssselect('tr')
    headers = rows.pop(0)
    headers = [td.text_content() for td in headers.cssselect('td')]
    for row in rows:
        cells = [td.text_content() for td in row.cssselect('td')]
        rec = dict(zip(headers, cells))
        rec['observed_date'] = observed_date
        rec['station_name'] = rec['Name']
        del rec['Name']
        del rec['S.No']
        utils.save(rec)
def convert_links(self, html, vals, blacklist=None):
    for match in re.findall(URL_REGEX, html):
        short_schema = self.env['ir.config_parameter'].sudo().get_param('web.base.url') + '/r/'

        href = match[0]
        long_url = match[1]

        vals['url'] = utils.unescape(long_url)

        if not blacklist or not [s for s in blacklist if s in long_url] and not long_url.startswith(short_schema):
            link = self.create(vals)
            shorten_url = self.browse(link.id)[0].short_url

            if shorten_url:
                new_href = href.replace(long_url, shorten_url)
                html = html.replace(href, new_href)

    return html
def download_html(url):
    html = ""
    try:
        time.sleep(random.randint(1, 2))
        req_header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) \
            AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                      'Accept': 'text/html;q=0.9,*/*;q=0.8',
                      'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                      'Accept-Encoding': 'gzip',
                      'Connection': 'close',
                      'Referer': None
                      }
        req_timeout = 10
        req = urllib2.Request(url, None, req_header)
        response = urllib2.urlopen(req, None, req_timeout)
        html = response.read()
        html = html.replace("&amp;", "&")  # assumed entity spelling; the literal was decoded away
        return html
    except:
        return ""
def url2lxml(self, url, xml=False):
    cache = getattr(self, '_url_cache', {})
    self._url_cache = cache

    if url in cache:
        return cache[url]

    if xml:
        xml = self.urlopen(url)
        doc = lxml.etree.fromstring(xml.bytes)
    else:
        html = self.urlopen(url)
        html = html.replace('\x00', '')
        try:
            doc = lxml.html.fromstring(html)
        except lxml.etree.XMLSyntaxError:
            return None
    doc.make_links_absolute(url)
    cache[url] = doc
    return doc
def get_dms(url):
    if url == "" or url is None:
        return {"lat": "", "lon": ""}
    try:
        html = scraperwiki.scrape("http://en.wikipedia.org" + url, None, user_agent)
    except:
        return {"lat": "", "lon": ""}
        pass
    # html = html.replace("<html ", '<html xmlns="http://www.w3.org/1999/xhtml" ')
    html = html.replace(
        '<meta charset="UTF-8" />',
        '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />')
    root = lxml.html.document_fromstring(html)
    dms = root.xpath("//span[@class='geo-dms']")
    if dms != []:
        dms = dms[0].xpath("span")
        return {"lat": dms[0].text_content(), "lon": dms[1].text_content()}
    else:
        return {"lat": "", "lon": ""}