def getData(url):
    page = requests.get(url)
    doc = lh.fromstring(page.content, parser=lh.HTMLParser(remove_comments=True))
    tr_elements = doc.xpath('//tr')

    # Build (header, values) pairs from the first table row
    col = []
    i = 0
    for t in tr_elements[0]:
        i += 1
        name = t.text_content()
        col.append((name, []))

    # Fill each column from the remaining rows, converting numeric cells to float
    for j in range(1, len(tr_elements)):
        T = tr_elements[j]
        i = 0
        for t in T.iterchildren():
            data = t.text_content()
            if i > 0:
                try:
                    data = float(data)
                except:
                    pass
            col[i][1].append(data)
            i += 1

    Dict = {title: column for (title, column) in col}
    df = pd.DataFrame(Dict)
    return df
def save(self, value, xpath=None):
    """ Update a view section. The view section may embed fields to write

    :param str xpath: valid xpath to the tag to replace
    """
    arch_section = html.fromstring(
        value, parser=html.HTMLParser(encoding='utf-8'))

    if xpath is None:
        # value is an embedded field on its own, not a view section
        self.save_embedded_field(arch_section)
        return

    for el in self.extract_embedded_fields(arch_section):
        self.save_embedded_field(el)
        # transform embedded field back to t-field
        el.getparent().replace(el, self.to_field_ref(el))

    for view in self:
        arch = view.replace_arch_section(xpath, arch_section)
        view.write({'arch': view._pretty_arch(arch)})

    self.sudo().mapped('model_data_id').write({'noupdate': True})
def __init__(self, html_file, preamble, tmp_dir, cache_dir, img_dir):
    self.html_file = html_file
    with open(self.html_file) as fobj:
        self.html_data = fobj.read()

    parser = html.HTMLParser(remove_comments=True)
    self.dom = html.parse(StringIO(self.html_data), parser=parser)

    self.to_replace = []
    self.preamble = preamble

    self.basename = b64_hash(self.html_data)
    self.base_dir = os.path.join(tmp_dir, self.basename)
    self.pdf_dir = os.path.join(self.base_dir, 'pdf')
    self.svg_dir = os.path.join(self.base_dir, 'svg')
    self.out_img_dir = os.path.join(tmp_dir, 'img')
    self.cache_dir = cache_dir

    os.makedirs(self.base_dir, exist_ok=True)
    os.makedirs(self.pdf_dir, exist_ok=True)
    os.makedirs(self.svg_dir, exist_ok=True)

    link_dest = os.path.join(self.pdf_dir, 'figure-images')
    if not os.path.exists(link_dest):
        os.symlink(os.path.realpath(img_dir), link_dest,
                   target_is_directory=True)

    self.latex_file = os.path.join(self.pdf_dir, self.basename + '.tex')
    self.pdf_file = os.path.join(self.pdf_dir, self.basename + '.pdf')
    self.boxsize_file = os.path.join(self.pdf_dir, 'boxsize.txt')

    self.pages_extents = []
    self.num_pages = 0
    self.fonts = {}
    self.font_hashes = {}
    self.images = []
    self.contents = ''
    self.html_cache = None
    self.path_classes = CSSClasses()
    self.tspan_classes = CSSClasses()
def parse_consultor_file(file_path, html_encoding='utf-8'):
    """
    :param file_path: path to the file we want to parse as a consultor
    :param html_encoding: html encoding of the file
    :return: OrgConsultor instance
    """
    # Xpaths for infos
    name_xpath = '//div[@class="name"]/text()'
    picture_xpath = '//div[@class="personBox"]/div/img/@src'
    attribute_infos_xpath = '//div[@class="profileColumn"]/div[contains(@class, "item") and contains(@style, "block")]'
    attribute_xpath = './@class'  # Relative to attribute_infos_xpath
    label_xpath = './label'
    info_xpath = './span'  # Relative to attribute_infos_xpath
    boxes_xpath = '//div[contains(@class,"commonBox")]'
    box_attribute_xpath = './h2/text()'  # Relative to boxes_xpath
    box_info_xpath = './div'  # Relative to boxes_xpath

    consultor_id_pattern = re.compile(r'(?P<id>\d+).html$')

    title_to_prop_dict = {
        u'adress': 'address',
        u'cellular': 'mobile',
        u'city': 'city',
        u'email': 'email',
        u'phone': 'phone',
        u'fax': 'fax',
        u'Case Study': 'case_studies',
        u'אודותי': 'about',
        u'כתובת האתר': 'website',
        u'פרסומים': 'publishes',
        u'רשימת לקוחות': 'customers',
        u'תחומי היתמחות': 'expertise',
        u'תחומי עיסוק': 'practicing_areas',
        u'תחומים': 'areas'
    }

    consultor_id_match = consultor_id_pattern.search(file_path)
    if consultor_id_match:
        consultor_id = consultor_id_match.group('id')
    else:
        consultor_id = ''

    with open(file_path, 'rb') as consultor_file:  # Open it and parse
        current_consultor = OrgConsultor(consultor_id=consultor_id)
        tree = html.fromstring(consultor_file.read(),
                               parser=html.HTMLParser(encoding=html_encoding))

        # Manual parsing
        picture_url_result = tree.xpath(picture_xpath)
        if picture_url_result:
            current_consultor.pic_url = unicode(picture_url_result[0])

        name_result = tree.xpath(name_xpath)
        if name_result:
            current_consultor.full_name = unicode(name_result[0])

        # Get all attribute values from all xpaths
        info_attribute_values = tree.xpath(attribute_infos_xpath)
        for attribute_value in info_attribute_values:
            attribute_result = attribute_value.xpath(attribute_xpath)
            value_result = attribute_value.xpath(info_xpath)
            if value_result:
                attribute = unicode(attribute_result[0].split()[1])
                value = value_result[0].text_content()
                if len(attribute_value.xpath(label_xpath)) == 0:
                    # <span>ATTRIBUTE: VALUE</span> style
                    value_result_match = re.search(
                        r'^(?:.*?)\:\s*(?P<value>.*?)$', value, re.MULTILINE)
                    if value_result_match:
                        value = value_result_match.group('value')
                value = unicode(value)
                if attribute in title_to_prop_dict.keys():
                    actual_attribute = title_to_prop_dict[attribute]
                    setattr(current_consultor, actual_attribute, value)
                else:
                    print 'COULD NOT FIND ATTRIBUTE FOR', attribute
            else:
                pass  # Simply means it's an empty string

        boxes_results = tree.xpath(boxes_xpath)
        for box_atribute_value in boxes_results:
            attribute_result = box_atribute_value.xpath(box_attribute_xpath)
            value_result = box_atribute_value.xpath(box_info_xpath)
            if value_result:
                attribute = unicode(attribute_result[0])
                value = unicode(value_result[0].text_content())
                value = clean_string(value)
                if attribute in title_to_prop_dict.keys():
                    actual_attribute = title_to_prop_dict[attribute]
                    setattr(current_consultor, actual_attribute, value)
                else:
                    if attribute != u'פרטי התקשרות':
                        print 'COULD NOT FIND ATTRIBUTE FOR', attribute
            else:
                pass  # Simply means it's an empty string

    if current_consultor.full_name is not None:
        return current_consultor
    return None
def html_parser_for(browser, element_mixins):
    "Return an HTMLParser linked to *browser* and powered by *element_mixins*."
    parser = lxml_html.HTMLParser()
    parser.set_element_class_lookup(ElementLookup(browser, element_mixins))
    return parser
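A possible usage sketch for the helper above. `my_browser` and `MIXINS` are hypothetical placeholders for whatever the surrounding project supplies, and `lxml_html` is assumed to be `lxml.html` imported under that alias.

parser = html_parser_for(my_browser, MIXINS)  # browser-aware parser
doc = lxml_html.fromstring('<a href="/next">next</a>', parser=parser)
# element classes now come from ElementLookup, so they can carry browser behaviour
print(doc.get('href'))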
def main():
    parser = argparse.ArgumentParser(prog='index2ddg.py')
    parser.add_argument(
        'index', type=str,
        help='The path to the XML index containing identifier data')
    parser.add_argument(
        'reference', type=str,
        help='The path to the downloaded reference (reference directory in '
             'the downloaded archive)')
    parser.add_argument('output', type=str,
                        help='The path to destination output.txt file')
    parser.add_argument(
        '--split_code_snippets', action='store_true', default=False,
        help='Puts each declaration into a separate code snippet.')
    parser.add_argument(
        '--max_code_lines', type=int, default=6,
        help='Maximum number of lines of code to show in abstract')
    parser.add_argument(
        '--max_sentences', type=int, default=1,
        help='Maximum number of sentences to use for the description')
    parser.add_argument(
        '--max_characters', type=int, default=200,
        help='Maximum number of characters to use for the description')
    parser.add_argument(
        '--max_paren_chars', type=int, default=40,
        help='Maximum size of parenthesized text in the description. '
             'Parenthesized chunks longer than that are removed, unless they '
             'are within <code>, <b> or <i> tags')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Enables debug mode.')
    parser.add_argument(
        '--debug_ident', type=str, default=None,
        help='Processes only the identifiers that match debug_ident')
    parser.add_argument(
        '--debug_abstracts_path', type=str, default=None,
        help='Path to print the abstracts before newline stripping occurs')

    args = parser.parse_args()

    # If the second argument is 'debug', the program switches to debug mode
    # and prints everything to stdout. If the third argument is provided, the
    # program processes only the identifiers that match the provided string
    debug = DDGDebug(args.debug, args.debug_ident, args.debug_abstracts_path)

    index_file = args.index
    output_file = args.output

    # a map that stores information about location and type of identifiers
    # it's a two-level map: full_link maps to a dict that maps full_name to an
    # ITEM_TYPE_* value
    ident_map = {}

    # get a list of pages to analyze
    tr = Index2DuckDuckGoList(ident_map)
    tr.transform_file(index_file)

    # get a mapping between titles and pages
    # link_map = dict { title -> filename }
    link_map = build_link_map(args.reference)

    # create a list of processing instructions for each page
    proc_ins = get_processing_instructions(ident_map, link_map)

    # sort proc_ins to produce ordered output.txt
    proc_ins = [v for v in proc_ins.values()]
    proc_ins = sorted(proc_ins, key=lambda x: x['link'])

    for page in proc_ins:
        idents = page['idents']
        idents = [v for v in idents.values()]
        idents = sorted(idents, key=lambda x: x['ident'])
        page['idents'] = idents

    redirects = []

    out = open(output_file, 'w', encoding='utf-8')

    # i=1
    for page in proc_ins:
        idents = page['idents']
        link = page['link']
        fn = page['fn']

        if debug.should_skip_ident([i['ident'] for i in idents]):
            continue

        # print(str(i) + '/' + str(len(proc_ins)) + ': ' + link)
        # i+=1

        root = e.parse(os.path.join(args.reference, fn),
                       parser=html.HTMLParser())

        for ident in idents:
            item_ident = ident['ident']
            item_type = ident['type']

            process_identifier(out, redirects, root, link,
                               item_ident, item_type, args, debug=debug)

    output_redirects(out, redirects)

    if debug.enabled:
        print('=============================')
        print('Numbers of lines used:')
        for i, l in enumerate(debug.stat_line_nums):
            print(str(i) + ': ' + str(l) + ' result(s)')
                                timeout=TIMEOUT,
                                ca_certs=certifi.where(),
                                num_pools=50)
NO_CERT_POOL = urllib3.PoolManager(retries=RETRY_STRATEGY,
                                   timeout=TIMEOUT,
                                   cert_reqs='CERT_NONE',
                                   num_pools=50)

USER_AGENT = 'trafilatura/' + __version__ + ' (+https://github.com/adbar/trafilatura)'
DEFAULT_HEADERS = {
    'User-Agent': USER_AGENT,
}

# collect_ids=False, default_doctype=False, huge_tree=True,
HTML_PARSER = html.HTMLParser(remove_comments=True, remove_pis=True, encoding='utf-8')
RECOVERY_PARSER = html.HTMLParser(remove_comments=True, remove_pis=True)

UNICODE_WHITESPACE = re.compile(r'''
    \u00A0|\u1680|\u2000|\u2001|\u2002|\u2003|\u2004|\u2005|\u2006|\u2007|
    \u2008|\u2009|\u200a|\u2028|\u2029|\u202F|\u205F|\u3000
''')

NO_TAG_SPACE = re.compile(r'(?<![p{P}>])\n')
SPACE_TRIMMING = re.compile(r'\s+', flags=re.UNICODE | re.MULTILINE)

NOPRINT_TRANS_TABLE = {
    i: None for i in range(0, sys.maxunicode + 1)
    if not chr(i).isprintable() and not chr(i) in (' ', '\t', '\n')
def get_html_requests(self):
    parser = html.HTMLParser(encoding='utf-8')
    page = html.fromstring(self.resp.content, parser=parser)
    return page
def czce_scrape(year=2018, month=6, day=11):
    from lxml import html
    url = 'http://www.czce.com.cn/portal/DFSStaticFiles/Future/' \
          '{year:>04}/{year:>04}{month:>02}{day:>02}/FutureDataHolding.htm'.format(
              year=year, month=month, day=day)
    parser = html.HTMLParser(encoding='gbk')
    # root = html.document_fromstring(content, parser=parser)
    try:
        tree = html.parse(url, parser=parser)
    except Exception as e:
        print(e)
        return
    # tree.docinfo
    table = tree.findall('//table')[1]
    data = [[td.text_content().strip() for td in row.findall('td')]
            for row in table.findall('tr')]
    df = pd.DataFrame(data)
    df[0] = df[0].str.replace('\xa0', '')
    print(df)

    form_header = ['品种', '合约', '合计']
    temp_index = []
    start_index = []

    import collections
    header_index_dict = collections.OrderedDict()
    for a_header in form_header:
        a_index_list = df.index[df[0].str.contains(a_header)].tolist()
        if not a_index_list:
            continue
        header_index_dict.update({a_header: a_index_list})

    if '品种' in header_index_dict:
        contracts = header_index_dict['品种']
        if '合计' not in header_index_dict:
            print("Error")
        end_index = header_index_dict['合计']
        for beg, end in zip(contracts, end_index):
            t_df = df[beg:end]
            t_df = t_df.applymap(lambda x: x.replace(',', '') if x else x)
            t_df.reset_index(inplace=True, drop=True)
            print(t_df)
            h_str = t_df.iat[0, 0]  # '品种:苹果AP 日期:2018-05-23'
            import re
            # m = re.match(
            #     r"品种:(?P<productname>[\u4e00-\u9fa5]+)(?P<instrumentid>[a-zA-Z]+)\W*日期:(?P<date>[\d-]+)", h_str)
            m = re.match(
                r"品种:(?P<productname>[\u4e00-\u9fa5]+)?(?P<instrumentid>[a-zA-Z]+)\W*日期:(?P<date>[\d-]+)",
                h_str)
            t_instrumentid = m.group('instrumentid')
            t_productname = m.group('productname')
            if not t_productname:
                t_productname = t_instrumentid
            t_date = m.group('date')
            # productname 铜  instrumentid cu1804
            t_df = t_df.drop([0, 1])
            t_df['instrumentid'] = t_instrumentid
            t_df['productname'] = t_productname
            t_df['date'] = t_date
            print(t_df)
            col_names = [
                'RANK',
                'PARTICIPANTABBR1', 'CJ1', 'CJ1_CHG',
                'PARTICIPANTABBR2', 'CJ2', 'CJ2_CHG',
                'PARTICIPANTABBR3', 'CJ3', 'CJ3_CHG',
                'INSTRUMENTID', 'PRODUCTNAME', 'DATE'
            ]
            t_df.columns = col_names
            t_df['VARIETY'] = True
            print(t_df)
            from db_insert2 import set_ranks_df
            set_ranks_df(t_df, year=year, month=month, day=day, exchange='CZCE')

    if '合约' in header_index_dict:
        instruments_index = header_index_dict['合约']
        len_instruments_index = len(instruments_index)
        if '合计' not in header_index_dict:
            print("Error")
        end_index = header_index_dict['合计']
        end_index = end_index[-len_instruments_index:]
        for beg, end in zip(instruments_index, end_index):
            t_df = df[beg:end]
            t_df = t_df.applymap(lambda x: x.replace(',', '') if x else x)
            t_df.reset_index(inplace=True, drop=True)
            print(t_df)
            h_str = t_df.iat[0, 0]
            import re
            # m = re.match(
            #     r"品种:(?P<productname>[\u4e00-\u9fa5]+)(?P<instrumentid>[a-zA-Z]+)\W*日期:(?P<date>[\d-]+)", h_str)
            m = re.match(
                r"合约:(?P<productname>[\u4e00-\u9fa5]+)?(?P<instrumentid>[a-zA-Z\d]+)\W*日期:(?P<date>[\d-]+)",
                h_str)
            t_instrumentid = m.group('instrumentid')
            t_productname = m.group('productname')
            if not t_productname:
                t_productname = "EMPTY"
            t_date = m.group('date')
            # productname 铜  instrumentid cu1804
            t_df = t_df.drop([0, 1])
            t_df['INSTRUMENTID'] = t_instrumentid
            t_df['PRODUCTNAME'] = t_productname
            print(t_df)
            col_names = [
                'RANK',
                'PARTICIPANTABBR1', 'CJ1', 'CJ1_CHG',
                'PARTICIPANTABBR2', 'CJ2', 'CJ2_CHG',
                'PARTICIPANTABBR3', 'CJ3', 'CJ3_CHG',
                'INSTRUMENTID', 'PRODUCTNAME',
            ]
            t_df.columns = col_names
            t_df['VARIETY'] = False
            print(t_df)
            from db_insert2 import set_ranks_df
            set_ranks_df(t_df, year=year, month=month, day=day,
                         exchange='CZCE')
    return
def parse(self, url, postdata=None):
    """
    Parse a URL and return an etree ElementRoot.
    Assumes UTF-8 encoding
    """
    return html.parse(self.get(url, postdata),
                      parser=html.HTMLParser(encoding='utf-8'))
def clean(content):
    head_pos = content.find('<head>')
    # insert the encoding of the file
    content = content[:head_pos + 6] + \
        '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">' + \
        content[head_pos + 6:]
    article = Extractor(content, loglevel=logging.INFO).extracted()
    if article is None:
        print("Error processing html file.")
        sys.exit(1)

    html_parser = html.HTMLParser(encoding="utf-8")
    html_doc = html.fromstring(content, parser=html_parser)
    head_doc = html_doc.find('head')

    source_url = head_doc.cssselect('meta[property="og:url"]')[0].get('content')
    title = html_doc.find('.//title').text_content()

    # Replace language tab titles with plain headings
    article = article.replace('<h1 class="tabtitle">C++</h1>',
                              '<p><strong>C++</strong></p>')
    article = article.replace('<h1 class="tabtitle">C</h1>',
                              '<p><strong>C</strong></p>')
    article = article.replace('<h1 class="tabtitle">C/C++</h1>',
                              '<p><strong>C/C++</strong></p>')
    article = article.replace('<h1 class="tabtitle">Java</h1>',
                              '<p><strong>Java</strong></p>')
    article = article.replace('<h1 class="tabtitle">Python</h1>',
                              '<p><strong>Python</strong></p>')

    # if the title is unfortunately removed by boilerpipy, then add it back in
    if "h2" not in article:
        article = "<h1>" + title[:title.rfind('-')] + "</h1>" + article
    reconstructed_body = "<html><body>" + article.replace(
        "<h2", "<h1").replace("</h2>", "</h1>") + "</body></html>"
    if "<body><h1>" not in reconstructed_body:
        reconstructed_body = reconstructed_body.replace(
            "<body>", "<body><h1>" + title[:title.rfind('-')] + "</h1>")

    source_header_string = "<h3>Source</h3>"
    source_link = "<p><a href='" + source_url + "' rel='tag'>" + source_url + "</a></p>"

    # further remove useless stuff
    body_doc = html.fromstring(reconstructed_body).find('body')
    for bad in body_doc.xpath("//div[@class='comments-main']"):
        bad.getparent().remove(bad)
    for ad_by_google in body_doc.xpath("//ins[@class='adsbygoogle']"):
        ad_by_google.getparent().remove(ad_by_google)
    for bad_h3 in body_doc.xpath("//h3"):
        bad_h3.getparent().remove(bad_h3)
    for pre_tag in body_doc.xpath("//pre"):
        if 'class' in pre_tag.attrib:
            pre_tag.attrib.pop('class')
        if 'title' in pre_tag.attrib:
            pre_tag.attrib.pop('title')

    post_content_doc = body_doc.xpath("//div[@class='entry-content']")[0]
    post_content_doc.append(lxml.etree.XML(source_header_string))
    post_content_doc.append(lxml.etree.XML(source_link))

    result = html.tostring(body_doc)
    # wrap <pre> contents in <code> tags for styling later.
    result = result.replace('<pre>', '<pre> <code>').replace('</pre>', '</code> </pre>')
    return result
def extract_from_html(msg_body):
    """
    Extract not quoted message from provided html message body
    using tags and plain text algorithm.

    Cut out the 'blockquote', 'gmail_quote' tags.
    Cut Microsoft quotations.

    Then use plain text algorithm to cut out splitter or leftover quotation.
    This works by adding checkpoint text to all html tags, then converting
    html to text, then extracting quotations from text, then checking deleted
    checkpoints, then deleting necessary tags.
    """
    if msg_body.strip() == '':
        return msg_body

    html_tree = html.document_fromstring(
        msg_body, parser=html.HTMLParser(encoding="utf-8"))

    cut_quotations = (html_quotations.cut_gmail_quote(html_tree) or
                      html_quotations.cut_blockquote(html_tree) or
                      html_quotations.cut_microsoft_quote(html_tree) or
                      html_quotations.cut_by_id(html_tree) or
                      html_quotations.cut_from_block(html_tree))

    html_tree_copy = deepcopy(html_tree)

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False for i in xrange(number_of_checkpoints)]
    msg_with_checkpoints = html.tostring(html_tree)

    h = html2text.HTML2Text()
    h.body_width = 0  # generate plain text without wrap

    # html2text adds unnecessary star symbols. Remove them.
    # Mask star symbols
    msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432')
    plain_text = h.handle(msg_with_checkpoints)
    # Remove created star symbols
    plain_text = plain_text.replace('*', '')
    # Unmask saved star symbols
    plain_text = plain_text.replace('3423oorkg432', '*')

    delimiter = get_delimiter(plain_text)

    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return msg_body

    # Collect checkpoints on each line
    line_checkpoints = [
        [
            int(i[4:-4])  # Only checkpoint number
            for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)
        ]
        for line in lines
    ]

    # Remove checkpoints
    lines = [
        re.sub(html_quotations.CHECKPOINT_PATTERN, '', line)
        for line in lines
    ]

    # Use plain text quotation extracting algorithm
    markers = mark_message_lines(lines)
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags

    if lines_were_deleted:
        # collect checkpoints from deleted lines
        for i in xrange(first_deleted, last_deleted):
            for checkpoint in line_checkpoints[i]:
                quotation_checkpoints[checkpoint] = True
    else:
        if cut_quotations:
            return html.tostring(html_tree_copy)
        else:
            return msg_body

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(html_tree_copy, 0, quotation_checkpoints)

    return html.tostring(html_tree_copy)
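A minimal, self-contained toy sketch of the checkpoint idea the docstring above describes (not talon's actual `html_quotations` helpers): mark every element, decide which marks belong to quoted content, then delete those elements from the tree.

from lxml import html

def _toy_checkpoints(tree):
    # tag every element with a marker so it can be found again later
    for n, el in enumerate(tree.iter()):
        el.set('data-checkpoint', str(n))
    return tree

doc = _toy_checkpoints(html.fromstring(
    '<div><p>reply</p><blockquote>quoted text</blockquote></div>'))
quoted = {el.get('data-checkpoint') for el in doc.iter('blockquote')}
for el in list(doc.iter()):
    if el.get('data-checkpoint') in quoted:
        el.getparent().remove(el)  # drop elements judged to be quotation
print(html.tostring(doc))  # the blockquote is gone, the reply remains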
def clean(file_name, directory="."):
    basename = os.path.basename(file_name)
    content = codecs.open(file_name, "r", 'utf-8').read()
    head_pos = content.find('<head>')
    # insert the encoding of the file
    content = content[:head_pos + 6] + \
        '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">' + \
        content[head_pos + 6:]
    article = Extractor(content, loglevel=logging.INFO).extracted()
    if article is None:
        print "Error processing html file"
        sys.exit(1)

    html_parser = html.HTMLParser(encoding="utf-8")
    html_doc = html.fromstring(content, parser=html_parser)
    head_doc = html_doc.find('head')

    published_time = head_doc.cssselect(
        'meta[property="article:published_time"]')[0].get('content')[:-6]
    print published_time

    cleaned_file = os.path.splitext(
        basename)[0] + "_" + published_time + "_cleaned.html"
    # don't clean files that already have been cleaned
    if os.path.isfile(cleaned_file):
        return

    source_url = head_doc.cssselect('meta[property="og:url"]')[0].get('content')
    title = html_doc.find('.//title').text_content()

    # if the title is unfortunately removed by boilerpipy, then add it back in
    if "h2" not in article:
        article = "<h1>" + title[:title.rfind('-')] + "</h1>" + article
    reconstructed_body = "<html><body>" + article.replace(
        "<h2", "<h1").replace("</h2>", "</h1>") + "</body></html>"

    source_header_string = "<h3>Source</h3>"
    source_link = "<p><a href='" + source_url + "' rel='tag'>" + source_url + "</a></p>"

    # further remove useless stuff
    body_doc = html.fromstring(reconstructed_body).find('body')
    for bad in body_doc.xpath("//div[@class='comments-main']"):
        bad.getparent().remove(bad)
    for ad_by_google in body_doc.xpath("//ins[@class='adsbygoogle']"):
        ad_by_google.getparent().remove(ad_by_google)
    for bad_h3 in body_doc.xpath("//h3"):
        bad_h3.getparent().remove(bad_h3)
    for pre_tag in body_doc.xpath("//pre"):
        if 'class' in pre_tag.attrib:
            pre_tag.attrib.pop('class')
        if 'title' in pre_tag.attrib:
            pre_tag.attrib.pop('title')

    post_content_doc = body_doc.xpath("//div[@class='entry-content']")[0]
    post_content_doc.append(lxml.etree.XML(source_header_string))
    post_content_doc.append(lxml.etree.XML(source_link))

    result = html.tostring(body_doc)
    # wrap <pre> contents in <code> tags for styling later.
    result = result.replace('<pre>', '<pre> <code>').replace('</pre>', '</code> </pre>')

    with open(directory + cleaned_file, 'w') as cleaned_file_handle:
        cleaned_file_handle.write(result.encode('utf-8'))
        raise KeyError(_("Cannot find job attempt '%(id)s'.") % {'id': job.jobId}, e)
    except Exception, e:
        raise Exception(_("Failed to get application for job %s: %s") % (job.jobId, e))

    if log_link:
        link = '/%s/' % name
        params = {}
        if offset != 0:
            params['start'] = offset

        root = Resource(get_log_client(log_link),
                        urlparse.urlsplit(log_link)[2],
                        urlencode=False)
        api_resp = None

        try:
            api_resp = root.get(link, params=params)
            log = html.fromstring(api_resp, parser=html.HTMLParser()).xpath(
                '/html/body/table/tbody/tr/td[2]')[0].text_content()

            response['status'] = 0
            response['log'] = LinkJobLogs._make_hdfs_links(log)
        except Exception, e:
            response['log'] = _('Failed to retrieve log: %s' % e)
            try:
                debug_info = '\nLog Link: %s' % log_link
                if api_resp:
                    debug_info += '\nHTML Response: %s' % response
                response['debug'] = debug_info
                LOG.error(debug_info)
            except:
                LOG.exception('failed to create debug info')

    return JsonResponse(response)
# coding=utf-8
import api

# Quick scrape of the class-schedule form
from lxml import html
import requests
from requests.exceptions import ConnectionError

while True:
    try:
        pagina = html.fromstring(requests.get(
            'https://guayacan.uninorte.edu.co/4PL1CACI0N35/registro/consulta_horarios.php'
        ).content, parser=html.HTMLParser(encoding='utf-8'))
        niveles = {
            nivel.get('value'): nivel.text
            for nivel in pagina.xpath('//select[@name="nivel"]/option')[1:]
        }
        periodos = {
            periodo.get('value'): periodo.text
            for periodo in pagina.xpath('//select[@name="periodo"]/option')[1:]
        }
        break
    except ConnectionError as e:
        print(e)

# Create the Flask application
from flask import Flask, render_template, redirect, url_for, request, jsonify
app = Flask(__name__)

from flask.json import JSONEncoder
from bson import ObjectId
        # print word1
        text_porter = text_porter + " " + Porter.stem(word1)
    return text_porter


def get_stem_text(text):
    m = Mystem()
    text_raw = re.sub(u'[^a-zA-Zа-яА-ЯйЙёЁ _]+', '', text)
    text_stem = m.lemmatize(text_raw)
    text_res = ''.join(text_stem).strip()
    return text_res


if __name__ == "__main__":
    link = 'http://www.mathnet.ru/php/archive.phtml?jrnid=uzku&wshow=issue&bshow=contents&series=0&year=2008&volume=150&issue=3&option_lang=rus&bookID=1000'
    parser = html.HTMLParser()
    root_tree = getXMLTreeByLink(link)
    root = etree.Element('Math-Net')

    year = etree.SubElement(root, 'year')
    year.text = root_tree.xpath(
        "// td[@width='70%'] / span[@class='red'] / font")[0].text_content(
        ).strip().split(' ')[0].replace(',', '')

    articles = etree.SubElement(root, 'articles')
    expresstion_article = "// td[@colspan='2'] / a[contains(@class, 'SLink')]".decode('utf-8')
    index = 0
    for element in root_tree.xpath(expresstion_article):
        article = etree.SubElement(articles, 'article')
        article.set("id", str(index))
def ParseHtml(story, corpus):
  """Parses the HTML of a news story.

  Args:
    story: The raw Story to be parsed.
    corpus: Either 'cnn' or 'dailymail'.

  Returns:
    A Story containing URL, paragraphs and highlights.
  """

  parser = html.HTMLParser(encoding=chardet.detect(story.html)['encoding'])
  tree = html.document_fromstring(story.html, parser=parser)

  # Elements to delete.
  delete_selectors = {
      'cnn': [
          '//blockquote[contains(@class, "twitter-tweet")]',
          '//blockquote[contains(@class, "instagram-media")]'
      ],
      'dailymail': [
          '//blockquote[contains(@class, "twitter-tweet")]',
          '//blockquote[contains(@class, "instagram-media")]'
      ]
  }

  # Paragraph exclusions: ads, links, bylines, comments
  cnn_exclude = (
      'not(ancestor::*[contains(@class, "metadata")])'
      ' and not(ancestor::*[contains(@class, "pullquote")])'
      ' and not(ancestor::*[contains(@class, "SandboxRoot")])'
      ' and not(ancestor::*[contains(@class, "twitter-tweet")])'
      ' and not(ancestor::div[contains(@class, "cnnStoryElementBox")])'
      ' and not(contains(@class, "cnnTopics"))'
      ' and not(descendant::*[starts-with(text(), "Read:")])'
      ' and not(descendant::*[starts-with(text(), "READ:")])'
      ' and not(descendant::*[starts-with(text(), "Join us at")])'
      ' and not(descendant::*[starts-with(text(), "Join us on")])'
      ' and not(descendant::*[starts-with(text(), "Read CNNOpinion")])'
      ' and not(descendant::*[contains(text(), "@CNNOpinion")])'
      ' and not(descendant-or-self::*[starts-with(text(), "Follow us")])'
      ' and not(descendant::*[starts-with(text(), "MORE:")])'
      ' and not(descendant::*[starts-with(text(), "SPOILER ALERT:")])')

  dm_exclude = ('not(ancestor::*[contains(@id,"reader-comments")])'
                ' and not(contains(@class, "byline-plain"))'
                ' and not(contains(@class, "byline-section"))'
                ' and not(contains(@class, "count-number"))'
                ' and not(contains(@class, "count-text"))'
                ' and not(contains(@class, "video-item-title"))'
                ' and not(ancestor::*[contains(@class, "column-content")])'
                ' and not(ancestor::iframe)')

  paragraph_selectors = {
      'cnn': [
          '//div[contains(@class, "cnnContentContainer")]//p[%s]' % cnn_exclude,
          '//div[contains(@class, "l-container")]//p[%s]' % cnn_exclude,
          '//div[contains(@class, "cnn_strycntntlft")]//p[%s]' % cnn_exclude
      ],
      'dailymail': [
          '//div[contains(@class, "article-text")]//p[%s]' % dm_exclude
      ]
  }

  # Highlight exclusions.
  he = ('not(contains(@class, "cnnHiliteHeader"))'
        ' and not(descendant::*[starts-with(text(), "Next Article in")])')
  highlight_selectors = {
      'cnn': [
          '//*[contains(@class, "el__storyhighlights__list")]//li[%s]' % he,
          '//*[contains(@class, "cnnStryHghLght")]//li[%s]' % he,
          '//*[@id="cnnHeaderRightCol"]//li[%s]' % he
      ],
      'dailymail': [
          '//h1/following-sibling::ul//li'
      ]
  }

  def ExtractText(selector):
    """Extracts a list of paragraphs given a XPath selector.

    Args:
      selector: A XPath selector to find the paragraphs.

    Returns:
      A list of raw text paragraphs with leading and trailing whitespace.
    """

    xpaths = map(tree.xpath, selector)
    elements = list(chain.from_iterable(xpaths))
    paragraphs = [e.text_content().encode('utf-8') for e in elements]

    # Remove editorial notes, etc.
    if corpus == 'cnn' and len(paragraphs) >= 2 and '(CNN)' in paragraphs[1]:
      paragraphs.pop(0)

    paragraphs = map(str.strip, paragraphs)
    paragraphs = [s for s in paragraphs if s and not str.isspace(s)]

    return paragraphs

  for selector in delete_selectors[corpus]:
    for bad in tree.xpath(selector):
      bad.getparent().remove(bad)

  paragraphs = ExtractText(paragraph_selectors[corpus])
  highlights = ExtractText(highlight_selectors[corpus])

  content = '\n\n'.join(paragraphs)

  return Story(story.url, content, highlights)
from flask import Blueprint, url_for, redirect, request
import requests
from lxml import html
from .util import replaceahref, rmelement, updelement
from urllib.parse import urljoin
import re

PRE_FIX = 'mh1359'
BASE_URL = 'https://m.mh1359.com/'
mh1359_bp = Blueprint(PRE_FIX, __name__)
myparser = html.HTMLParser(encoding="UTF-8")


@mh1359_bp.route('/')
def index():
    r = requests.get(BASE_URL)
    dom = html.fromstring(r.text, parser=myparser)
    dom = transform(dom)
    return html.tostring(dom, pretty_print=True)


@mh1359_bp.route('/manhua/<idpage>')
def manhua(idpage):
    r = requests.get(urljoin(BASE_URL, 'manhua/' + idpage))
    dom = html.fromstring(r.text, parser=myparser)
    dom = transform(dom)
    return html.tostring(dom, pretty_print=True)


@mh1359_bp.route('/chapter/<idpage>')
def parse_html_stream(f):
    parser = lhtml.HTMLParser(encoding="utf8")
    return lhtml.parse(f, parser)
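A brief usage sketch for the stream helper above; it assumes `lhtml` is `lxml.html` imported under that alias in the surrounding module, and the markup is illustrative.

from io import BytesIO

tree = parse_html_stream(BytesIO(u'<p>caf\u00e9</p>'.encode('utf8')))
# the declared utf8 encoding is applied when decoding the stream
print(tree.getroot().findtext('.//p'))  # café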
padding = None
if args.pad is not None:
    padding = eval("[" + args.pad + "]")
    assert len(padding) in [1, 4], (args.pad, padding)
    if len(padding) == 1:
        padding = padding * 4

tpattern = args.pattern + '.txt'
if args.pattern[-4] == '.':
    tpattern = args.pattern[:-3] + 'txt'

if args.unicodedammit:
    from bs4 import UnicodeDammit
    content = args.file.read()
    doc = UnicodeDammit(content, is_html=True)
    parser = html.HTMLParser(encoding=doc.original_encoding)
    doc = html.document_fromstring(content, parser=parser)
else:
    doc = html.parse(args.file)

pages = doc.xpath('//*[@class="ocr_page"]')
for page in pages:
    iname = get_prop(page, 'file')
    if not iname:
        iname = get_prop(page, 'image')
    if args.basename:
        iname = os.path.join(args.basename, os.path.basename(iname))
    if not os.path.exists(iname):
        print("not found:", iname)
        sys.exit(1)
    image = Image.open(iname)
def get_task_log(self, offset=0):
    logs = []
    attempt = self.task.job.job_attempts['jobAttempt'][-1]
    log_link = attempt['logsLink']

    # Generate actual task log link from logsLink url
    if self.task.job.status in ('NEW', 'SUBMITTED', 'RUNNING') or self.type == 'Oozie Launcher':
        logs_path = '/node/containerlogs/'
        node_url, tracking_path = log_link.split(logs_path)
        container_id, user = tracking_path.strip('/').split('/')

        # Replace log path tokens with actual container properties if available
        if hasattr(self, 'nodeHttpAddress') and 'nodeId' in attempt:
            node_url = '%s://%s' % (node_url.split('://')[0], self.nodeHttpAddress)
        container_id = self.assignedContainerId if hasattr(self, 'assignedContainerId') else container_id

        log_link = '%(node_url)s/%(logs_path)s/%(container)s/%(user)s' % {
            'node_url': node_url,
            'logs_path': logs_path.strip('/'),
            'container': container_id,
            'user': user
        }
    else:  # Completed jobs
        logs_path = '/jobhistory/logs/'
        root_url, tracking_path = log_link.split(logs_path)
        node_url, container_id, attempt_id, user = tracking_path.strip('/').split('/')

        # Replace log path tokens with actual attempt properties if available
        if hasattr(self, 'nodeHttpAddress') and 'nodeId' in attempt:
            node_url = '%s:%s' % (self.nodeHttpAddress.split(':')[0], attempt['nodeId'].split(':')[1])
        container_id = self.assignedContainerId if hasattr(self, 'assignedContainerId') else container_id
        attempt_id = self.attemptId if hasattr(self, 'attemptId') else attempt_id

        log_link = '%(root_url)s/%(logs_path)s/%(node)s/%(container)s/%(attempt)s/%(user)s' % {
            'root_url': root_url,
            'logs_path': logs_path.strip('/'),
            'node': node_url,
            'container': container_id,
            'attempt': attempt_id,
            'user': user
        }

    for name in ('stdout', 'stderr', 'syslog'):
        link = '/%s/' % name

        if self.type == 'Oozie Launcher' and not self.task.job.status == 'FINISHED':
            # Yarn currently dumps with 500 error with doas in running state
            params = {}
        else:
            params = {'doAs': user}

        if int(offset) != 0:
            params['start'] = offset
        else:
            params['start'] = 0

        response = None
        try:
            log_link = re.sub('job_[^/]+', self.id, log_link)
            root = Resource(get_log_client(log_link), urlparse.urlsplit(log_link)[2], urlencode=False)
            response = root.get(link, params=params)
            log = html.fromstring(response, parser=html.HTMLParser()).xpath(
                '/html/body/table/tbody/tr/td[2]')[0].text_content()
        except Exception, e:
            log = _('Failed to retrieve log: %s' % e)
            try:
                debug_info = '\nLog Link: %s' % log_link
                if response:
                    debug_info += '\nHTML Response: %s' % response
                LOG.error(debug_info)
            except:
                LOG.exception('failed to build debug info')

        logs.append(log)
def _parse_file(self, file_name, cloth_parser):
    if cloth_parser is None:
        cloth_parser = html.HTMLParser()

    cloth = html.parse(file_name, parser=cloth_parser)
    return cloth.getroot()
            _("Failed to get application for job %s: %s") % (job.jobId, e))

    if log_link:
        link = '/%s/' % name
        params = {}
        if offset != 0:
            params['start'] = offset

        root = Resource(get_log_client(log_link),
                        urlparse.urlsplit(log_link)[2],
                        urlencode=False)
        api_resp = None

        try:
            api_resp = root.get(link, params=params)
            log = html.fromstring(api_resp, parser=html.HTMLParser()).xpath(
                '/html/body/table/tbody/tr/td[2]')[0].text_content()

            response['status'] = 0
            response['log'] = LinkJobLogs._make_hdfs_links(log)
        except Exception, e:
            response['log'] = _('Failed to retrieve log: %s' % e)
            try:
                debug_info = '\nLog Link: %s' % log_link
                if api_resp:
                    debug_info += '\nHTML Response: %s' % response
                response['debug'] = debug_info
                LOG.error(debug_info)
            except:
                LOG.exception('failed to create debug info')
def from_html_cloth(cls, cloth, strip_comments=True):
    retval = cls()
    retval._init_cloth(cloth, cloth_parser=html.HTMLParser(),
                       strip_comments=strip_comments)
    return retval
def parse_lawyer_files_gen(bulk_amount=200, html_encoding='utf-8'):
    """ returns list of bulk_amount Lawyers that were parsed """
    all_file_paths = iglob(
        'C:\Python27\Scripts\Experiments\databases\lawyers\*')
    name_xpath = '//div[@class="screen_name"]//span/text()'
    header_item_xpath = 'div[contains(@class,"reference_item")]'
    item_title_xpath = './div[@class="title"]/text()'
    item_value_xpath = './span'
    title_to_prop_dict = {
        'טלפון': 'phone',
        'פקס': 'fax',
        'נייד': 'mobile',
        'תחום עיסוק': 'specialty',
        'כתובת דוא"ל': 'email',
        'כתובת': 'address',
        'ת.ד': 'po_box',
        'שפה': 'language',
        'נוטריון': 'notary'
    }
    lawyers_lst = []
    lawyer_id_pattern = re.compile(r'laywer_(?P<id>\d+).html$')

    for file_path in all_file_paths:  # Loop over every lawyer file
        # Look for lawyer_id in the file name
        lawyer_id_match = lawyer_id_pattern.search(file_path)
        if lawyer_id_match:
            lawyer_id = lawyer_id_match.group('id')
        else:
            lawyer_id = ''

        with open(file_path, 'rb') as lawyer_file:  # Open it and parse
            html_tree = html.fromstring(
                lawyer_file.read(),
                parser=html.HTMLParser(encoding=html_encoding))
            name = edit_string(html_tree.xpath(name_xpath)[0], html_encoding)  # Lawyer name
            current_lawyer = Lawyer(lawyer_id=lawyer_id, name=name)
            all_headers = html_tree.xpath(header_item_xpath)

            # Loop over every attribute of the lawyer found in the HTML file
            for header in all_headers:
                attribute_name = edit_string(
                    header.xpath(item_title_xpath)[0], html_encoding)
                attribute_values = header.xpath(item_value_xpath)
                attribute_value = ' '.join(
                    map(lambda x: edit_string(x.text_content()),
                        attribute_values))  # join to one string

                # Get the actual attribute name, and set it to the given value
                actual_attribute_name = title_to_prop_dict[attribute_name]
                setattr(current_lawyer, actual_attribute_name, attribute_value)

            lawyers_lst.append(current_lawyer)

        if len(lawyers_lst) == bulk_amount:
            # Yield a list of bulk_amount lawyers
            yield lawyers_lst
            lawyers_lst = []

    yield lawyers_lst  # Final yield
def parse_html_with_encoding(data, encoding='utf-8'):
    parser = html.HTMLParser(encoding=encoding)
    return html.fromstring(data, parser=parser)
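A short usage sketch for the helper above; the byte string is illustrative. The explicit parser encoding overrides whatever lxml would otherwise guess from the document.

doc = parse_html_with_encoding(
    b'<html><body><p>caf\xc3\xa9</p></body></html>', encoding='utf-8')
print(doc.findtext('.//p'))  # café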
def fromstring(s):
    html_parser = html.HTMLParser(encoding='utf-8')
    return html.fromstring(s, parser=html_parser).getroottree().getroot()
def _parse_file(self, file_name, cloth_parser):
    if cloth_parser is None:
        cloth_parser = html.HTMLParser(remove_comments=True)

    cloth = html.parse(file_name, parser=cloth_parser)
    return cloth.getroot()
    except (KeyError, RestException), e:
        raise KeyError(
            _("Cannot find job attempt '%(id)s'.") % {'id': job.jobId}, e)

    link = '/%s/' % name
    params = {}

    if offset and int(offset) >= 0:
        params['start'] = offset

    root = Resource(get_log_client(log_link),
                    urlparse.urlsplit(log_link)[2],
                    urlencode=False)

    debug_info = ''
    try:
        response = root.get(link, params=params)
        log = html.fromstring(response, parser=html.HTMLParser()).xpath(
            '/html/body/table/tbody/tr/td[2]')[0].text_content()
    except Exception, e:
        log = _('Failed to retrieve log: %s' % e)
        try:
            debug_info = '\nLog Link: %s' % log_link
            debug_info += '\nHTML Response: %s' % response
            LOGGER.error(debug_info)
        except:
            LOGGER.exception('failed to create debug info')

    response = {'log': LinkJobLogs._make_hdfs_links(log), 'debug': debug_info}

    return JsonResponse(response)
def main():
    global COUNT
    site_url = "http://www.sanskritlibrary.org/"
    seed_url = "http://www.sanskritlibrary.org/textsList.html"
    titus_url = "http://titus.uni-frankfurt.de"
    p = Page(seed_url)

    a_tags = CSSSelector('a')
    div_tags = CSSSelector('div')
    span_tags = CSSSelector('span')
    body_tags = CSSSelector('body')

    div = [e for e in div_tags(p.dom) if e.get("class") == "text"]
    div = div[0]
    links = [site_url + i.get("href") for i in div.getchildren() if i.tag == 'a']
    print "Links of texts:", len(links)

    source_links = list()
    # Creating list of links
    for l in links:
        lpage = Page(l)
        slinks = [i.get("href") for i in a_tags(lpage.dom) if i.get("target") == "source"]
        source_links += slinks
    print "Links of sources:", len(source_links)  # 134

    source_links = list(set(source_links))
    print "Unique links of sources:", len(source_links)  # 94

    # Considering only ramayana and mahabharat links
    source_links = [i for i in source_links if ("/mbh" in i or "/ram" in i)]
    pp.pprint(source_links)

    b = p.selenium_load()
    for link in source_links:
        lp = link
        print "SOURCE_LINK", link
        while lp:
            try:
                b.get(lp)
                sleep(0.25)
                b.switch_to_frame(b.find_elements_by_tag_name("frame")[0])
                bdom = html.fromstring(b.page_source,
                                       parser=html.HTMLParser(encoding='utf-8'))
                bt = body_tags(bdom)
                if len(bt) == 0:
                    print "No body tag for " + lp
                    continue
                body = bt[0]
                f = open("download/" + lp[lp.rfind("/") + 1:] + ".txt", 'w')
                f.write(body.text_content().encode('utf-8'))
                f.close()
                print "File no. " + str(COUNT) + " created"
                COUNT += 1

                anchors = a_tags(bdom)
                lp = None
                for i in range(len(anchors) - 1, max(0, len(anchors) - 5), -1):
                    if len(anchors[i].getchildren()) == 1 and \
                            anchors[i].getchildren()[0].tag == "img" and \
                            "arribar" in anchors[i].getchildren()[0].get("src"):
                        href = anchors[i].get("href")
                        lp = titus_url + href
                        print i, len(anchors) - i
                        print "New frame:", lp
                        break
            except:
                lp = None