Example #1
def getData(url):
    page = requests.get(url)

    doc = lh.fromstring(page.content, parser = lh.HTMLParser(remove_comments = True))

    tr_elements = doc.xpath('//tr')

    col = []
    i = 0

    for t in tr_elements[0]:
        i+=1
        name=t.text_content()
        col.append((name,[]))




    for j in range(1, len(tr_elements)):
        T = tr_elements[j]

        i = 0

        for t in T.iterchildren():
            data = t.text_content()
            if i>0:
                try:
                    data = float(data)
                except:
                    pass
            col[i][1].append(data)
            i+=1

    Dict={title:column for (title,column) in col}
    df=pd.DataFrame(Dict)
    return df
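
A note on this example: it assumes requests, lxml.html (imported as lh), and pandas (as pd) are available at module level. A minimal, hedged usage sketch under those assumptions, with a placeholder URL:

import requests
import lxml.html as lh
import pandas as pd

# Any page whose first <tr> row holds the column headers works here;
# the URL below is a placeholder, not taken from the original code.
df = getData('https://example.com/some-table.html')
print(df.head())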
Example #2
    def save(self, value, xpath=None):
        """ Update a view section. The view section may embed fields to write

        :param str xpath: valid xpath to the tag to replace
        """
        arch_section = html.fromstring(
            value, parser=html.HTMLParser(encoding='utf-8'))

        if xpath is None:
            # value is an embedded field on its own, not a view section
            self.save_embedded_field(arch_section)
            return

        for el in self.extract_embedded_fields(arch_section):
            self.save_embedded_field(el)

            # transform embedded field back to t-field
            el.getparent().replace(el, self.to_field_ref(el))

        for view in self:
            arch = view.replace_arch_section(xpath, arch_section)
            view.write({'arch': view._pretty_arch(arch)})

        self.sudo().mapped('model_data_id').write({'noupdate': True})
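
A brief, hedged usage sketch for save() above, assuming `view` is a recordset of the model that defines it; the markup and xpath are illustrative only, not taken from the source:

# Hypothetical section markup and target xpath.
new_section = '<div id="wrap"><h1>New title</h1></div>'
view.save(new_section, xpath='//div[@id="wrap"]')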
Example #3
    def __init__(self, html_file, preamble, tmp_dir, cache_dir, img_dir):
        self.html_file = html_file
        with open(self.html_file) as fobj:
            self.html_data = fobj.read()
        parser = html.HTMLParser(remove_comments=True)
        self.dom = html.parse(StringIO(self.html_data), parser=parser)
        self.to_replace = []
        self.preamble = preamble
        self.basename = b64_hash(self.html_data)
        self.base_dir = os.path.join(tmp_dir, self.basename)
        self.pdf_dir = os.path.join(self.base_dir, 'pdf')
        self.svg_dir = os.path.join(self.base_dir, 'svg')
        self.out_img_dir = os.path.join(tmp_dir, 'img')
        self.cache_dir = cache_dir

        os.makedirs(self.base_dir, exist_ok=True)
        os.makedirs(self.pdf_dir, exist_ok=True)
        os.makedirs(self.svg_dir, exist_ok=True)
        link_dest = os.path.join(self.pdf_dir, 'figure-images')
        if not os.path.exists(link_dest):
            os.symlink(os.path.realpath(img_dir),
                       link_dest,
                       target_is_directory=True)

        self.latex_file = os.path.join(self.pdf_dir, self.basename + '.tex')
        self.pdf_file = os.path.join(self.pdf_dir, self.basename + '.pdf')
        self.boxsize_file = os.path.join(self.pdf_dir, 'boxsize.txt')
        self.pages_extents = []
        self.num_pages = 0
        self.fonts = {}
        self.font_hashes = {}
        self.images = []
        self.contents = ''
        self.html_cache = None
        self.path_classes = CSSClasses()
        self.tspan_classes = CSSClasses()
Example #4
def parse_consultor_file(file_path, html_encoding='utf-8'):
    """
    :param file_path: path to the file we want to parse as a consultor
    :param html_encoding: HTML encoding of the file
    :return: OrgConsultor instance, or None if no name was found
    """

    # Xpaths for infos
    name_xpath = '//div[@class="name"]/text()'
    picture_xpath = '//div[@class="personBox"]/div/img/@src'

    attribute_infos_xpath = '//div[@class="profileColumn"]/div[contains(@class, "item") and contains(@style, "block")]'
    attribute_xpath = './@class'  # Relative to attribute_infos_xpath
    label_xpath = './label'
    info_xpath = './span'  # Relative to attribute_infos_xpath

    boxes_xpath = '//div[contains(@class,"commonBox")]'
    box_attribute_xpath = './h2/text()'  # Relative to boxes_xpath
    box_info_xpath = './div'  # Relative to boxes_xpath

    consultor_id_pattern = re.compile(r'(?P<id>\d+)\.html$')

    title_to_prop_dict = {
        u'adress': 'address',
        u'cellular': 'mobile',
        u'city': 'city',
        u'email': 'email',
        u'phone': 'phone',
        u'fax': 'fax',
        u'Case Study': 'case_studies',
        u'אודותי': 'about',
        u'כתובת האתר': 'website',
        u'פרסומים': 'publishes',
        u'רשימת לקוחות': 'customers',
        u'תחומי היתמחות': 'expertise',
        u'תחומי עיסוק': 'practicing_areas',
        u'תחומים': 'areas'
    }

    consultor_id_match = consultor_id_pattern.search(file_path)

    if consultor_id_match:
        consultor_id = consultor_id_match.group('id')
    else:
        consultor_id = ''

    with open(file_path, 'rb') as consultor_file:
        # Open it and parse

        current_consultor = OrgConsultor(consultor_id=consultor_id)

        tree = html.fromstring(consultor_file.read(),
                               parser=html.HTMLParser(encoding=html_encoding))

        # Manual parsing
        picture_url_result = tree.xpath(picture_xpath)
        if picture_url_result:
            current_consultor.pic_url = unicode(picture_url_result[0])

        name_result = tree.xpath(name_xpath)
        if name_result:
            current_consultor.full_name = unicode(name_result[0])

        # Get All attribute values from all xpaths
        info_attribute_values = tree.xpath(attribute_infos_xpath)

        for attribute_value in info_attribute_values:
            attribute_result = attribute_value.xpath(attribute_xpath)
            value_result = attribute_value.xpath(info_xpath)

            if value_result:
                attribute = unicode(attribute_result[0].split()[1])

                value = value_result[0].text_content()

                if len(attribute_value.xpath(label_xpath)) == 0:
                    # <span>ATTRIBUTE: VALUE</span> style
                    value_result_match = re.search(
                        r'^(?:.*?)\:\s*(?P<value>.*?)$', value, re.MULTILINE)
                    if value_result_match:
                        value = value_result_match.group('value')

                value = unicode(value)
                if attribute in title_to_prop_dict.keys():
                    actual_attribute = title_to_prop_dict[attribute]
                    setattr(current_consultor, actual_attribute, value)
                else:
                    print 'COULD NOT FIND ATTRIBUTE FOR', attribute
            else:
                pass  # Simply means it's an empty string

        boxes_results = tree.xpath(boxes_xpath)
        for box_attribute_value in boxes_results:
            attribute_result = box_attribute_value.xpath(box_attribute_xpath)
            value_result = box_attribute_value.xpath(box_info_xpath)

            if value_result:
                attribute = unicode(attribute_result[0])
                value = unicode(value_result[0].text_content())
                value = clean_string(value)
                if attribute in title_to_prop_dict.keys():
                    actual_attribute = title_to_prop_dict[attribute]
                    setattr(current_consultor, actual_attribute, value)
                else:
                    if attribute != u'פרטי התקשרות':
                        print 'COULD NOT FIND ATTRIBUTE FOR', attribute
            else:
                pass  # Simply means it's an empty string

    if current_consultor.full_name is not None:
        return current_consultor
    return None
Example #5
def html_parser_for(browser, element_mixins):
    "Return an HTMLParser linked to *browser* and powered by *element_mixins*."
    parser = lxml_html.HTMLParser()
    parser.set_element_class_lookup(ElementLookup(browser, element_mixins))
    return parser
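
For context, set_element_class_lookup is lxml's hook for mapping parsed nodes onto custom element classes; ElementLookup and the mixins above are project-specific. A self-contained sketch of the same mechanism, assuming only lxml:

from lxml import etree
from lxml import html as lxml_html

class AnchorElement(lxml_html.HtmlElement):
    """Custom element class attached to every <a> node by the parser."""
    @property
    def target_url(self):
        return self.get('href')

class SimpleLookup(etree.CustomElementClassLookup):
    def lookup(self, node_type, document, namespace, name):
        if node_type == 'element' and name == 'a':
            return AnchorElement
        return None  # defer to the fallback lookup below

parser = lxml_html.HTMLParser()
parser.set_element_class_lookup(
    SimpleLookup(lxml_html.HtmlElementClassLookup()))
doc = lxml_html.fromstring('<p><a href="/x">link</a></p>', parser=parser)
print(doc.find('.//a').target_url)  # prints: /x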
Example #6
def main():
    parser = argparse.ArgumentParser(prog='index2ddg.py')

    parser.add_argument(
        'index',
        type=str,
        help='The path to the XML index containing identifier data')

    parser.add_argument(
        'reference',
        type=str,
        help='The path to the downloaded reference (reference directory in '
        'the downloaded archive)')

    parser.add_argument('output',
                        type=str,
                        help='The path to destination output.txt file')

    parser.add_argument(
        '--split_code_snippets',
        action='store_true',
        default=False,
        help='Puts each declaration into a separate code snippet.')

    parser.add_argument(
        '--max_code_lines',
        type=int,
        default=6,
        help='Maximum number of lines of code to show in abstract')

    parser.add_argument(
        '--max_sentences',
        type=int,
        default=1,
        help='Maximum number of sentences to use for the description')

    parser.add_argument(
        '--max_characters',
        type=int,
        default=200,
        help='Maximum number of characters to use for the description')

    parser.add_argument(
        '--max_paren_chars',
        type=int,
        default=40,
        help='Maximum size of parenthesized text in the description. '
        'Parenthesized chunks longer than that are removed, unless they '
        'are within <code>, <b> or <i> tags')

    parser.add_argument('--debug',
                        action='store_true',
                        default=False,
                        help='Enables debug mode.')

    parser.add_argument(
        '--debug_ident',
        type=str,
        default=None,
        help='Processes only the identifiers that match debug_ident')

    parser.add_argument(
        '--debug_abstracts_path',
        type=str,
        default=None,
        help='Path to print the abstracts before newline stripping occurs')
    args = parser.parse_args()

    # If --debug is passed, the program switches to debug mode and prints
    # everything to stdout. If --debug_ident is provided, the program
    # processes only the identifiers that match the provided string.

    debug = DDGDebug(args.debug, args.debug_ident, args.debug_abstracts_path)

    index_file = args.index
    output_file = args.output

    # a map that stores information about the location and type of identifiers.
    # It's a two-level map: full_link maps to a dict that maps full_name to an
    # ITEM_TYPE_* value
    ident_map = {}

    # get a list of pages to analyze
    tr = Index2DuckDuckGoList(ident_map)
    tr.transform_file(index_file)

    # get a mapping between titles and pages
    # linkmap = dict { title -> filename }
    link_map = build_link_map(args.reference)

    # create a list of processing instructions for each page
    proc_ins = get_processing_instructions(ident_map, link_map)

    # sort proc_ins to produce ordered output.txt
    proc_ins = [v for v in proc_ins.values()]
    proc_ins = sorted(proc_ins, key=lambda x: x['link'])

    for page in proc_ins:
        idents = page['idents']
        idents = [v for v in idents.values()]
        idents = sorted(idents, key=lambda x: x['ident'])
        page['idents'] = idents

    redirects = []

    out = open(output_file, 'w', encoding='utf-8')

    # i=1
    for page in proc_ins:
        idents = page['idents']
        link = page['link']
        fn = page['fn']

        if debug.should_skip_ident([i['ident'] for i in idents]):
            continue

        # print(str(i) + '/' + str(len(proc_ins)) + ': ' + link)
        # i+=1

        root = e.parse(os.path.join(args.reference, fn),
                       parser=html.HTMLParser())

        for ident in idents:

            item_ident = ident['ident']
            item_type = ident['type']

            process_identifier(out,
                               redirects,
                               root,
                               link,
                               item_ident,
                               item_type,
                               args,
                               debug=debug)

    output_redirects(out, redirects)

    if debug.enabled:
        print('=============================')
        print('Numbers of lines used:')
        for i, l in enumerate(debug.stat_line_nums):
            print(str(i) + ': ' + str(l) + ' result(s)')
Example #7
                                timeout=TIMEOUT,
                                ca_certs=certifi.where(),
                                num_pools=50)
NO_CERT_POOL = urllib3.PoolManager(retries=RETRY_STRATEGY,
                                   timeout=TIMEOUT,
                                   cert_reqs='CERT_NONE',
                                   num_pools=50)

USER_AGENT = 'trafilatura/' + __version__ + ' (+https://github.com/adbar/trafilatura)'
DEFAULT_HEADERS = {
    'User-Agent': USER_AGENT,
}

# collect_ids=False, default_doctype=False, huge_tree=True,
HTML_PARSER = html.HTMLParser(remove_comments=True,
                              remove_pis=True,
                              encoding='utf-8')
RECOVERY_PARSER = html.HTMLParser(remove_comments=True, remove_pis=True)

UNICODE_WHITESPACE = re.compile(r'''
    \u00A0|\u1680|\u2000|\u2001|\u2002|\u2003|\u2004|\u2005|\u2006|\u2007|
    \u2008|\u2009|\u200a|\u2028|\u2029|\u202F|\u205F|\u3000
    ''')

NO_TAG_SPACE = re.compile(r'(?<![p{P}>])\n')
SPACE_TRIMMING = re.compile(r'\s+', flags=re.UNICODE | re.MULTILINE)

NOPRINT_TRANS_TABLE = {
    i: None
    for i in range(0, sys.maxunicode + 1)
    if not chr(i).isprintable() and not chr(i) in (' ', '\t', '\n')
Example #8
 def get_html_requests(self):
     parser = html.HTMLParser(encoding='utf-8')
     page = html.fromstring(self.resp.content, parser=parser)
     return page
Example #9
def czce_scrape(year=2018, month=6, day=11):
    from lxml import html
    url = 'http://www.czce.com.cn/portal/DFSStaticFiles/Future/' \
          '{year:>04}/{year:>04}{month:>02}{day:>02}/FutureDataHolding.htm'.format(year=year, month=month, day=day)

    parser = html.HTMLParser(encoding='gbk')
    # root = html.document_fromstring(content, parser=parser)
    try:
        tree = html.parse(url, parser=parser)
    except Exception as e:
        print(e)
        return
    # tree.docinfo
    table = tree.findall('//table')[1]
    data = [[td.text_content().strip() for td in row.findall('td')]
            for row in table.findall('tr')]

    df = pd.DataFrame(data, )

    df[0] = df[0].str.replace('\xa0', '')
    print(df)
    form_header = ['品种', '合约', '合计']
    temp_index = []
    start_index = []
    import collections
    header_index_dict = collections.OrderedDict()
    for a_header in form_header:
        a_index_list = df.index[df[0].str.contains(a_header)].tolist()
        if not a_index_list:
            continue
        header_index_dict.update({a_header: a_index_list})

    if '品种' in header_index_dict:
        contracts = header_index_dict['品种']
        if '合计' not in header_index_dict:
            print("Error")
        end_index = header_index_dict['合计']
        for beg, end in zip(contracts, end_index):
            t_df = df[beg:end]
            t_df = t_df.applymap(lambda x: x.replace(',', '') if x else x)
            t_df.reset_index(inplace=True, drop=True)
            print(t_df)
            h_str = t_df.iat[0, 0]
            # '品种:苹果AP 日期:2018-05-23'
            import re
            # m = re.match(
            #     r"品种:(?P<productname>[\u4e00-\u9fa5]+)(?P<instrumentid>[a-zA-Z]+)\W*日期:(?P<date>[\d-]+)", h_str)
            m = re.match(
                r"品种:(?P<productname>[\u4e00-\u9fa5]+)?(?P<instrumentid>[a-zA-Z]+)\W*日期:(?P<date>[\d-]+)",
                h_str)
            t_instrumentid = m.group('instrumentid')
            t_productname = m.group('productname')
            if not t_productname:
                t_productname = t_instrumentid
            t_date = m.group('date')
            # productname 铜 instrumentid cu1804
            t_df = t_df.drop([0, 1])
            t_df['instrumentid'] = t_instrumentid
            t_df['productname'] = t_productname
            t_df['date'] = t_date
            print(t_df)
            col_names = [
                'RANK', 'PARTICIPANTABBR1', 'CJ1', 'CJ1_CHG',
                'PARTICIPANTABBR2', 'CJ2', 'CJ2_CHG', 'PARTICIPANTABBR3',
                'CJ3', 'CJ3_CHG', 'INSTRUMENTID', 'PRODUCTNAME', 'DATE'
            ]
            t_df.columns = col_names
            t_df['VARIETY'] = True
            print(t_df)
            from db_insert2 import set_ranks_df

            set_ranks_df(t_df,
                         year=year,
                         month=month,
                         day=day,
                         exchange='CZCE')

    if '合约' in header_index_dict:
        instruments_index = header_index_dict['合约']
        len_instruments_index = len(instruments_index)
        if '合计' not in header_index_dict:
            print("Error")
        end_index = header_index_dict['合计']
        end_index = end_index[-len_instruments_index:]
        for beg, end in zip(instruments_index, end_index):
            t_df = df[beg:end]
            t_df = t_df.applymap(lambda x: x.replace(',', '') if x else x)
            t_df.reset_index(inplace=True, drop=True)
            print(t_df)
            h_str = t_df.iat[0, 0]
            import re
            # m = re.match(
            #     r"品种:(?P<productname>[\u4e00-\u9fa5]+)(?P<instrumentid>[a-zA-Z]+)\W*日期:(?P<date>[\d-]+)", h_str)
            m = re.match(
                r"合约:(?P<productname>[\u4e00-\u9fa5]+)?(?P<instrumentid>[a-zA-Z\d]+)\W*日期:(?P<date>[\d-]+)",
                h_str)
            t_instrumentid = m.group('instrumentid')
            t_productname = m.group('productname')
            if not t_productname:
                t_productname = "EMPTY"
            t_date = m.group('date')
            # productname 铜 instrumentid cu1804
            t_df = t_df.drop([0, 1])
            t_df['INSTRUMENTID'] = t_instrumentid
            t_df['PRODUCTNAME'] = t_productname

            print(t_df)
            col_names = [
                'RANK',
                'PARTICIPANTABBR1',
                'CJ1',
                'CJ1_CHG',
                'PARTICIPANTABBR2',
                'CJ2',
                'CJ2_CHG',
                'PARTICIPANTABBR3',
                'CJ3',
                'CJ3_CHG',
                'INSTRUMENTID',
                'PRODUCTNAME',
            ]
            t_df.columns = col_names

            t_df['VARIETY'] = False
            print(t_df)
            from db_insert2 import set_ranks_df

            set_ranks_df(t_df,
                         year=year,
                         month=month,
                         day=day,
                         exchange='CZCE')

    return
Example #10
 def parse(self, url, postdata=None):
     """ Parse an URL and return an etree ElementRoot.
         Assumes UTF-8 encoding
     """
     return html.parse(self.get(url, postdata),
                       parser=html.HTMLParser(encoding='utf-8'))
Example #11
def clean(content):
    head_pos = content.find('<head>')

    # insert the encoding of the file
    content = content[:head_pos +
                      6] + '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">' + content[
                          head_pos + 6:]
    article = Extractor(content, loglevel=logging.INFO).extracted()

    if article is None:
        print("Error processing html file.")
        sys.exit(1)

    html_parser = html.HTMLParser(encoding="utf-8")
    html_doc = html.fromstring(content, parser=html_parser)
    head_doc = html_doc.find('head')

    source_url = head_doc.cssselect('meta[property="og:url"]')[0].get(
        'content')
    title = html_doc.find('.//title').text_content()

    # Replace
    article = article.replace('<h1 class="tabtitle">C++</h1>',
                              '<p><strong>C++</strong></p>')
    article = article.replace('<h1 class="tabtitle">C</h1>',
                              '<p><strong>C</strong></p>')
    article = article.replace('<h1 class="tabtitle">C/C++</h1>',
                              '<p><strong>C/C++</strong></p>')
    article = article.replace('<h1 class="tabtitle">Java</h1>',
                              '<p><strong>Java</strong></p>')
    article = article.replace('<h1 class="tabtitle">Python</h1>',
                              '<p><strong>Python</strong></p>')

    # if the title is unfortunately removed by boilerpipy, then add it back in
    if "h2" not in article:
        article = "<h1>" + title[:title.rfind('-')] + "</h1>" + article

    reconstructed_body = "<html><body>" + article.replace(
        "<h2", "<h1").replace("</h2>", "</h1>") + "</body></html>"

    if "<body><h1>" not in reconstructed_body:
        reconstructed_body = reconstructed_body.replace(
            "<body>", "<body><h1>" + title[:title.rfind('-')] + "</h1>")

    source_header_string = "<h3>Source</h3>"
    source_link = "<p><a href='" + source_url + "' rel='tag'>" + source_url + "</a></p>"

    # further remove useless stuff
    body_doc = html.fromstring(reconstructed_body).find('body')
    for bad in body_doc.xpath("//div[@class='comments-main']"):
        bad.getparent().remove(bad)
    for ad_by_google in body_doc.xpath("//ins[@class='adsbygoogle']"):
        ad_by_google.getparent().remove(ad_by_google)
    for bad_h3 in body_doc.xpath("//h3"):
        bad_h3.getparent().remove(bad_h3)
    for pre_tag in body_doc.xpath("//pre"):
        if 'class' in pre_tag.attrib:
            pre_tag.attrib.pop('class')
        if 'title' in pre_tag.attrib:
            pre_tag.attrib.pop('title')

    post_content_doc = body_doc.xpath("//div[@class='entry-content']")[0]
    post_content_doc.append(lxml.etree.XML(source_header_string))
    post_content_doc.append(lxml.etree.XML(source_link))
    result = html.tostring(body_doc)

    # wrap the contents of <pre> blocks in <code> tags for styling later.
    result = result.replace('<pre>',
                            '<pre> <code>').replace('</pre>', '</code> </pre>')

    return result
Example #12
def extract_from_html(msg_body):
    """
    Extract not quoted message from provided html message body
    using tags and plain text algorithm.

    Cut out the 'blockquote', 'gmail_quote' tags.
    Cut Microsoft quotations.

    Then use plain text algorithm to cut out splitter or
    leftover quotation.
    This works by adding checkpoint text to all html tags,
    then converting html to text,
    then extracting quotations from text,
    then checking deleted checkpoints,
    then deleting necessary tags.
    """

    if msg_body.strip() == '':
        return msg_body

    html_tree = html.document_fromstring(
        msg_body, parser=html.HTMLParser(encoding="utf-8"))

    cut_quotations = (html_quotations.cut_gmail_quote(html_tree)
                      or html_quotations.cut_blockquote(html_tree)
                      or html_quotations.cut_microsoft_quote(html_tree)
                      or html_quotations.cut_by_id(html_tree)
                      or html_quotations.cut_from_block(html_tree))

    html_tree_copy = deepcopy(html_tree)

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False for i in xrange(number_of_checkpoints)]
    msg_with_checkpoints = html.tostring(html_tree)

    h = html2text.HTML2Text()
    h.body_width = 0  # generate plain text without wrap

    # html2text adds unnecessary star symbols. Remove them.
    # Mask star symbols
    msg_with_checkpoints = msg_with_checkpoints.replace('*', '3423oorkg432')
    plain_text = h.handle(msg_with_checkpoints)
    # Remove created star symbols
    plain_text = plain_text.replace('*', '')
    # Unmask saved star symbols
    plain_text = plain_text.replace('3423oorkg432', '*')

    delimiter = get_delimiter(plain_text)

    plain_text = preprocess(plain_text, delimiter, content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return msg_body

    # Collect checkpoints on each line
    line_checkpoints = [
        [
            int(i[4:-4])  # Only checkpoint number
            for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)
        ] for line in lines
    ]

    # Remove checkpoints
    lines = [
        re.sub(html_quotations.CHECKPOINT_PATTERN, '', line) for line in lines
    ]

    # Use plain text quotation extracting algorithm
    markers = mark_message_lines(lines)
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags

    if lines_were_deleted:
        # collect checkpoints from deleted lines
        for i in xrange(first_deleted, last_deleted):
            for checkpoint in line_checkpoints[i]:
                quotation_checkpoints[checkpoint] = True
    else:
        if cut_quotations:
            return html.tostring(html_tree_copy)
        else:
            return msg_body

    # Remove tags with quotation checkpoints
    html_quotations.delete_quotation_tags(html_tree_copy, 0,
                                          quotation_checkpoints)

    return html.tostring(html_tree_copy)
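
The docstring above describes a checkpoint technique: tag every element, flatten the HTML to text, decide line by line what is quotation, then delete the elements whose tags disappeared. A simplified, self-contained sketch of that idea using only lxml; the marker format and the '>'-prefix heuristic stand in for the html_quotations/html2text machinery and rely on newlines in the markup for line structure:

import re
from lxml import html

CHECKPOINT = 'chkpt%d.'
CHECKPOINT_RE = re.compile(r'chkpt(\d+)\.')

def drop_quoted(msg_body):
    # msg_body: UTF-8 encoded bytes of an HTML message.
    tree = html.document_fromstring(
        msg_body, parser=html.HTMLParser(encoding='utf-8'))
    elements = list(tree.iter())
    # 1. Tag every element with a unique checkpoint token.
    for n, el in enumerate(elements):
        el.text = (el.text or '') + (CHECKPOINT % n)
    # 2. Flatten to plain text and keep only lines that do not look quoted.
    kept = set()
    for line in tree.text_content().splitlines():
        if line.lstrip().startswith('>'):  # toy quotation heuristic
            continue
        kept.update(int(n) for n in CHECKPOINT_RE.findall(line))
    # 3. Remove elements whose checkpoint vanished with a quoted line,
    #    then strip the remaining tokens from the surviving nodes.
    for n, el in enumerate(elements):
        if n not in kept and el.getparent() is not None:
            el.getparent().remove(el)
    for el in tree.iter():
        if el.text:
            el.text = CHECKPOINT_RE.sub('', el.text)
        if el.tail:
            el.tail = CHECKPOINT_RE.sub('', el.tail)
    return html.tostring(tree)

print(drop_quoted(b'<div>\n<p>Hi!</p>\n<p>&gt; quoted reply</p>\n</div>').decode())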
Example #13
def clean(file_name, directory="."):
    basename = os.path.basename(file_name)
    content = codecs.open(file_name, "r", 'utf-8').read()

    head_pos = content.find('<head>')

    # insert the encoding of the file
    content = content[:head_pos +
                      6] + '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">' + content[
                          head_pos + 6:]
    article = Extractor(content, loglevel=logging.INFO).extracted()

    if article is None:
        print "Error processing html file"
        sys.exit(1)
    html_parser = html.HTMLParser(encoding="utf-8")
    html_doc = html.fromstring(content, parser=html_parser)
    head_doc = html_doc.find('head')
    published_time = head_doc.cssselect(
        'meta[property="article:published_time"]')[0].get('content')[:-6]
    print published_time
    cleaned_file = os.path.splitext(
        basename)[0] + "_" + published_time + "_cleaned.html"
    # don't clean files that already have been cleaned
    if os.path.isfile(cleaned_file):
        return
    source_url = head_doc.cssselect('meta[property="og:url"]')[0].get(
        'content')
    title = html_doc.find('.//title').text_content()

    # if the title is unfortunately removed by boilerpipy, then add it back in
    if "h2" not in article:
        article = "<h1>" + title[:title.rfind('-')] + "</h1>" + article

    reconstructed_body = "<html><body>" + article.replace(
        "<h2", "<h1").replace("</h2>", "</h1>") + "</body></html>"
    source_header_string = "<h3>Source</h3>"
    source_link = "<p><a href='" + source_url + "' rel='tag'>" + source_url + "</a></p>"
    # further remove useless stuff
    body_doc = html.fromstring(reconstructed_body).find('body')
    for bad in body_doc.xpath("//div[@class='comments-main']"):
        bad.getparent().remove(bad)
    for ad_by_google in body_doc.xpath("//ins[@class='adsbygoogle']"):
        ad_by_google.getparent().remove(ad_by_google)
    for bad_h3 in body_doc.xpath("//h3"):
        bad_h3.getparent().remove(bad_h3)
    for pre_tag in body_doc.xpath("//pre"):
        if 'class' in pre_tag.attrib:
            pre_tag.attrib.pop('class')
        if 'title' in pre_tag.attrib:
            pre_tag.attrib.pop('title')

    post_content_doc = body_doc.xpath("//div[@class='entry-content']")[0]
    post_content_doc.append(lxml.etree.XML(source_header_string))
    post_content_doc.append(lxml.etree.XML(source_link))
    result = html.tostring(body_doc)
    # wrap the contents of <pre> blocks in <code> tags for styling later.
    result = result.replace('<pre>',
                            '<pre> <code>').replace('</pre>', '</code> </pre>')
    with open(directory + cleaned_file, 'w') as cleaned_file_handle:
        cleaned_file_handle.write(result.encode('utf-8'))
Example #14
    raise KeyError(_("Cannot find job attempt '%(id)s'.") % {'id': job.jobId}, e)
  except Exception, e:
    raise Exception(_("Failed to get application for job %s: %s") % (job.jobId, e))

  if log_link:
    link = '/%s/' % name
    params = {}
    if offset != 0:
      params['start'] = offset

    root = Resource(get_log_client(log_link), urlparse.urlsplit(log_link)[2], urlencode=False)
    api_resp = None

    try:
      api_resp = root.get(link, params=params)
      log = html.fromstring(api_resp, parser=html.HTMLParser()).xpath('/html/body/table/tbody/tr/td[2]')[0].text_content()

      response['status'] = 0
      response['log'] = LinkJobLogs._make_hdfs_links(log)
    except Exception, e:
      response['log'] = _('Failed to retrieve log: %s' % e)
      try:
        debug_info = '\nLog Link: %s' % log_link
        if api_resp:
          debug_info += '\nHTML Response: %s' % response
        response['debug'] = debug_info
        LOG.error(debug_info)
      except:
        LOG.exception('failed to create debug info')

  return JsonResponse(response)
Example #15
# coding=utf-8
import api

# Quick scrape of the schedule form
from lxml import html
import requests
from requests.exceptions import ConnectionError
while True:
    try:
        pagina = html.fromstring(requests.get(
            'https://guayacan.uninorte.edu.co/4PL1CACI0N35/registro/consulta_horarios.php'
        ).content,
                                 parser=html.HTMLParser(encoding='utf-8'))
        niveles = {
            nivel.get('value'): nivel.text
            for nivel in pagina.xpath('//select[@name="nivel"]/option')[1:]
        }
        periodos = {
            periodo.get('value'): periodo.text
            for periodo in pagina.xpath('//select[@name="periodo"]/option')[1:]
        }
        break
    except ConnectionError as e:
        print(e)

# Create the Flask application
from flask import Flask, render_template, redirect, url_for, request, jsonify
app = Flask(__name__)

from flask.json import JSONEncoder
from bson import ObjectId
Example #16
            # print word1
            text_porter = text_porter + " " + Porter.stem(word1)
    return text_porter


def get_stem_text(text):
    m = Mystem()
    text_raw = re.sub(u'[^a-zA-Zа-яА-ЯйЙёЁ _]+', '', text)
    text_stem = m.lemmatize(text_raw)
    text_res = ''.join(text_stem).strip()
    return text_res


if (__name__) == "__main__":
    link = 'http://www.mathnet.ru/php/archive.phtml?jrnid=uzku&wshow=issue&bshow=contents&series=0&year=2008&volume=150&issue=3&option_lang=rus&bookID=1000'
    parser = html.HTMLParser()
    root_tree = getXMLTreeByLink(link)

    root = etree.Element('Math-Net')
    year = etree.SubElement(root, 'year')
    year.text = root_tree.xpath(
        "// td[@width='70%'] / span[@class='red'] / font")[0].text_content(
        ).strip().split(' ')[0].replace(',', '')
    articles = etree.SubElement(root, 'articles')

    expression_article = "// td[@colspan='2'] / a[contains(@class, 'SLink')]".decode(
        'utf-8')
    index = 0
    for element in root_tree.xpath(expression_article):
        article = etree.SubElement(articles, 'article')
        article.set("id", str(index))
Example #17
def ParseHtml(story, corpus):
    """Parses the HTML of a news story.

    Args:
      story: The raw Story to be parsed.
      corpus: Either 'cnn' or 'dailymail'.

    Returns:
      A Story containing URL, paragraphs and highlights.
    """

    parser = html.HTMLParser(encoding=chardet.detect(story.html)['encoding'])
    tree = html.document_fromstring(story.html, parser=parser)

    # Elements to delete.
    delete_selectors = {
        'cnn': [
            '//blockquote[contains(@class, "twitter-tweet")]',
            '//blockquote[contains(@class, "instagram-media")]'
        ],
        'dailymail': [
            '//blockquote[contains(@class, "twitter-tweet")]',
            '//blockquote[contains(@class, "instagram-media")]'
        ]
    }

    # Paragraph exclusions: ads, links, bylines, comments
    cnn_exclude = (
        'not(ancestor::*[contains(@class, "metadata")])'
        ' and not(ancestor::*[contains(@class, "pullquote")])'
        ' and not(ancestor::*[contains(@class, "SandboxRoot")])'
        ' and not(ancestor::*[contains(@class, "twitter-tweet")])'
        ' and not(ancestor::div[contains(@class, "cnnStoryElementBox")])'
        ' and not(contains(@class, "cnnTopics"))'
        ' and not(descendant::*[starts-with(text(), "Read:")])'
        ' and not(descendant::*[starts-with(text(), "READ:")])'
        ' and not(descendant::*[starts-with(text(), "Join us at")])'
        ' and not(descendant::*[starts-with(text(), "Join us on")])'
        ' and not(descendant::*[starts-with(text(), "Read CNNOpinion")])'
        ' and not(descendant::*[contains(text(), "@CNNOpinion")])'
        ' and not(descendant-or-self::*[starts-with(text(), "Follow us")])'
        ' and not(descendant::*[starts-with(text(), "MORE:")])'
        ' and not(descendant::*[starts-with(text(), "SPOILER ALERT:")])')

    dm_exclude = ('not(ancestor::*[contains(@id,"reader-comments")])'
                  ' and not(contains(@class, "byline-plain"))'
                  ' and not(contains(@class, "byline-section"))'
                  ' and not(contains(@class, "count-number"))'
                  ' and not(contains(@class, "count-text"))'
                  ' and not(contains(@class, "video-item-title"))'
                  ' and not(ancestor::*[contains(@class, "column-content")])'
                  ' and not(ancestor::iframe)')

    paragraph_selectors = {
        'cnn': [
            '//div[contains(@class, "cnnContentContainer")]//p[%s]' %
            cnn_exclude,
            '//div[contains(@class, "l-container")]//p[%s]' % cnn_exclude,
            '//div[contains(@class, "cnn_strycntntlft")]//p[%s]' % cnn_exclude
        ],
        'dailymail':
        ['//div[contains(@class, "article-text")]//p[%s]' % dm_exclude]
    }

    # Highlight exclusions.
    he = ('not(contains(@class, "cnnHiliteHeader"))'
          ' and not(descendant::*[starts-with(text(), "Next Article in")])')
    highlight_selectors = {
        'cnn': [
            '//*[contains(@class, "el__storyhighlights__list")]//li[%s]' % he,
            '//*[contains(@class, "cnnStryHghLght")]//li[%s]' % he,
            '//*[@id="cnnHeaderRightCol"]//li[%s]' % he
        ],
        'dailymail': ['//h1/following-sibling::ul//li']
    }

    def ExtractText(selector):
        """Extracts a list of paragraphs given a XPath selector.

    Args:
      selector: A XPath selector to find the paragraphs.

    Returns:
      A list of raw text paragraphs with leading and trailing whitespace.
    """

        xpaths = map(tree.xpath, selector)
        elements = list(chain.from_iterable(xpaths))
        paragraphs = [e.text_content().encode('utf-8') for e in elements]

        # Remove editorial notes, etc.
        if corpus == 'cnn' and len(
                paragraphs) >= 2 and '(CNN)' in paragraphs[1]:
            paragraphs.pop(0)

        paragraphs = map(str.strip, paragraphs)
        paragraphs = [s for s in paragraphs if s and not str.isspace(s)]

        return paragraphs

    for selector in delete_selectors[corpus]:
        for bad in tree.xpath(selector):
            bad.getparent().remove(bad)

    paragraphs = ExtractText(paragraph_selectors[corpus])
    highlights = ExtractText(highlight_selectors[corpus])

    content = '\n\n'.join(paragraphs)

    return Story(story.url, content, highlights)
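
The selectors above combine a base paragraph XPath with a chain of not(...) exclusion predicates. A small illustration of that pattern on made-up markup (the class and id names mirror the dailymail exclusions; nothing here comes from real article HTML):

from lxml import html

snippet = html.fromstring("""
<div class="article-text">
  <p>A real paragraph of the story.</p>
  <div id="reader-comments"><p>Reader comment to be skipped.</p></div>
  <p class="byline-plain">By A. Reporter</p>
</div>""")

exclude = ('not(ancestor::*[contains(@id,"reader-comments")])'
           ' and not(contains(@class, "byline-plain"))')
paragraphs = snippet.xpath(
    '//div[contains(@class, "article-text")]//p[%s]' % exclude)
print([p.text_content().strip() for p in paragraphs])
# ['A real paragraph of the story.']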
Example #18
from flask import Blueprint, url_for, redirect, request
import requests
from lxml import html
from .util import replaceahref, rmelement, updelement
from urllib.parse import urljoin
import re

PRE_FIX = 'mh1359'
BASE_URL = 'https://m.mh1359.com/'

mh1359_bp = Blueprint(PRE_FIX, __name__)
myparser = html.HTMLParser(encoding="UTF-8")


@mh1359_bp.route('/')
def index():
    r = requests.get(BASE_URL)
    dom = html.fromstring(r.text, parser=myparser)
    dom = transform(dom)
    return html.tostring(dom, pretty_print=True)


@mh1359_bp.route('/manhua/<idpage>')
def manhua(idpage):
    r = requests.get(urljoin(BASE_URL, 'manhua/' + idpage))
    dom = html.fromstring(r.text, parser=myparser)
    dom = transform(dom)
    return html.tostring(dom, pretty_print=True)


@mh1359_bp.route('/chapter/<idpage>')
Example #19
 def parse_html_stream(f):
     parser = lhtml.HTMLParser(encoding="utf8")
     return lhtml.parse(f, parser)
Example #20
padding = None
if args.pad is not None:
    padding = eval("[" + args.pad + "]")
    assert len(padding) in [1, 4], (args.pad, padding)
    if len(padding) == 1:
        padding = padding * 4

tpattern = args.pattern + '.txt'
if args.pattern[-4] == '.':
    tpattern = args.pattern[:-3] + 'txt'

if args.unicodedammit:
    from bs4 import UnicodeDammit
    content = args.file.read()
    doc = UnicodeDammit(content, is_html=True)
    parser = html.HTMLParser(encoding=doc.original_encoding)
    doc = html.document_fromstring(content, parser=parser)
else:
    doc = html.parse(args.file)

pages = doc.xpath('//*[@class="ocr_page"]')
for page in pages:
    iname = get_prop(page, 'file')
    if not iname:
        iname = get_prop(page, 'image')
    if args.basename:
        iname = os.path.join(args.basename, os.path.basename(iname))
    if not os.path.exists(iname):
        print("not found:", iname)
        sys.exit(1)
    image = Image.open(iname)
Example #21
    def get_task_log(self, offset=0):
        logs = []
        attempt = self.task.job.job_attempts['jobAttempt'][-1]
        log_link = attempt['logsLink']

        # Generate actual task log link from logsLink url
        if self.task.job.status in ('NEW', 'SUBMITTED', 'RUNNING'
                                    ) or self.type == 'Oozie Launcher':
            logs_path = '/node/containerlogs/'
            node_url, tracking_path = log_link.split(logs_path)
            container_id, user = tracking_path.strip('/').split('/')

            # Replace log path tokens with actual container properties if available
            if hasattr(self, 'nodeHttpAddress') and 'nodeId' in attempt:
                node_url = '%s://%s' % (node_url.split('://')[0],
                                        self.nodeHttpAddress)
            container_id = self.assignedContainerId if hasattr(
                self, 'assignedContainerId') else container_id

            log_link = '%(node_url)s/%(logs_path)s/%(container)s/%(user)s' % {
                'node_url': node_url,
                'logs_path': logs_path.strip('/'),
                'container': container_id,
                'user': user
            }
        else:  # Completed jobs
            logs_path = '/jobhistory/logs/'
            root_url, tracking_path = log_link.split(logs_path)
            node_url, container_id, attempt_id, user = tracking_path.strip(
                '/').split('/')

            # Replace log path tokens with actual attempt properties if available
            if hasattr(self, 'nodeHttpAddress') and 'nodeId' in attempt:
                node_url = '%s:%s' % (self.nodeHttpAddress.split(':')[0],
                                      attempt['nodeId'].split(':')[1])
            container_id = self.assignedContainerId if hasattr(
                self, 'assignedContainerId') else container_id
            attempt_id = self.attemptId if hasattr(self,
                                                   'attemptId') else attempt_id

            log_link = '%(root_url)s/%(logs_path)s/%(node)s/%(container)s/%(attempt)s/%(user)s' % {
                'root_url': root_url,
                'logs_path': logs_path.strip('/'),
                'node': node_url,
                'container': container_id,
                'attempt': attempt_id,
                'user': user
            }

        for name in ('stdout', 'stderr', 'syslog'):
            link = '/%s/' % name
            if self.type == 'Oozie Launcher' and not self.task.job.status == 'FINISHED':  # Yarn currently dumps with 500 error with doas in running state
                params = {}
            else:
                params = {'doAs': user}

            if int(offset) != 0:
                params['start'] = offset
            else:
                params['start'] = 0

            response = None
            try:
                log_link = re.sub('job_[^/]+', self.id, log_link)
                root = Resource(get_log_client(log_link),
                                urlparse.urlsplit(log_link)[2],
                                urlencode=False)
                response = root.get(link, params=params)
                log = html.fromstring(
                    response, parser=html.HTMLParser()).xpath(
                        '/html/body/table/tbody/tr/td[2]')[0].text_content()
            except Exception, e:
                log = _('Failed to retrieve log: %s' % e)
                try:
                    debug_info = '\nLog Link: %s' % log_link
                    if response:
                        debug_info += '\nHTML Response: %s' % response
                    LOG.error(debug_info)
                except:
                    LOG.exception('failed to build debug info')

            logs.append(log)
Example #22
    def _parse_file(self, file_name, cloth_parser):
        if cloth_parser is None:
            cloth_parser = html.HTMLParser()

        cloth = html.parse(file_name, parser=cloth_parser)
        return cloth.getroot()
Example #23
            _("Failed to get application for job %s: %s") % (job.jobId, e))

    if log_link:
        link = '/%s/' % name
        params = {}
        if offset != 0:
            params['start'] = offset

        root = Resource(get_log_client(log_link),
                        urlparse.urlsplit(log_link)[2],
                        urlencode=False)
        api_resp = None

        try:
            api_resp = root.get(link, params=params)
            log = html.fromstring(api_resp, parser=html.HTMLParser()).xpath(
                '/html/body/table/tbody/tr/td[2]')[0].text_content()

            response['status'] = 0
            response['log'] = LinkJobLogs._make_hdfs_links(log)
        except Exception, e:
            response['log'] = _('Failed to retrieve log: %s' % e)
            try:
                debug_info = '\nLog Link: %s' % log_link
                if api_resp:
                    debug_info += '\nHTML Response: %s' % response
                response['debug'] = debug_info
                LOG.error(debug_info)
            except:
                LOG.exception('failed to create debug info')
Example #24
 def from_html_cloth(cls, cloth, strip_comments=True):
     retval = cls()
     retval._init_cloth(cloth, cloth_parser=html.HTMLParser(),
                                               strip_comments=strip_comments)
     return retval
Example #25
def parse_lawyer_files_gen(bulk_amount=200, html_encoding='utf-8'):
    """
    Yields lists of bulk_amount parsed Lawyer instances
    (the final list may contain fewer).
    """

    all_file_paths = iglob(
        'C:\Python27\Scripts\Experiments\databases\lawyers\*')

    name_xpath = '//div[@class="screen_name"]//span/text()'

    header_item_xpath = 'div[contains(@class,"reference_item")]'
    item_title_xpath = './div[@class="title"]/text()'
    item_value_xpath = './span'

    title_to_prop_dict = {
        'טלפון': 'phone',
        'פקס': 'fax',
        'נייד': 'mobile',
        'תחום עיסוק': 'specialty',
        'כתובת דוא"ל': 'email',
        'כתובת': 'address',
        'ת.ד': 'po_box',
        'שפה': 'language',
        'נוטריון': 'notary'
    }

    lawyers_lst = []
    lawyer_id_pattern = re.compile(r'laywer_(?P<id>\d+)\.html$')

    for file_path in all_file_paths:
        # Loop over every lawyer file

        #  Look for lawyer_id in
        lawyer_id_match = lawyer_id_pattern.search(file_path)
        if lawyer_id_match:
            lawyer_id = lawyer_id_match.group('id')
        else:
            lawyer_id = ''

        with open(file_path, 'rb') as lawyer_file:
            # Open it and parse

            html_tree = html.fromstring(
                lawyer_file.read(),
                parser=html.HTMLParser(encoding=html_encoding))

            name = edit_string(html_tree.xpath(name_xpath)[0],
                               html_encoding)  #Lawyer name
            current_lawyer = Lawyer(lawyer_id=lawyer_id, name=name)

            all_headers = html_tree.xpath(header_item_xpath)
            for header in all_headers:
                # Loop over every attribute of the lawyer found in HTML file
                attribute_name = edit_string(
                    header.xpath(item_title_xpath)[0], html_encoding)

                attribute_values = header.xpath(item_value_xpath)
                attribute_value = ' '.join(
                    map(lambda x: edit_string(x.text_content()),
                        attribute_values))  # join to 1 string

                # Get actual attribute name, and set it to the given value
                actual_attribute_name = title_to_prop_dict[attribute_name]
                setattr(current_lawyer, actual_attribute_name, attribute_value)

            lawyers_lst.append(current_lawyer)

            if len(lawyers_lst) == bulk_amount:
                # Yield a list of bulk_amount lawyers
                yield lawyers_lst
                lawyers_lst = []

    yield lawyers_lst  # Final yield
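
A short, hedged usage sketch of the generator above; the persistence helper is hypothetical and only illustrates the batch-by-batch contract:

for lawyer_batch in parse_lawyer_files_gen(bulk_amount=200):
    # Each iteration yields up to bulk_amount parsed Lawyer objects.
    for lawyer in lawyer_batch:
        store_lawyer(lawyer)  # hypothetical persistence helper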
Example #26
def parse_html_with_encoding(data, encoding='utf-8'):
    parser = html.HTMLParser(encoding=encoding)
    return html.fromstring(data, parser=parser)
Example #27
def fromstring(s):
    html_parser = html.HTMLParser(encoding='utf-8')
    return html.fromstring(s, parser=html_parser).getroottree().getroot()
Example #28
    def _parse_file(self, file_name, cloth_parser):
        if cloth_parser is None:
            cloth_parser = html.HTMLParser(remove_comments=True)

        cloth = html.parse(file_name, parser=cloth_parser)
        return cloth.getroot()
Example #29
    except (KeyError, RestException), e:
        raise KeyError(
            _("Cannot find job attempt '%(id)s'.") % {'id': job.jobId}, e)

    link = '/%s/' % name
    params = {}
    if offset and int(offset) >= 0:
        params['start'] = offset

    root = Resource(get_log_client(log_link),
                    urlparse.urlsplit(log_link)[2],
                    urlencode=False)
    debug_info = ''
    try:
        response = root.get(link, params=params)
        log = html.fromstring(response, parser=html.HTMLParser()).xpath(
            '/html/body/table/tbody/tr/td[2]')[0].text_content()
    except Exception, e:
        log = _('Failed to retrieve log: %s' % e)
        try:
            debug_info = '\nLog Link: %s' % log_link
            debug_info += '\nHTML Response: %s' % response
            LOGGER.error(debug_info)
        except:
            LOGGER.exception('failed to create debug info')

    response = {'log': LinkJobLogs._make_hdfs_links(log), 'debug': debug_info}

    return JsonResponse(response)

Example #30
def main():
	global COUNT

	site_url = "http://www.sanskritlibrary.org/"
	seed_url = "http://www.sanskritlibrary.org/textsList.html"
	titus_url = "http://titus.uni-frankfurt.de"
	p = Page(seed_url)

	a_tags = CSSSelector('a')
	div_tags = CSSSelector('div')
	span_tags = CSSSelector('span')
	body_tags = CSSSelector('body')

	div = [e for e in div_tags(p.dom) if e.get("class")=="text"]
	div = div[0]
	links = [site_url + i.get("href") for i in div.getchildren() if i.tag=='a']
	print "Links of texts:", len(links)
	source_links = list()


	#Creating list of links
	for l in links:
		lpage = Page(l)
		slinks = [i.get("href") for i in a_tags(lpage.dom) if i.get("target")=="source"]
		source_links += slinks

	
	print "Links of sources:", len(source_links) #134
	source_links = list(set(source_links))
	print "Unique links of sources:", len(source_links) #94

	#Considering only ramayana and mahabharat links
	source_links = [i for i in source_links if ("/mbh" in i or "/ram" in i)]
	pp.pprint(source_links)

	b = p.selenium_load()
	for link in source_links:
		lp = link

		print "SOURCE_LINK",link
		while lp:
			try:
				b.get(lp)
				sleep(0.25)

				b.switch_to_frame(b.find_elements_by_tag_name("frame")[0])
				bdom=html.fromstring(b.page_source, parser=html.HTMLParser(encoding='utf-8'))
				bt = body_tags(bdom)
				if len(bt)==0:
					print "No body tag for " + lp
					continue
				body = bt[0]
				f = open("download/" + lp[lp.rfind("/")+1:]+".txt", 'w')
				f.write(body.text_content().encode('utf-8'))
				f.close()
				print "File no. " + str(COUNT) + " created"
				COUNT += 1
				anchors = a_tags(bdom)

				lp = None


				for i in range(len(anchors)-1, max(0, len(anchors)-5), -1):
				
					if len(anchors[i].getchildren())==1 and anchors[i].getchildren()[0].tag=="img" and "arribar" in anchors[i].getchildren()[0].get("src"):
						href = anchors[i].get("href")
						lp = titus_url+href
						print i, len(anchors)-i
						print "New frame:", lp
						break
			except:
				lp = None