def normalize_entities(html):
    # turn &nbsp; and its aliases into normal spaces
    # (entity spellings below are assumed; the original literals were decoded away)
    html = html.replace(u'&nbsp;', u' ')
    html = html.replace(u'&#160;', u' ')
    html = html.replace(u'&#xa0;', u' ')
    html = html.replace(u'\xa0', u' ')
    return html
def _video_urls(self):
    if self._is_shelf():
        return None
    if self.video_count is not None:
        return self.video_urls
    self.video_count = 0
    rows = self.tree_html.xpath("//div[@id='tabItemDetails']//a/@href")
    rows = [r for r in rows if "video." in r or "/mediaroom/" in r or ("//media." in r and (".flv" in r or ".mov" in r))]
    url = "http://content.webcollage.net/sc/smart-button?ird=true&channel-product-id=%s" % self._product_id()
    html = urllib.urlopen(url).read()
    # \"src\":\"\/_cp\/products\/1374451886781\/tab-6174b48c-58f3-4d4b-8d2f-0d9bf0c90a63
    # \/552b9366-55ed-443c-b21e-02ede6dd89aa.mp4.mobile.mp4\"
    video_base_url = self._find_between(html, 'data-resources-base=\\"', '\\">').replace("\\", "") + "%s"
    m = re.findall(r'"src":"([^"]*?\.mp4)"', html.replace("\\", ""), re.DOTALL)
    for item in m:
        if ".blkbry" in item or ".mobile" in item:
            pass
        else:
            if video_base_url % item not in rows and item.count(".mp4") < 2:
                rows.append(video_base_url % item)
    m = re.findall(r'"src":"([^"]*?\.flv)"', html.replace("\\", ""), re.DOTALL)
    for item in m:
        if ".blkbry" in item or ".mobile" in item:
            pass
        else:
            if video_base_url % item not in rows and item.count(".flv") < 2:
                rows.append(video_base_url % item)
    if len(rows) < 1:
        return None
    new_rows = [r for r in rows if ("%s.flash.flv" % r) not in rows]
    self.video_urls = list(set(new_rows))
    self.video_count = len(self.video_urls)
    return self.video_urls
def get_html(self, options, line_id=None, additional_context=None):
    templates = self.get_templates()
    report = {'name': self.get_report_name(),
              'company_name': self.env.user.company_id.name}
    lines = self.with_context(
        self.set_context(options)).get_lines(options, line_id=line_id)
    rcontext = {
        'report': report,
        'lines': {'columns_header': self.get_columns_name(options), 'lines': lines},
        'options': options,
        'context': self.env.context,
        'model': self,
    }
    if additional_context and type(additional_context) == dict:
        rcontext.update(additional_context)
    render_template = templates.get(
        'main_template', 'stock_kardex.main_template')
    if line_id is not None:
        render_template = templates.get(
            'line_template', 'stock_kardex.line_template')
    html = self.env['ir.ui.view'].render_template(
        render_template,
        values=dict(rcontext),
    )
    if self.env.context.get('print_mode', False):
        for k, v in self.replace_class().items():
            html = html.replace(k, v)
        html = html.replace(
            b'<div class="js_stock_report_footnotes"></div>',
            self.get_html_footnotes(''))
    return html
def barebones(url):
    html = url
    if checkurl(url):
        html = gethtml(url)
    if not html:
        return None
    # This chops out the following tags AND all the presumably extraneous content in-between.
    for nuketagblock in ['title', 'head']:
        html = deletenode(html, nuketagblock)
    html = bodycopy(html)
    html = stripcomments(html)
    # Same as above, but a second-pass on the usual code-bloating suspects in between body tags.
    for nuketagblock in ['header', 'footer', 'nav', 'script', 'style', 'noscript', 'form',
                         'object', 'embed', 'select']:
        html = deletenode(html, nuketagblock)
    html = stripparams(html)
    html = lowercasetags(html)
    # html = striplists(html)
    html = stripemptyhtml(html)
    html = stripbr(html)
    # This strips out the following tags, but leaves the in-between content in place.
    for nuketag in ['label', 'section', 'article', 'div', 'span', 'img', 'a', 'b', 'i',
                    'param', 'table', 'td', 'tr', 'font', 'title', 'head', 'meta',
                    'strong', 'em', 'iframe']:
        html = deletetag(html, nuketag)
    html = stripwhitespace(html)
    html = stripcrlf(html)
    html = onetagoneline(html)
    html = convert_html_entities(html)
    html = lesslines(html)
    html = html.replace('\n', ' ')
    html = html.replace('  ', ' ')  # assumed: collapse doubled spaces left by the '\n' replacement
    html = html.strip()
    return html
def CreateReport(Date, dfChange, dfAdditions, dfDeletions, report_filepath, template_filepath):
    log('Creating report html...')
    # stop the dataframe from truncating cell contents. This needs to be set
    # if you want html links to work in cell contents
    pd.set_option('display.max_colwidth', -1)
    with open(template_filepath, 'r') as template:
        htmltemplate = template.read()

    additionsTable = dfAdditions.to_html(
        na_rep=" ", index=False,
        classes="table table-bordered text-left table-striped table-hover table-sm")
    changeTable = dfChange.to_html(
        na_rep=" ", index=False,
        classes="table table-bordered text-left table-striped table-hover table-sm")
    deletionsTable = dfDeletions.to_html(
        na_rep=" ", index=False,
        classes="table table-bordered text-left table-striped table-hover table-sm")

    with open(report_filepath, 'w', encoding='utf-8') as f:
        html = htmltemplate.replace('__DATE__', Date).replace(
            '__CHANGELEN__', str(len(dfChange))).replace('__DFCHANGES__', changeTable)
        if len(dfAdditions) > 0:
            html = html.replace('__ADDITIONSLEN__', str(len(dfAdditions))).replace(
                '__DFADDITIONS__', additionsTable)
        else:
            html = html.replace('__ADDITIONSLEN__', str(len(dfAdditions))).replace(
                '__DFADDITIONS__', '')
        if len(dfDeletions) > 0:
            html = html.replace('__DELETIONSLEN__', str(len(dfDeletions))).replace(
                '__DFDELETIONS__', deletionsTable)
        else:
            html = html.replace('__DELETIONSLEN__', str(len(dfDeletions))).replace(
                '__DFDELETIONS__', '')
        # undo the escaping applied by DataFrame.to_html ('&lt;'/'&gt;' spellings assumed)
        html = html.replace('&lt;', '<').replace('&gt;', '>').replace(
            '\\', '/').replace('\u2011', '-').replace('\u2015', '―').replace(
            'ī', '').replace('─', '—')
        f.write(html)
        f.close()
    pass
    print("Exported html report to..." + report_filepath)
    log("Exported html report to..." + report_filepath)
def get_html(self, options, line_id=None, additional_context=None):
    '''
    return the html value of report, or html value of unfolded line
    * if line_id is set, the template used will be the line_template
    otherwise it uses the main_template. Reason is for efficiency, when unfolding a line in the report
    we don't want to reload all lines, just get the one we unfolded.
    '''
    templates = self.get_templates()
    report_manager = self.get_report_manager(options)
    report = {'name': self.get_report_name(),
              'summary': report_manager.summary,
              'company_name': self.env.user.company_id.name,}
    ctx = self.set_context(options)
    lines = self.with_context(ctx).get_lines(options, line_id=line_id)

    if options.get('hierarchy'):
        lines = self.create_hierarchy(lines)

    footnotes_to_render = []
    if self.env.context.get('print_mode', False):
        # we are in print mode, so compute footnote number and include them in lines values,
        # otherwise, let the js compute the number correctly as we don't know all the visible lines.
        footnotes = dict([(str(f.line), f) for f in report_manager.footnotes_ids])
        number = 0
        for line in lines:
            f = footnotes.get(str(line.get('id')))
            if f:
                number += 1
                line['footnote'] = str(number)
                footnotes_to_render.append({'id': f.id, 'number': number, 'text': f.text})

    rcontext = {'report': report,
                'lines': {'columns_header': self.get_columns_name(options), 'lines': lines},
                'options': options,
                'context': self.env.context,
                'model': self,
                }
    if additional_context and type(additional_context) == dict:
        rcontext.update(additional_context)
    if ctx.get('analytic_account_ids'):
        rcontext['options']['analytic_account_ids'] = [
            {'id': acc.id, 'name': acc.name} for acc in ctx['analytic_account_ids']
        ]
    render_template = templates.get('main_template', 'account_reports.main_template')
    if line_id is not None:
        render_template = templates.get('line_template', 'account_reports.line_template')
    html = self.env['ir.ui.view'].render_template(
        render_template,
        values=dict(rcontext),
    )
    if self.env.context.get('print_mode', False):
        for k, v in self.replace_class().items():
            html = html.replace(k, v)
        # append footnote as well
        html = html.replace(b'<div class="js_account_report_footnotes"></div>',
                            self.get_html_footnotes(footnotes_to_render))
    return html
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext
    html = ustr(html)
    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            link.text = '%s [%s]' % (link.text, i)
            url_index.append(url)

    html = ustr(etree.tostring(tree, encoding=encoding))

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub('<br\s*/?>', '\n', html)
    html = re.sub('<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i + 1, url)
    return html
def convertHtml(html):
    html = html.replace('&nbsp;', ' ')  # assumed entity spelling; the literal was decoded away
    html = html.replace('<p>', '')
    html = html.replace('</p>', '')
    html = html.replace('\t', '')
    html = html.replace('/b>', '')
    html = html.replace('-', '')
    html = html.replace('<b>', '')
    html = html.replace('<br>', '')
    html = html.replace('<', '')
    return html
def to_xhtml(self, html, base_url):
    # assumed entity spellings; the original literals were decoded away
    html = html.replace('&nbsp;', ' ')
    html = html.replace('&mdash;', '—')
    try:
        xhtml = etree.fromstring(html, lxml.html.XHTMLParser(), base_url=base_url)
    except etree.ParseError as what:
        error("etree.fromstring says %s" % what)
        raise
    xhtml.make_links_absolute(base_url=base_url)
    return xhtml
def process_canvas_reference(item, html):
    """
    Replace $CANVAS_OBJECT_REFERENCE$ with edx /jump_to_id/<url_name>
    """
    object_id = urllib.parse.unquote(item).replace("$CANVAS_OBJECT_REFERENCE$/quizzes/", "/jump_to_id/")
    html = html.replace(item, object_id)
    return html
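# Usage sketch for process_canvas_reference (the item/html values below are hypothetical):
# the Canvas reference is unquoted and rewritten to an edX /jump_to_id/ link.
item = "%24CANVAS_OBJECT_REFERENCE%24/quizzes/12345"
html = '<a href="%24CANVAS_OBJECT_REFERENCE%24/quizzes/12345">Quiz</a>'
print(process_canvas_reference(item, html))
# -> <a href="/jump_to_id/12345">Quiz</a>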
def __call__(self, url, html):
    if html == '':
        print 'empty html from downloader'
        raise Exception("empty html")
        # return None
    # if url == self.seed_url:
    # if we do not decode, it sometimes fails and raises 'encoding error : input conversion
    # failed due to input error, bytes 0x84 0x31 0x95 0x33.'
    # so decode manually, and add param 'ignore'
    html = html.decode('GBK', 'ignore').encode('GBK')
    urls = []
    results = []
    queue = self.queue
    # filter for links matching our regular expression
    # and self.same_domain(link, seed_url)
    for oneUrl in (self.normalize(self.seed_url, link)
                   for link in self.get_links(html) if re.search('allvisit_', link)):
        if self.same_domain(oneUrl, self.seed_url) and (oneUrl not in queue or queue[oneUrl] != 2):
            results.append(oneUrl)
    # sometimes we need to remove the following markup
    html = html.replace('''"/> <meta property=''', '')
    tree = lxml.html.fromstring(html)
    fixText = lxml.html.tostring(tree, pretty_print=True)
    tree = lxml.html.fromstring(fixText)
    for t in tree.cssselect('ul.xbk'):
        book = []
        name = None
        for index, tag in enumerate(t.cssselect('li.tjxs > span')):
            if index == 0:
                templink = tag.cssselect('a')[0].attrib['href']
                book.append(self.normalize(self.seed_url, templink))
                name = tag.cssselect('a')[0].text_content()
                # print name
                # print tag.cssselect('a')[0].text_content()
                # print tag.cssselect('a')[0].attrib['href']
            if index == 1:
                book.append(tag.cssselect('a')[0].text_content())
                book.append(tag.cssselect('a')[0].attrib['href'])
                # print tag.cssselect('a')[0].text_content()
                # print tag.cssselect('a')[0].attrib['href']
            if index == 2:
                book.append(tag.text_content())
                # print tag.text_content()
            if index == 3:
                book.append(tag.cssselect('i')[0].text_content())
                # print tag.cssselect('i')[0].text_content()
        if name is not None:
            self.book_data[name] = book
    return results
def crawl(self):
    for p in range(self.first, self.last + 1):
        browser = requests.get(self.url.format(p))
        text = browser.text
        pattern = re.compile(r'<div.*</div>', re.DOTALL | re.MULTILINE)
        m = pattern.search(text)
        if m:
            html = m.group()
            html = html.replace('\\"', '"').replace('\\n', '')
            html = lxml.html.fromstring(html)
            divs = html.xpath('//div[@class="tabDiv"]')
            trs = divs[0].xpath('.//tr')
            for n in range(1, len(trs)):
                tds = trs[n].xpath('.//td')
                url = tds[0].xpath('./a')[0].attrib["href"]
                name = tds[0].text_content().strip()
                degree_provided = tds[3].text_content().strip()
                data = {
                    "name": name,
                    "url": self.BASE_URL + url,
                    "degree_provided": degree_provided,
                }
                print(json.dumps(data), file=self.ipin_url)
            print("Page{:>6}: [done]".format(p))
            print("Page{:>6}: [done]".format(p), file=self.ipin_url_log)
        else:
            print("Page{:>6}: [fail]".format(p))
            print("Page{:>6}: [fail]".format(p), file=self.ipin_url_log)
    self.ipin_url.close()
    self.ipin_url_log.close()
def read_wods_json(file):
    with open(file) as json_file:
        wods = json.load(json_file)
    for k, v in wods.items():
        html = '<p> </p>'.join(v)
        img_urls = re.findall('src="([^"]+)"', html)
        new = [
            f'<a href="{url.replace("avatar_thumbnail", "feed_photo")}" target="_blank"><img style="width: 100px; height: 100px;" src="{url}" alt="Media for Result"></a>'
            for url in img_urls
        ]
        old = [f'<img src="{url}" alt="Media for Result">' for url in img_urls]
        for i, j in zip(new, old):
            html = html.replace(j, i)
        wods[k] = html
    df_wod = pd.DataFrame(wods, index=['html']).T
    df_wod.index.name = 'date'
    df_wod = df_wod.reset_index()
    df_wod['date'] = pd.to_datetime(df_wod['date'])
    return wods, df_wod
def load_entry(url):
    html = scraperwiki.scrape(url)
    html = html.replace("<br/>", "\n")
    if not "ureg-utdocument2.xsl" in html:
        return False
    doc = lxml.html.fromstring(html)
    last_key = None
    base = doc.find(".//div/div/div").xpath("string()").split("\n")
    base = [b.replace(u"\xc2\xa0", "").replace(" - ", "").strip() for b in base]
    base = [b for b in base if len(b)]
    data = {"Court": base[1],
            "CompanyRegister": base[2],
            "CompanyNumber": base[3],
            "CompanyName": base[4]}
    id = data.get("Court") + data.get("CompanyRegister") + data.get("CompanyNumber")
    data['UniqueID'] = sha1(id.encode("ascii", "ignore")).hexdigest()
    for elem in doc.findall(".//div"):
        if elem.get('class') == 'col1':
            last_key = elem.xpath("string()").strip()
            last_key = last_key.replace(":", "")
            if 'Eintragsdatum' in last_key:
                last_key = 'CreationDate'
            last_key = NAME_MAP.get(last_key, last_key)
        if elem.get('class') == 'col2':
            if 'Bilanz vorhanden' in last_key:
                opts = elem.findall('.//option')
                opts = [o.text for o in opts]
                if None in opts:
                    continue
                data['BalanceDates'] = "/".join(opts)
            elif 'Anschrift' in last_key:
                data['Address'] = elem.xpath("string()")
            elif last_key == 'CreationDate':
                cd, _ = elem.xpath("string()").strip().split("(", 1)
                data[last_key] = cd.strip()
            else:
                data[last_key] = elem.xpath("string()").strip()
    scraperwiki.datastore.save(["UniqueID"], data)
    return True
def getSiteContact(self, account, username, mobile):
    HOST = "dealer.che168.com"
    # if account in config.che168VIPAccountList:
    #     HOST = "dealer.che168.com"
    # else:
    #     HOST = "dealers.che168.com"
    conn = httplib.HTTPConnection(HOST, timeout=timeout_che168)
    headers = copy.copy(self.headers)
    conn.request("GET", "/car/publish/?s=1", headers=headers)
    res = conn.getresponse()
    resHeaders = res.getheaders()
    resRead = res.read()
    html = self.decodeBody(resHeaders, resRead)
    html = html.decode('GB18030')
    html = html.replace("gb2312", "utf-8")
    dom = lxml.html.fromstring(html)
    contactItems = dom.xpath('//*[@id="sh_linkMan_div"]/a/@rel')
    conn.close()
    if len(contactItems) == 0:
        return self.createNewContact(username, mobile)
    logger.debug(str(contactItems))
    for salesid in contactItems:
        # if self.checkCurrentContact(salesid, mobile) is True:
        return salesid
    return self.createNewContact(username, mobile)
def scrape_series():
    years = scrape_years()
    data = []
    id = itertools.count(0)
    for year in years[:2]:
        url = BASE_URL + year['link']
        html = scraperwiki.scrape(url)
        root = lxml.html.fromstring(html.replace("\n", ""))
        for el1 in root.cssselect("p.ciGrndSubHead"):
            for el2 in el1.getnext().cssselect("dl.seasnResult"):
                series = el2.getchildren()[0].getchildren()[0]
                status = el2.getchildren()[1].text
                if status:
                    status = status.strip()
                data.append({
                    "id": id.next(),
                    "status": status,
                    "class": el1.text,
                    "title": series.text,
                    "link": series.attrib['href'],
                    "year": year['year']
                })
    return data
def scrape_matches(series=[]):
    data = []
    for a_series in series[:2]:
        html = scraperwiki.scrape(BASE_URL + a_series['link'])
        root = lxml.html.fromstring(html.replace("\n", ""))
        id = itertools.count(0)
        titles = root.cssselect("p.potMatchHeading")
        for title in titles:
            match = {
                'id': id.next(),
                'title': re.sub(r'\s+', " ", title.text_content()),
                'series_id': a_series['id'],
            }
            _links_iter = itertools.takewhile(lambda el: el.tag == 'p', title.itersiblings())
            for (k, el) in ((el.attrib['class'].split(' ')[1][4:], el) for el in _links_iter):
                if k == 'links':
                    links = el.cssselect("span a")
                    for link in links:
                        match[re.sub(r'\(\d+\)', "", link.text_content()) + '_link'] = link.attrib['href']
                else:
                    match[k] = re.sub(r'\s+', " ", el.text_content())
            data.append(match)
    return data
def get_ticket_count_for_current_lottery(date, html):
    html = html.replace("\r", '').replace("\n", '')
    match = re.search('<div class="row">\s*<h2>Ziehung\s\w\w\,\s*' + date +
                      '<\/h2>\s*(.*)<!--\/.row-->', html)
    if match is None:
        return 0
    else:
        return len(re.findall('/Schein-Nummer/', match[1]))
def load_mds_extractive_summaries(summaries_tar: str) -> Dict[str, Dict[int, List[List[str]]]]:
    summaries = defaultdict(lambda: defaultdict(list))
    with tarfile.open(summaries_tar, 'r') as tar:
        for member in tar.getmembers():
            if member.isfile():
                if member.name.startswith('./extracts_abstracts/d'):
                    path = member.name.split('/')
                    cluster = path[2][:-2]
                    filename = path[-1]
                    if filename in ['200e', '400e']:
                        length = int(filename[:-1])
                        html = tar.extractfile(member).read().decode()
                        # There is a typo in this particular file where the closing
                        # tag is actually an opening tag, and it messes up the parse
                        if member.name.endswith('d118if/200e'):
                            html = html.replace('of <s>', 'of </s>')
                        tree = lxml.html.document_fromstring(html)
                        labels = []
                        for node in tree.xpath('//s'):
                            doc = node.get('docid')
                            num = int(node.get('num'))
                            index = num - 1
                            labels.append((doc, index))
                        annotator = path[2][-1].upper()
                        summary = {'annotator': annotator, 'labels': labels}
                        if labels in summaries[cluster][length]:
                            print(f'Cluster {cluster} has duplicate extractive summaries of length {length}')
                        else:
                            summaries[cluster][length].append(summary)
    return summaries
def get_html(self):
    html = self.read_variable("document.documentElement.innerHTML")
    if not html:
        return ""
    for encoding in encodings:
        header = 'charset=%s' % encoding
        if header in html:
            html = html.replace(header, 'charset=utf-8')
            break
    parser = lxml.html.HTMLParser()
    tree = lxml.etree.fromstring(html, parser)
    head = tree.find('head')
    if head is not None:
        base = tree.find('head/base')
        if base is None:
            base = lxml.html.Element("base")
            head.insert(0, base)
        uri = self.get_main_frame().get_uri()
        if uri is None:
            return html
        base.attrib['href'] = os.path.dirname(uri)
    return lxml.html.tostring(tree, encoding="utf-8")
def find_ID(name):  # name is the title of the show
    try:
        url1 = 'https://movie.douban.com/j/subject_suggest?q='
        url2 = urllib.parse.quote(name)  # URLs only allow a subset of ASCII; other characters (e.g. Chinese) must be percent-encoded
        url = url1 + url2  # link for this show; the encoded name is appended to the query
        html = requests.get(url)  # fetch the html page
        html = html.content.decode()  # decode the response content as UTF-8
        html_list = html.replace('\/', '/')  # turn every \/ in the html into / (purely for readability)
        html_list = html_list.split('},{')  # split each entry on the page into its own list element

        # Regexes to pull the wanted information out of the html (find the id by title)
        str_title = '"title":"' + name + '"'  # match the show title name
        pattern_title = re.compile(str_title)
        str_id = '"id":"' + '[0-9]*'  # match the show's id value
        pattern_id = re.compile(str_id)

        # Extract the corresponding ID from each item of html_list
        id_list = []  # collected IDs
        for l in html_list:
            find_results_title = re.findall(pattern_title, l, flags=0)  # entries whose title matches name
            if find_results_title != []:  # if there is an entry with title == name, i.e. a match
                find_results_id = re.findall(pattern_id, l, flags=0)  # pull the id out of that matching item
                id_list.append(find_results_id)  # store the id in id_list

        # Several IDs may match (same title, different shows); pair the name with each produced id
        # so that the two lists line up
        name_list = [name] * len(id_list)
        # Normalize id_list into a plain list of id strings
        id_list = str(id_list).replace('[', '').replace(']', '').replace("'", '').replace('"id":"', '').replace(' ', '')
        id_list = id_list.split(',')
    except:
        # If the code above cannot run (e.g. the page cannot be fetched), report the show name
        print('ERROR:', name)
    return id_list[0]
def build_html_from_post(post):

    def entity_text(e):
        return post['text'][e['pos']:e['pos'] + e['len']]

    link_builder = lambda l: "<a href='%s'>%s</a>" % (l['url'], entity_text(l))

    # map starting position, length of entity placeholder to the replacement html
    entity_map = {}
    for entity_key, builder in [('links', link_builder)]:
        for entity in post.get('entities', {}).get(entity_key, []):
            entity_map[(entity['pos'], entity['len'])] = builder(entity)

    # replace strings with html
    html_pieces = []
    text_idx = 0  # our current place in the original text string
    for entity_start, entity_len in sorted(entity_map.keys()):
        if text_idx != entity_start:
            # if our current place isn't the start of an entity, bring in text until the next entity
            html_pieces.append(post.get('text', "")[text_idx:entity_start])

        # pull out the entity html
        entity_html = entity_map[(entity_start, entity_len)]
        html_pieces.append(entity_html)

        # move past the entity we just added
        text_idx = entity_start + entity_len

    # clean up any remaining text
    html_pieces.append(post.get('text', "")[text_idx:])
    html = ''.join(html_pieces)
    html = html.replace('\n', '<br>')
    # TODO: link to schema
    return '<span>%s</span>' % (html)
def dump_etree_html(etree, tidy=False, indent=True):
    """Renders an Element Tree (lxml.etree) as HTML (bytes)"""
    if tidy:
        return '\n'.join(i for i in walk_etree(etree, indent))
    else:
        html = lxml.etree.tostring(etree, encoding='unicode')
        return html.replace(' ', '')
def _pdf_urls(self):
    if self._is_shelf():
        return None
    if self.pdf_count is not None:
        return self.pdf_urls
    self.pdf_count = 0
    pdf_hrefs = []
    pdfs = self.tree_html.xpath("//a[contains(@href,'.pdf')]")
    for pdf in pdfs:
        try:
            pdf_hrefs.append(pdf.attrib['href'])
        except KeyError:
            pass
    pdfs = self.tree_html.xpath("//a[contains(@href,'pdfpdf')]")
    for pdf in pdfs:
        try:
            if pdf.attrib['href'] not in pdf_hrefs:
                pdf_hrefs.append(pdf.attrib['href'])
        except KeyError:
            pass
    pdfs = self.tree_html.xpath("//a[contains(@href,'pdf')]")
    for pdf in pdfs:
        try:
            if pdf.attrib['href'].endswith("pdf") and pdf.attrib['href'] not in pdf_hrefs:
                pdf_hrefs.append(pdf.attrib['href'])
        except KeyError:
            pass
    pdfs = self.tree_html.xpath("//a[contains(@onclick,'.pdf')]")
    for pdf in pdfs:
        # window.open('http://graphics.samsclub.com/images/pool-SNFRound.pdf','_blank')
        try:
            url = re.findall(r"open\('(.*?)',", pdf.attrib['onclick'])[0]
            if url not in pdf_hrefs:
                pdf_hrefs.append(url)
        except IndexError:
            pass
    pdfs = self.tree_html.xpath("//a[contains(@onclick,'pdf')]")
    for pdf in pdfs:
        # window.open('http://graphics.samsclub.com/images/pool-SNFRound.pdf','_blank')
        try:
            url = re.findall(r"open\('(.*?)',", pdf.attrib['onclick'])[0]
            if url not in pdf_hrefs and url.endswith("pdf"):
                pdf_hrefs.append(url)
        except IndexError:
            pass
    # http://content.webcollage.net/sc/smart-button?ird=true&channel-product-id=prod8570143
    url = "http://content.webcollage.net/sc/smart-button?ird=true&channel-product-id=%s" % self._product_id()
    html = urllib.urlopen(url).read()
    # \"src\":\"\/_cp\/products\/1374451886781\/tab-6174b48c-58f3-4d4b-8d2f-0d9bf0c90a63
    # \/552b9366-55ed-443c-b21e-02ede6dd89aa.mp4.mobile.mp4\"
    m = re.findall(r'wcobj="([^\"]*?\.pdf)"', html.replace("\\", ""), re.DOTALL)
    pdf_hrefs += m
    pdf_hrefs = [r for r in pdf_hrefs if "JewelryDeliveryTimeline.pdf" not in r]
    if len(pdf_hrefs) < 1:
        return None
    self.pdf_urls = pdf_hrefs
    self.pdf_count = len(self.pdf_urls)
    return pdf_hrefs
def get_ticket_count_for_current_lottery(date, html):
    html = html.replace("\r", '').replace("\n", '')
    match = re.search(
        '<h4>Ziehung\s\w\w\s*' + date + '<\/h4>(.*)<div class="tab-pane', html)
    if match is None or match[1] == '':
        return 0
    else:
        return len(re.findall('lotto_balls', match[1]))
def _clean_html(self, html):
    html = html.replace('\\', '')
    html = re.sub('[\n\t\r]', '', html)
    html = re.sub('<!--[^>]*-->', '', html)
    html = re.sub('</?(?!(ul|li|br))\w+[^>]*>', '', html)
    html = re.sub('&nbsp;', ' ', html)  # assumed entity spelling; the literal was decoded away
    html = re.sub('\s+', ' ', html)
    return re.sub('> <', '><', html).strip()
def process_external_tools_link(item, html):
    """
    Replace $CANVAS_OBJECT_REFERENCE$/external_tools/retrieve with appropriate external link
    """
    external_tool_query = urllib.parse.urlparse(item).query
    external_tool_url = urllib.parse.parse_qs(external_tool_query).get("url", [""])[0]
    html = html.replace(item, external_tool_url)
    return html
def to_html(original_rtf, fixed_rtf):
    html = None
    from sh import unrtf
    with NamedTemporaryFile() as xml:
        xml.write(fixed_rtf)
        xml.flush()
        html = bytes(unrtf(xml.name))
    for u in get_unencoded(original_rtf):
        html = html.replace(UNDECODED, u, 1)
    html = html.decode("latin-1")
    for match, correct in get_unencoded_unicode(original_rtf):
        html = html.replace(UNDECODED_UNICODE, correct, 1)
    # assumed entity spellings; the original literals were decoded away
    return html.replace("&gt;", ">").replace("&lt;", "<")
def scrape_transcript(html):
    html = html.replace('&nbsp;', ' ')  # assumed entity spelling; the literal was decoded away
    splitted = re.split(r'={30}.*', html)
    info_html = splitted[0]
    script_html = splitted[1]
    info = parse_episode_info(info_html)
    utterances = parse_script(script_html)
    return (info, utterances)
def getTodos(projects, objects):
    """ Get todos for each project """
    tags_dict = getTags(objects)
    for project in projects:
        for ref_id in project['ref_ids'].split():
            for object in objects:
                if object.attributes['id'].value == ref_id:
                    attribute_nodes = object.getElementsByTagName("attribute")
                    title = ""
                    content = ""
                    datemodified = ""
                    datecreated = ""
                    datecompleted = ""
                    tags = ""
                    for attribute_node in attribute_nodes:
                        if attribute_node.attributes['name'].value == 'title':
                            if attribute_node.childNodes:
                                title = attribute_node.childNodes[0].nodeValue.encode("utf-8")
                            break
                    # Check if todo has a note attached
                    if title:
                        for attribute_node in attribute_nodes:
                            # <attribute name="datemodified" >309306984.40529602766036987305
                            if attribute_node.attributes['name'].value == 'datemodified':
                                datemodified = convertCocoaEpoch(attribute_node.childNodes[0].
                                                                 nodeValue.encode("utf-8"))
                            # <attribute name="datecreated" >306520491.00000000000000000000
                            if attribute_node.attributes['name'].value == 'datecreated':
                                datecreated = convertCocoaEpoch(attribute_node.childNodes[0].
                                                                nodeValue.encode("utf-8"))
                            # <attribute name="datecompleted" type="date">292880221.18648099899291992188
                            if attribute_node.attributes['name'].value == 'datecompleted':
                                datecompleted = convertCocoaEpoch(attribute_node.childNodes[0].
                                                                  nodeValue.encode("utf-8"))
                            if attribute_node.attributes['name'].value == 'content':
                                content = attribute_node.childNodes[0].nodeValue
                                # .encode("utf-8")  # let's encode in writeOutline
                                # I think we need to translate all these things
                                html = content.replace('\\u3c00', '<').replace('\\u3e00', '>')
                                html = html.replace('\u2600', '&')
                                html = lxml.html.fromstring(html)
                                content = html.text_content().split('\n')
                                for l in html.iterlinks():
                                    content += [l[2]]
                        relationship_nodes = object.getElementsByTagName("relationship")
                        for relationship_node in relationship_nodes:
                            if relationship_node.attributes['name'].value == 'tags':
                                try:
                                    tags_id = relationship_node.attributes['idrefs'].value
                                    tags = [tags_dict[t_id] for t_id in tags_id.split()]
                                except:
                                    tags = ""
                        project['todos'].append([title, content, datecreated,
                                                 datemodified, datecompleted, tags])
    return projects
def _parse_book_info(html):
    """Parse Douban book info (author, press, publish date, price).

    :param html(string): raw html of the book-info section
    """
    end_flag = 'END_FLAG'
    html = html.replace('<br>', end_flag)
    html = html.replace('<br/>', end_flag)
    doc = lxml.html.fromstring(html)
    text = doc.text_content()
    pattern = r'{}[::](.*?){}'
    result = dict()
    for key, column in [('author', '作者'), ('press', '出版社'),
                        ('publish_date', '出版年'), ('price', '定价')]:
        result[key] = re.search(pattern.format(column, end_flag), text,
                                re.I | re.DOTALL).group(1).strip()
    return result
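# A minimal usage sketch for _parse_book_info with made-up Douban-style markup
# (the real page layout may differ; the parser only relies on the <br/> separators):
sample = ('<span>作者:</span> 路遥<br/>'
          '<span>出版社:</span> 人民文学出版社<br/>'
          '<span>出版年:</span> 2005-1<br/>'
          '<span>定价:</span> 64.00元<br/>')
print(_parse_book_info(sample))
# -> {'author': '路遥', 'press': '人民文学出版社', 'publish_date': '2005-1', 'price': '64.00元'}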
def fix_minor_whitespace(html):
    html = html.replace('<b>', ' <b>')
    html = html.replace('</b>', '</b> ')
    html = html.replace('<code', ' <code')
    html = html.replace('</code>', '</code> ')
    html = html.replace('<a href', ' <a href')
    html = html.replace('</a>', '</a> ')
    titleHook = '"glyphicon glyphicon-home"></span>'
    html = html.replace(titleHook, titleHook + ' ')
    html = html.replace('"/>', '"/> ')
    return html
def stripemptyhtml(url):
    html = url
    if checkurl(url):
        html = gethtml(url)
    if not html:
        return None
    for anel in ('li', 'ul', 'ol'):
        repme = "<%s></%s>" % (anel, anel)  # assumed: substitute the tag name into the empty-element pattern
        html = html.replace(repme, "")
    return html
def embed_map(m):
    from IPython.display import HTML

    m.save("index.html")
    with open("index.html") as f:
        html = f.read()

    iframe = '<iframe srcdoc="{srcdoc}" style="width: 100%; height: 750px; border: none"></iframe>'
    srcdoc = html.replace('"', "&quot;")  # escape quotes for the srcdoc attribute (entity spelling restored)
    return HTML(iframe.format(srcdoc=srcdoc))
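# A quick usage sketch for embed_map, assuming folium is installed and m is a folium.Map;
# the returned HTML object renders the saved map inline in a Jupyter notebook.
import folium

m = folium.Map(location=[48.8566, 2.3522], zoom_start=12)
embed_map(m)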
def striphtml(html):
    t = html.replace('<br>', '\n')
    try:
        dom = lxml.html.fromstring(t)
        t = dom.text_content()
    except lxml.etree.XMLSyntaxError as e:
        logger.warning(repr(e.message))
        pass
    return t
def parse_call_record(html):
    records = []
    doc_string = html.replace("<script>formateB", "<tr class='call_record'><script>formateB")
    doc = lxml.html.document_fromstring(doc_string)
    records_elements = doc.xpath("//tr[@class='call_record']")
    for record_element in records_elements:
        record = parse_record_element(record_element)
        records.append(record)
    return records
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = html_decode(html)
    return html.strip()
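# Example call for clean_html with a made-up snippet; assumes html_decode (used above)
# maps entities such as &nbsp; back to plain characters.
print(clean_html('<p>Hello&nbsp;<b>world</b><br/>second&nbsp;line</p>'))
# -> 'Hello world\nsecond line' (modulo what html_decode returns for &nbsp;)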
def html(self):
    'Returns an HTML citation.'
    s = ('{authors}, {title}, {journal}, {volissue}, {pages}, '
         '({date}). {doi}.')

    au_link = ('<a href="http://www.scopus.com/authid/detail.url'
               '?origin=AuthorProfile&authorId={0}">{1}</a>')

    if len(self.authors) > 1:
        authors = ', '.join([au_link.format(a.auid,
                                            (str(a.given_name) + ' ' + str(a.surname)))
                             for a in self.authors[0:-1]])
        authors += (' and ' +
                    au_link.format(self.authors[-1].auid,
                                   (str(self.authors[-1].given_name) + ' ' +
                                    str(self.authors[-1].surname))))
    else:
        a = self.authors[0]
        authors = au_link.format(a.auid, str(a.given_name) + ' ' + str(a.surname))

    title = '<a href="{link}">{title}</a>'.format(link=self.scopus_link, title=self.title)

    jname = self.publicationName
    sid = self.source_id
    jlink = ('<a href="http://www.scopus.com/source/sourceInfo.url'
             '?sourceId={sid}">{journal}</a>')
    journal = jlink.format(sid=sid, journal=jname)

    volume = self.volume
    issue = self.issueIdentifier
    if volume and issue:
        volissue = '<b>{0}({1})</b>'.format(volume, issue)
    elif volume:
        volissue = '<b>{0}</b>'.format(volume)
    else:
        volissue = 'no volume'

    date = self.coverDate
    if self.pageRange:
        pages = 'p. {0}'.format(self.pageRange)
    elif self.startingPage:
        pages = 'p. {self.startingPage}'.format(self=self)
    elif self.article_number:
        pages = 'Art. No. {self.article_number}, '.format(self=self)
    else:
        pages = '(no pages found)'

    doi = '<a href="http://dx.doi.org/{0}">doi:{0}</a>'.format(self.doi, self.doi)

    html = s.format(**locals())
    return html.replace('None', '')
def images(html):
    d = Download()
    r = Rehost()
    r.page_html = html
    pool = ThreadPool(3)
    for image in r.get_img_list():
        path = os.path.join('rehost', Rehost.today(), "%s.%s" % (uuid.uuid4(), Rehost.ext(image)))
        # d.download(image, os.path.join(settings.MEDIA_ROOT, path))
        pool.add_task(d.download, image, os.path.join(settings.MEDIA_ROOT, path))
        html = html.replace(image, "/media/%s" % path)
    pool.wait_completion()
    del d
    del r
    return html
def html_node(html):
    '''Returns an ``lxml.Element`` suitable for ``slice_node``.'''
    if not isinstance(html, unicode):
        html = unicode(html, 'utf-8')
    # The catch here is that lxml's HTML parser replaces *some* HTML
    # entity/char escape sequences with their proper Unicode codepoint
    # (e.g., `&amp;` -> `&` and `&quot;` -> `"`).
    # But not all such entities are replaced (e.g., `&Hat;` -> `^`).
    # We can either special case the entities that lxml does replace
    # (no thanks), or just escape every `&` in the HTML, which starts
    # every entity/char escape sequence.
    #
    # We care about this because changing `&` to `&amp;` in the original
    # HTML will throw off indexing.
    return lxml.html.fromstring(html.replace(u'&', u'&amp;'))
def to_xhtml (self, html, base_url):
    # assumed entity spellings; the original literals were decoded away
    html = html.replace (u'&nbsp;', u' ')
    html = html.replace (u'&mdash;', u'—')

    outputfilename = os.path.join (options.outputdir, options.outputfile)
    debugfilename = os.path.splitext (outputfilename)[0] + '.debug.html'

    try:
        os.remove (debugfilename)
    except OSError:
        pass

    if options.verbose > 1:
        with open (debugfilename, 'w') as fp:
            fp.write (html.encode ('utf-8'))

    try:
        xhtml = etree.fromstring (
            html,
            lxml.html.XHTMLParser (),
            base_url = base_url)
    except etree.ParseError, what:
        error ("etree.fromstring says %s" % what)
        raise
def scrape_page(page):
    logInfo("scraping page=" + str(page), iter_=page)
    res = list()
    url = URL_BASE + str(page)
    html = khgscrape(url)
    html = html.replace('"catalog-item catalog-item-odd"', '"catalog-item"')
    root = lxml.html.fromstring(html)
    divs = root.cssselect("div[class='catalog-item']")
    for div in divs:
        data = parseDiv(div)
        print data
        res.append(data)
        # data = {"scraper":scraper,"scraperc":scraperc,"user":user,"userc":userc,"language":lang,"status":stat}
        # res.append(data)
    return res
def add_spans(encoded_html):
    """Given string of encoded html, wrap each element with a span
    and class of element tag.

    e.g. <span class="my-div">&lt;div id='sample'&gt;</span><br>
    """
    # TODO: this only wraps opening element tags in spans.
    # Will need to write separate regex search to handle closing tags.

    def add_span_class(matchobj):
        return "<span class=\"my-{elem}\">&lt;{elem}".format(elem=matchobj.group(1))

    # This is the regex pattern to find the element type: &lt;([A-Z|a-z]+[0-9]*)
    # (the '&lt;'/'&gt;' spellings are assumed; the input is entity-encoded html)
    html = re.sub('&lt;([A-Z|a-z]+[0-9]*)', add_span_class, encoded_html)
    html = html.replace("&gt;", "&gt;</span><br>")
    return html
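# Hypothetical call for add_spans, matching the entity-encoded input it expects:
print(add_spans("&lt;div id='sample'&gt;&lt;p&gt;"))
# -> <span class="my-div">&lt;div id='sample'&gt;</span><br><span class="my-p">&lt;p&gt;</span><br>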
def getTitle(url):
    response = urllib.urlopen(url)
    html = response.read()
    html = html.replace(r'\"', '"')
    soup = BeautifulSoup(html.lower())
    urlTitle = soup.find('title')
    try:
        urlTitleText = urlTitle.text
    except:
        try:
            t = lxml.html.parse(url)
            urlTitleText = t.find(".//title").text
        except:
            print "title not found"
            print url
            urlTitleText = ""
    return urlTitleText.lower()
def parse_html(html, content='xml', lf_on_block=False, space_on_elements=False):
    """Parse element and return etreeElement

    <div> element is added around the HTML
    recovery is used in case of bad markup

    :param str html: HTML markup
    :param str content: use 'xml' for XHTML or non html XML, and 'html' for HTML or if you are unsure
    :param bool lf_on_block: if True, add a line feed on block elements' tail
    :param bool space_on_elements: if True, add a space on each element's tail,
        mainly used to count words with non HTML markup
    :return etree.Element: parsed element
    """
    if not isinstance(html, str):
        raise ValueError("a string is expected")
    if not html:
        return etree.Element('div')

    if content == 'xml':
        # to preserve 'carriage return' otherwise it gets stripped.
        html = html.replace('\r', '&#13;')
        parser = etree.XMLParser(recover=True, remove_blank_text=True)
        root = etree.fromstring("<div>" + html + "</div>", parser)
    elif content == 'html':
        parser = etree.HTMLParser(recover=True, remove_blank_text=True)
        root = etree.fromstring(html, parser)
        if root is None:
            root = etree.Element('div')
        else:
            root = root.find('body')
            root.tag = 'div'
    else:
        raise ValueError('invalid content: {}'.format(content))

    if lf_on_block:
        for elem in root.iterfind('.//'):
            # append \n to the tail
            if elem.tag in BLOCK_ELEMENTS:
                elem.tail = (elem.tail or '') + '\n'
            # prepend \n to the tail
            elif elem.tag in ('br',):
                elem.tail = '\n' + (elem.tail or '')
    if space_on_elements:
        for elem in root.iterfind('.//'):
            elem.tail = (elem.tail or '') + ' '
    return root
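# Usage sketch for parse_html: with content='html' the markup is wrapped in a <div>,
# and lf_on_block appends '\n' to the tail of block elements (BLOCK_ELEMENTS is the
# module-level constant referenced above).
from lxml import etree

root = parse_html('<p>one</p><p>two</p>', content='html', lf_on_block=True)
print(etree.tostring(root))
# -> b'<div><p>one</p>\n<p>two</p>\n</div>' (assuming 'p' is in BLOCK_ELEMENTS)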
def parse(db, url):
    global add
    global urls

    try:
        if not re.search('^http://', url):
            url = siteurl + "/" + url
            url = "http://" + url.replace("//", "/")

        request = urllib.request.Request(url)
        request.add_header('User-Agent', 'Flowgen/1.0 (http://floft.net)')
        page = urllib.request.urlopen(request)
        html = page.read().decode("utf-8")
        page.close()

        print("Notice: processing {}".format(url))

        # get urls
        linkOpenTag, linkCloseTag = makeHTMLTags("a")
        link = linkOpenTag + SkipTo(linkCloseTag).setResultsName("body") + linkCloseTag.suppress()
        for toks, strt, end in link.scanString(html):
            newurl = toks.startA.href
            if newurl not in urls and newurl not in visited:
                if re.search('^(/|http://' + siteurl + ')', newurl) and not \
                   re.search('(jpg|png|flac|mp3|zip|pdf)$', newurl):
                    urls.append(newurl)

        # get title
        try:
            title = re.search('<title>([^<]*)</title>', html).groups()[0]
        except:
            title = "Untitled"

        # get text
        xml = lxml.html.document_fromstring(html.replace(">", "> ").replace("<", " <"))
        text = xml.cssselect('body')[0].text_content().replace("\n", " ").strip()

        # add to database
        add.append([time(), title, url, text])
    except:
        print("Error: {} does not load".format(url))
def get_hourly_data(i):
    url = "http://www.imd.gov.in/section/nhac/aws/aws%02d.htm" % i
    html = scraperwiki.scrape(url)
    html = html.replace("&nbsp;", "")  # Lot of strings like this (entity spelling assumed)
    root = lxml.html.fromstring(html)
    date = root.cssselect("p")[0].text_content().split("/")[-1]
    observed_date = dateutil.parser.parse(date + " %02d:00" % i)
    table = root.cssselect("table")[0]
    rows = table.cssselect("tr")
    headers = rows.pop(0)
    headers = [td.text_content() for td in headers.cssselect("td")]
    for row in rows:
        cells = [td.text_content() for td in row.cssselect("td")]
        rec = dict(zip(headers, cells))
        rec["observed_date"] = observed_date
        rec["station_name"] = rec["Name"]
        del rec["Name"]
        del rec["S.No"]
        utils.save(rec)
def get_hourly_data(i):
    url = 'http://www.imd.gov.in/section/nhac/aws/aws%02d.htm' % i
    html = scraperwiki.scrape(url)
    html = html.replace('&nbsp;', '')  # Lot of strings like this (entity spelling assumed)
    root = lxml.html.fromstring(html)
    date = root.cssselect('p')[0].text_content().split('/')[-1]
    observed_date = dateutil.parser.parse(date + ' %02d:00' % i)
    table = root.cssselect('table')[0]
    rows = table.cssselect('tr')
    headers = rows.pop(0)
    headers = [td.text_content() for td in headers.cssselect('td')]
    for row in rows:
        cells = [td.text_content() for td in row.cssselect('td')]
        rec = dict(zip(headers, cells))
        rec['observed_date'] = observed_date
        rec['station_name'] = rec['Name']
        del rec['Name']
        del rec['S.No']
        utils.save(rec)
def convert_links(self, html, vals, blacklist=None):
    for match in re.findall(URL_REGEX, html):
        short_schema = self.env['ir.config_parameter'].sudo().get_param('web.base.url') + '/r/'

        href = match[0]
        long_url = match[1]

        vals['url'] = utils.unescape(long_url)

        if not blacklist or not [s for s in blacklist if s in long_url] and not long_url.startswith(short_schema):
            link = self.create(vals)
            shorten_url = self.browse(link.id)[0].short_url

            if shorten_url:
                new_href = href.replace(long_url, shorten_url)
                html = html.replace(href, new_href)

    return html
def download_html(url):
    html = ""
    try:
        time.sleep(random.randint(1, 2))
        req_header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) \
            AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                      'Accept': 'text/html;q=0.9,*/*;q=0.8',
                      'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                      'Accept-Encoding': 'gzip',
                      'Connection': 'close',
                      'Referer': None
                      }
        req_timeout = 10
        req = urllib2.Request(url, None, req_header)
        response = urllib2.urlopen(req, None, req_timeout)
        html = response.read()
        html = html.replace("&amp;", "&")  # assumed entity spelling; the literal was decoded away
        return html
    except:
        return ""
def url2lxml(self, url, xml=False):
    cache = getattr(self, '_url_cache', {})
    self._url_cache = cache

    if url in cache:
        return cache[url]

    if xml:
        xml = self.urlopen(url)
        doc = lxml.etree.fromstring(xml.bytes)
    else:
        html = self.urlopen(url)
        html = html.replace('\x00', '')
        try:
            doc = lxml.html.fromstring(html)
        except lxml.etree.XMLSyntaxError:
            return None
    doc.make_links_absolute(url)
    cache[url] = doc
    return doc
def get_dms(url):
    if url == "" or url is None:
        return {"lat": "", "lon": ""}
    try:
        html = scraperwiki.scrape("http://en.wikipedia.org" + url, None, user_agent)
    except:
        return {"lat": "", "lon": ""}
        pass
    # html = html.replace("<html ", '<html xmlns="http://www.w3.org/1999/xhtml" ')
    html = html.replace(
        '<meta charset="UTF-8" />',
        '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />')
    root = lxml.html.document_fromstring(html)
    dms = root.xpath("//span[@class='geo-dms']")
    if dms != []:
        dms = dms[0].xpath("span")
        return {"lat": dms[0].text_content(), "lon": dms[1].text_content()}
    else:
        return {"lat": "", "lon": ""}