def testRemoteInfoCore(self):
        header, body = getRequest(port=self.httpPort, path='/remote/info/core', arguments=dict(name='main'), parse=False)
        self.assertFalse('Traceback' in body, body)  # only tested for MultiLucene situation for now!
        bodyLxml = HTML(body)
        lists = bodyLxml.xpath('//ul')
        fieldList = lists[0]
        fields = fieldList.xpath('li/a/text()')
        self.assertEquals(19, len(fields))
        self.assertEqual([
                '$facets',
                '__id__',
                '__key__.field',
                'copy',
                'field1',
                'field2',
                'field3',
                'field4',
                'field5',
                'field_missing',
                'intfield1',
                'intfield2',
                'intfield3',
                'intfield_missing',
                'sorted.field2',
                'sorted.field4',
                'sorted.intfield1',
                'sorted.intfield_missing',
                'untokenized.field3',
            ], fields)

        drilldownFieldList = lists[1]
        drilldownFields = drilldownFieldList.xpath('li/a/text()')
        self.assertEquals(set(['untokenized.field2', 'untokenized.fieldHier', 'untokenized.field2.copy']), set(drilldownFields))
Example #2
def edit_message(base_url, username, password, message_id, new_body):
    url_opener = _utils.login_and_go_to_faq(base_url, username, password)

    # calculate some more URLs
    faq_url = urljoin(base_url, "faq.php")
    edit_url = urljoin(base_url, "misc.php")

    # go to the FAQ page (page with low backend complexity) to get the security token
    print("fetching security token")
    faq_response = url_opener.open(faq_url)
    faq = HTML(faq_response.read())
    token_field = faq.find(".//input[@name='securitytoken']")
    security_token = token_field.attrib["value"]

    # encode the message
    request_string = \
        "do=vsacb_editmessage&s=&securitytoken={0}&id={1}&vsacb_editmessage={2}".format(
            security_token, message_id, encode_outgoing_message(new_body)
        )
    request_bytes = request_string.encode(server_encoding)

    print("updating message")
    edit_response = url_opener.open(edit_url, data=request_bytes)
    edit_response.read()

    print("done")
Example #3
File: base.py Project: fdrong/spider_news
 def parse_xpath_content(self, url):
     result = dict()
     content = self.get_content(url)
     if not content:
         return result
     result["url"] = url
     result["md5"] = self.md5(url)
     result["creat_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     tree = HTML(content)
     for key in self.config.Xpath.keys():
         if not self.config.Xpath.get(key):
             continue
         elif isinstance(self.config.Xpath.get(key), dict):
             # 'cut' op: extract the substring between the configured start and end markers
             if self.config.Xpath[key]['op'] == 'cut':
                 pos1 = content.find(self.config.Xpath[key]['start'])
                 if pos1 != -1:
                     pos2 = content[pos1:].find(self.config.Xpath[key]['end'])
                     result[key] = content[pos1+len(self.config.Xpath[key]['start']):pos1+pos2]
                 else:
                     result[key] = ""
         else:
             list_content = tree.xpath(self.config.Xpath[key].replace('tbody/', ''))
             if list_content:
                 result[key] = "".join(list_content)
             else:
                 result[key] = ""
     result['publish_time'] = self.parse_time(result['publish_time'])
     return result
Example #4
File: landing.py Project: binjo/ekdeco
def decode_first(d):
    h = HTML(d)
    inner_js=''.join(h.xpath('//div/text()')).replace('_','')
    inner_js=inner_js.replace('&','').replace('%','')
    inner_js=inner_js.replace('=','').replace('undefined','')
    inner_js=inner_js.decode('hex')
    return inner_js
Example #5
def hijack(content):
    html = HTML(content)
    body = html.xpath('//body')[0]
    script = Element('script')
    script.text = 'alert(/hijacked/);'
    body.append(script)
    content = tostring(html)
    return content
Example #6
File: proxy.py Project: iitwebdev/lectures
 def save_download(self, url, data, index):
     page = HTML(data)
     body = page.xpath('//body')[0]
     bundles = elquery.get_elements_by_class(body, 'olpc-bundle')
     bundle = bundles[index]
     links = bundle.xpath('descendant-or-self::a[@href]')
     for link in links:
         href = urlparse.urljoin(url, link.attrib['href'])
         print 'got one page:', href
         self.store.save_page_set(href)
Example #7
def decode_first_js(data):
    h = HTML(data)
    off = get_off(h)
    off.append(0)
    for el in h.xpath("//*[@id]"):
        if el.text:
            txt = decode_payload(off, el.text)
            if not txt:
                continue
            yield txt
    def testRemoteInfoCore(self):
        header, body = getRequest(port=self.httpPort, path='/remote/info/core', arguments=dict(name='main'), parse=False)
        bodyLxml = HTML(body)
        lists = bodyLxml.xpath('//ul')
        fieldList = lists[0]
        fields = fieldList.xpath('li/a/text()')
        self.assertEquals(12, len(fields))

        drilldownFieldList = lists[1]
        drilldownFields = drilldownFieldList.xpath('li/a/text()')
        self.assertEquals(['untokenized.field2', 'untokenized.fieldHier'], drilldownFields)
Example #9
 def get(self, url, depth=1):
     counter_processed.update((depth, ))
     logging.info('[{}] Processing {} ({}).'.format(threading.current_thread().name, url, depth))
     rsp = self.session.get(url)
     rsp.encoding = 'GB2312'
     html = HTML(rsp.text)
     urls = html.xpath('//a/@href')
     urls = list(set(filter(lambda url: re.search(self.url_loc, url), urls)))
     for url in urls:
         self.data.put((url, depth + 1))
     counter.update([depth + 1] * len(urls))
def main(wf):
    kw = wf.args[0]
    r = web.get(kw)
    r.raise_for_status()
    reg = re.compile('<ul id="dl-btn">.*</ul>', flags=re.DOTALL + re.MULTILINE)
    match = reg.search(r.text)
    if match:
        html = match.group(0)
        node = HTML(html).find('.//a')
        log.info(node.text)
        call(["open", node.get('href')])
Example #11
File: base.py Project: fdrong/spider_news
 def parse_urls(self):
     content = self.get_content(self.config.Root)
     if content:
         tree = HTML(content)
         url_list = tree.xpath(u"//a/@href")
         pattern = re.compile(self.config.Regex)
         url_joined_list = [urlparse.urljoin(self.config.Root, url) for url in url_list]
         url_joined_list = list(set(url_joined_list))   # de-duplicate
         return filter(pattern.match, url_joined_list)
     else:
         return []
Example #12
File: landing.py Project: binjo/ekdeco
def doit(d):
    if '<div' in d:
        d = decode_first(d)
        
    for p in decode_payloads(d):
        urls = []
        if 'application/x-shockwave-flash' in p:
            t = 'flash'
            x=p.strip().splitlines()[-2].replace("'",'"').split('"')
            url_b=x[1].split('/')[1]
            sh =x[-2].decode('hex').strip("\x00")
            urls = re.findall('"(/'+url_b+'.*?)"',p)
            payload_url = re.findall('(http.*)',sh)[0]
            
        elif 'data:application/x-silverlight' in p:
            t = 'silverlight'
            x = HTML(re.findall('"(.*?)"',p)[0])
            for i in x.xpath('//param'):
                if i.attrib['name'] == 'source':
                    urls = [i.attrib['value']]
                elif i.attrib['name'] == 'initParams':
                    vals = dict(map(lambda x: tuple(x.split('=')),i.attrib['value'].split('&')))
                    sh   = vals['shell32'].decode('hex').strip("\x00")
                    payload_url = re.findall('(http.*)',sh)[0]
                    
        elif 'CollectGarbage' in p:
            t = 'ie'
            x= p.strip().splitlines()[-1].replace("'",'"').split('"')
            payload_url = x[1] + ' rc4 key: %s' % x[-2]
            sh = re.findall('"([0-9a-f]+)"\+',p,re.I)[0].decode('hex')            
        else:
            t = 'unknown'

        sh_hash = hashlib.sha256(sh).hexdigest()
        print '[+] found %s exploit' % t
        if urls:
            print '[+] additional exploits:', ', '.join(urls)
        print '[+] payload url:', payload_url
        print '[+] shellcode hash:',sh_hash

        if args.save:
            n = args.dir + '/exp.%s.%s.txt' % (t,hashlib.sha256(p).hexdigest())
            with open(n,'w') as f:
                f.write(p)
            print '[+] js saved to', n
            if sh:
                n = args.dir + '/exp.%s.%s.sh.bin' % (t,sh_hash)
                with open(n,'w') as f:
                    f.write(sh)
                print '[+] shellcode saved to', n
Example #13
def link_tag_url(html):
    '''
    extracts a relative url from an HTML document's link tag, like

        <link rel="shortcut icon" href="images-template/favicon.ico" type="image/x-icon" />

    '''
    from lxml.etree import HTML
    doc = HTML(html)
    link_tag = doc.find('.//link[@rel="shortcut icon"]')
    if link_tag is not None:
        favicon_url = link_tag.get('href', '')
        if favicon_url:
            return favicon_url
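
A small usage sketch for link_tag_url; the sample markup below is invented and only mirrors the <link rel="shortcut icon"> shape the docstring describes.

sample_html = b"""
<html>
  <head>
    <link rel="shortcut icon" href="images-template/favicon.ico" type="image/x-icon" />
  </head>
  <body></body>
</html>
"""

# Prints the relative href ('images-template/favicon.ico'); the function
# returns None when no matching link tag is present.
print(link_tag_url(sample_html))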
Example #14
def parase_response(start_url):
    flag = True
    try:
        # with open('2.txt','r') as f:
        #     start_url=f.read().strip()
        while flag:
            product_code_list = []
            print('start_url:', start_url)
            r = session.get(url=start_url, proxies=random.choice(proxies_list))
            print(r.status_code)
            if r.status_code != 200:
                # once blocked, the request address could be rotated here (left as a TODO)
                raise Exception('地址被屏蔽')
            print('解析页面获取商品')
            html = HTML(r.text)
            products_html = html.xpath('//*[@id="Products"]/ul/li/div')
            conn = sqlite3.connect('tuhu_db.sqlite3')
            with conn:
                cur = conn.cursor()
                if products_html:
                    for product in products_html:
                        product_name = is_null(product.xpath('a/text()'))
                        product_url = is_null(product.xpath('a/@href'))
                        product_price = is_null(
                            product.xpath('div/strong/text()'))
                        product_code = is_null(
                            product.xpath('form/input[1]/@value'))
                        insert_product_sql = "INSERT INTO product_des (product_name,product_url,product_price,product_code) VALUES (?,?,?,?)"
                        cur.execute(insert_product_sql,
                                    (product_name.strip(), product_url,
                                     product_price, product_code))
                        product_code_list.append(product_code)

                conn.commit()

            for code in product_code_list:
                parse_comment(code)
            # stop the loop when there is no next page
            start_url = is_null(html.xpath('//*[@class="last-child"]/@href'))
            if not start_url:
                flag = False
    except Exception as e:
        print(e)
        with open('2.txt', 'a') as f:
            f.write(start_url + str(e) + '\n')
        conn = sqlite3.connect('tuhu_db.sqlite3')
        conn.commit()
        conn.close()
Example #15
 def doc(self):
     """获取后缀名"""
     url = 'https://linshiyouxiang.net/'
     headers = {
         'accept':
         'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;'
         'q=0.8,application/signed-exchange;v=b3;q=0.9',
         'accept-encoding':
         'gzip, deflate, br',
         'accept-language':
         'zh-CN,zh;q=0.9',
         'cache-control':
         'no-cache',
         'pragma':
         'no-cache',
         'referer':
         'https://linshiyouxiang.net/',
         'sec-ch-ua':
         '"Google Chrome";v="87", " Not;A Brand";v="99", "Chromium";v="87"',
         'sec-ch-ua-mobile':
         '?0',
         'sec-fetch-dest':
         'document',
         'sec-fetch-mode':
         'navigate',
         'sec-fetch-site':
         'same-origin',
         'sec-fetch-user':
         '******',
         'upgrade-insecure-requests':
         '1',
         'user-agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
         'Chrome/87.0.4280.88 Safari/537.36',
     }
     self.session.headers = headers
     response = self.session.get(url, timeout=timeout)
     if response.status_code == 200:
         context = HTML(response.text)
         self.address = context.xpath(
             '//*/ul[@class="dropdown-menu dropdown-menu-right"]/li/a/@data-mailhost'
         )
         if self.address:
             return True
         else:
             return False
     else:
         return None
Example #16
    def city(self):
        """
        Crawl the city list for each category
        :return:
        """
        count = red_cli.scard('jr_category')
        while count:

            #没爬取一次分类换一个IP
            proxy = choice(get_proxy())["ip"]
            proxy = {
                'http': 'http://{}'.format(proxy),
                'https': 'https://{}'.format(proxy)
            }

            data_category = red_cli.srandmember('jr_category')
            cate = eval(data_category)["category_name"]
            cate_url = eval(data_category)["category_url"]
            tag = 0

            item = EasyDict()
            item.category = cate
            resp = self.feach.get_req(url=cate_url, proxies=proxy)
            if resp != False:
                etre = HTML(resp)
                city_urls = etre.xpath(
                    '//div[@class="filter-item"]/div[last()]/a/@href')
                city_names = etre.xpath(
                    '//div[@class="filter-item"]/div[last()]/a/text()')

                for _ in range(len(city_names)):
                    if city_names[_] == "全部":
                        continue
                    else:
                        item.city_url = "https://www.jvrong.com" + str(
                            city_urls[_])
                        item.city_name = city_names[_]
                        red_cli.sadd('jr_city', str(item))
            else:
                tag = 1

            if tag == 1:
                print('请求失败')
                pass
            else:
                pprint('数据插入redis全部成功')
                red_cli.srem('jr_category', data_category)
                count -= 1
Example #17
 def cpxx(self, resp):
     """
     Financing information
     :return:
     """
     cpxx = self.sd_xpath.cpxx(HTML(resp))
     return cpxx
Example #18
 async def get(self, url, proxy='', retry=5):
     response = None
      # retry up to `retry` times
     for i in range(retry):
         try:
             response = await self.session.get(
                 url,
                 headers=self.headers,
                 proxy='' if proxy == None else proxy,
                 timeout=5)
             if 'content-type' in response.headers and 'html' in response.content_type:
                 response.xpath = HTML(await response.text()).xpath
             if response.content_type == 'application/json':
                 response.json_data = await response.json()
             if response.status != 200 or self.except_content_type != None and response.content_type != self.except_content_type:
                 if proxy != None:
                     await self.__update_proxy()
                     proxy = self.proxy
                 continue
             break
         except (Exception, BaseException, TimeoutError) as e:
             if proxy != None:
                 await self.__update_proxy()
                 proxy = self.proxy
             continue
         break
     if response != None and response.status == 200:
         self.succeed_proxies.add(proxy)
     else:
         self.succeed_proxies.discard(self.proxy)
         if proxy != None:
             await self.__update_proxy()
     return response
Example #19
    def _get_description(self, card: etree.HTML) -> str:
        description: str = ""
        for node in card.xpath(".//div[contains(@class, \"desc\")]"):
            description = node.text.strip()
            break

        return description
Example #20
    def _get_price(self, card: etree.HTML) -> str:
        price: str = ""
        for node in card.xpath(".//*[contains(@class, \"price\")]"):
            price = node.text.strip()
            break

        return price
Example #21
 def jpxx(self, resp):
     """
     Financing information
     :return:
     """
     jpxx = self.sd_xpath.jpxx(HTML(resp))
     return jpxx
Example #22
def get_detail(url):
    response = requests.get(url, headers=detail_headers)
    html = HTML(response.text)
    # print(response.text)
    contentList = html.xpath('//div[@class="article-content"]//text()')
    content = ''.join(contentList)

    savedate = html.xpath(
        'string(//div[@class="article-source article-source-bjh"]/span[@class="date"])'
    )
    savetime = html.xpath(
        'string(//div[@class="article-source article-source-bjh"]/span[@class="time"])'
    )
    publishDateStr = '2019-' + savedate + ' ' + savetime

    return content, publishDateStr
Example #23
 def qyyw(self, resp):
     """
     Financing information
     :return:
     """
     qyyw = self.sd_xpath.qyyw(HTML(resp))
     return qyyw
Example #24
 async def __get_proxy_from_xila(self, session):
      '''
      Crawl data from xiladaili.
      '''
     try:
         for page in range(1, 5):
             url = f'http://www.xiladaili.com/gaoni/{page}/'
             res = await session.get(url, timeout=10)
             text = await res.text()
             html = HTML(text)
             for data in html.xpath('//table/tbody/tr'):
                 ip = data.xpath('.//td[1]/text()')[0]
                 await self.put_proxy(f'http://{ip}', '西拉代理')
     except Exception as e:
         logging.exception(e)
         pass
Example #25
File: html.py Project: DarioGT/lino
    def raw(self, raw_html):
        """Parses the given string into an HTML Element."""
        # print 20151008, raw_html

        # the lxml parser wraps `<html><body>...</body></html>` around
        # the snippet, but we don't want it.
        return HTML(raw_html)[0][0]
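
A minimal sketch (not part of the project) of the wrapping behaviour the comment above relies on: lxml's HTML parser puts <html><body>...</body></html> around a bare snippet, so indexing [0][0] recovers the original top-level element for simple body content.

from lxml.etree import HTML, tostring

snippet = '<p>hello <b>world</b></p>'   # hypothetical body-level snippet
wrapped = HTML(snippet)
print(wrapped.tag)               # 'html'
print(wrapped[0].tag)            # 'body'
print(tostring(wrapped[0][0]))   # b'<p>hello <b>world</b></p>'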
Example #26
    def _get_img_url(self, card: etree.HTML) -> str:
        img_url: str = ""
        for node in card.xpath(".//img[@data-original]"):
            img_url = node.attrib["data-original"]
            break

        return img_url
Example #27
def main():
    r = requests.get(BASE_URL)

    for filename in HTML(
            r.content).xpath('//a[starts-with(@href, "tl_2012_")]/@href'):
        print("Downloading", filename)
        download_file(posixpath.join(BASE_URL, filename))
Example #28
    def _get_location(self, card: etree.HTML) -> str:
        location: str = ""
        for node in card.xpath(".//*[contains(@class, \"location\")]"):
            location = node.text.strip()
            break

        return location
Example #29
 def zpzz(self, resp):
     """
     Patent information
     :return:
     """
     zpzz = self.sd_xpath.zpzz(HTML(resp))
     return zpzz
Example #30
 def wxgz(self, resp):
     """
     Financing information
     :return:
     """
     wxgz = self.sd_xpath.wxgz(HTML(resp))
     return wxgz
Example #31
 def rjzz(self, resp):
     """
     Patent information
     :return:
     """
     rjzz = self.sd_xpath.rjzz(HTML(resp))
     return rjzz
Example #32
 def sbxx(self, resp):
     """
     Trademark information
     :return:
     """
     sbxx = self.sd_xpath.sbxx(HTML(resp))
     return sbxx
Example #33
 def zlxx(self, resp):
     """
     Patent information
     :return:
     """
     zlxx = self.sd_xpath.zlxx(HTML(resp))
     return zlxx
def main(wf):
    if len(wf.args):
        kws = ' '.join(wf.args)
        url = 'http://www.mac-torrent-download.net/?s={}&x=0&y=0&open=1'.format(
            kws)
        text = get_recent_list(url)
        try:
            dd_arr = HTML(text).findall('.//dd')
            for dd in dd_arr:
                a = dd.find('.//a')
                href, title = a.get('href') + '?open=1', a.text.strip()
                info = dd.find('.//div[@class="blog_info"]')
                tags = ' / '.join([a.text for a in info.findall('.//a')])
                time = info.find('.//i').tail.strip()
                wf.add_item(title=title,
                            subtitle='{} {}'.format(time, tags),
                            valid=True,
                            arg=href,
                            icon=ICON)
        except Exception as e:
            wf.add_item(title='槽糕!没找到 “{}”'.format(kws),
                        subtitle='去“mac-torrent-download.net”手动搜索看看?',
                        valid=True,
                        arg=url,
                        icon='icon.png')
        finally:
            wf.send_feedback()
Example #35
 def zbxx(self, resp):
     """
     Financing information
     :return:
     """
     zbxx = self.sd_xpath.zbxx(HTML(resp))
     return zbxx
Example #36
 def zzzs(self, resp):
     """
     Financing information
     :return:
     """
     zzzs = self.sd_xpath.zzzs(HTML(resp))
     return zzzs
Example #37
 def searchUrlMagnetCilifeng(self, content:str, page:int=1) -> None:
     """http://www.cilifeng.me/"""
     try:
         domain_name = "磁力风"
         search_url = "http://www.cilifeng.me/search?word={}&page={}".format(content, page)
         search_response = self._session.get(url=search_url)
         html = HTML(search_response.text)
         lis = html.xpath("//ul[@class='alt']/li")
         if lis and not self._stop:
             for li in lis:
                 url = "http://www.cilifeng.me" + li.xpath(".//a/@href")[0].replace("../../..", "")
                 result = (domain_name, url)
                 self._detail_urls.put(result)
             self.searchUrlMagnetCilifeng(content=content, page=page + 1)
     except: pass
     finally: return
Example #38
 def gdxx(self, resp):
     """
     Parse shareholder information
     :return:
     """
     gdxx = self.sd_xpath.gdxx(HTML(resp))
     return gdxx
Example #39
 def xzxk(self, resp):
     """
     Financing information
     :return:
     """
     xzxk = self.sd_xpath.xzxk(HTML(resp))
     return xzxk
Example #40
 def gsbg(self, resp):
     """
     Parse business registration change information
     :return:
     """
     gsbg = self.sd_xpath.gsbg(HTML(resp))
     return gsbg
Example #41
    def df_sites_info(self):
        self.urls = self.df_sites.url

        dfs = []
        for url in tqdm(self.urls):
            r = self._request(url)
            dom_tree = HTML(r.text)
            site = dom_tree.xpath('normalize-space(//div[@class="Xc-ec-L b-L"]/text())')
            print('🕷:%s %s' % (site, url))

            df = pd.read_html(self._request(url).text)[0]
            df.columns = ['rank', 'title', 'hot', 'site']
            df['site'] = site
            df['url'] = url
            dfs.append(df)
        return pd.concat(dfs)  # pd.merge(self.df_sites, pd.concat(dfs))
Example #42
 def rzxx(self, resp):
     """
     Financing information
     :return:
     """
     rzxx = self.sd_xpath.rzxx(HTML(resp))
     return rzxx
Example #43
 def wzba(self, resp):
     """
     Website ICP filing information
     :return:
     """
     wzba = self.sd_xpath.wzba(HTML(resp))
     return wzba
Example #44
def html_to_table(input_filename, encoding='utf-8'):
    with open(input_filename) as fobj:
        html = fobj.read().decode(encoding).replace('\xa0', ' ')
    tree = HTML(html)

    data = tree.xpath('//body/b')
    for index, element in enumerate(data):
        text = element.text
        if text.startswith('Valores') and text.endswith('R$'):
            break
    new = []
    for element in data[index + 1:]:
        text = element.text
        if text.startswith('FATURA DE '):
            continue
        elif REGEXP_PAGE.findall(text):
            continue
        else:
            new.append(element.text)
    data = new

    chunks = [[value.strip() for value in row]
              for row in partition(data, 4) if len(row) == 4]
    table = rows.Table(fields=FIELDS)
    current_year = datetime.datetime.now().year
    months = set(extract_month(row) for row in chunks)
    subtract_year = 'DEZ' in months and 'JAN' in months
    for row in chunks:
        try:
            category = convert_text(row[0])
            description = convert_text(row[1])
            value = convert_value(row[2])
        except:
            print('WARNING: Ignoring row: {}'.format(row))
            continue
        year = current_year
        month = extract_month(row)
        if subtract_year and month in ('NOV', 'DEZ'):
            year = current_year - 1
        date = convert_date(row[3], year)
        table.append({'category': category,
                      'description': description,
                      'value': value,
                      'date': date, })

    return table
Example #45
def fake(base_url, username, password, game_id, time, score, game_name=None):
    url_opener = _utils.login_and_enter_arcade(base_url, username, password)

    # calculate some more URLs
    play_game_url = urljoin(base_url, "arcade.php?do=play&gameid={0}".format(game_id))
    score_url = urljoin(base_url, "index.php?act=Arcade&do=newscore")

    # pretend to play the game
    print("playing the game")
    play_game_response = url_opener.open(play_game_url)
    play_game = HTML(play_game_response.read())

    if game_name is None:
        # (meanwhile, find the game's name)
        game_flash = play_game.find(".//embed[@type='application/x-shockwave-flash']")
        if game_flash is None:
            print("didn't find the flash plugin on the game page :'-(")
            return

        flash_vars = game_flash.attrib['flashvars'].split("&")
        for var in flash_vars:
            if var.startswith("gamename="):
                game_name = var[len("gamename="):]

    if game_name is None:
        print("game name not found :'-(")
        return

    # wait the given time
    print("waiting")
    sleep(time)

    post_values = {
        "gscore": score,
        "gname": game_name
    }
    post_data = _utils.encode_post_data(post_values)
    print("submitting fake score")
    score_response = url_opener.open(score_url, data=post_data)
    score_response.read()

    print("done")
Example #46
def parse_fulltext(fulltext_raw):
    """Extract article text from HTML page.

    Method extracts main text element from the supplied HTML assuming the HTML
    is from www.reuters.com.

    Parameters
    ----------
    fulltext_raw : str
        HTML page to extract the article from.

    Returns
    ----------
    str
        Article text.
    """
    texts = HTML(fulltext_raw)
    texts = texts.xpath('//span[@id="articleText"]')[0].xpath('.//text()')
    text = " ".join(texts).strip()
    return text
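
A hedged usage sketch for parse_fulltext; the markup below is a made-up stand-in for a www.reuters.com article page and only reproduces the span#articleText structure the docstring assumes.

sample_page = """
<html><body>
  <span id="articleText">
    <p>First paragraph.</p>
    <p>Second paragraph.</p>
  </span>
</body></html>
"""

# Joins all text nodes under span#articleText with spaces and strips the result.
print(parse_fulltext(sample_page))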
Example #47
def scrape_logos():
    local("mkdir -p data/logos")

    base_uri = "http://www.sportslogos.net/league.php?id={0}"
    for url in [base_uri.format(page_id) for page_id in xrange(30, 36)]:
        resp = requests.get(url)

        if not resp.ok:
            print "Error retrieving {0}".format(url)
            continue

        tree = HTML(resp.content)

        for thumb in tree.findall(".//div[@class='thumbHolder']"):
            link = thumb.find("a")
            logo = link.find("img")

            # strip the trailing "Logos" before lowercasing (lowercasing first would never match)
            title = link.attrib["title"].replace("Logos", "").lower()
            title = title.replace(" ", "_").strip()
            filename = "data/logos/{0}.gif".format(title)

            urllib.urlretrieve(logo.attrib["src"], filename)
def main(wf):
    parse = argparse.ArgumentParser()
    parse.add_argument('--app', dest='app')
    parse.add_argument('query', nargs='*', default=None)
    args = parse.parse_args()
    query = args.query[0]
    log.warn(query)
    if query:
        id = query.rsplit('/', 1)[-1].split('.')[0]
        url = 'http://soft.macx.cn/downloado.do?softid={}&cpus=2&urls=3'.format(id)
        r = web.get(url)
        r.raise_for_status()
        a = r.text
        node = HTML(a).find('.//a[@rel="facebox"][last()]')
        log.info(node.text)
        open = ['open']
        if args.app:
            open.extend(['-a',args.app])
        if node is not None and node.text == '浏览器直接下载':
            open.append(node.get('href'))
        else:
            open.append(url)
        call(open)
Example #49
def parseData(urlList):
    urlW=open("/usr/product/zhenzhufen/url.txt" ,'a')
    for u in urlList:
        url=u.get("href").strip()
        print url
        urlW.write(url)
        urlW.write("\n")
        h = HTML(getHtml(url).decode('gbk'))
        dTxt=h.xpath('//h3')
        name=dTxt[0].text.strip().split()[0]+" "+dTxt[0].text.strip().split()[1]# name
        brand=dTxt[0].text.strip().split()[0]# brand
#        print brand
#        print name
        pCpgg=h.xpath('//p[@class="pCpgg"]')
        td=h.xpath('//td[@class="td2"]')
        if td:
            price=list(td[0].itertext())[1].strip()
        else :
            price=list(pCpgg[0].itertext())[1].strip()# price
    #    print price    
        norms=list(pCpgg[-1].itertext())[1].strip()# specification
    #    print norms
        spePs=h.xpath('//p[@class="speP"]/a')
        effect=''
        for speP in spePs:
            effect+=speP.text.strip()+" "# efficacy
    #    print effect
        awrap=h.xpath('//div[@class="Awrap"]/ul/li/a')
        imgUrl=awrap[0].find("img").attrib.get("src")# image URL
    #    print imgUrl
        troCon=h.xpath('//div[@class="troCon"]')
        des=list(troCon[0].itertext())
        description=''
        for d in des:
            if len(d.strip())>20:
                description+=d.strip()+""# product description
    #    print description
        dTxt=h.xpath('//div[@class="dTxt"]/p/a')
        series=dTxt[1].text.strip() # series
#        print series
        insertData(name,brand,price,norms,effect,imgUrl,description,series)
Example #50
def fake(base_url, username, password, game_id, time, score, tourney_id, game_name=None, rung=None,
         face_off=None):
    url_opener = _utils.login_and_enter_arcade(base_url, username, password)

    # calculate some more URLs
    tourneys_url = urljoin(base_url, "arcade.php?&do=viewtournaments")
    view_tourney_url = urljoin(base_url, "arcade.php?&act=Arcade&do=viewtourney&tid={0}".format(
        tourney_id
    ))
    play_tourney_game_url = urljoin(
        base_url,
        "arcade.php?&do=playtourney&gameid={0}&tid={1}{2}{3}".format(
            game_id, tourney_id,
            "&rung={0}".format(rung) if rung is not None else "",
            "&faceoff={0}".format(face_off) if face_off is not None else ""
        )
    )
    score_url = urljoin(base_url, "index.php?act=Arcade&do=newscore")

    # go to tourneys
    print("entering tourneys page")
    tourneys_response = url_opener.open(tourneys_url)
    tourneys_response.read()

    # view the tourney
    print("looking at the tourney")
    view_tourney_response = url_opener.open(view_tourney_url)
    view_tourney_response.read()

    # pretend to play the game
    print("playing the game")
    play_tourney_game_response = url_opener.open(play_tourney_game_url)
    play_tourney_game = HTML(play_tourney_game_response.read())

    if game_name is None:
        # (meanwhile, find the game's name)
        game_flash = play_tourney_game.find(".//embed[@type='application/x-shockwave-flash']")
        if game_flash is None:
            print("didn't find the flash plugin on the game page :'-(")
            return

        flash_vars = game_flash.attrib['flashvars'].split("&")
        for var in flash_vars:
            if var.startswith("gamename="):
                game_name = var[len("gamename="):]

    if game_name is None:
        print("game name not found :'-(")
        return

    # wait the given time
    print("waiting")
    sleep(time)

    post_values = {
        "gscore": score,
        "gname": game_name
    }
    post_data = _utils.encode_post_data(post_values)
    print("submitting fake score")
    score_response = url_opener.open(score_url, data=post_data)
    score_response.read()

    print("done")
 def testRemoteInfoDrilldownValues(self):
     header, body = getRequest(port=self.httpPort, path='/remote/info/drilldownvalues', arguments=dict(path='untokenized.field2', name='main'), parse=False)
     self.assertFalse('Traceback' in body, body)
     bodyLxml = HTML(body)
     self.assertEquals(set(['value1', 'value0', 'value9', 'value8', 'value7', 'value6', 'value5', 'value4', 'value3', 'othervalue2', 'value2']), set(bodyLxml.xpath('//ul/li/a/text()')))
Example #52
File: landing.py Project: Beercow/ekdeco
            
            ## some basic heuristic....
            # if tlen - len(x) < epsi:
            #     print 'fop'
            #     return x
        except:
            pass
    return None

def get_num(x):
    return int(re.search('[0-9]+$',x).group(0))

        
if __name__ == '__main__':
    args = apr.parse_args()
    h = HTML(open(args.file).read().replace('<br>',''))
    key_var = None
    for key in get_keys(h):
        print '[*] testing key:',key
        stream = ''; txt = None
        for el in  h.xpath('//*[@id or @ui or @di]'):
            if el.text:
                txt = decode_page(el.text,key)
                
            if not txt:
                continue

            if 'cryptKey' in txt:

                key_var = re.findall('var cryptKey = ([_a-z0-9]+(\[\s*[0-9]+\s*\])?),',txt,re.I)[0][0]
                key_var = re.sub('\s+','',key_var)
Example #53
File: proxy.py Project: iitwebdev/lectures
 def filter(self, environ, headers, data):
     url = construct_url(environ)
     static_url = environ['olpcproxy.static_url']
     found = environ['olpcproxy.keys']
     action = False
     if self.save_key in found:
         self.store.save_page_set(url, headers, data)
         action = True
     if self.remove_key in found:
         self.store.remove_page(url)
         action = True
     if environ.get('olpcproxy.downloads'):
         for index in environ['olpcproxy.downloads']:
             self.save_download(url, data, index)
         action = True
     if action:
         exc = httpexceptions.HTTPTemporaryRedirect(
             headers=[('Location', url)])
         raise exc
     if '?' not in url:
         url_query = url + '?'
     else:
         url_query = url + '&'
     has_page = self.store.has_url(url)
     page = HTML(data)
     try:
         head = page.xpath('//head')[0]
         body = page.xpath('//body')[0]
     except IndexError:
         # Not a full HTML page
         return data
     self.sub_links(url, page, static_url)
     if has_page:
         time_diff = time.time() - self.store.url_cache_time(url)
         time_diff = format_time_diff(time_diff)
         message = ['This page was cached %s ago.  You may '
                    % time_diff,
                    tag.a('remove it from the cache',
                           href=url_query+self.remove_key)]
         div_class = 'olpc-cached'
     else:
         message = ['This page is NOT cached.  You may ',
                    tag.a('add it to the cache',
                           href=url_query+self.save_key)]
         div_class = None
     if head_style:
         insert_beginning(
             head, tag.style(head_style % {'static_url': static_url},
                              type="text/css"))
     image_location = static_url + '/x-small.gif'
     msg = tag.div(
         message,
         tag.a(tag.img(src=image_location, border=0, id="olpc-close-image"), href="#", onclick="document.getElementById('olpc-top-message').style.display='none'", valign="top"),
         id="olpc-top-message",
         class_=div_class)
     bundles = elquery.get_elements_by_class(body, 'olpc-bundle')
     if bundles:
         image_location = static_url + '/caution.gif'
         append(
             msg,
             tag.br(),
             tag.img(src=image_location),
             "Bundles were found in this page")
         for index, bundle in enumerate(bundles):
             b_msg = tag.div(
                 tag.a(tag.img(src=static_url+'/arrow-down-red.gif', border=0),
                       "You may download this bundle",
                       href=url_query+self.download_key+'='+str(index)))
             insert_beginning(bundle, b_msg)
     insert_beginning(body, msg, tag.br(clear="all"))
     data = tostring(page, True)
     # Now fix up the content-type:
     content_type = header_value(headers, 'content-type') or ''
     content_type = self._charset_re.sub('', content_type).strip().lstrip(';')
     content_type += '; charset=utf'
     replace_header(headers, 'content-type', content_type)
     return data
Example #54
    print sql
    sqlW.write(sql)
    sqlW.write("\n")
    try:
        db.set_character_set('utf8')
        cursor.execute('SET NAMES utf8;')
        cursor.execute('SET CHARACTER SET utf8;')
        cursor.execute('SET character_set_connection=utf8;')
        cursor.execute(sql)
        db.commit()
    except MySQLdb.Error,e:
        print "Mysql Error %d: %s" % (e.args[0], e.args[1])
    cursor.close()  
    db.close()    
urlHtml=getHtml("http://cosme.pclady.com.cn/products_list/br0_bs0_bi1_sm119_ef0_pb0_pe0_or0.html")
html= HTML(urlHtml.decode('gbk'))
urlList=html.xpath('//div[@class="dList"]/ul/li/i[@class="iPic"]/a')
parseData(urlList)
for i in range(3 ,4):
    i=str(i)
    print i
    htmls="http://cosme.pclady.com.cn/products_list/br0_bs0_bi1_sm119_ef0_pb0_pe0_or0_p"+i+".html#productList"
    urlHtml=getHtml(htmls)
    try:
        html= HTML(urlHtml.decode('gbk'))
        urlList=html.xpath('//div[@class="dList"]/ul/li/i[@class="iPic"]/a')
        parseData(urlList) 
    except Exception :
        errorTxt.write("\n")
        errorTxt.write(i)
        errorTxt.write("\n")
Example #55
def get_racers(race_id):
    page = HTML(requests.get(TL_URL_TEMPL% race_id, headers={'User-Agent': USER_AGENT}).text)
    racers = set(el.text for el in page.xpath('.//a[@onmouseout]'))
    return list(racers)
Example #56
def get_race_name(race_id):
    page = HTML(requests.get(TL_URL_TEMPL% race_id, headers={'User-Agent': USER_AGENT}).text)
    return page.findtext('.//title').split('live')[0].strip()
sel = selenium("localhost", 4444, "*chrome", "http://uk.yahoo.com/")
time.sleep(10)

for _ in range(10):  # Wait for selenium to come up
    try:
        sel.start()
    except Exception:
        import traceback

        print traceback.format_exc()
        time.sleep(2)
    else:
        break
else:
    raise Exception("Selenium failed to start")


## Do some searching

try:
    sel.open("/?p=us")
    sel.type("id=p_13838465-p", search)
    sel.click("id=search-submit")
    sel.wait_for_page_to_load("30000")
    tree = HTML(sel.get_html_source())
    results = tree.xpath('//*[@class="res"]/descendant::a/@href')
    print "\n".join(results)
finally:
    sel.stop()
#!/usr/bin/env python3

import re
import requests
from lxml.etree import HTML

response = requests.get('http://www.debian.org/releases/stable/')
root = HTML(response.content)
title_text = root.find('head').find('title').text
release = re.search('\u201c(.*)\u201d', title_text).group(1)
p_text = root.xpath('//div[@id="content"]/p[1]')[0].text
version = p_text.split()[1]
print('Codename: {}\nVersion: {}'.format(release, version))
Example #59
def parseData(urlList):
    urlW=open("/usr/caizhuang/zhuangqian/url.txt" ,'a')
    for u in urlList:
        url=u.get("href").strip()
        print url
        urlW.write(url)
        urlW.write("\n")
        h = HTML(getHtml(url).decode('gbk'))
        try:
            dTxt=h.xpath('//h3')
            name=dTxt[0].text.strip().split()[0]+" "+dTxt[0].text.strip().split()[1]# name
            brand=dTxt[0].text.strip().split()[0]# brand
        except Exception:
            errorTxt.write(url)
#        print brand
#        print name
        try:
            pCpgg=h.xpath('//p[@class="pCpgg"]')
            td=h.xpath('//td[@class="td2"]')  
        except Exception:
            errorTxt.write(url)
        try:
            if td:
                price=list(td[0].itertext())[1].strip()
            else :
                price=list(pCpgg[0].itertext())[1].strip()# price
#    print price   
        except Exception:
            errorTxt.write(url)
        try:
            norms=list(pCpgg[-1].itertext())[1].strip()# specification
        #    print norms
        except Exception:
            errorTxt.write(url)
        try:
            spePs=h.xpath('//p[@class="speP"]/a')
            effect=''
            for speP in spePs:
                effect+=speP.text.strip()+" "# efficacy
        #    print effect
        except Exception:
            errorTxt.write(url)
        try:
            awrap=h.xpath('//div[@class="Awrap"]/ul/li/a')
            imgUrl=awrap[0].find("img").attrib.get("src")# image URL
        #    print imgUrl
        except Exception:
            errorTxt.write(url)
        try:
            troCon=h.xpath('//div[@class="troCon"]')
            des=list(troCon[0].itertext())
            description=''
            for d in des:
                if len(d.strip())>20:
                    description+=d.strip()+""# product description
    #    print description
        except Exception:
            errorTxt.write(url)    
    
        
        try:
            dTxt=h.xpath('//div[@class="dTxt"]/p/a')
            series=dTxt[1].text.strip() # series
        except Exception:
            errorTxt.write(url) 
        
#        print series
        insertData(name,brand,price,norms,effect,imgUrl,description,series)
Example #60
def parseData(urlList):
    urlW=open("/usr/product/mianmo/url.txt" ,'a')
    for u in urlList:
        url=u.get("href").strip()
        print url
        urlW.write(url)
        urlW.write("\n")
        h = HTML(getHtml(url).decode('gbk'))
        try:
            dTxt=h.xpath('//h3')
            name=dTxt[0].text.strip().split()[0]+" "+dTxt[0].text.strip().split()[1]# name
            brand=dTxt[0].text.strip().split()[0]# brand
        except Exception:
            errorTxt.write(url)
#        print brand
#        print name
        try:
            pCpgg=h.xpath('//p[@class="pCpgg"]')
            td=h.xpath('//td[@class="td2"]')  
        except Exception:
            errorTxt.write(url)
        try:
            if td:
                price=list(td[0].itertext())[1].strip()
            else :
                price=list(pCpgg[0].itertext())[1].strip()# price
#    print price   
        except Exception:
            errorTxt.write(url)
        try:
            norms=list(pCpgg[-1].itertext())[1].strip()# specification
        #    print norms
        except Exception:
            errorTxt.write(url)
        try:
            spePs=h.xpath('//p[@class="speP"]/a')
            effect=''
            for speP in spePs:
                effect+=speP.text.strip()+" "# efficacy
        #    print effect
        except Exception:
            errorTxt.write(url)
        try:
            awrap=h.xpath('//div[@class="Awrap"]/ul/li/a')
            imgUrl=awrap[0].find("img").attrib.get("src")# image URL
        #    print imgUrl
        except Exception:
            errorTxt.write(url)
        try:
            troCon=h.xpath('//div[@class="troCon"]')
            des=list(troCon[0].itertext())
            description=''
            for d in des:
                if len(d.strip())>20:
                    description+=d.strip()+""# product description
    #    print description
        except Exception:
            errorTxt.write(url)    
    
        
        try:
            dTxt=h.xpath('//div[@class="dTxt"]/p/a')
            series=dTxt[1].text.strip() # series
        except Exception:
            errorTxt.write(url) 
        
#        print series
        insertData(name,brand,price,norms,effect,imgUrl,description,series)