Example #1
File: core.py Project: anukat2015/jusText
def decode_html(html, default_encoding=DEFAULT_ENCODING, encoding=None, errors=DEFAULT_ENC_ERRORS):
    """
    Converts `html` containing an HTML page into Unicode.
    Tries to guess the character encoding from the meta tag.
    """
    if isinstance(html, unicode):
        return html

    if encoding:
        return html.decode(encoding, errors)

    match = CHARSET_META_TAG_PATTERN.search(html)
    if match:
        declared_encoding = match.group(1).decode("ASCII")
        # treat an unknown encoding as if it wasn't found at all
        with ignored(LookupError):
            return html.decode(declared_encoding, errors)

    # unknown encoding
    try:
        # try UTF-8 first
        return html.decode("utf8")
    except UnicodeDecodeError:
        # try our luck with the default encoding
        try:
            return html.decode(default_encoding, errors)
        except UnicodeDecodeError as e:
            raise JustextError("Unable to decode the HTML to Unicode: " + unicode(e))
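A minimal usage sketch for the example above (hypothetical calls; it assumes decode_html is importable as defined and that CHARSET_META_TAG_PATTERN matches the <meta charset=...> form):

raw_latin2 = b'<meta charset="iso-8859-2"><p>Dobr\xfd den</p>'
text = decode_html(raw_latin2)              # encoding taken from the meta tag
text = decode_html(b'<p>caf\xc3\xa9</p>')   # no declared charset: falls back to UTF-8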
Example #2
def extract(html, url, **kwargs):
    """ """
    logging.debug("*** extracting %s ***" % (url,))

    kw = { 'remove_comments': True }
    if 'encoding' in kwargs:
        kw['encoding'] = kwargs['encoding']
        try:
            foo = html.decode(kw['encoding'])
        except UnicodeDecodeError:
            # make it legal
            logging.warning("Invalid %s - cleaning up" %(kw['encoding'],))
            foo = html.decode(kw['encoding'],'ignore')
            html = foo.encode(kw['encoding'])


    parser = lxml.html.HTMLParser(**kw)
    doc = lxml.html.document_fromstring(html, parser, base_url=url)

    [i.drop_tree() for i in util.tags(doc,'script','style')]

    # drop comment divs - they have a nasty habit of screwing things up
    [i.drop_tree() for i in doc.cssselect('#disqus_thread')]
    [i.drop_tree() for i in doc.cssselect('#comments, .comment')]

    # drop obvious structural cruft
    [i.drop_tree() for i in doc.cssselect('#header, #footer, #sidebar')]

    # NASTY SITE-SPECIFIC HACKS
    # nasty little hacks with no obvious general solutions:

    # Johnston Publishing sites - they have adverts embedded in the headline :-(
    [i.drop_tree() for i in doc.cssselect('.sponsorPanel')]
    
    # www.shropshirestar.com
    # www.expressandstar.com
    # Have annoyingly-well marked up author links to featured articles in masthead
    [i.drop_tree() for i in doc.cssselect('#masthead-quote')]

    if 'independent.co.uk' in url:
        [i.drop_tree() for i in doc.cssselect('#side, .readNextWidget')]

#    html = UnicodeDammit(html, isHTML=True).markup
    headline_info = extract_headline(doc,url)
    headline_linenum = 0
    headline = None
    headline_node = None
    if headline_info is not None:
        headline_linenum = headline_info['sourceline']
        headline = headline_info['txt']
        headline_node = headline_info['node']

    pubdate, pubdate_node = extract_pubdate(doc,url,headline_linenum)

    authors = byline.extract(doc, url, headline_node, pubdate_node)

    return headline,authors,pubdate
Example #3
 def _request(self, request):
     if self.__last_acttime is not None:
         diff = time.time() - self.__last_acttime
         if diff < self.time_interval:
             time.sleep(self.time_interval - diff)
     self.__last_acttime = time.time()
     html = self.browser.open(request).read()
     try:
         html = html.decode('utf8')
     except UnicodeDecodeError:
         html = html.decode('cp936', 'ignore')
     r = html.lstrip()
     return r
Example #4
    def get_lines (url):
        try:
            html = urlopen(url).read()
            # make a rough guess at the document encoding
            try:
                text = html2text(html.decode("utf-8"))
            except Exception:
                text = html2text(html.decode("latin-1", "ignore"))

            # work around an html2text bug
            text = text.replace("&nbsp_place_holder;", " ")
            return text.split("\n")
        except Exception, err:
            print "Failed to get lines: %s" % err
            return []
Example #5
 def scrape_membernames_generic(self, committee, url, chamber, term):
     html = self.urlopen(url)
     html = html.decode(self.encoding)
     doc = lxml.html.fromstring(html)
     names = doc.xpath('//a/text()')
     names = filter(lambda n: 'Senator' in n, names)
     return names
Example #6
    def getSiteContact(self, account, username, mobile):
        HOST = "dealer.che168.com"
        # if account in config.che168VIPAccountList:
        #     HOST = "dealer.che168.com"
        # else:
        #     HOST = "dealers.che168.com"

        conn = httplib.HTTPConnection(HOST, timeout=timeout_che168)
        headers = copy.copy(self.headers)
        conn.request("GET", "/car/publish/?s=1", headers=headers)
        res = conn.getresponse()
        resHeaders = res.getheaders()
        resRead = res.read()
        html = self.decodeBody(resHeaders, resRead)
        html = html.decode('GB18030')
        html = html.replace("gb2312", "utf-8")
        dom = lxml.html.fromstring(html)
        contactItems = dom.xpath('//*[@id="sh_linkMan_div"]/a/@rel')
        conn.close()
        if len(contactItems) == 0:
            return self.createNewContact(username, mobile)
        logger.debug(str(contactItems))
        for salesid in contactItems:
            # if self.checkCurrentContact(salesid, mobile) is True:
            return salesid
        return self.createNewContact(username, mobile)
Example #7
def craw_sitemap(url, user_agent, num_retrics):
    #download the sitemap file
    sitemap = download(url, user_agent, num_retrics)
    #f = open(r'D:\exercise\zhurenwenji.txt','w')
    doc = Document()

    #extract the sitemap links
    links = re.findall('<a href="http:(.*?)" title="http', sitemap)
    #print links
    i = 0
    print len(links)
    for link in links:
        link = 'http:' + link
        try:
            html = download(link, user_agent, num_retrics)
            #contents = re.findall('<div class="detail">(.*?)</div>',html)
            tree = lxml.html.fromstring(html.decode('utf-8'))
            td = tree.cssselect('div.detail')[0].text_content()
            i += 1
            print i
        except:
            pass
        #doc.save(contents)
        doc.add_paragraph(td)
    doc.save('d:\exercise\zhurenwenji.docx')
Example #8
File: ach.py Project: bet0x/smartbot
    def _guide(name):
        game_id = name.lower().replace(" ", "-")
        session = utils.web.requests_session()
        page = session.get(GUIDE_URL.format(game_id))
        tree = lxml.html.fromstring(page.text)

        li_elements = tree.cssselect("#col_l .bl_la_main_guide .showhide ul li")
        if li_elements:
            return [x.text_content().strip() for x in li_elements[:5]]
        else:
            elements = tree.cssselect("#col_l .bl_la_main_guide .showhide p")
            if not elements:
                elements = tree.cssselect("#col_l .bl_la_main_guide .showhide div div")

            if elements:
                info = []
                html = lxml.html.tostring(elements[0])
                lines = html.decode("utf-8").split("<br>")
                for line in lines[1:6]:
                    span_str = "<span>{0}</span>".format(line)
                    span = lxml.html.fragment_fromstring(span_str)
                    s = span.text_content().strip()
                    if s.startswith("-"):
                        s = s[1:]
                    info.append(s)
                return info
Example #9
    def _execute(self, options, args):
        """Compile reStructuredText to standalone HTML files."""
        compiler = self.site.plugin_manager.getPluginByName('rest', 'PageCompiler').plugin_object
        if len(args) != 1:
            print("This command takes only one argument (input file name).")
            return 2
        source = args[0]
        with io.open(source, "r", encoding="utf8") as in_file:
            data = in_file.read()
            output, error_level, deps, shortcode_deps = compiler.compile_string(data, source, True)

        rstcss_path = resource_filename('nikola', 'data/themes/base/assets/css/rst_base.css')
        with io.open(rstcss_path, "r", encoding="utf8") as fh:
            rstcss = fh.read()

        template_path = resource_filename('nikola', 'plugins/command/rst2html/rst2html.tmpl')
        template = Template(filename=template_path)
        template_output = template.render(rstcss=rstcss, output=output)
        parser = lxml.html.HTMLParser(remove_blank_text=True)
        doc = lxml.html.document_fromstring(template_output, parser)
        html = b'<!DOCTYPE html>\n' + lxml.html.tostring(doc, encoding='utf8', method='html', pretty_print=True)
        print(html.decode('utf-8'))
        if error_level < 3:
            return 0
        else:
            return 1
Example #10
    def scrape_members_senate_subcommittees(self, committee, url, chamber,
                                            term, cache={}):

        if cache:
            names = cache[committee['subcommittee']]
            return Membernames.scrub(names)

        html = self.urlopen(url)
        html = html.decode(self.encoding)
        doc = lxml.html.fromstring(html)

        # Commence horrific regex-based hackery to get subcommittee members.
        text = doc.xpath('//div[@class="content"]')[0].text_content()
        chunks = re.split(r'\s*Subcommittee.*', text)
        namelists = []
        for c in chunks:
            names = re.sub(r'\s*Members\s*', '', c)
            names = re.split(r'\s*(,|and)\s*', names)
            names = filter(lambda s: s not in [',', 'and'], names)
            names = map(clean, names)
            if filter(None, names):
                namelists.append(names)

        committee_names = doc.xpath('//div[@class="content"]/h3/text()')
        for _committee, _names in zip(map(clean, committee_names), namelists):
            cache[_committee] = _names

        names = cache[committee['subcommittee']]
        return Membernames.scrub(names)
Example #11
    def _execute(self, options, args):
        """Compile reStructuredText to standalone HTML files."""
        compiler = self.site.plugin_manager.getPluginByName('rest', 'PageCompiler').plugin_object
        if len(args) != 1:
            print("This command takes only one argument (input file name).")
            return 2
        source = args[0]
        with io.open(source, "r", encoding="utf8") as in_file:
            data = in_file.read()
            output, error_level, deps, shortcode_deps = compiler.compile_string(data, source, True)

        rstcss_path = resource_filename('nikola', 'data/themes/base/assets/css/rst.css')
        with io.open(rstcss_path, "r", encoding="utf8") as fh:
            rstcss = fh.read()

        template_path = resource_filename('nikola', 'plugins/command/rst2html/rst2html.tmpl')
        template = Template(filename=template_path)
        template_output = template.render(rstcss=rstcss, output=output)
        parser = lxml.html.HTMLParser(remove_blank_text=True)
        doc = lxml.html.document_fromstring(template_output, parser)
        html = b'<!DOCTYPE html>\n' + lxml.html.tostring(doc, encoding='utf8', method='html', pretty_print=True)
        print(html.decode('utf-8'))
        if error_level < 3:
            return 0
        else:
            return 1
Example #12
def parse_odds(url):
        #pdb.set_trace()
	global M_DB
	html=pget(url)
	if not html:return
	MatchId=url.split('/')[-4]
	
	tree=lxml.html.fromstring(html.decode("utf8"))
	#table=tree.xpath("//div[@id='data_main_content']/table")[0]
	trs=tree.xpath("tr")
	#if MatchId not in M_DB[day]:
	#	M_DB[day][MatchId]={"odds":{}}
	#data=M_DB[day][MatchId]["odds"]
	data={}
	#pdb.set_trace()
	for tr in trs:
		tds=tr.xpath("td")
		company=unicode(tds[1].text_content()).strip()
		if company not in M_COMPANY:continue
		
		s_zhu=unicode(tds[2].text_content()).strip().replace(u'↓' ,'').replace(u'↑' ,'').strip()
		s_ping=unicode(tds[3].text_content()).strip().replace(u'↓' ,'').replace(u'↑' ,'').strip()
		s_ke=unicode(tds[4].text_content()).strip().replace(u'↓' ,'').replace(u'↑' ,'').strip()
		n_zhu=unicode(tds[5].text_content()).strip().replace(u'↓' ,'').replace(u'↑' ,'').strip()
		n_ping=unicode(tds[6].text_content()).strip().replace(u'↓' ,'').replace(u'↑' ,'').strip()
		n_ke=unicode(tds[7].text_content()).strip().replace(u'↓' ,'').replace(u'↑' ,'').strip()
		href=tds[5].xpath("a/@href")[0]
		#print href
		odds_change=pase_history(href)
		odds_change.update({"company":company,
				"s_zhu":s_zhu,"s_ping":s_ping,"s_ke":s_ke,
				"n_zhu":n_zhu,"n_ping":n_ping,"n_ke":n_ke,})
		data[company]=odds_change
	return data
Example #13
  def get_albums(self, html):
    if not html: raise ValueError

    doc = lxml.html.fromstring(html.decode('utf8'))
    sections = doc.cssselect('h2 > span.mw-headline')
    albums = {}

    for s in sections:
      try:
        album = s.cssselect('a:first-child')[0].text
      except:
        album = s.text

      if not len(album): continue
      
      album = self.normalizeish(album)

      songlist = s.xpath("./following::ol[1]")

      if not songlist: continue

      al = albums.setdefault(album, {})
      for i, e in enumerate(songlist[0].getchildren()):
        try:
          a = e.cssselect('a')[0]
          if '(page does not exist)' not in a.get('title'):
            al[i+1] = a
        except (TypeError, IndexError):
          pass
      if not al: del albums[album]

    return albums or None
Example #14
def fetch_oep_entry(id, datastorage):
    oepurl = url_from_id(id)
    html = scraperwiki.scrape(oepurl)
    root = lxml.html.fromstring(html.decode('utf-8'))
    data = { 'journalPostId' : id }
    for tr in root.cssselect("table.defaultTable tr"):
        vtype = tr.cssselect("th")[0].text_content().strip().replace(":", "").replace(",", "")
        value = tr.cssselect("td")[0].text_content().strip()
        #print '"' + vtype + '"', '"'+value+'"'
        if (vtype == 'Record entry date' and value == 'Not stated.') or \
            (vtype == 'Document type' and value == '-') or \
            (vtype == 'Case number' and value == ''):
            return -1
        if vtype in fieldmap:
            vtype = fieldmap[vtype]
        if 'doctype' == vtype:
            value = doctypemap[value]
        if 'caseid' == vtype:
            caseyear, caseseqnr = value.split("/")
            data['caseyear'] = caseyear
            data['caseseqnr'] =  caseseqnr
        data[vtype] = value
#    print str(id) + ": " + str(data)
    data['scrapestamputc'] = datetime.datetime.now()
#    print data['scrapestamputc']
#    exit ()

    datastorage.append(data)
#    scraperwiki.sqlite.save(unique_keys=['journalPostId'], data=data)
    return 0
Example #15
def parseNewApplications(url):
    html = scraperwiki.scrape(url)
    html = html.decode("utf8")
    apps = lxml.html.fromstring(html)
    new_cnt=0
    total_cnt=0
    global warning
    global scrape_id
    warning=''
    for app in apps.cssselect("ul[id='highlights'] li"):
        try:
            appAnchor = safeXPath(app.cssselect("a"), 0)
            appHref = safeXPath(appAnchor.xpath("@href"), 0)
            app_ID = appHref.partition('consultations/')[2].partition('.aspx')[0]
            #appTitle = safeXPath(appAnchor.xpath("@title"), 0)
            #appPara = safeXPath(app.cssselect("p"), 0)
            #appDescr = safeXPath(appPara.xpath("text()"), 0)            

            if scraperwiki.sqlite.select("* from licence_applications WHERE app_ID="+str(app_ID)) == []:
                new_yn=1
                new_cnt=new_cnt+1
            else: new_yn=0

            parseAppDetail(app_ID, baseURL+appHref, new_yn)
            total_cnt=total_cnt+1
        except IndexError as ex:
            print "parseNewApplications: ex={1}: url={0} app={2}".format(url, str(ex), app)
            warning='Could not parse page'
#save log
    scrape_id=scraperwiki.sqlite.get_var('last_page')
    log_entry= {"scrape_id":scrape_id , "scrape_date":now, "task":'parse list', "url":url, "result":str(new_cnt) + ' New records / ' + str(total_cnt) + ' Total records', "warnings":warning}
    scraperwiki.sqlite.save(['scrape_id'],log_entry,'log')
Example #16
def scrape_study(url):
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html.decode('utf-8'))
    body = root.cssselect('.large-copy')[0]
    data = {}
    key = None
    value = None
    for child in body:
        if child.tag == 'b':
            if key is not None:
                if len(value) == 1:
                    data[key] = value[0]
                else:
                    data[key] = '\n'.join(value)
            key = make_key(child.text_content().strip())
            value = []
        if child.tag == 'ul':
            value.extend([s.text_content().strip() for s in child.cssselect('li')])
        if child.tail is not None and child.tail.strip():
            content = child.tail.strip()
            content = content.replace(u'•', '')
            content = content.strip()
            value.append(content)
    if len(value) == 1:
        data[key] = value[0]
    else:
        data[key] = '\n'.join(value)
    return data
Example #17
def parseDetail(uri):
    html = scrape(uri)
    root = lxml.html.fromstring(html.decode('utf-8'))
    root.make_links_absolute(DELIBERE_URI)
    data = {}
    tds = root.cssselect('table td')
    for ix, td in enumerate(tds):
        if ix == 1:
            data['organo'] = d = td.find('strong').text.strip()
        elif ix == 3:
            data['numero'] = int(td.find('strong').text.strip())
        elif ix == 5:
            data['anno'] = int(td.find('strong').text.strip())
        elif ix == 7:
            date = td.find('strong').text.strip()
            data['data'] = dateutil.parser.parse(date, dayfirst=True).date()
        elif ix == 9:
            data['oggetto'] = td.find('strong').text.strip()
        elif ix == 11:
            date = td.find('strong').text.strip()
            data['data_pubblicazione'] = dateutil.parser.parse(date, dayfirst=True).date()
        elif ix == 13:
            date = td.find('strong').text.strip()
            data['data_esecutivita'] = dateutil.parser.parse(date, dayfirst=True).date()
        elif ix == 15:
            allegati = []
            for li in td.iter('li'):
                # always get the second link, which contains a title
                link = li.findall('a')[1]
                if link.get('href').endswith('.pdf'):
                    allegati.append({'uri': link.get('href'), 'titolo': link.text.strip()})
            data['allegati'] = allegati
    return data
Example #18
File: Download.py Project: csrgxtu/maxent
 def getEncoding(self):
   html = self.getSOURCE()
   # first, get the encoding from the HTTP headers

   # second, get it from the page source
   dom = lxml.html.fromstring(html.decode('utf8', 'ignore'), \
     parser = lxml.html.HTMLParser(remove_comments = True))
     
   encs = dom.xpath('.//head/meta[@charset]/@charset')
   
   encs += [re.findall(r'charset=(.*)', _.get('content'))[0] 
     for _ in dom.xpath('.//head/meta[@http-equiv][@content]') \
     if _.get('http-equiv').lower() == "content-type" and \
     _.get('content').count('charset=') == 1]
     
   encs = set([_.lower() for _ in encs])
   
   if set(['gb2312', 'gbk']) <= encs: encs.remove('gb2312')
   if set(['gb2312']) == encs: encs = set(['gbk'])
   
   if len(encs) == 1: return encs.pop()
   
   try:
     import chardet
     return chardet.detect(html)['encoding']
   except ImportError, e: raise e
Example #19
    def scrape_lower_members(self, committee, url, chamber, term,
        re_name=re.compile(r'^(Senator|Assemblymember)'),):

        try:
            # Some committees display the members at /membersstaff
            html = self.urlopen(url + '/membersstaff')
        except:
            # Others display the members table on the homepage.
            html = self.urlopen(url)

        html = html.decode(self.encoding)
        doc = lxml.html.fromstring(html)
        members = doc.xpath('//table/descendant::td/a/text()')
        members = map(strip, members)
        members = filter(None, members)[::2]


        if not members:
            self.warning('Didn\'t find any committee members at url: %s' % url)
        
        for member in members:
            
            if ' - ' in member:
                member, role = member.split(' - ')
            else:
                role = 'member'
            
            member = re_name.sub('', member)
            member = member.strip()
            committee.add_member(member, role)
            
        return committee
Example #20
def process(url, season, internal_matchday):
    html = scraperwiki.scrape(url)
    html = html.decode(
        "utf-8"
    )  # convert to unicode before lxml gets it since the encoding declaration is missing in the html
    root = lxml.html.fromstring(html)
    matches = root.xpath(match_detail_xpath)
    for match in matches:
        record = {}
        record["matchday"] = root.xpath("//h3")[0].text
        record["season"] = season
        record["internal_matchday"] = internal_matchday
        # trs = match.xpath('//tr[@class="sup"]') #info and url
        record["stage"] = match.xpath('tr[@class="sup"]//span[@class="rname"]')[0].text_content()
        try:  # one or two cancels
            record["match_detail_url"] = (
                "http://www.uefa.com"
                + match.xpath('tr[@class="sup"]//span[contains(@class,"report")]/a')[0].attrib["href"]
            )
        except:
            pass
        record["home_team"] = match.xpath('tr[@class=" match_res"]//td[contains(@class,"home")]')[0].text_content()
        record["away_team"] = match.xpath('tr[@class=" match_res"]//td[contains(@class,"away")]')[0].text_content()
        try:
            record["aggregate"] = match.xpath('tr[@class="reasonwin"]//span[contains(@class,"rwa")]')[0].text_content()
        except:
            pass
        try:
            record["aggregate_notes"] = match.xpath('tr[@class="reasonwin"]//span[contains(@class,"woag")]')[
                0
            ].text_content()
        except:
            pass

        record["home_team_url"] = (
            "http://www.uefa.com"
            + match.xpath('tr[@class=" match_res"]//td[contains(@class,"home")]/a')[0].attrib["href"]
        )
        record["away_team_url"] = (
            "http://www.uefa.com"
            + match.xpath('tr[@class=" match_res"]//td[contains(@class,"away")]/a')[0].attrib["href"]
        )
        record["score"] = match.xpath('tr[@class=" match_res"]//td[contains(@class,"score")]')[0].text_content()
        ref_stadium = re.split(u"\u2013", match.xpath('tr[@class="referee_stadium"]')[0].text_content())
        # print repr(match.xpath('tr[@class="referee_stadium"]')[0][0].text)
        try:
            # record['referee'] = ref_stadium[0].lstrip('Referee: ').strip()
            record["referee"] = ref_stadium[0].replace("Referee: ", "").strip()
        except:
            pass
        try:
            # record['stadium'] = ref_stadium[1].lstrip('Stadium: ').strip()
            record["stadium"] = ref_stadium[1].replace("Stadium: ", "").strip()
        except:
            pass

        # print record
        scraperwiki.sqlite.save(
            unique_keys=["matchday", "season", "score", "home_team", "away_team"], data=record, verbose=1
        )
Example #21
File: logger.py Project: AlexUlrich/digsby
def parse_html_slow(html):
    'Uses Beautiful Soup to parse messages out of a log file.'

    html = html.decode('utf-8', 'ignore')

    soup     = soupify(html, markupMassage = ((br_re,lambda m: '<br />'),))
    messages = []
    strptime = datetime.strptime

    for div in soup.findAll(message_divs):
        try:
            buddyname = div.findAll('span', class_buddy)[0].renderContents(None)
            timestamp = parse_timestamp(div['timestamp'])
            message   = div.findAll('span', class_msgcontent)[0].renderContents(None)
            type      = div['class'].replace('message', '').strip()
            auto      = boolify(div.get('auto', 'false'))
        except Exception:
            print_exc()
        else:
            messages.append(Message(buddy     = S(name = buddyname),
                                    timestamp = timestamp,
                                    message   = message,
                                    type      = type,
                                    auto      = auto))

    log_info('parse_html_slow with %d bytes returning %d messages', len(html), len(messages))
    return messages
Example #22
File: lxml_tools.py Project: subeax/grab
def sanitize_html(html, encoding='utf-8', return_unicode=False):
    html = smart_str(html, encoding=encoding)
    if RE_TAG_START.search(html):
        html = render_html(parse_html(html))
    if return_unicode:
        return html.decode('utf-8')
    else:
        return html
Example #23
def search2():
    writer = csv.writer(open('countries.csv', 'w'))
    D = Downloader()
    html = D('http://example.webscraping.com/places/default/search?page=0&page_size=1000&search_term=.')
    print(html.decode('utf-8'))
    ajax = json.loads(html)
    for record in ajax['records']:
        writer.writerow([record['country']])
Example #24
File: tbs.py Project: tehron/tehbot
 def userstats_api(self, user):
     url = "http://www.bright-shadows.net/userdata.php?"
     html = urllib.request.urlopen(url + urllib.parse.urlencode({"username" : user}), timeout=5).read()
     html = html.decode()
     if html == "Unknown User":
         return None
     real_user, rank, users_total, challs_cnt, challs_total = html.split(":")
     return real_user, str(int(challs_cnt)), int(challs_total), str(int(rank)), int(users_total), None, None, None
Example #25
def get_captcha(html):
    tree = lxml.html.fromstring(html.decode('utf8'))
    img_data = tree.cssselect('div#recaptcha img')[0].get('src')
    img_data = img_data.partition(',')[-1]
    binary_img_data = base64.b64decode(img_data)
    file_like = BytesIO(binary_img_data)
    img = Image.open(file_like)
    return img
Example #26
 def __init__(self, html, encoding='utf-8', cache_xpath=True):
     if isinstance(html, bytes):
         self.html = lxml.html.fromstring(html.decode(encoding))
     elif isinstance(html, lxml.html.HtmlElement):
         self.html = html
     else:
         self.html = lxml.html.fromstring(html)
     self.cache_xpath = cache_xpath
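A hedged usage sketch for the constructor above; the wrapper class's name is not shown here, so HtmlDoc below is a purely hypothetical placeholder:

doc = HtmlDoc(b'<p>caf\xc3\xa9</p>', encoding='utf-8')   # bytes are decoded first
links = doc.html.xpath('//a/@href')                      # then queried via lxml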
Example #27
File: lxml_tools.py Project: GrIvA/NB
def sanitize_html(html, encoding='utf-8', return_unicode=False):
    html = smart_str(html, encoding=encoding)
    if RE_TAG_START.search(html):
        html = render_html(parse_html(html))
    if return_unicode:
        return html.decode('utf-8')
    else:
        return html
Example #28
 def scrape_membernames_senate_autism(self, committee, url, chamber, term):
     '''The Senate Autism committee has its own weird format.
     '''
     url = 'http://autism.senate.ca.gov/committeemembers1'
     html = self.urlopen(url)
     html = html.decode(self.encoding)
     doc = lxml.html.fromstring(html)
     return self.scrape_membernames_generic(doc)
Example #29
def regex_scraper(html):
    results = {}
    if html is not None:
        html = html.decode('utf-8')
    for field in FIELDS:
        print(field)
        results[field] = re.search('<a href="/places/default/index">(.*?)</a>',
                                   html).groups()[0]
    return results
Example #30
def scrape(crno):
    crnostr = "%07d" % crno
    baseurl = "https://www.mobile-cr.gov.hk/mob/cps_criteria.do?queryCRNO="
    url = baseurl + crnostr

    print "trying local", crnostr
    html = load_local(url)
    if html is None:
        print "trying site", crnostr
        html = scraperwiki.scrape(url).decode('utf-8')
        print "storing local", crnostr
        store_local(url, html.encode('utf-8'))
    else:
        html = html.decode('utf-8')

    if '沒有紀錄與輸入的查詢資料相符' in html.encode('utf-8'):
        print 'NO MATCHING RECORD FOUND FOR THE SEARCH INFORMATION INPUT!'
        return None
    root = lxml.html.fromstring(html) # , encoding="utf-8")
    tds = root.cssselect("tr td tr td")
    namestds = root.cssselect("td.data")   

    while tds == []:
        print "trying", crnostr, "again"
        sleep(46)
        html = scraperwiki.scrape(baseurl + crnostr).decode('utf-8')
        root = lxml.html.fromstring(html) # , encoding="utf-8")
        tds = root.cssselect("tr td tr td")
        namestds = root.cssselect("td.data")   

        #for idx, val in enumerate(tds):
        #    print idx, ":", val.text_content().encode('utf-8')
    names = {}
    for nameidx, nameval in enumerate(namestds):
        names["Name" + str(nameidx)] = nameval.text_content()[10:]
        names["Name" + str(nameidx) + "date"] = nameval.text_content()[:10]

    print "got", tds[1].text_content() 

    data = {
        'cr' : tds[1].text_content(),
        'English Company Name' : tds[2].text_content().rsplit('\r')[1].lstrip('\n\t'),
        'Chinese Company Name' : tds[2].text_content().rpartition('\r')[2].lstrip('\r\n\t'),
        'Company Type' : tds[4].text_content()[:-1],
        'Date of incorporation' : tds[6].text_content(),
        # 'Company status' : tds[8].text_content()[:-1],
        'Active status' : tds[8].text_content()[:-1],
        'Remarks' : tds[9].text_content().replace(u"備註:",""),
        'Winding up mode' : tds[11].text_content()[:-1],
        'Date of Dissolution' : tds[13].text_content(),
        'Register of Charges' : tds[15].text_content()[:-1],
        'Important Note' : tds[16].text_content().replace(u"重要事項:","").lstrip('\r\n\t')
    }
    data.update(names)
    
    db['swdata'].upsert(data, ['cr'])
    print "wrote", tds[1].text_content()
Example #31
File: scraper.py Project: jiel30/Completed
def process_html(html):
    '''process html to tokenize to get a list of words
    '''
    table = str.maketrans(".,?!'\";:-_(){}[]\|`~#$%^&*<:>/+=","                                ")
    reg = re.compile('<[^>]*>')
    html = reg.sub('',html.decode().replace('\n','').replace(' ',''))
    text = html.translate(table)
    words = text.split()
    return words
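A hypothetical call to the tokenizer above (tags are stripped, spaces dropped, then punctuation turns into separators):

words = process_html(b"<p>Hello, world!</p>")
print(words)   # roughly ['Hello', 'world']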
Example #32
File: draft_xpath.py Project: sxhylkl/--1
def getPage(url):
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    req = request.Request(url=url, headers=headers)
    html = request.urlopen(req).read()
    html = html.decode('utf-8')  # decode
    return html
Example #33
def getArticleHead(url):
    html = requests.get(url).content
    html = html.decode('utf-8')
    # title
    msg_title = re.findall(r'var msg_title = \"(.*?)\"', html)[0]
    # cover image of the article
    msg_cdn_url = re.findall(r'var msg_cdn_url = \"(.*?)\"', html)[0]
    print("title:" + msg_title)
    print("msg_url:" + msg_cdn_url)
Example #34
def parse(html):
    tree=lxml.html.fromstring(html.decode("utf8"))
    query = tree.xpath('//input[@id="lst-ib"]/@value')
    if len(query) > 0:
        query = query[0]
    else:
        query = ""
    for href in tree.xpath('//li[@class="g card-section"]//h3/a/@href'):
        yield href,query
Example #35
    def doLogin(self, username, password):
        if self.baseheaders.has_key("Cookie"):
            self.baseheaders.pop("Cookie", None)
            self.cookies = {}
        if self.headers.has_key("Cookie"):
            self.headers.pop("Cookie", None)
        conn = httplib.HTTPConnection("dealer.che168.com", timeout=10)
        conn.request("GET", "/", "", self.baseheaders)
        res = conn.getresponse()
        resHeaders = res.getheaders()
        resRead = res.read()
        self.setCookies(resHeaders)
        html = base.BaseSharer.decodeBody(resHeaders, resRead)
        html = html.decode('GB18030')
        dom = lxml.html.fromstring(html)
        checkCodeImageUrls = dom.xpath('.//span/img[@src]/@src')
        if len(checkCodeImageUrls) == 0:
            return False
        checkCodeImageUrl = checkCodeImageUrls[0]
        conn.close()

        conn = httplib.HTTPConnection("dealer.che168.com", timeout=10)
        conn.request("GET", checkCodeImageUrl, "", self.baseheaders)
        res = conn.getresponse()
        self.setCookies(res.getheaders())
        imageData = res.read()
        conn.close()
        image = StringIO(imageData)
        captcha = base.BaseSharer.getCaptcha(image, imageData)

        if captcha is None:
            return False

        validcode = captcha["text"]

        conn = httplib.HTTPConnection("dealer.che168.com", timeout=10)

        url = "/Handler/Login/Login.ashx?"
        username = urllib.quote(username.encode("GB18030"))
        password = urllib.quote(password.encode("GB18030"))
        url = url + 'name=' + username
        url = url + '&pwd=' + password
        url = url + '&validcode=' + validcode.strip()
        url += '&remember=false'
        url = url + '&req=' + str(random.random())

        conn.request("GET", url, "", self.baseheaders)
        res = conn.getresponse()
        resHeaders = res.getheaders()
        resRead = res.read()
        loginResult = base.BaseSharer.decodeBody(resHeaders, resRead)
        loginResult = loginResult.decode('GB18030')
        if not loginResult.startswith(u"var code='1';"):
            return False
        logger.debug("loginResult=" + loginResult)
        self.setCookies(res.getheaders())
        return True
Example #36
File: get_web.py Project: wowngasb/kl_tool
def strToUnicode(html, decoding=None):
    if not isinstance(html, unicode):
        if not decoding:
            decoding, charJust = '', chardet.detect(html)
            try: decoding = 'gbk' if charJust['encoding'].lower() == 'gb2312' else charJust['encoding']
            except Exception, e: print 'strToUnicode chardet detect error:', Exception, '->', e
        decoding = 'utf-8' if not decoding else decoding
        if decoding: html = html.decode(decoding, 'ignore')
    return html
Example #37
def fetch_detail(url, detail):
    html = requests.get(url).content
    root = lxml.html.fromstring(html.decode(TARGET_ENCODING))
    summary = root.xpath('//*[@id="mw-content-text"]/h2[1]/following-sibling::node()[not(preceding-sibling::h2[2])]')
    # summary = root.xpath('//*[@id="mw-content-text"]/p[1]')
    # detail['description'] = lxml.html.tostring(summary[0], method='text', encoding=ENCODING)
    # detail['description'] = ''.join(s if issubclass(type(s), str) else s.text_content() for s in summary)
    detail['description'] = ''.join(s.text_content() if hasattr(s, 'text_content') else s for s in summary)
    return detail
Example #38
def getUrl(url):
    req = request.Request(url)
    html = request.urlopen(req).read()
    html = html.decode('utf-8')
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
    )
    return html
Example #39
File: get_web.py Project: wowngasb/kl_tool
def unicodeToStr(html, encoding='utf-8'):
    if not isinstance(html, unicode):
        decoding, charJust = '', chardet.detect(html)
        try: decoding = 'gbk' if charJust['encoding'].lower() == 'gb2312' else charJust['encoding']
        except Exception, e: print 'unicodeToStr chardet detect error:', Exception, '->', e
        if encoding and decoding and decoding!=encoding : html = html.decode(decoding, 'ignore').encode(encoding, 'ignore')
    else:
        if encoding: html = html.encode(encoding, 'ignore')
    return html
Example #40
    def tidy (html):
        """ Pipe html thru w3c tidy. """

        html = parsers.RE_RESTRICTED.sub ('', html)
        html = RE_XMLDECL.sub ('', html)
        html = parsers.RE_HTML_CHARSET.sub ('; charset=utf-8', html)

        # convert to xhtml
        tidy = subprocess.Popen (
            ["tidy",
             "-utf8",
             "-clean",
             "--wrap",             "0",
             # "--drop-font-tags",   "y",
             # "--drop-proprietary-attributes", "y",
             # "--add-xml-space",    "y",
             "--output-xhtml",     "y",
             "--numeric-entities", "y",
             "--merge-divs",       "n", # keep poetry indentation
             "--merge-spans",      "n",
             "--add-xml-decl",     "n",
             "--doctype",          "strict",
             "--anchor-as-name",   "n",
             "--enclose-text",     "y" ],

            stdin = subprocess.PIPE,
            stdout = subprocess.PIPE,
            stderr = subprocess.PIPE)

        # print (html.encode ('utf-8'))
        # sys.exit ()

        (html, stderr) = tidy.communicate (html.encode ('utf-8'))

        regex = re.compile (r'(Info:|Warning:|Error:)\s*', re.I)

        # pylint: disable=E1103
        msg = stderr.decode (sys.stderr.encoding).strip ()
        for line in msg.splitlines ():
            match = regex.search (line)
            if match:
                sline = regex.sub ("", line)
                g = match.group (1).lower ()
                if g == 'info:':
                    info ("tidy: %s" % sline)
                elif g == 'warning:':
                    warning ("tidy: %s" % sline)
                elif g == 'error:':
                    error ("tidy: %s" % sline)
                else:
                    error (line)

        if tidy.returncode == 2:
            raise ValueError (stderr)

        return html.decode ('utf-8')
Example #41
    def tidy (html):
        """ Pipe html thru w3c tidy. """

        html = parsers.RE_RESTRICTED.sub ('', html)
        html = RE_XMLDECL.sub ('', html)
        html = parsers.RE_HTML_CHARSET.sub ('; charset=utf-8', html)

        # convert to xhtml
        tidy = subprocess.Popen (
            ["tidy",
             "-utf8",
             "-clean",
             "--wrap",             "0",
             # "--drop-font-tags",   "y",
             # "--drop-proprietary-attributes", "y",
             # "--add-xml-space",    "y",
             "--output-xhtml",     "y",
             "--numeric-entities", "y",
             "--merge-divs",       "n", # keep poetry indentation
             "--merge-spans",      "n",
             "--add-xml-decl",     "n",
             "--doctype",          "strict",
             "--anchor-as-name",   "n",
             "--enclose-text",     "y" ],

            stdin = subprocess.PIPE,
            stdout = subprocess.PIPE,
            stderr = subprocess.PIPE)

        # print (html.encode ('utf-8'))
        # sys.exit ()

        (html, stderr) = tidy.communicate (html.encode ('utf-8'))

        regex = re.compile ('(Info:|Warning:|Error:)\s*', re.I)

        # pylint: disable=E1103
        msg = stderr.rstrip ()
        for line in msg.splitlines ():
            match = regex.search (line)
            if match:
                sline = regex.sub ("", line)
                g = match.group (1).lower ()
                if g == 'info:':
                    info ("tidy: %s" % sline)
                elif g == 'warning:':
                    warn ("tidy: %s" % sline)
                elif g == 'error:':
                    error ("tidy: %s" % sline)
                else:
                    error (line)

        if tidy.returncode == 2:
            raise ValueError, stderr

        return html.decode ('utf-8')
Example #42
def read_film_list(page_index):
    website = 'http://www.tasteofcinema.com/category/lists/film-lists/page/' + str(page_index) + '/'
    html = get_url_content(website)

    print("Parsing: ", website)
    from toc_parser.list_parser import TocListParser
    parser = TocListParser()
    parser.feed(html.decode('utf-8'))
    parser.close()
    return parser.get_film_list()
Example #43
File: bills.py Project: RCGTDev/openstates
 def _url_2_lxml(self, url, base_url='{0.scheme}://{0.netloc}'.format):
     '''
     Fetch the url as a string, convert it to unicode,
     and parse with lxml.
     '''
     html = self.urlopen(url)
     doc = lxml.html.fromstring(html.decode(self.encoding))
     urldata = urlparse(url)
     doc.make_links_absolute(base_url(urldata))
     return doc
Example #44
def main():
    remove=re.compile(r'                            |</br>|\.*')
    baseurl = 'https://movie.douban.com/top250?start='
    datalist=[]
    for i in range(0,10):
        url=baseurl+str(i*25)
        html=askURL(url)      
        html=html.decode('utf-8').replace(u'\xa0', u' ')
        tree = lxml.html.fromstring(html)
        items=tree.cssselect('div.item')  # find every film item
        for item in items:
            data=[]
            td=item.cssselect('div.hd a span.title')  # the film may have only a Chinese title and no foreign title
            if(len(td)==2):
                ctitle=td[0].text_content()
                data.append(ctitle)  # add the Chinese title
                otitle=td[1].text_content().replace(u'\xa0', u' ')
                otitle=otitle.replace(" / ","")  # strip the separator
                data.append(otitle)  # add the foreign title
            else:
                data.append(td[0].text_content())  # add the Chinese title
                data.append(' ')  # leave blank
            
            rating=item.cssselect('span.rating_num')[0]
            data.append(rating.text_content())  # add the rating
            judgeNum=item.cssselect('div.star span')[3]
            judgeNum=judgeNum.text_content().replace('人评价','')
            data.append(judgeNum)  # add the number of raters
            
            inq=item.cssselect('p.quote')
            # there may be no one-line summary
            if len(inq)!=0:
                inq=inq[0].text_content().replace("。","").strip()  # strip the full stop and extra whitespace
                data.append(inq)  # add the summary
            else:
                data.append(' ')  # leave blank
            
            bd=item.cssselect('p')[0]
            bd=bd.text_content().replace(u'\xa0', u'/')
            bd=bd.encode('GBK','ignore')
            bd=bd.decode('GBK')
            bd=re.sub(remove,"",bd)
            bd=re.sub('\n',"|",bd)  # replace newlines with |
            bd=re.sub(': ',":",bd)  # normalize colons
            bd=re.sub('<br/>',"",bd)  # remove <br/>
            bd=re.sub('///',"|",bd)  # replace /// with |
            words=bd.split("|")
            for s in words:
                if len(s)!=0 and s.strip()!='':  # skip blank entries
                    data.append(s)
            # the lead actor may be missing when the director field is too long
            if(len(data)!=10):
                data.insert(6,' ')  # leave blank
            datalist.append(data)
    return datalist
Example #45
def scrape(url, season):
    html = scraperwiki.scrape(url)
    html = html.decode('utf-8')
    #print html
    root = lxml.html.fromstring(html)
    matchdays = root.xpath(match_date_xpath)
    number_matchdays = len(matchdays)
    for m in matchdays:
        r = match_detail_regex.search(str(m.attrib['id']))
        url = real_baseurl % (season, r.groups()[0], r.groups()[1])
        process(url, season, r.groups()[1])
Example #46
def get_listings(city):
    url = "http://{}.craigslist.org/search/apa".format(city)
    resp = requests.get(url)
    listing_ids = parse_listings(resp.content)

    for listing_id in listing_ids:
        filename = '{}{}.html'.format(DATA_PATH, listing_id)
        if os.path.isfile(filename):
            continue
        html = fetch_listing(city, listing_id)
        open(filename, 'w').write(html.decode('utf-8'))
Example #47
File: lxml_tools.py Project: GrIvA/NB
def truncate_html(html, limit, encoding='utf-8'):
    """
    Truncate html data to specified length and then fix broken tags.
    """

    if not isinstance(html, unicode):
        html = html.decode(encoding)
    truncated_html = html[:limit]
    elem = parse_html(truncated_html, encoding=encoding)
    fixed_html = render_html(elem, encoding=encoding)
    return fixed_html
Example #48
File: lxml_tools.py Project: subeax/grab
def truncate_html(html, limit, encoding='utf-8'):
    """
    Truncate html data to specified length and then fix broken tags.
    """

    if not isinstance(html, unicode):
        html = html.decode(encoding)
    truncated_html = html[:limit]
    elem = parse_html(truncated_html, encoding=encoding)
    fixed_html = render_html(elem, encoding=encoding)
    return fixed_html
Example #49
def download():
    """
    Fetch the film list and store it in a CSV file.
    """
    # 114 pages
    url = 'https://www.dytt8.net/html/gndy/china/list_4_{}.html'
    urls = [url.format(i) for i in range(1, 115)]

    # get a proxy and a user-agent
    proxy = [get_proxy()]
    #proxy = ['223.85.196.75:9999']
    userAgent = getAgent()
    downloader = Downloader(delay=1,
                            user_agent=userAgent,
                            timeout=100,
                            proxies=proxy,
                            num_retries=3)

    # CSV storage
    path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    filepath = os.path.join(path, 'myfile')
    file = os.path.join(filepath, 'dytt.csv')
    col_head = ['名称', '日期']

    csvHtml = CsvHtml(file=file, col_head=col_head)

    for url in urls:
        html = downloader(url)
        # html charset=gb2312, so decode with gbk
        if html:
            html = html.decode('gbk', 'ignore')
            tree = lxml.html.fromstring(html)

            titles = tree.cssselect('div.co_content8 b')
            dates = tree.cssselect('div.co_content8 font')

            for e in zip(titles, dates):
                row = [
                    e[0].text_content(), e[1].text_content().split('\r\n')[0]
                ]
                csvHtml(row)
Example #50
File: st.py Project: tehron/tehbot
 def userstats_api(self, user):
     url = "https://www.securitytraps.pl/wcscore.php?uname=%s&key=%s"
     authkey = self.settings["securitytraps_api_key"]
     html = urllib.request.urlopen(url % (Plugin.to_utf8(user), authkey),
                                   timeout=5).read()
     html = html.decode()
     if html == "0":
         return None
     rank, challs_solved, challs_total, users_total, scoremax = html.split(
         ":")
     return user, str(int(challs_solved)), int(challs_total), str(
         int(rank)), int(users_total), None, int(scoremax), None
Example #51
def scrap( url ):
    # Fetch the page source.
    html = scraperwiki.scrape( url )
    root = lxml.html.fromstring( html.decode( TARGET_ENCODING ) )
    circles = root.xpath( TARGET_XPATH_CIRCLE )
    tables = root.xpath( TARGET_XPATH_TABLES )
    
    validate( len( circles ) > 0 and len( tables ) > 0, ERROR_CODE_NOT_FOUND_TABLE )
    validate( len( circles ) == len( tables ), ERROR_CODE_NOT_CORRESPONDE_COUNT )
    
    # Process the tables circle by circle.
    for (circle,tables) in zip(circles,tables):
        # Process the table header row.
        lines = tables.xpath( 'tr' )
        line_count = len( lines )
        validate( line_count > 1, ERROR_CODE_NO_CIRCLE_RECORD )
    
        name_to_index_table = {}
        index_to_name_table = {}
        header_rows = lines[0].xpath( 'td' )
        row_count = len( header_rows )
        for x in range( row_count ):
            item = header_rows[x]
            name = item.text_content()
            name_to_index_table[name] = x
            index_to_name_table[x] = name
    
        # Read the table records.
        prev_record = {}
        rowspans_count = row_count * [0]
        for y in range( 1, line_count ):
            rows = lines[y].xpath( 'td' )
            record = {TARGET_CIRCLE: circle.text_content()}
            x = 0
            for i in range( row_count ):
                name = index_to_name_table[i]
                if rowspans_count[i] > 0:
                    record[name] = prev_record[name]
                else:
                    item = rows[x]
                    if 'rowspan' in item.attrib:
                        rowspans_count[i] = int( item.attrib['rowspan'] )
                    if name in TARGET_TABLE_ITEM_PARSER:
                        f = TARGET_TABLE_ITEM_PARSER[name]
                        record[name] = f( item )
                    else:
                        record[name] = parse_item_default( item )
                    x += 1
            validate_with_msg( validate_record( record ), ERROR_CODE_INVALID_RECORD_FOUND, record )
            prev_record = record
            formalized_record = formalize_record( record )
            validate( save_record( formalized_record ), ERROR_CODE_FAILED_TO_SAVE )
            rowspans_count = map( (lambda n: n-1), rowspans_count )
Example #52
def unmunge(html):
    """Clean up Word HTML"""
    if 'mso' in html:  # remove outlook html style
        key = '%s:unmunge' % hash(html)
        out = cache.get(key, namespace="filters")
        if not out:
            html = re.sub(re.compile('p"mso.*?"'), 'p', html)
            html = re.sub(re.compile('( style=".*?")'), '', html)
            out = unmungeHtml(html.decode('utf-8'))
            cache.set(key, out, namespace="filters")
        return out
    return html
Example #53
 def userstats(self, user):
     url = "https://defendtheweb.net/wechall/userscore?username=%s&authkey=%s"
     authkey = self.settings["defendtheweb_auth_key"]
     html = urllib.request.urlopen(url % (Plugin.to_utf8(user), authkey),
                                   timeout=5).read()
     html = html.decode()
     if html == "0":
         return None
     user, rank, score, scoremax, challs_solved, challs_total, users_total = html.split(
         ":")
     return user, str(int(challs_solved)), int(challs_total), str(
         int(rank)), int(users_total), int(score), int(scoremax), None
Example #54
File: two47ctf.py Project: tehron/tehbot
 def userstats_api(self, user):
     url = "https://247ctf.com/wechall_validate_score_service?username=%s&authkey=%s"
     authkey = self.settings["247ctf_api_key"]
     html = urllib.request.urlopen(url % (Plugin.to_utf8(user), authkey),
                                   timeout=5).read()
     html = html.decode()
     if html == "":
         return None
     user, rank, score, scoremax, challs_solved, challs_total, users_total = html.split(
         ":")
     return user, str(int(challs_solved)), int(challs_total), str(
         int(rank)), int(users_total), int(score), int(scoremax), None
Example #55
    def smart_decode(self, html):
        """
        Obviously this function is not smart at all.
        Anyway, it works on most sites in Chinese.
        """

        encodings = ['utf-8', 'gbk', 'big5']
        for enc in encodings:
            try:
                return html.decode(enc)
            except:
                continue
        return html
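A hypothetical call to the method above, assuming scraper is an instance of the surrounding class:

gbk_bytes = u'你好'.encode('gbk')        # not valid UTF-8
text = scraper.smart_decode(gbk_bytes)   # utf-8 fails, gbk succeeds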
Example #56
def get_branches():
    b=Browse()
    lid=1
    id=1
    scraperwiki.sqlite.execute('delete from coords_data')
    scraperwiki.sqlite.execute('delete from branch_data')
    data=scraperwiki.sqlite.select('* from regions')
    for d in data:
        
        html=b.query("http://www.openbank.ru/ru/about/office/"+d['city_url'])
        html=html.decode('windows-1251')

        #print html
        r=lxml.html.document_fromstring(html)
        
        data=[]
        branchName=''
        address=''
        for el in r.xpath("//div[@class='body_sec']//*[name()='h4' or name()='ul']"):
            #print el.tag

            #print el.tag
            if el.tag == 'h4': 
                branchName=el.text_content()
                address=''

            if el.tag == 'ul':
                address=get_xpath_el(el, 'li[1]')
            
            if branchName != '' and address!='':
                data.append({'id':id,'branch_name':branchName, 'address':address, 'city':d['city'], 'oblasty':d['oblasty']})
                branchName=''
                address=''
                id+=1

        print data
        #get coordinates
        latlon=re.findall(r'createObject\("Placemark"\, new YMaps\.GeoPoint\(([\d\.]+?),\s*([\d\.]+?)\),\s*?".*?",\s*?"(.*?)"\)\)',html, re.I|re.U)
       
        lldata=[]
        if latlon!=[]:
            for l in latlon:
                lldata.append({'lid':lid, 'lat':l[1], 'lon':l[0], 'branch_data':l[2].encode('utf-8')})
                lid+=1

        if data!=[]:        
            scraperwiki.sqlite.save(unique_keys=['id'], data=data, table_name='branch_data')

        if lldata!=[]:
            scraperwiki.sqlite.save(unique_keys=['lid'], data=lldata, table_name='coords_data')
Example #57
 def getWeibo(self, id, page, cid='107603'):
     # id (str): the blogger's user id; page (int): page index for paging through posts
     url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + id + '&containerid=' + cid + id + '&page=' + str(
         page)
     print("url:", url)
     response = requests.get(url)
     html = response.text.encode('utf-8', 'ignore')
     html1 = html.decode("utf-8", 'ignore')
     ob_json = json.loads(html1)
     #print("ob_json:",ob_json)
     #ob_json: {'ok': 0, 'msg': '这里还没有内容', 'data': {'cards': []}}
     list_card = ob_json['data']
     list_cards = list_card['cards']
     #print("list_cards:",list_cards)
     return list_cards  # return all of the cards on this page
Example #58
def get_url(url):
    req = urllib2.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
    )
    response = urllib2.urlopen(req)
    #tml = res.read().decode('utf-8')
    html = response.read()
    selector = etree.HTML(html)
    print html.decode('utf-8')
    data = selector.xpath("a")
    news = []
    for i in data:
        news.append(i["href"])

    #//a[@class='single-story-module__headline-link']|//section/section[@id='_up_with_heds_1']/div/article/h3/a

    #
    # # request the URL and get its text
    # wbdata = requests.get(url).text
    # # parse the fetched text
    # soup = BeautifulSoup(wbdata, 'lxml')
    # # use a select selector to locate the target elements, returning a list
    # news_links = soup.select("h3")
    # #,h3.story-package-module__story__headline > a
    # #.single-story-module__headline-link
    # news = []
    #
    # # iterate over the returned list
    # for n in news_links:
    #     # extract the title and link info
    #     link = n.get("href")
    #     news.append(link)
    print news
    return news
Example #59
File: crawling.py Project: 2470370075/all
def download(url,user_agent='wswp',retries=2):
    print('download:',url)
    headers={'user agent':user_agent}
    try:
        html=urllib.request.urlopen(url).read()
        html = html.decode('utf-8')
        print(html)
    except urllib.request.URLError as e:
        print('download error:',e.reason)
        html=None
        if retries> 0:
            if hasattr(e,'code') and 500<= e.code<600:
                return download(url,user_agent,retries-1)

    return html
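A hypothetical call to the downloader above:

page = download('http://example.webscraping.com')
if page:
    print(len(page))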