def get_rss2(url):
    """Fetch the RSS feed at *url* and return its entries as RSSItem objects.

    First tries the strict SyndicationFeed parser; if the document is not
    well-formed for it (XmlException), falls back to downloading the raw
    XML and scraping <item> children with XPath.

    url -- feed URL string.
    Returns a list of RSSItem(title, description, link).
    """
    try:
        with XmlReader.Create(url) as reader:
            return [
                RSSItem(
                    i.Title.Text,
                    # Guard: entries without a summary would otherwise
                    # raise on i.Summary.Text (Summary is null).
                    i.Summary.Text if i.Summary is not None else "",
                    i.Links[0].Uri.AbsoluteUri if i.Links.Count > 0 else "")
                for i in SyndicationFeed.Load(reader).Items
            ]
    except XmlException:
        # Lenient fallback for feeds that the strict parser rejects.
        # WebClient is IDisposable: use `with` so the underlying handle is
        # released (the original leaked it; XmlReader above already relies
        # on IronPython's IDisposable-as-context-manager support).
        with WebClient() as wc:
            wc.Encoding = UTF8
            xmlstr = wc.DownloadString(url)
        xdoc = XmlDocument()
        xdoc.LoadXml(xmlstr)
        root = xdoc.DocumentElement
        titles = [
            n.InnerText.Replace("\n", "").Replace("\r", "")
            for n in root.SelectNodes("//item//title")
        ]
        links = [n.InnerText for n in root.SelectNodes("//item//link")]
        descriptions = [
            n.InnerText for n in root.SelectNodes("//item//description")
        ]
        # zip truncates to the shortest list, so an item missing one field
        # is dropped rather than raising.
        return [
            RSSItem(t, d, l)
            for t, d, l in zip(titles, descriptions, links)
        ]
def get_rss2(url):
    """Load the RSS feed at *url* and return a list of RSSItem objects.

    Uses the strict SyndicationFeed reader when the feed parses cleanly;
    on XmlException, re-downloads the raw XML and extracts each item's
    title/description/link via XPath instead.
    """
    try:
        with XmlReader.Create(url) as reader:
            entries = []
            for entry in SyndicationFeed.Load(reader).Items:
                if entry.Links.Count > 0:
                    link = entry.Links[0].Uri.AbsoluteUri
                else:
                    link = ""
                entries.append(RSSItem(entry.Title.Text, entry.Summary.Text, link))
            return entries
    except XmlException:
        # Lenient fallback: fetch the raw document and scrape it.
        downloader = WebClient()
        downloader.Encoding = UTF8
        document = XmlDocument()
        document.LoadXml(downloader.DownloadString(url))
        root = document.DocumentElement

        def texts(xpath):
            # InnerText of every node matched by *xpath*, in document order.
            return [node.InnerText for node in root.SelectNodes(xpath)]

        titles = [
            t.Replace("\n", "").Replace("\r", "")
            for t in texts("//item//title")
        ]
        descriptions = texts("//item//description")
        links = texts("//item//link")
        return [
            RSSItem(title, description, link)
            for title, description, link in zip(titles, descriptions, links)
        ]
def process(self): empty = False try: client = WebClient() client.Encoding = Encoding.UTF8 client.Headers['Accept'] = 'text/html' client.Headers[ 'User-Agent'] = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)' body = client.DownloadString( 'http://search.twitter.com/search/thread/%d' % self.status.Id) divs = re.findall(r'<div class="msg">(.*?)</div>', body, re.S) if divs: for div in divs: match = re.search( r'<a[^>]*>(.*?)</a>.*<span[^>]*>(.*?)</span>', div) name = match.group(1) text = re.sub(r'<[^>]*>', '', match.group(2)) self.notice(text, nick=name) else: empty = True except WebException, e: if e.Response.StatusCode == 404: # クロールされていないかプロテクトか empty = True else: raise
def _get_htmldoc(url, encode=UTF8):
    """Download the page at *url* and return it as an HtmlDocument.

    encode -- text encoding used for the download (defaults to UTF8).
    Returns None when the download fails with a WebException.
    """
    client = WebClient()
    client.Encoding = encode
    try:
        markup = client.DownloadString(url)
    except WebException:
        return None
    document = HtmlDocument()
    document.LoadHtml(markup)
    return document
def process(self): empty = False try: client = WebClient() client.Encoding = Encoding.UTF8 client.Headers['Accept'] = 'text/html' client.Headers['User-Agent'] = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)' body = client.DownloadString('http://search.twitter.com/search/thread/%d' % self.status.Id) divs = re.findall(r'<div class="msg">(.*?)</div>', body, re.S) if divs: for div in divs: match = re.search(r'<a[^>]*>(.*?)</a>.*<span[^>]*>(.*?)</span>', div) name = match.group(1) text = re.sub(r'<[^>]*>', '', match.group(2)) self.notice(text, nick=name) else: empty = True except WebException, e: if e.Response.StatusCode == 404: # クロールされていないかプロテクトか empty = True else: raise