def singtao():
    # cons = sqlite3.connect(path + '/singtao.db')
    # cons.text_factory = str
    url = 'http://news.singtao.ca/vancouver/' + datetime.date.today().strftime("%Y-%m-%d") + '/'
    res = httpfetch(url, 'utf-8')
    # f = file('a.html', 'r')
    # res = f.read()
    # f.close()
    res2 = re.compile(r'>headline(.*?)\.html', re.DOTALL).findall(res)
    for topic in res2:
        web_site = '星島日報'
        if database.find(topic, web_site):
            return
        urlbase = url + 'headline' + topic + '.html'
        try:
            item_page = httpfetch(urlbase, 'utf-8', report=True)
        except Exception:
            print "Unexpected error:", sys.exc_info()[1]
            continue  # nothing to parse if the page could not be fetched
        try:
            title = re.compile(r'<title>(.*?)</title>', re.DOTALL).findall(item_page)[0].split('_')[0]
            content = re.compile(r'<div class="content" id="Zoom">(.*?)</div>', re.DOTALL).findall(item_page)[0]
            content = re.compile(r'<br />', re.DOTALL).sub('\n', content)
            content = re.compile(r'<.*?>', re.DOTALL).sub('', content)
            content = re.compile(r'&.*?;', re.DOTALL).sub(' ', content)
            content = re.compile(r'\n\s+', re.DOTALL).sub('\n', content)
            # content = content.strip()
        except:
            print "Unexpected error:", sys.exc_info()[1]
            print urlbase
            continue  # skip items whose title/content cannot be parsed
        source = '星島日報'
        post_date = datetime.date.today().strftime("%Y-%m-%d")
        tries = 0
        while tries < 2:
            try:
                if not database.find(topic, web_site):
                    database.insert(topic, title, source, content, post_date, urlbase, web_site)
                else:
                    break  # already stored; 'continue' here would loop forever
            except Exception:
                print urlbase
                print sys.exc_info()[0]
                tries += 1
                time.sleep(10)
                continue
            break
    return
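# Every scraper in this listing calls httpfetch(url, ...), which is defined
# elsewhere in the project.  The sketch below is only an assumption about its
# shape (urllib2-based GET, optional decode to utf-8, re-raise on error when
# report=True so callers can run their own retry loops); the real helper may
# take different arguments or handle the needlogin cookie step differently.
def httpfetch_sketch(url, encoding=None, report=False, needlogin=False):
    import urllib2
    # needlogin is accepted but ignored in this sketch
    req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        data = urllib2.urlopen(req, timeout=30).read()
    except Exception:
        if report:
            raise
        return ''
    if encoding:
        data = data.decode(encoding, 'ignore').encode('utf-8')
    return data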
def feed():
    ''' read the verycd feed and keep it updated every 30 min '''
    url = 'http://www.verycd.com/sto/feed'
    print 'fetching feed ...'
    feeds = httpfetch(url)
    ids = re.compile(r'/topics/(\d+)', re.DOTALL).findall(feeds)
    ids = set(ids)
    print ids
    now = time.mktime(time.gmtime())
    for id in ids:
        q.put(id)
def request(pages):
    '''fetch requested resources (these pages need login)'''
    if '-' in pages:
        (f, t) = [int(x) for x in pages.split('-')]
    else:
        f = t = int(pages)
    for page in range(f, t + 1):
        url = 'http://www.verycd.com/orz/page%d?stat=request' % page
        idx = httpfetch(url, needlogin=True)
        ids = re.compile(r'/topics/(\d+)', re.DOTALL).findall(idx)
        print ids[0]
        for id in ids:
            q.put(id)
def hot():
    ''' read verycd hot resources and keep them updated every day '''
    url = 'http://www.verycd.com/'
    print 'fetching homepage ...'
    home = httpfetch(url)
    hotzone = re.compile(r'热门资源.*?</dl>', re.DOTALL).search(home).group()
    hot = re.compile(r'<a href="/topics/(\d+)/"[^>]*>(《.*?》)[^<]*</a>', re.DOTALL).findall(hotzone)
    html = '<h2 style="color:red">每日热门资源</h2>\n'
    for topic in hot:
        print 'fetching hot topic', topic[0], '...'
        q.put(topic[0])
        html += ' <a target="_parent" href="/?id=%s">%s</a> \n' % topic
    open(path + '/static/hot.html', 'w').write(html)
def hot():
    ''' read verycd hot resources and keep them updated every day '''
    url = 'http://www.verycd.com/sto/'
    print 'fetching homepage ...'
    home = httpfetch(url)
    hotzone = re.compile(r'今日热门.*?</dl>', re.DOTALL).search(home).group()
    hot = re.compile(r'<a href="/topics/(\d+)/"[^>]*>(《.*?》)[^<]*</a>', re.DOTALL).findall(hotzone)
    html = '<h2 style="color:red">每日热门资源</h2>\n'
    for topic in hot:
        print 'fetching hot topic', topic[0], '...'
        q.put(topic[0])
        html += ' <a target="_parent" href="/?id=%s">%s</a> \n' % topic
    open(path + '/static/hot.html', 'w').write(html)
def fetchcmt(id, dbc=dbc, debug=False, page=1):
    print 'fetching topic', id, '...'
    urlbase = 'http://www.verycd.com/topics/'
    url = urlbase + str(id) + '/comments/page' + str(page)
    res = ''
    for _ in range(3):
        try:
            res = httpfetch(url, report=True)
            break
        except:
            continue
    if page == 1:
        # page 1 links to the remaining comment pages; fetch those as well
        pages = re.compile(r'/comments/page(\d+)').findall(res)
        if pages:
            pages = set(pages)
            for page in pages:
                if int(page) != 1:  # findall() returns strings, so compare as int
                    fetchcmt(id=id, dbc=dbc, page=page, debug=debug)
    stmts = re.compile(r'<a href="/members/[^>]*>([^<]*)</a>.*?<span class="date-time">(.*?)</span>.*?'
                       r'<!--Wrap-head end-->(.*?)<!--Wrap-tail begin-->', re.DOTALL).findall(res)
    # strip tags from author and date (str.replace cannot take a regex); the
    # body keeps its markup for the targeted cleanup below
    stmts = [[re.sub(r'<.*?>', '', x[0]).strip(),
              re.sub(r'<.*?>', '', x[1]).strip(),
              x[2].strip()] for x in stmts]
    for i in range(len(stmts)):
        stmts[i][2] = re.compile(r'(image-\d*)\.verycd\.com', re.I).sub(r'\1.app-base.com', stmts[i][2])
        stmts[i][2] = re.compile(r'<div[^>]*>', re.I).sub(r'', stmts[i][2])
        stmts[i][2] = re.compile(r'</div>', re.I).sub(r'', stmts[i][2])
        stmts[i][2] = re.compile(r'<!--.*-->', re.I).sub(r'', stmts[i][2])
    # (topic id, author, body, unix time); the listed times appear to be GMT+8,
    # hence the -8h shift
    stmts = [(int(id), x[0], x[2],
              int(time.mktime(time.strptime(x[1], '%Y/%m/%d %H:%M:%S'))) - 8 * 3600) for x in stmts]
    if debug:
        print len(stmts)
        for stmt in stmts:
            print stmt[0], stmt[2], stmt[1]
    tries = 0
    c = None
    while tries < 5:
        try:
            c = dbc.cursor()
            c.executemany('replace into comment values (?,?,?,?)', stmts)
            break
        except:
            tries += 1
            time.sleep(5)
            continue
    dbc.commit()
    if c:
        c.close()
    return
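# fetchcmt() above writes 4-tuples with `replace into comment values (?,?,?,?)`.
# The actual schema lives elsewhere in the project; the statement below is only
# a guess at a compatible layout (a key is needed for `replace into` to dedupe).
CREATE_COMMENT_TABLE_SKETCH = '''
create table if not exists comment (
    topic_id integer,
    author   text,
    content  text,
    posted   integer,          -- unix timestamp
    primary key (topic_id, author, posted)
)
'''
# e.g.  dbc.execute(CREATE_COMMENT_TABLE_SKETCH)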
def update(num=10, off=1):
    urlbase = 'http://www.verycd.com/sto/~all/page'
    for i in range(off, num + 1):
        print 'fetching list', i, '...'
        url = urlbase + str(i)
        res = httpfetch(url, needlogin=True)
        res2 = re.compile(r'"topic-list"(.*?)"pnav"', re.DOTALL).findall(res)
        if res2:
            res2 = res2[0]
        else:
            continue
        topics = re.compile(r'/topics/(\d+)', re.DOTALL).findall(res2)
        topics = set(topics)
        print topics
        for topic in topics:
            q.put(topic)
def update(num=10):
    urlbase = 'http://www.verycd.com/sto/~all/page'
    for i in range(1, num + 1):
        print 'fetching list', i, '...'
        url = urlbase + str(i)
        res = httpfetch(url)
        res2 = re.compile(r'"topic-list"(.*?)"pnav"', re.DOTALL).findall(res)
        if res2:
            res2 = res2[0]
        else:
            continue
        topics = re.compile(r'/topics/(\d+)', re.DOTALL).findall(res2)
        topics = set(topics)
        print topics
        for topic in topics:
            q.put(topic)
def search(keyword, full=True):
    '''search verycd, fetch search results'''
    url = 'http://www.verycd.com/search/folders/' + keyword
    print 'fetching search results ...'
    res = httpfetch(url)
    topics = re.compile(r'/topics/(\d+)', re.DOTALL).findall(res)
    topics = set(topics)
    links = []
    if full:
        links = re.compile(r'/search/folders/(.*?\?start=\d+)', re.DOTALL).findall(res)
        print links
    print topics
    if topics:
        for topic in topics:
            q.put(topic)
    if full and links:
        for key in links:
            search(key, full=False)
def wenxue(num=1):
    urlbase = 'http://news.wenxuecity.com/index.php?page='
    for i in range(1, num + 1):
        print 'fetching wenxue city news on page', i, '...'
        url = urlbase + str(i)
        res = httpfetch(url)
        res2 = re.compile(r'"images/bbslogos/news\.gif"(.*?)"BBSAdd\.php\?SubID=news"', re.DOTALL).findall(res)
        if res2:
            res2 = res2[0]
        else:
            continue
        topics = re.compile(r'messages/(.*?)\.html', re.DOTALL).findall(res2)
        topics = set(topics)
        print topics
        for topic in topics:
            q.put(topic)
def fetchall(ran='1-max', debug=False):
    urlbase = 'http://www.verycd.com/archives/'
    if ran == '1-max':
        m1 = 1
        res = urllib.urlopen(urlbase).read()
        m2 = int(re.compile(r'archives/(\d+)').search(res).group(1))
    else:
        m = ran.split('-')
        m1 = int(m[0])
        m2 = int(m[1])
    print 'fetching list from', m1, 'to', m2, '...'
    for i in range(m1, m2 + 1):
        url = urlbase + '%05d' % i + '.html'
        print 'fetching from', url, '...'
        res = httpfetch(url)
        ids = re.compile(r'topics/(\d+)/', re.DOTALL).findall(res)
        print ids
        for id in ids:
            q.put(id)
def wenxue(num=2):
    urlbase = 'http://www.wenxuecity.com/news/'
    for i in range(1, num + 1):
        # print 'fetching wenxue city news on page', i, '...'
        url = urlbase + "morenews/?page=" + str(i)
        res = httpfetch(url, 'gb2312')
        res2 = re.compile(r'<div class="list" id="contentList">(.*?)<div class="turnpage">', re.DOTALL).findall(res)
        if res2:
            res2 = res2[0]
        else:
            continue
        # grab the IDs (links) of the news items
        topics = re.compile(r'<a href="(.*?)" target="_blank">', re.DOTALL).findall(res2)
        topics = set(topics)
        # print topics
        for topic in topics:
            print topic
            q.put(topic)
            fetch(topic)
    return
def search(keyword, full=True):
    """search verycd, fetch search results"""
    searchlog = path + "/search.log"
    open(searchlog, "a").write("\n" + keyword + "\n")
    url = "http://www.verycd.com/search/folders/" + keyword
    print "fetching search results ..."
    res = httpfetch(url)
    topics = re.compile(r"/topics/(\d+)", re.DOTALL).findall(res)
    topics = set(topics)
    links = []
    if full:
        links = re.compile(r"/search/folders/(.*?\?start=\d+)", re.DOTALL).findall(res)
        print links
    print topics
    if topics:
        for topic in topics:
            open(searchlog, "a").write(topic + ",")
            q.put(topic)
    if full and links:
        for key in links:
            search(key, full=False)
def fetch(id, conn=conn, debug=False):
    print 'fetching topic', id, '...'
    urlbase = 'http://news.wenxuecity.com/messages/'
    url = urlbase + str(id) + '.html'
    news_id = id.split('-')[2]
    if dbfind(news_id, conn):
        return
    res = ''
    for _ in range(3):
        try:
            res = httpfetch(url, report=True)
            break
        except:
            continue
    title = re.compile(r'<h1 class="cnTitle">(.*?)</h1>', re.DOTALL).findall(res)
    if title:
        title = title[0]
        link = url
        web_site = '文学城'
    else:
        return
    try:
        source = re.compile(r'<span style="color: #006699;">(.*?)</span>', re.DOTALL).search(res).group(1)
        post_date = re.compile(r'#cc3300;">(.*?)</span>', re.DOTALL).search(res).group(1)
        content = re.compile(r'<td valign="top" class="main">(.*?)<div align="right">', re.DOTALL).findall(res)
    except:
        return
    if content:
        content = content[0]
        content = re.compile(r'<br />', re.DOTALL).sub('\n', content)
        content = re.compile(r'<.*?>', re.DOTALL).sub('', content)
        content = re.compile(r'&.*?;', re.DOTALL).sub(' ', content)
        content = re.compile(r'\n\s+', re.DOTALL).sub('\n', content)
        content = content.strip()
    else:
        content = ''
    if debug:
        print title
        print source
        print content
        print post_date
        print web_site
    tries = 0
    while tries < 3:
        try:
            if not dbfind(news_id, conn):
                dbinsert(news_id, title, source, content, post_date, link, web_site, conn)
            else:
                break  # already stored; 'continue' here would never terminate
            # dbupdate(news_id, title, source, content, post_date, link, web_site, conn)
            break
        except:
            print sys.exc_info()[0]
            tries += 1
            time.sleep(5)
            continue
    return post_date
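# The wenxuecity fetch() above calls dbfind()/dbinsert(), which are defined in
# another module.  A minimal sketch under the assumption that they wrap a
# sqlite3 `news` table keyed on news_id (table and column names are guesses):
def dbfind_news_sketch(news_id, conn):
    c = conn.cursor()
    c.execute('select 1 from news where news_id=?', (news_id,))
    row = c.fetchone()
    c.close()
    return row is not None

def dbinsert_news_sketch(news_id, title, source, content, post_date, link, web_site, conn):
    c = conn.cursor()
    c.execute('insert into news values (?,?,?,?,?,?,?)',
              (news_id, title, source, content, post_date, link, web_site))
    conn.commit()
    c.close()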
def fetch(id, conn=conn, debug=False):
    print 'fetching topic', id, '...'
    urlbase = 'http://www.verycd.com/topics/'
    url = urlbase + str(id)
    res = ''
    for _ in range(3):
        try:
            res = httpfetch(url, report=True)
            break
        except:
            continue
    abstract = re.compile(r'<h1>.*?visit', re.DOTALL).findall(res)
    if not abstract:
        print res
        if res == '' or '很抱歉' in res:
            print 'resource does not exist'
            return
        else:
            print 'fetching', id, 'again...'
            return fetch(id, conn)
    abstract = abstract[0]
    title = re.compile(r'<h1>(.*?)</h1>', re.DOTALL).findall(abstract)
    if title:
        title = title[0]
    else:
        return
    try:
        status = re.compile(r'"requestWords">(.*?)<', re.DOTALL).search(abstract).group(1)
        brief = re.compile(r'"font-weight:normal"><span>(.*?)</td>', re.DOTALL).search(abstract).group(1)
        brief = re.compile(r'<.*?>', re.DOTALL).sub('', brief).strip()
        pubtime = re.compile(r'"date-time">(.*?)</span>.*?"date-time">(.*?)</span>', re.DOTALL).findall(abstract)[0]
        category1 = re.compile(r'分类.*?<td>(.*?) (.*?) ', re.DOTALL).findall(abstract)[0]
        category = ['', '']
        category[0] = re.compile(r'<.*?>', re.DOTALL).sub('', category1[0]).strip()
        category[1] = re.compile(r'<.*?>', re.DOTALL).sub('', category1[1]).strip()
        res2 = re.compile(r'iptcomED2K"><!--eMule.*?<!--eMule end-->', re.DOTALL).findall(res)[0]
        ed2k = re.compile(r'ed2k="([^"]*)" subtitle_[^=]*="([^"]*)">([^<]*)</a>', re.DOTALL).findall(res2)
        ed2k.extend(re.compile(r'ed2k="([^"]*)">([^<]*)</a>', re.DOTALL).findall(res2))
        content = re.compile(r'<!--eMule end-->(.*?)<!--Wrap-tail end-->', re.DOTALL).findall(res)
    except:
        return
    if content:
        content = content[0]
        content = re.compile(r'<br />', re.DOTALL).sub('\n', content)
        content = re.compile(r'<.*?>', re.DOTALL).sub('', content)
        content = re.compile(r'&.*?;', re.DOTALL).sub(' ', content)
        content = re.compile(r'\n\s+', re.DOTALL).sub('\n', content)
        content = content.strip()
    else:
        content = ''
    if debug:
        print title
        print status
        print brief
        print pubtime[0], pubtime[1]
        print category[0], category[1]
        for x in ed2k:
            print x
        print content
    ed2kstr = ''
    for x in ed2k:
        ed2kstr += '`'.join(x) + '`'
    if not dbfind(id, conn):
        dbinsert(id, title, status, brief, pubtime, category, ed2kstr, content, conn)
    else:
        dbupdate(id, title, status, brief, pubtime, category, ed2kstr, content, conn)
    return pubtime[1]
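# The verycd fetch() variants call dbfind()/dbinsert()/dbupdate() with
# (id, title, status, brief, pubtime, category, ed2kstr, content, conn).
# Those helpers live elsewhere; the sketch below only illustrates one layout
# that would fit that call signature (table and column names are guesses).
def dbinsert_verycd_sketch(id, title, status, brief, pubtime, category, ed2kstr, content, conn):
    c = conn.cursor()
    c.execute('insert into verycd values (?,?,?,?,?,?,?,?,?)',
              (int(id), title, status, brief,
               pubtime[0], pubtime[1],           # created / last-updated times
               category[0] + '/' + category[1],  # main / sub category
               ed2kstr, content))
    conn.commit()
    c.close()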
def fetch(id, db=db, dbl=dbl, dbc=dbc, debug=False):
    print 'fetching topic', id, '...'
    urlbase = 'http://www.verycd.com/topics/'
    url = urlbase + str(id)
    res = ''
    for _ in range(3):
        try:
            res = httpfetch(url, report=True, needlogin=False)
            break
        except:
            continue
    abstract = re.compile(r'<h1>.*?visit', re.DOTALL).findall(res)
    if not abstract:
        print res
        if res == '' or '很抱歉' in res:
            print 'resource does not exist'
            return
        else:
            print 'fetching', id, 'again...'
            return fetch(id, db)
    abstract = abstract[0]
    title = re.compile(r'<h1>(.*?)</h1>', re.DOTALL).findall(abstract)
    if title:
        title = title[0]
    else:
        return
    try:
        status = re.compile(r'"requestWords">(.*?)<', re.DOTALL).search(abstract).group(1)
        brief = re.compile(r'"font-weight:normal">\s*<span>(.*?)</td>', re.DOTALL).search(abstract).group(1)
        brief = re.compile(r'<.*?>', re.DOTALL).sub('', brief).strip()
        pubtime = re.compile(r'"date-time">(.*?)</span>.*?"date-time">(.*?)</span>', re.DOTALL).findall(abstract)[0]
        category1 = re.compile(r'<strong>分类.*?<td>(.*?) (.*?) ', re.DOTALL).findall(abstract)[0]
        category = ['', '']
        category[0] = re.compile(r'<.*?>', re.DOTALL).sub('', category1[0]).strip()
        category[1] = re.compile(r'<.*?>', re.DOTALL).sub('', category1[1]).strip()
        ed2k = re.compile(r'ed2k="([^"]*)" (subtitle_[^=]*="[^"]*"[^>]*)>([^<]*)</a>', re.DOTALL).findall(res)
        ed2k.extend(re.compile(r'ed2k="([^"]*)">([^<]*)</a>', re.DOTALL).findall(res))
        # delete duplicate links while keeping their original order
        newed2k = []
        for x in ed2k:
            if x not in newed2k:
                newed2k.append(x)
        ed2k = newed2k
        content = re.compile(r'id="iptcomContents">(.*?)<!--Wrap-tail end-->', re.DOTALL).findall(res)
    except:
        return
    if content:
        content = content[0]
        content = re.compile(r'<(img .*?)>').sub(r'[\1]', content)
        content = re.compile(r'<br />', re.DOTALL).sub('\n', content)
        content = re.compile(r'<.*?>', re.DOTALL).sub('', content)
        content = re.compile(r'&.*?;', re.DOTALL).sub(' ', content)
        content = re.compile(r'\n\s+', re.DOTALL).sub('\n', content)
        content = re.compile(r'\[(img .*?)\]').sub(r'<\1><br>', content)
        content = re.compile(r'(image-\d*)\.verycd\.com', re.I).sub(r'\1.app-base.com', content)
        content = content.strip()
    else:
        content = ''
    vcpv = 0
    # fetch stat
    try:
        staturl = 'http://stat.verycd.com/counters/folder/' + str(id) + '/'
        st = httpfetch(staturl)
        vcpv = int(re.compile(r'\'(\d+)\'').findall(st)[0])
    except:
        pass
    # update lock
    owner = re.compile(r'<div id="userres">.*?<td align="left" valign="top"><p><strong id="username">'
                       r'<a href=.*?>(.*)</a></strong>', re.DOTALL).findall(res)
    if owner:
        owner = owner[0]
    cl = dbl.cursor()
    try:
        cl.execute('replace into lock values (?,?,?,?,?,?,?)',
                   (long(id), True, owner, '', title, pubtime[1], vcpv))
    except:
        pass
    while True:  # retry until the commit goes through
        try:
            dbl.commit()
            break
        except:
            pass
    cl.close()
    if debug:
        if vcpv:
            print vcpv
        if owner:
            print owner
        print title
        print status
        print brief
        print pubtime[0], pubtime[1]
        print category[0], category[1]
        for x in ed2k:
            print x
        print content
    ed2kstr = ''
    for x in ed2k:
        ed2kstr += '`'.join(x) + '`'
    if ed2kstr == '':
        return
    # update verycd
    try:
        if not dbfind(id, db):
            dbinsert(id, title, status, brief, pubtime, category, ed2kstr, content, db)
        else:
            dbupdate(id, title, status, brief, pubtime, category, ed2kstr, content, db)
    except Exception as what:
        print what
    # update comments
    fetchcmt(id=id, dbc=dbc)
    return pubtime[1]
def fetch(id, conn=conn, debug=False):
    print "fetching topic", id, "..."
    urlbase = "http://www.verycd.com/topics/"
    url = urlbase + str(id)
    res = ""
    for _ in range(3):
        try:
            res = httpfetch(url, report=True)
            break
        except:
            continue
    abstract = re.compile(r"<h1>.*?visit", re.DOTALL).findall(res)
    if not abstract:
        print res
        if res == "" or "很抱歉" in res:
            print "resource does not exist"
            return
        else:
            print "fetching", id, "again..."
            return fetch(id, conn)
    abstract = abstract[0]
    title = re.compile(r"<h1>(.*?)</h1>", re.DOTALL).findall(abstract)
    if title:
        title = title[0]
    else:
        return
    try:
        status = re.compile(r'"requestWords">(.*?)<', re.DOTALL).search(abstract).group(1)
        brief = re.compile(r'"font-weight:normal"><span>(.*?)</td>', re.DOTALL).search(abstract).group(1)
        brief = re.compile(r"<.*?>", re.DOTALL).sub("", brief).strip()
        pubtime = re.compile(r'"date-time">(.*?)</span>.*?"date-time">(.*?)</span>', re.DOTALL).findall(abstract)[0]
        category1 = re.compile(r"分类.*?<td>(.*?) (.*?) ", re.DOTALL).findall(abstract)[0]
        category = ["", ""]
        category[0] = re.compile(r"<.*?>", re.DOTALL).sub("", category1[0]).strip()
        category[1] = re.compile(r"<.*?>", re.DOTALL).sub("", category1[1]).strip()
        # res2 = re.compile(r'iptcomED2K"><!--eMule.*?<!--eMule end-->', re.DOTALL).findall(res)[0]
        ed2k = re.compile(r'ed2k="([^"]*)" (subtitle_[^=]*="[^"]*"[^>]*)>([^<]*)</a>', re.DOTALL).findall(res)
        ed2k.extend(re.compile(r'ed2k="([^"]*)">([^<]*)</a>', re.DOTALL).findall(res))
        content = re.compile(r"<!--eMule end-->(.*?)<!--Wrap-tail end-->", re.DOTALL).findall(res)
    except:
        return
    if content:
        content = content[0]
        content = re.compile(r"<br />", re.DOTALL).sub("\n", content)
        content = re.compile(r"<.*?>", re.DOTALL).sub("", content)
        content = re.compile(r"&.*?;", re.DOTALL).sub(" ", content)
        content = re.compile(r"\n\s+", re.DOTALL).sub("\n", content)
        content = content.strip()
    else:
        content = ""
    if debug:
        print title
        print status
        print brief
        print pubtime[0], pubtime[1]
        print category[0], category[1]
        for x in ed2k:
            print x
        print content
    ed2kstr = ""
    for x in ed2k:
        ed2kstr += "`".join(x) + "`"
    tries = 0
    while tries < 3:
        try:
            if not dbfind(id, conn):
                dbinsert(id, title, status, brief, pubtime, category, ed2kstr, content, conn)
            else:
                dbupdate(id, title, status, brief, pubtime, category, ed2kstr, content, conn)
            break
        except:
            tries += 1
            time.sleep(5)
            continue
    return pubtime[1]
def fetch(id, conn=conn, debug=False):
    print 'fetching topic', id, '...'
    urlbase = 'http://www.verycd.com/topics/'
    url = urlbase + str(id)
    res = ''
    for _ in range(3):
        try:
            res = httpfetch(url, report=True)
            break
        except:
            continue
    abstract = re.compile(r'<h1>.*?visit', re.DOTALL).findall(res)
    if not abstract:
        print res
        if res == '' or '很抱歉' in res:
            print 'resource does not exist'
            return
        else:
            print 'fetching', id, 'again...'
            return fetch(id, conn)
    abstract = abstract[0]
    title = re.compile(r'<h1>(.*?)</h1>', re.DOTALL).findall(abstract)[0]
    status = re.compile(r'"requestWords">(.*?)<', re.DOTALL).search(abstract).group(1)
    brief = re.compile(r'"font-weight:normal"><span>(.*?)</td>', re.DOTALL).search(abstract).group(1)
    brief = re.compile(r'<.*?>', re.DOTALL).sub('', brief).strip()
    pubtime = re.compile(r'"date-time">(.*?)</span>.*?"date-time">(.*?)</span>', re.DOTALL).findall(abstract)[0]
    category1 = re.compile(r'分类.*?<td>(.*?) (.*?) ', re.DOTALL).findall(abstract)[0]
    category = ['', '']
    category[0] = re.compile(r'<.*?>', re.DOTALL).sub('', category1[0]).strip()
    category[1] = re.compile(r'<.*?>', re.DOTALL).sub('', category1[1]).strip()
    res2 = re.compile(r'iptcomED2K"><!--eMule.*?<!--eMule end-->', re.DOTALL).findall(res)[0]
    ed2k = re.compile(r'ed2k="([^"]*)" subtitle_[^=]*="([^"]*)">([^<]*)</a>', re.DOTALL).findall(res2)
    ed2k.extend(re.compile(r'ed2k="([^"]*)">([^<]*)</a>', re.DOTALL).findall(res2))
    content = re.compile(r'<!--eMule end-->(.*?)<!--Wrap-tail end-->', re.DOTALL).findall(res)
    if content:
        content = content[0]
        content = re.compile(r'<br />', re.DOTALL).sub('\n', content)
        content = re.compile(r'<.*?>', re.DOTALL).sub('', content)
        content = re.compile(r'&.*?;', re.DOTALL).sub(' ', content)
        content = re.compile(r'\n\s+', re.DOTALL).sub('\n', content)
        content = content.strip()
    else:
        content = ''
    if debug:
        print title
        print status
        print brief
        print pubtime[0], pubtime[1]
        print category[0], category[1]
        for x in ed2k:
            print x
        print content
    ed2kstr = ''
    for x in ed2k:
        ed2kstr += '`'.join(x) + '`'
    if not dbfind(id, conn):
        dbinsert(id, title, status, brief, pubtime, category, ed2kstr, content, conn)
    else:
        dbupdate(id, title, status, brief, pubtime, category, ed2kstr, content, conn)
    return pubtime[1]
def fetch(i, debug=False):
    path = os.path.dirname(os.path.realpath(sys.argv[0]))
    conn = sqlite3.connect(path + '/news.sqlite3.db')
    conn.text_factory = str
    # print 'fetching topic', i, '...'
    urlbase = 'http://www.wenxuecity.com'
    url = urlbase + i
    news_id = i.split('/')[5]
    news_id = news_id.split('.')[0]
    w = "文学城"
    # if database.find(news_id, w, conn):
    #     return
    res = ''
    for _ in range(3):
        try:
            res = httpfetch(url, 'utf-8', report=True)
            break
        except:
            print sys.exc_info()[1]
            continue
    res = re.compile(r'<div class="maincontainer">(.*?)<div class="banners">', re.DOTALL).findall(res)[0]
    title = re.compile(r'<h3>(.*?)</h3>', re.DOTALL).findall(res)
    if title:
        title = title[0].encode('utf-8')
        link = url
        web_site = '文学城'
    try:
        parse = re.compile(r'<div id="postmeta">(.*?) <span>', re.DOTALL).search(res).group(1)
        source = re.compile(r'itemprop="author">(.*?)</span>', re.DOTALL).findall(parse)[0]
        post_date = re.compile(r'datetime(.*?)</time>', re.DOTALL).findall(parse)[0]
        post_date = post_date.split('>')[1]
        content = re.compile(r'<div id="articleContent" class="article">(.*?)<div class="sharewechat">',
                             re.DOTALL).findall(res)[0]
        if content:
            # content = content[0]
            content = re.compile(r'<div style=(.*?)>', re.DOTALL).sub('', content)
            content = re.compile(r'<br>', re.DOTALL).sub('', content)
            content = re.compile(r'<.*?>', re.DOTALL).sub('', content)
            content = re.compile(r'&.*?;', re.DOTALL).sub(' ', content)
            content = re.compile(r'\n\s+', re.DOTALL).sub('\n', content)
            content = content.strip()
        else:
            content = ''
        print news_id
        if debug:
            print title
            print source
            print content
            print post_date
            print web_site
        if not database.find(news_id, web_site, conn):
            database.insert(news_id, title, source, content, post_date, link, web_site, conn)
        else:
            database.update(news_id, title, source, content, post_date, link, web_site, conn)
    except:
        print sys.exc_info()[1]  # report the parse/db error instead of printing an empty line
    return post_date
def run(self):
    for i in range(8):
        t = Thread(target=self.thread_fetch)
        t.setDaemon(True)
        t.start()
    conn = sqlite3.connect(self.path + '/verycd.sqlite3.db')
    conn.text_factory = str
    while True:
        try:
            # feed
            if time.mktime(time.gmtime()) % 60 < 10:
                self.q.put('feed')
            # check searchqueue every 10 secs
            taskqueue = open(self.path + '/searchqueue', 'r').readlines()
            print taskqueue, time.mktime(time.gmtime()), time.mktime(time.gmtime()) % 900
            open(self.path + '/searchqueue', 'w').write('')
            for task in taskqueue:
                url = 'http://www.verycd.com/search/folders/' + task
                print 'fetching', url, '...'
                res = httpfetch(url)
                print '...fetching completed'
                topics = re.compile(r'/topics/(\d+)', re.DOTALL).findall(res)
                topics = set(topics)
                for topic in topics:
                    self.q.put(topic)
            if taskqueue == []:
                time.sleep(10)
            # read the feed every 600 secs
            if time.mktime(time.gmtime()) % 600 < 10:
                url = 'http://www.verycd.com/sto/feed'
                print 'fetching feed ...'
                feeds = httpfetch(url)
                topics = re.compile(r'/topics/(\d+)', re.DOTALL).findall(feeds)
                topics = set(topics)
                print topics
                now = time.mktime(time.gmtime())
                for topic in topics:
                    self.q.put(topic)
            # read hot resources every 4 hours
            # (an earlier version read them once a day at GMT 19:00)
            if time.mktime(time.gmtime()) % (3600 * 4) < 10:
                url = 'http://www.verycd.com/'
                print 'fetching homepage ...'
                home = httpfetch(url)
                hotzone = re.compile(r'热门资源.*?</dl>', re.DOTALL).search(home).group()
                hot = re.compile(r'<a href="/topics/(\d+)/"[^>]*>(《.*?》)[^<]*</a>', re.DOTALL).findall(hotzone)
                html = '<h2 style="color:red">每日热门资源</h2>\n'
                for topic in hot:
                    print 'fetching hot topic', topic[0], '...'
                    self.q.put(topic[0])
                    html += ' <a target="_parent" href="/?id=%s">%s</a> \n' % topic
                open(self.path + '/static/hot.html', 'w').write(html)
            # update 20 whole pages at GMT 19:10
            # (use seconds-of-day here; the old %(86400/6) modulus could never reach 69000)
            timeofday = time.mktime(time.gmtime()) % 86400
            if timeofday > 69000 and timeofday < 69010:
                urlbase = 'http://www.verycd.com/sto/~all/page'
                for i in range(1, 20):
                    print 'fetching list', i, '...'
                    url = urlbase + str(i)
                    res = httpfetch(url)
                    res2 = re.compile(r'"topic-list"(.*?)"pnav"', re.DOTALL).findall(res)
                    if res2:
                        res2 = res2[0]
                    else:
                        continue
                    topics = re.compile(r'/topics/(\d+)', re.DOTALL).findall(res2)
                    topics = set(topics)
                    print topics
                    for topic in topics:
                        self.q.put(topic)
            # update 1 page@normal and 1 page@request every 3600 secs
            if time.mktime(time.gmtime()) % 3600 < 10:
                url = 'http://www.verycd.com/orz/page1?stat=normal'
                idx = httpfetch(url, needlogin=True)
                ids = re.compile(r'/topics/(\d+)', re.DOTALL).findall(idx)
                print ids[0]
                for id in ids:
                    self.q.put(id)
                url = 'http://www.verycd.com/orz/page1?stat=request'
                idx = httpfetch(url, needlogin=True)
                ids = re.compile(r'/topics/(\d+)', re.DOTALL).findall(idx)
                print ids[0]
                for id in ids:
                    self.q.put(id)
        except:
            time.sleep(10)
            continue
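# run() above starts eight threads executing self.thread_fetch, which is not
# part of this listing.  The worker below is only a guess at its shape: drain
# self.q and dispatch each item to fetch() (or feed() for the 'feed' marker);
# the real worker may dispatch differently or batch its database writes.
def thread_fetch(self):
    while True:
        topic = self.q.get()
        try:
            if topic == 'feed':
                feed()
            else:
                fetch(topic)
        except Exception:
            print 'worker error:', sys.exc_info()[1]
        finally:
            self.q.task_done()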
def fetch(id, conn=conn, debug=False):
    print 'fetching topic', id, '...'
    urlbase = 'http://www.verycd.com/topics/'
    url = urlbase + str(id)
    res = ''
    for _ in range(3):
        try:
            res = httpfetch(url, report=True)
            break
        except:
            continue
    abstract = re.compile(r'<h1.*?</ul>', re.DOTALL).findall(res)
    if not abstract:
        print res
        if res == '' or '很抱歉' in res:
            print 'resource does not exist'
            return
        else:
            print 'fetching', id, 'again...'
            return fetch(id, conn)
    abstract = abstract[0]
    title = re.compile(r'<h1.*?</h1>', re.DOTALL).findall(abstract)
    if title:
        title = title[0]
        title = re.compile(r'<.*?>', re.DOTALL).sub('', title).strip()
    else:
        return
    try:
        status = re.compile(r'状态.*?<span>(.*?)</span>.*?</li>', re.DOTALL).search(abstract).group(1)
        brief = re.compile(r'摘要.*?<span>(.*?)</li>', re.DOTALL).search(abstract).group(1)
        brief = re.compile(r'<.*?>', re.DOTALL).sub('', brief).strip()
        pubtime = re.compile(r'date-time.*?>(.*?)</span>.*?date-time.*?>(.*?)</span>', re.DOTALL).findall(abstract)[0]
        category1 = re.compile(r'分类.*?<a.*?>(.*?)</a>.*?<a.*?>(.*?)</a>', re.DOTALL).findall(abstract)[0]
        category = ['', '']
        category[0] = re.compile(r'<.*?>', re.DOTALL).sub('', category1[0]).strip()
        category[1] = re.compile(r'<.*?>', re.DOTALL).sub('', category1[1]).strip()
        print category
        # res2 = re.compile(r'iptcomED2K"><!--eMule.*?<!--eMule end-->', re.DOTALL).findall(res)[0]
        ed2k = re.compile(r'ed2k="([^"]*)" (subtitle_[^=]*="[^"]*"[^>]*)>([^<]*)</a>', re.DOTALL).findall(res)
        ed2k.extend(re.compile(r'ed2k="([^"]*)">([^<]*)</a>', re.DOTALL).findall(res))
        content = re.compile(r'<!--eMule end-->(.*?)<!--Wrap-tail end-->', re.DOTALL).findall(res)
    except:
        return
    if content:
        content = content[0]
        content = re.compile(r'<br />', re.DOTALL).sub('\n', content)
        content = re.compile(r'<.*?>', re.DOTALL).sub('', content)
        content = re.compile(r'&.*?;', re.DOTALL).sub(' ', content)
        content = re.compile(r'\n\s+', re.DOTALL).sub('\n', content)
        content = content.strip()
    else:
        content = ''
    if debug:
        print title
        print status
        print brief
        print pubtime[0], pubtime[1]
        print category[0], category[1]
        for x in ed2k:
            print x
        print content
    ed2kstr = ''
    for x in ed2k:
        ed2kstr += '`'.join(x) + '`'
    tries = 0
    while tries < 3:
        try:
            if not dbfind(id, conn):
                dbinsert(id, title, status, brief, pubtime, category, ed2kstr, content, conn)
            else:
                dbupdate(id, title, status, brief, pubtime, category, ed2kstr, content, conn)
            break
        except:
            tries += 1
            time.sleep(5)
            continue
    return pubtime[1]
def fetch(i, debug=False):
    # path = os.path.dirname(os.path.realpath(sys.argv[0]))
    # conn = sqlite3.connect(path + '/news.sqlite3.db')
    # conn.text_factory = str
    # print 'fetching topic', i, '...'
    urlbase = 'http://www.wenxuecity.com'
    url = urlbase + i
    news_id = i.split('/')[5]
    news_id = news_id.split('.')[0]
    w = "文学城"
    # if database.find(news_id, w, conn):
    #     return
    res = ''
    for _ in range(3):
        try:
            res = httpfetch(url, 'utf-8', report=True)
            break
        except:
            print sys.exc_info()[1]
            continue
    try:
        res = re.compile(r'<div class="maincontainer">(.*?)<div class="sharewechat">', re.DOTALL).findall(res)[0]
        title = re.compile(r'<h3>(.*?)</h3>', re.DOTALL).findall(res)
    except Exception as e:
        print e
        print url
        return  # nothing usable was extracted; bail out before 'title' is referenced
    if title:
        title = html_decode(title[0].encode('utf-8'))
        link = url
        web_site = '文学城'
    try:
        parse = re.compile(r'<div id="postmeta">(.*?) <span>', re.DOTALL).search(res).group(1)
        source = re.compile(r'itemprop="author">(.*?)</span>', re.DOTALL).findall(parse)[0]
        post_date = re.compile(r'datetime(.*?)</time>', re.DOTALL).findall(parse)[0]
        post_date = post_date.split('>')[1]
        content = re.compile(r'<div id="articleContent" class="article">(.*?)<iframe', re.DOTALL).findall(res)[0]
        if content:
            # content = content[0]
            content = re.compile(r'<div style=(.*?)>', re.DOTALL).sub('', content)
            content = re.compile(r'<br>', re.DOTALL).sub('', content)
            content = re.compile(r'<.*?>', re.DOTALL).sub('', content)
            content = re.compile(r'&.*?;', re.DOTALL).sub(' ', content)
            content = re.compile(r'\n\s+', re.DOTALL).sub('\n', content)
            content = content.strip()
        else:
            content = ''
        print news_id
        if debug:
            print title
            print source
            print content
            print post_date
            print web_site
        n = {
            "news_id": news_id,
            "title": title,
            "content": html_decode(content),
            "source": source,
            "link": link,
            "post_date": post_date
        }
        uri = 'http://' + HOST_NAME + '/api/wenxue'
        headers = {"Content-Type": "application/json"}
        r = requests.post(uri, json=n, headers=headers)
        print r.text
        # if not database.find(news_id, web_site, conn):
        #     database.insert(news_id, title, source, content, post_date, link, web_site, conn)
        # else:
        #     database.update(news_id, title, source, content, post_date, link, web_site, conn)
    except Exception, e:
        print "Failed with:", title
        print e
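# html_decode() used above is defined elsewhere in the project.  A minimal
# sketch, assuming it just unescapes HTML entities (&amp;, &#39;, ...) in the
# extracted text; the real helper may also normalise encodings.
def html_decode_sketch(text):
    from HTMLParser import HTMLParser
    return HTMLParser().unescape(text)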