def _getpics_index(self):
    """Scrape the gallery index page, then every picture page, filling
    self.piclist with direct image URLs and setting self.name.

    Performs one HTTP request per page via self.ht; network-bound.
    """
    picpagelist = []
    oripagelist = []
    resp, content = self.ht.request(self.url, headers=_headers.get())
    # Gallery title comes from the <h1> header of the index page.
    self.name = re.findall('<h1>(.+)</h1', content)[0].strip()
    # FIX: was a py2 `print self.name` statement, inconsistent with the
    # print() calls used everywhere else in this function (and a
    # SyntaxError under py3). Single-argument print() behaves the same.
    print(self.name)
    # The listing block holds one <li> per picture page.
    picarea = re.findall('div class="listnr">(.*?)</div', content, re.DOTALL)[0]
    for sec in re.findall('li>(.*?)</li', picarea, re.DOTALL):
        picpagelist.append(self.absurl(re.findall('href="(.*?)"', sec)[0]))
    print('Done gathering %d pages.' % len(picpagelist))
    # Each picture page links (class="in2") to its original-size page.
    for url in picpagelist:
        rs, ct = self.ht.request(url, headers=_headers.get())
        oripagelist.append(self.absurl(re.findall('href="(.+)" class="in2"', ct)[0]))
    print('Done loading %d pages.' % len(oripagelist))
    for url in oripagelist:
        # _parseori returns (picurl, nexturl, cur, total); keep the image URL.
        self.piclist.append(self._parseori(url)[0])
    print('Done loading %d image urls.' % len(self.piclist))
def _parseori(self, url):
    """Parse one original-size picture page.

    Returns a tuple (picurl, nexturl, cur, total) where picurl is the
    direct image URL, nexturl is the next page's URL or '' on the last
    page (or when no link is found), and cur/total are page-counter
    strings scraped from the <i> elements.
    """
    rs, ct = self.ht.request(url, headers=_headers.get())
    picurl = re.findall('<img id="on_img".*?src="(.*?)"', ct, re.DOTALL)[0]
    nexturl = re.findall('<a href="(.*?)"', ct)
    # BUG FIX: the original `nexturl == [] and '' or (...)` idiom fell
    # through to the `or` branch when the list WAS empty (because ''
    # is falsy), so nexturl[0] raised IndexError on exactly the case
    # it tried to guard. Use a real conditional instead.
    if nexturl:
        nexturl = 'http://photos.pcgames.com.cn/source/' + nexturl[0]
    else:
        nexturl = ''
    cur, total = re.findall('<i>(\d+)</i>.*?<i>(\d+)</i>', ct)[0]
    if not self.name:
        # Fall back to the page-header title when the index didn't set it.
        self.name = re.findall('<b>(.*?)</b></a></span><span class="page">', ct)[0].strip()
        print(self.name)
    if cur == total:
        nexturl = ''  # last page: no "next" link to follow
    return picurl, nexturl, cur, total
def _getpics_index(self):
    """Scrape the gallery index page, then every picture page, filling
    self.piclist with direct image URLs and setting self.name.

    Performs one HTTP request per page via self.ht; network-bound.
    """
    picpagelist = []
    oripagelist = []
    resp, content = self.ht.request(self.url, headers=_headers.get())
    # Gallery title comes from the <h1> header of the index page.
    self.name = re.findall('<h1>(.+)</h1', content)[0].strip()
    # FIX: was a py2 `print self.name` statement, inconsistent with the
    # print() calls used everywhere else in this function (and a
    # SyntaxError under py3). Single-argument print() behaves the same.
    print(self.name)
    # The listing block holds one <li> per picture page.
    picarea = re.findall('div class="listnr">(.*?)</div', content, re.DOTALL)[0]
    for sec in re.findall('li>(.*?)</li', picarea, re.DOTALL):
        picpagelist.append(self.absurl(re.findall('href="(.*?)"', sec)[0]))
    print('Done gathering %d pages.' % len(picpagelist))
    # Each picture page links (class="in2") to its original-size page.
    for url in picpagelist:
        rs, ct = self.ht.request(url, headers=_headers.get())
        oripagelist.append(
            self.absurl(re.findall('href="(.+)" class="in2"', ct)[0]))
    print('Done loading %d pages.' % len(oripagelist))
    for url in oripagelist:
        # _parseori returns (picurl, nexturl, cur, total); keep the image URL.
        self.piclist.append(self._parseori(url)[0])
    print('Done loading %d image urls.' % len(self.piclist))
def _parseori(self, url):
    """Parse one original-size picture page.

    Returns a tuple (picurl, nexturl, cur, total) where picurl is the
    direct image URL, nexturl is the next page's URL or '' on the last
    page (or when no link is found), and cur/total are page-counter
    strings scraped from the <i> elements.
    """
    rs, ct = self.ht.request(url, headers=_headers.get())
    picurl = re.findall('<img id="on_img".*?src="(.*?)"', ct, re.DOTALL)[0]
    nexturl = re.findall('<a href="(.*?)"', ct)
    # BUG FIX: the original `nexturl == [] and '' or (...)` idiom fell
    # through to the `or` branch when the list WAS empty (because ''
    # is falsy), so nexturl[0] raised IndexError on exactly the case
    # it tried to guard. Use a real conditional instead.
    if nexturl:
        nexturl = 'http://photos.pcgames.com.cn/source/' + nexturl[0]
    else:
        nexturl = ''
    cur, total = re.findall('<i>(\d+)</i>.*?<i>(\d+)</i>', ct)[0]
    if not self.name:
        # Fall back to the page-header title when the index didn't set it.
        self.name = re.findall(
            '<b>(.*?)</b></a></span><span class="page">', ct)[0].strip()
        print(self.name)
    if cur == total:
        nexturl = ''  # last page: no "next" link to follow
    return picurl, nexturl, cur, total
#pageend=467#467 baseurl = "http://www.youiv.com/forum.php?mod=forumdisplay&fid=279&&filter=typeid&typeid=%s&page=%d" cat = {'289': 'IV', '290': 'U15', '293': 'MAG'} totl = {'289': 356, '290': 105, '293': 7} convms = lambda a: time.strftime('%M:%S', time.localtime(a)) for c in cat: file = 'youiv\\%s.txt' % cat[c] analyzed = open(file, 'r').read() time0 = time.time() for i in range(pagest, totl[c] + 1): print('Page %d. Time %s ETA %s'%(i,\ convms(time.time()-time0),\ convms((time.time()-time0)*1.0/(i-pagest+1)*(totl[c]+1-pagest)))) if os.path.exists('youiv\\%d.txt' % (i)): continue resp, cont = http.request(baseurl % (c, i), headers=_headers.get()) #http://www.youiv.com/thread-116976-1-1.html a = re.findall('(forum\.php\?mod=viewthread.*?)\"', cont) thread = [] for j in range(len(a) / 3): thread.append(unquote(a[3 * j])) print('Single thread parsing start.') str = '' strerr = '' strnoname = '' for pg in thread: tid = re.findall('tid=(\d+)', pg)[0] if tid in analyzed: continue alert = '' resp, cont = http.request(home + pg, headers=_headers.get())
# NOTE(review): fragment — the enclosing downloader worker method, its
# try/queue loop, and the names _f, p, fname, headers, parse_finnished
# (sic) are defined above this chunk; indentation below is reconstructed.
                self.tprint('skip %s' % _f)
                continue
            _ = self.http.request(p, headers=headers)[1]
            # Write the downloaded bytes straight to disk.
            open(fname, 'wb').write(_)
            self.tprint('done %s' % _f)
        except Empty:
            # Queue drained: exit if the producer finished, else nap briefly.
            if parse_finnished:
                self._dead = True
                self.tprint('bye~')
                break
            else:
                time.sleep(random.random() * 2)


# --- module-level driver ---
ht = httplib2.Http()
headers = _headers.get()
# The index page is GBK-encoded.
_index = ht.request(base_url, headers=headers)[1].decode('gbk')
name = re.findall('h1\>(.+)\<\/h1', _index)[0][:-2]
# Page count = <li> entries inside the paginator block minus 3 nav items.
count = len(
    re.findall('/li',
               re.findall('paginator.*?</ul>', _index, re.DOTALL)[0])) - 3
if count < 0:
    count = 1  # no paginator at all means a single page
print('%s total:%d pages' % (name, count))
if not opath.exists(opath.join(save_dir, name)):
    os.mkdir(opath.join(save_dir, name))
all_thread = []
# Spawn `thread_count` daemonized downloader threads (class defined above).
for i in range(thread_count):
    t = downloader(i)
    all_thread.append(t)
    t.setDaemon(True)
# NOTE(review): fragment — the enclosing downloader worker method, its
# try/queue loop, and the names _f, p, fname, headers, parse_finnished
# (sic) are defined above this chunk; indentation below is reconstructed.
                self.tprint('skip %s' % _f)
                continue
            _ = self.http.request(p, headers = headers)[1]
            # Write the downloaded bytes straight to disk.
            open(fname, 'wb').write(_)
            self.tprint('done %s' % _f)
        except Empty:
            # Queue drained: exit if the producer finished, else nap briefly.
            if parse_finnished:
                self._dead = True
                self.tprint('bye~')
                break
            else:
                time.sleep(random.random() * 2)


# --- module-level driver ---
ht = httplib2.Http()
headers = _headers.get()
# The index page is GBK-encoded.
_index = ht.request(base_url, headers = headers)[1].decode('gbk')
name = re.findall('h1\>(.+)\<\/h1', _index)[0][:-2]
# Page count = <li> entries inside the paginator block minus 3 nav items.
count = len(re.findall('/li',
            re.findall('paginator.*?</ul>', _index, re.DOTALL)[0])) - 3
if count < 0:
    count = 1  # no paginator at all means a single page
print('%s total:%d pages' % (name, count))
if not opath.exists(opath.join(save_dir, name)):
    os.mkdir(opath.join(save_dir, name))
all_thread = []
# Spawn `thread_count` daemonized downloader threads (class defined above).
for i in range(thread_count):
    t = downloader(i)
    all_thread.append(t)
    t.setDaemon(True)
    t.start()
#coding:gb2312 from __future__ import print_function import _headers import httplib2 import re import os from subprocess import Popen, PIPE import time import Queue import urllib import threading targdir="E:\ACG\Drama\down" metaurl="Special/p_22.Html" fixed_pref="пл╬╞803_" error_file=r'z:/%serror.log'%fixed_pref he=_headers.get() ht=httplib2.Http() q=Queue.Queue() baseurl="http://www.17ttt.com/" GET=lambda url:ht.request(baseurl+url,headers=he) ct=GET(metaurl)[1] thread_cnt=5 chapters=re.findall("Musiclist/\d+\.html",ct) title=re.findall("id=\"title\".+<p><b>(.*?)</b></p",ct,re.DOTALL)[0] has=[] if os.path.exists(r'z:/%s.txt'%title): for l in open(r'z:/%s.txt'%title,'r').readlines(): q.put(l.rstrip('\n')) else: f=open(r'z:/%s.txt'%title,'w') for ch in chapters[::-1]:
#pageend=467#467 baseurl="http://www.youiv.com/forum.php?mod=forumdisplay&fid=279&&filter=typeid&typeid=%s&page=%d" cat={'289':'IV','290':'U15','293':'MAG'} totl={'289':356,'290':105,'293':7} convms=lambda a:time.strftime('%M:%S', time.localtime(a)) for c in cat: file='youiv\\%s.txt'%cat[c] analyzed=open(file,'r').read() time0=time.time() for i in range(pagest,totl[c]+1): print('Page %d. Time %s ETA %s'%(i,\ convms(time.time()-time0),\ convms((time.time()-time0)*1.0/(i-pagest+1)*(totl[c]+1-pagest)))) if os.path.exists('youiv\\%d.txt'%(i)): continue resp,cont=http.request(baseurl%(c,i),headers=_headers.get()) #http://www.youiv.com/thread-116976-1-1.html a=re.findall('(forum\.php\?mod=viewthread.*?)\"',cont) thread=[] for j in range(len(a)/3): thread.append(unquote(a[3*j])) print('Single thread parsing start.') str='' strerr='' strnoname='' for pg in thread: tid=re.findall('tid=(\d+)',pg)[0] if tid in analyzed: continue alert='' resp,cont=http.request(home+pg,headers=_headers.get())
#coding:gbk
# Fetch every ed2k link bundle for a verycd.gdajie.com topic and copy the
# collected links to the clipboard. Python 2 script (raw_input, byte-str
# regex results decoded explicitly).
import httplib2plus as httplib2,re,urllib,sys
import _headers
import clipboard
if len(sys.argv)>1:
    topicid=sys.argv[1]
else:
    topicid=raw_input('输入topicid或者网址 >')
# Accept a full URL as well as a bare numeric id.
if topicid.startswith('http'):
    topicid=re.findall('.+/(\d+)/*',topicid.rstrip('/'))[0]
url='http://verycd.gdajie.com/topics/%s' %topicid
# BUG FIX: the original passed an undefined name `headers` here
# (NameError at runtime); every other request in the script uses
# _headers.get(), so use it for this request too.
resp, content = httplib2.Http().request(url, method='GET',headers=_headers.get())
# One red <font> entry per downloadable item on the topic page.
# (renamed from `list`/`str`, which shadowed builtins)
links=re.findall('<font color="red".*?href="(.*?)".*?</font>',content,re.DOTALL)
collected=''
for li in links:
    if not li.startswith('http://www.verycd.gdajie.com/detail.htm'):
        continue  # ignore links that are not detail pages
    content = httplib2.Http().request(li, method='GET',headers=_headers.get())[1]
    # The detail page embeds the ed2k bundle in a JS variable.
    strp=re.findall("var ed2k_links = '(.+)';",content)[0]
    collected+=(strp+'\n')
    print(urllib.unquote(strp).decode('utf-8'))
clipboard.SetClipboardText(collected)
raw_input('\n抓取完成并已复制到剪贴板XD')
# Walk the msdn.itellyou.cn catalogue (category -> product -> language)
# and write download metadata to itellyou_links.txt. Python 2 script:
# re.findall returns byte strings, hence the explicit .decode('utf-8').
import json
import httplib2
import re
import _headers
ht = httplib2.Http('itellyou')  # 'itellyou' is the on-disk HTTP cache dir
rsp, ct = ht.request('http://msdn.itellyou.cn/', headers=_headers.get())
# (group id, group name) pairs from the collapsible category headers.
lists = re.findall('\#collapse_([0-9a-z-]+)\"\>([^\<]+)\<', ct)
hpost = dict(_headers.get())
# The JSON endpoints expect AJAX-style form-encoded POSTs.
hpost.update({
    'Origin': 'http://msdn.itellyou.cn',
    'Referer': 'http://msdn.itellyou.cn/',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'})
resfile = open('itellyou_links.txt', 'w')


def p_w(s, nowrite=False):
    # Print s; also append it (UTF-8) to the results file unless nowrite.
    print(s)
    if not nowrite:
        resfile.write(('%s\n' % s).encode('utf-8'))


for (gid, name) in lists:
    p_w('[%s]' % name.decode('utf-8'))
    # Products within this category group.
    ct = ht.request('http://msdn.itellyou.cn/Category/Index',
                    method='POST', body='id=%s' % gid, headers=hpost)[1]
    res_dict = json.loads(ct)
    for dic in res_dict:
        p_w('-%s' % dic['name'], nowrite=True)
        # Available languages for this product.
        ct = ht.request('http://msdn.itellyou.cn/Category/GetLang',
                        method='POST', body='id=%s' % dic['id'], headers=hpost)[1]
        lang_ids = json.loads(ct)['result']
        for lang in lang_ids:
            # NOTE(review): chunk appears truncated here — the innermost
            # loop body presumably continues beyond this view.
            p_w('--%s' % lang['lang'], nowrite=True)