Example No. 1
def _getpics_index(self):
    picpagelist = []
    oripagelist = []
    resp, content = self.ht.request(self.url, headers=_headers.get())
    self.name = re.findall('<h1>(.+)</h1', content)[0].strip()
    print(self.name)
    # The index page lists one <li> per picture page inside div.listnr.
    picarea = re.findall('div class="listnr">(.*?)</div', content, re.DOTALL)[0]
    for sec in re.findall('li>(.*?)</li', picarea, re.DOTALL):
        picpagelist.append(self.absurl(re.findall('href="(.*?)"', sec)[0]))
    print('Done gathering %d pages.' % len(picpagelist))

    # Each picture page links to its full-size original via the a.in2 anchor.
    for url in picpagelist:
        rs, ct = self.ht.request(url, headers=_headers.get())
        oripagelist.append(self.absurl(re.findall('href="(.+)" class="in2"', ct)[0]))
    print('Done loading %d pages.' % len(oripagelist))

    # _parseori returns (picurl, nexturl, cur, total); keep only the image URL.
    for url in oripagelist:
        self.piclist.append(self._parseori(url)[0])
    print('Done loading %d image urls.' % len(self.piclist))
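
All of these examples import a local _headers helper that never appears in the listings. A minimal sketch of what its get() presumably returns (a plain dict of request headers for httplib2); the header names and values below are assumptions:

# _headers.py -- hypothetical reconstruction; the real helper is not shown
# in any of these listings. get() is assumed to return a header dict that
# httplib2 passes through on each request.
def get():
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36',
        'Accept': 'text/html,application/xhtml+xml,*/*;q=0.8',
    }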
Example No. 2
def _parseori(self, url):
    rs, ct = self.ht.request(url, headers=_headers.get())
    picurl = re.findall('<img id="on_img".*?src="(.*?)"', ct, re.DOTALL)[0]
    # The first <a href> on the page points at the next picture, if any.
    nexturl = re.findall('<a href="(.*?)"', ct)
    nexturl = ('http://photos.pcgames.com.cn/source/' + nexturl[0]) if nexturl else ''
    cur, total = re.findall(r'<i>(\d+)</i>.*?<i>(\d+)</i>', ct)[0]
    if not self.name:
        self.name = re.findall('<b>(.*?)</b></a></span><span class="page">', ct)[0].strip()
        print(self.name)
    if cur == total:   # last picture in the gallery: stop the chain
        nexturl = ''
    return picurl, nexturl, cur, total
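
_parseori returns (picurl, nexturl, cur, total) and reports the end of a gallery with an empty nexturl, so a caller can also walk a gallery picture by picture instead of going through the index. A minimal driver sketch; parser stands in for whichever scraper instance owns _parseori:

def collect_gallery(parser, first_url):
    # Chase the nexturl chain until _parseori signals the last picture.
    pics, url = [], first_url
    while url:
        picurl, url, cur, total = parser._parseori(url)
        pics.append(picurl)
    return pics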
Example No. 3
#pageend=467#467
# pagest (the start page), http (an httplib2.Http instance) and home (the
# forum root, presumably 'http://www.youiv.com/') are defined earlier in
# the script; this excerpt starts below them.
from urllib import unquote

baseurl = "http://www.youiv.com/forum.php?mod=forumdisplay&fid=279&&filter=typeid&typeid=%s&page=%d"
cat = {'289': 'IV', '290': 'U15', '293': 'MAG'}   # typeid -> category name
totl = {'289': 356, '290': 105, '293': 7}         # typeid -> page count
convms = lambda a: time.strftime('%M:%S', time.localtime(a))
for c in cat:
    catfile = 'youiv\\%s.txt' % cat[c]
    analyzed = open(catfile, 'r').read()          # tids already processed
    time0 = time.time()
    for i in range(pagest, totl[c] + 1):
        print('Page %d. Time %s ETA %s' % (i,
              convms(time.time() - time0),
              convms((time.time() - time0) / (i - pagest + 1) * (totl[c] + 1 - pagest))))
        if os.path.exists('youiv\\%d.txt' % i):
            continue
        resp, cont = http.request(baseurl % (c, i), headers=_headers.get())
        # http://www.youiv.com/thread-116976-1-1.html
        # The listing appears to repeat each thread link three times,
        # hence len(a)/3 and the stride of 3 below.
        a = re.findall(r'(forum\.php\?mod=viewthread.*?)"', cont)
        thread = []
        for j in range(len(a) / 3):   # Python 2 integer division
            thread.append(unquote(a[3 * j]))
        print('Single thread parsing start.')
        str = ''          # accumulates output; note it shadows the builtin
        strerr = ''
        strnoname = ''
        for pg in thread:
            tid = re.findall(r'tid=(\d+)', pg)[0]
            if tid in analyzed:
                continue
            alert = ''
            resp, cont = http.request(home + pg, headers=_headers.get())
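
A note on the ETA line above: it projects total runtime as elapsed time divided by pages completed, times the total page count, then formats the seconds with strftime. Because that value is a duration rather than a timestamp, time.gmtime is the safer base to format it with (time.localtime, as used above, folds the local UTC offset into the result); a minimal sketch with made-up progress numbers:

import time

# Format a duration in seconds as MM:SS; gmtime avoids the timezone
# offset that time.localtime would add to the raw seconds.
convms = lambda a: time.strftime('%M:%S', time.gmtime(a))

elapsed, done, total = 90.0, 10, 105   # hypothetical progress figures
print(convms(elapsed / done * total))  # projected full run: 15:45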
Example No. 4
                    self.tprint('skip %s' % _f)
                    continue
                _ = self.http.request(p, headers=headers)[1]   # picture bytes
                open(fname, 'wb').write(_)
                self.tprint('done %s' % _f)
            except Empty:
                # Queue drained: exit if the parser has finished feeding it,
                # otherwise back off for up to two seconds and retry.
                if parse_finnished:
                    self._dead = True
                    self.tprint('bye~')
                    break
                else:
                    time.sleep(random.random() * 2)


ht = httplib2.Http()
headers = _headers.get()
_index = ht.request(base_url, headers=headers)[1].decode('gbk')
name = re.findall('h1>(.+)</h1', _index)[0][:-2]
# Page count = paginator <li> entries minus the three non-page items.
count = len(
    re.findall('/li',
               re.findall('paginator.*?</ul>', _index, re.DOTALL)[0])) - 3
if count < 0:
    count = 1
print('%s total:%d pages' % (name, count))
if not opath.exists(opath.join(save_dir, name)):
    os.mkdir(opath.join(save_dir, name))
all_thread = []
for i in range(thread_count):
    t = downloader(i)
    all_thread.append(t)
    t.setDaemon(True)   # daemon threads die with the main thread
    t.start()
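
The worker loop at the top of this example arrives mid-method, so the class around it is missing from the listing. A hypothetical reconstruction of the downloader thread it appears to belong to; the shared queue, the parse_finnished flag, and every method body here are assumptions inferred from the loop above:

import threading
import random
import time
import Queue
import httplib2

q = Queue.Queue()          # shared queue of picture URLs (assumed)
parse_finnished = False    # set by the producer once parsing ends (assumed)

class downloader(threading.Thread):
    def __init__(self, idx):
        threading.Thread.__init__(self)
        self.idx = idx
        self.http = httplib2.Http()
        self._dead = False

    def tprint(self, msg):
        # Prefix output with the worker index so interleaved logs stay readable.
        print('[%d] %s' % (self.idx, msg))

    def run(self):
        while not self._dead:
            try:
                p = q.get_nowait()   # raises Queue.Empty when nothing is queued
                # ... fetch p and write it to disk, as in the loop shown above
            except Queue.Empty:
                if parse_finnished:
                    self._dead = True
                    self.tprint('bye~')
                else:
                    time.sleep(random.random() * 2)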
Example No. 5
#coding:gb2312
from __future__ import print_function
import _headers
import httplib2
import re
import os
from subprocess import Popen, PIPE
import time
import Queue
import urllib
import threading

targdir = r"E:\ACG\Drama\down"
metaurl = "Special/p_22.Html"
fixed_pref = "пл╬╞803_"   # this literal is mojibake in the source listing
error_file = r'z:/%serror.log' % fixed_pref
he = _headers.get()
ht = httplib2.Http()
q = Queue.Queue()           # work queue consumed by the download threads
baseurl = "http://www.17ttt.com/"
GET = lambda url: ht.request(baseurl + url, headers=he)
ct = GET(metaurl)[1]
thread_cnt = 5
chapters = re.findall(r"Musiclist/\d+\.html", ct)
title = re.findall('id="title".+<p><b>(.*?)</b></p', ct, re.DOTALL)[0]
has = []
# Resume from z:/<title>.txt if it exists; otherwise build it from scratch.
if os.path.exists(r'z:/%s.txt' % title):
    for l in open(r'z:/%s.txt' % title, 'r').readlines():
        q.put(l.rstrip('\n'))
else:
    f = open(r'z:/%s.txt' % title, 'w')
    for ch in chapters[::-1]:
Example No. 6
#coding:gbk
import httplib2plus as httplib2, re, urllib, sys
import _headers
import clipboard

if len(sys.argv) > 1:
    topicid = sys.argv[1]
else:
    topicid = raw_input('输入topicid或者网址 >')   # "enter a topicid or URL"
if topicid.startswith('http'):
    topicid = re.findall(r'.+/(\d+)/*', topicid.rstrip('/'))[0]
url = 'http://verycd.gdajie.com/topics/%s' % topicid
resp, content = httplib2.Http().request(url, method='GET', headers=_headers.get())
links = re.findall('<font color="red".*?href="(.*?)".*?</font>', content, re.DOTALL)
out = ''
for li in links:
    if not li.startswith('http://www.verycd.gdajie.com/detail.htm'):
        continue
    content = httplib2.Http().request(li, method='GET', headers=_headers.get())[1]
    strp = re.findall("var ed2k_links = '(.+)';", content)[0]
    out += strp + '\n'
    print(urllib.unquote(strp).decode('utf-8'))
clipboard.SetClipboardText(out)
raw_input('\n抓取完成并已复制到剪贴板XD')   # "done scraping, copied to the clipboard XD"
Example No. 7
import json
import httplib2
import re
import _headers

ht = httplib2.Http('itellyou')   # 'itellyou' is the local cache directory
rsp, ct = ht.request('http://msdn.itellyou.cn/', headers=_headers.get())
# Category ids and names come from the #collapse_<id> anchors on the page.
lists = re.findall(r'#collapse_([0-9a-z-]+)">([^<]+)<', ct)
hpost = dict(_headers.get())
hpost.update({
    'Origin': 'http://msdn.itellyou.cn',
    'Referer': 'http://msdn.itellyou.cn/',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'})
resfile = open('itellyou_links.txt', 'w')


def p_w(s, nowrite=False):
    # Print a line and, unless told otherwise, append it to the result file.
    print(s)
    if not nowrite:
        resfile.write(('%s\n' % s).encode('utf-8'))


for (gid, name) in lists:
    p_w('[%s]' % name.decode('utf-8'))
    ct = ht.request('http://msdn.itellyou.cn/Category/Index', method='POST',
                    body='id=%s' % gid, headers=hpost)[1]
    res_dict = json.loads(ct)
    for dic in res_dict:
        p_w('-%s' % dic['name'], nowrite=True)
        ct = ht.request('http://msdn.itellyou.cn/Category/GetLang', method='POST',
                        body='id=%s' % dic['id'], headers=hpost)[1]
        lang_ids = json.loads(ct)['result']
        for lang in lang_ids:
            p_w('--%s' % lang['lang'], nowrite=True)
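
The POST bodies above are assembled by string interpolation (body='id=%s' % gid), which works because the scraped ids are plain GUID-like tokens; urllib.urlencode is the safer way to produce the form-encoded body that the Content-Type header promises. A minimal sketch of the same request through a small helper; post_form is a name introduced here, not part of the original script:

import urllib

def post_form(ht, url, fields, headers):
    # Form-encode the fields so reserved characters survive the POST body.
    return ht.request(url, method='POST',
                      body=urllib.urlencode(fields), headers=headers)[1]

# e.g. ct = post_form(ht, 'http://msdn.itellyou.cn/Category/Index',
#                     {'id': gid}, hpost)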