def fetch(head):
    # hard-coded douban session cookie; the reader endpoint refuses anonymous requests
    coo = 'bid="yrdz0J5ispI"; __gads=ID=720d3fea3d3fb612:T=1352676703:S=ALNI_MZ72ae6zGEgpSfYlI_B0WyhBlV-zA; ll="0"; viewed="2052978_11584608"; dbcl2="4898454:5qnPL5l4FFw"; ck="3bMe"; __utma=30149280.1032140921.1356576933.1356576933.1356614007.2; __utmc=30149280; __utmz=30149280.1356576933.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=30149280.489'
    parts = head.split('/')
    ref = '/'.join(parts[:-3])
    headP = getWebpage('http://read.douban.com/reader/',
                       cookies='hst=1; ' + coo, reLoad=True, referer=ref)
    return

def fetchTieba(prefaceL, lz=True):
    # lz: only keep posts by the original poster (the "see_lz" view)
    if lz and prefaceL[-8:] != 'see_lz=1':
        prefaceL += ('&' if '?' in prefaceL else '?') + 'see_lz=1'
    prefaceP = getWebpage(prefaceL)
    prefaceS = BeautifulSoup(prefaceP.decode('gbk', 'ignore'))
    book_title = prefaceS.find('title').find(text=True)
    totalP = 1  # fall back to a single page if the "last page" link is missing
    for link in prefaceS.findAll('a'):
        if link.find(text=True) == u'尾页':  # the "last page" link carries the page count
            lastL = link['href']
            ind = lastL.rfind('=')
            totalP = int(lastL[ind + 1:])
    pageS = prefaceS
    currentP = 1
    if not '?' in prefaceL:
        prefaceL += '?'
    else:
        prefaceL += '&'
    ans = []
    while True:
        posts = pageS.findAll('div', {'class': "d_post_content"})
        for post in posts:
            ans.append('\n'.join(post.findAll(text=True)))
        currentP += 1
        if currentP > totalP:
            break
        page = getWebpage(prefaceL + 'pn=' + str(currentP))
        pageS = BeautifulSoup(page)
    g = open(book_title + '.txt', 'w')
    g.write('\n\n'.join(ans))
    g.close()

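# A minimal, self-contained sketch of the pagination arithmetic used by
# fetchTieba above: read the page count off the "last page" href, then build
# the per-page URLs with the pn parameter. The href and thread URL below are
# fabricated examples, not real tieba links.
def _demo_tieba_paging():
    lastL = '/p/2216563989?see_lz=1&pn=42'   # hypothetical "last page" href
    totalP = int(lastL[lastL.rfind('=') + 1:])  # -> 42
    prefaceL = 'http://tieba.baidu.com/p/2216563989?see_lz=1'
    prefaceL += '&' if '?' in prefaceL else '?'
    page_links = [prefaceL + 'pn=' + str(p) for p in range(2, totalP + 1)]
    print totalP, page_links[0], page_links[-1]

_demo_tieba_paging()
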
def fetchNovel(link, rLS=None):
    if rLS == None:
        rLS = link
    chapters = extractLinks(link=link, requireLinkStart=rLS,
                            avoidKeys=['img', 'alt', 'src'],
                            requireLinkEnd='.html')
    content = getWebpage(link)
    soup = BeautifulSoup(content)
    book_name = ''.join(soup.find('title').contents)
    book_name = book_name.split('_')[0]
    print 'collecting: ', book_name
    f = open(book_name + '.txt', 'w')
    f.close()
    count = 0
    for x in chapters:
        chapter = 'http://data.book.163.com' + x['href']
        #print chapter,'0'
        content = getWebpage(chapter)
        soup = BeautifulSoup(content)
        content = soup.find('div', {'class': 'bk-article-body',
                                    'id': 'bk-article-body'})
        if content == None:  # skip chapters whose body block is missing
            continue
        f = open(book_name + '.txt', 'a')
        #print chapter,'1'
        try:
            title = ''.join(x.contents).encode('GBK', 'ignore')
        except:
            title = ''
        if title != '' and (title[-1] != ')' or title[-3:] == '(1)'):
            f.write('\n\n' + title + '\n')
        #print chapter,'2'
        for y in content.findAll(text=True):
            if y.encode('GBK', 'ignore').strip() == '':
                continue
            f.write(y.encode('GBK', 'ignore') + '\n')
        f.close()
        #if count>5:break
        count += 1

def fetchNovel(link):
    content = getWebpage(link)
    soup = BeautifulSoup(content)
    list_c = soup.find('div', {'class': "book_neirong_left"})
    chapters = list_c.findAll('a')
    paras = soup.findAll('div', {'class': 'paragraph'})
    intro = soup.find('div', {'class': 'bookintro'})
    book_name = ''.join(soup.find('title').findAll(text=True)).strip()
    print 'collecting: ', book_name
    for c in chapters:
        url = c['href'][:-4]
        url = url.split('/')
        if len(url) != 5:
            continue
        # chapter text is served by a separate endpoint that takes the
        # book id and chapter id as POST data
        page_info = urllib2.build_opener()
        postData = 'c=' + url[-1] + '&b=' + url[-2]
        req = urllib2.Request('http://v.book.ifeng.com/book/remc.htm', postData)
        page = page_info.open(req)
        content = page.read()[14:-1]  # drop the fixed-length wrapper around the HTML fragment
        content = BeautifulSoup(content)
        f = open(book_name + '.txt', 'a')
        if content == None:
            continue
        for y in content.findAll(text=True):
            if y.encode('GBK', 'ignore').strip() == '':
                continue
            f.write(y.encode('GBK', 'ignore') + '\n')
        f.close()

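# Self-contained sketch of how the POST payload above is derived from a chapter
# href. The href is a fabricated example with the shape the code expects
# (five '/'-separated parts once the trailing ".htm" is cut off).
def _demo_ifeng_postdata():
    href = '/book/read/1234/56789.htm'   # hypothetical chapter link
    parts = href[:-4].split('/')         # -> ['', 'book', 'read', '1234', '56789']
    if len(parts) == 5:
        postData = 'c=' + parts[-1] + '&b=' + parts[-2]
        print postData                   # -> c=56789&b=1234

_demo_ifeng_postdata()
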
def test():
    link = 'http://read.360buy.com/14532/'
    chapters = extractLinks(link=link, requireLinkStart=link,
                            avoidKeys=['img', 'alt', 'src'],
                            requireLinkEnd='.html')
    print chapters
    content = getWebpage(link)
    soup = BeautifulSoup(content)
    paras = soup.findAll('div', {'class': 'paragraph'})
    intro = soup.find('div', {'class': 'bookintro'})
    book_name = soup.find('div', {'id': 'book-cover'}).find('a')['title']
    print 'collecting: ', book_name
    f = open(book_name + '.txt', 'w')
    f.write('intro: ')
    for y in intro.findAll(text=True):
        if y.encode('utf8', 'ignore').strip() == '':
            continue
        f.write(y.encode('utf8', 'ignore') + '\n')
    for x in paras:
        for y in x.findAll(text=True):
            f.write(y.encode('utf8', 'ignore') + '\n')
    f.close()
    start = int(chapters[0]['href'][len(link):-5])
    end = int(chapters[-1]['href'][len(link):-5]) + 20
    chapterD = {}
    for x in chapters:
        num = int(x['href'][len(link):-5])
        title = x['title']
        chapterD[num] = title
    count = 0
    for i in range(start, end):
        chapter = link + str(i) + '.html'
        content = getWebpage(chapter)
        soup = BeautifulSoup(content)
        content = soup.find('div', {'id': 'zoom'})
        f = open(book_name + '.txt', 'a')
        if i in chapterD:
            f.write('\n\n' + chapterD[i].encode('utf8', 'ignore') + '\n')
        if content == None:
            continue
        for y in content.findAll(text=True):
            if y.encode('utf8', 'ignore').strip() == '':
                continue
            f.write(y.encode('utf8', 'ignore') + '\n')
        f.close()
        #if count>5:break
        count += 1

def fetch(head):
    headP = getWebpage(head)
    soup = BeautifulSoup(headP.decode('gbk', 'ignore'))
    t = soup.find('h1')
    title = t.find(text=True)
    ans = []
    for x in soup.findAll('td', {'class': "ccss"})[90:]:
        link = x.find('a')
        if link == None:
            continue
        if not link.has_key('href'):
            continue
        link = head + link['href']
        chapter = getWebpage(link)
        chapter = BeautifulSoup(chapter)
        content = chapter.find('div', {'id': "content"})
        content = '\n'.join(map(clean, content.findAll(text=True)))
        ans.append(content)
    g = open(title + '.txt', 'w')
    g.write('\n'.join(ans))
    g.close()

def fetch(prefix, suffix='.shtml'):
    prefaceP = getWebpage(prefix + '1' + suffix)
    prefaceS = BeautifulSoup(prefaceP.decode('utf8', 'ignore'))
    book_title = prefaceS.find('title').find(text=True)
    num_page = prefaceS.find('div', {'class': "atl-pages"}).find('form')['onsubmit']
    num_page = num_page.split()[-1]
    num_page = num_page.split(',')[-1]
    num_page = num_page.split(')')[0]
    num_page = int(num_page)
    book_author = prefaceS.find('a', {'replyid': 0})['author']
    ans = []
    last_author = book_author
    for page_num in xrange(1, num_page + 1):  # include the last page
        link = prefix + str(page_num) + suffix
        page = getWebpage(link)
        soup = BeautifulSoup(page.decode('utf8', 'ignore'))
        posts = soup.findAll('div', {'class': "atl-item"})
        for post in posts:
            try:
                author = post.find('div', {'class': "atl-info"}).find(
                    'a', {'target': "_blank"})['uname']
            except:
                author = ''
            # only label a post when the author changes
            if author == last_author and author != '':
                author = ''
            else:
                last_author = author
            try:
                post = post.find('div', {'class': "bbs-content"})
            except:
                pass
            post = '\n'.join(map(clean, post.findAll(text=True)))
            if len(post) < 30:
                continue
            if author != '':
                post = u'作者:' + author + '\n' + post
            post = post.replace('\n\n', '\n')  # collapse blank lines
            post = post.replace('\n\n', '\n')
            ans.append(post)
    g = open(book_title + '.txt', 'w')
    g.write('\n\n'.join(ans))
    g.close()

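# Self-contained sketch of the page-count parsing used in fetch above: the
# number of pages is read off the last argument of the pager form's onsubmit
# handler. The onsubmit string here is a fabricated example of that shape.
def _demo_tianya_pagecount():
    onsubmit = 'return gopage(this.elements, 127)'  # hypothetical handler
    num_page = onsubmit.split()[-1]      # '127)'
    num_page = num_page.split(',')[-1]   # '127)'
    num_page = num_page.split(')')[0]    # '127'
    print int(num_page)                  # -> 127

_demo_tianya_pagecount()
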
def fetch(head):
    prehead = 'http://book.douban.com/reading/'
    headP = getWebpage(head)
    soup = BeautifulSoup(headP.decode('utf8', 'ignore'))
    t = soup.find('h1')
    title = t.find(text=True)
    soup = soup.find('div', {'id': "content"})
    ans = []
    for x in soup.findAll('a'):
        if not x.has_key('href'):
            break
        link = x['href']
        if link[:len(prehead)] != prehead:
            continue
        chapter = getWebpage(link)
        chapter = BeautifulSoup(chapter)
        content = chapter.find('div', {'class': "book-content"})
        content = '\n'.join(map(clean, content.findAll(text=True)))
        ans.append(content)
    g = open(title + '.txt', 'w')
    g.write('\n'.join(ans))
    g.close()

def fetch(head):
    headP = getWebpage(head)
    ind = head.rfind('/')
    head = head[:ind + 1]
    soup = BeautifulSoup(headP.decode('gbk', 'ignore'))
    t = soup.find('h1')
    title = t.find(text=True)
    print title
    ans = []
    soup = soup.find('div', {'class': "booklist clearfix"})
    for x in soup.findAll('a'):
        if not x.has_key('href'):
            continue
        link = head + x['href']
        chapter = getWebpage(link)
        chapter = BeautifulSoup(chapter)
        content = chapter.find('div', {'class': "bookcontent clearfix"})
        content = '\n'.join(map(clean, content.findAll(text=True)))
        ans.append(content)
    g = open(title + '.txt', 'w')
    g.write('\n'.join(ans))
    g.close()

def getTraffic(title, date, silent=True, country='en'):
    date = str(date)
    link = 'http://stats.grok.se/json/' + country + '/' + date + '/' + title
    if silent == False:
        print link
    page = getWebpage(link)
    jsondata = json.loads(page)
    daily_views = jsondata['daily_views']
    tot = 0
    days = 0
    for day, views in daily_views.iteritems():
        tot += views
        days += 1
    return tot * 1.0 / days

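# Self-contained sketch of the averaging done by getTraffic, using a hard-coded
# stand-in for the 'daily_views' dict that the stats.grok.se JSON contains.
def _demo_average_views():
    daily_views = {'2012-01-01': 120, '2012-01-02': 90, '2012-01-03': 150}
    tot = 0
    days = 0
    for day, views in daily_views.iteritems():
        tot += views
        days += 1
    print tot * 1.0 / days   # -> 120.0

_demo_average_views()
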
def fetch(head):
    headP = getWebpage(head + 'firstchapter')
    soup = BeautifulSoup(headP)
    for f in soup.findAll('input'):
        if not f.has_key('value'):
            continue
        if len(f['value']) < 100:
            continue
        break
    ids = re.findall(r"'id\d*'", f['value'])
    ids = map(toId, ids)
    ans = []
    for id in ids:
        try:
            page = head + 'id' + id
            page = getWebpage(page, timeSleep=10)
            soup = BeautifulSoup(page)
            content = soup.find('div', {'class': "htmlcontent"})
            content = ''.join(map(clean, content.findAll(text=True)))
            ans.append(content)
        except:
            print head + 'id' + id + ' failed'
    g = open('download.txt', 'w')
    g.write('\n'.join(ans))
    g.close()

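# Self-contained sketch of the chapter-id extraction in fetch above. toId is
# not defined in this snippet, so a hypothetical version that strips the quotes
# and the leading 'id' stands in for it here; the input string is fabricated.
import re

def _toId_guess(s):
    return s.strip("'")[2:]   # "'id123'" -> '123' (assumption about toId)

def _demo_extract_ids():
    value = "loadChapters(['id101','id102','id103'])"
    ids = re.findall(r"'id\d*'", value)
    ids = map(_toId_guess, ids)
    print ids   # -> ['101', '102', '103']

_demo_extract_ids()
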
def fetchInfo(homeUrl, type=None):
    homeSoup = getSoup(homeUrl)
    pageSoup = homeSoup
    info = []
    count = homeSoup.find('span', {'class': 'count'})
    if count:
        count = count.find(text=True)[2:-2]
        count = int(count)
    else:
        count = N  # only one page
    ind = len(homeSoup.findAll('h1')) - 1
    if ind > 1:
        ind = 1
    album_name = homeSoup.findAll('h1')[ind].find(text=True)
    if '-' in album_name:
        album_name = album_name.split('-')[1]
    album_name = album_name.replace("*", '')
    album_name = album_name.replace("/", '')
    album_name = album_name.split()[0]
    start = 0
    while True:
        photos = pageSoup.findAll('div', {'class': 'photo_wrap'})
        if len(photos) > N:
            print 'warning on photo number!'
        for photo in photos:
            aTag = photo.find('a', {'class': "photolst_photo"})
            if not aTag:
                continue
            name = aTag['title']
            url = photo.find('img')['src']
            url = url.replace('thumb', 'large')
            info.append((name, url))
        start += N
        if start > count:
            break
        page = getWebpage(homeUrl + '?start=' + str(start))
        pageSoup = BeautifulSoup(page)
    photos = homeSoup.findAll('span', {'class': "img"})
    if not photos:
        photos = homeSoup.findAll('a', {'class': "pic"})
    for photo in photos:
        img = photo.find('img')
        if not img:
            continue
        if not img.has_key('alt'):
            continue
        name = img['alt']
        if img.has_key('data-src'):
            url = img['data-src']
        else:
            url = img['src']
        url = url.replace('head', 'original')
        info.append((url, name))
    return (album_name, info)

def forum_crawl(link, outFile):
    createFile(outFile, force=True)
    p = 1
    lastpage = ''
    while True:
        page = getWebpage(link + str(p), timeSleep=0)
        if not page or page == lastpage:
            break
        lastpage = page
        soup = BeautifulSoup(page.decode('gb2312', 'ignore'))
        fields = soup.findAll('div', {'id': "content"})
        for field in fields:
            for line in field.findAll(text=True):
                if len(line.strip()) > 1:
                    out = open(outFile, 'a')
                    out.write(line)
                    out.close()
        p += 1

def extractLinks(link='', requireLinkStart='', requiredKeys=[],
                 specification=[], containedIn=None, numTable_level_1=None,
                 numTable_level_2=None, avoidKeys=[], coo='', requireLinkEnd=''):
    ans = []
    content = getWebpage(link, cookies=coo)
    soup = BeautifulSoup(content)
    if containedIn != None:
        tables = soup.findAll(containedIn[0], containedIn[1])
    else:
        tables = [soup]
    for table in tables[:numTable_level_1]:
        for field in table.findAll('a')[:numTable_level_2]:
            if field.has_key('href'):
                extLink = field['href']
                satisfySpec = True
                for subfield, require in specification:
                    if not field.has_key(subfield):
                        satisfySpec = False
                        break
                    if not require in field[subfield]:
                        satisfySpec = False
                        break
                for requiredKey in requiredKeys:
                    if not field.has_key(requiredKey):
                        satisfySpec = False
                        break
                for avoidKey in avoidKeys:
                    if isinstance(avoidKey, str):
                        if field.has_key(avoidKey):
                            satisfySpec = False
                            break
                    else:
                        akey, akeyValue = avoidKey
                        if field.has_key(akey) and field[akey] == [akeyValue]:
                            satisfySpec = False
                            break
                if satisfySpec == False:
                    continue
                if extLink[:len(requireLinkStart)] == requireLinkStart and \
                   (requireLinkEnd == '' or extLink[-len(requireLinkEnd):] == requireLinkEnd):
                    ans.append(field)
    return ans

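# Self-contained sketch of the link filter at the heart of extractLinks: keep
# only hrefs that start with requireLinkStart and end with requireLinkEnd.
# The URLs are fabricated examples.
def _demo_link_filter():
    requireLinkStart = 'http://data.book.163.com/book/'
    requireLinkEnd = '.html'
    hrefs = ['http://data.book.163.com/book/ch1.html',
             'http://data.book.163.com/img/cover.jpg',
             'http://other.site/book/ch2.html']
    kept = [h for h in hrefs
            if h[:len(requireLinkStart)] == requireLinkStart
            and (requireLinkEnd == '' or h[-len(requireLinkEnd):] == requireLinkEnd)]
    print kept   # -> ['http://data.book.163.com/book/ch1.html']

_demo_link_filter()
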
def fetchInfo(homeUrl, type=None):
    if 'C:\\' in homeUrl:  # a local Windows file path was passed instead of a URL
        f = open(homeUrl)
        homePage = f.read()
        f.close()
    else:
        homePage = getWebpage(homeUrl)
    homeSoup = BeautifulSoup(homePage)
    pageSoup = homeSoup
    info = []
    if type == 'xinmin':
        album_name = pageSoup.find('title').find(text=True)
        homeUrl = homeUrl.replace('.html', '_@@@@.html')
        link = ''  # last image URL seen; guards a caption that appears before any image
        for x in range(2, 100):
            article = pageSoup.find('div', {'class': 'article_info'})
            if not article:
                break
            paragraphs = article.findAll('p')
            if not paragraphs:
                break
            for paragraph in paragraphs:
                img = paragraph.find('img')
                if not img:
                    name = paragraph.find(text=True)
                    info.append((name, link))
                else:
                    link = img['src']
            pageUrl = homeUrl.replace('@@@@', str(x))
            page = getWebpage(pageUrl, retry_num=1)
            if not page:
                break
            pageSoup = BeautifulSoup(page)
        return (album_name, info)
    if type == 'douban':
        count = homeSoup.find('span', {'class': 'count'})
        if count:
            count = count.find(text=True)[2:-2]
            count = int(count)
        else:
            count = N  # only one page
    ind = len(homeSoup.findAll('h1')) - 1
    if ind > 1:
        ind = 1
    if type == 'douban' or type == 'renren':
        album_name = homeSoup.findAll('h1')[ind].find(text=True)
    else:
        album_name = homeSoup.find('title').find(text=True)
    if '-' in album_name:
        if type == 'douban' or type == 'renren':
            album_name = album_name.split('-')[1]
        else:
            album_name = album_name.split('-')[0]
    album_name = album_name.split()[0]
    album_name = album_name.replace("*", '')
    if album_name == '("▔□▔)/':
        album_name = 'smile'
    start = 0
    if type == 'douban':
        while True:
            photos = pageSoup.findAll('div', {'class': 'photo_wrap'})
            if len(photos) > N:
                print 'warning on photo number!'
            for photo in photos:
                aTag = photo.find('a', {'class': "photolst_photo"})
                if not aTag:
                    continue
                name = aTag['title']
                url = photo.find('img')['src']
                url = url.replace('thumb', 'large')
                info.append((name, url))
            start += N
            if start > count:
                break
            page = getWebpage(homeUrl + '?start=' + str(start))
            pageSoup = BeautifulSoup(page)
    photos = homeSoup.findAll('span', {'class': "img"})
    if not photos:
        photos = homeSoup.findAll('a', {'class': "pic"})
    for photo in photos:
        img = photo.find('img')
        if not img:
            continue
        if not img.has_key('alt'):
            continue
        name = img['alt']
        if img.has_key('data-src'):
            url = img['data-src']
        else:
            url = img['src']
        url = url.replace('head', 'original')
        info.append((url, name))
    return (album_name, info)

def getSoup(url, encode='utf8', coo=''):
    page = getWebpage(url, cookies=coo)
    return BeautifulSoup(page.decode(encode, 'ignore'))

def grabQuestions(link=None, category_number=None, pn=0, reLoad=True):
    questions_to_return = []
    if link == None and category_number == None:
        raise Exception('no input')
    if category_number != None:
        newLink = header + clean(str(category_number))
        if pn != 0:
            newLink += '?pn=' + str(pn)
    if link != None and category_number != None and newLink != link:
        raise Exception('conflict input')
    if link == None:
        link = newLink
    page = getWebpage(link, reLoad=reLoad, dataDir='webpages')
    page = page.decode('gb2312', 'ignore')
    soup = BeautifulSoup(page, from_encoding="gb2312")
    questions = soup.findAll('tr', {'class': "qlist-tr"})
    for question in questions:
        number = question.find('td', {'class': 'quick-num'})
        number_ans = int(str(number.contents[0]))
        question = question.find('td', {'class': 'align-l'})
        cid = question['cid']
        qid = question['qid']
        qdesc = question['qdesc']
        qtitle = question['qtitle']
        if not cid.isdigit():
            print 'cid is not digit'
            print cid, qid, qdesc, qtitle
            continue
        else:
            cid = int(cid)
        if not qid.isdigit():
            print 'qid is not digit'
            print cid, qid, qdesc, qtitle
            continue
        else:
            qid = int(qid)
        qdesc = clean_sentence(qdesc)
        qtitle = clean_sentence(qtitle)
        #if qid!=499587072: continue
        content = qtitle + '\n' + qdesc
        content_no_space = no_space(content)
        email = re.search('[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]+', content_no_space)
        if email != None:
            # a bare QQ number implies a qq.com address
            email = email.group(0) + '@qq.com'
        if email == None:
            # no QQ number found: fall back to an explicit e-mail address
            email = re.search('\w+@\w+\.(com|cn)', content_no_space)
            if email != None:
                email = email.group(0)
        if email == None:
            email = ''
        s = u'《' + '.*' + u'》'
        s2 = '<<.*>>'
##        s=s.encode('gb2312','ignore')
##        print s
        title_separate = separate(qtitle)
        book_name = re.search(s, content_no_space)
        if book_name != None:
            book_name = book_name.group(0)[1:-1]
        if book_name == None:
            book_name = re.search(s2, content_no_space)
            if book_name != None:
                book_name = book_name.group(0)[2:-2]
            else:
                for i in xrange(len(title_separate)):
                    x = title_separate[i]
                    if u'求' in x:
                        ind = x.find(u'求')
                        if len(x) - ind >= 3:
                            book_name = x[ind + 1:]
                        elif i + 1 < len(title_separate):
                            book_name = title_separate[i + 1]
                        break
                    if u'有' in x:
                        ind = x.find(u'有')
                        if len(x) - ind >= 3:
                            book_name = x[ind + 1:]
                        elif i + 1 < len(title_separate):
                            book_name = title_separate[i + 1]
                        break
                    if 'txt' in x:
                        ind = x.find('txt')
                        if ind > 3:
                            book_name = x[:ind]
                        break
                    if 'TXT' in x:
                        ind = x.find('TXT')
                        if ind > 3:
                            book_name = x[:ind]
                        break
        for x in EndSign:
            if book_name == None:
                break
            if x in book_name:
                ind = book_name.find(x)
                book_name = book_name[:ind]
        for x in BeginSign:
            if book_name == None:
                break
            if x in book_name:
                ind = book_name.find(x)
                book_name = book_name[ind + len(x):]
        if book_name != None and '@' in book_name:
            book_name = None
        if book_name == None or book_name == '':
            book_name = title_separate[0]
        #print qid,int(number_ans),'[',qdesc,',',qtitle,']',email
        #printlist(title_separate)
        #print book_name
        #print '-'*30
        google_query = book_name
        if google_query in title_separate[0]:
            google_query = title_separate[0]
        questions_to_return.append((qid, int(number_ans), email, book_name,
                                    google_query, content_no_space))
    return questions_to_return

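# Self-contained sketch of the e-mail and book-title extraction used by
# grabQuestions, run on a fabricated question string (the ASCII '<<...>>'
# title form is used here so the example needs no source-encoding header).
import re

def _demo_extract_email_and_title():
    content_no_space = 'please send <<SomeNovel>> txt to 12345678'
    qq = re.search('[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]+', content_no_space)
    email = qq.group(0) + '@qq.com' if qq else ''
    m = re.search('<<.*>>', content_no_space)
    book_name = m.group(0)[2:-2] if m else ''
    print email       # -> 12345678@qq.com
    print book_name   # -> SomeNovel

_demo_extract_email_and_title()
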
'''
This program needs a wait time between requests; otherwise it may cause
problems with your renren id.
'''
from getWebpage import getWebpage
import re
import json, time
from sysPath import createFile

coo = 'anonymid=h9489u7u-yp0fqs; _r01_=1; mop_uniq_ckid=10.7.18.77_1355594994_642928755; _de=3F3126DBF672F298F26CBA88523C3AB26DEBB8C2103DE356; __utma=151146938.1762808405.1361533510.1361533510.1361533510.1; __utmz=151146938.1361533510.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); l4pager=0; depovince=GW; jebecookies=abb5a061-adf7-4276-9913-0059ed1553e6|||||; p=c506abb8c6dd441921166c4464e116341; ap=269496411; t=351ac721dd34d54a08268e46db838a211; societyguester=351ac721dd34d54a08268e46db838a211; id=269496411; xnsid=cacc7bc0; XNESSESSIONID=376bb17a6b26; at=1; loginfrom=null'
headpage = getWebpage(link='http://friend.renren.com/myfriendlistx.do',
                      cookies=coo)
r = re.search('var friends=(\[.*\]);', headpage)
friendList = r.group(1)
jf = json.loads(friendList)
ids = []
for f in jf:
    ids.append(f['id'])
createFile('infos.txt', force=True)
g = open('infos.txt', 'a')
g.write('Name,Given Name,Additional Name,Family Name,Yomi Name,Given Name Yomi,Additional Name Yomi,Family Name Yomi,Name Prefix,Name Suffix,Initials,Nickname,Short Name,Maiden Name,Birthday,Gender,Location,Billing Information,Directory Server,Mileage,Occupation,Hobby,Sensitivity,Priority,Subject,Notes,Group Membership,E-mail 1 - Type,E-mail 1 - Value,E-mail 2 - Type,E-mail 2 - Value,Phone 1 - Type,Phone 1 - Value' + '\n')
g.close()
count = 0
for id in ids[:]:
    timeSleep = 1
    count += 1
    #print count
    # the original call is truncated at this point; it is closed here with the
    # timeSleep set above (an assumption about the remaining arguments)
    mainInfo = getWebpage('http://www.renren.com/' + str(id) +
                          '/profile?v=info_ajax&undefined',
                          cookies=coo, timeSleep=timeSleep)

'''
This program needs a wait time between requests; otherwise it may cause
problems with your renren id.
'''
from getWebpage import getWebpage
try:
    from BeautifulSoup import BeautifulSoup, SoupStrainer
except:
    from bs4 import BeautifulSoup, SoupStrainer  # beta version of bs
import re
import json, time
from sysPath import createFile
import sys

coo = '''anonymid=h9489u7u-yp0fqs; _r01_=1; mop_uniq_ckid=10.7.18.77_1355594994_642928755; __utma=10481322.145044192.1363634540.1363634540.1363636668.2; __utmz=10481322.1363636668.2.2.utmcsr=renren.com|utmccn=(referral)|utmcmd=referral|utmcct=/269496411; _de=3F3126DBF672F298F26CBA88523C3AB26DEBB8C2103DE356; depovince=GW; bt_new=12; jebecookies=63880745-b57f-4dce-b75e-7cc2218be89a|||||; p=9babffa88c9c71f7219d11a49178460d1; ap=269496411; t=fa5d5d911dc472ebde86481e5486062e1; societyguester=fa5d5d911dc472ebde86481e5486062e1; id=269496411; xnsid=6ef4dee; loginfrom=null; feedType=269496411_hot; JSESSIONID=abcMqcp8dHsTAh3nle53t; l4pager=0'''
headpage = getWebpage(link='http://friend.renren.com/myfriendlistx.do',
                      cookies=coo)
r = re.search('var friends=(\[.*\]);', headpage)
friendList = r.group(1)
jf = json.loads(friendList)
ids = []
for f in jf:
    ids.append(f['id'])
if len(sys.argv) >= 2:
    start_num = int(sys.argv[1])
else:
    start_num = 0
timeSleep = 0.8
for id in ids[start_num:start_num + 100]:
    # the original call is truncated at this point; it is closed here with the
    # cookie and the timeSleep set above (an assumption about the remaining arguments)
    page = getWebpage('http://www.renren.com/' + str(id) + '/profile',
                      cookies=coo, timeSleep=timeSleep)

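# Self-contained sketch of how the friend list is pulled out of the friend-list
# page above: the page embeds a JavaScript "var friends=[...];" assignment whose
# value is valid JSON. The page text below is a fabricated stand-in.
import re
import json

def _demo_friend_list():
    headpage = 'var x=1; var friends=[{"id": 1001, "name": "a"}, {"id": 1002, "name": "b"}]; var y=2;'
    r = re.search('var friends=(\[.*\]);', headpage)
    jf = json.loads(r.group(1))
    ids = [f['id'] for f in jf]
    print ids   # -> [1001, 1002]

_demo_friend_list()
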
def fetch(head, nid):
    head += '-' + str(nid)
    headP = getWebpage(head + '.html')
    soup = BeautifulSoup(headP.decode('utf8', 'ignore'))
    n = soup.find('div', {'class': "pageNum1"})
    n = n.contents[0]
    r = re.match(u'共' + '(\d*)' + u'页', n)
    n = r.group(1)
    try:
        n = int(n)
    except:
        print 'failed to find the number of page'
        n = 1000
    ans = []
    for i in range(1, n + 1):
        page = head + '-' + str(i) + '.html'
        page = getWebpage(page)
        soup = BeautifulSoup(page)
        posts = soup.findAll('li', {'class': 'at c h2'})
        for post in posts:
            post = '\n'.join(map(clean, post.findAll(text=True)))
            if len(post) < 10:
                continue
            if reply(post):
                continue
            ans.append(post)
    g = open(str(nid) + '.txt', 'w')
    g.write('\n'.join(ans))
    g.close()
    return

def genTable(filename='../../testData/testingMonuments.txt',
             outfname='../../testData/testingMonumentsData_week4_all.csv',
             months=None, yearBegin=2009, yearEnd=2015, silent=True,
             endLine=None, testNow=False, country='en'):
    now = datetime.datetime.now()
    now = (int(now.year), int(now.month))
    if months == None:
        months = []
        for year in range(yearBegin, yearEnd):
            for month in range(1, 13):
                if (year, month) >= now:
                    break
                months.append(str(year) + '0' * (2 - len(str(month))) + str(month))
    months = map(str, months)
    filename = sysPath(filename)
    f = open(filename, 'r')
    links = f.read().splitlines()
    f.close()
    #soup=BeautifulSoup(links)
    titleLine = ['linkTitle']
    for month in months:
        titleLine.append('Img' + month)
        titleLine.append('Content' + month)
        titleLine.append('Traffic' + month)
    if not os.path.exists(outfname):
        outf = open(outfname, 'w')
        outf.write('\t'.join(titleLine) + '\n')
        start = 0
        outf.close()
    else:
        outf = open(outfname, 'r')
        start = len(outf.read().splitlines())
        outf.close()
    count = 0
##    for field in soup.findAll('a')[:endLine]:
    for linkTitle in links:
        index = linkTitle.find('/wiki/')
        if index != -1:
            linkTitle = linkTitle[index + 6:]
        count += 1
        if count < start:
            continue
##        if not field.has_key('title'): continue
##        linkTitle=field['href'][6:]
##        officialTitle=field['title']
        curLine = [linkTitle]
        for month in months:
            date = month + '01'
            revId = getRevId(linkTitle, date + '000000',  # 6 zeros for h,m,s
                             silent=silent, country=country)
            if not silent:
                print 'revId=', revId
            if revId == None:
                curLine += ['', '', '']
                continue
            link = 'http://' + country + '.wikipedia.org/w/index.php?oldid=' + revId
            if testNow:
                print 'title=', linkTitle, 'link=', link, 'month=', month
            if not silent:
                print 'prepare'
            page = getWebpage(link, timeSleep=0.5, silent=silent)
            if not silent:
                print 'got page'
            soup = BeautifulSoup(page)
            if not silent:
                print 'got soup'
            numImg = numImage(soup)
            if not silent:
                print 'got num'
            conLen = contentLen(soup)
            if not silent:
                print 'got len'
            traffic = str(getTraffic(linkTitle, month, silent=silent, country=country))
            if not silent:
                print 'got history'
            curLine += [numImg, conLen, traffic]
        curLine = map(str, curLine)
        outf = open(outfname, 'a')
        outf.write('\t'.join(curLine) + '\n')
        outf.close()

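# Self-contained sketch of the YYYYMM month-list construction used by genTable,
# with a fixed "now" so the output is deterministic.
def _demo_month_list(yearBegin=2009, yearEnd=2011, now=(2010, 3)):
    months = []
    for year in range(yearBegin, yearEnd):
        for month in range(1, 13):
            if (year, month) >= now:
                break
            months.append(str(year) + '0' * (2 - len(str(month))) + str(month))
    return months

print _demo_month_list()   # -> ['200901', ..., '200912', '201001', '201002']
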
# assumed imports, following the other scripts in this collection
import json
from getWebpage import getWebpage
from sysPath import sysPath, createFile
from bs4 import BeautifulSoup, SoupStrainer  # beta version of bs

coo = 'datr=1HSWUNG14Cr81JphyUZWTl2i; lu=gAff9sJJ2_wuev5W3zxFsGZA; sub=128; p=49; c_user=1216615221; csm=2; fr=0regP7HiBNucJQa1n.AWVfvGNhos7mlakT0e52olU2aWo.BQlnT_.nT.AWVtovRV; s=Aa7LrP8dIAOi4SoX; xs=3%3ArXa_AglvHBTByg%3A2%3A1352037631; act=1356128659553%2F6%3A2; presence=EM356128936EuserFA21216615221A2EstateFDsb2F0Et2F_5b_5dElm2FnullEuct2F135610056B0EtrFA2loadA2EtwF1698182903EatF1356128697024G356128936322CEchFDp_5f1216615221F8CC; wd=1280x299'
f = open(sysPath('webpages/ids.txt'))
jf = json.loads(f.read().decode('utf8', 'ignore'))
f.close()
createFile('infos_fb.txt', force=True)
g = open('infos_fb.txt', 'a')
g.write('Name,Given Name,Additional Name,Family Name,Yomi Name,Given Name Yomi,Additional Name Yomi,Family Name Yomi,Name Prefix,Name Suffix,Initials,Nickname,Short Name,Maiden Name,Birthday,Gender,Location,Billing Information,Directory Server,Mileage,Occupation,Hobby,Sensitivity,Priority,Subject,Notes,Group Membership,E-mail 1 - Type,E-mail 1 - Value,E-mail 2 - Type,E-mail 2 - Value,Phone 1 - Type,Phone 1 - Value' + '\n')
g.close()
ans = []
for f in jf['data']:
    info = getWebpage('http://www.facebook.com/' + str(f['id']),
                      cookies=coo, info=str(f['id']))
    bI = BeautifulSoup(info)
    link = bI.find('link', {'rel': 'alternate'})
    '''
    info=getWebpage(link['href']+'/info', cookies=coo, info=str(f['id']) )
    '''
    ind = link['href'].rfind('/')
    email = link['href'][ind + 1:]
    ans.append((f['name'], f['id'], email + '@facebook.com'))
    name = f['name']
    id = f['id']
    email = email + '@facebook.com'

def fetch(head, nid): head += "-" + str(nid) headP = getWebpage(head + ".html") soup = BeautifulSoup(headP.decode("utf8", "ignore")) n = soup.find("div", {"class": "pageNum1"}) n = n.contents[0] r = re.match(u"共" + "(\d*)" + u"页", n) n = r.group(1) try: n = int(n) except: print "failed to find the number of page" n = 1000 ans = [] for i in range(1, n + 1): page = head + "-" + str(i) + ".html" page = getWebpage(page) soup = BeautifulSoup(page) posts = soup.findAll("li", {"class": "at c h2"}) for post in posts: post = "\n".join(map(clean, post.findAll(text=True))) if len(post) < 10: continue if reply(post): continue ans.append(post) g = open(str(nid) + ".txt", "w") g.write("\n".join(ans)) g.close() return return chapters = extractLinks(link=link, requireLinkStart=link, avoidKeys=["img", "alt", "src"], requireLinkEnd=".html") print chapters content = getWebpage(link) soup = BeautifulSoup(content) paras = soup.findAll("div", {"class": "paragraph"}) intro = soup.find("div", {"class": "bookintro"}) book_name = soup.find("div", {"id": "book-cover"}).find("a")["title"] print "collecting: ", book_name f = open(book_name + ".txt", "w") f.write("intro: ") for y in intro.findAll(text=True): if y.encode("utf8", "ignore").strip() == "": continue f.write(y.encode("utf8", "ignore") + "\n") for x in paras: for y in x.findAll(text=True): f.write(y.encode("utf8", "ignore") + "\n") f.close() start = int(chapters[0]["href"][len(link) : -5]) end = int(chapters[-1]["href"][len(link) : -5]) + 20 chapterD = {} for x in chapters: num = int(x["href"][len(link) : -5]) title = x["title"] chapterD[num] = title count = 0 for i in range(start, end): chapter = link + str(i) + ".html" content = getWebpage(chapter) soup = BeautifulSoup(content) content = soup.find("div", {"id": "zoom"}) f = open(book_name + ".txt", "a") if i in chapterD: f.write("\n\n" + chapterD[i].encode("utf8", "ignore") + "\n") if content == None: continue for y in content.findAll(text=True): if y.encode("utf8", "ignore").strip() == "": continue f.write(y.encode("utf8", "ignore") + "\n") f.close() # if count>5:break count += 1
# NOTE: tail of a getDetails(detail) helper; the opening of the function is not
# included in this snippet.
        if name == 'servingsPerContainer':
            name = 'Servings Per Container'
        if name == 'servingSize':
            name = 'Serving Size'
        value = detail[x]
        if name[-1] == ':':
            name = name[:-1]
        if len(name) < 3:
            continue
        new_detail[name] = value
    for x in detail:
        name = x
        value = detail[x]
        if len(name.split()) > 1 and name[-1] in map(str, range(10)) + ['%', 'g']:
            value = clean(name.split()[-1])
            name = clean(' '.join(name.split()[:-1]))
        tmpname = None
        if name.endswith('Calories from Fat'):
            tmpname = name
            name = 'Calories from Fat'
        if not name in new_detail:
            new_detail[name] = value
        if tmpname != None:
            tmpname = tmpname.split()
            if tmpname[0] == 'Calories' and len(tmpname) > 1:
                value = tmpname[1]
                new_detail['Calories'] = value
    return new_detail


page = getWebpage('http://www.peapod.com/itemDetailView.jhtml?productId=155508',
                  dataDir='../../data/detail_pages')
getDetails(page)

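# Self-contained sketch of the name/value split used above for nutrition rows
# such as "Total Fat 10g", where the amount is glued onto the label. clean()
# is not shown in this snippet, so plain strip() stands in for it here.
def _demo_split_label(raw='Total Fat 10g'):
    name, value = raw, ''
    if len(name.split()) > 1 and name[-1] in map(str, range(10)) + ['%', 'g']:
        value = name.split()[-1].strip()
        name = ' '.join(name.split()[:-1]).strip()
    return name, value

print _demo_split_label()              # -> ('Total Fat', '10g')
print _demo_split_label('Sodium 5%')   # -> ('Sodium', '5%')
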