Example #1
def fetch(head):
    coo = 'bid="yrdz0J5ispI"; __gads=ID=720d3fea3d3fb612:T=1352676703:S=ALNI_MZ72ae6zGEgpSfYlI_B0WyhBlV-zA; ll="0"; viewed="2052978_11584608"; dbcl2="4898454:5qnPL5l4FFw"; ck="3bMe"; __utma=30149280.1032140921.1356576933.1356576933.1356614007.2; __utmc=30149280; __utmz=30149280.1356576933.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=30149280.489'
    parts = head.split('/')
    ref = '/'.join(parts[:-3])
    headP = getWebpage('http://read.douban.com/reader/',
                       cookies='hst=1; ' + coo,
                       reLoad=True,
                       referer=ref)
    return  # note: this early return makes everything below unreachable; 'prehead' used further down is also undefined in this snippet
    soup = BeautifulSoup(headP.decode('utf8', 'ignore'))
    t = soup.find('h1')
    title = t.find(text=True)
    soup = soup.find('div', {'id': "content"})
    ans = []
    for x in soup.findAll('a'):
        if not x.has_key('href'): break
        link = x['href']
        if link[:len(prehead)] != prehead: continue
        chapter = getWebpage(link)
        chapter = BeautifulSoup(chapter)
        content = chapter.find('div', {'class': "book-content"})
        content = '\n'.join(map(clean, content.findAll(text=True)))
        ans.append(content)
    g = open(title + '.txt', 'w')
    g.write('\n'.join(ans))
    g.close()
Example #2
def fetchTieba(prefaceL, lz=True):  #lz, only see the top floor poster
    if lz and prefaceL[-8:] != 'see_lz=1': prefaceL += '?see_lz=1'
    prefaceP = getWebpage(prefaceL)
    prefaceS = BeautifulSoup(prefaceP.decode('gbk', 'ignore'))
    book_title = prefaceS.find('title').find(text=True)
    totalP = 1  # fall back to one page if no '尾页' ("last page") link is found
    for link in prefaceS.findAll('a'):
        if link.find(text=True) == '尾页':
            lastL = link['href']
            ind = lastL.rfind('=')
            totalP = int(lastL[ind + 1:])
    pageS = prefaceS
    currentP = 1
    if not '?' in prefaceL:
        prefaceL += '?'
    else:
        prefaceL += '&'
    ans = []
    while True:
        posts = pageS.findAll('div', {'class': "d_post_content"})
        for post in posts:
            ans.append('\n'.join(post.findAll(text=True)))
        currentP += 1
        if currentP > totalP: break
        page = getWebpage(prefaceL + 'pn=' + str(currentP))
        pageS = BeautifulSoup(page)
    g = open(book_title + '.txt', 'w')
    g.write('\n\n'.join(ans))
    g.close()
Example #3
def fetchNovel(link, rLS=None):
    if rLS==None: rLS=link
    chapters=extractLinks(link=link, requireLinkStart=rLS,avoidKeys=['img','alt','src'],\
                          requireLinkEnd='.html')
    content=getWebpage(link)
    soup=BeautifulSoup(content)
    book_name=''.join(soup.find('title').contents)
    book_name=book_name.split('_')[0]
    print 'collecting: ',book_name
    f=open(book_name+'.txt','w')
    f.close()
    count=0
    for x in chapters:
        chapter='http://data.book.163.com'+x['href']
        #print chapter,'0'
        content=getWebpage(chapter)
        soup=BeautifulSoup(content)
        content=soup.find('div',{'class':'bk-article-body','id':'bk-article-body'})
        if content==None: continue  # skip chapters whose body block is missing
        f=open(book_name+'.txt','a')
        #print chapter,'1'
        try:
            title=''.join(x.contents).encode('GBK','ignore')
        except:
            title=''
        if title!='' and (title[-1]!=')' or title[-3:]=='(1)'):f.write('\n\n'+title+'\n')
        #print chapter,'2'
        for y in content.findAll(text=True):
            if y.encode('GBK','ignore').strip()=='': continue
            f.write(y.encode('GBK','ignore')+'\n')
        f.close()
        #if count>5:break
        count+=1
Example #4
def fetchTieba(prefaceL,lz=True):#lz, only see the top floor poster
    if lz and prefaceL[-8:]!='see_lz=1': prefaceL+='?see_lz=1'    
    prefaceP=getWebpage(prefaceL)
    prefaceS=BeautifulSoup(prefaceP.decode('gbk','ignore'))
    book_title=prefaceS.find('title').find(text=True)
    totalP=1  # fall back to one page if no '尾页' ("last page") link is found
    for link in prefaceS.findAll('a'):
        if link.find(text=True)=='尾页':
            lastL=link['href']
            ind=lastL.rfind('=')
            totalP=int(lastL[ind+1:])
    pageS=prefaceS
    currentP=1
    if not '?' in prefaceL: 
        prefaceL+='?'
    else:
        prefaceL+='&'
    ans=[]
    while True:
        posts=pageS.findAll('div',{'class':"d_post_content"})
        for post in posts:
            ans.append('\n'.join(post.findAll(text=True)))
        currentP+=1
        if currentP>totalP: break
        page=getWebpage(prefaceL+'pn='+str(currentP))
        pageS=BeautifulSoup(page) 
    g=open(book_title+'.txt','w')
    g.write('\n\n'.join(ans))
    g.close()
Example #5
def fetchNovel(link):
    content=getWebpage(link)
    soup=BeautifulSoup(content)
    list_c=soup.find('div',{'class':"book_neirong_left"})
    chapters=list_c.findAll('a')
    paras=soup.findAll('div',{'class':'paragraph'})
    intro=soup.find('div',{'class':'bookintro'})
    book_name=''.join(soup.find('title').findAll(text=True)).strip()
    print 'collecting: ',book_name
    for c in chapters:
        url=c['href'][:-4]
        url=url.split('/')
        if len(url)!=5: continue
        page_info = urllib2.build_opener()
        postData='c='+url[-1]+'&b='+url[-2]
        req = urllib2.Request('http://v.book.ifeng.com/book/remc.htm', postData)
        page=page_info.open(req)
        content=page.read()[14:-1]
        content=BeautifulSoup(content)
        f=open(book_name+'.txt','a')
        if content==None: continue
        for y in content.findAll(text=True):
            if y.encode('GBK','ignore').strip()=='': continue
            f.write(y.encode('GBK','ignore')+'\n')
        f.close()
Example #6
def test():
    link = 'http://read.360buy.com/14532/'
    chapters = extractLinks(link=link,
                            requireLinkStart=link,
                            avoidKeys=['img', 'alt', 'src'],
                            requireLinkEnd='.html')
    print chapters
    content = getWebpage(link)
    soup = BeautifulSoup(content)
    paras = soup.findAll('div', {'class': 'paragraph'})
    intro = soup.find('div', {'class': 'bookintro'})
    book_name = soup.find('div', {'id': 'book-cover'}).find('a')['title']
    print 'collecting: ', book_name
    f = open(book_name + '.txt', 'w')
    f.write('intro: ')
    for y in intro.findAll(text=True):
        if y.encode('utf8', 'ignore').strip() == '': continue
        f.write(y.encode('utf8', 'ignore') + '\n')
    for x in paras:
        for y in x.findAll(text=True):
            f.write(y.encode('utf8', 'ignore') + '\n')
    f.close()
    start = int(chapters[0]['href'][len(link):-5])
    end = int(chapters[-1]['href'][len(link):-5]) + 20
    chapterD = {}
    for x in chapters:
        num = int(x['href'][len(link):-5])
        title = x['title']
        chapterD[num] = title
    count = 0
    for i in range(start, end):
        chapter = link + str(i) + '.html'
        content = getWebpage(chapter)
        soup = BeautifulSoup(content)
        content = soup.find('div', {'id': 'zoom'})
        f = open(book_name + '.txt', 'a')
        if i in chapterD:
            f.write('\n\n' + chapterD[i].encode('utf8', 'ignore') + '\n')
        if content == None: continue
        for y in content.findAll(text=True):
            if y.encode('utf8', 'ignore').strip() == '': continue
            f.write(y.encode('utf8', 'ignore') + '\n')
        f.close()
        #if count>5:break
        count += 1
Example #7
def fetch(head):
    headP=getWebpage(head)
    soup=BeautifulSoup(headP.decode('gbk','ignore'))
    t=soup.find('h1')
    title=t.find(text=True)
    ans=[]
    for x in soup.findAll('td',{'class':"ccss"})[90:]:
        link=x.find('a')
        if link==None: continue
        if not link.has_key('href'): continue
        link=head+link['href']
        chapter=getWebpage(link)
        chapter=BeautifulSoup(chapter)
        content=chapter.find('div',{'id':"content"})
        content='\n'.join(map(clean,content.findAll(text=True)))
        ans.append(content)
    g=open(title+'.txt','w')
    g.write('\n'.join(ans))
    g.close()
Example #8
def fetch(head):
    headP = getWebpage(head)
    soup = BeautifulSoup(headP.decode('gbk', 'ignore'))
    t = soup.find('h1')
    title = t.find(text=True)
    ans = []
    for x in soup.findAll('td', {'class': "ccss"})[90:]:
        link = x.find('a')
        if link == None: continue
        if not link.has_key('href'): continue
        link = head + link['href']
        chapter = getWebpage(link)
        chapter = BeautifulSoup(chapter)
        content = chapter.find('div', {'id': "content"})
        content = '\n'.join(map(clean, content.findAll(text=True)))
        ans.append(content)
    g = open(title + '.txt', 'w')
    g.write('\n'.join(ans))
    g.close()
Example #9
def fetch(prefix, suffix='.shtml'):
    prefaceP = getWebpage(prefix + '1' + suffix)
    prefaceS = BeautifulSoup(prefaceP.decode('utf8', 'ignore'))
    book_title = prefaceS.find('title').find(text=True)
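    # The page count is read out of the pager form's onsubmit handler: the code
    # below takes the handler's last comma-separated argument before the closing ')'.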
    num_page = prefaceS.find('div', {
        'class': "atl-pages"
    }).find('form')['onsubmit']
    num_page = num_page.split()[-1]
    num_page = num_page.split(',')[-1]
    num_page = num_page.split(')')[0]
    num_page = int(num_page)
    book_author = prefaceS.find('a', {'replyid': 0})['author']
    ans = []
    last_author = book_author
    for page_num in xrange(1, num_page):
        link = prefix + str(page_num) + suffix
        page = getWebpage(link)
        soup = BeautifulSoup(page.decode('utf8', 'ignore'))
        posts = soup.findAll('div', {'class': "atl-item"})
        for post in posts:
            try:
                author = post.find('div', {
                    'class': "atl-info"
                }).find('a', {'target': "_blank"})['uname']
            except:
                author = ''
            if author == last_author and author != '':
                author = ''
            else:
                last_author = author
            try:
                post = post.find('div', {'class': "bbs-content"})
            except:
                pass
            post = '\n'.join(map(clean, post.findAll(text=True)))
            if len(post) < 30: continue
            if author != '': post = u'作者:' + author + '\n' + post  # u'作者' means "author"
            post = post.replace('\n\n', '\n')
            post = post.replace('\n\n', '\n')
            ans.append(post)
    g = open(book_title + '.txt', 'w')
    g.write('\n\n'.join(ans))
    g.close()
Example #10
def test():
    link='http://read.360buy.com/14532/'
    chapters=extractLinks(link=link, requireLinkStart=link,avoidKeys=['img','alt','src'],requireLinkEnd='.html')
    print chapters
    content=getWebpage(link)
    soup=BeautifulSoup(content)
    paras=soup.findAll('div',{'class':'paragraph'})
    intro=soup.find('div',{'class':'bookintro'})
    book_name=soup.find('div',{'id':'book-cover'}).find('a')['title']
    print 'collecting: ',book_name
    f=open(book_name+'.txt','w')
    f.write('intro: ')
    for y in intro.findAll(text=True):
        if y.encode('utf8','ignore').strip()=='': continue
        f.write(y.encode('utf8','ignore')+'\n')
    for x in paras:
        for y in x.findAll(text=True):
            f.write(y.encode('utf8','ignore')+'\n')
    f.close()
    start=int(chapters[0]['href'][len(link):-5])
    end=int(chapters[-1]['href'][len(link):-5])+20
    chapterD={}
    for x in chapters:
        num=int(x['href'][len(link):-5])
        title=x['title']
        chapterD[num]=title
    count=0
    for i in range(start,end):
        chapter=link+str(i)+'.html'
        content=getWebpage(chapter)
        soup=BeautifulSoup(content)
        content=soup.find('div',{'id':'zoom'}) 
        f=open(book_name+'.txt','a')
        if i in chapterD:
            f.write('\n\n'+chapterD[i].encode('utf8','ignore')+'\n')
        if content==None: continue
        for y in content.findAll(text=True):
            if y.encode('utf8','ignore').strip()=='': continue
            f.write(y.encode('utf8','ignore')+'\n')
        f.close()
        #if count>5:break
        count+=1
Example #11
def fetch(head):
    prehead = 'http://book.douban.com/reading/'
    headP = getWebpage(head)
    soup = BeautifulSoup(headP.decode('utf8', 'ignore'))
    t = soup.find('h1')
    title = t.find(text=True)
    soup = soup.find('div', {'id': "content"})
    ans = []
    for x in soup.findAll('a'):
        if not x.has_key('href'): break
        link = x['href']
        if link[:len(prehead)] != prehead: continue
        chapter = getWebpage(link)
        chapter = BeautifulSoup(chapter)
        content = chapter.find('div', {'class': "book-content"})
        content = '\n'.join(map(clean, content.findAll(text=True)))
        ans.append(content)
    g = open(title + '.txt', 'w')
    g.write('\n'.join(ans))
    g.close()
Example #12
def fetch(head):
    prehead='http://book.douban.com/reading/' 
    headP=getWebpage(head)
    soup=BeautifulSoup(headP.decode('utf8','ignore')) 
    t=soup.find('h1')
    title=t.find(text=True)
    soup=soup.find('div',{'id':"content"})
    ans=[]
    for x in soup.findAll('a'):
        if not x.has_key('href'): break
        link=x['href']
        if link[:len(prehead)]!=prehead: continue
        chapter=getWebpage(link)
        chapter=BeautifulSoup(chapter)
        content=chapter.find('div',{'class':"book-content"})
        content='\n'.join(map(clean,content.findAll(text=True)))
        ans.append(content)
    g=open(title+'.txt','w')
    g.write('\n'.join(ans))
    g.close()
Example #13
def fetch(head):
    headP = getWebpage(head)
    ind = head.rfind('/')
    head = head[:ind + 1]
    soup = BeautifulSoup(headP.decode('gbk', 'ignore'))
    t = soup.find('h1')
    title = t.find(text=True)
    print title
    ans = []
    soup = soup.find('div', {'class': "booklist clearfix"})
    for x in soup.findAll('a'):
        if not x.has_key('href'): continue
        link = head + x['href']
        chapter = getWebpage(link)
        chapter = BeautifulSoup(chapter)
        content = chapter.find('div', {'class': "bookcontent clearfix"})
        content = '\n'.join(map(clean, content.findAll(text=True)))
        ans.append(content)
    g = open(title + '.txt', 'w')
    g.write('\n'.join(ans))
    g.close()
Example #14
def fetch(head):
    headP=getWebpage(head)
    ind=head.rfind('/')
    head=head[:ind+1]
    soup=BeautifulSoup(headP.decode('gbk','ignore'))
    t=soup.find('h1')
    title=t.find(text=True)
    print title
    ans=[]
    soup=soup.find('div',{'class':"booklist clearfix"})
    for x in soup.findAll('a'):
        if not x.has_key('href'): continue
        link=head+x['href']
        chapter=getWebpage(link)
        chapter=BeautifulSoup(chapter)
        content=chapter.find('div',{'class':"bookcontent clearfix"})
        content='\n'.join(map(clean,content.findAll(text=True)))
        ans.append(content)
    g=open(title+'.txt','w')
    g.write('\n'.join(ans))
    g.close()
Example #15
def getTraffic(title, date , silent=True,country='en'):
    date=str(date)
    link='http://stats.grok.se/json/'+country+'/' + date +'/'+title
    if silent==False: print link
    page=getWebpage(link)
    jsondata= json.loads(page)
    daily_views =jsondata['daily_views']
    tot = 0
    days = 0
    for day, views in daily_views.iteritems():
        tot+=views
        days+=1
    return tot*1.0/days
Example #16
def fetch(prefix,suffix='.shtml'):
    prefaceP=getWebpage(prefix+'1'+suffix)
    prefaceS=BeautifulSoup(prefaceP.decode('utf8','ignore'))
    book_title=prefaceS.find('title').find(text=True)
    num_page=prefaceS.find('div',{'class':"atl-pages"}).find('form')['onsubmit']
    num_page=num_page.split()[-1]
    num_page=num_page.split(',')[-1]
    num_page=num_page.split(')')[0]
    num_page=int(num_page)
    book_author=prefaceS.find('a',{'replyid':0})['author']
    ans=[]
    last_author=book_author
    for page_num in xrange(1,num_page):
        link=prefix+str(page_num)+suffix
        page=getWebpage(link)
        soup=BeautifulSoup(page.decode('utf8','ignore'))
        posts=soup.findAll('div',{'class':"atl-item"})
        for post in posts:
            try:
                author=post.find('div',{'class':"atl-info"}).find('a',{'target':"_blank"})['uname']
            except:
                author=''
            if author==last_author and author!='': 
                author=''
            else:
                last_author=author
            try:
                post=post.find('div',{'class':"bbs-content"})  
            except:
                pass 
            post='\n'.join(map(clean,post.findAll(text=True)))
            if len(post)<30: continue
            if author!='': post=u'作者:'+author+'\n'+post  # u'作者' means "author"
            post=post.replace('\n\n','\n')
            post=post.replace('\n\n','\n')
            ans.append(post)
    g=open(book_title+'.txt','w')
    g.write('\n\n'.join(ans))
    g.close()
Example #17
def fetch(head):
    coo='bid="yrdz0J5ispI"; __gads=ID=720d3fea3d3fb612:T=1352676703:S=ALNI_MZ72ae6zGEgpSfYlI_B0WyhBlV-zA; ll="0"; viewed="2052978_11584608"; dbcl2="4898454:5qnPL5l4FFw"; ck="3bMe"; __utma=30149280.1032140921.1356576933.1356576933.1356614007.2; __utmc=30149280; __utmz=30149280.1356576933.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=30149280.489'
    parts=head.split('/')
    ref='/'.join(parts[:-3])
    headP=getWebpage('http://read.douban.com/reader/',cookies='hst=1; '+coo, reLoad=True,referer=ref)
    return  # note: this early return makes everything below unreachable; 'prehead' used further down is also undefined in this snippet
    soup=BeautifulSoup(headP.decode('utf8','ignore')) 
    t=soup.find('h1')
    title=t.find(text=True)
    soup=soup.find('div',{'id':"content"})
    ans=[]
    for x in soup.findAll('a'):
        if not x.has_key('href'): break
        link=x['href']
        if link[:len(prehead)]!=prehead: continue
        chapter=getWebpage(link)
        chapter=BeautifulSoup(chapter)
        content=chapter.find('div',{'class':"book-content"})
        content='\n'.join(map(clean,content.findAll(text=True)))
        ans.append(content)
    g=open(title+'.txt','w')
    g.write('\n'.join(ans))
    g.close()
Example #18
def fetch(head):
    headP=getWebpage(head+'firstchapter')
    soup=BeautifulSoup(headP)
    for f in soup.findAll('input'):
        if not f.has_key('value'): continue
        if len(f['value'])<100: continue
        break
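    # after the loop, f is the first <input> whose value is long enough (>= 100 chars)
    # to hold the chapter id list; the ids ('id123', ...) are pulled out of it below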
    ids=re.findall("'id\d*'",f['value'])
    ids=map(toId,ids)
    
    ans=[]
    for id in ids:
        try:
            page=head+'id'+id
            page=getWebpage(page,timeSleep=10) 
            soup=BeautifulSoup(page)
            content=soup.find('div',{'class':"htmlcontent"})
            content=''.join(map(clean,content.findAll(text=True)))
            ans.append(content)
        except:
            print head+'id'+id+' failed'
    g=open('download.txt','w')
    g.write('\n'.join(ans))
    g.close()
Example #19
def fetchNovel(link, rLS=None):
    if rLS == None: rLS = link
    chapters=extractLinks(link=link, requireLinkStart=rLS,avoidKeys=['img','alt','src'],\
                          requireLinkEnd='.html')
    content = getWebpage(link)
    soup = BeautifulSoup(content)
    book_name = ''.join(soup.find('title').contents)
    book_name = book_name.split('_')[0]
    print 'collecting: ', book_name
    f = open(book_name + '.txt', 'w')
    f.close()
    count = 0
    for x in chapters:
        chapter = 'http://data.book.163.com' + x['href']
        #print chapter,'0'
        content = getWebpage(chapter)
        soup = BeautifulSoup(content)
        content = soup.find('div', {
            'class': 'bk-article-body',
            'id': 'bk-article-body'
        })
        if content == None: continue  # skip chapters whose body block is missing
        f = open(book_name + '.txt', 'a')
        #print chapter,'1'
        try:
            title = ''.join(x.contents).encode('GBK', 'ignore')
        except:
            title = ''
        if title != '' and (title[-1] != ')' or title[-3:] == '(1)'):
            f.write('\n\n' + title + '\n')
        #print chapter,'2'
        for y in content.findAll(text=True):
            if y.encode('GBK', 'ignore').strip() == '': continue
            f.write(y.encode('GBK', 'ignore') + '\n')
        f.close()
        #if count>5:break
        count += 1
Example #20
def fetch(head):
    headP = getWebpage(head + 'firstchapter')
    soup = BeautifulSoup(headP)
    for f in soup.findAll('input'):
        if not f.has_key('value'): continue
        if len(f['value']) < 100: continue
        break
    ids = re.findall("'id\d*'", f['value'])
    ids = map(toId, ids)

    ans = []
    for id in ids:
        try:
            page = head + 'id' + id
            page = getWebpage(page, timeSleep=10)
            soup = BeautifulSoup(page)
            content = soup.find('div', {'class': "htmlcontent"})
            content = ''.join(map(clean, content.findAll(text=True)))
            ans.append(content)
        except:
            print head + 'id' + id + ' failed'
    g = open('download.txt', 'w')
    g.write('\n'.join(ans))
    g.close()
Example #21
def fetchInfo(homeUrl, type=None):
    homeSoup = getSoup(homeUrl)
    pageSoup = homeSoup
    info = []
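    # N is a constant defined elsewhere in the original module: the number of
    # photos shown per page, used both as the paging step and as the fallback count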
    count = homeSoup.find('span', {'class': 'count'})
    if count:
        count = count.find(text=True)[2:-2]
        count = int(count)
    else:
        count = N  # only one page
    ind = len(homeSoup.findAll('h1')) - 1
    if ind > 1: ind = 1
    album_name = homeSoup.findAll('h1')[ind].find(text=True)
    if '-' in album_name:
        album_name = album_name.split('-')[1]
    album_name = album_name.replace("*", '')
    album_name = album_name.replace("/", '')
    album_name = album_name.split()[0]
    start = 0
    while True:
        photos = pageSoup.findAll('div', {'class': 'photo_wrap'})
        if len(photos) > N: print 'warning on photo number!'
        for photo in photos:
            aTag = photo.find('a', {'class': "photolst_photo"})
            if not aTag: continue
            name = aTag['title']
            url = photo.find('img')['src']
            url = url.replace('thumb', 'large')
            info.append((name, url))
        start += N
        if start > count: break
        page = getWebpage(homeUrl + '?start=' + str(start))
        pageSoup = BeautifulSoup(page)
    photos = homeSoup.findAll('span', {'class': "img"})
    if not photos:
        photos = homeSoup.findAll('a', {'class': "pic"})
    for photo in photos:
        img = photo.find('img')
        if not img: continue
        if not img.has_key('alt'): continue
        name = img['alt']
        if img.has_key('data-src'):
            url = img['data-src']
        else:
            url = img['src']
        url = url.replace('head', 'original')
        info.append((url, name))
    return (album_name, info)
Example #22
def forum_crawl(link,outFile):
    createFile(outFile, force=True)
    p=1
    lastpage=''
    while True:            
        page=getWebpage(link+str(p),timeSleep=0)
        if not page or page==lastpage: break
        lastpage=page
        soup=BeautifulSoup(page.decode('gb2312','ignore'))
        fields=soup.findAll('div',{'id':"content"})
        for field in fields:
            for line in field.findAll(text=True):
                if len(line.strip())>1:
                    out=open(outFile,'a')
                    out.write(line)
                    out.close()
        p+=1
Example #23
def forum_crawl(link, outFile):
    createFile(outFile, force=True)
    p = 1
    lastpage = ''
    while True:
        page = getWebpage(link + str(p), timeSleep=0)
        if not page or page == lastpage: break
        lastpage = page
        soup = BeautifulSoup(page.decode('gb2312', 'ignore'))
        fields = soup.findAll('div', {'id': "content"})
        for field in fields:
            for line in field.findAll(text=True):
                if len(line.strip()) > 1:
                    out = open(outFile, 'a')
                    out.write(line)
                    out.close()
        p += 1
Example #24
def extractLinks(link='', requireLinkStart='', requiredKeys=[],\
                 specification=[], containedIn=None,numTable_level_1=None,\
                 numTable_level_2=None, avoidKeys=[],coo='',requireLinkEnd=''):
    '''Collect the <a> tags on `link` whose href starts with requireLinkStart
    (and, if given, ends with requireLinkEnd), optionally restricted to the
    containers matching `containedIn` and filtered by required/avoided attributes.'''
    ans=[]
    content=getWebpage(link, cookies=coo)     
    soup = BeautifulSoup(content)
    if containedIn!=None:
        tables=soup.findAll(containedIn[0],containedIn[1])
    else:
        tables=[soup]
    for table in tables[:numTable_level_1]:
        for field in table.findAll('a')[:numTable_level_2]:
            if field.has_key('href'): 
                extLink=field['href']
                satisfySpec=True
                for subfield, require in specification:
                    if not field.has_key(subfield):
                        satisfySpec=False
                        break
                    if not require in field[subfield]:
                        satisfySpec=False
                        break
                for requiredKey in requiredKeys:
                    if not field.has_key(requiredKey):
                        satisfySpec=False
                        break
                for avoidKey in avoidKeys:
                    if isinstance(avoidKey,str):
                        if field.has_key(avoidKey):
                            satisfySpec=False
                            break
                    else:
                        akey,akeyValue=avoidKey
                        if field.has_key(akey) and field[akey]==[akeyValue]:
                            satisfySpec=False
                            break
                if satisfySpec==False: continue 
                if extLink[:len(requireLinkStart)]==requireLinkStart and \
                   (requireLinkEnd=='' or extLink[-len(requireLinkEnd):]==requireLinkEnd):
                    ans.append(field)
    return ans
Example #25
def extractLinks(link='', requireLinkStart='', requiredKeys=[],\
                 specification=[], containedIn=None,numTable_level_1=None,\
                 numTable_level_2=None, avoidKeys=[],coo='',requireLinkEnd=''):
    ans = []
    content = getWebpage(link, cookies=coo)
    soup = BeautifulSoup(content)
    if containedIn != None:
        tables = soup.findAll(containedIn[0], containedIn[1])
    else:
        tables = [soup]
    for table in tables[:numTable_level_1]:
        for field in table.findAll('a')[:numTable_level_2]:
            if field.has_key('href'):
                extLink = field['href']
                satisfySpec = True
                for subfield, require in specification:
                    if not field.has_key(subfield):
                        satisfySpec = False
                        break
                    if not require in field[subfield]:
                        satisfySpec = False
                        break
                for requiredKey in requiredKeys:
                    if not field.has_key(requiredKey):
                        satisfySpec = False
                        break
                for avoidKey in avoidKeys:
                    if isinstance(avoidKey, str):
                        if field.has_key(avoidKey):
                            satisfySpec = False
                            break
                    else:
                        akey, akeyValue = avoidKey
                        if field.has_key(akey) and field[akey] == [akeyValue]:
                            satisfySpec = False
                            break
                if satisfySpec == False: continue
                if extLink[:len(requireLinkStart)]==requireLinkStart and \
                   (requireLinkEnd=='' or extLink[-len(requireLinkEnd):]==requireLinkEnd):
                    ans.append(field)
    return ans
Example #26
def fetchInfo(homeUrl,type=None):
    if 'C:\ '[:-1] in homeUrl:  # 'C:\ '[:-1] is the string C:\ (written this way to avoid escaping); i.e. homeUrl is a local file
        f=open(homeUrl)
        homePage=f.read()
        f.close()
    else:
        homePage=getWebpage(homeUrl)
    homeSoup=BeautifulSoup(homePage)
    pageSoup=homeSoup
    info=[]
    if type=='xinmin':    
        album_name=pageSoup.find('title').find(text=True)
        homeUrl=homeUrl.replace('.html','_@@@@.html')
        for x in range(2,100):
            article=pageSoup.find('div',{'class':'article_info'})
            if not article: break
            paragraphs=article.findAll('p')
            if not paragraphs: break
            for paragraph in paragraphs:
                img=paragraph.find('img')
                if not img: 
                    name=paragraph.find(text=True)
                    info.append((name,link))  # pair the caption text with the most recently seen image URL
                else:
                    link=img['src']
            pageUrl=homeUrl.replace('@@@@',str(x))
            page=getWebpage(pageUrl,retry_num=1)
            if not page: break
            pageSoup=BeautifulSoup(page)
        return (album_name,info)
    if type=='douban':
        count=homeSoup.find('span',{'class':'count'})
        if count: 
            count=count.find(text=True)[2:-2]
            count=int(count)
        else:
            count=N # only one page
    ind=len(homeSoup.findAll('h1'))-1
    if ind>1: ind=1
    if type=='douban' or type=='renren':
        album_name=homeSoup.findAll('h1')[ind].find(text=True)
    else:
        album_name=homeSoup.find('title').find(text=True)
    if '-' in album_name:
        if type=='douban' or type=='renren':
            album_name=album_name.split('-')[1]
        else:
            album_name=album_name.split('-')[0]
    album_name=album_name.split()[0]
    album_name=album_name.replace("*",'')
    if album_name=='("▔□▔)/': album_name='smile'
    start=0
    if type=='douban':
        while True:
            photos=pageSoup.findAll('div',{'class':'photo_wrap'})
            if len(photos)>N: print 'warning on photo number!'
            for photo in photos:
                aTag=photo.find('a',{'class':"photolst_photo"})
                if not aTag: continue
                name=aTag['title']
                url=photo.find('img')['src']
                url=url.replace('thumb','large')
                info.append((name,url))
            start+=N
            if start>count: break
            page=getWebpage(homeUrl+'?start='+str(start))
            pageSoup=BeautifulSoup(page)
    photos=homeSoup.findAll('span',{'class':"img"})
    if not photos: 
        photos=homeSoup.findAll('a',{'class':"pic"})
    for photo in photos:
        img=photo.find('img')
        if not img: continue
        if not img.has_key('alt'): continue
        name=img['alt']
        if img.has_key('data-src'): 
            url=img['data-src']
        else:
            url=img['src']
        url=url.replace('head','original')
        info.append((url,name))
        
    return (album_name,info)
Example #27
coo = 'datr=1HSWUNG14Cr81JphyUZWTl2i; lu=gAff9sJJ2_wuev5W3zxFsGZA; sub=128; p=49; c_user=1216615221; csm=2; fr=0regP7HiBNucJQa1n.AWVfvGNhos7mlakT0e52olU2aWo.BQlnT_.nT.AWVtovRV; s=Aa7LrP8dIAOi4SoX; xs=3%3ArXa_AglvHBTByg%3A2%3A1352037631; act=1356128659553%2F6%3A2; presence=EM356128936EuserFA21216615221A2EstateFDsb2F0Et2F_5b_5dElm2FnullEuct2F135610056B0EtrFA2loadA2EtwF1698182903EatF1356128697024G356128936322CEchFDp_5f1216615221F8CC; wd=1280x299'
f = open(sysPath('webpages/ids.txt'))
jf = json.loads(f.read().decode('utf8', 'ignore'))
f.close()

createFile('infos_fb.txt', force=True)
g = open('infos_fb.txt', 'a')
g.write(
    'Name,Given Name,Additional Name,Family Name,Yomi Name,Given Name Yomi,Additional Name Yomi,Family Name Yomi,Name Prefix,Name Suffix,Initials,Nickname,Short Name,Maiden Name,Birthday,Gender,Location,Billing Information,Directory Server,Mileage,Occupation,Hobby,Sensitivity,Priority,Subject,Notes,Group Membership,E-mail 1 - Type,E-mail 1 - Value,E-mail 2 - Type,E-mail 2 - Value,Phone 1 - Type,Phone 1 - Value'
    + '\n')
g.close()

ans = []
for f in jf['data']:
    info = getWebpage('http://www.facebook.com/' + str(f['id']),
                      cookies=coo,
                      info=str(f['id']))
    bI = BeautifulSoup(info)
    link = bI.find('link', {'rel': 'alternate'})
    '''
    info=getWebpage(link['href']+'/info',
                    cookies=coo,
                    info=str(f['id'])
                    )
    '''
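    # The alternate <link> href ends with the profile's username; that trailing
    # path segment becomes the @facebook.com address recorded below.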
    ind = link['href'].rfind('/')
    email = link['href'][ind + 1:]
    ans.append((f['name'], f['id'], email + '@facebook.com'))
    name = f['name']
    id = f['id']
    email = email + '@facebook.com'
Example #28
def getSoup(url,encode='utf8',coo=''):
    page=getWebpage(url,cookies=coo)
    return BeautifulSoup(page.decode(encode,'ignore'))
Example #29
def grabQuestions(link=None, category_number=None, pn=0, reLoad=True):
    '''Scrape one listing page of questions and, for each question, heuristically
    extract its id, answer count, any contact e-mail, a guessed book name and a
    query string (google_query) for a follow-up search.'''
    questions_to_return = []
    if link == None and category_number == None:
        raise Exception('no input')
    if category_number != None:
        newLink = header + clean(str(category_number))
        if pn != 0: newLink += '?pn=' + str(pn)
    if link != None and category_number != None and newLink != link:
        raise Exception('conflict input')
    if link == None: link = newLink
    page = getWebpage(link, reLoad=reLoad, dataDir='webpages')
    page = page.decode('gb2312', 'ignore')
    soup = BeautifulSoup(page, from_encoding="gb2312")
    questions = soup.findAll('tr', {'class': "qlist-tr"})
    for question in questions:
        number = question.find('td', {'class': 'quick-num'})
        number_ans = int(str(number.contents[0]))
        question = question.find('td', {'class': 'align-l'})
        cid = question['cid']
        qid = question['qid']
        qdesc = question['qdesc']
        qtitle = question['qtitle']
        if not cid.isdigit():
            print 'cid is not digit'
            print cid, qid, qdesc, qtitle
            continue
        else:
            cid = int(cid)
        if not qid.isdigit():
            print 'qid is not digit'
            print cid, qid, qdesc, qtitle
            continue
        else:
            qid = int(qid)
        qdesc = clean_sentence(qdesc)
        qtitle = clean_sentence(qtitle)
        #if qid!=499587072: continue

        content = qtitle + '\n' + qdesc
        content_no_space = no_space(content)
        email = re.search('[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]+',
                          content_no_space)
        if email != None:
            email = email.group(0) + 'qq.com'  # note: probably intended '@qq.com'
        if email != None:
            email = re.search('\w+@\w+\.(com|cn)', content_no_space)
            if email != None:
                email = email.group(0)
        if email == None:
            email = ''
        s = u'《' + '.*' + u'》'  # CJK book-title brackets 《...》
        s2 = '<<.*>>'
        ##        s=s.encode('gb2312','ignore')
        ##        print s
        title_separate = separate(qtitle)
        book_name = re.search(s, content_no_space)
        if book_name != None:
            book_name = book_name.group(0)[1:-1]
        if book_name == None:
            book_name = re.search(s2, content_no_space)
            if book_name != None:
                book_name = book_name.group(0)[2:-2]
            else:
                for i in xrange(len(title_separate)):
                    x = title_separate[i]
                    if u'求' in x:  # u'求' means "looking for"
                        ind = x.find(u'求')
                        if len(x) - ind >= 3:
                            book_name = x[ind + 1:]
                        elif i + 1 < len(title_separate):
                            book_name = title_separate[i + 1]
                        break
                    if u'有' in x:  # u'有' means "has"
                        ind = x.find(u'有')
                        if len(x) - ind >= 3:
                            book_name = x[ind + 1:]
                        elif i + 1 < len(title_separate):
                            book_name = title_separate[i + 1]
                        break
                    if 'txt' in x:
                        ind = x.find('txt')
                        if ind > 3:
                            book_name = x[:ind]
                            break
                    if 'TXT' in x:
                        ind = x.find('TXT')
                        if ind > 3:
                            book_name = x[:ind]
                            break

        for x in EndSign:
            if book_name == None: break
            if x in book_name:
                ind = book_name.find(x)
                book_name = book_name[:ind]

        for x in BeginSign:
            if book_name == None: break
            if x in book_name:
                ind = book_name.find(x)
                book_name = book_name[ind + len(x):]

        if book_name != None and '@' in book_name: book_name = None
        if book_name == None or book_name == '':
            book_name = title_separate[0]

        #print qid,int(number_ans),'[',qdesc,',',qtitle,']',email
        #printlist(title_separate)
        #print book_name
        #print '-'*30
        google_query = book_name
        if google_query in title_separate[0]:
            google_query = title_separate[0]
        questions_to_return.append((qid, int(number_ans), email, book_name,
                                    google_query, content_no_space))

    return questions_to_return
Example #30
'''
this program needs a wait time added between requests, or it may cause problems with your renren id
'''
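# One way to honour the warning above: pass a delay to getWebpage (e.g.
# timeSleep=1, as other examples in this collection do) or call time.sleep()
# between profile requests in the loop below.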
from getWebpage import getWebpage
import re
import json,time
from sysPath import createFile

coo='anonymid=h9489u7u-yp0fqs; _r01_=1; mop_uniq_ckid=10.7.18.77_1355594994_642928755; _de=3F3126DBF672F298F26CBA88523C3AB26DEBB8C2103DE356; __utma=151146938.1762808405.1361533510.1361533510.1361533510.1; __utmz=151146938.1361533510.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); l4pager=0; depovince=GW; jebecookies=abb5a061-adf7-4276-9913-0059ed1553e6|||||; p=c506abb8c6dd441921166c4464e116341; ap=269496411; t=351ac721dd34d54a08268e46db838a211; societyguester=351ac721dd34d54a08268e46db838a211; id=269496411; xnsid=cacc7bc0; XNESSESSIONID=376bb17a6b26; at=1; loginfrom=null'
headpage=getWebpage(link='http://friend.renren.com/myfriendlistx.do',
                    cookies=coo)
r=re.search('var friends=(\[.*\]);',headpage)
friendList=r.group(1)
jf=json.loads(friendList)
ids=[]
for f in jf:
    ids.append(f['id'])
createFile('infos.txt',force=True)
g=open('infos.txt','a')
g.write('Name,Given Name,Additional Name,Family Name,Yomi Name,Given Name Yomi,Additional Name Yomi,Family Name Yomi,Name Prefix,Name Suffix,Initials,Nickname,Short Name,Maiden Name,Birthday,Gender,Location,Billing Information,Directory Server,Mileage,Occupation,Hobby,Sensitivity,Priority,Subject,Notes,Group Membership,E-mail 1 - Type,E-mail 1 - Value,E-mail 2 - Type,E-mail 2 - Value,Phone 1 - Type,Phone 1 - Value'+'\n')
g.close()

count=0
for id in ids[:]:
    timeSleep=1
    count+=1
    #print count

    mainInfo=getWebpage('http://www.renren.com/'+str(id)+
                    '/profile?v=info_ajax&undefined',
                    cookies=coo,
Example #31
'''
this program needs a wait time added between requests, or it may cause problems with your renren id
'''
from getWebpage import getWebpage
try:
    from BeautifulSoup import BeautifulSoup, SoupStrainer
except:
    from bs4 import BeautifulSoup, SoupStrainer  # beta version of bs
import re
import json, time
from sysPath import createFile
import sys

coo = '''anonymid=h9489u7u-yp0fqs; _r01_=1; mop_uniq_ckid=10.7.18.77_1355594994_642928755; __utma=10481322.145044192.1363634540.1363634540.1363636668.2; __utmz=10481322.1363636668.2.2.utmcsr=renren.com|utmccn=(referral)|utmcmd=referral|utmcct=/269496411; _de=3F3126DBF672F298F26CBA88523C3AB26DEBB8C2103DE356; depovince=GW; bt_new=12; jebecookies=63880745-b57f-4dce-b75e-7cc2218be89a|||||; p=9babffa88c9c71f7219d11a49178460d1; ap=269496411; t=fa5d5d911dc472ebde86481e5486062e1; societyguester=fa5d5d911dc472ebde86481e5486062e1; id=269496411; xnsid=6ef4dee; loginfrom=null; feedType=269496411_hot; JSESSIONID=abcMqcp8dHsTAh3nle53t; l4pager=0'''
headpage = getWebpage(link='http://friend.renren.com/myfriendlistx.do',
                      cookies=coo)
r = re.search('var friends=(\[.*\]);', headpage)
friendList = r.group(1)
jf = json.loads(friendList)
ids = []
for f in jf:
    ids.append(f['id'])

if len(sys.argv) >= 2:
    start_num = int(sys.argv[1])
else:
    start_num = 0

timeSleep = 0.8  # delay between profile requests, per the warning in the docstring
for id in ids[start_num:start_num + 100]:
    page = getWebpage('http://www.renren.com/' + str(id) + '/profile',
Example #32
def fetch(head, nid):
    head += '-' + str(nid)
    headP = getWebpage(head + '.html')
    soup = BeautifulSoup(headP.decode('utf8', 'ignore'))
    n = soup.find('div', {'class': "pageNum1"})
    n = n.contents[0]
    r = re.match(u'共' + '(\d*)' + u'页', n)  # matches "共N页", i.e. "N pages in total"
    n = r.group(1)
    try:
        n = int(n)
    except:
        print 'failed to find the number of page'
        n = 1000
    ans = []
    for i in range(1, n + 1):
        page = head + '-' + str(i) + '.html'
        page = getWebpage(page)
        soup = BeautifulSoup(page)
        posts = soup.findAll('li', {'class': 'at c h2'})
        for post in posts:
            post = '\n'.join(map(clean, post.findAll(text=True)))
            if len(post) < 10: continue
            if reply(post): continue
            ans.append(post)
    g = open(str(nid) + '.txt', 'w')
    g.write('\n'.join(ans))
    g.close()

    return  # note: everything below this point is unreachable leftover code ('link' below is undefined here)

    return
    chapters = extractLinks(link=link,
                            requireLinkStart=link,
                            avoidKeys=['img', 'alt', 'src'],
                            requireLinkEnd='.html')
    print chapters
    content = getWebpage(link)
    soup = BeautifulSoup(content)
    paras = soup.findAll('div', {'class': 'paragraph'})
    intro = soup.find('div', {'class': 'bookintro'})
    book_name = soup.find('div', {'id': 'book-cover'}).find('a')['title']
    print 'collecting: ', book_name
    f = open(book_name + '.txt', 'w')
    f.write('intro: ')
    for y in intro.findAll(text=True):
        if y.encode('utf8', 'ignore').strip() == '': continue
        f.write(y.encode('utf8', 'ignore') + '\n')
    for x in paras:
        for y in x.findAll(text=True):
            f.write(y.encode('utf8', 'ignore') + '\n')
    f.close()
    start = int(chapters[0]['href'][len(link):-5])
    end = int(chapters[-1]['href'][len(link):-5]) + 20
    chapterD = {}
    for x in chapters:
        num = int(x['href'][len(link):-5])
        title = x['title']
        chapterD[num] = title
    count = 0
    for i in range(start, end):
        chapter = link + str(i) + '.html'
        content = getWebpage(chapter)
        soup = BeautifulSoup(content)
        content = soup.find('div', {'id': 'zoom'})
        f = open(book_name + '.txt', 'a')
        if i in chapterD:
            f.write('\n\n' + chapterD[i].encode('utf8', 'ignore') + '\n')
        if content == None: continue
        for y in content.findAll(text=True):
            if y.encode('utf8', 'ignore').strip() == '': continue
            f.write(y.encode('utf8', 'ignore') + '\n')
        f.close()
        #if count>5:break
        count += 1
Example #33
def genTable(filename='../../testData/testingMonuments.txt',\
             outfname='../../testData/testingMonumentsData_week4_all.csv', \
             months=None,yearBegin=2009, yearEnd=2015,silent=True,endLine=None,\
             testNow=False, country='en'):
    now = datetime.datetime.now()
    now=(int(now.year),int(now.month))
    if months==None:
        months=[]
        for year in range(yearBegin,yearEnd):
            for month in range(1,13):
                if (year, month)>=now: break
                months.append(str(year)+'0'*(2-len(str(month)))+str(month))  # zero-padded 'YYYYMM'
    months=map(str,months)
    filename=sysPath(filename)
    f=open(filename,'r')
    links=f.read().splitlines()
    f.close()    
    #soup=BeautifulSoup(links)
    titleLine=['linkTitle']
    for month in months:
        titleLine.append('Img'+month)
        titleLine.append('Content'+month)
        titleLine.append('Traffic'+month)
    if not os.path.exists(outfname):
        outf=open(outfname,'w')
        outf.write('\t'.join(titleLine)+'\n')
        start=0
        outf.close()
    else:
        outf=open(outfname,'r')
        start=len(outf.read().splitlines())
        outf.close()
    count=0
##    for field in soup.findAll('a')[:endLine]:
    for linkTitle in links:
        index=linkTitle.find('/wiki/')
        if index!=-1:
            linkTitle=linkTitle[index+6:]
        count+=1
        if count<start: continue
##        if not field.has_key('title'): continue
##        linkTitle=field['href'][6:]
##        officialTitle=field['title']
        curLine=[linkTitle]
        for month in months:
            date=month+'01'
            revId=getRevId(linkTitle, date+'000000' , silent=silent,country=country) # 6 zeros for h,m,s
            if not silent: print 'revId=',revId
            if revId==None:
                curLine+=['','','']
                continue
            link='http://'+country+'.wikipedia.org/w/index.php?oldid='+revId
            if testNow: print 'title=',linkTitle, 'link=',link,'month=',month
            if not silent: print 'prepare'
            page=getWebpage(link, timeSleep=0.5,silent=silent)
            if not silent: print 'got page'
            soup=BeautifulSoup(page)
            if not silent: print 'got soup'
            numImg=numImage(soup)            
            if not silent: print 'got num'
            conLen=contentLen(soup)
            if not silent: print 'got len'
            traffic=str(getTraffic(linkTitle,month, silent=silent, country=country))
            if not silent: print 'got history'
            curLine+=[numImg, conLen, traffic]
        curLine=map(str, curLine)
        outf=open(outfname,'a')
        outf.write('\t'.join(curLine)+'\n')
        outf.close()
Example #34
try:
    from BeautifulSoup import BeautifulSoup, SoupStrainer
except:
    from bs4 import BeautifulSoup,SoupStrainer # beta version of bs

coo='datr=1HSWUNG14Cr81JphyUZWTl2i; lu=gAff9sJJ2_wuev5W3zxFsGZA; sub=128; p=49; c_user=1216615221; csm=2; fr=0regP7HiBNucJQa1n.AWVfvGNhos7mlakT0e52olU2aWo.BQlnT_.nT.AWVtovRV; s=Aa7LrP8dIAOi4SoX; xs=3%3ArXa_AglvHBTByg%3A2%3A1352037631; act=1356128659553%2F6%3A2; presence=EM356128936EuserFA21216615221A2EstateFDsb2F0Et2F_5b_5dElm2FnullEuct2F135610056B0EtrFA2loadA2EtwF1698182903EatF1356128697024G356128936322CEchFDp_5f1216615221F8CC; wd=1280x299'
f=open(sysPath('webpages/ids.txt'))
jf=json.loads(f.read().decode('utf8','ignore'))
f.close()

createFile('infos_fb.txt',force=True)
g=open('infos_fb.txt','a')
g.write('Name,Given Name,Additional Name,Family Name,Yomi Name,Given Name Yomi,Additional Name Yomi,Family Name Yomi,Name Prefix,Name Suffix,Initials,Nickname,Short Name,Maiden Name,Birthday,Gender,Location,Billing Information,Directory Server,Mileage,Occupation,Hobby,Sensitivity,Priority,Subject,Notes,Group Membership,E-mail 1 - Type,E-mail 1 - Value,E-mail 2 - Type,E-mail 2 - Value,Phone 1 - Type,Phone 1 - Value'+'\n')
g.close()

ans=[]
for f in jf['data']:
    info=getWebpage('http://www.facebook.com/'+str(f['id']),
                    cookies=coo,
                    info=str(f['id'])
                    )
    bI=BeautifulSoup(info)
    link=bI.find('link',{'rel':'alternate'})
    '''
    info=getWebpage(link['href']+'/info',
                    cookies=coo,
                    info=str(f['id'])
                    )
    '''
    ind=link['href'].rfind('/')
    email=link['href'][ind+1:]
    ans.append((f['name'],f['id'],email+'@facebook.com'))
    name=f['name']
    id=f['id']
    email=email+'@facebook.com'
Example #35
def fetch(head, nid):
    head += "-" + str(nid)
    headP = getWebpage(head + ".html")
    soup = BeautifulSoup(headP.decode("utf8", "ignore"))
    n = soup.find("div", {"class": "pageNum1"})
    n = n.contents[0]
    r = re.match(u"共" + "(\d*)" + u"页", n)  # matches "共N页", i.e. "N pages in total"
    n = r.group(1)
    try:
        n = int(n)
    except:
        print "failed to find the number of page"
        n = 1000
    ans = []
    for i in range(1, n + 1):
        page = head + "-" + str(i) + ".html"
        page = getWebpage(page)
        soup = BeautifulSoup(page)
        posts = soup.findAll("li", {"class": "at c h2"})
        for post in posts:
            post = "\n".join(map(clean, post.findAll(text=True)))
            if len(post) < 10:
                continue
            if reply(post):
                continue
            ans.append(post)
    g = open(str(nid) + ".txt", "w")
    g.write("\n".join(ans))
    g.close()

    return  # note: everything below this point is unreachable leftover code ("link" below is undefined here)

    return
    chapters = extractLinks(link=link, requireLinkStart=link, avoidKeys=["img", "alt", "src"], requireLinkEnd=".html")
    print chapters
    content = getWebpage(link)
    soup = BeautifulSoup(content)
    paras = soup.findAll("div", {"class": "paragraph"})
    intro = soup.find("div", {"class": "bookintro"})
    book_name = soup.find("div", {"id": "book-cover"}).find("a")["title"]
    print "collecting: ", book_name
    f = open(book_name + ".txt", "w")
    f.write("intro: ")
    for y in intro.findAll(text=True):
        if y.encode("utf8", "ignore").strip() == "":
            continue
        f.write(y.encode("utf8", "ignore") + "\n")
    for x in paras:
        for y in x.findAll(text=True):
            f.write(y.encode("utf8", "ignore") + "\n")
    f.close()
    start = int(chapters[0]["href"][len(link) : -5])
    end = int(chapters[-1]["href"][len(link) : -5]) + 20
    chapterD = {}
    for x in chapters:
        num = int(x["href"][len(link) : -5])
        title = x["title"]
        chapterD[num] = title
    count = 0
    for i in range(start, end):
        chapter = link + str(i) + ".html"
        content = getWebpage(chapter)
        soup = BeautifulSoup(content)
        content = soup.find("div", {"id": "zoom"})
        f = open(book_name + ".txt", "a")
        if i in chapterD:
            f.write("\n\n" + chapterD[i].encode("utf8", "ignore") + "\n")
        if content == None:
            continue
        for y in content.findAll(text=True):
            if y.encode("utf8", "ignore").strip() == "":
                continue
            f.write(y.encode("utf8", "ignore") + "\n")
        f.close()
        # if count>5:break
        count += 1
Example #36
        if name=='servingsPerContainer': name='Servings Per Container'
        if name=='servingSize': name='Serving Size'
        value=detail[x]
        if name[-1]==':': name=name[:-1]
        if len(name)<3: continue
        new_detail[name]=value
    for x in detail:
        name=x
        value=detail[x]
        if len(name.split())>1 and name[-1] in map(str,range(10))+['%','g']:
            value=clean(name.split()[-1])
            name=clean(' '.join(name.split()[:-1]))
            tmpname=None
            if name.endswith('Calories from Fat'):
                tmpname=name
                name='Calories from Fat'
            if not name in new_detail:
                new_detail[name]=value
            if tmpname!=None:
                tmpname=tmpname.split()
                if tmpname[0]=='Calories' and len(tmpname)>1:
                    value=tmpname[1]
                    new_detail['Calories']=value
    return new_detail

page=getWebpage('http://www.peapod.com/itemDetailView.jhtml?productId=155508',dataDir='../../data/detail_pages')

getDetails(page)
Example #37
def getSoup(url, encode='utf8', coo=''):
    page = getWebpage(url, cookies=coo)
    return BeautifulSoup(page.decode(encode, 'ignore'))