def fetch(head):
    # hard-coded douban session cookie; the reader endpoint refuses anonymous requests
    coo = 'bid="yrdz0J5ispI"; __gads=ID=720d3fea3d3fb612:T=1352676703:S=ALNI_MZ72ae6zGEgpSfYlI_B0WyhBlV-zA; ll="0"; viewed="2052978_11584608"; dbcl2="4898454:5qnPL5l4FFw"; ck="3bMe"; __utma=30149280.1032140921.1356576933.1356576933.1356614007.2; __utmc=30149280; __utmz=30149280.1356576933.1.1.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=30149280.489'
    parts = head.split('/')
    ref = '/'.join(parts[:-3])
    headP = getWebpage('http://read.douban.com/reader/',
                       cookies='hst=1; ' + coo, reLoad=True, referer=ref)
    return

def fetchTieba(prefaceL, lz=True):
    # lz: only keep posts by the original poster (the "see_lz" view)
    if lz and prefaceL[-8:] != 'see_lz=1':
        prefaceL += ('&' if '?' in prefaceL else '?') + 'see_lz=1'
    prefaceP = getWebpage(prefaceL)
    prefaceS = BeautifulSoup(prefaceP.decode('gbk', 'ignore'))
    book_title = prefaceS.find('title').find(text=True)
    totalP = 1  # fall back to a single page if the "last page" link is missing
    for link in prefaceS.findAll('a'):
        if link.find(text=True) == u'尾页':  # the "last page" link carries the page count
            lastL = link['href']
            ind = lastL.rfind('=')
            totalP = int(lastL[ind + 1:])
    pageS = prefaceS
    currentP = 1
    if not '?' in prefaceL:
        prefaceL += '?'
    else:
        prefaceL += '&'
    ans = []
    while True:
        posts = pageS.findAll('div', {'class': "d_post_content"})
        for post in posts:
            ans.append('\n'.join(post.findAll(text=True)))
        currentP += 1
        if currentP > totalP:
            break
        page = getWebpage(prefaceL + 'pn=' + str(currentP))
        pageS = BeautifulSoup(page)
    g = open(book_title + '.txt', 'w')
    g.write('\n\n'.join(ans))
    g.close()

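# A minimal, self-contained sketch of the pagination arithmetic used by
# fetchTieba above: read the page count off the "last page" href, then build
# the per-page URLs with the pn parameter. The href and thread URL below are
# fabricated examples, not real tieba links.
def _demo_tieba_paging():
    lastL = '/p/2216563989?see_lz=1&pn=42'   # hypothetical "last page" href
    totalP = int(lastL[lastL.rfind('=') + 1:])  # -> 42
    prefaceL = 'http://tieba.baidu.com/p/2216563989?see_lz=1'
    prefaceL += '&' if '?' in prefaceL else '?'
    page_links = [prefaceL + 'pn=' + str(p) for p in range(2, totalP + 1)]
    print totalP, page_links[0], page_links[-1]

_demo_tieba_paging()
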
def fetchNovel(link, rLS=None):
    if rLS == None:
        rLS = link
    chapters = extractLinks(link=link, requireLinkStart=rLS,
                            avoidKeys=['img', 'alt', 'src'],
                            requireLinkEnd='.html')
    content = getWebpage(link)
    soup = BeautifulSoup(content)
    book_name = ''.join(soup.find('title').contents)
    book_name = book_name.split('_')[0]
    print 'collecting: ', book_name
    f = open(book_name + '.txt', 'w')
    f.close()
    count = 0
    for x in chapters:
        chapter = 'http://data.book.163.com' + x['href']
        #print chapter,'0'
        content = getWebpage(chapter)
        soup = BeautifulSoup(content)
        content = soup.find('div', {'class': 'bk-article-body',
                                    'id': 'bk-article-body'})
        if content == None:  # skip chapters whose body block is missing
            continue
        f = open(book_name + '.txt', 'a')
        #print chapter,'1'
        try:
            title = ''.join(x.contents).encode('GBK', 'ignore')
        except:
            title = ''
        if title != '' and (title[-1] != ')' or title[-3:] == '(1)'):
            f.write('\n\n' + title + '\n')
        #print chapter,'2'
        for y in content.findAll(text=True):
            if y.encode('GBK', 'ignore').strip() == '':
                continue
            f.write(y.encode('GBK', 'ignore') + '\n')
        f.close()
        #if count>5:break
        count += 1

def fetchNovel(link):
    content = getWebpage(link)
    soup = BeautifulSoup(content)
    list_c = soup.find('div', {'class': "book_neirong_left"})
    chapters = list_c.findAll('a')
    paras = soup.findAll('div', {'class': 'paragraph'})
    intro = soup.find('div', {'class': 'bookintro'})
    book_name = ''.join(soup.find('title').findAll(text=True)).strip()
    print 'collecting: ', book_name
    for c in chapters:
        url = c['href'][:-4]
        url = url.split('/')
        if len(url) != 5:
            continue
        # chapter text is served by a separate endpoint that takes the
        # book id and chapter id as POST data
        page_info = urllib2.build_opener()
        postData = 'c=' + url[-1] + '&b=' + url[-2]
        req = urllib2.Request('http://v.book.ifeng.com/book/remc.htm', postData)
        page = page_info.open(req)
        content = page.read()[14:-1]  # drop the fixed-length wrapper around the HTML fragment
        content = BeautifulSoup(content)
        f = open(book_name + '.txt', 'a')
        if content == None:
            continue
        for y in content.findAll(text=True):
            if y.encode('GBK', 'ignore').strip() == '':
                continue
            f.write(y.encode('GBK', 'ignore') + '\n')
        f.close()

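# Self-contained sketch of how the POST payload above is derived from a chapter
# href. The href is a fabricated example with the shape the code expects
# (five '/'-separated parts once the trailing ".htm" is cut off).
def _demo_ifeng_postdata():
    href = '/book/read/1234/56789.htm'   # hypothetical chapter link
    parts = href[:-4].split('/')         # -> ['', 'book', 'read', '1234', '56789']
    if len(parts) == 5:
        postData = 'c=' + parts[-1] + '&b=' + parts[-2]
        print postData                   # -> c=56789&b=1234

_demo_ifeng_postdata()
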
def test():
    link = 'http://read.360buy.com/14532/'
    chapters = extractLinks(link=link, requireLinkStart=link,
                            avoidKeys=['img', 'alt', 'src'],
                            requireLinkEnd='.html')
    print chapters
    content = getWebpage(link)
    soup = BeautifulSoup(content)
    paras = soup.findAll('div', {'class': 'paragraph'})
    intro = soup.find('div', {'class': 'bookintro'})
    book_name = soup.find('div', {'id': 'book-cover'}).find('a')['title']
    print 'collecting: ', book_name
    f = open(book_name + '.txt', 'w')
    f.write('intro: ')
    for y in intro.findAll(text=True):
        if y.encode('utf8', 'ignore').strip() == '':
            continue
        f.write(y.encode('utf8', 'ignore') + '\n')
    for x in paras:
        for y in x.findAll(text=True):
            f.write(y.encode('utf8', 'ignore') + '\n')
    f.close()
    start = int(chapters[0]['href'][len(link):-5])
    end = int(chapters[-1]['href'][len(link):-5]) + 20
    chapterD = {}
    for x in chapters:
        num = int(x['href'][len(link):-5])
        title = x['title']
        chapterD[num] = title
    count = 0
    for i in range(start, end):
        chapter = link + str(i) + '.html'
        content = getWebpage(chapter)
        soup = BeautifulSoup(content)
        content = soup.find('div', {'id': 'zoom'})
        f = open(book_name + '.txt', 'a')
        if i in chapterD:
            f.write('\n\n' + chapterD[i].encode('utf8', 'ignore') + '\n')
        if content == None:
            continue
        for y in content.findAll(text=True):
            if y.encode('utf8', 'ignore').strip() == '':
                continue
            f.write(y.encode('utf8', 'ignore') + '\n')
        f.close()
        #if count>5:break
        count += 1

def fetch(head):
    headP = getWebpage(head)
    soup = BeautifulSoup(headP.decode('gbk', 'ignore'))
    t = soup.find('h1')
    title = t.find(text=True)
    ans = []
    for x in soup.findAll('td', {'class': "ccss"})[90:]:
        link = x.find('a')
        if link == None:
            continue
        if not link.has_key('href'):
            continue
        link = head + link['href']
        chapter = getWebpage(link)
        chapter = BeautifulSoup(chapter)
        content = chapter.find('div', {'id': "content"})
        content = '\n'.join(map(clean, content.findAll(text=True)))
        ans.append(content)
    g = open(title + '.txt', 'w')
    g.write('\n'.join(ans))
    g.close()

def fetch(prefix, suffix='.shtml'):
    prefaceP = getWebpage(prefix + '1' + suffix)
    prefaceS = BeautifulSoup(prefaceP.decode('utf8', 'ignore'))
    book_title = prefaceS.find('title').find(text=True)
    num_page = prefaceS.find('div', {'class': "atl-pages"}).find('form')['onsubmit']
    num_page = num_page.split()[-1]
    num_page = num_page.split(',')[-1]
    num_page = num_page.split(')')[0]
    num_page = int(num_page)
    book_author = prefaceS.find('a', {'replyid': 0})['author']
    ans = []
    last_author = book_author
    for page_num in xrange(1, num_page + 1):  # include the last page
        link = prefix + str(page_num) + suffix
        page = getWebpage(link)
        soup = BeautifulSoup(page.decode('utf8', 'ignore'))
        posts = soup.findAll('div', {'class': "atl-item"})
        for post in posts:
            try:
                author = post.find('div', {'class': "atl-info"}).find(
                    'a', {'target': "_blank"})['uname']
            except:
                author = ''
            # only label a post when the author changes
            if author == last_author and author != '':
                author = ''
            else:
                last_author = author
            try:
                post = post.find('div', {'class': "bbs-content"})
            except:
                pass
            post = '\n'.join(map(clean, post.findAll(text=True)))
            if len(post) < 30:
                continue
            if author != '':
                post = u'作者:' + author + '\n' + post
            post = post.replace('\n\n', '\n')  # collapse blank lines
            post = post.replace('\n\n', '\n')
            ans.append(post)
    g = open(book_title + '.txt', 'w')
    g.write('\n\n'.join(ans))
    g.close()

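# Self-contained sketch of the page-count parsing used in fetch above: the
# number of pages is read off the last argument of the pager form's onsubmit
# handler. The onsubmit string here is a fabricated example of that shape.
def _demo_tianya_pagecount():
    onsubmit = 'return gopage(this.elements, 127)'  # hypothetical handler
    num_page = onsubmit.split()[-1]      # '127)'
    num_page = num_page.split(',')[-1]   # '127)'
    num_page = num_page.split(')')[0]    # '127'
    print int(num_page)                  # -> 127

_demo_tianya_pagecount()
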
def fetch(head):
    prehead = 'http://book.douban.com/reading/'
    headP = getWebpage(head)
    soup = BeautifulSoup(headP.decode('utf8', 'ignore'))
    t = soup.find('h1')
    title = t.find(text=True)
    soup = soup.find('div', {'id': "content"})
    ans = []
    for x in soup.findAll('a'):
        if not x.has_key('href'):
            break
        link = x['href']
        if link[:len(prehead)] != prehead:
            continue
        chapter = getWebpage(link)
        chapter = BeautifulSoup(chapter)
        content = chapter.find('div', {'class': "book-content"})
        content = '\n'.join(map(clean, content.findAll(text=True)))
        ans.append(content)
    g = open(title + '.txt', 'w')
    g.write('\n'.join(ans))
    g.close()

def fetch(head):
    headP = getWebpage(head)
    ind = head.rfind('/')
    head = head[:ind + 1]
    soup = BeautifulSoup(headP.decode('gbk', 'ignore'))
    t = soup.find('h1')
    title = t.find(text=True)
    print title
    ans = []
    soup = soup.find('div', {'class': "booklist clearfix"})
    for x in soup.findAll('a'):
        if not x.has_key('href'):
            continue
        link = head + x['href']
        chapter = getWebpage(link)
        chapter = BeautifulSoup(chapter)
        content = chapter.find('div', {'class': "bookcontent clearfix"})
        content = '\n'.join(map(clean, content.findAll(text=True)))
        ans.append(content)
    g = open(title + '.txt', 'w')
    g.write('\n'.join(ans))
    g.close()

def getTraffic(title, date, silent=True, country='en'):
    date = str(date)
    link = 'http://stats.grok.se/json/' + country + '/' + date + '/' + title
    if silent == False:
        print link
    page = getWebpage(link)
    jsondata = json.loads(page)
    daily_views = jsondata['daily_views']
    tot = 0
    days = 0
    for day, views in daily_views.iteritems():
        tot += views
        days += 1
    return tot * 1.0 / days

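# Self-contained sketch of the averaging done by getTraffic, using a hard-coded
# stand-in for the 'daily_views' dict that the stats.grok.se JSON contains.
def _demo_average_views():
    daily_views = {'2012-01-01': 120, '2012-01-02': 90, '2012-01-03': 150}
    tot = 0
    days = 0
    for day, views in daily_views.iteritems():
        tot += views
        days += 1
    print tot * 1.0 / days   # -> 120.0

_demo_average_views()
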
def fetch(head):
    headP = getWebpage(head + 'firstchapter')
    soup = BeautifulSoup(headP)
    for f in soup.findAll('input'):
        if not f.has_key('value'):
            continue
        if len(f['value']) < 100:
            continue
        break
    ids = re.findall(r"'id\d*'", f['value'])
    ids = map(toId, ids)
    ans = []
    for id in ids:
        try:
            page = head + 'id' + id
            page = getWebpage(page, timeSleep=10)
            soup = BeautifulSoup(page)
            content = soup.find('div', {'class': "htmlcontent"})
            content = ''.join(map(clean, content.findAll(text=True)))
            ans.append(content)
        except:
            print head + 'id' + id + ' failed'
    g = open('download.txt', 'w')
    g.write('\n'.join(ans))
    g.close()

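# Self-contained sketch of the chapter-id extraction in fetch above. toId is
# not defined in this snippet, so a hypothetical version that strips the quotes
# and the leading 'id' stands in for it here; the input string is fabricated.
import re

def _toId_guess(s):
    return s.strip("'")[2:]   # "'id123'" -> '123' (assumption about toId)

def _demo_extract_ids():
    value = "loadChapters(['id101','id102','id103'])"
    ids = re.findall(r"'id\d*'", value)
    ids = map(_toId_guess, ids)
    print ids   # -> ['101', '102', '103']

_demo_extract_ids()
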
def fetchInfo(homeUrl, type=None):
    homeSoup = getSoup(homeUrl)
    pageSoup = homeSoup
    info = []
    count = homeSoup.find('span', {'class': 'count'})
    if count:
        count = count.find(text=True)[2:-2]
        count = int(count)
    else:
        count = N  # only one page
    ind = len(homeSoup.findAll('h1')) - 1
    if ind > 1:
        ind = 1
    album_name = homeSoup.findAll('h1')[ind].find(text=True)
    if '-' in album_name:
        album_name = album_name.split('-')[1]
    album_name = album_name.replace("*", '')
    album_name = album_name.replace("/", '')
    album_name = album_name.split()[0]
    start = 0
    while True:
        photos = pageSoup.findAll('div', {'class': 'photo_wrap'})
        if len(photos) > N:
            print 'warning on photo number!'
        for photo in photos:
            aTag = photo.find('a', {'class': "photolst_photo"})
            if not aTag:
                continue
            name = aTag['title']
            url = photo.find('img')['src']
            url = url.replace('thumb', 'large')
            info.append((name, url))
        start += N
        if start > count:
            break
        page = getWebpage(homeUrl + '?start=' + str(start))
        pageSoup = BeautifulSoup(page)
    photos = homeSoup.findAll('span', {'class': "img"})
    if not photos:
        photos = homeSoup.findAll('a', {'class': "pic"})
    for photo in photos:
        img = photo.find('img')
        if not img:
            continue
        if not img.has_key('alt'):
            continue
        name = img['alt']
        if img.has_key('data-src'):
            url = img['data-src']
        else:
            url = img['src']
        url = url.replace('head', 'original')
        info.append((url, name))
    return (album_name, info)

def forum_crawl(link, outFile):
    createFile(outFile, force=True)
    p = 1
    lastpage = ''
    while True:
        page = getWebpage(link + str(p), timeSleep=0)
        if not page or page == lastpage:
            break
        lastpage = page
        soup = BeautifulSoup(page.decode('gb2312', 'ignore'))
        fields = soup.findAll('div', {'id': "content"})
        for field in fields:
            for line in field.findAll(text=True):
                if len(line.strip()) > 1:
                    out = open(outFile, 'a')
                    out.write(line)
                    out.close()
        p += 1

def extractLinks(link='', requireLinkStart='', requiredKeys=[],
                 specification=[], containedIn=None, numTable_level_1=None,
                 numTable_level_2=None, avoidKeys=[], coo='', requireLinkEnd=''):
    ans = []
    content = getWebpage(link, cookies=coo)
    soup = BeautifulSoup(content)
    if containedIn != None:
        tables = soup.findAll(containedIn[0], containedIn[1])
    else:
        tables = [soup]
    for table in tables[:numTable_level_1]:
        for field in table.findAll('a')[:numTable_level_2]:
            if field.has_key('href'):
                extLink = field['href']
                satisfySpec = True
                for subfield, require in specification:
                    if not field.has_key(subfield):
                        satisfySpec = False
                        break
                    if not require in field[subfield]:
                        satisfySpec = False
                        break
                for requiredKey in requiredKeys:
                    if not field.has_key(requiredKey):
                        satisfySpec = False
                        break
                for avoidKey in avoidKeys:
                    if isinstance(avoidKey, str):
                        if field.has_key(avoidKey):
                            satisfySpec = False
                            break
                    else:
                        akey, akeyValue = avoidKey
                        if field.has_key(akey) and field[akey] == [akeyValue]:
                            satisfySpec = False
                            break
                if satisfySpec == False:
                    continue
                if extLink[:len(requireLinkStart)] == requireLinkStart and \
                   (requireLinkEnd == '' or extLink[-len(requireLinkEnd):] == requireLinkEnd):
                    ans.append(field)
    return ans

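# Self-contained sketch of the link filter at the heart of extractLinks: keep
# only hrefs that start with requireLinkStart and end with requireLinkEnd.
# The URLs are fabricated examples.
def _demo_link_filter():
    requireLinkStart = 'http://data.book.163.com/book/'
    requireLinkEnd = '.html'
    hrefs = ['http://data.book.163.com/book/ch1.html',
             'http://data.book.163.com/img/cover.jpg',
             'http://other.site/book/ch2.html']
    kept = [h for h in hrefs
            if h[:len(requireLinkStart)] == requireLinkStart
            and (requireLinkEnd == '' or h[-len(requireLinkEnd):] == requireLinkEnd)]
    print kept   # -> ['http://data.book.163.com/book/ch1.html']

_demo_link_filter()
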
def fetchInfo(homeUrl, type=None):
    if 'C:\\' in homeUrl:  # a local Windows file path was passed instead of a URL
        f = open(homeUrl)
        homePage = f.read()
        f.close()
    else:
        homePage = getWebpage(homeUrl)
    homeSoup = BeautifulSoup(homePage)
    pageSoup = homeSoup
    info = []
    if type == 'xinmin':
        album_name = pageSoup.find('title').find(text=True)
        homeUrl = homeUrl.replace('.html', '_@@@@.html')
        link = ''  # last image URL seen; guards a caption that appears before any image
        for x in range(2, 100):
            article = pageSoup.find('div', {'class': 'article_info'})
            if not article:
                break
            paragraphs = article.findAll('p')
            if not paragraphs:
                break
            for paragraph in paragraphs:
                img = paragraph.find('img')
                if not img:
                    name = paragraph.find(text=True)
                    info.append((name, link))
                else:
                    link = img['src']
            pageUrl = homeUrl.replace('@@@@', str(x))
            page = getWebpage(pageUrl, retry_num=1)
            if not page:
                break
            pageSoup = BeautifulSoup(page)
        return (album_name, info)
    if type == 'douban':
        count = homeSoup.find('span', {'class': 'count'})
        if count:
            count = count.find(text=True)[2:-2]
            count = int(count)
        else:
            count = N  # only one page
    ind = len(homeSoup.findAll('h1')) - 1
    if ind > 1:
        ind = 1
    if type == 'douban' or type == 'renren':
        album_name = homeSoup.findAll('h1')[ind].find(text=True)
    else:
        album_name = homeSoup.find('title').find(text=True)
    if '-' in album_name:
        if type == 'douban' or type == 'renren':
            album_name = album_name.split('-')[1]
        else:
            album_name = album_name.split('-')[0]
    album_name = album_name.split()[0]
    album_name = album_name.replace("*", '')
    if album_name == '("▔□▔)/':
        album_name = 'smile'
    start = 0
    if type == 'douban':
        while True:
            photos = pageSoup.findAll('div', {'class': 'photo_wrap'})
            if len(photos) > N:
                print 'warning on photo number!'
            for photo in photos:
                aTag = photo.find('a', {'class': "photolst_photo"})
                if not aTag:
                    continue
                name = aTag['title']
                url = photo.find('img')['src']
                url = url.replace('thumb', 'large')
                info.append((name, url))
            start += N
            if start > count:
                break
            page = getWebpage(homeUrl + '?start=' + str(start))
            pageSoup = BeautifulSoup(page)
    photos = homeSoup.findAll('span', {'class': "img"})
    if not photos:
        photos = homeSoup.findAll('a', {'class': "pic"})
    for photo in photos:
        img = photo.find('img')
        if not img:
            continue
        if not img.has_key('alt'):
            continue
        name = img['alt']
        if img.has_key('data-src'):
            url = img['data-src']
        else:
            url = img['src']
        url = url.replace('head', 'original')
        info.append((url, name))
    return (album_name, info)

def getSoup(url, encode='utf8', coo=''):
    page = getWebpage(url, cookies=coo)
    return BeautifulSoup(page.decode(encode, 'ignore'))

def grabQuestions(link=None, category_number=None, pn=0, reLoad=True):
    questions_to_return = []
    if link == None and category_number == None:
        raise Exception('no input')
    if category_number != None:
        newLink = header + clean(str(category_number))
        if pn != 0:
            newLink += '?pn=' + str(pn)
    if link != None and category_number != None and newLink != link:
        raise Exception('conflict input')
    if link == None:
        link = newLink
    page = getWebpage(link, reLoad=reLoad, dataDir='webpages')
    page = page.decode('gb2312', 'ignore')
    soup = BeautifulSoup(page, from_encoding="gb2312")
    questions = soup.findAll('tr', {'class': "qlist-tr"})
    for question in questions:
        number = question.find('td', {'class': 'quick-num'})
        number_ans = int(str(number.contents[0]))
        question = question.find('td', {'class': 'align-l'})
        cid = question['cid']
        qid = question['qid']
        qdesc = question['qdesc']
        qtitle = question['qtitle']
        if not cid.isdigit():
            print 'cid is not digit'
            print cid, qid, qdesc, qtitle
            continue
        else:
            cid = int(cid)
        if not qid.isdigit():
            print 'qid is not digit'
            print cid, qid, qdesc, qtitle
            continue
        else:
            qid = int(qid)
        qdesc = clean_sentence(qdesc)
        qtitle = clean_sentence(qtitle)
        #if qid!=499587072: continue
        content = qtitle + '\n' + qdesc
        content_no_space = no_space(content)
        email = re.search('[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]+', content_no_space)
        if email != None:
            # a bare QQ number implies a qq.com address
            email = email.group(0) + '@qq.com'
        if email == None:
            # no QQ number found: fall back to an explicit e-mail address
            email = re.search('\w+@\w+\.(com|cn)', content_no_space)
            if email != None:
                email = email.group(0)
        if email == None:
            email = ''
        s = u'《' + '.*' + u'》'
        s2 = '<<.*>>'
##        s=s.encode('gb2312','ignore')
##        print s
        title_separate = separate(qtitle)
        book_name = re.search(s, content_no_space)
        if book_name != None:
            book_name = book_name.group(0)[1:-1]
        if book_name == None:
            book_name = re.search(s2, content_no_space)
            if book_name != None:
                book_name = book_name.group(0)[2:-2]
            else:
                for i in xrange(len(title_separate)):
                    x = title_separate[i]
                    if u'求' in x:
                        ind = x.find(u'求')
                        if len(x) - ind >= 3:
                            book_name = x[ind + 1:]
                        elif i + 1 < len(title_separate):
                            book_name = title_separate[i + 1]
                        break
                    if u'有' in x:
                        ind = x.find(u'有')
                        if len(x) - ind >= 3:
                            book_name = x[ind + 1:]
                        elif i + 1 < len(title_separate):
                            book_name = title_separate[i + 1]
                        break
                    if 'txt' in x:
                        ind = x.find('txt')
                        if ind > 3:
                            book_name = x[:ind]
                        break
                    if 'TXT' in x:
                        ind = x.find('TXT')
                        if ind > 3:
                            book_name = x[:ind]
                        break
        for x in EndSign:
            if book_name == None:
                break
            if x in book_name:
                ind = book_name.find(x)
                book_name = book_name[:ind]
        for x in BeginSign:
            if book_name == None:
                break
            if x in book_name:
                ind = book_name.find(x)
                book_name = book_name[ind + len(x):]
        if book_name != None and '@' in book_name:
            book_name = None
        if book_name == None or book_name == '':
            book_name = title_separate[0]
        #print qid,int(number_ans),'[',qdesc,',',qtitle,']',email
        #printlist(title_separate)
        #print book_name
        #print '-'*30
        google_query = book_name
        if google_query in title_separate[0]:
            google_query = title_separate[0]
        questions_to_return.append((qid, int(number_ans), email, book_name,
                                    google_query, content_no_space))
    return questions_to_return

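# Self-contained sketch of the e-mail and book-title extraction used by
# grabQuestions, run on a fabricated question string (the ASCII '<<...>>'
# title form is used here so the example needs no source-encoding header).
import re

def _demo_extract_email_and_title():
    content_no_space = 'please send <<SomeNovel>> txt to 12345678'
    qq = re.search('[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]+', content_no_space)
    email = qq.group(0) + '@qq.com' if qq else ''
    m = re.search('<<.*>>', content_no_space)
    book_name = m.group(0)[2:-2] if m else ''
    print email       # -> 12345678@qq.com
    print book_name   # -> SomeNovel

_demo_extract_email_and_title()
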
'''
This program needs a wait time between requests; otherwise it may cause
problems with your renren id.
'''
from getWebpage import getWebpage
import re
import json, time
from sysPath import createFile

coo = 'anonymid=h9489u7u-yp0fqs; _r01_=1; mop_uniq_ckid=10.7.18.77_1355594994_642928755; _de=3F3126DBF672F298F26CBA88523C3AB26DEBB8C2103DE356; __utma=151146938.1762808405.1361533510.1361533510.1361533510.1; __utmz=151146938.1361533510.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); l4pager=0; depovince=GW; jebecookies=abb5a061-adf7-4276-9913-0059ed1553e6|||||; p=c506abb8c6dd441921166c4464e116341; ap=269496411; t=351ac721dd34d54a08268e46db838a211; societyguester=351ac721dd34d54a08268e46db838a211; id=269496411; xnsid=cacc7bc0; XNESSESSIONID=376bb17a6b26; at=1; loginfrom=null'
headpage = getWebpage(link='http://friend.renren.com/myfriendlistx.do',
                      cookies=coo)
r = re.search('var friends=(\[.*\]);', headpage)
friendList = r.group(1)
jf = json.loads(friendList)
ids = []
for f in jf:
    ids.append(f['id'])
createFile('infos.txt', force=True)
g = open('infos.txt', 'a')
g.write('Name,Given Name,Additional Name,Family Name,Yomi Name,Given Name Yomi,Additional Name Yomi,Family Name Yomi,Name Prefix,Name Suffix,Initials,Nickname,Short Name,Maiden Name,Birthday,Gender,Location,Billing Information,Directory Server,Mileage,Occupation,Hobby,Sensitivity,Priority,Subject,Notes,Group Membership,E-mail 1 - Type,E-mail 1 - Value,E-mail 2 - Type,E-mail 2 - Value,Phone 1 - Type,Phone 1 - Value' + '\n')
g.close()
count = 0
for id in ids[:]:
    timeSleep = 1
    count += 1
    #print count
    # the original call is truncated at this point; it is closed here with the
    # timeSleep set above (an assumption about the remaining arguments)
    mainInfo = getWebpage('http://www.renren.com/' + str(id) +
                          '/profile?v=info_ajax&undefined',
                          cookies=coo, timeSleep=timeSleep)

'''
This program needs a wait time between requests; otherwise it may cause
problems with your renren id.
'''
from getWebpage import getWebpage
try:
    from BeautifulSoup import BeautifulSoup, SoupStrainer
except:
    from bs4 import BeautifulSoup, SoupStrainer  # beta version of bs
import re
import json, time
from sysPath import createFile
import sys

coo = '''anonymid=h9489u7u-yp0fqs; _r01_=1; mop_uniq_ckid=10.7.18.77_1355594994_642928755; __utma=10481322.145044192.1363634540.1363634540.1363636668.2; __utmz=10481322.1363636668.2.2.utmcsr=renren.com|utmccn=(referral)|utmcmd=referral|utmcct=/269496411; _de=3F3126DBF672F298F26CBA88523C3AB26DEBB8C2103DE356; depovince=GW; bt_new=12; jebecookies=63880745-b57f-4dce-b75e-7cc2218be89a|||||; p=9babffa88c9c71f7219d11a49178460d1; ap=269496411; t=fa5d5d911dc472ebde86481e5486062e1; societyguester=fa5d5d911dc472ebde86481e5486062e1; id=269496411; xnsid=6ef4dee; loginfrom=null; feedType=269496411_hot; JSESSIONID=abcMqcp8dHsTAh3nle53t; l4pager=0'''
headpage = getWebpage(link='http://friend.renren.com/myfriendlistx.do',
                      cookies=coo)
r = re.search('var friends=(\[.*\]);', headpage)
friendList = r.group(1)
jf = json.loads(friendList)
ids = []
for f in jf:
    ids.append(f['id'])
if len(sys.argv) >= 2:
    start_num = int(sys.argv[1])
else:
    start_num = 0
timeSleep = 0.8
for id in ids[start_num:start_num + 100]:
    # the original call is truncated at this point; it is closed here with the
    # cookie and the timeSleep set above (an assumption about the remaining arguments)
    page = getWebpage('http://www.renren.com/' + str(id) + '/profile',
                      cookies=coo, timeSleep=timeSleep)

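# Self-contained sketch of how the friend list is pulled out of the friend-list
# page above: the page embeds a JavaScript "var friends=[...];" assignment whose
# value is valid JSON. The page text below is a fabricated stand-in.
import re
import json

def _demo_friend_list():
    headpage = 'var x=1; var friends=[{"id": 1001, "name": "a"}, {"id": 1002, "name": "b"}]; var y=2;'
    r = re.search('var friends=(\[.*\]);', headpage)
    jf = json.loads(r.group(1))
    ids = [f['id'] for f in jf]
    print ids   # -> [1001, 1002]

_demo_friend_list()
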
def fetch(head, nid):
    head += '-' + str(nid)
    headP = getWebpage(head + '.html')
    soup = BeautifulSoup(headP.decode('utf8', 'ignore'))
    n = soup.find('div', {'class': "pageNum1"})
    n = n.contents[0]
    r = re.match(u'共' + '(\d*)' + u'页', n)
    n = r.group(1)
    try:
        n = int(n)
    except:
        print 'failed to find the number of page'
        n = 1000
    ans = []
    for i in range(1, n + 1):
        page = head + '-' + str(i) + '.html'
        page = getWebpage(page)
        soup = BeautifulSoup(page)
        posts = soup.findAll('li', {'class': 'at c h2'})
        for post in posts:
            post = '\n'.join(map(clean, post.findAll(text=True)))
            if len(post) < 10:
                continue
            if reply(post):
                continue
            ans.append(post)
    g = open(str(nid) + '.txt', 'w')
    g.write('\n'.join(ans))
    g.close()
    return

def genTable(filename='../../testData/testingMonuments.txt',
             outfname='../../testData/testingMonumentsData_week4_all.csv',
             months=None, yearBegin=2009, yearEnd=2015, silent=True,
             endLine=None, testNow=False, country='en'):
    now = datetime.datetime.now()
    now = (int(now.year), int(now.month))
    if months == None:
        months = []
        for year in range(yearBegin, yearEnd):
            for month in range(1, 13):
                if (year, month) >= now:
                    break
                months.append(str(year) + '0' * (2 - len(str(month))) + str(month))
    months = map(str, months)
    filename = sysPath(filename)
    f = open(filename, 'r')
    links = f.read().splitlines()
    f.close()
    #soup=BeautifulSoup(links)
    titleLine = ['linkTitle']
    for month in months:
        titleLine.append('Img' + month)
        titleLine.append('Content' + month)
        titleLine.append('Traffic' + month)
    if not os.path.exists(outfname):
        outf = open(outfname, 'w')
        outf.write('\t'.join(titleLine) + '\n')
        start = 0
        outf.close()
    else:
        outf = open(outfname, 'r')
        start = len(outf.read().splitlines())
        outf.close()
    count = 0
##    for field in soup.findAll('a')[:endLine]:
    for linkTitle in links:
        index = linkTitle.find('/wiki/')
        if index != -1:
            linkTitle = linkTitle[index + 6:]
        count += 1
        if count < start:
            continue
##        if not field.has_key('title'): continue
##        linkTitle=field['href'][6:]
##        officialTitle=field['title']
        curLine = [linkTitle]
        for month in months:
            date = month + '01'
            revId = getRevId(linkTitle, date + '000000',  # 6 zeros for h,m,s
                             silent=silent, country=country)
            if not silent:
                print 'revId=', revId
            if revId == None:
                curLine += ['', '', '']
                continue
            link = 'http://' + country + '.wikipedia.org/w/index.php?oldid=' + revId
            if testNow:
                print 'title=', linkTitle, 'link=', link, 'month=', month
            if not silent:
                print 'prepare'
            page = getWebpage(link, timeSleep=0.5, silent=silent)
            if not silent:
                print 'got page'
            soup = BeautifulSoup(page)
            if not silent:
                print 'got soup'
            numImg = numImage(soup)
            if not silent:
                print 'got num'
            conLen = contentLen(soup)
            if not silent:
                print 'got len'
            traffic = str(getTraffic(linkTitle, month, silent=silent, country=country))
            if not silent:
                print 'got history'
            curLine += [numImg, conLen, traffic]
        curLine = map(str, curLine)
        outf = open(outfname, 'a')
        outf.write('\t'.join(curLine) + '\n')
        outf.close()

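# Self-contained sketch of the YYYYMM month-list construction used by genTable,
# with a fixed "now" so the output is deterministic.
def _demo_month_list(yearBegin=2009, yearEnd=2011, now=(2010, 3)):
    months = []
    for year in range(yearBegin, yearEnd):
        for month in range(1, 13):
            if (year, month) >= now:
                break
            months.append(str(year) + '0' * (2 - len(str(month))) + str(month))
    return months

print _demo_month_list()   # -> ['200901', ..., '200912', '201001', '201002']
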
# assumed imports, following the other scripts in this collection
import json
from getWebpage import getWebpage
from sysPath import sysPath, createFile
from bs4 import BeautifulSoup, SoupStrainer  # beta version of bs

coo = 'datr=1HSWUNG14Cr81JphyUZWTl2i; lu=gAff9sJJ2_wuev5W3zxFsGZA; sub=128; p=49; c_user=1216615221; csm=2; fr=0regP7HiBNucJQa1n.AWVfvGNhos7mlakT0e52olU2aWo.BQlnT_.nT.AWVtovRV; s=Aa7LrP8dIAOi4SoX; xs=3%3ArXa_AglvHBTByg%3A2%3A1352037631; act=1356128659553%2F6%3A2; presence=EM356128936EuserFA21216615221A2EstateFDsb2F0Et2F_5b_5dElm2FnullEuct2F135610056B0EtrFA2loadA2EtwF1698182903EatF1356128697024G356128936322CEchFDp_5f1216615221F8CC; wd=1280x299'
f = open(sysPath('webpages/ids.txt'))
jf = json.loads(f.read().decode('utf8', 'ignore'))
f.close()
createFile('infos_fb.txt', force=True)
g = open('infos_fb.txt', 'a')
g.write('Name,Given Name,Additional Name,Family Name,Yomi Name,Given Name Yomi,Additional Name Yomi,Family Name Yomi,Name Prefix,Name Suffix,Initials,Nickname,Short Name,Maiden Name,Birthday,Gender,Location,Billing Information,Directory Server,Mileage,Occupation,Hobby,Sensitivity,Priority,Subject,Notes,Group Membership,E-mail 1 - Type,E-mail 1 - Value,E-mail 2 - Type,E-mail 2 - Value,Phone 1 - Type,Phone 1 - Value' + '\n')
g.close()
ans = []
for f in jf['data']:
    info = getWebpage('http://www.facebook.com/' + str(f['id']),
                      cookies=coo, info=str(f['id']))
    bI = BeautifulSoup(info)
    link = bI.find('link', {'rel': 'alternate'})
    '''
    info=getWebpage(link['href']+'/info', cookies=coo, info=str(f['id']) )
    '''
    ind = link['href'].rfind('/')
    email = link['href'][ind + 1:]
    ans.append((f['name'], f['id'], email + '@facebook.com'))
    name = f['name']
    id = f['id']
    email = email + '@facebook.com'

def fetch(head, nid): head += "-" + str(nid) headP = getWebpage(head + ".html") soup = BeautifulSoup(headP.decode("utf8", "ignore")) n = soup.find("div", {"class": "pageNum1"}) n = n.contents[0] r = re.match(u"共" + "(\d*)" + u"页", n) n = r.group(1) try: n = int(n) except: print "failed to find the number of page" n = 1000 ans = [] for i in range(1, n + 1): page = head + "-" + str(i) + ".html" page = getWebpage(page) soup = BeautifulSoup(page) posts = soup.findAll("li", {"class": "at c h2"}) for post in posts: post = "\n".join(map(clean, post.findAll(text=True))) if len(post) < 10: continue if reply(post): continue ans.append(post) g = open(str(nid) + ".txt", "w") g.write("\n".join(ans)) g.close() return return chapters = extractLinks(link=link, requireLinkStart=link, avoidKeys=["img", "alt", "src"], requireLinkEnd=".html") print chapters content = getWebpage(link) soup = BeautifulSoup(content) paras = soup.findAll("div", {"class": "paragraph"}) intro = soup.find("div", {"class": "bookintro"}) book_name = soup.find("div", {"id": "book-cover"}).find("a")["title"] print "collecting: ", book_name f = open(book_name + ".txt", "w") f.write("intro: ") for y in intro.findAll(text=True): if y.encode("utf8", "ignore").strip() == "": continue f.write(y.encode("utf8", "ignore") + "\n") for x in paras: for y in x.findAll(text=True): f.write(y.encode("utf8", "ignore") + "\n") f.close() start = int(chapters[0]["href"][len(link) : -5]) end = int(chapters[-1]["href"][len(link) : -5]) + 20 chapterD = {} for x in chapters: num = int(x["href"][len(link) : -5]) title = x["title"] chapterD[num] = title count = 0 for i in range(start, end): chapter = link + str(i) + ".html" content = getWebpage(chapter) soup = BeautifulSoup(content) content = soup.find("div", {"id": "zoom"}) f = open(book_name + ".txt", "a") if i in chapterD: f.write("\n\n" + chapterD[i].encode("utf8", "ignore") + "\n") if content == None: continue for y in content.findAll(text=True): if y.encode("utf8", "ignore").strip() == "": continue f.write(y.encode("utf8", "ignore") + "\n") f.close() # if count>5:break count += 1
# NOTE: tail of a getDetails(detail) helper; the opening of the function is not
# included in this snippet.
        if name == 'servingsPerContainer':
            name = 'Servings Per Container'
        if name == 'servingSize':
            name = 'Serving Size'
        value = detail[x]
        if name[-1] == ':':
            name = name[:-1]
        if len(name) < 3:
            continue
        new_detail[name] = value
    for x in detail:
        name = x
        value = detail[x]
        if len(name.split()) > 1 and name[-1] in map(str, range(10)) + ['%', 'g']:
            value = clean(name.split()[-1])
            name = clean(' '.join(name.split()[:-1]))
        tmpname = None
        if name.endswith('Calories from Fat'):
            tmpname = name
            name = 'Calories from Fat'
        if not name in new_detail:
            new_detail[name] = value
        if tmpname != None:
            tmpname = tmpname.split()
            if tmpname[0] == 'Calories' and len(tmpname) > 1:
                value = tmpname[1]
                new_detail['Calories'] = value
    return new_detail


page = getWebpage('http://www.peapod.com/itemDetailView.jhtml?productId=155508',
                  dataDir='../../data/detail_pages')
getDetails(page)

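# Self-contained sketch of the name/value split used above for nutrition rows
# such as "Total Fat 10g", where the amount is glued onto the label. clean()
# is not shown in this snippet, so plain strip() stands in for it here.
def _demo_split_label(raw='Total Fat 10g'):
    name, value = raw, ''
    if len(name.split()) > 1 and name[-1] in map(str, range(10)) + ['%', 'g']:
        value = name.split()[-1].strip()
        name = ' '.join(name.split()[:-1]).strip()
    return name, value

print _demo_split_label()              # -> ('Total Fat', '10g')
print _demo_split_label('Sodium 5%')   # -> ('Sodium', '5%')
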