# ---- Example 1 ----
def book_spider(book_tag):
    """Scrape book entries from the hard-coded listing page and append a
    formatted summary of each to the module-level ``file_content`` buffer.

    Parameters:
        book_tag: label used as the section heading for this batch.

    Side effects:
        Performs an HTTP GET; mutates the global ``file_content`` string.
    """
    global file_content

    url = "http://bbs.csdn.net/topics/310046216"
    # Plain GET with no headers; some sites require a User-Agent in practice.
    source_code = requests.get(url)
    plain_text = source_code.text
    # Name the parser explicitly: bare BeautifulSoup(text) emits a warning
    # and silently picks whichever parser happens to be installed.
    soup = BeautifulSoup(plain_text, 'html.parser')

    # Section heading surrounded by two dashed divider lines.
    title_divide = '\n' + '--' * 30 + '\n' + '--' * 30 + '\n'
    file_content += title_divide + '\t' * 4 + \
            book_tag + ':' + title_divide
    count = 1
    for book_info in soup.findAll('div', {'class': 'info'}):
        # The title lives in the <a> tag whose onclick starts with "moreurl.
        title = book_info.findAll('a', {
            'onclick': re.compile(r"\"moreurl(.+)")})[0].get('title')

        pub = book_info.findAll('div', {'class': 'pub'})[0].string.strip()
        rating = book_info.findAll('span', {
            'class': 'rating_nums'})[0].string.strip()
        people_num = book_info.findAll('span', {
            'class': 'pl'})[0].string.strip()
        file_content += "*%d\t《%s》\t评分:%s%s\n\t%s\n\n" % (
                count, title, rating, people_num, pub)
        count += 1
# ---- Example 2 ----
def get_single_book_data(book_url):
    """Fetch a single book's detail page and print its rating.

    Parameters:
        book_url: URL of the book's detail page.
    """
    source_code = requests.get(book_url)
    plain_text = source_code.text
    # Explicit parser avoids the "no parser specified" warning.
    soup = BeautifulSoup(plain_text, 'html.parser')
    for rating in soup.findAll('p', {'class': 'rating_self clearfix'}):
        # The numeric rating lives in the <strong> child of the paragraph.
        print(rating.strong.string)
# ---- Example 3 ----
def fetch_tv_info(username, password):
    """Log in to iptv.bg and scrape the TV channel listing.

    Parameters:
        username, password: credentials forwarded to the module-level
            ``login`` helper.

    Returns:
        A list of dicts with keys 'name', 'logo', 'path', 'thumbnail',
        'info' (one per channel), or None when the login challenge cannot
        be found on the home page.

    Relies on module-level ``pattern``, ``get_home_page``, ``login`` and
    ``execute_opener`` helpers defined elsewhere in this file.
    """
    match = re.search(pattern, get_home_page())
    challenge = ""
    if match is not None:
        # The second capture group holds the login challenge token.
        challenge = match.group(2)
        print("challenge string: " + challenge)
        # Perform the login; the response body itself is not inspected.
        response = login(username, password, challenge)
        content = response.read()

        # Open the TV listing page with a fresh opener.
        opener = urllib2.build_opener()
        response = execute_opener(opener, "http://iptv.bg/watch")
        content = response.read()

        soup = BeautifulSoup(content, fromEncoding='utf-8')
        tvTags = soup.findAll(name='li', attrs={'class': 'listmode_tv'})

        tv_info = []

        for tag in tvTags:
            name = tag.find(name='div', attrs={'class': 'tv_info'}).find(name='b').getText()
            logo = tag.find(name='img').get('src', default='')
            # The last player_soft block carries the stream link.
            url = tag.findAll(
                name='div',
                attrs={'class': 'noprint player_soft'})[-1].find(name='a').get('href', default='')
            # NOTE: the original also computed an identical href into
            # `thumbnail` here, but it was unconditionally overwritten
            # below — that dead computation has been removed.
            info_tag = tag.find(name='div', attrs={'class': 'tv_info'})
            info = ''
            thumbnail = ''
            if info_tag is not None:
                thumbnail = info_tag.find(name='img').get('src', default='')
                detail_tag = info_tag.find(name='em').find(name='abbr')
                if detail_tag is not None:
                    info = detail_tag.get('title', default='Unknown')

            tv_info.append({'name': name, 'logo': logo, 'path': url,
                            'thumbnail': thumbnail, 'info': info})

        return tv_info
# ---- Example 4 ----
    def geturl(self, webpage, key=None):
        """Re-encode *webpage* from GBK to UTF-8, then push the href of
        every <a> tag onto the global ``dlLinksNext`` queue.

        Parameters:
            webpage: raw page bytes, assumed to be GBK-encoded — TODO
                confirm against the caller.
            key: optional filter; when given, only links whose full
                markup contains *key* are enqueued.
        """
        global dlLinksNext

        try:
            webpage = unicode(webpage, 'gbk').encode('utf-8')
            soup = BeautifulSoup(webpage)
            tagA = soup.findAll('a')

            for link in tagA:
                # No key means "take everything"; otherwise match the key
                # against the tag's string representation.
                if not key or key in str(link):
                    dlLinksNext.put(link.get('href'))

        except UnicodeDecodeError:
            self.loger.logInfo('UnicodeDecodeError')
        except UnicodeEncodeError:
            # BUG FIX: this handler previously logged the string
            # 'UnicodeDecodeError' (copy-paste error).
            self.loger.logInfo('UnicodeEncodeError')
# ---- Example 5 ----
import requests
# BUG FIX: the package is 'bs4' (Beautiful Soup 4), not 'bs3' — the
# BeautifulSoup(text, 'html.parser') API used below is the bs4 constructor.
from bs4 import BeautifulSoup

url = "https://www.yelp.com/sf"

yelp_r = requests.get(url)

# A 200 status confirms the page was fetched successfully.
print(yelp_r.status_code)

yelp_soup = BeautifulSoup(yelp_r.text, 'html.parser')

# Dump the full pretty-printed document for inspection.
print(yelp_soup.prettify())

# Show every anchor tag at once, then walk them one at a time.
print(yelp_soup.findAll('a'))

for link in yelp_soup.findAll('a'):
    print(link)