def fetch(head):
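    # Page through a Douban notes listing (?start=0,10,20,...), collect the
    # linked note ids, fetch each note's text and save everything to '<title>.txt'.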
    soup=getSoup(head)
    t=soup.find('h1')
    title=t.find(text=True)
    print title
    ans=[]
    for i in xrange(0,10000,10):
        link=head+'?start='+str(i)
        soup=getSoup(link)
        notes=[]
        for x in soup.findAll('span',{'class':'rec'}):
            if not x.has_key('id') : continue
            if  x['id'][:5]!='Note-': continue
            notes.append(x['id'][5:])
        if notes==[]: break
        for note in notes:
            soup=getSoup('http://www.douban.com/note/'+note)
            note_title=soup.find('title').find(text=True)
            article=soup.find('div',{'class':"note", 'id':"link-report"})
            content=note_title+'\n'.join(map(clean,article.findAll(text=True)))+'\n\n'
            ans.append(content)
 
    g=open(title+'.txt','w')
    g.write('\n'.join(ans))
    g.close()
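
# Note: these snippets come from different projects, and every one of them assumes
# a getSoup helper that is not shown on this page (some projects wrap the same
# logic in a getSoup module and call getSoup.getSoup(url)). Most call sites treat
# its return value as a parsed BeautifulSoup object (findAll, has_key) and pass
# optional encode= and coo= (cookie) keyword arguments; a minimal sketch under
# those assumptions (not the original implementation) is:
import urllib2
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3, matching findAll/has_key

def getSoup(url, encode='utf-8', coo=None):
    # fetch the page, optionally sending a Cookie header, and parse it
    opener = urllib2.build_opener()
    if coo:
        opener.addheaders.append(('Cookie', coo))
    html = opener.open(url).read().decode(encode, 'ignore')
    return BeautifulSoup(html)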
Example No. 2
def runoob():
    url = 'http://www.runoob.com/python3/python3-tutorial.html'
    soup = getSoup.getSoup(url)
    urls = soup.select('div#leftcolumn a')
    for url in urls:
        url = url.get('href')
        url = 'http://www.runoob.com' + url
        print(url)
        soup = getSoup.getSoup(url)
        articleBodies = soup.select('.article-intro')
        for articleBody in articleBodies:
            text = articleBody.get_text()
            saveDoc.saveDocs('runoob菜鸟教程.doc', text)
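
# saveDoc.saveDocs(filename, text) is another helper that is not shown here.
# From its usage above it appears to append a block of text to the named file;
# a hypothetical stand-in (an assumption, not the real module) could be:
def saveDocs(filename, text):
    # append the scraped text to the target document file
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(text + '\n')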
Example No. 3
def addDom(link, loc, year):
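    # `ans` is presumably a module-level result list; date(), data(), date2()
    # and data2() are parsing helpers defined elsewhere in the source script.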
    soup = getSoup(link)
    for x in soup.findAll('pre'):
        time = None
        time1 = None
        time2 = None
        x = str(x).split('\n')
        for y in x:
            y = y.strip()
            if date(y):
                time = y[1:-1]
            elif data(y):
                t1, t2, p1, p2 = data(y)
                ans.append(
                    [t1, p1 - p2, year, time, t2, loc, 'home', 'domestic'])
                ans.append(
                    [t2, p2 - p1, year, time, t1, loc, 'away', 'domestic'])
            if date2(y):
                time1, time2 = date2(y)
            elif data2(y):
                t1, t2, p1, p2, p3, p4 = data2(y)
                ans.append(
                    [t1, p1 - p2, year, time1, t2, loc, 'home', 'domestic'])
                ans.append(
                    [t2, p2 - p1, year, time1, t1, loc, 'away', 'domestic'])
                ans.append(
                    [t1, p3 - p4, year, time2, t2, loc, 'home', 'domestic'])
                ans.append(
                    [t2, p4 - p3, year, time2, t1, loc, 'away', 'domestic'])
Example No. 4
def addInt(link, year):
    soup = getSoup(link)
    for x in soup.findAll('pre'):
        time1 = None
        time2 = None
        x = str(x).split('\n')
        for y in x:
            y = y.strip()
            if date2(y):
                time1, time2 = date2(y)
            elif data3(y):
                t1, t2, p1, p2, p3, p4 = data3(y)
                ans.append([
                    t1, p1 - p2, year, time1, t2, 'international', 'home',
                    'international'
                ])
                ans.append([
                    t2, p2 - p1, year, time1, t1, 'international', 'away',
                    'international'
                ])
                ans.append([
                    t1, p3 - p4, year, time2, t2, 'international', 'home',
                    'international'
                ])
                ans.append([
                    t2, p4 - p3, year, time2, t1, 'international', 'away',
                    'international'
                ])
Example No. 5
def get_info(url):
    index_page=getSoup(url,encode='gbk')
    book_title=index_page.find('title').find(text=True)[:-10]
    yield book_title
    for ch in index_page.findAll('a'):
        if not ch.has_key('href') or not ch['href'].startswith(Ch_Start): continue 
        ch_url=ch['href'][20:-2]
        ch_title=title_clean(ch.find(text=True))
        yield ch_url,ch_title
Example No. 6
def makeExcel(sites):

    # Workbook() takes one required argument:
    # the filename of the workbook we want to create.
    workbook = xlsxwriter.Workbook('ShopifyContacts.xlsx')

    # The workbook object is then used to add a new
    # worksheet via the add_worksheet() method.
    worksheet = workbook.add_worksheet()

    row = 1
    worksheet.write('A' + str(row), 'Website')
    worksheet.write('B' + str(row), 'Email')
    worksheet.write('C' + str(row), 'Instagram')
    worksheet.write('D' + str(row), 'Facebook')
    worksheet.write('E' + str(row), 'Twitter')
    worksheet.write('F' + str(row), 'Youtube')
    worksheet.write('G' + str(row), 'Pinterest')

    i = 0
    while i < len(sites):
        row += 1
        worksheet.write('A' + str(row), sites[i])
        data = getSoup(sites[i])
        #data = removeDups(data)  #get rid of duplicate entries
        y = re.findall(r'[\w\.-]+@[\w\.-]+', data)  #Get all emails
        if y:
            worksheet.write('B' + str(row), y[0])
            print('---Email---')
            print(y[0])
        else:
            worksheet.write('B' + str(row), 'N/A')

        x = re.findall(r'(https?://[^\s]+)', data)  #Get all Urls
        x = removeDups(x)
        searchString(x, 'instagram.com', row, 'C', worksheet)
        searchString(x, 'facebook.com', row, 'D', worksheet)
        searchString(x, 'twitter.com', row, 'E', worksheet)
        searchString(x, 'youtube.com', row, 'F', worksheet)
        searchString(x, 'pinterest.com', row, 'G', worksheet)
        #if re.findall('instagram', x):
        # print('---Instagram---')
        #  print(x)
        #        y = re.findall('instagram', data)
        #        if y:
        #            worksheet.write('C' + str(row), y[0])
        #            print('---Instagram---')
        #            print(y[0])
        #        else:
        #            worksheet.write('C' + str(row), 'N/A')

        i += 1

    print('making excel file')

    workbook.close()
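
# removeDups and searchString are helpers assumed by makeExcel but not shown.
# Plausible stand-ins, inferred from the call sites and the commented-out
# fallback code above (write the first match into the given cell, else 'N/A'):
def removeDups(items):
    # drop duplicate entries while keeping the original order
    seen = set()
    return [x for x in items if not (x in seen or seen.add(x))]

def searchString(urls, domain, row, col, worksheet):
    # write the first URL containing `domain` into column `col` of this row
    matches = [u for u in urls if domain in u]
    worksheet.write(col + str(row), matches[0] if matches else 'N/A')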
Example No. 7
def get_info(url):
    index_page = getSoup(url, encode='gbk')
    book_title = index_page.find('title').find(text=True)[:-10]
    yield book_title
    for ch in index_page.findAll('a'):
        if not ch.has_key('href') or not ch['href'].startswith(Ch_Start):
            continue
        ch_url = ch['href'][20:-2]
        ch_title = title_clean(ch.find(text=True))
        yield ch_url, ch_title
Example No. 8
def getplaylistids(url):
    soup = getSoup.getSoup(url)
    # print(soup)
    playlistids = []
    playlists = soup.select('a')
    print(playlists)
    for playlist in playlists:
        playlist = playlist.get('href')
        playlistids.append(playlist)
    return playlistids
Example No. 9
def fetchNote((filename,info)):
    ans=[]
    for url, name in info:
        soup=getSoup(url)
        note=soup.find('div',{'class':'note-content'})
        if not note:
            note=soup.find('div',{'class':'note','id':"link-report"})
        ans.append(soupToTxt(note,title=name))
    f=open(filename+'.txt','w')
    f.write(('\n\n\n\n'+'-'*30+'\n\n').join(ans))
    f.close()
Example No. 10
def fetchNote((filename, info)):
    ans = []
    for url, name in info:
        soup = getSoup(url)
        note = soup.find('div', {'class': 'note-content'})
        if not note:
            note = soup.find('div', {'class': 'note', 'id': "link-report"})
        ans.append(soupToTxt(note, title=name))
    f = open(filename + '.txt', 'w')
    f.write(('\n\n\n\n' + '-' * 30 + '\n\n').join(ans))
    f.close()
Example No. 11
def fetchInfo(url):
    filename=''
    info=[]
    for n in xrange(N):        
        soup=getSoup(url+'?start='+str(M*n))
        if not filename: filename=soup.find('title').find(text=True)
        titles=soup.findAll('div',{'class':"title"})
        if not titles: break # no more
        for title in titles:
            link=title.find('a')
            info.append((link['href'],link.find(text=True).strip()))
    return (filename,info)
Example No. 12
def fetchInfo(url):
    filename = ''
    info = []
    for n in xrange(N):
        soup = getSoup(url + '?start=' + str(M * n))
        if not filename: filename = soup.find('title').find(text=True)
        titles = soup.findAll('div', {'class': "title"})
        if not titles: break  # no more
        for title in titles:
            link = title.find('a')
            info.append((link['href'], link.find(text=True).strip()))
    return (filename, info)
Example No. 13
def fetchInfo(url):
    soup = getSoup(url)
    title = soup.find('title').find(text=True)
    info = []
    for img_field in soup.findAll('img'):
        if img_field.has_key('src') and img_field.has_key('alt'):
            name = img_field['alt']
            if len(name) > 13 or len(name) < 8: continue
            name = img_field.findParent('p').findPreviousSibling('p')
            name = ''.join(name.findAll(text=True))
            info.append((name, img_field['src']))
    return (title, info)
Example No. 14
def fetchInfo(url):
    soup=getSoup(url)
    title=soup.find('title').find(text=True) 
    info=[]
    for img_field in soup.findAll('img'):
        if img_field.has_key('src') and img_field.has_key('alt'):
            name=img_field['alt']
            if len(name)>13 or len(name)<8: continue
            name=img_field.findParent('p').findPreviousSibling('p')
            name=''.join(name.findAll(text=True))
            info.append((name,img_field['src']))
    return (title, info)
Example No. 15
def fetch_book(info):
    book_title=info.next()
    old_title=''
    content=[]
    for url,ch_title in info:
        if ch_title==old_title:
            ch_title=''
        else:
            old_title=ch_title
        ch=getSoup(url,encode='gbk').find('div',{'id':"content"})
        content.append(soupToTxt(ch,title=ch_title))
    f=open(book_title+'.txt','w')
    f.write(''.join(content))
    f.close()
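
# fetch_book is apparently meant to consume the generator produced by get_info
# above: the first value yielded is the book title, the rest are
# (chapter url, chapter title) pairs. A hypothetical call (placeholder URL):
# fetch_book(get_info('http://example.com/book/index.html'))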
Example No. 16
def getMp3Info(albumid):
    url = 'http://www.kugou.com/yy/album/single/' + str(albumid) + '.html'
    soup = getSoup.getSoup(url)
    hashs = soup.select('.songList a')
    loadMp3Hash = []
    for hashss in hashs:
        hash = hashss.get('data')
        # split the string on '|' to get the hash
        mp3Hash = hash.split('|')[0]
        # print(hash.split('|')[0])
        # hash = hash.split('|')
        loadMp3Hash.append(mp3Hash)
        # print(mp3Hash)
    return loadMp3Hash
Example No. 17
def fetch_book(info):
    book_title = info.next()
    old_title = ''
    content = []
    for url, ch_title in info:
        if ch_title == old_title:
            ch_title = ''
        else:
            old_title = ch_title
        ch = getSoup(url, encode='gbk').find('div', {'id': "content"})
        content.append(soupToTxt(ch, title=ch_title))
    f = open(book_title + '.txt', 'w')
    f.write(''.join(content))
    f.close()
Example No. 18
def fetchInfo(homeUrl, type=None):
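    # Scrape what looks like a Douban photo album: read the photo count, page
    # through the album collecting photo names and large-image URLs, then also
    # map any avatar/cover thumbnails to their 'original' size. getWebpage and
    # the page-size constant N are defined elsewhere in the source script.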
    homeSoup = getSoup(homeUrl)
    pageSoup = homeSoup
    info = []
    count = homeSoup.find('span', {'class': 'count'})
    if count:
        count = count.find(text=True)[2:-2]
        count = int(count)
    else:
        count = N  # only one page
    ind = len(homeSoup.findAll('h1')) - 1
    if ind > 1: ind = 1
    album_name = homeSoup.findAll('h1')[ind].find(text=True)
    if '-' in album_name:
        album_name = album_name.split('-')[1]
    album_name = album_name.replace("*", '')
    album_name = album_name.replace("/", '')
    album_name = album_name.split()[0]
    start = 0
    while True:
        photos = pageSoup.findAll('div', {'class': 'photo_wrap'})
        if len(photos) > N: print 'warning on photo number!'
        for photo in photos:
            aTag = photo.find('a', {'class': "photolst_photo"})
            if not aTag: continue
            name = aTag['title']
            url = photo.find('img')['src']
            url = url.replace('thumb', 'large')
            info.append((name, url))
        start += N
        if start > count: break
        page = getWebpage(homeUrl + '?start=' + str(start))
        pageSoup = BeautifulSoup(page)
    photos = homeSoup.findAll('span', {'class': "img"})
    if not photos:
        photos = homeSoup.findAll('a', {'class': "pic"})
    for photo in photos:
        img = photo.find('img')
        if not img: continue
        if not img.has_key('alt'): continue
        name = img['alt']
        if img.has_key('data-src'):
            url = img['data-src']
        else:
            url = img['src']
        url = url.replace('head', 'original')
        info.append((url, name))
    return (album_name, info)
Example No. 19
def fetchInfo(url):
    soup = getSoup(url, coo=coo)
    title = soup.find('title').find(text=True).split()[-1]
    info = []
    for img_field in soup.findAll('a', {'class': "pic"}):
        img = img_field.find('img')
        if not img: continue
        if not img.has_key('alt'): continue
        name = img['alt']
        if img.has_key('data-src'):
            url = img['data-src']
        else:
            url = img['src']
        url = url.replace('head', 'original')
        info.append((name, url))
    return (title, info)
Example No. 20
def addInt(link,year):
    soup=getSoup(link) 
    for x in soup.findAll('pre'):
        time1=None
        time2=None
        x=str(x).split('\n')
        for y in x:
            y=y.strip()              
            if date2(y):
                time1,time2=date2(y)
            elif data3(y):
                t1,t2,p1,p2,p3,p4=data3(y)
                ans.append([t1,p1-p2,year,time1,t2,'international','home','international'])
                ans.append([t2,p2-p1,year,time1,t1,'international','away','international'])
                ans.append([t1,p3-p4,year,time2,t2,'international','home','international'])
                ans.append([t2,p4-p3,year,time2,t1,'international','away','international'])
Example No. 21
def checkMessage():
    global last_check_time
    update=False
    page=getSoup(board_url,coo=coo)
    for message in page.findAll('li',{'class':"mbtrdot comment-item"}):
        pname_url=message.find('a')['href']
        pname=message.find('a').find(text=True)
        ptext=message.findAll(text=True)[1][1:].strip()
        ptime=message.find('span',{'class':'pl'}).find(text=True)
        t=parser.parse(ptime) 
        t=(t-datetime.datetime(1970,1,1)).total_seconds()
        if t<last_check_time or (('by' in ptext) and (u'小黄鸡' in ptext)): return
        last_check_time=t        
        reply(pname,pname_url,ptext)
        update=True
    return update
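
# checkMessage relies on module-level state that is not shown (board_url, coo,
# last_check_time and a reply() helper). The date handling suggests imports
# along these lines (an assumption):
import datetime
from dateutil import parser  # provides parser.parse(ptime)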
Example No. 22
def fetchInfo(url):
    soup=getSoup(url,coo=coo)
    title=soup.find('title').find(text=True).split()[-1] 
    info=[]
    for img_field in soup.findAll('a',{'class':"pic"}):
        img=img_field.find('img')
        if not img: continue
        if not img.has_key('alt'): continue
        name=img['alt']
        if img.has_key('data-src'): 
            url=img['data-src']
        else:
            url=img['src']
        url=url.replace('head','original')
        info.append((name,url))
    return (title, info)
Example No. 23
def getBook(n):
    bookstore = []
    for i in range(1, 6):
        url = 'http://book.jd.com/booktop/0-0-0.html?category=3287-0-0-0-10003-' + str(
            i) + '#comfort'
        print(url)
        soup = getSoup.getSoup(url)
        books = soup.select('a.p-name')
        # print(books)
        for book in books:
            title = book.get("title")
            # title = book.get_text
            # print(title)
            bookstore.append(title.strip())
    # print(bookstore)
    return bookstore
Example No. 24
def checkContact():
    page=getSoup(contact_url,coo=coo)
    contacts=page.findAll('li',{'class':'clearfix'})
    update=False
    for contact in contacts:
        name=contact.find('a')['href']
        if name=='http://www.douban.com/people/39500150/': break
        if contact.find('span',{'class':"user-cs"}): continue
        page_info = urllib2.build_opener()
        page_info.addheaders.append(('Cookie', coo))
        postData='ck='+ck+'&people='+contact['id'][1:]
        postData=postData.encode('utf8')
        req = urllib2.Request(addcontact_url, postData)
        page_info.open(req)
        postData='ck='+ck+'&tag=195082&people='+contact['id'][1:]
        postData=postData.encode('utf8')
        req = urllib2.Request(addtotag_url, postData)
        page_info.open(req)
        update=True
    return update
Example No. 25
def addDom(link,loc,year):
    soup=getSoup(link)    
    for x in soup.findAll('pre'):
        time=None
        time1=None
        time2=None
        x=str(x).split('\n')
        for y in x:
            y=y.strip()
            if date(y):
                time=y[1:-1]
            elif data(y):
                t1,t2,p1,p2=data(y)
                ans.append([t1,p1-p2,year,time,t2,loc,'home','domestic'])
                ans.append([t2,p2-p1,year,time,t1,loc,'away','domestic'])                
            if date2(y):
                time1,time2=date2(y)
            elif data2(y):
                t1,t2,p1,p2,p3,p4=data2(y)
                ans.append([t1,p1-p2,year,time1,t2,loc,'home','domestic'])
                ans.append([t2,p2-p1,year,time1,t1,loc,'away','domestic'])
                ans.append([t1,p3-p4,year,time2,t2,loc,'home','domestic'])
                ans.append([t2,p4-p3,year,time2,t1,loc,'away','domestic'])
Example No. 26
                    [t2, p2 - p1, year, time, t1, loc, 'away', 'domestic'])
            if date2(y):
                time1, time2 = date2(y)
            elif data2(y):
                t1, t2, p1, p2, p3, p4 = data2(y)
                ans.append(
                    [t1, p1 - p2, year, time1, t2, loc, 'home', 'domestic'])
                ans.append(
                    [t2, p2 - p1, year, time1, t1, loc, 'away', 'domestic'])
                ans.append(
                    [t1, p3 - p4, year, time2, t2, loc, 'home', 'domestic'])
                ans.append(
                    [t2, p4 - p3, year, time2, t1, loc, 'away', 'domestic'])


domPage = getSoup('http://www.rsssf.com/resultsp00.html')
for loc in ['England', 'Italy', 'Spain']:
    for x in domPage.findAll('a'):
        text = x.find(text=True)
        text = text.split()
        if len(text) != 2: continue
        if text[0] != loc: continue
        link = 'http://www.rsssf.com/' + x['href']
        addDom(link, loc, text[1])


def addInt(link, year):
    soup = getSoup(link)
    for x in soup.findAll('pre'):
        time1 = None
        time2 = None
Example No. 27
# database connection
connectDB = connect_dataBase.ConnectDatabase()
get_conf = connectDB.get_conf('databases_conf.json')
conn, cur = connectDB.connect_db(get_conf["brazilCup"]["host"],
                                 get_conf["brazilCup"]["user"],
                                 get_conf["brazilCup"]["password"],
                                 get_conf["brazilCup"]["database"],
                                 get_conf["brazilCup"]["port"])

# url = 'http://worldcup.2014.163.com/playerrank/total/attPenGoal/' # player totals
# url = 'http://worldcup.2014.163.com/playerrank/avg/attPenGoal/'  # player per-match averages
# url = 'http://worldcup.2014.163.com/teamrank/total/goals/'  # national team totals
url = 'http://worldcup.2014.163.com/teamrank/avg/goals/'  # national team per-match averages

soup = getSoup.getSoup(url)
trs = soup.select('tbody tr')
# print(tds)
length = len(trs)
# print(length)
players = []
for tr in trs:
    # print(row)
    player = []
    # print(len(tr))
    for td in tr:
        # format the value as a quoted SQL string
        tds = '\'' + str(td.string.strip()) + '\''
        # print(tds)
        # player.append(str(td.string.strip()))
        # if '' in player:
Example No. 28
import json
import getSoup
import saveDoc
# times = time.strftime('%m%d')
# def getArticle():
# urls = open('urls.json', 'r', encoding='utf-8')
# print(urls)
with open('urls.json', 'r', encoding='utf-8') as f:
    urls = json.load(f)
    # print(urls['sites'])
    urls = urls['sites']
    # print(urls)
    ii = 1
for url in urls:
    url = url['url'].strip()
    # print(url)
    soup = getSoup.getSoup(url)
    # file = time.strftime('%m%d')+str(ii)+'.doc'
    # print(file)
    if url == 'http://www.cnblogs.com/':
        article = soup.select('#editor_pick_lnk')
        # for article in article:
        #     articleUrl = article.get('href')
        #     print(articleUrl)
        # # articleReponse = requests.get(articleUrl)
        # # articleReponse.raise_for_status()
        # articleSoup = getSoup.getSoup(articleUrl)
        # articleTitle = articleSoup.select('#cb_post_title_url')
        # for articleTitle in articleTitle:
        #     articleTitle = articleTitle.get_text()
        #     print(articleTitle)
        #     file = articleTitle + '.doc'
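
# The loop above expects urls.json to hold a top-level "sites" list of objects
# with a "url" field. A hypothetical seed file with that shape (the URLs are
# placeholders taken from elsewhere on this page) could be written like this:
import json

seed = {"sites": [{"url": "http://www.cnblogs.com/"},
                  {"url": "http://www.runoob.com/"}]}
with open('urls.json', 'w', encoding='utf-8') as f:
    json.dump(seed, f, ensure_ascii=False, indent=2)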
Example No. 29
            elif data(y):
                t1,t2,p1,p2=data(y)
                ans.append([t1,p1-p2,year,time,t2,loc,'home','domestic'])
                ans.append([t2,p2-p1,year,time,t1,loc,'away','domestic'])                
            if date2(y):
                time1,time2=date2(y)
            elif data2(y):
                t1,t2,p1,p2,p3,p4=data2(y)
                ans.append([t1,p1-p2,year,time1,t2,loc,'home','domestic'])
                ans.append([t2,p2-p1,year,time1,t1,loc,'away','domestic'])
                ans.append([t1,p3-p4,year,time2,t2,loc,'home','domestic'])
                ans.append([t2,p4-p3,year,time2,t1,loc,'away','domestic'])
                


domPage=getSoup('http://www.rsssf.com/resultsp00.html')
for loc in ['England','Italy','Spain']:
    for x in domPage.findAll('a'):
        text=x.find(text=True)
        text=text.split()
        if len(text)!=2: continue
        if text[0]!=loc: continue
        link='http://www.rsssf.com/'+x['href']
        addDom(link,loc,text[1])
        
      
def addInt(link,year):
    soup=getSoup(link) 
    for x in soup.findAll('pre'):
        time1=None
        time2=None