Example #1
def fetchNovel(link, rLS=None):
    # Default the required link prefix to the index page itself.
    if rLS is None:
        rLS = link
    # extractLinks and getWebpage are helpers from the surrounding project;
    # BeautifulSoup here is the legacy BeautifulSoup 3 API (findAll, etc.).
    chapters = extractLinks(link=link, requireLinkStart=rLS,
                            avoidKeys=['img', 'alt', 'src'],
                            requireLinkEnd='.html')
    content = getWebpage(link)
    soup = BeautifulSoup(content)
    # The page <title> is "<book name>_...": keep the part before the underscore.
    book_name = ''.join(soup.find('title').contents)
    book_name = book_name.split('_')[0]
    print 'collecting: ', book_name
    # Truncate any previous output file, then append chapter by chapter.
    f = open(book_name + '.txt', 'w')
    f.close()
    count = 0
    for x in chapters:
        chapter = 'http://data.book.163.com' + x['href']
        content = getWebpage(chapter)
        soup = BeautifulSoup(content)
        content = soup.find('div', {'class': 'bk-article-body',
                                    'id': 'bk-article-body'})
        f = open(book_name + '.txt', 'a')
        try:
            title = ''.join(x.contents).encode('GBK', 'ignore')
        except Exception:
            title = ''
        # Write the chapter heading once per chapter: skip "(2)", "(3)", ...
        # continuation pages but keep the first page "(1)".
        if title != '' and (title[-1] != ')' or title[-3:] == '(1)'):
            f.write('\n\n' + title + '\n')
        if content is None:  # article body missing; close the file and skip
            f.close()
            continue
        for y in content.findAll(text=True):
            if y.encode('GBK', 'ignore').strip() == '':
                continue
            f.write(y.encode('GBK', 'ignore') + '\n')
        f.close()
        #if count > 5: break  # uncomment to limit output while testing
        count += 1
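For reference, a hypothetical call for this 163.com variant. The URL below is a placeholder (only the data.book.163.com host appears in the original code), and extractLinks/getWebpage are assumed to be importable from the surrounding project:

# Hypothetical usage (Python 2). The URL is a placeholder for a book index
# page whose chapter links end in '.html'; rLS defaults to the index URL.
fetchNovel('http://data.book.163.com/book/home/0000000.html')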
Example #2
def fetchNovel(link):
    # extractLinks and getWebpage are helpers from the surrounding project;
    # BeautifulSoup here is the legacy BeautifulSoup 3 API (findAll, etc.).
    chapters = extractLinks(link=link,
                            requireLinkStart=link,
                            avoidKeys=['img', 'alt', 'src'],
                            requireLinkEnd='.html')
    content = getWebpage(link)
    soup = BeautifulSoup(content)
    paras = soup.findAll('div', {'class': 'paragraph'})
    intro = soup.find('div', {'class': 'bookintro'})
    book_name = soup.find('div', {'id': 'book-cover'}).find('a')['title']
    print 'collecting: ', book_name
    # Write the book introduction first, then the cover-page paragraphs.
    f = open(book_name + '.txt', 'w')
    f.write('intro: ')
    for y in intro.findAll(text=True):
        if y.encode('GBK', 'ignore').strip() == '':
            continue
        f.write(y.encode('GBK', 'ignore') + '\n')
    for x in paras:
        for y in x.findAll(text=True):
            f.write(y.encode('GBK', 'ignore') + '\n')
    f.close()
    # Chapter pages are numbered "<link><num>.html" ('.html' is 5 chars, hence
    # the -5 slice). Walk from the first listed number to the last, plus 20
    # extra pages to catch continuation pages not listed in the index.
    start = int(chapters[0]['href'][len(link):-5])
    end = int(chapters[-1]['href'][len(link):-5]) + 20
    chapterD = {}
    for x in chapters:
        num = int(x['href'][len(link):-5])
        title = x['title']
        chapterD[num] = title
    count = 0
    for i in range(start, end):
        chapter = link + str(i) + '.html'
        content = getWebpage(chapter)
        soup = BeautifulSoup(content)
        content = soup.find('div', {'id': 'zoom'})
        f = open(book_name + '.txt', 'a')
        if i in chapterD:
            f.write('\n\n' + chapterD[i].encode('GBK', 'ignore') + '\n')
        if content is None:  # page missing; close the file before skipping
            f.close()
            continue
        for y in content.findAll(text=True):
            if y.encode('GBK', 'ignore').strip() == '':
                continue
            f.write(y.encode('GBK', 'ignore') + '\n')
        f.close()
        #if count > 5: break  # uncomment to limit output while testing
        count += 1
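A sketch of calling this variant. It assumes a site whose chapter pages sit directly under the index URL as "<link><num>.html"; the URL is illustrative only, not from the original code:

# Hypothetical usage (Python 2). Chapter pages are fetched as
# link + str(i) + '.html', so the argument must be the common URL prefix
# of the chapter pages.
fetchNovel('http://book.example.com/novel/')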