Example #1
def process_folder(filename):
    print >> sys.stderr, 'Processing %s' % filename
    with open(filename, 'r') as f:
        soup = BeautifulSoup(f.read())

    list_name = soup.title(text=True)[0]
    task_elems = soup.find_all("li", class_="task")
    print >> sys.stderr, 'Found list {0} with {1} tasks'.format(list_name, len(task_elems))

    tasks = []
    for el in task_elems:
        task = {
            'title': list(el.find('a', class_='body').children)[0].strip(),
            'completed': 'completed' in el['class'],
            'created-by': detail(el, 'Created by'),
            'assigned-to': detail(el, 'Assigned to'),
            'created-on': detail(el, 'Created on', dateparser.parse),
            'completed-on': detail(el, 'Completed on', dateparser.parse, True),
            'followers': detail(el, 'Followers'),
            'activities': []
        }
        tasks.append(task)
        #completed_at = ac.find(class_='completed-at'.text.strip())
        #if completed_at:
        #    task['completed_at'] = dateparser.parse(completed_at)

        for activity_el in el.findAll('li', class_='activity'):
            task['activities'].append({
                'summary': activity_el.find(class_='summary').text.strip(),
                'detail': text(activity_el.find(class_='activity-detail')),
                'date': activity_el.find(class_='date').text.strip()
            })

    return (list_name, tasks)
Example #2
def s_loader(url, check):

    # suspended(url)

    start_time = time.time()
    urldata = urllib2.urlopen(url, timeout=25).read()
    load_time = time.time() - start_time

    #print "url: "+ url +  " load %f " %(load_time)

    if check == 1 and load_time > 5:
        time.sleep(5)
        #print url + " step 1 > 4:  load_time :  %f" %  (load_time)
        load_time2 = s_loader(url, 0)
    else:
        load_time2 = load_time

    if check == 1 and load_time2 > 5:
        time.sleep(5)
        #print url + " step 2 > 4:  load_time2 : %f" % (load_time2)
        load_time3 = s_loader(url, 0)
    else:
        load_time3 = load_time2

    soup = BeautifulSoup(urldata, "html5lib")
    title = soup.title.string
    image_count = len(soup.find_all('img'))
    # no images on the page, or the host reports the account as suspended
    if image_count == 0 or title == "Account Suspended":
        #     if soup.title() != "503 Service Temporarily Unavailable":
        print title
        #            suspended(urldata)
        load_time = 999.9
        return load_time
    else:
        print "OK it's Working"
        return (load_time + load_time2 + load_time3) / 3
Example #3
def process_one_list_file(filename):
    stderr(u'Processing %s', filename)
    with open(filename, 'r') as f:
        soup = BeautifulSoup(f.read())

    list_name = soup.title(text=True)[0]
    task_elems = soup.find_all("li", class_="task")
    stderr(u'Found list {0} with {1} tasks'.format(list_name, len(task_elems)))

    tasks = []
    for el in task_elems:
        task = {
            'title': list(el.find('a', class_='body').children)[0].strip(),
            'completed': 'completed' in el.find('a', class_='body')['class'],
            'assigned-to': detail(el, 'Assigned to'),
            # 'created-on': detail(el, 'Created on', dateparser.parse),
            # 'completed-on': detail(el, 'Completed on', dateparser.parse, True),
            'subscribers': detail(el, 'Subscribers'),
            'activities': [],
            #'due-on': None,
            #'subtasks': []
        }
        tasks.append(task)

        if el.find('span', class_="due-on"):
            task['due-on'] = dateparser.parse(el.find('span', class_="due-on").text.strip(u'— '))
        #
        for subtask in el.findAll('li', class_='subtask'):
            task.setdefault('subtasks', []).append(subtask.text.strip())

        for activity_el in el.findAll('li', class_='activity'):
            activity = {
                'summary': activity_el.find(class_='summary').text.strip(),
                'date': dateparser.parse(activity_el.find(class_='date').text.strip())
            }

            detail_el = activity_el.find(class_='activity-detail')
            if detail_el:
                detail_html = "".join([str(x) for x in detail_el.contents])
                detail_html = detail_html.decode('utf-8')
                activity['detail'] = detail_html
                activity['detail_plain'] = html2text.html2text(detail_html)

            if detail_el and 'comment' in detail_el['class']:
                activity['is_comment'] = True
            task['activities'].append(activity)

        task['created-at'] = \
            activity_log(task['activities'], 'created this task')

    return (list_name, tasks)
Example #5
    def find(self) -> str:
        bs = BeautifulSoup(urlopen(self.url).read(), 'lxml')
        full_content = bs.title(string=True)[0]
        spl: List[str] = full_content.split(' ')

        rank: List[str] = []

        for x in spl:
            if x in self.ranks.keys():
                print(self.ranks.get(x))
                rank.append(str(self.ranks.get(x)))
            if x.isnumeric():
                rank.append(x)
                break
        return ' '.join(rank)
Example #7
"""
import requests
from bs4 import BeautifulSoup

#attempting to extract the HTML using the request module
import urllib.request, re
response = urllib.request.urlopen('https://danpeluso.wordpress.com/')
html = response.read()
#trying again with high school website for number of times town is said
# google says this is 15

response2 = urllib.request.urlopen('https://www.yourhtmlsource.com/')
html2 = response2.read()
#parse the page with Beautiful Soup
soup = BeautifulSoup(html, 'html.parser')
print(soup.title())
for link in soup.find_all('a'):
    print(link.get('href'))
#print(soup.get_text())

soup2 = BeautifulSoup(html2, 'html.parser')
numsims = soup2.get_text().lower().count('html')


#function for finding number of instances of one word (why not lol)
# string -> number
def dans(txt):
    for i in range(len(txt) - 2):
        # ('P' or 'p') always evaluates to 'P', so check both letters explicitly
        if txt[i] in ('P', 'p') and txt[i + 1] == 'r' and txt[i + 2] == 'o':
            return 1 + dans(txt[i + 3:])
    return 0  # base case: no further match found
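
A quick check of the corrected helper on a made-up input string:

print(dans("Pro tips for Python pros"))  # 2 -- "Pro" in "Pro tips" and "pro" in "pros"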
Example #8
# basically, content is a property of the requests response object; the response object is used to access features such as content, headers, etc.
#print("\n\n\n\n\n\n\n\n\n###############################################################################################################################\n\n\n\n\n\n\n\n\n\n")
#It can parse almost all the elements of an HTML doc,
# breaking it down into different tags and pieces which can be filtered out for various use cases.

# Parse the HTML

soup = BeautifulSoup(r.content, "html5lib")
#Beautiful Soup is a Python package for parsing HTML(hypertext markup language) and XML(extensible markup language) documents
# (including having malformed markup,
# i.e. non-closed tags, so named after tag soup). It creates a parse tree for parsed pages that can be used
# to extract data from HTML, which is useful for web scraping.
print(soup.prettify())
#Prettify() function in BeautifulSoup will enable us to view how the tags are nested in the document.
#print("\n\n\n\n\n\n\n\n\n###############################################################################################################################\n\n\n\n\n\n\n\n\n\n")
title = soup.title()
# commonly used types of objects in Beautiful Soup:
# print(type(title))# 1 = tag
# print(type(title.string))# 2 = NavigableString
# print(type(soup))# 3 = BeautifulSoup
# 4 = Comment
# markup = "<p><!----this is a comment ----></p>"
# soup2 = BeautifulSoup(markup)
# print(soup2.p.string)

#print(title)
# get all the paragraph from website
#paras = soup.find_all('p')
#print(paras)
# get all the anchor tags from website
#anchor = soup.find_all('a')
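
A minimal self-contained sketch of the four object types listed in the comments above, reusing the commented-out comment markup (html.parser assumed so no extra parser is required):

from bs4 import BeautifulSoup

markup = "<p><!--this is a comment--><b>bold text</b></p>"
demo = BeautifulSoup(markup, "html.parser")
print(type(demo.p))              # 1 = Tag
print(type(demo.b.string))       # 2 = NavigableString
print(type(demo))                # 3 = BeautifulSoup
print(type(demo.p.contents[0]))  # 4 = Comment (a subclass of NavigableString)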
Example #9
    def execute_crawler(self, keywords, url):
        # ToDo: when several reporters are mentioned, an email can get matched to the wrong reporter's name. Needs fixing.
        self.log.debug("search keywords - {0}".format(keywords))
        self.log.debug("target url - {0}".format(url))

        for idx, (code, keyword, business_code, business) in tqdm(enumerate(keywords), total=len(keywords)):
            page_num = 1
            search_url = url.format(keyword)
            self.log.debug("search URL - {0}".format(search_url))

            self.driver.get(search_url)
            tmp_df = pd.DataFrame(
                columns=['title', 'link', 'press', 'date', 'reporter', 'email', 'article', 'search_keyword', 'company',
                         'company_code', 'business_code', 'business']
            )

            while page_num <= 10:
                html = self.driver.page_source
                soup = BeautifulSoup(html, 'html.parser')
                for urls in soup.select("a.info"):
                    if urls["href"].startswith("https://news.naver.com"):
                        self.log.debug("Get URL - {0}".format(urls['href']))
                        self.driver.get(urls["href"])
                        news_html = self.driver.page_source
                        news_html_soup = BeautifulSoup(news_html, 'html.parser')

                        tmp_title = news_html_soup.title(string=True)
                        tmp_date = news_html_soup.select('.t11')
                        tmp_article = news_html_soup.select('#articleBodyContents')
                        tmp_press = news_html_soup.select('#footer address')

                        title = tmp_title[0].replace(" : 네이버 뉴스", "")
                        if len(tmp_date) == 0:
                            tmp_date = news_html_soup.select('.article_info')[0].find('em')
                            p_date = tmp_date.get_text().split(" ")[0]
                        else:
                            p_date = tmp_date[0].get_text().split(" ")[0]
                        p_date = datetime.datetime.strptime(p_date, "%Y.%m.%d.")
                        if len(tmp_article) == 0:
                            tmp_article = news_html_soup.select('#articeBody')
                        article = tmp_article[0].get_text().replace('\n', "").replace('\t', "")
                        if not tmp_press[0].a:
                            tmp_press = news_html_soup.select(".article_footer")
                            press = tmp_press[0].a.get_text().replace("\n", "").replace("\t", "").split(" ")[0]
                        else:
                            press = tmp_press[0].a.get_text()

                        email = ""
                        publisher = ""
                        publisher_match = re.search(self.pattern_publisher, article)
                        email_match = re.search(self.pattern_email, article)
                        if publisher_match and email_match:
                            tmp_publisher = publisher_match.group()
                            tmp_publisher = tmp_publisher.strip().split(" ")

                            if len(tmp_publisher) == 1:
                                publisher = tmp_publisher[0].replace("기자", "")
                            elif len(tmp_publisher) == 2:
                                publisher = tmp_publisher[0]
                            elif len(tmp_publisher) == 3:
                                publisher = tmp_publisher[1]

                            tmp_email = email_match.group()
                            email = tmp_email.strip()

                        tokens = self.mecab.pos(str(article))
                        nouns_tokens = [word for word, tag in tokens if tag == 'NNG' or tag == 'NNP']
                        tokens_str = ' '.join(nouns_tokens)

                        tmp_df = tmp_df.append({
                            'title': title, 'link': urls["href"], 'press': press,
                            'date': p_date, 'reporter': publisher, 'email': email,
                            'article': article, 'search_keyword': keyword,
                            'company': keyword, 'company_code': code,
                            'business_code': business_code, 'business': business,
                            'tokens_split': nouns_tokens, 'tokens': tokens_str
                        }, ignore_index=True)
                        self.log.debug("title-{0}, date-{1}, press-{2}, company-{3}".format(title, p_date,
                                                                                            press, keyword))
                        time.sleep(random.randrange(3, 10))
                        self.driver.back()

                page_num += 1
                if len(self.driver.find_elements_by_class_name('next')) > 0:
                    element = self.driver.find_element_by_class_name("next")
                    element.click()
                else:
                    break

            self.log.debug("Save News data cnt - {0}".format(len(tmp_df)))
            if len(tmp_df) > 0:
                self.data_handler.save_db(tmp_df)
        self.driver.quit()
Example #10
import bs4
from bs4 import BeautifulSoup
import re

soup = BeautifulSoup(html_doc, 'lxml')

# When you call a tag's find_all() method, Beautiful Soup searches all of the
# tag's descendants; to search only the tag's direct children, pass the
# argument recursive=False

print(soup.html.find_all("title"))
print(soup.html.find_all("title", recursive=False))

# Only find_all() and find() support the recursive argument

# Calling a tag is like calling find_all()
# (shorthand for find_all())
print(soup.find_all("a") == soup("a"))
print(soup.title.find_all(string=True) == soup.title(string=True))

# Calling find_all() with limit=1 is no better than calling find() directly
print(soup.find_all("title", limit=1))
print(soup.find('title'))

print("\n\n\n", soup.head.title)
# under the hood this is
# soup.find("head").find("title")
print(soup.title)
print(soup.head.title == soup.title)
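
The snippet above assumes an html_doc string that is not shown. A minimal stand-in, borrowed from the three-sisters document in the official Beautiful Soup documentation (html.parser used here so nothing extra needs to be installed), is enough to run the recursive example:

from bs4 import BeautifulSoup

html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>.</p>
</body></html>"""

soup = BeautifulSoup(html_doc, "html.parser")
print(soup.html.find_all("title"))                   # [<title>The Dormouse's story</title>]
print(soup.html.find_all("title", recursive=False))  # [] -- <title> is a grandchild of <html>, not a direct child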
Example #11
# recursive
soup.html.find_all("title")
# [<title>The Dormouse's story</title>]

soup.html.find_all("title", recursive=False)
# []

# Beautiful Soup offers a lot of tree-searching methods (covered below), and they
# mostly take the same arguments as find_all(): name, attrs, text, limit, and the
# keyword arguments. But the recursive argument is different: find_all() and find()
# are the only methods that support it. Passing recursive=False into a method like
# find_parents() wouldn't be very useful.

# Calling a tag is like calling find_all()
soup.find_all("a")
soup("a")

soup.title.find_all(text=True)
soup.title(text=True)


# These two lines of code are nearly equivalent:
soup.find_all("title", limit=1)
# [<title>The Dormouse's story</title>]

soup.find("title")
# <title>The Dormouse's story</title>

soup.head.title
# <title>The Dormouse's story</title>

soup.find("head").find("title")
# <title>The Dormouse's story</title>
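
Several examples on this page mix text=True and string=True when calling a tag; string is simply the newer name for the old text keyword argument, so on current Beautiful Soup versions both should return the same thing. A short self-contained check (html.parser assumed):

from bs4 import BeautifulSoup

doc = BeautifulSoup("<html><head><title>The Dormouse's story</title></head></html>",
                    "html.parser")
print(doc.title(text=True))    # ["The Dormouse's story"] -- shorthand for find_all(text=True) on <title>
print(doc.title(string=True))  # the same result under the newer keyword name
print(doc.title(text=True) == doc.title.find_all(string=True))  # True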
Example #12
print soup.find_all('a', limit=2)  # returns the first two <a> tags found

# The recursive argument
# When you call a tag's find_all() method, Beautiful Soup searches all of the tag's descendants;
# to search only the tag's direct children, pass the argument recursive=False.
print soup.find_all('title')  # [<title>The Dormouse's story</title>]
print soup.find_all('title', recursive=False)  #[]

# Calling a tag is like calling find_all()
# find_all() is by far the most commonly used search method in Beautiful Soup, so there is a shorthand for it:
# a BeautifulSoup object or a Tag object can be called like a function, and the result is the same as
# calling find_all() on that object.
# The following two lines are equivalent:
print soup.find_all('title')
print soup('title')
# So are these two lines:
print soup.title.find_all(text=True)
print soup.title(text=True)

# find()
# find( name , attrs , recursive , text , **kwargs )
# find_all() returns every matching tag in the document, but sometimes you only want a single result.
# If the document has only one <body> tag, for example, scanning it with find_all() is unnecessary:
# calling find_all() with limit=1 is no better than calling find() directly.
# The following two lines are equivalent:
print soup.find_all('title', limit=1)
print soup.find('title')
# The only difference is that find_all() returns a list containing the single result,
# while find() returns the result itself.
# When nothing matches, find_all() returns an empty list and find() returns None.
print soup.find_all('tite', limit=1)  #[]
print soup.find('tite')  # None
# soup.head.title is shorthand for tag-name attribute access; it works by repeatedly calling find() on the current tag:
print soup.head.title
print soup.find('head').find('title')
Example #13
request = Request(url)
response = urlopen(request)
html = response.read()  #To read the response and store in a file
response.close()  #To close the link

#Shortcut for above steps is to use the requests package
import requests
url = "https://www.wikipedia.org"
r = requests.get(url)  #To activate the request
text = r.text  #To convert html file into text file

from bs4 import BeautifulSoup
import requests
url = "https://www.wikipedia.org"
r = requests.get(url)
html_doc = r.text
soup = BeautifulSoup(html_doc)
print(soup.prettify())  #Prints well-indented HTML
print(soup.title())  #Calling the <title> tag works like find_all() on it (usually an empty list); soup.title is the tag itself
print(soup.get_text())  #Extracts the text from the HTML file
for link in soup.find_all('a'):
    print(link.get('href'))  #To print all the hyperlinks in the html file
with open("a_movie.json") as json_file:
    json_data = json.load(json_file)  #imports as a list

r = requests.get(url)
json_data = r.json()
for key, value in json_data.items():
    print(key + ' ', value)
#An API is a bunch of codes that allows two software programs to interact with each other
Example #14
#The string argument searches the document's string content
print(soup.find_all(string='Elsie'))
print(soup.find_all(string=['Elsie', 'Lacie']))
soup.find_all(string=re.compile("Dormouse"))

print("----------------- the limit argument -----------------")
#find_all() returns every match; on a large document tree that can be slow. If you do not need every
#result, the limit argument caps how many are returned, much like LIMIT in SQL: the search stops as
#soon as limit results have been found.
soup.find_all('a', limit=2)

print('___________________ calling a tag is like calling find_all() _______________________')
soup.find_all('a')
#equivalent to
soup("a")

soup.title.find_all(string=True)
soup.title(string=True)

print("______________ CSS selectors ________________")
print(soup.select('title'))
print(soup.select("p:nth-of-type(3)"))
#descend through tags level by level
soup.select("body a")

#find the direct child tags of a given tag
soup.select('head > title')
soup.select("p > #link1")

#find sibling tags
soup.select("#link1 ~ .sister")

#search by CSS class name
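
The example above is cut off after the CSS-class comment. A minimal runnable sketch of the same select() calls on a made-up document (html.parser assumed; select() relies on the soupsieve package that is normally installed alongside bs4):

from bs4 import BeautifulSoup

doc = BeautifulSoup(
    '<html><head><title>CSS demo</title></head>'
    '<body><p class="sister" id="link1">one</p><p class="sister">two</p><p>three</p></body></html>',
    "html.parser")
print(doc.select("title"))             # [<title>CSS demo</title>]
print(doc.select("p:nth-of-type(3)"))  # [<p>three</p>]
print(doc.select("body p"))            # all three <p> tags, at any depth under <body>
print(doc.select("head > title"))      # direct-child combinator
print(doc.select("#link1 ~ .sister"))  # later siblings of #link1 that carry the "sister" class
print(doc.select(".sister"))           # search by CSS class name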
Example #15
##############################################
############# BeautifulSoup
import requests
from bs4 import BeautifulSoup
url = 'https://www.python.org/~guido/'
# package the request, send the request and catch the response
r = requests.get(url)
# extract the response as html
html_doc = r.text
# create a BeautifulSoup object from the html
soup = BeautifulSoup(html_doc)

pretty_soup = soup.prettify()  # prettify the BeautifulSoup object
print(pretty_soup)

guido_title = soup.title()  # calling the <title> tag works like find_all() on it (usually an empty list)
print(guido_title)
print(soup.title)  # the <title> tag itself
print(soup.get_text())  # get the text of the page

a_tags = soup.find_all('a')  # find all hyperlinks (a tags)
for link in a_tags:
    print(link.get('href'))

############# Scrapy
from scrapy import Selector
import requests
url
html = requests.get(url).content
sel = Selector(text=html)
Example #16
#Note: x=0 & y=5 is [0,1,2,3,4]; next numbers should be x=5 & y=10 to check [5,6,7,8,9].
#Note: content[y] does not work if value for len(content) is inputed; last value is len(content)-1.
#Beautiful Soup instruction: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
data = {}
count = 0
exceptions = {}
for i in range(x, y):
    try:
        body = []
        bodystring = ''
        aa = a + content[i]
        data[str(i) + '-(0)Reference'] = aa
        html = requests.get(aa)
        soup = BeautifulSoup(html.text, 'html.parser')
        #print soup.title
        soup.title = soup.title.encode('utf-8')
        soup.title = soup.title[7:-8]
        data[str(i) + '-(1)Title'] = soup.title  #Title
        for author in soup.find('footer'):  #Author
            author = author.find('a')
            if author == -1:
                pass
            else:
                #print author
                author1 = author.encode('utf-8')
                author2 = author1[:-4]
                author3 = author2.find('>')
                author4 = author2[author3 + 1:]
                data[str(i) + '-(2)Author'] = author4
        for date in soup.find('time'):  #DateTime
            #print date
Example #17
        try:
            days_left = (difference / subs_per_second) / 86400
            if days_left > 0:
                days_left_array.append(days_left)
        except:
            days_left = 999999999999
        avg_value = 0
        for value in days_left_array:
            avg_value += value
        try:
            avg_value = avg_value / (len(days_left_array))
        except:
            avg_value = 999999999999

        print("Subs Per Second:", subs_per_second)
        print(first_channel_name.title(), "Subs:", first_channel_subs)
        print(second_channel_name.title(), "Subs:", second_channel_subs)
        print(first_channel_name.title(), "Gains:", first_channel_gains)
        print(second_channel_name.title(), "Gains:", second_channel_gains)
        print("Difference:", difference)
        print("Change:", change)
        print("Time Since Last Update (seconds):", round(time_delta, 2))
        print("Days Left:", days_left)
        print("Days Left (Avg):", avg_value)
        print("-----------------------------")
        first = False
        time1 = time.time()
        #time.sleep(1)
    except ValueError:
        if flag == False:  #only alert you the first time
            print(
Example #18
def handle_text(filename, img_keyword, sound_keyword, video_keyword):
    """
    :param paras:file name of a charpter, such like 'Charpter1.txt', without directory path.
    :result: a html file
    """
    # open file and read paragraphs
    with open(os.path.join('./text/', filename), 'r+') as f:
        paras = [p.strip() for p in f.readlines() if len(p) > 4]
    # read html template
    with open(r'base.txt', 'r+') as f:
        template_text = f.read()
        temp = BeautifulSoup(template_text, "lxml")

    # replace cover img
    # cover = temp.find('img', {'id': 'cover'})
    # cover['src'] = './pics/cover.jpg'

    # handle title
    title = temp.find('h3')
    title.string = paras[0]
    temp.title = paras[0]

    # handle paras
    text_box = temp.find('div', {'id': 'text'})
    js_box = temp.find('script', {'id': 'main'})
    count = [0,0]
    img_pat = re.compile(r'\((\W+?)\)\['+img_keyword+r'(\S+?)\]')
    sound_pat = re.compile(r'\((\W+?)\)\['+sound_keyword+r'(\S+?)\]')
    video_pat = re.compile(r'\((\W+?)\)\['+video_keyword+r'(\S+?)\]')
    for i in range(1, len(paras)):
        new_p = temp.new_tag('p')
        new_br = temp.new_tag('br')
        # handle img in text
        if img_pat.findall(paras[i]):
            imgs = img_pat.findall(paras[i])# a list of tuple(text, img_id)
            for img in imgs:
                img_result = insert_img(img[1], temp, count)
                new_img_div, count = img_result[0], img_result[1]
                text_box.append(new_img_div)
            new_p.string = re.sub(img_pat, r'\1', paras[i])# delete () and []
            # text_box.append(new_p)
            # text_box.append(new_br)
        if sound_pat.findall(paras[i]):
            sounds = sound_pat.findall(paras[i])
            new_p.string = re.sub(sound_pat, r'\1', paras[i])
            for sound in sounds:
                new_play_logo = insert_sound(sound[0], sound[1], paras[i], temp)
                new_p.append(new_play_logo)
            # text_box.append(new_p)
            # text_box.append(new_br)
        if video_pat.findall(paras[i]):
            videos = video_pat.findall(paras[i])
            for video in videos:
                new_video_link = temp.new_string("<a target='_blank' href='"+insert_video(video[1], paras[i], temp) + ".html'>"+video[0]+"</a>")
                new_p.string = re.sub(video_pat, new_video_link, new_p.string)
                new_p = BeautifulSoup(html_parser.unescape(str(new_p)), 'lxml')
        if not (img_pat.findall(paras[i]) or sound_pat.findall(paras[i]) or video_pat.findall(paras[i])):
            new_p.string = paras[i]
        text_box.append(new_p)
        text_box.append(new_br)

    with open('audio.txt', 'r+') as f:
        text = f.read()
        audio_tag = BeautifulSoup(text, 'lxml').div
        text_box.append(audio_tag)

    # add js about sound to html script
    # with open('static/js/audio.js', 'r+') as f:
    #     audio_js = f.read()
    #     js_box.append(audio_js)     

    with open(filename[:-4] + '.html', 'w+') as f:
        f.write(temp.prettify("utf-8"))
        print '==========finish ' + filename + '==========' 
Example #19

# beautiful soup testing

def has30classes(classes):
    return classes is not None and len(classes) == 30

for elem in soup.find_all(class_=has30classes):
    print(elem.get_text())

# an element whose id contains only letters from a to r
print(soup.find(id=re.compile("^[a-r]*$")))

# () and find_all methods
print(soup("a") == soup.find_all("a"))
print(soup.title(string=True))

# find parent, find sibling
print(soup.find_parent("div"))
print(soup.title.find_next_sibling("link"))
print(soup.a.find_parent("div"))

# previous sibling, next sibling
print(soup.title.find_previous_sibling())
print(soup.title.find_next_sibling())

# find next, find previous
print()
print(soup.title.find_previous())
print(soup.title.find_next())
Example #20
    driver.get(search_url)

    if company in except_company:
        continue

    while page_num <= 50:
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        for urls in soup.select("._sp_each_url"):
            if urls["href"].startswith("https://news.naver.com"):
                driver.get(urls["href"])
                news_html = driver.page_source
                news_html_soup = BeautifulSoup(news_html, 'html.parser')

                tmp_title = news_html_soup.title(string=True)
                tmp_date = news_html_soup.select('.t11')
                tmp_article = news_html_soup.select('#articleBodyContents')
                tmp_press = news_html_soup.select('#footer address')

                title = tmp_title[0].replace(" : 네이버 뉴스", "")
                if len(tmp_date) == 0:
                    tmp_date = news_html_soup.select('.article_info')[0].find('em')
                    p_date = tmp_date.get_text().split(" ")[0]
                else:
                    p_date = tmp_date[0].get_text().split(" ")[0]
                p_date = datetime.datetime.strptime(p_date, "%Y.%m.%d.")
                if len(tmp_article) == 0:
                    tmp_article = news_html_soup.select('#articeBody')
                article = tmp_article[0].get_text().replace('\n', "").replace('\t', "")
                if not tmp_press[0].a:
Example #21
from bs4 import BeautifulSoup
import requests
import pandas as pd

#fetch the page and keep its HTML text (the variable is named url but holds the HTML)
url = requests.get(
    'https://en.wikipedia.org/wiki/List_of_Asian_countries_by_area').text

# Parse the HTML from our URL into the BeautifulSoup parse tree format
soup = BeautifulSoup(url, "lxml")

# To look at the HTML underlying the web page
#print(soup.prettify())

# calling the <title> tag works like find_all() on it; soup.title would be the tag itself
soup.title()

# use the 'find_all' function to bring back all instances of the 'table'
# tag in the HTML and store in 'all_tables' variable

all_tables = soup.find_all("table")
all_tables

# use the 'find' function to pull out the specific table with the
# 'wikitable sortable' class and store it in the 'my_table' variable

my_table = soup.find('table', {'class': 'wikitable sortable'})
my_table

links = my_table.find_all('a')
links
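
pandas is imported above but never used. One hedged follow-up (assuming pandas plus an HTML parser such as lxml or html5lib is installed) is to let pandas.read_html turn the page's tables into DataFrames instead of walking the tags by hand:

import pandas as pd
import requests

html = requests.get(
    'https://en.wikipedia.org/wiki/List_of_Asian_countries_by_area').text
tables = pd.read_html(html)  # one DataFrame per <table> element found in the page
print(len(tables))
print(tables[0].head())      # inspect the first table; pick whichever one matches my_table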
Example #22
def scrape_urls(urls):
    stop_words_ru = get_stop_words('russian')
    stop_words_en = get_stop_words('english')
    stop_words = stop_words_en + stop_words_ru

    stemmer_ru = SnowballStemmer('russian')
    stemmer_en = SnowballStemmer('english')

    df = pd.DataFrame(urls)
    contents = []

    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0'
    }

    for row in df.itertuples():
        if not re.compile("^https?://").match(row[1]):
            print('URL must begin with http:// or https://')
            continue

        try:
            response = requests.get(row[1], headers=headers)
        except Exception as e:
            print(row[1], e)
            continue  # skip this URL; response would otherwise be undefined or stale

        if response.status_code == 200:
            try:
                soup = BeautifulSoup(response.content, 'lxml')
                [s.decompose() for s in soup('noscript')]
                [s.decompose() for s in soup('script')]
                [s.decompose() for s in soup('style')]
                title = soup.title(text=True)[0]
                body = soup.body(text=True)
                html_body = ' '.join(body)
            except Exception as e:
                print(row[1], e)
                continue

            try:
                tokenized_title = word_tokenize(title)
                result_title = [
                    stemmer_ru.stem(stemmer_en.stem(i))
                    for i in tokenized_title if i.lower() not in stop_words
                    and i.isalpha() and len(i) > 3
                ]

                text_from_html = html_body.replace('\n', ' ')
                tokenized_html = word_tokenize(text_from_html)
                result_words = [
                    stemmer_ru.stem(stemmer_en.stem(i)) for i in tokenized_html
                    if i.lower() not in stop_words and i.isalpha()
                    and len(i) > 3
                ]

                title = ' '.join(result_title).lower()
                content = ' '.join(result_words).lower()

                if len(row) == 3:
                    contents.append(
                        [row[1], ' '.join([title, content]), row[2]])
                else:
                    contents.append(' '.join([title, content]))
            except Exception as e:
                print(row[1], e)
        else:
            print(f'Unable to reach {row[1]}, {response.status_code}')

    return contents