Example #1
def parse(html):
    '''
    Parse the page. If the content received is ERROR_NUM, the request timed
    out and there is nothing to parse; otherwise extract each good's id,
    name, price and stat, and write them to a file named after the date.
    '''
    if not html:
        logger.info('======pass parse=====')
        return {}

    items = {}
#    print isinstance(html, str)
    parse_page = BeautifulSoup(html)
    goods = parse_page.find_all('div', class_='goods-content')

    for good in goods:

        good_id = good['nctype_goods'][1:]  # the attribute value starts with a space

        good_name = good.select('div[class="goods-name"]')[0].a.text.replace(',', '_')

        good_price = good.select('em[class="sale-price"]')[0].text
        if re.findall(u'\u4e07', good_price):  # handle prices like '1.3万' (i.e. 1.3 * 10000)
            good_price = str(float(good_price[:-1])*10000)
        else:  # strip the RMB currency sign from the price
            good_price = good_price[1:]

        good_stat = good.select('a[class="status"]')[0].text

        items[good_id] = good_name + ',' + good_price + ',' + good_stat

    return items
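To see what this parser produces, here is a minimal, self-contained sketch that runs the same extraction steps against a hand-written fragment of the goods markup; the HTML below is an assumption about the page structure, not the real page.

from bs4 import BeautifulSoup

sample_html = '''
<div class="goods-content" nctype_goods=" 12345">
  <div class="goods-name"><a>Sample, item</a></div>
  <em class="sale-price">¥199</em>
  <a class="status">in stock</a>
</div>
'''

soup = BeautifulSoup(sample_html, 'html.parser')
good = soup.find_all('div', class_='goods-content')[0]
good_id = good['nctype_goods'][1:]                                # drop the leading space
good_name = good.select('div[class="goods-name"]')[0].a.text.replace(',', '_')
good_price = good.select('em[class="sale-price"]')[0].text[1:]    # strip the currency sign
good_stat = good.select('a[class="status"]')[0].text
print({good_id: ','.join([good_name, good_price, good_stat])})
# {'12345': 'Sample_ item,199,in stock'}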
Example #2
def moderate_tags(html):
    """replaces instances of <a> and <img>
    with "item in moderation" alerts
    """
    from askbot.conf import settings
    soup = BeautifulSoup(html, 'html5lib')
    replaced = False
    if settings.MODERATE_LINKS:
        links = soup.find_all('a')
        if links:
            template = get_template('widgets/moderated_link.jinja')
            aviso = BeautifulSoup(template.render(), 'html5lib').find('body')
            for link in links:
                link.replaceWith(aviso)
            replaced = True

    if settings.MODERATE_IMAGES:
        images = soup.find_all('img')
        if images:
            template = get_template('widgets/moderated_link.jinja')
            aviso = BeautifulSoup(template.render(), 'html5lib').find('body')
            for image in images:
                image.replaceWith(aviso)
            replaced = True

    if replaced:
        return force_text(soup.find('body').renderContents(), 'utf-8')

    return html
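The same replace-everything-with-a-notice idea can be tried without the askbot settings and templates; the sketch below is only an illustration that swaps the rendered jinja template for a hard-coded notice string.

from bs4 import BeautifulSoup

def replace_with_notice(html, notice='[item in moderation]'):
    soup = BeautifulSoup(html, 'html.parser')
    replaced = False
    for tag in soup.find_all(['a', 'img']):
        tag.replace_with(notice)        # replace_with is the current name for replaceWith
        replaced = True
    return str(soup) if replaced else html

print(replace_with_notice('see <a href="http://x">this</a> and <img src="y.png"/>'))
# see [item in moderation] and [item in moderation]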
Example #3
def from_pmml(self, pmml):
    """Returns a model with the intercept and coefficients represented in PMML file."""

    model = self()
    
    # Reads the input PMML file with BeautifulSoup.
    with open(pmml, "r") as f:
        lm_soup = BeautifulSoup(f, "xml")

    if not lm_soup.RegressionTable:
        raise ValueError("RegressionTable not found in the input PMML file.")

    else:
        # TODO: consider pulling this block out into its own function.
        # Pulls out intercept from the PMML file and assigns it to the
        # model. If the intercept does not exist, assign it to zero.
        intercept = 0
        if "intercept" in lm_soup.RegressionTable.attrs:
            intercept = lm_soup.RegressionTable['intercept']
        model.intercept_ = float(intercept)

        # Pulls out coefficients from the PMML file, and assigns them
        # to the model.
        if not lm_soup.find_all('NumericPredictor'):
            raise ValueError("NumericPredictor not found in the input PMML file.")
        else:
            coefs = []
            numeric_predictors = lm_soup.find_all('NumericPredictor')
            for i in numeric_predictors:
                i_coef = float(i['coefficient'])
                coefs.append(i_coef)
            model.coef_ = numpy.array(coefs)
            
    return model
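A made-up PMML fragment is enough to see what the loader reads; note that the "xml" features string requires lxml to be installed.

from bs4 import BeautifulSoup

pmml_text = '''<?xml version="1.0"?>
<PMML>
  <RegressionModel>
    <RegressionTable intercept="1.5">
      <NumericPredictor name="x1" coefficient="0.25"/>
      <NumericPredictor name="x2" coefficient="-3.0"/>
    </RegressionTable>
  </RegressionModel>
</PMML>'''

lm_soup = BeautifulSoup(pmml_text, 'xml')
intercept = float(lm_soup.RegressionTable['intercept'])
coefs = [float(p['coefficient']) for p in lm_soup.find_all('NumericPredictor')]
print(intercept, coefs)   # 1.5 [0.25, -3.0]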
Example #4
    def test_23_admin_add_category(self):
        """Test ADMIN add category works"""
        self.create()
        category = {'name': 'cat', 'short_name': 'cat',
                    'description': 'description'}
        # Anonymous user
        url = '/admin/categories'
        res = self.app.post(url, data=category, follow_redirects=True)
        dom = BeautifulSoup(res.data)
        err_msg = "Anonymous users should be redirected to sign in"
        assert dom.find(id='signin') is not None, err_msg

        # Authenticated user but not admin
        self.signin(email=self.email_addr2, password=self.password)
        res = self.app.post(url, data=category, follow_redirects=True)
        err_msg = "Non-Admin users should get 403"
        assert res.status_code == 403, err_msg
        self.signout()

        # Admin
        self.signin(email=self.root_addr, password=self.root_password)
        res = self.app.post(url, data=category, follow_redirects=True)
        err_msg = "Category should be added"
        assert "Category added" in res.data, err_msg
        assert category['name'] in res.data, err_msg

        category = {'name': 'cat', 'short_name': 'cat',
                    'description': 'description'}

        self.signin(email=self.root_addr, password=self.root_password)
        res = self.app.post(url, data=category, follow_redirects=True)
        err_msg = "Category form validation should work"
        assert "Please correct the errors" in res.data, err_msg
Example #5
def replace_links_with_text(html):
    """any absolute links will be replaced with the
    url in plain text, same with any img tags
    """
    soup = BeautifulSoup(html, 'html5lib')
    abs_url_re = r'^http(s)?://'

    images = soup.find_all('img')
    for image in images:
        url = image.get('src', '')
        text = image.get('alt', '')
        if url == '' or re.match(abs_url_re, url):
            image.replaceWith(format_url_replacement(url, text))

    links = soup.find_all('a')
    for link in links:
        url = link.get('href', '')
        text = ''.join(link.text) or ''

        if text == '':  # this is due to an issue with url inlining in comments
            link.replaceWith('')
        elif url == '' or re.match(abs_url_re, url):
            link.replaceWith(format_url_replacement(url, text))

    return force_text(soup.find('body').renderContents(), 'utf-8')
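format_url_replacement() and force_text() come from the surrounding project, so the easiest way to try this idea in isolation is with stand-in helpers; everything below is an assumption made for illustration only (html5lib must be installed for the 'html5lib' parser).

import re
from bs4 import BeautifulSoup

def format_url_replacement(url, text):      # stand-in for the project helper
    return '%s (%s)' % (text, url) if text and text != url else url

html = '<p>see <a href="https://example.com">docs</a> and <img src="https://example.com/x.png" alt="pic"/></p>'
soup = BeautifulSoup(html, 'html5lib')
abs_url_re = r'^http(s)?://'

for image in soup.find_all('img'):
    image.replace_with(format_url_replacement(image.get('src', ''), image.get('alt', '')))
for link in soup.find_all('a'):
    if re.match(abs_url_re, link.get('href', '')):
        link.replace_with(format_url_replacement(link.get('href', ''), link.text))

print(soup.find('body').decode_contents())
# <p>see docs (https://example.com) and pic (https://example.com/x.png)</p>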
Example #6
def scrap_items():
	for itemlist in ITEMLIST:
		soup = BS(urllib2.urlopen(''.join([LOLWIKI, itemlist])).read())
		item_table = soup.find('table', class_='stdt sortable')

		for tr in item_table.find_all('tr'):
			tds = tr.find_all('td')
			if len(tds) < 1:
				continue
			if tr.find('p') == None:
				continue

			item_name = tr.find('p').text.strip()
			item_url = tr.find('img')['src']

			if item_url.split(':')[0] == 'data':
				item_url = tr.find('img')['data-src']

			if not HOOKED:
				continue

			#store item in database
			d_item = Item()
			d_item.name = item_name

			t_img = NamedTemporaryFile(delete=True)
			t_img.write(urllib2.urlopen(item_url).read())
			t_img.flush()
			t_img.name = '.'.join([item_name, 'jpg'])

			d_item.picture = File(t_img)
			d_item.save()
Example #7
	def __call__(self, url, count_of_crawler):
		"""
		Function which fetch the content from the given URL and collect all the
		URL in the content and pass the first url of the page to fetch the
		content.
		"""
		try:
			page = urllib2.urlopen(url)
			soup = BeautifulSoup(page.read())	

			links_on_page = map(lambda anchor: anchor.get('href'), 
						soup.find_all('a'))

			cleaned_url = map(
				lambda link: link
				if urlparse(link).scheme and urlparse(url).netloc
				else (urlparse(url).scheme + "://" + urlparse(url).netloc + link
					if link[0] == "/" else url + link),
				links_on_page)
			visited_url.append(url)
			total_collected_url.append(cleaned_url)
			next_url_to_visit = [next_url for next_url in cleaned_url\
				 if not next_url in visited_url and not "#" in next_url][0]
		
			if count_of_crawler and next_url_to_visit:	
				count_of_crawler = crawler(next_url_to_visit, 
								count_of_crawler-1)
	
		except:
			print "It seems there is some issue in URL "+url
	
		return count_of_crawler
Example #8
def crawlSearch(url,pages):
	try:
		arr=[]
		countS=0  # counter for followed scholarship links; must be initialised before use below
		source_code=requests.get(url)
		plain_text=source_code.text
		soup=BeautifulSoup(plain_text)
		for link in soup.findAll('a'):

			href=link.get('href')
			href_test=str(href)
			#if href_test[0]!='/' and href_test[0]!='j' and href_test!='none' and href_test[0]!='#':
			if is_in_arr(pages,str(href))==False:
				if "microsoft" not in href_test and "facebook" not in href_test and "twitter" not in href_test and "google" not in href_test:
					if href_test.startswith("http"):
						if "bing" not in href_test:
							if "scholarships.com" not in href_test:
								pages.append(href)
								print str(href)
							else:
								if countS<2:
									crawl(href,pages)
									print "Crawling "+str(href)
									countS=countS+1
								else:
									print "Skiping "+str(href)
					else:
						pass


	except:
		print "Error at: "+str(url)
Example #9
def show_options(id):
    r = requests.get("https://interaktiv.mx.dk/toolbox/" + votetype + "/get/" + id)
    soup2 = BeautifulSoup(r.text, "lxml")

    clear_console()
    print_logo()
    print "(Interaktiv version. Kør scriptet med -h eller --help for flere indstillinger.)"
    print

    vote_text = soup2.find("div", attrs={"id": "vote_text"}).text
    print vote_text
    print

    if votetype == "advancedvotes":
            for option in soup2.find_all("div", attrs={"class": "vote_button"}):

                number = option.get("data-vote")
                text = option.text

                print "(%s) %s" % (number, text)
            print

    else:

            for option in soup2.find_all("div", attrs={"class": "vote_button"}):
                if option.get("id") == "vote_yes":
                    number = "1"

                else:
                    number = "0"

                text = option.text
                print "(%s) %s" % (number, text)
            print
Example #10
    def reverseIP(self):
        # normalize the url into the form we need (www.url.com)
        if self.url.startswith("http://"):
            url = self.url.replace("http://","")  # replace the scheme with an empty string
        else:
            url = self.url

        # sent via POST because the page uses a form to ask for the url to scan
        # data is the POST payload, i.e. the url
        # remoteHost is the parameter name under which the url is sent (see the connection below)
        data = {"remoteHost" : url}
        connection = requests.post(
            # parameters required for the connection
            url="http://www.ipfingerprints.com/scripts/getReverseIP.php", data=data
        )

        # connection.text is the html returned by the request
        # BeautifulSoup parses it into something less horrible
        # html.parser gives cleaner output
        beautifulOut = BeautifulSoup(connection.text, "html.parser")

        # here we will store every link found in the tag
        response = list()

        # find_all collects all the tags; 'a' filters for just that tag type
        for link in beautifulOut.find_all("a"):
            # href holds the domain name (the only part of the tag we care about)
            currentLink = link.get("href")
            response.append(currentLink[11:-2])

        return response
Example #11
def crawlLinkScoial(url):
	try:
		pages=[]
		arr=[]
		source_code=requests.get(url)
		plain_text=source_code.text
		soup=BeautifulSoup(plain_text)
		for link in soup.findAll('a'):

			href=link.get('href')
			href_test=str(href)
			#if href_test[0]!='/' and href_test[0]!='j' and href_test!='none' and href_test[0]!='#':
			if is_in_arr(pages,str(href))==False:
				if "facebook" in href_test or "twitter" in href_test or "google" in href_test:

					lin=getGoodLink(url)
					pages.append(lin+str(href))
		newArr=deleteDuplicates(pages)
		for page in newArr:
			socialFile.write(page)
			socialFile.write("\n")
		allFile.write("Social-Media-Links: \n")
		for page in newArr:
			allFile.write(page)
			allFile.write("\n")



	except:
		print "Error at: "+str(url)
Example #12
def convert_links(text, quote="\""):
    soup = BeautifulSoup(text, "html.parser")
    for t in soup.findAll(text=True):
        if has_link_parent(t):
            continue
        split = re.split(r"(?:(https?://)|(www\.))([\S]+\.[^\s<>\"\']+)", t)
        if len(split) == 1:
            continue
        r = ""
        n = 0
        split = [s or "" for s in split]
        while split:
            if n % 2 == 0:
                r += split[0]
                split.pop(0)
            else:
                r += "<a href=%shttp://%s%s%s>%s%s%s</a>" % (
                    quote, split[1], split[2], quote,
                    split[0], split[1], split[2]
                    )
                split.pop(0)
                split.pop(0)
                split.pop(0)
            n += 1

        t.replaceWith(BeautifulSoup(r, "html.parser"))
    return str(soup)
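has_link_parent() is defined elsewhere in that project; with the convert_links() above in scope and a small stand-in for the helper (the stand-in below is an assumption), the function can be exercised directly.

import re
from bs4 import BeautifulSoup

def has_link_parent(node):                  # stand-in: is any ancestor an <a> tag?
    return any(parent.name == 'a' for parent in node.parents)

print(convert_links('visit www.example.com for details'))
# roughly: visit <a href="http://www.example.com">www.example.com</a> for details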
Example #13
def parse_data(data):
    page = BeautifulSoup(data)

    results = page.find("div", id="res")
    if results is None:
        raise NoResultsException

    calc = results.find("img", src="/images/icons/onebox/calculator-40.gif")
    if calc is not None:
        calc = results.find("h2", {"class": "r"})
        if calc is not None:
            superscripts = calc.find_all("sup")
            if superscripts is not None and len(superscripts):
                for x in superscripts:
                    x.contents[0].replaceWith("^" + x.contents[0])
            return [dict(type="string", string=util.strip_html(calc).decode("utf-8"))]

    nresults = results.find_all("li", {"class": "g"})
    if len(nresults) == 0:
        raise NoResultsException

    processed_results = []
    for x in nresults:
        a_tag = x.find("a")
        if a_tag is not None:
            processed_results.append(
                dict(type="result", href=urlparse.parse_qs(urlparse.urlparse(a_tag["href"]).query)["q"][0],
                     text=util.strip_html(a_tag).decode("utf-8")))

    return processed_results
Example #14
def get_sp500_symbols():
    page_html = wiki_html('List_of_S%26P_500_companies', 'SP500.html')
    wiki_soup = BeautifulSoup(page_html, "html.parser")
    symbol_table = wiki_soup.find(attrs={'class': 'wikitable sortable'})

    symbol_data_list = list()

    for symbol in symbol_table.find_all("tr"):
        symbol_data_content = dict()
        symbol_raw_data = symbol.find_all("td")
        td_count = 0
        for symbol_data in symbol_raw_data:
            if(td_count == 0):
                symbol_data_content[
                    'symbol'] = symbol_data.text
            elif(td_count == 1):
                symbol_data_content[
                    'company'] = symbol_data.text
            elif(td_count == 3):
                symbol_data_content[
                    'sector'] = symbol_data.text
            elif(td_count == 4):
                symbol_data_content[
                    'industry'] = symbol_data.text
            elif(td_count == 5):
                symbol_data_content[
                    'headquarters'] = symbol_data.text

            td_count += 1

        symbol_data_list.append(symbol_data_content)

    return symbol_data_list[1::]
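wiki_html() is a project helper that fetches and caches the Wikipedia page; the table-walking part can be checked on its own against a toy table (the markup below is made up).

from bs4 import BeautifulSoup

toy_table = '''
<table class="wikitable sortable">
  <tr><th>Symbol</th><th>Company</th></tr>
  <tr><td>MMM</td><td>3M</td></tr>
</table>'''

soup = BeautifulSoup(toy_table, 'html.parser')
table = soup.find(attrs={'class': 'wikitable sortable'})
rows = []
for tr in table.find_all('tr'):
    cells = tr.find_all('td')
    if cells:
        rows.append({'symbol': cells[0].text, 'company': cells[1].text})
print(rows)   # [{'symbol': 'MMM', 'company': '3M'}]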
Example #15
 def parse(self, response):
     logger.info("Parsing {}".format(response.url))
     soup = BeautifulSoup(response.body, "html.parser")
     trs = soup.find_all("tr", "item")
     if trs:
         for tr in trs:
             link = tr.find("a")
             article_url = DETAIL_URL.format(link["href"])
             r = scrapy.Request(article_url,
                                      callback=self.parse_article)
             yield r
     # next urls
     try:
         next_url = soup.find(class_="next").a
         cat_url = response.url
         u = urlparse(cat_url)
         query = None
         # Strip the query part
         u = u._replace(query=query)
         follow_url = urlunparse(u) + next_url["href"]
         r = scrapy.Request(follow_url, callback=self.parse)
         yield r
     except AttributeError:
         logger.info("Done with".format(response.url))
         pass
Example #16
def _get_new_brunswick_flows(requests_obj):
    """
    Gets current electricity flows in and out of New Brunswick.

    There is no reported data timestamp in the page. The page returns
    current time and says "Times at which values are sampled may vary by
    as much as 5 minutes."
    """

    url = 'https://tso.nbpower.com/Public/en/SystemInformation_realtime.asp'
    response = requests_obj.get(url)

    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table', attrs={'bordercolor': '#191970'})

    rows = table.find_all('tr')

    headers = rows[1].find_all('td')
    values = rows[2].find_all('td')

    flows = {headers[i].text.strip(): float(row.text.strip())
             for i, row in enumerate(values)}

    return flows
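The header/value comprehension is easy to verify against a toy table shaped like the NB Power page (the markup below is an assumption).

from bs4 import BeautifulSoup

toy = '''<table bordercolor="#191970">
  <tr><td>title row</td></tr>
  <tr><td> ISO-NE </td><td> QUEBEC </td></tr>
  <tr><td> 145.0 </td><td> -60.5 </td></tr>
</table>'''

soup = BeautifulSoup(toy, 'html.parser')
rows = soup.find('table', attrs={'bordercolor': '#191970'}).find_all('tr')
headers = rows[1].find_all('td')
values = rows[2].find_all('td')
flows = {headers[i].text.strip(): float(row.text.strip()) for i, row in enumerate(values)}
print(flows)   # {'ISO-NE': 145.0, 'QUEBEC': -60.5}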
Example #17
def get_page_info(id_no, s=None):
    '''
    Extract restaurant information from Charlotte's health inspection website

    INPUT:  id_no = int, id # for ESTABLISHMENT
            s = request.Session(), [OPTIONAL] 
    OUTPUT: out = dict, establishment-level information
    '''
    if s is None:
        s = requests.Session()
    link = 'https://public.cdpehs.com/NCENVPBL/INSPECTION/ShowESTABLISHMENTPage.aspx'
    payload = {'ESTABLISHMENT':id_no, 'esttst_cty':60}
    z = s.get(link, params=payload)
    soup = BeautifulSoup(z.content, from_encoding='UTF-8')
    
    t = soup.findAll('table')[0]
    
    insp_info = np.array([y.text for y in t.findAll('td', attrs={'class':'ttc'})]).reshape(-1,4)
    
    if insp_info.shape[0] < 1:
        return None
    
    r = t.findAll('td', attrs={'class':'dfv'})
    rest_info = [x.text for x in r]
    
    return {'name'       :rest_info[0],
            'address'    :rest_info[2],
            'city'       :rest_info[8],
            'state'      :rest_info[9],
            'zip'        :rest_info[10],
            'type'       :rest_info[16],
            'county'     :rest_info[19],
            'inspections':insp_info}
Example #18
def htmlfile(url):
  r = urllib2.urlopen(url)
  soup = BeautifulSoup(r)
  
  html = []
  #html- title, css (body width 960px)
  html.append('<html><head><title>'+soup.title.string+'</title><link rel="stylesheet" type="text/css" href="page.css"></head><body>')
  
  # parse for content only in the article div - obviously depends on the site
  content =  soup.find('div', {'class': 'layout-block-a'})
  
  # get the html paragraphs and h1 headings - should be altered for each website's style
  for text in content.find_all(['p', 'h1']):
    if text.name == 'p':
      html.append(str(text).decode("ascii", "ignore"))
    else:
      html.append(str(text).decode("ascii", "ignore"))
    
  html.append('</body></html>')
    
  # creates html files here
  out = open(soup.title.string+'.html', 'a')
  for line in html:
    out.write(line)
  out.close()
  
if __name__ == '__main__':
  main()
Example #19
 def getWeibos(self, keyword,  page=1, count=None):
     url = 'http://t.hexun.com/k/topic.html?type=1&value=%s&pg=%d' % (json.dumps(keyword).replace('\\', '%').replace('"', ''), page)
     result = WeiboCrawler.request(self, url, self.headers)
     if 'result' in result and result['result']:
         infos = result['info'].decode('gb2312')
         soup = BeautifulSoup(infos)
         total_soup = soup.select('.headerR1')[0]
         total_num = total_soup.get_text().split('共')[-1].split('条')[0].strip()
         return_val = {'total_count': int(total_num), 'msgs':[]}
         allmsgs = []
         msgs_soup = soup.select('.nr_con')
         for msg_soup in msgs_soup:
             avatar =  'http://t.hexun.com%s' % msg_soup.select('.nr_conLa > a')[0].get('href')
             nickandtext = msg_soup.select('.nr_shuo')[0].get_text().split(':')
             nickname = nickandtext[0]
             text = nickandtext[1]
             ts = msg_soup.select('.nr_tan > h3 > a')[0].get_text()
             allmsgs.append({
                 'avatar': avatar,
                 'nickname': nickname,
                 'text': text,
                 'datetime': ts,
                 })
         return_val['msgs'] = allmsgs
         return return_val
Example #20
def getCategoryUrl(site="",url=""):
    catDb = openTable(tableName=global_setting['catTable'])
    r = session.get(url)
    if not r.text:
        return False

    soup = BeautifulSoup(r.text)
    for level1 in soup.select('.classify_books'):
        curLevel1 = level1.select('.classify_title')[0].text
        curLevel1 = re.sub('\s', '', curLevel1)
        for level2 in level1.select('.classify_kind'):
            curLevel2 = level2.select('.classify_kind_name')[0].text
            curLevel2 = re.sub('\s', '', curLevel2)
            for level3 in level2.select('ul li a'):
                #curLevel3 = re.sub('\s', '', level3.text)
                curLevel3 =  level3.text.strip()
                curlUrl = level3['href']
                retFind = re.findall(r'\/cp(.*)\.html',curlUrl)
                if retFind:
                    curCatID = retFind[0]
                    catType = 'book'
                else:
                    retFind = re.findall(r'\/cid(.*)\.html',curlUrl)
                    if retFind:
                        curCatID = retFind[0]
                        catType = 'nonbook'
                if retFind:
                    if catDb.find({'catId':curCatID}).count() >0:
                        logger.debug('category %s exists, skip\n'%(curCatID))
                    else:
                        catDb.insert({'catId':curCatID,'level1':curLevel1, 'level2':curLevel2, 'level3':curLevel3, 'catUrl':curlUrl,'catType':catType, 'site':site})
    return True
Example #21
def extract_images(base,html):
  images = []
  soup = BeautifulSoup(html)
  for img in soup.find_all("img"):
    if img.has_attr("src"):
      images.append(urljoin(base,img["src"]))
  return images
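This one is self-contained apart from its imports, so a quick usage check only needs a couple of lines (the sample markup and URLs are made up; on Python 3 urljoin lives in urllib.parse).

from bs4 import BeautifulSoup
from urllib.parse import urljoin

html = '<img src="/logo.png"><img alt="no src"><img src="http://cdn.example.com/a.jpg">'
print(extract_images('http://example.com/page', html))
# ['http://example.com/logo.png', 'http://cdn.example.com/a.jpg']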
Example #22
    def insert_push(self):
        uw = user_website.UserWebsite()
        userids = uw.get_user_ids_by_website_id(self.website_id)
        for id in userids:
            p = push.Push()
            p.website_id = self.website_id
            p.user_id = id
            p.title = "has new notice"

            soup_diff = BeautifulSoup(self.get_different())

            new_link_list =  soup_diff.find_all('a')

            new_link_count = len(new_link_list)

            if new_link_count == 1:
                content = "one notice is published:\n"
            else:
                content = str(new_link_count) + " notices are published:\n"

            content += self.get_different()


            p.content = content
            p.content = p.content.replace('"',"'")


            p.date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            p.website_id = self.website_id
            p.content_url = ""
            p.insert()
Example #23
def prettify(which, id):
    prefix = which[0]
    bs = BeautifulSoup(open(os.path.join(root,which, i+"-" + prefix + ".xml")), 'xml')
    sgm = i + "-" + prefix + ".sgm"
    out = bs.prettify(encoding='utf-8')
    [first, rest] = out.split("\n",1)
    return rest.replace(sgm, i) # the ID in the files look like "atwoma-b.sgm" rather than "atwoma"
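The split("\n", 1) exists to drop the XML declaration that prettify() emits; a tiny in-memory document shows the shape of the output. Note that with an encoding argument prettify() returns bytes, so on Python 3 the split/replace arguments have to be bytes as well, and the 'xml' features string needs lxml installed.

from bs4 import BeautifulSoup

bs = BeautifulSoup('<doc id="atwoma-b.sgm"><p>text</p></doc>', 'xml')
out = bs.prettify(encoding='utf-8')
first, rest = out.split(b"\n", 1)
print(first)     # typically b'<?xml version="1.0" encoding="utf-8"?>'
print(rest.replace(b"atwoma-b.sgm", b"atwoma").decode('utf-8'))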
Example #24
def Get_All_Teams():
    data_path = '../data/'
    # get the teams
    url = 'http://espn.go.com/nba/teams'
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html, 'lxml')
    # print (soup.prettify())
    tables = soup.find_all('ul', class_ = 'medium-logos')

    tables[0].find_all('li')[0].h5.a

    name_pref_Tuples = []
    city_name_Dict = {}
    for table in tables:
        lis = table.find_all('li')
        for li in lis:
            info = li.h5.a
            team_url = info['href']
            team_name = info.text
            pref = team_url.split('/')[-2]
            city_name = ' '.join(info.text.split()[:-1])
            if team_name == 'Portland Trail Blazers':
                city_name = 'Portland'
            city_name_Dict[city_name] = team_name
            name_pref_Tuples.append((team_name, pref))

    print 'output two files: city_name.pickle and name_pref.pickle'
    print 'city_name.pickle is a dict with (city, team_name) pairs'
    print 'name_pref.pickle is a list of (team_name, team_name_prefix) tuples'
    pk.dump(city_name_Dict, open(data_path + 'city_name.pickle', 'wb'))
    pk.dump(name_pref_Tuples, open(data_path + 'name_pref.pickle', 'wb'))
Example #25
def get_text_from_html(html_text):
    """Returns the content part from an HTML document
    retains links and references to images and line breaks.
    """
    soup = BeautifulSoup(html_text, 'html5lib')

    # replace <a> links with plain text
    links = soup.find_all('a')
    for link in links:
        url = link.get('href', '')
        text = ''.join(link.text) or ''
        link.replaceWith(format_url_replacement(url, text))

    # replace <img> tags with plain text
    images = soup.find_all('img')
    for image in images:
        url = image.get('src', '')
        text = image.get('alt', '')
        image.replaceWith(format_url_replacement(url, text))

    # extract and join phrases
    body_element = soup.find('body')
    filter_func = lambda s: bool(s.strip())
    phrases = map(
        lambda s: s.strip(),
        filter(filter_func, body_element.get_text().split('\n'))
    )
    return '\n\n'.join(phrases)
Example #26
 def _login(self, username=None, store_password=False):
     if username is None:
         if self.USERNAME == "":
             raise LoginError("If you do not pass a username to login(), you should configure a default one!")
         else:
             username = self.USERNAME
     # Get password from keyring or prompt
     password_from_keyring = keyring.get_password("astroquery:www.eso.org", username)
     if password_from_keyring is None:
         if system_tools.in_ipynb():
             log.warn("You may be using an ipython notebook:"
                      " the password form will appear in your terminal.")
         password = getpass.getpass("{0}, enter your ESO password:\n".format(username))
     else:
         password = password_from_keyring
     # Authenticate
     log.info("Authenticating {0} on www.eso.org...".format(username))
     # Do not cache pieces of the login process
     login_response = self._request("GET", "https://www.eso.org/sso/login", cache=False)
     login_result_response = self._activate_form(login_response,
                                                 form_index=-1,
                                                 inputs={'username': username,
                                                         'password': password})
     root = BeautifulSoup(login_result_response.content, 'html5lib')
     authenticated = not root.select('.error')
     if authenticated:
         log.info("Authentication successful!")
     else:
         log.exception("Authentication failed!")
     # When authenticated, save password in keyring if needed
     if authenticated and password_from_keyring is None and store_password:
         keyring.set_password("astroquery:www.eso.org", username, password)
     return authenticated
Example #27
def get_visible_text(html):
    """returns visible text from html
    http://stackoverflow.com/a/19760007/110274
    """
    soup = BeautifulSoup(html, 'html5lib')
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    return soup.get_text()
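A quick check, with the function above in scope and html5lib installed; the sample markup is made up.

html = '''<html><head><title>t</title><style>p {color: red}</style></head>
<body><p>Hello</p> <script>var x = 1;</script> <p>world</p></body></html>'''
print(get_visible_text(html))
# prints the paragraph text only; the title, style rules and script body are stripped out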
Example #28
def get_games(date, output_file=None):

    # games_url = base + '/scoreboard/' + format_date(date) + '/games.json'
    games_url = si_base + 'schedule'
    #print format_date(date)

    result = requests.get(games_url, params={'date': format_date(date)})

    #print games_url + format_date(date)

    soup = BeautifulSoup(result.text)

    #date_string = date.strftime('%B %d,%Y')

    games = soup.find_all('tr', 'component-scoreboard-list final')

    game_ids = []

    for game in games:
        game_date_elem = game.find('div', 'game-anchor')
        game_date_text = game_date_elem['id']
        game_date = date_parser.parse(game_date_text).date()
        if game_date == date:
            game_id = int(game['data-id'])
            game_ids.append(game_id)

    if output_file is not None:
        of = open(output_file, 'w')
        of.write(json.dumps({'game_date': format_date(date), 'game_ids': game_ids}))
        of.close()

    return game_ids
Example #29
def getMoviesActors(movieList):
    """

    :param A list containing formatted movie list
    :return: A list containing ID of the movie and all actors in that movie including actors ID
    """
    actorsInMovies = {}

    for x in movieList:
        req = urllib.request.Request(BASE_URL+movieList[x]["Url"]+"/fullcredits")
        #print(req.full_url)
        # Header is necessary to get the right movie titles, as in the english title
        req.add_header('Accept-Language', 'en-US,en')
        # Send the request and get response
        response = urllib.request.urlopen(req)

        bsoup = BeautifulSoup(response)

        findCastList = bsoup.find("table", {"class": "cast_list"})

        findAllActors = findCastList.findAll("td", itemprop="actor")

        actors = {}
        for d in findAllActors:
            actorName = d.find("span", itemprop="name")
            actorNumber = d.find("a", href=re.compile("\/name\/nm"))
            actorID = re.match("(?:\/name\/nm)(?P<userid>\d+)", actorNumber["href"]).group("userid")
            actors[actorID] = actorName.contents[0]

        actorsInMovies[movieList[x]["ID"]] = actors

    return actorsInMovies
Example #30
    def get_Comics(self, name, comic_url):
        if not self.mkdir(name):
            again = ''
            while (1):
                again = str(input('Directory ' + name + ' already exists, do you wanna to download again? (Y/N)'))
                if again == 'Y' or again == 'N':
                    break
            if again == 'N':
                print('Folder \'BLEACH/' + name + '\' already exists!')
                return
            else:
                shutil.rmtree(self.path)
                self.mkdir(name)

        # Parse html
        page_url = self.prefix + comic_url
        data = urllib.request.urlopen(page_url).read().decode('utf-8', 'ignore')
        data.encode('utf-8')
        soup = BeautifulSoup(data, 'lxml')
        lists = soup.findAll('img', {'class': 'BDE_Image'})

        print('Downloading: ' + name)
        # Define progress bar's length
        progress_bar = tqdm(unit='Pic', total=len(lists))
        count = 0

        for each in lists:
            pic_url = each['src']
            filename = '%03d' % count + '.' + pic_url.split('.')[-1]
            urllib.request.urlretrieve(pic_url, filename = self.path + '/' + filename)
            progress_bar.update(1)
            count = count + 1

        # Close bar
        progress_bar.close()