def test_invalid_type_input_select(self):
    """Test bad input into the select API."""
    flags = sv.DEBUG
    with self.assertRaises(TypeError):
        sv.select('div', "not a tag", flags=flags)
def test_scope_is_select_target(self):
    """Test that scope is the element which scope is called on."""
    for parser in ('html.parser', 'lxml', 'html5lib', 'xml'):
        soup = self.soup(self.MARKUP, parser)
        el = soup.html

        # Scope here means the current element under select.
        ids = []
        for el in sv.select(':scope div', el, flags=sv.DEBUG):
            ids.append(el.attrs['id'])
        self.assertEqual(sorted(ids), sorted(['div']))

        el = soup.body
        ids = []
        for el in sv.select(':scope div', el, flags=sv.DEBUG):
            ids.append(el.attrs['id'])
        self.assertEqual(sorted(ids), sorted(['div']))

        # `div` is the current element under select, and it has no `div` elements.
        el = soup.div
        ids = []
        for el in sv.select(':scope div', el, flags=sv.DEBUG):
            ids.append(el.attrs['id'])
        self.assertEqual(sorted(ids), sorted([]))

        # `div` does have an element with the class `.wordshere`.
        ids = []
        for el in sv.select(':scope .wordshere', el, flags=sv.DEBUG):
            ids.append(el.attrs['id'])
        self.assertEqual(sorted(ids), sorted(['pre']))
def test_select_order(self):
    """Test select order."""
    markup = """
    <!-- before header -->
    <html>
    <head>
    </head>
    <body>
    <!-- comment -->
    <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
    <pre id="4"></pre>
    <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
    <pre id="6" class='ignore'>
        <!-- don't ignore -->
    </pre>
    </body>
    </html>
    """
    # ID `5` should come before ID `some-id`.
    soup = self.soup(markup, 'html5lib')
    span = sv.select('span[id]', soup)[0]
    ids = []
    for el in sv.select('span[id]:not(#some-id)', span.parent):
        ids.append(el.attrs['id'])
    self.assertEqual(sorted(['5']), sorted(ids))
def test_scope_is_select_target(self):
    """Test that scope is the element which scope is called on."""
    for parser in util.available_parsers('html.parser', 'lxml', 'html5lib', 'xml'):
        soup = self.soup(self.MARKUP, parser)
        el = soup.html

        # Scope here means the current element under select.
        ids = []
        for el in sv.select(':scope div', el, flags=sv.DEBUG):
            ids.append(el.attrs['id'])
        self.assertEqual(sorted(ids), sorted(['div']))

        el = soup.body
        ids = []
        for el in sv.select(':scope div', el, flags=sv.DEBUG):
            ids.append(el.attrs['id'])
        self.assertEqual(sorted(ids), sorted(['div']))

        # `div` is the current element under select, and it has no `div` elements.
        el = soup.div
        ids = []
        for el in sv.select(':scope div', el, flags=sv.DEBUG):
            ids.append(el.attrs['id'])
        self.assertEqual(sorted(ids), sorted([]))

        # `div` does have an element with the class `.wordshere`.
        ids = []
        for el in sv.select(':scope .wordshere', el, flags=sv.DEBUG):
            ids.append(el.attrs['id'])
        self.assertEqual(sorted(ids), sorted(['pre']))
def test_select(self):
    """Test select."""
    markup = """
    <!-- before header -->
    <html>
    <head>
    </head>
    <body>
    <!-- comment -->
    <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
    <pre id="4"></pre>
    <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
    <pre id="6" class='ignore'>
        <!-- don't ignore -->
    </pre>
    </body>
    </html>
    """
    soup = bs4.BeautifulSoup(markup, 'html5lib')

    ids = []
    for el in sv.select('span[id]', soup):
        ids.append(el.attrs['id'])
    self.assertEqual(sorted(['5', 'some-id']), sorted(ids))

    ids = []
    for el in sv.select('span[id]', soup, limit=1):
        ids.append(el.attrs['id'])
    self.assertEqual(sorted(['5']), sorted(ids))

    self.assertEqual(
        sv.select('span[id]', soup, limit=1)[0].attrs['id'],
        sv.select_one('span[id]', soup).attrs['id'])

    self.assertEqual(None, sv.select_one('h1', soup))

    ids = []
    for el in sv.iselect('span[id]', soup):
        ids.append(el.attrs['id'])
    self.assertEqual(sorted(['5', 'some-id']), sorted(ids))

    span = sv.select('span[id]', soup)[0]
    ids = []
    for el in sv.select('span[id]:not(#some-id)', span.parent):
        ids.append(el.attrs['id'])
    self.assertEqual(sorted(['5']), sorted(ids))
def test_invalid_type_input(self):
    """Test bad input into the API."""
    with self.assertRaises(TypeError):
        sv.match('div', "not a tag")

    with self.assertRaises(TypeError):
        sv.select('div', "not a tag")

    with self.assertRaises(TypeError):
        sv.filter('div', "not a tag")

    with self.assertRaises(TypeError):
        sv.comments('div', "not a tag")
def test_leading_combinator_quirks(self):
    """Test scope with quirks."""
    markup = """
    <html id="root">
    <head>
    </head>
    <body>
    <div id="div">
    <p id="0" class="somewordshere">Some text <span id="1"> in a paragraph</span>.</p>
    <a id="2" href="http://google.com">Link</a>
    <span id="3" class="herewords">Direct child</span>
    <pre id="pre" class="wordshere">
    <span id="4">Child 1</span>
    <span id="5">Child 2</span>
    <span id="6">Child 3</span>
    </pre>
    </div>
    </body>
    </html>
    """
    soup = self.soup(markup, 'html5lib')
    el = soup.div
    ids = []
    for el in sv.select('> span, > #pre', el, flags=sv.DEBUG | sv._QUIRKS):
        ids.append(el.attrs['id'])
    self.assertEqual(sorted(ids), sorted(['3', 'pre']))
from datetime import datetime as dt

import pandas as pd
import requests
import soupsieve as sv
from bs4 import BeautifulSoup


def scrape(stock_to_pull):
    base_url = ("https://finance.yahoo.com/quote/" + stock_to_pull +
                "/options?p=" + stock_to_pull + "&straddle=false")
    res = requests.get(base_url)
    soup = BeautifulSoup(res.text, features="lxml")
    call_table = soup.find("table", {"class": "calls"})
    call_date = dt.strptime(
        call_table.parent.parent.previous_sibling.select('span')[4].text,
        '%B %d, %Y').date()
    calls = sv.select('tr:has(> td)', call_table)
    rows = []
    for call in calls:
        call_det = {}
        call_det["stock_name"] = stock_to_pull
        call_det["expiry_date"] = call_date
        call_det["strike_price"] = call.find("td", {"class": "data-col2"}).text
        call_det["last_price"] = call.find("td", {"class": "data-col3"}).text
        call_det["bid_price"] = call.find("td", {"class": "data-col4"}).text
        call_det["ask_price"] = call.find("td", {"class": "data-col5"}).text
        call_det["chng_price"] = call.find("td", {"class": "data-col6"}).text
        call_det["perc_chng"] = call.find("td", {"class": "data-col7"}).text
        call_det["volume"] = call.find("td", {"class": "data-col8"}).text
        call_det["opn_int"] = call.find("td", {"class": "data-col9"}).text
        call_det["imp_vol"] = call.find("td", {"class": "data-col10"}).text
        if "in-the-money" in call["class"]:
            call_det["itm"] = "Y"
        else:
            call_det["itm"] = "N"
        # `eastern` is assumed to be a timezone defined elsewhere,
        # e.g. pytz.timezone('US/Eastern').
        call_det["curr_time"] = eastern.localize(
            dt.now()).strftime('%Y-%m-%d %H:%M:%S')
        rows.append(call_det)
    # `DataFrame.append` was removed in pandas 2.0; build the frame from a
    # list of row dicts instead.
    calls_df = pd.DataFrame(rows)
    return calls_df
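# A minimal sketch of driving the scraper above; "AAPL" is just an example
# ticker, and Yahoo's markup or rate limiting may break this at any time.
if __name__ == "__main__":
    options_df = scrape("AAPL")
    print(options_df.head())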
def test_select_one(self):
    """Test select one."""
    markup = """
    <!-- before header -->
    <html>
    <head>
    </head>
    <body>
    <!-- comment -->
    <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
    <pre id="4"></pre>
    <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
    <pre id="6" class='ignore'>
        <!-- don't ignore -->
    </pre>
    </body>
    </html>
    """
    soup = self.soup(markup, 'html5lib')
    self.assertEqual(
        sv.select('span[id]', soup, limit=1)[0].attrs['id'],
        sv.select_one('span[id]', soup).attrs['id']
    )
def assert_selector(self, markup, selectors, expected_ids, namespaces={}, flags=0):
    """Assert selector."""
    mode = flags & 0x0F
    if mode == HTML:
        bs_mode = 'lxml'
    elif mode in (HTML5, 0):
        bs_mode = 'html5lib'
    elif mode in (XHTML, XML):
        bs_mode = 'xml'
    soup = bs4.BeautifulSoup(textwrap.dedent(markup.replace('\r\n', '\n')), bs_mode)
    ids = []
    for el in sv.select(selectors, soup, namespaces=namespaces, flags=sv.DEBUG):
        print('TAG: ', el.name)
        ids.append(el.attrs['id'])
    self.assertEqual(sorted(ids), sorted(expected_ids))
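# A hypothetical call, inside a test class that mixes in `assert_selector` and
# defines the mode flag constants used above (HTML, HTML5, XHTML, XML):
#
#     self.assert_selector(
#         '<div><p id="1"></p></div>',
#         'div > p',
#         ['1'],
#         flags=HTML5
#     )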
def test_select_limit(self):
    """Test select limit."""
    markup = """
    <!-- before header -->
    <html>
    <head>
    </head>
    <body>
    <!-- comment -->
    <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
    <pre id="4"></pre>
    <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
    <pre id="6" class='ignore'>
        <!-- don't ignore -->
    </pre>
    </body>
    </html>
    """
    soup = self.soup(markup, 'html5lib')
    ids = []
    for el in sv.select('span[id]', soup, limit=1):
        ids.append(el.attrs['id'])
    self.assertEqual(sorted(['5']), sorted(ids))
def StartApp():
    my_app = QApplication(sys.argv)
    database = Database.GameDatabase()
    network_html = Network.request_webpage(
        "https://pcgamingwiki.com/wiki/Life_Is_Strange")
    main_widget = GUI.QtWindow(database)

    p = Path("F:\\", "Spel", "SteamLibrary", "steamapps", "common")
    database.addLibraryPath(
        "Steam", "Steam",
        Path("F", "Spel", "SteamLibrary", "steamapps", "common"))
    p = Path("C:\\Users\\fough\\Documents\\my games\\Life Is Strange\\Saves")
    # database.addLibraryPath("UserProfile", "%USERPROFILE%", p)

    main_widget.show()

    soup = BeautifulSoup(network_html, features='html.parser')
    # Alternate parser for HTML:
    # soup = BeautifulSoup(mystr, features='html5lib')

    # Selector used to pick the tables containing game data on the page.
    selector = ('h3 ~ div.container-pcgwikitable '
                'table#table-gamedata.pcgwikitable.template-infotable')
    table_html = soupsieve.select(selector, soup)
    # `collections.Mapping` was removed in Python 3.10; use `collections.abc.Mapping`.
    if isinstance(table_html, (collections.abc.Mapping, list)):
        for t in table_html:
            d = Network.getDictFromHTMLTable(t)
            for doc in d:
                database.db.insert(doc)
    sys.exit(my_app.exec_())
def test_select_order(self):
    """Test select order."""
    markup = """
    <!-- before header -->
    <html>
    <head>
    </head>
    <body>
    <!-- comment -->
    <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
    <pre id="4"></pre>
    <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
    <pre id="6" class='ignore'>
        <!-- don't ignore -->
    </pre>
    </body>
    </html>
    """
    soup = self.soup(markup, 'html.parser')
    ids = []
    for el in sv.select('[id]', soup.body):
        ids.append(el.attrs['id'])
    self.assertEqual(['1', '2', '3', '4', '5', 'some-id', '6'], ids)
def test_scope_cannot_select_target(self):
    """Test that scope, the element which scope is called on, cannot be selected."""
    for parser in ('html.parser', 'lxml', 'html5lib', 'xml'):
        soup = self.soup(self.MARKUP, parser)
        el = soup.html

        # Scope is the element we are applying the select to, and that element is never returned.
        self.assertTrue(len(sv.select(':scope', el, flags=sv.DEBUG)) == 0)
def get_articles(soup):
    slc1 = 'article:has( p)'
    a1 = soup.select_one(slc1)
    slc2 = 'div:has(> [itemprop="articleBody"]:has( p, div, a))'
    a2 = soup.select_one(slc2)
    slc3 = '''div:is([class*="article"], [class*="head"], [class^="article__header"]),
              div[class^="article_block"]:has(div[class^="article__text"])'''
    a3 = soup.select(slc3)

    if a1:
        return sv.select(slc1, soup)
    elif a2:
        s1 = sv.select(
            f'{slc2} [itemprop="headline"], [itemprop="articleBody"] :has(>p)',
            soup)
        return s1
    elif a3:
        return sv.select(
            'div:is([class*="article"], [class*="head"], [class^="article__header"])',
            soup)
    else:
        # `headers` is assumed to be a module-level selector list defined
        # elsewhere, e.g. 'h1, h2, h3'.
        return sv.select(
            f'''#div_postbody *, :has(#div_postbody) > :is({headers}),
                [itemprop="headline"], .content *''',
            soup)
def test_scope_cannot_select_target(self):
    """Test that scope, the element which scope is called on, cannot be selected."""
    for parser in util.available_parsers('html.parser', 'lxml', 'html5lib', 'xml'):
        soup = self.soup(self.MARKUP, parser)
        el = soup.html

        # Scope is the element we are applying the select to, and that element is never returned.
        self.assertTrue(len(sv.select(':scope', el, flags=sv.DEBUG)) == 0)
def test_iframe(self):
    """
    Test that we only count `iframe` as root since the scoped element is the root.

    Not all the parsers treat `iframe` content the same. `html5lib`, for instance,
    will escape the content in the `iframe`, so we are just going to test the
    built-in Python parser.
    """
    soup = self.soup(self.MARKUP_IFRAME, 'html.parser')
    ids = []
    for el in sv.select(':root div', soup.iframe.html):
        ids.append(el['id'])
    self.assertEqual(sorted(ids), sorted(['div2']))

    ids = []
    for el in sv.select(':root > body > div', soup.iframe.html):
        ids.append(el['id'])
    self.assertEqual(sorted(ids), sorted(['div2']))
import re

import requests
import soupsieve as sv
from bs4 import BeautifulSoup


def scrape(city):
    city_url_id = re.sub(r'[\s,]+', "-", city)
    city_url = ("https://www.tide-forecast.com/locations/" +
                city_url_id + "/tides/latest")
    r = requests.get(url=city_url)
    soup = BeautifulSoup(r.text, 'html.parser')

    curr_date = ""
    timeofday = ""
    for el in sv.select(".tide-table > tr", soup):
        dateInst = sv.select_one(".date", el)
        if dateInst is not None:
            curr_date = dateInst.text.strip()

        tide_time = ""
        tide_time_inst = sv.select_one(".time", el)
        if tide_time_inst is not None:
            tide_time = tide_time_inst.text.strip()

        timezone = ""
        timezone_inst = sv.select_one(".time-zone", el)
        # Bug fix: the original checked `tide_time_inst` here, which raised
        # AttributeError on rows with a `.time` cell but no `.time-zone` cell.
        if timezone_inst is not None:
            timezone = timezone_inst.text.strip()

        level = ""
        level_inst = sv.select_one(".level", el)
        if level_inst is not None:
            level = level_inst.text.strip()

        tide_phase = ""
        tide_phase_inst = sv.select_one(".tide:last-child", el)
        if tide_phase_inst is not None:
            tide_phase = tide_phase_inst.text.strip()
        else:
            timeofday_inst = sv.select_one("td:last-child", el)
            if timeofday_inst is not None:
                timeofday_val = timeofday_inst.text.strip()
                timeofday = timeofday_val if timeofday_val in ("Sunrise", "Sunset") else ""

        if tide_phase == "Low Tide" and timeofday in ("Sunrise", "Sunset"):
            print('{0} {1} {2} {3} {4}'.format(
                city, curr_date, tide_time, timezone, level))
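# Example invocation; the location name is illustrative and is slugified by
# the regex above (runs of whitespace and commas become dashes).
if __name__ == "__main__":
    scrape("Half Moon Bay, California")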
def test_parent_nth_of_type_preconditions(self):
    """Test `nth` type preconditions."""
    els = sv.select('div > h1', self.soup)

    # Check that there is a unique selection.
    self.assertEqual(len(els), 1)
    self.assertEqual(els[0].string, 'An H1')

    # Show that the `h1`'s parent `div#inner` is the first child of type `div` of
    # the grandparent `div#main`, so the selector `div:nth-of-type(1) > h1` should
    # also give `h1`.
    h1 = els[0]
    div_inner = h1.parent
    div_main = div_inner.parent
    div_main_children = [child for child in div_main.children]
    self.assertEqual(div_main_children[0], '\n')
    self.assertEqual(div_main_children[1], div_inner)
from datetime import datetime as dt

import requests
import soupsieve as sv
from bs4 import BeautifulSoup


def main():
    stock_to_pull = "RDFN"
    base_url = "https://seekingalpha.com/symbol/" + stock_to_pull
    res = requests.get(base_url)
    soup = BeautifulSoup(res.text, features="lxml")
    article_list = soup.find("ul", {"id": "symbol-page-latest"})
    articles = sv.select('li', article_list)
    for article in articles:
        news_det = {}
        news_det["headline"] = article.find(
            "div", {"class": "symbol_article"}).text
        spans = article.find(
            "div", {"class": "date_on_by"}).findChildren("span", {"class": ""})
        for span in spans:
            if span.text != "SA News":
                news_det["release_date"] = span.text
        # `eastern` is assumed to be a timezone defined elsewhere,
        # e.g. pytz.timezone('US/Eastern').
        news_det["curr_time"] = eastern.localize(
            dt.now()).strftime('%Y-%m-%d %H:%M:%S')
        print(news_det)
        print('\n\n')
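# The script above only prints each article dict; a simple entry point is
# enough to drive it.
if __name__ == "__main__":
    main()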
def test_match(self):
    """Test matching."""
    markup = """
    <!-- before header -->
    <html>
    <head>
    </head>
    <body>
    <!-- comment -->
    <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
    <pre id="4"></pre>
    <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
    <pre id="6" class='ignore'>
        <!-- don't ignore -->
    </pre>
    </body>
    </html>
    """
    soup = self.soup(markup, 'html5lib')
    nodes = sv.select('span[id]', soup)
    self.assertTrue(sv.match('span#\\35', nodes[0]))
    self.assertFalse(sv.match('span#\\35', nodes[1]))
import re
import sys

import requests
import soupsieve as sv
from bs4 import BeautifulSoup

# The original snippet begins mid-branch; the guard and `artist = sys.argv[1]`
# are assumed from the `else` defaults below.
if len(sys.argv) > 3:
    artist = sys.argv[1]
    artist_id = sys.argv[2]
    search_term = sys.argv[3]
else:
    artist = 'Spice-Girls'
    artist_id = 199833
    search_term = 'good sheep'

print(str(artist_id) + ' : ' + artist)
print('Search for : ' + search_term)

url = ('https://www.lyrics.com/artist.php?name=' + artist +
       '&aid=' + str(artist_id) + '&o=1')
page_source = requests.get(url).text
beautiful_soap_content = BeautifulSoup(page_source, "lxml")

for song in sv.select('tr', sv.select_one('tbody', beautiful_soap_content)):
    song_element = sv.select_one('a', song)
    print('\n\nSong Title : ' + song_element.text)
    song_url = 'https://www.lyrics.com' + song_element.get('href')
    print('Song URL : ' + song_url + '\n')
    song_page_source = requests.get(song_url).text
    song_page_content = BeautifulSoup(song_page_source, "lxml")
    # print('Song Lyrics')
    song_lyrics = sv.select_one('pre', song_page_content).text
    print(song_lyrics)
    # if search_term in song_lyrics:
    #     break
    if re.search(search_term, song_lyrics, re.IGNORECASE):
        print(search_term + ' Found On ' + song_element.text)
def googleTranslate(self):
    # def googleTranslate(self, translateString):
    # Google Translate
    # driver = self.driver
    # wait = self.wait
    #
    # driver.get('https://translate.google.com/#auto|ko|{}'.format(translateString))
    # driver.find_element_by_css_selector('textarea#source.orig.tlid-source-text-input.goog-textarea').send_keys(translateString)
    # wait.until(expected.visibility_of_element_located((By.CSS_SELECTOR, 'span.tlid-translation.translation span')))
    # html = driver.page_source
    # driver.find_element_by_css_selector('textarea#source.orig.tlid-source-text-input.goog-textarea').clear()
    #
    # driver.implicitly_wait(5)

    with open('./string.html', 'r', encoding='utf-8') as f:
        html = f.read()

    # Split out the translation pane.
    select1 = str(sv.select_one('div.homepage-content-wrap',
                                BeautifulSoup(html, 'html.parser')))

    with open('./test.html', 'w', encoding='utf-8') as f:
        f.write(select1)

    # data1
    resultString1 = ''
    select4 = sv.select_one('span.tlid-translation.translation',
                            BeautifulSoup(select1, 'html.parser'))
    resultString1 = str(select4).replace('<br/>', '<span>\n</span>')
    resultString1 = BeautifulSoup(resultString1, 'html.parser').text
    print(resultString1)

    # data2
    resultString2 = ''
    if '<div class="gt-cd gt-cd-mbd gt-cd-baf" style="display: none;"' in select1:
        pass
    elif '<div class="gt-cd gt-cd-mbd gt-cd-baf" style=""' in select1:
        select2 = str(sv.select_one('table.gt-baf-table',
                                    BeautifulSoup(select1, 'html.parser')))
        select3 = sv.select(
            'span.gt-cd-pos, span.gt-baf-cell.gt-baf-word-clickable',
            BeautifulSoup(select2, 'html.parser'))
        for data in select3:
            strData = str(data)
            if 'gt-baf-cell' in strData:
                resultString2 += "{}, ".format(data.text)
            elif 'gt-cd-pos' in strData:
                resultString2 = resultString2.rstrip(", ")
                resultString2 += "\n* {} *\n: ".format(data.text)
        resultString2 = resultString2.lstrip("\n")
        # resultString2 = resultString2.rstrip(", ")
    print(resultString2)

    # # print(html)
    # select11 = sv.select_one('span.tlid-translation.translation', BeautifulSoup(select1, 'html.parser'))
    # tmp1 = str(select11).replace('<br/>', '<span>\n</span>')
    # tmp1 = BeautifulSoup(tmp1, 'html.parser').text
    # print(tmp1)
    # resultString2 = ''
    # if('<div class="gt-lc gt-lc-mobile" style="display: none;">' in select1):
    #     pass
    # elif('<div class="gt-lc gt-lc-mobile" style="">' in select1):
    #     select2 = str(sv.select_one('table.gt-baf-table', BeautifulSoup(select1, 'html.parser')))
    #     select3 = sv.select('span.gt-cd-pos, span.gt-baf-cell.gt-baf-word-clickable', BeautifulSoup(select2, 'html.parser'))
    #     for data in select3:
    #         strData = str(data)
    #         if('gt-baf-cell' in strData):
    #             resultString2 += "{}, ".format(data.text)
    #         elif('gt-cd-pos' in strData):
    #             resultString2 = resultString2.rstrip(", ")
    #             resultString2 += "\n* {} *\n: ".format(data.text)
    #     resultString2 = resultString2.lstrip("\n")
    #     resultString2 = resultString2.rstrip(", ")
    #     print(resultString2)
    # return {"data1": resultString1, "data2": resultString2}
    return
def test_parent_nth_of_type(self):
    """Test parent of `nth` of type."""
    els = sv.select('div:nth-of-type(1) > h1', self.soup)
    self.assertEqual(len(els), 1)
    self.assertEqual(els[0].string, 'An H1')
from typing import List, Any, Union

import soupsieve as sv
from bs4 import BeautifulSoup, Tag
from selenium import webdriver
from selenium.webdriver.chrome.webdriver import WebDriver

chromeWebDriver: WebDriver = webdriver.Chrome("C:/Programs/chromedriver.exe")
chromeWebDriver.get("https://soundcloud.com/search/sounds?q=old%20party")
# `page_source` is always a string; the original auto-generated Union
# annotation was wrong.
pageSource: str = chromeWebDriver.page_source
beautifulSoapContent: BeautifulSoup = BeautifulSoup(pageSource, "lxml")

# soundBodies: List[Any] = sv.select('div:is(.sound__body)', beautifulSoapContent)
soundBody: Tag
for soundBody in sv.select('div.sound__body', beautifulSoapContent):
    # print("sound__body Element : " + str(soundBody) + "\n")
    # coverArtElement = sv.select_one('a.sound__coverArt', soundBody)
    # print("sound__coverArt Element : " + str(coverArtElement))
    # print("Track Page : https://soundcloud.com" + coverArtElement.get('href'))
    # # TODO: Wait for cover arts to load.
    # # coverArtUrl = get_backgroundImage_url(
    # #     get_first_span_element_by_custom_attribute(coverArtElement, 'aria-role', 'img'))  # type: str
    # # print("Cover Art Url : " + coverArtUrl)
    contentElement = sv.select_one('div.sound__content', soundBody)
    # print("sound__content Element : " + str(contentElement))
import soupsieve as sv
from bs4 import BeautifulSoup
from dateutil import parser


def scrape_transfers2(page):
    page = BeautifulSoup(open(page, "r"), "html.parser")
    date = page.select("div.box h2")[0].text.replace("Transfer on ", "")
    parsed_date = parser.parse(date)
    timestamp = parsed_date.timestamp()

    if parsed_date.month >= 7:
        season = "{start}/{end}".format(start=parsed_date.year, end=parsed_date.year + 1)
    else:
        season = "{start}/{end}".format(start=parsed_date.year - 1, end=parsed_date.year)

    for row in page.select("div#yw1 table.items tbody > tr"):
        columns = sv.select(":scope > td", row)
        if len(columns) == 7:
            fee_element = columns[6].select("a")[0]
            if fee_element.text == "Free Transfer":
                break

            player_image_element = columns[0].select("img")[0]
            player_element = columns[0].select("a.spielprofil_tooltip")[0]
            player_age = columns[1].text
            player_position = columns[0].select("table tr td")[-1].text

            from_club_elements = columns[3].select("td.hauptlink a.vereinprofil_tooltip")
            from_club_href = from_club_elements[0]["href"] if len(from_club_elements) > 0 else ""
            from_club_text = from_club_elements[0].text if len(from_club_elements) > 0 else ""
            from_club_image = columns[3].select("td:first-child img")[0]["src"]
            from_country_elements = columns[3].select("table tr td")[-1]
            from_club_country_image = from_country_elements.select("img")
            from_club_country = from_club_country_image[0]["title"] if len(from_club_country_image) > 0 else ""
            from_club_league_elements = from_country_elements.select("a")
            from_club_league = from_club_league_elements[0]["title"] if len(from_club_league_elements) > 0 else ""
            from_club_league_href = from_club_league_elements[0]["href"] if len(from_club_league_elements) > 0 else ""

            to_club_elements = columns[4].select("td.hauptlink a.vereinprofil_tooltip")
            to_club_href = to_club_elements[0]["href"] if len(to_club_elements) > 0 else ""
            to_club_text = to_club_elements[0].text if len(to_club_elements) > 0 else ""
            to_club_image = columns[4].select("td:first-child img")[0]["src"]
            to_country_elements = columns[4].select("table tr td")[-1]
            to_club_country_image = to_country_elements.select("img")
            to_club_country = to_club_country_image[0]["title"] if len(to_club_country_image) > 0 else ""
            to_club_league_elements = to_country_elements.select("a")
            to_club_league = to_club_league_elements[0]["title"] if len(to_club_league_elements) > 0 else ""
            to_club_league_href = to_club_league_elements[0]["href"] if len(to_club_league_elements) > 0 else ""

            nationality = columns[2].select("img")[0]["title"]

            yield {
                "season": season,
                "player": {
                    "href": player_element["href"],
                    "name": player_element.text,
                    "position": player_position,
                    "age": player_age,
                    "image": player_image_element["src"],
                    "nationality": nationality,
                },
                "from": {
                    "href": from_club_href,
                    "name": from_club_text,
                    "country": from_club_country,
                    "countryImage": from_club_country_image[0]["src"] if len(from_club_country_image) > 0 else "",
                    "league": from_club_league,
                    "leagueHref": from_club_league_href,
                    "image": from_club_image,
                },
                "to": {
                    "href": to_club_href,
                    "name": to_club_text,
                    "country": to_club_country,
                    "countryImage": to_club_country_image[0]["src"] if len(to_club_country_image) > 0 else "",
                    "league": to_club_league,
                    "leagueHref": to_club_league_href,
                    "image": to_club_image,
                },
                "transfer": {
                    "href": fee_element["href"],
                    "value": fee_element.text,
                    "timestamp": int(timestamp),
                },
            }
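# A sketch of consuming the generator above; "transfers.html" is a hypothetical
# path to a saved transfermarkt transfer-day page.
if __name__ == "__main__":
    for transfer in scrape_transfers2("transfers.html"):
        print(transfer["player"]["name"], "->", transfer["to"]["name"])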
def interpret_scraper(scraper, element, root=None, context=None, path=[], scope=None):
    if scope is None:
        scope = EvaluationScope()

    # Is this a tail call of item?
    if isinstance(scraper, str):
        if scraper in EXTRACTOR_NAMES:
            return extract(element, scraper)
        return element.get(scraper)

    sel = get_sel(scraper)
    iterator = get_iterator(scraper)

    # First we need to solve local selection
    if sel is not None:
        element = soupsieve.select_one(sel, element)
    elif 'sel_eval' in scraper:
        evaluated_sel = eval_expression(
            scraper['sel_eval'],
            element=element,
            elements=[],
            context=context,
            root=root,
            path=path + ['sel_eval'],
            expect=(Tag, str),
            allow_none=True,
            scope=scope
        )

        if isinstance(evaluated_sel, str):
            element = soupsieve.select_one(evaluated_sel, element)
        else:
            element = evaluated_sel

    if element is None:
        return None

    # Then we need to solve iterator
    single_value = True

    if iterator is not None:
        single_value = False
        elements = soupsieve.select(iterator, element)
    elif 'iterator_eval' in scraper:
        single_value = False
        evaluated_elements = eval_expression(
            scraper['iterator_eval'],
            element=element,
            elements=[],
            context=context,
            root=root,
            path=path + ['iterator_eval'],
            check=is_valid_iterator_eval_output,
            scope=scope
        )

        if isinstance(evaluated_elements, str):
            elements = soupsieve.select(evaluated_elements, element)
        else:
            elements = evaluated_elements
    else:
        elements = [element]

    # Handling local context
    if 'set_context' in scraper:
        local_context = {}

        for k, field_scraper in scraper['set_context'].items():
            local_context[k] = interpret_scraper(
                field_scraper,
                element,
                root=root,
                context=context,
                path=path + ['set_context', k],
                scope=scope
            )

        context = merge_contexts(context, local_context)

    # Actual iteration
    acc = None if single_value else []

    already_seen = set() if 'uniq' in scraper and not single_value else None

    for element in elements:
        value = None

        # Do we have fields?
        if 'fields' in scraper:
            value = {}

            for k, field_scraper in scraper['fields'].items():
                value[k] = interpret_scraper(
                    field_scraper,
                    element,
                    root=root,
                    context=context,
                    path=path + ['fields', k],
                    scope=scope
                )

        # Do we have a scalar?
        elif 'item' in scraper:

            # Default value is text
            value = interpret_scraper(
                scraper['item'],
                element,
                root=root,
                context=context,
                path=path + ['item'],
                scope=scope
            )

        else:
            if 'attr' in scraper:
                value = element.get(scraper['attr'])
            elif 'extract' in scraper:
                value = extract(element, scraper['extract'])
            elif 'get_context' in scraper:
                value = nested_get(scraper['get_context'], context)
            elif 'default' not in scraper:

                # Default value is text
                value = extract(element, 'text')

        # Eval?
        if 'eval' in scraper:
            value = eval_expression(
                scraper['eval'],
                element=element,
                elements=elements,
                value=value,
                context=context,
                root=root,
                path=path + ['eval'],
                expect=DATA_TYPES,
                allow_none=True,
                scope=scope
            )

        # Default value after all?
        if 'default' in scraper and value is None:
            value = scraper['default']

        if single_value:
            acc = value
        else:

            # Filtering?
            if 'filter_eval' in scraper:
                passed_filter = eval_expression(
                    scraper['filter_eval'],
                    element=element,
                    elements=elements,
                    value=value,
                    context=context,
                    root=root,
                    path=path + ['filter_eval'],
                    expect=bool,
                    allow_none=True,
                    scope=scope
                )

                if not passed_filter:
                    continue

            if 'filter' in scraper:
                filtering_clause = scraper['filter']

                if filtering_clause is True and not value:
                    continue

                if isinstance(filtering_clause, str) and not nested_get(filtering_clause, value):
                    continue

            if 'uniq' in scraper:
                uniq_clause = scraper['uniq']
                k = value

                if uniq_clause is True and value in already_seen:
                    continue

                if isinstance(uniq_clause, str):
                    k = nested_get(uniq_clause, value)

                    if k in already_seen:
                        continue

                already_seen.add(k)

            acc.append(value)

    return acc
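# A minimal sketch of driving the interpreter above, assuming `get_sel` /
# `get_iterator` read the 'sel' / 'iterator' keys of the definition and that
# 'text' is a registered extractor in EXTRACTOR_NAMES. The scraper definition
# and markup are hypothetical.
from bs4 import BeautifulSoup

doc = BeautifulSoup(
    '<ul><li><a href="/a">A</a></li><li><a href="/b">B</a></li></ul>',
    'html.parser')
links = interpret_scraper(
    {'iterator': 'li > a', 'fields': {'label': 'text', 'url': {'attr': 'href'}}},
    doc
)
# Expected shape: [{'label': 'A', 'url': '/a'}, {'label': 'B', 'url': '/b'}]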
def googleTranslate(self, translateString):
    # Google Translate
    driver = self.driver
    wait = self.wait

    driver.find_element_by_css_selector(
        'textarea#source.orig.tlid-source-text-input.goog-textarea').clear()
    wait.until(expected.invisibility_of_element_located(
        (By.CSS_SELECTOR, 'span.tlid-translation.translation')))

    # driver.get('https://translate.google.com/#auto|ko|{}'.format(translateString))
    driver.find_element_by_css_selector(
        'textarea#source.orig.tlid-source-text-input.goog-textarea'
    ).send_keys(translateString)

    try:
        wait.until(expected.visibility_of_element_located(
            (By.CSS_SELECTOR, 'span.tlid-translation.translation')))
    except:
        html = driver.page_source
        driver.find_element_by_css_selector(
            'textarea#source.orig.tlid-source-text-input.goog-textarea').clear()
        if sv.select_one('div.result-error', BeautifulSoup(html, 'html.parser')):
            return {
                "data1": sv.select_one(
                    'span.tlid-result-error',
                    BeautifulSoup(html, 'html.parser')).text
            }

    html = driver.page_source
    driver.find_element_by_css_selector(
        'textarea#source.orig.tlid-source-text-input.goog-textarea').clear()
    # driver.implicitly_wait(5)

    # Split out the translation pane.
    select1 = str(sv.select_one('div.homepage-content-wrap',
                                BeautifulSoup(html, 'html.parser')))

    # data1
    resultString1 = ''
    select4 = sv.select_one('span.tlid-translation.translation',
                            BeautifulSoup(select1, 'html.parser'))
    resultString1 = str(select4).replace('<br/>', '<span>\n</span>')
    resultString1 = BeautifulSoup(resultString1, 'html.parser').text
    # print(resultString1)

    # data2
    resultString2 = ''
    if '<div class="gt-cd gt-cd-mbd gt-cd-baf" style="display: none;"' in select1:
        pass
    elif '<div class="gt-cd gt-cd-mbd gt-cd-baf" style=""' in select1:
        select2 = str(sv.select_one('table.gt-baf-table',
                                    BeautifulSoup(select1, 'html.parser')))
        select3 = sv.select(
            'span.gt-cd-pos, span.gt-baf-cell.gt-baf-word-clickable',
            BeautifulSoup(select2, 'html.parser'))
        for data in select3:
            strData = str(data)
            if 'gt-baf-cell' in strData:
                resultString2 += "{}, ".format(data.text)
            elif 'gt-cd-pos' in strData:
                resultString2 = resultString2.rstrip(", ")
                resultString2 += "\n* {} *\n: ".format(data.text)
        resultString2 = resultString2.lstrip("\n")
        resultString2 = resultString2.rstrip(", ")
    # print(resultString2)

    return {"data1": resultString1, "data2": resultString2}