Пример #1
0
    def test_invalid_type_input_select(self):
        """Verify that `select` raises `TypeError` when the target is not a tag."""

        with self.assertRaises(TypeError):
            sv.select('div', "not a tag", flags=sv.DEBUG)
Пример #2
0
    def test_scope_is_select_target(self):
        """Check that `:scope` refers to the element that `select` is invoked on."""

        def collect_ids(selector, target):
            """Return the sorted `id` values of every match of `selector` under `target`."""
            matches = sv.select(selector, target, flags=sv.DEBUG)
            return sorted(match.attrs['id'] for match in matches)

        for parser in ('html.parser', 'lxml', 'html5lib', 'xml'):
            soup = self.soup(self.MARKUP, parser)

            # Scope here means the current element under select
            self.assertEqual(collect_ids(':scope div', soup.html), sorted(['div']))
            self.assertEqual(collect_ids(':scope div', soup.body), sorted(['div']))

            # `div` is the current element under select, and it has no `div` elements.
            div = soup.div
            self.assertEqual(collect_ids(':scope div', div), sorted([]))

            # `div` does have an element with the class `.wordshere`
            self.assertEqual(collect_ids(':scope .wordshere', div), sorted(['pre']))
Пример #3
0
    def test_select_order(self):
        """Select inside the parent of the first `span[id]`, excluding `#some-id`."""

        markup = """
        <!-- before header -->
        <html>
        <head>
        </head>
        <body>
        <!-- comment -->
        <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
        <pre id="4"></pre>
        <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
        <pre id="6" class='ignore'>
            <!-- don't ignore -->
        </pre>
        </body>
        </html>
        """

        # ID `5` should come before ID `some-id`
        soup = self.soup(markup, 'html5lib')
        first_span = sv.select('span[id]', soup)[0]
        matches = sv.select('span[id]:not(#some-id)', first_span.parent)
        found_ids = [match.attrs['id'] for match in matches]

        self.assertEqual(sorted(['5']), sorted(found_ids))
    def test_scope_is_select_target(self):
        """Check that `:scope` is the element `select` is called on, for every available parser."""

        parsers = util.available_parsers(
            'html.parser', 'lxml', 'html5lib', 'xml')
        for parser in parsers:
            soup = self.soup(self.MARKUP, parser)

            # Each case: (selector, element used as scope, expected ids).
            cases = (
                # Scope here means the current element under select
                (':scope div', soup.html, ['div']),
                (':scope div', soup.body, ['div']),
                # `div` is the current element under select, and it has no `div` elements.
                (':scope div', soup.div, []),
                # `div` does have an element with the class `.wordshere`
                (':scope .wordshere', soup.div, ['pre']),
            )

            for selector, scope, expected in cases:
                found = [
                    node.attrs['id']
                    for node in sv.select(selector, scope, flags=sv.DEBUG)
                ]
                self.assertEqual(sorted(found), sorted(expected))
Пример #5
0
    def test_select(self):
        """Exercise `select`, `select_one` and `iselect`, including `limit` and a non-root target."""

        markup = """
        <!-- before header -->
        <html>
        <head>
        </head>
        <body>
        <!-- comment -->
        <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
        <pre id="4"></pre>
        <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
        <pre id="6" class='ignore'>
            <!-- don't ignore -->
        </pre>
        </body>
        </html>
        """

        def sorted_ids(elements):
            """Collect the `id` attribute of each element, sorted."""
            return sorted(element.attrs['id'] for element in elements)

        soup = bs4.BeautifulSoup(markup, 'html5lib')

        # Plain select, then the same select capped at a single result.
        self.assertEqual(
            sorted(['5', 'some-id']), sorted_ids(sv.select('span[id]', soup)))
        self.assertEqual(
            sorted(['5']), sorted_ids(sv.select('span[id]', soup, limit=1)))

        # `select_one` must agree with the first (and only) limited result.
        limited_first = sv.select('span[id]', soup, limit=1)[0]
        single = sv.select_one('span[id]', soup)
        self.assertEqual(limited_first.attrs['id'], single.attrs['id'])

        self.assertEqual(None, sv.select_one('h1', soup))

        # The iterator variant yields the same elements.
        self.assertEqual(
            sorted(['5', 'some-id']), sorted_ids(sv.iselect('span[id]', soup)))

        # Selecting relative to the parent of the first matching span.
        parent = sv.select('span[id]', soup)[0].parent
        self.assertEqual(
            sorted(['5']),
            sorted_ids(sv.select('span[id]:not(#some-id)', parent)))
Пример #6
0
    def test_invalid_type_input(self):
        """Verify that each API entry point raises `TypeError` for a non-tag argument."""

        for api_name in ('match', 'select', 'filter', 'comments'):
            with self.assertRaises(TypeError):
                # The attribute lookup stays inside the context manager so
                # that a missing API surfaces the same way as a direct call.
                getattr(sv, api_name)('div', "not a tag")
Пример #7
0
    def test_leading_combinator_quirks(self):
        """Check selectors that start with a combinator when `_QUIRKS` is set."""

        markup = """
        <html id="root">
        <head>
        </head>
        <body>
        <div id="div">
        <p id="0" class="somewordshere">Some text <span id="1"> in a paragraph</span>.</p>
        <a id="2" href="http://google.com">Link</a>
        <span id="3" class="herewords">Direct child</span>
        <pre id="pre" class="wordshere">
        <span id="4">Child 1</span>
        <span id="5">Child 2</span>
        <span id="6">Child 3</span>
        </pre>
        </div>
        </body>
        </html>
        """

        soup = self.soup(markup, 'html5lib')
        matches = sv.select(
            '> span, > #pre', soup.div, flags=sv.DEBUG | sv._QUIRKS)
        found_ids = sorted(match.attrs['id'] for match in matches)
        self.assertEqual(found_ids, sorted(['3', 'pre']))
    def test_leading_combinator_quirks(self):
        """Select direct children of the scope via leading `>` combinators in quirks mode."""

        markup = """
        <html id="root">
        <head>
        </head>
        <body>
        <div id="div">
        <p id="0" class="somewordshere">Some text <span id="1"> in a paragraph</span>.</p>
        <a id="2" href="http://google.com">Link</a>
        <span id="3" class="herewords">Direct child</span>
        <pre id="pre" class="wordshere">
        <span id="4">Child 1</span>
        <span id="5">Child 2</span>
        <span id="6">Child 3</span>
        </pre>
        </div>
        </body>
        </html>
        """

        scope = self.soup(markup, 'html5lib').div
        quirk_flags = sv.DEBUG | sv._QUIRKS
        found_ids = [
            node.attrs['id']
            for node in sv.select('> span, > #pre', scope, flags=quirk_flags)
        ]
        self.assertEqual(sorted(found_ids), sorted(['3', 'pre']))
Пример #9
0
def scrape(stock_to_pull):
    """Scrape the call-option table for one ticker from Yahoo Finance.

    Args:
        stock_to_pull: Ticker symbol; inserted verbatim into the quote URL.

    Returns:
        A ``pandas.DataFrame`` with one row per table row that has direct
        ``<td>`` children. Columns: ``stock_name``, ``expiry_date``,
        ``strike_price``, ``last_price``, ``bid_price``, ``ask_price``,
        ``chng_price``, ``perc_chng``, ``volume``, ``opn_int``, ``imp_vol``,
        ``itm`` ("Y"/"N") and ``curr_time``. Empty when no rows match.

    Raises:
        AttributeError: If the ``table.calls`` element or an expected cell is
            missing from the page.
    """
    base_url = "https://finance.yahoo.com/quote/" + stock_to_pull + "/options?p=" + stock_to_pull + "&straddle=false"
    res = requests.get(base_url)
    soup = BeautifulSoup(res.text, features="lxml")

    call_table = soup.find("table", {"class": "calls"})
    # The expiry date is read from the fifth <span> of the element preceding
    # the table's grandparent.
    call_date = dt.strptime(
        call_table.parent.parent.previous_sibling.select('span')[4].text,
        '%B %d, %Y').date()

    # Output column -> CSS class of the <td> that holds its value.
    cell_classes = {
        "strike_price": "data-col2",
        "last_price": "data-col3",
        "bid_price": "data-col4",
        "ask_price": "data-col5",
        "chng_price": "data-col6",
        "perc_chng": "data-col7",
        "volume": "data-col8",
        "opn_int": "data-col9",
        "imp_vol": "data-col10",
    }

    # Collect plain dicts and build the frame once: `DataFrame.append` copied
    # the whole frame on every call and no longer exists in pandas 2.x.
    rows = []
    for call in sv.select('tr:has(> td)', call_table):
        call_det = {"stock_name": stock_to_pull, "expiry_date": call_date}
        for column, css_class in cell_classes.items():
            call_det[column] = call.find("td", {"class": css_class}).text
        call_det["itm"] = "Y" if "in-the-money" in call["class"] else "N"
        # NOTE(review): this stamps the machine's local wall-clock time with
        # the `eastern` zone rather than converting to it -- confirm intended.
        call_det["curr_time"] = eastern.localize(
            dt.now()).strftime('%Y-%m-%d %H:%M:%S')
        rows.append(call_det)

    return pd.DataFrame(rows)
Пример #10
0
    def test_select_one(self):
        """Check that `select_one` returns the same element as `select` with `limit=1`."""

        markup = """
        <!-- before header -->
        <html>
        <head>
        </head>
        <body>
        <!-- comment -->
        <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
        <pre id="4"></pre>
        <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
        <pre id="6" class='ignore'>
            <!-- don't ignore -->
        </pre>
        </body>
        </html>
        """

        soup = self.soup(markup, 'html5lib')
        via_limit = sv.select('span[id]', soup, limit=1)[0]
        via_select_one = sv.select_one('span[id]', soup)
        self.assertEqual(via_limit.attrs['id'], via_select_one.attrs['id'])
Пример #11
0
    def assert_selector(self,
                        markup,
                        selectors,
                        expected_ids,
                        namespaces=None,
                        flags=0):
        """Assert that `selectors` matches exactly the elements with `expected_ids`.

        Args:
            markup: Document source; it is dedented and has CRLF line endings
                normalised before parsing.
            selectors: Selector string handed to `sv.select`.
            expected_ids: `id` values the matched elements must have (compared
                after sorting, so order does not matter).
            namespaces: Optional namespace mapping forwarded to `sv.select`.
                Defaults to an empty mapping.
            flags: Only the low four bits are read; they choose the parser
                (`HTML` -> lxml, `HTML5` or 0 -> html5lib, `XHTML`/`XML` -> xml).

        Raises:
            ValueError: If the mode bits in `flags` name no known parser.
        """

        # A fresh dict per call avoids sharing one mutable default object.
        if namespaces is None:
            namespaces = {}

        mode = flags & 0x0F
        if mode == HTML:
            bs_mode = 'lxml'
        elif mode in (HTML5, 0):
            bs_mode = 'html5lib'
        elif mode in (XHTML, XML):
            bs_mode = 'xml'
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on `bs_mode`.
            raise ValueError(
                'Unsupported parser mode in flags: {!r}'.format(mode))
        soup = bs4.BeautifulSoup(textwrap.dedent(markup.replace('\r\n', '\n')),
                                 bs_mode)

        ids = []
        for el in sv.select(selectors,
                            soup,
                            namespaces=namespaces,
                            flags=sv.DEBUG):
            # Trace output kept from the original helper to aid debugging.
            print('TAG: ', el.name)
            ids.append(el.attrs['id'])
        self.assertEqual(sorted(ids), sorted(expected_ids))
Пример #12
0
    def test_select_limit(self):
        """Check that `limit=1` restricts `select` to the first matching element."""

        markup = """
        <!-- before header -->
        <html>
        <head>
        </head>
        <body>
        <!-- comment -->
        <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
        <pre id="4"></pre>
        <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
        <pre id="6" class='ignore'>
            <!-- don't ignore -->
        </pre>
        </body>
        </html>
        """

        soup = self.soup(markup, 'html5lib')
        limited = sv.select('span[id]', soup, limit=1)
        found_ids = [match.attrs['id'] for match in limited]

        self.assertEqual(sorted(['5']), sorted(found_ids))
Пример #13
0
def StartApp():
    """Start the Qt application and load game data tables into the database.

    Creates the application, database and main window, registers a Steam
    library path, fetches the PCGamingWiki page for "Life Is Strange", and
    inserts every document extracted from the matching info tables into
    `database.db`. Finally enters the Qt event loop and exits the process with
    its return code.
    """
    # `collections.Mapping` was removed in Python 3.10; the ABC lives in
    # `collections.abc`.
    from collections.abc import Mapping

    my_app = QApplication(sys.argv)
    database = Database.GameDatabase()
    network_html = Network.request_webpage(
        "https://pcgamingwiki.com/wiki/Life_Is_Strange")
    main_widget = GUI.QtWindow(database)

    # NOTE(review): the original also built Path("F:\\", "Spel", ...) (with
    # the drive root) but never used it; the path registered below starts with
    # a bare "F" and is therefore relative. Confirm which one is intended.
    # A save-game path for a "UserProfile" library was likewise prepared, but
    # its registration was commented out.
    database.addLibraryPath(
        "Steam", "Steam",
        Path("F", "Spel", "SteamLibrary", "steamapps", "common"))

    main_widget.show()

    # 'html5lib' was noted as an alternate parser for this page.
    soup = BeautifulSoup(network_html, features='html.parser')

    # selector: Used to select tables containing data on the webpage
    selector = 'h3 ~ div.container-pcgwikitable table#table-gamedata.pcgwikitable.template-infotable'
    table_html = soupsieve.select(selector, soup)
    if isinstance(table_html, (Mapping, list)):
        for t in table_html:
            for doc in Network.getDictFromHTMLTable(t):
                database.db.insert(doc)

    sys.exit(my_app.exec_())
Пример #14
0
    def test_select_order(self):
        """Check that `select` returns elements in document order."""

        markup = """
        <!-- before header -->
        <html>
        <head>
        </head>
        <body>
        <!-- comment -->
        <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
        <pre id="4"></pre>
        <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
        <pre id="6" class='ignore'>
            <!-- don't ignore -->
        </pre>
        </body>
        </html>
        """

        body = self.soup(markup, 'html.parser').body
        # No sorting here: the order of the results is what is under test.
        found_ids = [match.attrs['id'] for match in sv.select('[id]', body)]

        self.assertEqual(['1', '2', '3', '4', '5', 'some-id', '6'], found_ids)
Пример #15
0
    def test_scope_cannot_select_target(self):
        """Check that the element `select` is called on is never returned for `:scope`."""

        for parser in ('html.parser', 'lxml', 'html5lib', 'xml'):
            root = self.soup(self.MARKUP, parser).html

            # Scope is the element we are applying the select to, and that element is never returned
            matched = sv.select(':scope', root, flags=sv.DEBUG)
            self.assertTrue(len(matched) == 0)
Пример #16
0
def get_articles(soup):
    """Select article content from `soup`, trying several page layouts in turn.

    The layouts are probed in a fixed order and the first one that matches
    decides which selector is used for the returned elements:

    1. an ``<article>`` containing a ``<p>``;
    2. a ``<div>`` with a direct ``itemprop="articleBody"`` child;
    3. ``<div>`` elements whose class looks like an article/header block;
    4. otherwise a fallback built around ``#div_postbody``, headline and
       ``.content`` elements (uses the module-level ``headers`` selector list).

    Each probe runs only if the previous ones found nothing, so a page that
    matches an early layout is not searched again for the later ones.

    Args:
        soup: Parsed document (a BeautifulSoup object or tag).

    Returns:
        The list of elements returned by ``sv.select`` for the chosen layout.
    """
    article_sel = 'article:has( p)'
    if soup.select_one(article_sel):
        return sv.select(article_sel, soup)

    body_sel = 'div:has(> [itemprop="articleBody"]:has( p, div, a))'
    if soup.select_one(body_sel):
        return sv.select(
            f'{body_sel} [itemprop="headline"], [itemprop="articleBody"] :has(>p)',
            soup)

    # Selector text is kept exactly as before (including its embedded line
    # break and indentation); only the source layout of the literal changed.
    block_sel = (
        'div:is([class*="article"], [class*="head"], [class^="article__header"]),\n'
        '        div[class^="article_block"]:has(div[class^="article__text"])'
    )
    # A single hit is enough to decide; no need to build the full result list.
    if soup.select_one(block_sel):
        # NOTE(review): only the first alternative of the probe selector is
        # used for the returned elements -- confirm that is intended.
        return sv.select(
            'div:is([class*="article"], \n'
            '            [class*="head"], [class^="article__header"])',
            soup)

    return sv.select(
        f'#div_postbody *, :has(#div_postbody) > :is({headers}), \n'
        '            [itemprop="headline"], .content *',
        soup)
    def test_scope_cannot_select_target(self):
        """Check, for every available parser, that `:scope` never returns the select target."""

        parsers = util.available_parsers(
            'html.parser', 'lxml', 'html5lib', 'xml')
        for parser in parsers:
            target = self.soup(self.MARKUP, parser).html

            # Scope is the element we are applying the select to, and that element is never returned
            results = sv.select(':scope', target, flags=sv.DEBUG)
            self.assertTrue(len(results) == 0)
Пример #18
0
    def test_iframe(self):
        """
        Check that `:root` resolves to the `iframe` document when selecting from inside it.

        Parsers differ in how they handle `iframe` content (`html5lib`, for
        example, escapes it), so only the builtin Python parser is used here.
        """

        iframe_root = self.soup(self.MARKUP_IFRAME, 'html.parser').iframe.html

        # Both selectors are expected to match the same single `div`.
        for selector in (':root div', ':root > body > div'):
            found = [el['id'] for el in sv.select(selector, iframe_root)]
            self.assertEqual(sorted(found), sorted(['div2']))
Пример #19
0
def _stripped_text(selector, element):
    """Return the stripped text of the first match of *selector* under *element*, or ""."""
    match = sv.select_one(selector, element)
    return match.text.strip() if match is not None else ""


def scrape(city):
    """Print the low tides that follow a sunrise or sunset row for *city*.

    Fetches the tide table for the city from tide-forecast.com and walks its
    rows in order. The most recent date and the most recent "Sunrise"/"Sunset"
    marker are carried forward from row to row; every "Low Tide" row seen
    while such a marker is active is printed as
    ``city date time timezone level``.

    Args:
        city: City name; runs of whitespace and commas become ``-`` in the URL.

    Returns:
        None. Results are written to standard output.
    """
    city_url_id = re.sub(r'[\s,]+', "-", city)
    city_url = "https://www.tide-forecast.com/locations/" + city_url_id + "/tides/latest"
    r = requests.get(url=city_url)
    soup = BeautifulSoup(r.text, 'html.parser')

    # These two persist across rows: a date or sunrise/sunset row applies to
    # the rows after it until replaced.
    curr_date = ""
    timeofday = ""

    for row in sv.select(".tide-table > tr", soup):
        date_cell = sv.select_one(".date", row)
        if date_cell is not None:
            curr_date = date_cell.text.strip()

        tide_time = _stripped_text(".time", row)
        # Fixed: the time zone used to be guarded by the *time* cell, which
        # raised AttributeError when only the time-zone cell was missing and
        # dropped the zone when only the time cell was missing.
        timezone = _stripped_text(".time-zone", row)
        level = _stripped_text(".level", row)

        tide_phase = ""
        tide_phase_cell = sv.select_one(".tide:last-child", row)
        if tide_phase_cell is not None:
            tide_phase = tide_phase_cell.text.strip()
        else:
            # Not a tide row: its last cell may carry a time-of-day label.
            last_cell = sv.select_one("td:last-child", row)
            if last_cell is not None:
                label = last_cell.text.strip()
                timeofday = label if label in ("Sunrise", "Sunset") else ""

        if tide_phase == "Low Tide" and timeofday in ("Sunrise", "Sunset"):
            print('{0} {1} {2} {3} {4}'.format(city, curr_date, tide_time,
                                               timezone, level))
Пример #20
0
    def test_parent_nth_of_type_preconditions(self):
        """Verify the document structure that the `nth-of-type` parent test relies on."""

        matches = sv.select('div > h1', self.soup)
        # check that there is a unique selection
        self.assertEqual(len(matches), 1)
        h1 = matches[0]
        self.assertEqual(h1.string, 'An H1')

        # Show that the `h1`'s parent `div#inner` is the first child of type `div` of the grandparent `div#main`.
        # so that the selector `div:nth-of-type(1) > h1` should also give `h1`.
        div_inner = h1.parent
        siblings = list(div_inner.parent.children)
        self.assertEqual(siblings[0], '\n')
        self.assertEqual(siblings[1], div_inner)
Пример #21
0
def main():
    """Print the latest Seeking Alpha article entries for a hard-coded ticker.

    For every ``<li>`` under the ``ul#symbol-page-latest`` list, prints a dict
    with the headline, the release date (the text of a class-less span other
    than "SA News", when present) and the current time, followed by blank
    lines.
    """
    stock_to_pull = "RDFN"
    base_url = "https://seekingalpha.com/symbol/" + stock_to_pull
    res = requests.get(base_url)
    soup = BeautifulSoup(res.text, features="lxml")

    article_list = soup.find("ul", {"id": "symbol-page-latest"})
    for article in sv.select('li', article_list):
        news_det = {}
        news_det["headline"] = article.find("div", {
            "class": "symbol_article"
        }).text
        spans = article.find("div", {
            "class": "date_on_by"
        }).findChildren("span", {"class": ""})
        for span in spans:
            # If several spans qualify, the last one wins.
            if span.text != "SA News":
                news_det["release_date"] = span.text
        # NOTE(review): this labels the machine's local time with the
        # `eastern` zone rather than converting to it -- confirm intended.
        news_det["curr_time"] = eastern.localize(
            dt.now()).strftime('%Y-%m-%d %H:%M:%S')
        print(news_det)
        print('\n\n')
Пример #22
0
    def test_match(self):
        """Check `match` against an escaped numeric ID selector."""

        markup = """
        <!-- before header -->
        <html>
        <head>
        </head>
        <body>
        <!-- comment -->
        <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
        <pre id="4"></pre>
        <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
        <pre id="6" class='ignore'>
            <!-- don't ignore -->
        </pre>
        </body>
        </html>
        """

        # `\35` is the CSS escape for the character `5`.
        selector = 'span#\\35'
        spans = sv.select('span[id]', self.soup(markup, 'html5lib'))
        self.assertTrue(sv.match(selector, spans[0]))
        self.assertFalse(sv.match(selector, spans[1]))
Пример #23
0
    artist_id = sys.argv[2]
    search_term = sys.argv[3]
else:
    artist = 'Spice-Girls'
    artist_id = 199833
    search_term = 'good sheep'

print(str(artist_id) + ' : ' + artist)
print('Search for : ' + search_term)

url = 'https://www.lyrics.com/artist.php?name=' + artist + '&aid=' + str(
    artist_id) + '&o=1'
page_source = requests.get(url).text
beautiful_soap_content = BeautifulSoup(page_source, "lxml")

# Song rows are taken from the first <tbody>. If the page has none, there is
# nothing to scan (previously this raised a TypeError inside `sv.select`).
song_table = sv.select_one('tbody', beautiful_soap_content)
song_rows = sv.select('tr', song_table) if song_table is not None else []

for song in song_rows:
    song_element = sv.select_one('a', song)
    if song_element is None:
        # Row without a link: there is no song page to follow.
        continue

    print('\n\nSong Title : ' + song_element.text)
    song_url = 'https://www.lyrics.com' + song_element.get('href')
    print('Song URL : ' + song_url + '\n')
    song_page_source = requests.get(song_url).text
    song_page_content = BeautifulSoup(song_page_source, "lxml")

    # The lyrics are read from the first <pre>; skip pages that have none
    # instead of crashing on `None.text`.
    lyrics_element = sv.select_one('pre', song_page_content)
    if lyrics_element is None:
        continue
    song_lyrics = lyrics_element.text
    print(song_lyrics)

    # NOTE(review): `search_term` is interpreted as a regular expression, so
    # characters such as `.` or `(` are not matched literally -- confirm that
    # is intended (an earlier variant used a plain substring test and stopped
    # at the first hit).
    if re.search(search_term, song_lyrics, re.IGNORECASE):
        print(search_term + ' Found On ' + song_element.text)
Пример #24
0
    def googleTranslate(self):
        """Parse a saved Google Translate page and print what it contains.

        Reads `./string.html`, writes the `div.homepage-content-wrap` fragment
        to `./test.html`, then prints two things: the translated text, and the
        entries of the `table.gt-baf-table` table grouped under their
        `gt-cd-pos` headings. The second part is only built when the fragment
        contains the `gt-cd-baf` container with an empty `style` attribute and
        not the `display: none` variant.

        An earlier variant, now disabled, took the string to translate as an
        argument, drove the page through `self.driver`/`self.wait`, and
        returned `{"data1": ..., "data2": ...}`. This version returns None.
        """
        with open('./string.html', 'r', encoding='utf-8') as source_file:
            page_html = source_file.read()

        # Isolate the translation pane.
        pane = sv.select_one('div.homepage-content-wrap',
                             BeautifulSoup(page_html, 'html.parser'))
        pane_html = str(pane)

        with open('./test.html', 'w', encoding='utf-8') as dump_file:
            dump_file.write(pane_html)

        # data1: the translated text. Each <br/> is replaced by a span holding
        # a newline so that the line break shows up in `.text`.
        translation = sv.select_one('span.tlid-translation.translation',
                                    BeautifulSoup(pane_html, 'html.parser'))
        translation_html = str(translation).replace('<br/>', '<span>\n</span>')
        print(BeautifulSoup(translation_html, 'html.parser').text)

        # data2: the table entries, built only when the container is present
        # in its non-hidden form.
        hidden_marker = '<div class="gt-cd gt-cd-mbd gt-cd-baf" style="display: none;"'
        shown_marker = '<div class="gt-cd gt-cd-mbd gt-cd-baf" style=""'
        alternatives = ''
        if hidden_marker not in pane_html and shown_marker in pane_html:
            table_html = str(
                sv.select_one('table.gt-baf-table',
                              BeautifulSoup(pane_html, 'html.parser')))
            cells = sv.select(
                'span.gt-cd-pos, span.gt-baf-cell.gt-baf-word-clickable',
                BeautifulSoup(table_html, 'html.parser'))
            for cell in cells:
                cell_html = str(cell)
                if 'gt-baf-cell' in cell_html:
                    # A word entry: append to the current comma-separated run.
                    alternatives += "{}, ".format(cell.text)
                elif 'gt-cd-pos' in cell_html:
                    # A heading: close the previous run and start a new group.
                    alternatives = alternatives.rstrip(", ")
                    alternatives += "\n* {} *\n: ".format(cell.text)
            alternatives = alternatives.lstrip("\n")
        print(alternatives)

        return
Пример #25
0
    def test_parent_nth_of_type(self):
        """Select an `h1` through a parent qualified with `:nth-of-type(1)`."""

        matches = sv.select('div:nth-of-type(1) > h1', self.soup)
        self.assertEqual(len(matches), 1)
        heading = matches[0]
        self.assertEqual(heading.string, 'An H1')
Пример #26
0
from selenium import webdriver
from selenium.webdriver.chrome.webdriver import WebDriver
from typing import List, Any, Union
import soupsieve as sv

# Script body: load a SoundCloud search page in Chrome, parse the rendered
# HTML, and locate the content element of every search result.
chromeWebDriver: WebDriver = webdriver.Chrome("C:/Programs/chromedriver.exe")

try:
    chromeWebDriver.get("https://soundcloud.com/search/sounds?q=old%20party")
    # `page_source` is the rendered document as text.
    pageSource: str = chromeWebDriver.page_source
    beautifulSoapContent: BeautifulSoup = BeautifulSoup(pageSource, "lxml")

    # soundBodies: List[Any] = sv.select('div:is(.sound__body)', beautifulSoapContent)

    soundBody: Tag
    for soundBody in sv.select('div.sound__body', beautifulSoapContent):

        # print ("sound__body Element : " + str(soundBody) + "\n")

        # coverArtElement = sv.select_one('a.sound__coverArt',soundBody)
        # print ("sound__coverArt Element : " + str(coverArtElement))

        # print ("Track Page : https://soundcloud.com" + coverArtElement.get('href'))

        # # TODO : Wait to load Cover Arts
        # # coverArtUrl = get_backgroundImage_url(
        # #     get_first_span_element_by_custom_attribute(coverArtElement, 'aria-role', 'img'))  # type: str
        # # print "Cover Art Url : " + coverArtUrl

        contentElement = sv.select_one('div.sound__content', soundBody)
        # print ("sound__content Element : " + str(contentElement))
finally:
    # Always shut the browser down, even on failure; otherwise the Chrome and
    # chromedriver processes outlive the script.
    chromeWebDriver.quit()
Пример #27
0
def _first_value(elements, key):
    """Return ``elements[0][key]``, or ``""`` when *elements* is empty."""
    return elements[0][key] if len(elements) > 0 else ""


def _club_details(column):
    """Extract one club's link, name, country, league and image from a table cell."""
    club_links = column.select("td.hauptlink a.vereinprofil_tooltip")
    club_image = column.select("td:first-child img")[0]["src"]

    # The last nested cell is read for the country image and the league link.
    country_cell = column.select("table tr td")[-1]
    country_images = country_cell.select("img")
    league_links = country_cell.select("a")

    return {"href": _first_value(club_links, "href"),
            "name": club_links[0].text if len(club_links) > 0 else "",
            "country": _first_value(country_images, "title"),
            "countryImage": _first_value(country_images, "src"),
            "league": _first_value(league_links, "title"),
            "leagueHref": _first_value(league_links, "href"),
            "image": club_image}


def scrape_transfers2(page):
    """Yield one record per transfer row of a saved transfers page.

    Rows are read in document order from ``div#yw1 table.items``; rows that do
    not have exactly seven direct cells are skipped. Iteration stops
    completely at the first row whose fee reads "Free Transfer".

    Args:
        page: Path of the saved HTML file.

    Yields:
        Dicts with the keys ``season`` ("YYYY/YYYY", switching over in July),
        ``player``, ``from``, ``to`` and ``transfer``.

    Raises:
        IndexError: If an element that is required on the page or in a row
            (date heading, fee link, player link, images) is missing.
    """
    # The document is parsed while the file is open so that the handle is
    # closed before the first record is yielded (it used to be left open).
    with open(page, "r") as source:
        soup = BeautifulSoup(source, "html.parser")

    date = soup.select("div.box h2")[0].text.replace("Transfer on ", "")
    parsed_date = parser.parse(date)
    timestamp = parsed_date.timestamp()

    # From July onwards the date belongs to the season that starts that year.
    season_start = parsed_date.year if parsed_date.month >= 7 else parsed_date.year - 1
    season = "{start}/{end}".format(start=season_start, end=season_start + 1)

    for row in soup.select("div#yw1 table.items tbody > tr"):
        columns = sv.select(":scope > td", row)
        if len(columns) != 7:
            continue

        fee_element = columns[6].select("a")[0]
        if fee_element.text == "Free Transfer":
            # NOTE(review): this ends the whole scan, not just this row --
            # presumably the table is ordered by fee; confirm.
            break

        player_image_element = columns[0].select("img")[0]
        player_element = columns[0].select("a.spielprofil_tooltip")[0]
        player_age = columns[1].text
        player_position = columns[0].select("table tr td")[-1].text

        from_club = _club_details(columns[3])
        to_club = _club_details(columns[4])

        nationality = columns[2].select("img")[0]["title"]

        yield {"season": season,
               "player": {"href": player_element["href"],
                          "name": player_element.text,
                          "position": player_position,
                          "age": player_age,
                          "image": player_image_element["src"],
                          "nationality": nationality},
               "from": from_club,
               "to": to_club,
               "transfer": {"href": fee_element["href"],
                            "value": fee_element.text,
                            "timestamp": int(timestamp)}
               }
Пример #28
0
def interpret_scraper(scraper, element, root=None, context=None, path=None, scope=None):
    """Evaluate a scraper definition against `element` and return the result.

    Evaluation happens in four steps: local selection (`sel` / `sel_eval`),
    iteration (`iterator` / `iterator_eval`), local context (`set_context`)
    and finally value extraction for each iterated element.

    Args:
        scraper: Either a string or a mapping-like scraper definition.
            A string is resolved immediately: through `extract` when it is
            one of `EXTRACTOR_NAMES`, otherwise through `element.get`.
            A definition may carry the keys read below: `sel_eval`,
            `iterator_eval`, `set_context`, `fields`, `item`, `attr`,
            `extract`, `get_context`, `default`, `eval`, `filter_eval`,
            `filter` and `uniq` (plus whatever `get_sel` and `get_iterator`
            read from it).
        element: Element the scraper is applied to.
        root: Forwarded untouched to `eval_expression` and recursive calls.
        context: Context given to expressions and to `get_context`; merged
            with the values computed by `set_context`, if any.
        path: List of keys leading to `scraper`. It is extended (never
            mutated) on each recursion and forwarded to `eval_expression`.
            Defaults to an empty list.
        scope: Evaluation scope shared with recursive calls; a new
            `EvaluationScope` is created when omitted.

    Returns:
        `None` when the local selection yields no element, a single value
        when the scraper has no iterator, and a list of values otherwise.
    """
    # Use a sentinel rather than a shared mutable default list.
    if path is None:
        path = []

    if scope is None:
        scope = EvaluationScope()

    # Is this a tail call of item?
    if isinstance(scraper, str):
        if scraper in EXTRACTOR_NAMES:
            return extract(element, scraper)

        return element.get(scraper)

    sel = get_sel(scraper)
    iterator = get_iterator(scraper)

    # First we need to solve local selection
    if sel is not None:
        element = soupsieve.select_one(sel, element)
    elif 'sel_eval' in scraper:
        evaluated_sel = eval_expression(
            scraper['sel_eval'],
            element=element,
            elements=[],
            context=context,
            root=root,
            path=path + ['sel_eval'],
            expect=(Tag, str),
            allow_none=True,
            scope=scope
        )

        # The expression may return either a selector or the element itself
        if isinstance(evaluated_sel, str):
            element = soupsieve.select_one(evaluated_sel, element)
        else:
            element = evaluated_sel

    # Nothing was selected: there is nothing left to scrape
    if element is None:
        return None

    # Then we need to solve iterator
    single_value = True

    if iterator is not None:
        single_value = False
        elements = soupsieve.select(iterator, element)
    elif 'iterator_eval' in scraper:
        single_value = False
        evaluated_elements = eval_expression(
            scraper['iterator_eval'],
            element=element,
            elements=[],
            context=context,
            root=root,
            path=path + ['iterator_eval'],
            check=is_valid_iterator_eval_output,
            scope=scope
        )

        # The expression may return either a selector or the elements
        if isinstance(evaluated_elements, str):
            elements = soupsieve.select(evaluated_elements, element)
        else:
            elements = evaluated_elements
    else:
        elements = [element]

    # Handling local context (computed against the selected element,
    # before iteration starts)
    if 'set_context' in scraper:
        local_context = {}

        for name, field_scraper in scraper['set_context'].items():
            local_context[name] = interpret_scraper(
                field_scraper,
                element,
                root=root,
                context=context,
                path=path + ['set_context', name],
                scope=scope
            )

        context = merge_contexts(context, local_context)

    # Actual iteration
    acc = None if single_value else []

    # Deduplication only makes sense when collecting several values
    already_seen = set() if 'uniq' in scraper and not single_value else None

    for current in elements:
        value = None

        # Do we have fields?
        if 'fields' in scraper:
            value = {}

            for name, field_scraper in scraper['fields'].items():
                value[name] = interpret_scraper(
                    field_scraper,
                    current,
                    root=root,
                    context=context,
                    path=path + ['fields', name],
                    scope=scope
                )

        # Do we have a scalar?
        elif 'item' in scraper:
            value = interpret_scraper(
                scraper['item'],
                current,
                root=root,
                context=context,
                path=path + ['item'],
                scope=scope
            )

        else:
            if 'attr' in scraper:
                value = current.get(scraper['attr'])
            elif 'extract' in scraper:
                value = extract(current, scraper['extract'])
            elif 'get_context' in scraper:
                value = nested_get(scraper['get_context'], context)
            elif 'default' not in scraper:

                # Default value is text
                value = extract(current, 'text')

            # Eval?
            if 'eval' in scraper:
                value = eval_expression(
                    scraper['eval'],
                    element=current,
                    elements=elements,
                    value=value,
                    context=context,
                    root=root,
                    path=path + ['eval'],
                    expect=DATA_TYPES,
                    allow_none=True,
                    scope=scope
                )

        # Default value after all?
        if 'default' in scraper and value is None:
            value = scraper['default']

        if single_value:
            acc = value
            continue

        # Filtering?
        if 'filter_eval' in scraper:
            passed_filter = eval_expression(
                scraper['filter_eval'],
                element=current,
                elements=elements,
                value=value,
                context=context,
                root=root,
                path=path + ['filter_eval'],
                expect=bool,
                allow_none=True,
                scope=scope
            )

            if not passed_filter:
                continue

        if 'filter' in scraper:
            filtering_clause = scraper['filter']

            # `True` drops falsy values, a string drops values whose nested
            # target is falsy
            if filtering_clause is True and not value:
                continue

            if isinstance(filtering_clause, str) and not nested_get(filtering_clause, value):
                continue

        if 'uniq' in scraper:
            uniq_clause = scraper['uniq']
            uniq_key = value

            if uniq_clause is True and value in already_seen:
                continue

            # A string clause deduplicates on a nested key of the value
            if isinstance(uniq_clause, str):
                uniq_key = nested_get(uniq_clause, value)

                if uniq_key in already_seen:
                    continue

            already_seen.add(uniq_key)

        acc.append(value)

    return acc
Пример #29
0
    def googleTranslate(self, translateString):
        """Translate `translateString` using the Google Translate page in the driver.

        The text is typed into the source textarea of the page held by
        `self.driver`; `self.wait` is used to wait for the translation to
        appear, and the result is scraped from the page source.

        Args:
            translateString: Text sent to the source textarea.

        Returns:
            dict: `{"data1": translation, "data2": extra}` where `data1` is
            the text of the translation span (with `<br/>` turned into
            newlines) and `data2` lists the cells of the `gt-baf-table`
            table under `* heading *` lines (empty when that panel is
            hidden or absent). When waiting fails and the page shows a
            `div.result-error`, only `{"data1": error_text}` is returned.
        """
        source_selector = 'textarea#source.orig.tlid-source-text-input.goog-textarea'
        translation_selector = 'span.tlid-translation.translation'
        translation_locator = (By.CSS_SELECTOR, translation_selector)

        driver = self.driver
        wait = self.wait

        # Reset the input and wait for the previous translation to vanish
        driver.find_element(By.CSS_SELECTOR, source_selector).clear()
        wait.until(expected.invisibility_of_element_located(translation_locator))

        driver.find_element(By.CSS_SELECTOR, source_selector).send_keys(translateString)

        try:
            wait.until(expected.visibility_of_element_located(translation_locator))
        except Exception:
            # Best effort: the wait failed, check whether the page reports an
            # error; otherwise fall through and scrape whatever is there.
            html = driver.page_source
            driver.find_element(By.CSS_SELECTOR, source_selector).clear()

            error_soup = BeautifulSoup(html, 'html.parser')
            error_box = sv.select_one('div.result-error', error_soup)

            if error_box is not None:
                error_message = sv.select_one('span.tlid-result-error', error_soup)

                # Fall back to the error box itself if the message span is missing
                if error_message is None:
                    error_message = error_box

                return {"data1": error_message.text}

        html = driver.page_source
        driver.find_element(By.CSS_SELECTOR, source_selector).clear()

        # Isolate the translation pane
        pane = sv.select_one('div.homepage-content-wrap',
                             BeautifulSoup(html, 'html.parser'))

        if pane is None:
            # Nothing to scrape; do not leak str(None) as a translation
            return {"data1": '', "data2": ''}

        pane_html = str(pane)

        # data1: the translation itself
        resultString1 = ''
        translation = sv.select_one(translation_selector, pane)

        if translation is not None:
            # Wrap line breaks in spans so that `.text` keeps them as newlines
            with_newlines = str(translation).replace('<br/>', '<span>\n</span>')
            resultString1 = BeautifulSoup(with_newlines, 'html.parser').text

        # data2: extra translations, only when their panel is displayed
        resultString2 = ''
        hidden_panel = '<div class="gt-cd gt-cd-mbd gt-cd-baf" style="display: none;"'
        shown_panel = '<div class="gt-cd gt-cd-mbd gt-cd-baf" style=""'

        if hidden_panel not in pane_html and shown_panel in pane_html:
            table = sv.select_one('table.gt-baf-table', pane)
            entries = []

            if table is not None:
                entries = sv.select(
                    'span.gt-cd-pos, span.gt-baf-cell.gt-baf-word-clickable',
                    table)

            for data in entries:
                strData = str(data)

                if 'gt-baf-cell' in strData:
                    resultString2 += "{}, ".format(data.text)
                elif 'gt-cd-pos' in strData:
                    # New heading: drop the trailing separator of the previous group
                    resultString2 = resultString2.rstrip(", ")
                    resultString2 += "\n* {} *\n: ".format(data.text)

            resultString2 = resultString2.lstrip("\n")
            resultString2 = resultString2.rstrip(", ")

        return {"data1": resultString1, "data2": resultString2}