Exemplo n.º 1
0
    def test_select(self):
        """Exercise `select`, `iselect`, `select_one` and the `limit` keyword."""

        markup = """
        <!-- before header -->
        <html>
        <head>
        </head>
        <body>
        <!-- comment -->
        <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
        <pre id="4"></pre>
        <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
        <pre id="6" class='ignore'>
            <!-- don't ignore -->
        </pre>
        </body>
        </html>
        """

        soup = bs4.BeautifulSoup(markup, 'html5lib')

        # All spans carrying an `id` attribute are found.
        found = [el.attrs['id'] for el in sv.select('span[id]', soup)]
        self.assertEqual(sorted(['5', 'some-id']), sorted(found))

        # `limit` caps the number of returned elements.
        found = [el.attrs['id'] for el in sv.select('span[id]', soup, limit=1)]
        self.assertEqual(sorted(['5']), sorted(found))

        # `select_one` is equivalent to `select` with a limit of one.
        self.assertEqual(
            sv.select('span[id]', soup, limit=1)[0].attrs['id'],
            sv.select_one('span[id]', soup).attrs['id'])

        # No match yields None from `select_one`.
        self.assertEqual(None, sv.select_one('h1', soup))

        # The iterator variant walks the same matches.
        found = [el.attrs['id'] for el in sv.iselect('span[id]', soup)]
        self.assertEqual(sorted(['5', 'some-id']), sorted(found))

        # Selecting from a sub-tree honors `:not()`.
        span = sv.select('span[id]', soup)[0]
        found = [
            el.attrs['id']
            for el in sv.select('span[id]:not(#some-id)', span.parent)
        ]
        self.assertEqual(sorted(['5']), sorted(found))
Exemplo n.º 2
0
    def test_select_one(self):
        """`select_one` must return the same node as `select(..., limit=1)`."""

        markup = """
        <!-- before header -->
        <html>
        <head>
        </head>
        <body>
        <!-- comment -->
        <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
        <pre id="4"></pre>
        <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
        <pre id="6" class='ignore'>
            <!-- don't ignore -->
        </pre>
        </body>
        </html>
        """

        soup = self.soup(markup, 'html5lib')
        first = sv.select('span[id]', soup, limit=1)[0]
        only = sv.select_one('span[id]', soup)
        self.assertEqual(first.attrs['id'], only.attrs['id'])
Exemplo n.º 3
0
    def test_closest(self):
        """Exercise `closest` against ancestors, self and non-matches."""

        markup = """
        <article id="article">
          <div id="div-01">Here is div-01
            <div id="div-02">Here is div-02
              <div id="div-04">Here is div-04</div>
              <div id="div-03">Here is div-03</div>
            </div>
            <div id="div-05">Here is div-05</div>
          </div>
        </article>
        """

        soup = bs4.BeautifulSoup(markup, 'html5lib')
        start = sv.select_one('#div-03', soup)

        # Nearest ancestor matching a simple id selector.
        self.assertTrue(sv.closest('#div-02', start).attrs['id'] == 'div-02')
        # The starting element itself is a candidate match.
        self.assertTrue(sv.closest('div div', start).attrs['id'] == 'div-03')
        # Complex selectors resolve against the ancestor chain.
        self.assertTrue(
            sv.closest('article > div', start).attrs['id'] == 'div-01')
        self.assertTrue(sv.closest(':not(div)', start).attrs['id'] == 'article')
        # Elements outside the ancestor chain never match.
        self.assertTrue(sv.closest('div #div-05', start) is None)
        self.assertTrue(sv.closest('a', start) is None)
Exemplo n.º 4
0
    def test_closest_match_self(self):
        """`closest` may match the starting element itself."""

        markup = """
        <article id="article">
          <div id="div-01">Here is div-01
            <div id="div-02">Here is div-02
              <div id="div-04">Here is div-04</div>
              <div id="div-03">Here is div-03</div>
            </div>
            <div id="div-05">Here is div-05</div>
          </div>
        </article>
        """

        soup = self.soup(markup, 'html5lib')
        start = sv.select_one('#div-03', soup)
        # '#div-03' itself satisfies 'div div', so the closest match is self.
        self.assertTrue(sv.closest('div div', start).attrs['id'] == 'div-03')
Exemplo n.º 5
0
    def test_closest_must_be_parent(self):
        """Test that closest only matches parents or self."""

        markup = """
        <article id="article">
          <div id="div-01">Here is div-01
            <div id="div-02">Here is div-02
              <div id="div-04">Here is div-04</div>
              <div id="div-03">Here is div-03</div>
            </div>
            <div id="div-05">Here is div-05</div>
          </div>
        </article>
        """

        soup = self.soup(markup, 'html5lib')
        start = sv.select_one('#div-03', soup)
        # '#div-05' is a sibling branch, not an ancestor — no match.
        self.assertTrue(sv.closest('div #div-05', start) is None)
        # No <a> exists anywhere on the ancestor chain.
        self.assertTrue(sv.closest('a', start) is None)
Exemplo n.º 6
0
    def test_closest_match_complex_parent(self):
        """`closest` resolves complex selectors against the ancestor chain."""

        markup = """
        <article id="article">
          <div id="div-01">Here is div-01
            <div id="div-02">Here is div-02
              <div id="div-04">Here is div-04</div>
              <div id="div-03">Here is div-03</div>
            </div>
            <div id="div-05">Here is div-05</div>
          </div>
        </article>
        """

        soup = self.soup(markup, 'html5lib')
        start = sv.select_one('#div-03', soup)
        # Child combinator: the nearest ancestor that is a direct <article> child.
        self.assertTrue(sv.closest('article > div', start).attrs['id'] == 'div-01')
        # First non-div up the chain is the <article> itself.
        self.assertTrue(sv.closest(':not(div)', start).attrs['id'] == 'article')
Exemplo n.º 7
0
def scrape(city):
    """Fetch the latest tide table for *city* from tide-forecast.com and
    print every low tide that coincides with sunrise or sunset.

    Parameters:
        city: Location name, e.g. "Half Moon Bay, California". Runs of
            whitespace/commas are collapsed into hyphens to build the URL slug.

    Side effects:
        Performs one HTTP GET and prints matching rows to stdout.
    """
    city_url_id = re.sub(r'[\s,]+', "-", city)
    city_url = "https://www.tide-forecast.com/locations/" + city_url_id + "/tides/latest"
    r = requests.get(url=city_url)
    html = r.text
    soup = BeautifulSoup(html, 'html.parser')
    rows = sv.select(".tide-table > tr", soup)

    curr_date = ""
    timeofday = ""

    for el in rows:
        # Date cells only appear on the first row of each day; carry the
        # value forward for subsequent rows.
        dateInst = sv.select_one(".date", el)
        if dateInst is not None:
            curr_date = dateInst.text.strip()

        tide_time = ""
        tide_time_inst = sv.select_one(".time", el)
        if tide_time_inst is not None:
            tide_time = tide_time_inst.text.strip()

        timezone = ""
        timezone_inst = sv.select_one(".time-zone", el)
        # BUG FIX: the original tested `tide_time_inst` here (copy-paste
        # error), which raised AttributeError on rows that have a time but
        # no time-zone cell.
        if timezone_inst is not None:
            timezone = timezone_inst.text.strip()

        level = ""
        level_inst = sv.select_one(".level", el)
        if level_inst is not None:
            level = level_inst.text.strip()

        tide_phase = ""
        tide_phase_inst = sv.select_one(".tide:last-child", el)
        if tide_phase_inst is not None:
            tide_phase = tide_phase_inst.text.strip()
        else:
            # Rows without a tide phase may mark sunrise/sunset instead;
            # that state persists until the next marker row resets it.
            timeofday_inst = sv.select_one("td:last-child", el)
            if timeofday_inst is not None:
                timeofday_val = timeofday_inst.text.strip()
                if timeofday_val in ("Sunrise", "Sunset"):
                    timeofday = timeofday_val
                else:
                    timeofday = ""

        if tide_phase == "Low Tide" and timeofday in ("Sunrise", "Sunset"):
            print('{0} {1} {2} {3} {4}'.format(city, curr_date, tide_time,
                                               timezone, level))
Exemplo n.º 8
0
    def test_select_one_none(self):
        """Test select one returns none for no match."""

        markup = """
        <!-- before header -->
        <html>
        <head>
        </head>
        <body>
        <!-- comment -->
        <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p>
        <pre id="4"></pre>
        <p><span id="5" class="some-class"></span><span id="some-id"></span></p>
        <pre id="6" class='ignore'>
            <!-- don't ignore -->
        </pre>
        </body>
        </html>
        """

        soup = self.soup(markup, 'html5lib')
        # The document contains no <h1>, so nothing can match.
        result = sv.select_one('h1', soup)
        self.assertEqual(None, result)
Exemplo n.º 9
0
# Walk every track card on the page and print its metadata.
card: Tag
for card in sv.select('div.sound__body', beautifulSoapContent):
    # The content pane holds both the track link and the artist link.
    content = sv.select_one('div.sound__content', card)

    track = sv.select_one('a.soundTitle__title.sc-link-dark',
                          content)
    title = sv.select_one('span', track).text
    print("Track Title : " + title)
    print("Track Page : https://soundcloud.com" + track.get('href'))
    print("Track Station Page : https://soundcloud.com/stations/track" +
          track.get('href'))

    artist = sv.select_one('a.soundTitle__username.sc-link-light',
                           content)
    print("Artist : " +
          sv.select_one('span.soundTitle__usernameText', artist).text)
    print("Artist Page : https://soundcloud.com" + artist.get('href'))
Exemplo n.º 10
0
def scrape_posts(html):
    """Parse a Facebook mobile group page and return ``(next_link, posts)``.

    ``next_link`` is the resolved pagination URL (or None when absent);
    ``posts`` is a list of FacebookPost records, one per story exposing a
    "Full Story" permalink.
    """
    soup = BeautifulSoup(html, 'lxml')

    # The pagination link is recognized by its `bacr` query parameter.
    next_link = soup.select_one('a[href*="?bacr="], a[href*="&bacr="]')

    if next_link is not None:
        next_link = resolve_relative_url(next_link.get('href'))

    posts = []
    post_elements = soup.select('#m_group_stories_container > div > [data-ft]')

    for el in post_elements:
        # Only story elements with a "Full Story" permalink count as posts.
        full_story_link = soupsieve.select_one('a:-soup-contains("Full Story")', el)

        if not full_story_link:
            continue

        post_url = cleanup_post_link(full_story_link.get('href'))

        user_label, user = extract_user_information_from_link(el.select_one('h3 a'))

        formatted_date = el.select_one('abbr').get_text().strip()
        parsed_date = parse_date(formatted_date)

        # Reaction count, when present, precedes the first '·' separator.
        reactions_item = el.select_one('[id^="like_"]')
        reactions = '0'

        if reactions_item:
            reactions_text = reactions_item.get_text()

            if reactions_text.count('·') > 1:
                reactions = reactions_text.split('·', 1)[0].strip()

        comments_item = soupsieve.select_one('a:-soup-contains(" Comment")', el)
        comments = '0'

        if comments_item:
            comments = comments_item.get_text().split('Comment', 1)[0].strip()

        # Text content lives under data-ft '{"tn":"*s"}'; image blocks under
        # '{"tn":"H"}'.
        # NOTE(review): `text_root` is dereferenced below without a None
        # check — confirm every matching post carries the '{"tn":"*s"}' node.
        text_root = el.select_one('[data-ft=\'{"tn":"*s"}\']')
        additional_html_roots = []

        img_root = el.select_one('[data-ft=\'{"tn":"H"}\']')

        if img_root:
            additional_html_roots.append(img_root)

        all_text_elements = text_root.find_all('div', recursive=False)

        text_elements = []
        translated_text_elements = []
        translation_link = None

        # Split blocks into original vs. machine-translated text.  The
        # translation-preferences link marks a translated block and is
        # stripped from it before the block is kept.
        for text_el in all_text_elements:
            translation_link = text_el.select_one('a[href^="/basic/translation_preferences/"]')
            if translation_link is None:
                text_elements.append(text_el)
            else:
                translation_link.extract()
                translated_text_elements.append(text_el)

        html_elements = text_elements + additional_html_roots

        comment_text = get_display_text(text_elements)
        comment_html = ''.join(str(el) for el in html_elements)

        translated_comment_text = get_display_text(translated_text_elements)
        translated_comment_html = ''.join(str(el) for el in translated_text_elements)
        # e.g. link text "... from French" yields "French".
        translated_from = translation_link.get_text().rsplit('from ', 1)[-1].strip() if translation_link else None

        post = FacebookPost(
            url=post_url,
            user_id=getattr(user, 'id', ''),
            user_handle=getattr(user, 'handle', ''),
            user_url=getattr(user, 'url', ''),
            user_label=user_label,
            text=comment_text,
            html=comment_html,
            translated_text=translated_comment_text,
            translated_html=translated_comment_html,
            translated_from=translated_from,
            formatted_date=formatted_date,
            date=parsed_date,
            reactions=reactions,
            comments=comments
        )

        posts.append(post)

    return next_link, posts
Exemplo n.º 11
0
def interpret_scraper(scraper, element, root=None, context=None, path=None, scope=None):
    """Recursively evaluate a declarative *scraper* specification against a
    soup *element*.

    Parameters:
        scraper: Either a string (tail call: an extractor name or an
            attribute to read from the element) or a dict describing
            selection, iteration, fields, filtering, etc.
        element: Current node the scraper is applied to.
        root: Document root, forwarded to expression evaluation.
        context: Inherited evaluation context (dict-like), may be merged
            with a local `set_context` layer.
        path: Breadcrumb of keys leading to this scraper node, used by
            `eval_expression` for error reporting.
        scope: Shared EvaluationScope; created lazily if omitted.

    Returns:
        A scalar when no iterator is declared, otherwise a list of values;
        None when the local selection matched nothing.
    """
    # BUG FIX: `path` previously defaulted to a mutable `[]`, which is
    # shared between calls; normalize from None instead.
    if path is None:
        path = []

    if scope is None:
        scope = EvaluationScope()

    # Is this a tail call of item?
    if isinstance(scraper, str):
        if scraper in EXTRACTOR_NAMES:
            return extract(element, scraper)

        return element.get(scraper)

    sel = get_sel(scraper)
    iterator = get_iterator(scraper)

    # First we need to solve local selection
    if sel is not None:
        element = soupsieve.select_one(sel, element)
    elif 'sel_eval' in scraper:

        evaluated_sel = eval_expression(
            scraper['sel_eval'],
            element=element,
            elements=[],
            context=context,
            root=root,
            path=path + ['sel_eval'],
            expect=(Tag, str),
            allow_none=True,
            scope=scope
        )

        # A string result is a CSS selector; anything else is the node itself.
        if isinstance(evaluated_sel, str):
            element = soupsieve.select_one(evaluated_sel, element)
        else:
            element = evaluated_sel

    if element is None:
        return None

    # Then we need to solve iterator
    single_value = True

    if iterator is not None:
        single_value = False
        elements = soupsieve.select(iterator, element)
    elif 'iterator_eval' in scraper:
        single_value = False
        evaluated_elements = eval_expression(
            scraper['iterator_eval'],
            element=element,
            elements=[],
            context=context,
            root=root,
            path=path + ['iterator_eval'],
            check=is_valid_iterator_eval_output,
            scope=scope
        )

        if isinstance(evaluated_elements, str):
            elements = soupsieve.select(evaluated_elements, element)
        else:
            elements = evaluated_elements
    else:
        elements = [element]

    # Handling local context
    if 'set_context' in scraper:
        local_context = {}

        for k, field_scraper in scraper['set_context'].items():
            local_context[k] = interpret_scraper(
                field_scraper,
                element,
                root=root,
                context=context,
                path=path + ['set_context', k],
                scope=scope
            )

        context = merge_contexts(context, local_context)

    # Actual iteration
    acc = None if single_value else []

    # Deduplication bookkeeping is only needed when iterating with 'uniq'.
    already_seen = set() if 'uniq' in scraper and not single_value else None

    for element in elements:
        value = None

        # Do we have fields?
        if 'fields' in scraper:
            value = {}

            for k, field_scraper in scraper['fields'].items():
                value[k] = interpret_scraper(
                    field_scraper,
                    element,
                    root=root,
                    context=context,
                    path=path + ['fields', k],
                    scope=scope
                )

        # Do we have a scalar?
        elif 'item' in scraper:

            # Default value is text
            value = interpret_scraper(
                scraper['item'],
                element,
                root=root,
                context=context,
                path=path + ['item'],
                scope=scope
            )

        else:

            if 'attr' in scraper:
                value = element.get(scraper['attr'])
            elif 'extract' in scraper:
                value = extract(element, scraper['extract'])
            elif 'get_context' in scraper:
                value = nested_get(scraper['get_context'], context)
            elif 'default' not in scraper:

                # Default value is text
                value = extract(element, 'text')

            # Eval?
            if 'eval' in scraper:
                value = eval_expression(
                    scraper['eval'],
                    element=element,
                    elements=elements,
                    value=value,
                    context=context,
                    root=root,
                    path=path + ['eval'],
                    expect=DATA_TYPES,
                    allow_none=True,
                    scope=scope
                )

        # Default value after all?
        if 'default' in scraper and value is None:
            value = scraper['default']

        if single_value:
            acc = value
        else:

            # Filtering?
            if 'filter_eval' in scraper:
                passed_filter = eval_expression(
                    scraper['filter_eval'],
                    element=element,
                    elements=elements,
                    value=value,
                    context=context,
                    root=root,
                    path=path + ['filter_eval'],
                    expect=bool,
                    allow_none=True,
                    scope=scope
                )

                if not passed_filter:
                    continue

            if 'filter' in scraper:
                filtering_clause = scraper['filter']

                # `filter: true` drops falsy values; a string clause drops
                # values whose nested key is falsy.
                if filtering_clause is True and not value:
                    continue

                if isinstance(filtering_clause, str) and not nested_get(filtering_clause, value):
                    continue

            if 'uniq' in scraper:
                uniq_clause = scraper['uniq']
                k = value

                if uniq_clause is True and value in already_seen:
                    continue

                if isinstance(uniq_clause, str):
                    k = nested_get(uniq_clause, value)

                    if k in already_seen:
                        continue

                already_seen.add(k)

            acc.append(value)

    return acc
Exemplo n.º 12
0
    def googleTranslate(self, translateString):
        """Translate *translateString* with Google Translate via Selenium.

        Returns a dict with "data1" (the translation text, or Google's
        error message when the page reports one) and "data2" (alternative
        meanings grouped by part of speech, empty when the dictionary pane
        is hidden).
        """
        driver = self.driver
        wait = self.wait
        # Clear any previous input and wait for the old result to vanish
        # so we never read a stale translation.
        driver.find_element_by_css_selector(
            'textarea#source.orig.tlid-source-text-input.goog-textarea').clear(
            )
        wait.until(
            expected.invisibility_of_element_located(
                (By.CSS_SELECTOR, 'span.tlid-translation.translation')))
        driver.find_element_by_css_selector(
            'textarea#source.orig.tlid-source-text-input.goog-textarea'
        ).send_keys(translateString)
        try:
            wait.until(
                expected.visibility_of_element_located(
                    (By.CSS_SELECTOR, 'span.tlid-translation.translation')))
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        except Exception:
            # The result never appeared — check whether Google rendered an
            # error pane and, if so, surface its message.
            html = driver.page_source
            driver.find_element_by_css_selector(
                'textarea#source.orig.tlid-source-text-input.goog-textarea'
            ).clear()
            if (sv.select_one('div.result-error',
                              BeautifulSoup(html, 'html.parser'))):
                return {
                    "data1":
                    sv.select_one('span.tlid-result-error',
                                  BeautifulSoup(html, 'html.parser')).text
                }
        html = driver.page_source
        driver.find_element_by_css_selector(
            'textarea#source.orig.tlid-source-text-input.goog-textarea').clear(
            )

        # Isolate the translation pane from the full page.
        select1 = str(
            sv.select_one('div.homepage-content-wrap',
                          BeautifulSoup(html, 'html.parser')))

        # data1: main translation text; <br/> tags become newlines.
        resultString1 = ''
        select4 = sv.select_one('span.tlid-translation.translation',
                                BeautifulSoup(select1, 'html.parser'))
        resultString1 = str(select4).replace('<br/>', '<span>\n</span>')
        resultString1 = BeautifulSoup(resultString1, 'html.parser').text

        # data2: alternative-meanings table, only present when the
        # dictionary pane's inline style marks it visible.
        resultString2 = ''
        if ('<div class="gt-cd gt-cd-mbd gt-cd-baf" style="display: none;"'
                in select1):
            pass
        elif ('<div class="gt-cd gt-cd-mbd gt-cd-baf" style=""' in select1):
            select2 = str(
                sv.select_one('table.gt-baf-table',
                              BeautifulSoup(select1, 'html.parser')))
            select3 = sv.select(
                'span.gt-cd-pos, span.gt-baf-cell.gt-baf-word-clickable',
                BeautifulSoup(select2, 'html.parser'))
            for data in select3:
                strData = str(data)
                # Word cells are appended comma-separated; part-of-speech
                # cells open a new "* pos *" section.
                if ('gt-baf-cell' in strData):
                    resultString2 += "{}, ".format(data.text)
                elif ('gt-cd-pos' in strData):
                    resultString2 = resultString2.rstrip(", ")
                    resultString2 += "\n* {} *\n: ".format(data.text)
            resultString2 = resultString2.lstrip("\n")
            resultString2 = resultString2.rstrip(", ")

        return {"data1": resultString1, "data2": resultString2}
Exemplo n.º 13
0
    def googleTranslate(self):
        """Offline variant: parse a saved Google Translate page from
        ./string.html, print the translation and the alternative-meaning
        table, and dump the isolated pane to ./test.html."""

        with open('./string.html', 'r', encoding='utf-8') as f:
            html = f.read()

        # Isolate the translation pane from the full page.
        pane = str(
            sv.select_one('div.homepage-content-wrap',
                          BeautifulSoup(html, 'html.parser')))

        with open('./test.html', 'w', encoding='utf-8') as f:
            f.write(pane)

        # Main translation text; <br/> tags become newlines.
        translation_node = sv.select_one('span.tlid-translation.translation',
                                         BeautifulSoup(pane, 'html.parser'))
        translation = str(translation_node).replace('<br/>', '<span>\n</span>')
        translation = BeautifulSoup(translation, 'html.parser').text
        print(translation)

        # Alternative meanings, only present when the dictionary pane's
        # inline style marks it visible.
        meanings = ''
        if ('<div class="gt-cd gt-cd-mbd gt-cd-baf" style="display: none;"'
                in pane):
            pass
        elif ('<div class="gt-cd gt-cd-mbd gt-cd-baf" style=""' in pane):
            table = str(
                sv.select_one('table.gt-baf-table',
                              BeautifulSoup(pane, 'html.parser')))
            cells = sv.select(
                'span.gt-cd-pos, span.gt-baf-cell.gt-baf-word-clickable',
                BeautifulSoup(table, 'html.parser'))
            for cell in cells:
                raw = str(cell)
                # Word cells are appended comma-separated; part-of-speech
                # cells open a new "* pos *" section.
                if ('gt-baf-cell' in raw):
                    meanings += "{}, ".format(cell.text)
                elif ('gt-cd-pos' in raw):
                    meanings = meanings.rstrip(", ")
                    meanings += "\n* {} *\n: ".format(cell.text)
            meanings = meanings.lstrip("\n")
        print(meanings)

        return
Exemplo n.º 14
0
    artist_id = sys.argv[2]
    search_term = sys.argv[3]
else:
    artist = 'Spice-Girls'
    artist_id = 199833
    search_term = 'good sheep'

# Echo the search parameters chosen above.
print(str(artist_id) + ' : ' + artist)
print('Search for : ' + search_term)

# Artist discography listing on lyrics.com.
url = 'https://www.lyrics.com/artist.php?name=' + artist + '&aid=' + str(
    artist_id) + '&o=1'
page_source = requests.get(url).text
beautiful_soap_content = BeautifulSoup(page_source, "lxml")

# Walk every song row of the discography table and scan its lyrics.
for song in sv.select('tr', sv.select_one('tbody', beautiful_soap_content)):
    song_element = sv.select_one('a', song)
    print('\n\nSong Title : ' + song_element.text)
    song_url = 'https://www.lyrics.com' + song_element.get('href')
    print('Song URL : ' + song_url + '\n')
    song_page_source = requests.get(song_url).text
    song_page_content = BeautifulSoup(song_page_source, "lxml")
    # Lyrics are rendered inside a <pre> block on the song page.
    song_lyrics = sv.select_one('pre', song_page_content).text
    print(song_lyrics)

    # Case-insensitive match; the loop deliberately continues through all
    # songs instead of stopping at the first hit.
    if re.search(search_term, song_lyrics, re.IGNORECASE):
        print(search_term + ' Found On ' + song_element.text)