Example #1
import re
from urllib.parse import unquote

from bs4.element import ResultSet


def create_instance(index: int, block: ResultSet):
    """Build a GoogleResult from one parsed search-result block."""
    try:
        title = block.find('div', {'class': 'BNeawe vvjwJb AP7Wnd'}).text
        # Pull the target URL out of Google's redirect href and unescape it.
        link = unquote(re.search(r'((https|http):\/\/[^&]*)',
                                 block.find('a').attrs.get('href'))[0])
        return GoogleResult(index, title, link)
    except (AttributeError, TypeError):
        # The block lacks the expected title div or anchor; skip it.
        return None
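A minimal usage sketch; the HTML fragment and the namedtuple stand-in for the project's GoogleResult class are assumptions for illustration:

from collections import namedtuple
from bs4 import BeautifulSoup

# Hypothetical stand-in for the project's GoogleResult class.
GoogleResult = namedtuple('GoogleResult', 'index title link')

html = ('<div><div class="BNeawe vvjwJb AP7Wnd">Example title</div>'
        '<a href="/url?q=https://example.com/page&amp;sa=U">go</a></div>')
block = BeautifulSoup(html, 'html.parser').div
print(create_instance(0, block))
# GoogleResult(index=0, title='Example title', link='https://example.com/page')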
Example #2
import copy
import re

from bs4 import BeautifulSoup
from bs4.element import ResultSet, Tag


def scrape_tag_contents(tags, html):
    tag_list = copy.copy(tags)
    if isinstance(html, Tag):
        soup = html
    else:
        soup = BeautifulSoup(html, "lxml")
    results = []
    # The last (tag, attr) pair describes the content to extract;
    # the pairs before it describe the chain of containers to walk.
    content_tag, content_attr = tag_list.pop()
    if not len(tag_list):
        return list(soup.findAll(name=content_tag, attrs=content_attr))
    first_tag, first_attr = tag_list.pop(0)
    element_list = soup.findAll(name=first_tag, attrs=first_attr)

    # Narrow the element list one container level at a time.
    for tag, attr in tag_list:
        temp = ResultSet([], ())
        for element in element_list:
            if isinstance(attr, dict):
                temp += element.findAll(name=tag, attrs=attr)
            elif isinstance(attr, str):
                if element.has_attr(attr):
                    temp.append(element[attr])

        element_list = temp

    for element in element_list:
        if content_tag == "regex":
            # The attr slot carries a regex pattern instead of attributes.
            pattern = content_attr
            text = element
            if not isinstance(text, str):
                text = element.text
            if text:
                match = re.findall(pattern, text)
                if match:
                    results.append(match[0])
        elif content_attr is None or content_attr == "":
            if content_tag is None or content_tag == "":
                text = element
            else:
                text = element.find(content_tag)
            if text:
                results.append(text.text)
        elif content_tag is None or content_tag == "":
            if element.has_attr(content_attr):
                results.append(element[content_attr])
        else:
            info_container = element.findAll(name=content_tag)
            for container in info_container:
                if isinstance(content_attr, dict):
                    results.append(container)
                elif container.has_attr(content_attr):
                    results.append(container[content_attr])
    return results
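A usage sketch, assuming tags is an ordered list of (name, attrs) pairs whose last pair selects the content to extract; the markup is invented:

html = ('<ul><li class="item"><span>alpha</span></li>'
        '<li class="item"><span>beta</span></li></ul>')
# Walk the li.item containers, then pull the text of each nested <span>.
print(scrape_tag_contents([('li', {'class': 'item'}), ('span', '')], html))
# ['alpha', 'beta']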
Example #3
    def parse(self):
        """Method to parse results.

        Returns:
          (list): list of parsed complaint mentions.

        """
        parse = dateparser.parse
        results = ResultSet([])
        url_parts = urlparse(self.link)
        url_splitted_path = list(filter(None, url_parts.path.split('/')))
        stop_ends = ['complaints', 'customer-reviews', 'print']

        # Drop a trailing view suffix so '/print' can be appended cleanly.
        if url_splitted_path and url_splitted_path[-1] in stop_ends:
            url_splitted_path.pop()

        url_path = '/'.join(url_splitted_path + ['print'])

        url = urlunparse((url_parts.scheme, url_parts.netloc, url_path,
                          None, None, None))

        try:
            html_doc = request.urlopen(url)
        except urllib_error.HTTPError:
            logger.error('BBB parser failed with {0}'.format(self.link))
            raise

        soup = BeautifulSoup(html_doc, 'html.parser')

        try:
            results += soup.find(
                'table', 'cmpldetail'
            ).find_all(
                'tr', re.compile('odd|even')
            )
        except AttributeError:
            pass

        try:
            results += soup.find(
                'div', 'customer-complaint-summary'
            ).find_all(
                'tr', re.compile('odd|even')
            )
        except AttributeError:
            pass

        for result in results:
            try:
                date_ = parse(result.find('td', 'date').text)
                text = result.find('p').text or ''
                text = re.sub(r'^Complaint:|Complaint', '', text).strip()

                self.results.append(
                    self.create_mention(text=text, date=date_)
                )
            except AttributeError:
                continue

        return self.results
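The URL rewrite at the top of parse can be tried in isolation; a sketch with an invented profile link:

from urllib.parse import urlparse, urlunparse

link = 'https://www.bbb.org/us/ca/city/profile/acme-1234/complaints'
parts = urlparse(link)
path = list(filter(None, parts.path.split('/')))
if path and path[-1] in ('complaints', 'customer-reviews', 'print'):
    path.pop()  # drop the view suffix before appending 'print'
print(urlunparse((parts.scheme, parts.netloc, '/'.join(path + ['print']),
                  None, None, None)))
# https://www.bbb.org/us/ca/city/profile/acme-1234/print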
Example #4
    def alternateRowRemover(rows: ResultSet) -> tuple:
        """Remove every row whose second cell contains a percentage."""
        counter = 0
        multiple = 0

        while counter < len(rows):
            try:
                if "%" in rows[counter].find_all("td")[1].string:
                    rows.remove(rows[counter])
                    # Removal shifts the indices, so restart the scan.
                    counter = 0
                    multiple += 1
                    continue

            except (IndexError, TypeError):
                # Row has fewer than two cells, or the cell holds no string.
                pass

            counter += 1

        return rows, multiple
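A usage sketch treating the method as a standalone function; the table markup is invented:

from bs4 import BeautifulSoup

html = ('<table><tr><td>fee</td><td>5%</td></tr>'
        '<tr><td>count</td><td>7</td></tr></table>')
rows = BeautifulSoup(html, 'html.parser').find_all('tr')
kept, removed = alternateRowRemover(rows)
print(len(kept), removed)  # 1 1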
Example #5
import re
from typing import List, Tuple

from bs4.element import ResultSet


def parse_row(r: ResultSet) -> Tuple[str, List[str]]:
    """Break up a villager's table row into the individual parts.

    Arguments:
        r {ResultSet} -- The row to be parsed

    Returns:
        Tuple[str, List[str]] -- A villager's name and their attributes
    """
    bad_vals = re.compile(r'\W')
    # Clean each cell, then split off the name; the second cell is skipped.
    cols = [re.sub(bad_vals, '', val.text) for val in r.find_all('td')]
    return (cols[0], cols[2:])
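A quick check with an invented villager row; note the function skips the second cell:

from bs4 import BeautifulSoup

row = BeautifulSoup(
    '<tr><td>Ace</td><td><img/></td><td>Bird</td><td>Jock</td></tr>',
    'html.parser').tr
print(parse_row(row))  # ('Ace', ['Bird', 'Jock'])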
Example #6
    def links(self, character_page):
        """Collect character links from one category page and note the next page."""
        try:
            page = urllib2.urlopen(character_page)
        except urllib2.URLError:
            # The page could not be fetched; return an empty result set.
            character_list = ResultSet([])
            return character_list

        soup = BeautifulSoup(page, 'html.parser')
        character_list = soup.findAll(
            'a', attrs={'class': 'category-page__member-link'})

        nextpage = soup.find('a',
                             attrs={'class': 'category-page__pagination-next'})

        if nextpage is None:
            self.nextPage = None
        else:
            self.nextPage = nextpage.get('href')

        return character_list
Example #7
from typing import List, Optional, Tuple

from bs4.element import ResultSet

# Assumed shape of the project's TagInfo: a (CSS selector, attribute rules)
# pair, where each rule is an (attribute name, allowed values) pair.
TagInfo = Tuple[str, List[Tuple[str, List[str]]]]


def helper_remove_tags(element: ResultSet,
                       tags_to_drop: Optional[List[TagInfo]] = None,
                       debug=False):
    for tag_to_drop in tags_to_drop or []:
        if debug:
            print(f"Drop tag '{tag_to_drop}'")
        for s in element.select(tag_to_drop[0]):
            # An element survives only if it satisfies every attribute rule:
            # it has the attribute and its value is in a non-empty allowed
            # list. With no rules at all, every match is dropped.
            drop = True
            if len(tag_to_drop[1]) > 0:
                drop = False
                for attribute in tag_to_drop[1]:
                    if not s.has_attr(attribute[0]):
                        drop = True
                        break
                    if len(attribute[1]) == 0 or s[
                            attribute[0]] not in attribute[1]:
                        drop = True
                        break
            if drop:
                if debug:
                    print(f"Drop element '{s}' in element '{element}'")
                s.decompose()
            elif len(s.select(tag_to_drop[0])) != 0:
                # Recurse into survivors that still contain matches.
                helper_remove_tags(s, tags_to_drop, debug)
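A usage sketch under the assumed TagInfo shape above; the markup is invented, and a single-valued attribute (id) is used because the rules compare whole attribute values:

from bs4 import BeautifulSoup

soup = BeautifulSoup(
    '<div><script>track()</script>'
    '<a id="ad">buy</a><a id="nav">home</a></div>', 'html.parser')
# Drop every <script>, and every <a> whose id is not "nav".
helper_remove_tags(soup, [('script', []), ('a', [('id', ['nav'])])])
print(soup)  # <div><a id="nav">home</a></div>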
Example #8
def get_chapter_instance_from_li(
        li: ResultSet) -> Optional[WebToonChapter]:
    anchor = li.find('a')
    img = li.find('img')
    if anchor is None or img is None:
        # Malformed list item; honor the Optional return type.
        return None
    url = anchor['href']
    episode_pretty_name = img['alt'].strip()
    return WebToonChapter.from_url(url, episode_pretty_name)
Example #9
    def _get_href(self, tag: ResultSet) -> str:
        return tag.get("href")
Example #10
def helper_rename_tags(element: ResultSet,
                       tags_to_rename: Optional[List[TagRenameInfo]] = None,
                       debug=False):
    # TagRenameInfo is assumed to be an (old tag name, new tag name) pair.
    for tag_to_rename in tags_to_rename or []:
        for tag in element.find_all(tag_to_rename[0]):
            # Rename the tag in place, e.g. <b> -> <strong>.
            tag.name = tag_to_rename[1]
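A usage sketch; the (old name, new name) pair shape is an assumption, and the markup is invented:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<b>bold</b><i>italic</i>', 'html.parser')
helper_rename_tags(soup, [('b', 'strong'), ('i', 'em')])
print(soup)  # <strong>bold</strong><em>italic</em>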