def get_anime_episodes(url):
    ''' Scrapes the episode list from a mal anime episode page.

    Parameters:
        url [string]: mal anime episode url
            (https://myanimelist.net/anime/<mal_anime_id>/episode)

    Returns
        eps [list]: one dict per episode with keys
            ep_num, eng_title, jap_title, aired, url
    '''
    soup = utility.get_soup(url)
    episodes = []
    # each episode is rendered as one <tr class="episode-list-data">
    for tr in soup.find_all("tr", class_="episode-list-data"):
        number_cell = tr.find("td", class_="episode-number")
        title_cell = tr.find("td", class_="episode-title")
        # the <a> inside the title cell carries both the english title and the link
        anchor = title_cell.find("a")
        episodes.append({
            "ep_num": int(number_cell.text),
            "eng_title": anchor.text.strip(),
            "jap_title": title_cell.find("span").text.strip(),
            "aired": tr.find("td", class_="episode-aired").text.strip(),
            "url": anchor["href"]
        })
    return episodes
def get_anime_characters(url):
    ''' Gets anime characters' information from mal anime url.

    Parameters:
        url (string): mal anime characters url
            (https://myanimelist.net/anime/<mal anime id>/characters)

    Returns
        characters (list): one dict per (character, voice actor) pair with keys
            character, url, type, va, va_lang, va_url
    '''
    soup = utility.get_soup(url)
    characters = []
    for h in soup.find_all('h2'):
        h = utility.remove_children(h)
        # FIX: original tested the same literal twice
        # ("... == 'Characters & Voice Actors' or ... == 'Characters & Voice Actors'")
        if h.text.strip() != 'Characters & Voice Actors':
            continue
        # character tables are siblings immediately following the header's parent
        current_tag = h.parent.nextSibling
        while current_tag is not None and current_tag.name == "table":
            cells = current_tag.find_all("td")
            character = cells[1]
            character_url = character.find("a")["href"]
            character_name = character.find("a").text.strip()
            character_type = character.find("div").find("small").text.strip()
            # remaining cells hold the voice-actor entries
            for cell in cells[3:]:
                # FIX: cell["valign"] raised KeyError on cells without the
                # attribute; .get() returns None there instead
                if cell.get("valign") != "top":
                    continue
                va_anchor = cell.find("a")
                seiyuu_lang = cell.find("small")
                # skip cells that lack either the VA link or the language tag
                if not va_anchor or not seiyuu_lang:
                    continue
                characters.append({
                    "character": character_name,
                    "url": character_url,
                    "type": character_type,
                    "va": va_anchor.text.strip(),
                    "va_lang": seiyuu_lang.contents[0],
                    "va_url": va_anchor["href"]
                })
            current_tag = current_tag.nextSibling
    return characters
def get_character_info(url, full=False):
    ''' Gets character information from mal character url.

    NOTE(review): a second ``get_character_info`` defined later in this module
    shadows this definition at import time.

    Parameters:
        url [string]: mal character url (https://myanimelist.net/character/...)
        full [bool] [default=False]: indicate whether to get additional
            information (currently unused here)

    Returns
        info [dict]: mal character information
    '''
    # TODO change full option to dict option, as add on (so check if is boolean or dict)
    info = {}
    soup = utility.get_soup(url)
    content = soup.find('div', {'id': 'content'})

    ### META INFO -START
    # mal id is the path segment right after "/character/"
    info['mal_id'] = url[url.find('/character/') + 1:].split('/')[1]
    url_tag = soup.find('meta', property="og:url")
    info['url'] = url_tag['content'] if url_tag else url
    ### META INFO -END

    ### NAME -START
    # 1st h2 header in content div holds "<english name> (<japanese name>)"
    eng_name_tag = content.find('h2', {'class': 'normal_header'})
    jap_name_tag = eng_name_tag.find('small')
    if jap_name_tag:
        info['jap_name'] = re.sub(r'[\(\)]', '', jap_name_tag.text)
    if eng_name_tag:
        utility.remove_children(eng_name_tag)
        info['eng_name'] = eng_name_tag.text
    ### NAME -END

    ### MEMBER FAVES -START
    match = re.search(r'Member Favorites: ([0-9]*\,*[0-9]*)\n', content.text)
    if match:
        # strip every non-digit (thousands separators etc.) before converting
        info['member_faves'] = int(re.sub(r'\D', '', match.group(0)))
    ### MEMBER FAVES -END

    ### POST-PROCESSING -START
    info['retrieved_on'] = timestamp
    return info
def get_anime_staff(url):
    ''' Gets anime staff information from mal anime url.

    Parameters:
        url (string): mal anime characters url
            (https://myanimelist.net/anime/<mal anime id>/characters)

    Returns
        staff (list): one dict per staff member with keys
            staff, staffUrl, roles
    '''
    soup = utility.get_soup(url)
    staff = []
    for header in soup.find_all('h2'):
        header = utility.remove_children(header)
        if header.text.strip() != 'Staff':
            continue
        # staff tables are siblings immediately following the header's parent
        table = header.parent.nextSibling
        while table is not None and table.name == "table":
            cells = table.find_all("td")
            person_cell = cells[1]
            anchor = person_cell.find('a')
            # roles are a comma-separated list inside a <small> tag
            role_text = person_cell.find("small").text.strip()
            roles = [r.strip() for r in role_text.split(",")] if role_text else []
            staff.append({
                "staff": anchor.text.strip(),
                "staffUrl": anchor['href'],
                "roles": roles
            })
            table = table.nextSibling
    return staff
def get_mal_stats(url):
    ''' Gets anime mal stats information from mal anime url.

    Parameters:
        url (string): mal anime stats url
            (https://myanimelist.net/anime/<anime_id>/stats)

    Returns
        stats (dict): mal watch-status counts plus a "score_votes" dict
            mapping score (1-10) to number of votes (None if unavailable)
    '''
    soup = utility.get_soup(url)
    stats = {}
    status_labels = (
        "Watching:", "Completed:", "On-Hold:",
        "Dropped:", "Plan to Watch:", "Total:"
    )
    # watch-status counts live in "spaceit_pad" divs as "<span>Label:</span> 1,234"
    for div in soup.find_all("div", class_="spaceit_pad"):
        span = div.find('span')
        if not span or span.text.strip() not in status_labels:
            continue
        label = span.text.replace(":", "").strip().lower()
        # drop the label span so only the numeric text remains in the div
        span.decompose()
        stats[label] = int(div.text.replace(",", ""))

    score_table = soup.find("table", class_="score-stats")
    if score_table:
        votes_by_score = {}
        for row in score_table.find_all("tr"):
            score = int(row.find("td", class_="score-label").text)
            # sub non-digits with "" before converting the vote count
            votes_by_score[score] = int(re.sub(r'\D', '', row.find("small").text))
        stats["score_votes"] = votes_by_score
    else:
        stats["score_votes"] = None
    return stats
def get_character_info(url):
    ''' Gets character info from a mal character url.

    NOTE(review): this redefinition shadows the earlier ``get_character_info``
    in this module.

    Parameters:
        url (string): mal character url (https://myanimelist.net/character/...)

    Returns
        info (dict): character information (url, mal_id, nicknames, names,
            member_faves, and birthdate/height when present on the page)
    '''
    # TODO: get picture
    info = {}
    soup = utility.get_soup(url)

    ### METADATA
    # FIX: guard the og:url meta tag (original indexed ['content'] unguarded);
    # fall back to the requested url, matching the other scrapers in this file
    url_meta = soup.find("meta", property="og:url")
    info['url'] = url_meta['content'] if url_meta else url
    # mal character id is the path segment right after "/character/"
    info["mal_id"] = url[url.find("/character/") + 1:].split("/")[1]

    # nicknames appear quoted inside the <h1> page title
    nicknames = []
    name_h1 = soup.find('h1', class_="title-name")
    if name_h1:
        title_text = name_h1.find('strong').text
        if title_text:
            nicknames = re.findall(r'"([^"]*)"', title_text)
    info['nicknames'] = nicknames

    ### INFO PANEL (left-side of page)
    info_panel = soup.find(id='content')
    info_panel_text = info_panel.text.lower()
    # get member faves: take the rest of the "member favorites: N" line
    partial = info_panel_text[info_panel_text.find('member favorites'):]
    member_faves = partial.split('\n')[0].split(':')[1]
    info['member_faves'] = int(re.sub(r'\D', '', member_faves))  # replace non-digits with ''

    ### MAIN CONTENTS
    # name header (h2) holds "<english name> (<japanese name>)"
    name_h2 = soup.find('h2', class_='normal_header')
    small_tag = name_h2.find('small')
    # FIX: original crashed with AttributeError when the japanese-name
    # <small> tag was missing
    info['jap_name'] = re.sub('[()]', '', small_tag.text) if small_tag else None
    name_h2 = utility.remove_children(name_h2)
    info['eng_name'] = name_h2.text.strip()

    # the free-form body after the name header holds "Label: value" lines
    # (birthday, height, ...); the first line is a bare sibling string
    first_line_only = name_h2.nextSibling
    body_text = first_line_only.nextSibling
    # FIX: build a fresh list instead of insert()-ing into body_text.contents,
    # which mutated the live parse tree
    candidates = [first_line_only] + list(body_text.contents)
    # only keep elements that are NavigableString, not just return/new lines
    # and contain ":"
    detail_lines = [
        e for e in candidates
        if type(e) is NavigableString and len(e.strip()) > 0 and ":" in e
    ]
    for line in detail_lines:
        # FIX: split only on the first ':' so values containing ':' are kept
        # whole (original took parts[1] and dropped the remainder)
        label, detail = line.split(':', 1)
        label = label.strip().lower()
        detail = detail.strip().lower()
        if 'birth' in label:
            info['birthdate'] = detail
        if 'height' in label:
            info['height'] = detail
    return info
def get_anime_info(url, full=False):
    ''' Gets anime information from mal anime url.

    Parameters:
        url [string]: mal anime url (https://myanimelist.net/anime/...)
        full [bool] [default=False]: indicate whether to get additional
            information (episodes, mal statistics, characters, staff)

    Returns
        info [dict]: mal anime information
    '''
    # TODO change full option to dict option, as add on (so check if is boolean or dict)
    soup = utility.get_soup(url)
    info = {}

    ### BASIC ANIME INFO
    ### META INFO -START
    # mal id is the path segment right after "/anime/"
    mal_id = url[url.find('/anime/') + 1:].split('/')[1]
    info['mal_id'] = mal_id

    title = soup.find('meta', property='og:title')
    if title:
        # BUG FIX: original did "title['url'] = title['content'].strip()",
        # writing the title onto the meta tag object instead of the result dict
        info['title'] = title['content'].strip()

    url_tag = soup.find('meta', property="og:url")
    if url_tag:
        info['url'] = url_tag['content']
    else:
        info['url'] = url

    synopsis_tag = soup.find('meta', property="og:description")
    if synopsis_tag:
        synopsis = synopsis_tag['content']
        # removes default last line
        synopsis = synopsis.replace("[Written by MAL Rewrite]", "")
        info['synopsis'] = synopsis.strip()
    else:
        info['synopsis'] = ""
    ### META INFO -END

    ### WEBPAGE INFO -START
    # CONSIDER skipping iteration as needed - test timing
    # dark_text spans are what comes before the value (e.g.
    # Score: xxx <-- score text in dark_text span)
    dark_text_tags = soup.find_all("span", {"class": "dark_text"})
    for tag in dark_text_tags:
        # SECTIONS: (alternative titles) english, synonyms, japanese
        # (information) type, episodes
        section_name = tag.text.lower()
        if section_name[-1] == ":":
            section_name = section_name[:-1]
        span_parent = tag.parent
        values = []
        spans = span_parent.find_all('span')
        links = span_parent.find_all('a')
        if spans:
            # first span is the label itself; the rest hold values
            for span in spans[1:]:
                values.append(span.text.strip())
        if links:
            for link in links:
                values.append(link.text.strip())
        if len(values) < 1:
            # no nested tags: the value is the parent's bare text
            span_parent = utility.remove_children(span_parent)
            values.append(span_parent.text.strip())
        values = list(set(values))
        if len(values) < 2:
            values = values[0]
            if section_name == 'synonyms':
                values = values.split(',')
            if section_name in ["episodes", "popularity", "members", "favorites"]:
                values = re.sub(r'[#,]', '', values)
                try:
                    values = int(values)
                except ValueError:
                    if values.lower() == "unknown":
                        values = None
        if section_name == "score":
            score = {}
            for v in values:
                try:
                    # WARNING: might cause issues if less than 10 people scored the anime
                    if float(v) <= 10:
                        score['score'] = float(v)
                    else:
                        score['scored_by'] = int(v)
                except ValueError:
                    continue
            values = score
        elif section_name == "ranked":
            top_anime_rank = tag.nextSibling
            if top_anime_rank.strip().lower() == "n/a":
                values = None
            else:
                # change rank string to int (e.g. #7 to just 7)
                values = int(re.sub(r'\D', '', top_anime_rank))
        info[section_name] = values
    ### WEBPAGE INFO -END

    ### RELATED ANIME -START
    related_anime_table = soup.find('table', {'class': 'anime_detail_related_anime'})
    if related_anime_table:
        related_anime_info = []
        for row in related_anime_table.find_all('tr'):
            cells = row.find_all('td')
            # regex matches not word and not white space (for stuff like "alternate setting")
            related_how = re.sub(r'[^\w\s]', '', cells[0].text).lower()
            related_link = '{}{}'.format('https://myanimelist.net',
                                         cells[1].find('a')['href'])
            # TODO add mal link type (e.g. manga, animes)
            related_anime_info.append({
                'related_type': related_how,
                'link': related_link,
                'title': cells[1].text
            })
        info['related'] = related_anime_info
    else:
        info['related'] = None
    ### RELATED ANIME -END

    if full:
        # get anime episodes (best-effort: not every entry has an episode page)
        try:
            info["episode_info"] = get_anime_episodes("{}/episode".format(info['url']))
        except Bs4Error:
            info["episode_info"] = []
        # get anime stats
        info["stats"] = get_mal_stats("{}/stats".format(info['url']))
        # get anime characters
        info["characters"] = get_anime_characters("{}/characters".format(info['url']))
        # get anime staff -- MAL lists staff on the /characters page,
        # so this url is intentional
        info["staff"] = get_anime_staff("{}/characters".format(info['url']))

    ### POST-PROC BEFORE RETURN - START
    # if no english title in left side bar, use meta tag
    if not 'english' in info:
        info['english'] = soup.find('meta', property="og:title")['content']
    # trim leading and trailing white spaces from synonyms
    if not 'synonyms' in info:
        info['synonyms'] = []
    elif len(info['synonyms']) > 0:
        info['synonyms'] = [s.strip() for s in info['synonyms']]
    ### POST-PROC BEFORE RETURN - END

    # add timestamp
    info['retrieved_on'] = timestamp

    # normalize placeholder values: unknown/empty strings, empty lists and
    # empty dicts all become None
    for k, v in info.items():
        if (type(v) is str and v.strip().lower() in [
                'unknown', 'n/a', 'none', 'add some', 'na'
        ]) or (type(v) is list and len(v) < 1) or (type(v) is dict and not v):
            info[k] = None
    return info