示例#1
0
def wrap_picture(
    soup: bs4.BeautifulSoup,
    img_tag: bs4.element.Tag,
    image: Image,
    domain: Url,
) -> None:
    """Wrap ``img_tag`` in a ``<picture>`` element with a WebP ``<source>``.

    The image's ``class`` value is moved onto the new ``<picture>`` wrapper and
    a ``<source type="image/webp">`` element is inserted just before the image,
    reusing the image's ``sizes`` attribute (empty string when absent).

    :param soup: document used to create the new tags
    :param img_tag: image element to wrap; modified in place
    :param image: supplies ``path`` and ``thumbnail_widths`` for the srcset
    :param domain: forwarded to ``write_srcset``
    """
    wrapper = soup.new_tag('picture')
    webp_source = soup.new_tag('source')

    # Hand the image's classes over to the wrapper and blank them on the image.
    wrapper['class'] = img_tag.get('class') or ""
    img_tag['class'] = []

    img_tag.wrap(wrapper)
    img_tag.insert_before(webp_source)

    webp_srcset = write_srcset(
        domain,
        image.path.with_suffix('.webp'),
        image.thumbnail_widths,
    )
    webp_source.attrs.update({  # type: ignore[attr-defined]
        'type': 'image/webp',
        'srcset': webp_srcset,
        'sizes': img_tag.get('sizes', ''),  # type: ignore[dict-item]
    })
    def _parse_one_person(self, person: bs4.element.Tag, _stage: str,
                          i: int) -> Dict:
        """Build the record for a single player entry.

        :param person: tag for one player; read for its ``href``, its first
            ``img`` and its ``div`` children styled with
            ``self._style_font_xs``
        :param _stage: stage name; "Duo Showdown" and "Showdown" get a group
            number, any other stage gets an MVP flag instead
        :param i: position of the player, used to derive the group number
        :return: dict describing the player
        """
        cells = person.find_all("div", attrs={"style": self._style_font_xs})
        # Exactly four text cells are unpacked; the second one is not used.
        trophy_text, _, level_text, player_name = [
            cell.text.strip() for cell in cells
        ]

        res = {
            "trophy": int(trophy_text),
            "level": int(level_text),
            "name": player_name,
            "hero": self.hero_map[person.find("img").get("src")],
            "playerId": person.get("href").split("/")[-1],
            "isTeammate": False,
        }

        if _stage in ("Duo Showdown", "Showdown"):
            # Duo entries are paired up, so two consecutive players share a group.
            res["group"] = i // 2 if _stage == "Duo Showdown" else i
            res["is_mvp"] = np.nan
        else:
            res["group"] = np.nan
            mvp_badge = person.find("img", attrs={"src": self._img_mvp})
            res["is_mvp"] = mvp_badge is not None
        return res
示例#3
0
 def link_to_folder(link: bs4.element.Tag) -> str:
     """Return the single relative path segment a link points to, or "".

     An empty string is returned when the ``href`` carries a scheme or a
     network location, or when its normalized path contains a slash or is
     "." or "..".
     """
     parsed: ParseResult = urlparse(link.get("href", default=""))
     if parsed.scheme or parsed.netloc:
         return ""
     folder: str = posixpath.normpath(parsed.path)
     if "/" in folder or folder in (".", ".."):
         return ""
     return folder
示例#4
0
        def stylesheet_filter_func(tag: bs4.element.Tag) -> bool:
            """
            Accept only tags whose ``rel`` contains "stylesheet" and that
            carry no ``disabled`` attribute
            """

            if not tag.has_attr("rel"):
                return False
            if "stylesheet" not in tag.get("rel"):
                return False
            return not tag.has_attr("disabled")
示例#5
0
def get_seclist_for_el(el: bs4.element.Tag, ref_map: Dict, default_seclist: List) -> List[Tuple]:
    """
    Build sec_list for tag
    :param el: element to resolve; text nodes carry no attributes and always
        get the default
    :param ref_map: forwarded to build_section_list
    :param default_seclist: returned when the element has no 's2orc_id'
    :return: the section list built from the element's 's2orc_id', or
        default_seclist
    """
    # isinstance rather than an exact type comparison: NavigableString
    # subclasses (comments, CDATA, ...) have no .get() either and would
    # otherwise raise AttributeError below.
    if isinstance(el, NavigableString):
        return default_seclist
    sec_id = el.get('s2orc_id', None)
    if sec_id:
        return build_section_list(sec_id, ref_map)
    return default_seclist
def extract_info_from_post(post: bs4.element.Tag) -> dict:
    """
    Collect the link, id and timestamps of a single post

    :type post: bs4.element.Tag
    :param post: a HTML Tag representing a single post; its title link
        (``a.result-title.hdrlnk``) and ``time.result-date`` tags are read

    :rtype: dict
    :return: dict with the keys 'href', 'data_id', 'posted_at' and 'repost_of'
    """
    title_link = post.find('a', class_='result-title hdrlnk')
    timestamp = post.find('time', class_='result-date')

    return {
        'href': title_link.get('href'),
        'data_id': title_link.get('data-id'),
        'posted_at': timestamp.get('datetime'),
        'repost_of': post.get('data-repost-of')
    }
示例#7
0
    def dblp_contrib_single(
            self,
            elem: bs4.element.Tag) -> fatcat_openapi_client.ReleaseContrib:
        """
        Convert one author element into a ReleaseContrib.

        A trailing all-digit token in the name is dropped. When the element
        has an ``orcid`` attribute it is cleaned and looked up; if no creator
        is found for it, the ORCID is kept in ``extra`` instead.

        In the future, might try to implement creator key-ificiation and
        lookup here.

        Example rows:

            <author>Michael H. B&ouml;hlen</author>
            <author orcid="0000-0002-4354-9138">Nicolas Heist</author>
            <author orcid="0000-0001-9108-4278">Jens Lehmann 0001</author>
        """

        creator_id = None
        extra = None
        raw_name = clean_str(elem.text)

        # strip the numeric suffix from the author name, if there is one
        if raw_name:
            name_parts = raw_name.split()
            if name_parts[-1].isdigit():
                raw_name = " ".join(name_parts[:-1])

        orcid_val = elem.get("orcid")
        if orcid_val:
            # the attribute may come back as a list; only its first item is used
            first_val = orcid_val[0] if isinstance(orcid_val, list) else orcid_val
            orcid = clean_orcid(first_val)
            if orcid:
                creator_id = self.lookup_orcid(orcid)
                if not creator_id:
                    extra = {"orcid": orcid}

        return fatcat_openapi_client.ReleaseContrib(
            raw_name=raw_name,
            creator_id=creator_id,
            extra=extra,
        )
示例#8
0
def _section_ref_id(raw_id: str) -> str:
    """
    Translate an element id into its section reference id
    ('cid' -> 'SECREF', 'uid' -> 'SECREFU'); any other id is rejected.
    """
    if 'cid' in raw_id:
        return raw_id.replace('cid', 'SECREF')
    if 'uid' in raw_id:
        return raw_id.replace('uid', 'SECREFU')
    print('Unknown ID type!', raw_id)
    raise NotImplementedError


def get_sections_from_div(el: bs4.element.Tag, sp: BeautifulSoup,
                          parent: Optional[str], faux_max: int) -> Dict:
    """
    Process section headers for one div
    :param el: div to process; it and its identified children are tagged
        in place with an 's2orc_id' attribute
    :param sp: soup object; only forwarded to the recursive calls
    :param parent: reference id recorded as parent when this div has none
        of its own
    :param faux_max: number used to build a synthetic 'SECREF<n>' id for
        divs marked rend="nonumber"
    :return: dict mapping each reference id to its
        {"num", "text", "ref_id", "parent"} entry
    :raises NotImplementedError: when an id contains neither 'cid' nor 'uid'
    """
    sec_map_dict = dict()
    el_ref_id = None

    # process divs with ids
    el_id = el.get('id', None)
    if el_id:
        el_ref_id = _section_ref_id(el_id)
        el['s2orc_id'] = el_ref_id
        sec_map_dict[el_ref_id] = {
            "num": el.get('id-text', None),
            "text": get_section_name(el),
            "ref_id": el_ref_id,
            "parent": parent
        }
    # process divs without section numbers
    elif el.get('rend') == "nonumber":
        el_ref_id = f'SECREF{faux_max}'
        el['s2orc_id'] = el_ref_id
        sec_map_dict[el_ref_id] = {
            "num": None,
            "text": get_section_name(el),
            "ref_id": el_ref_id,
            "parent": parent
        }

    # children hang off this div when it has an id, else off its parent
    child_parent = el_ref_id if el_ref_id else parent

    # process sub elements
    for sub_el in el.find_all(recursive=False):
        if sub_el.name.startswith('div'):
            # add any unspecified keys
            # NOTE: strip('SECREF') removes those *characters* from both
            # ends (it is not a prefix strip); purely numeric remainders are
            # the ids that count towards the next synthetic number.
            sec_keys = [
                int(k.strip('SECREF')) for k in sec_map_dict.keys()
                if k and k.strip('SECREF').isdigit()
            ]
            faux_max = max(sec_keys + [faux_max]) + 1
            sec_map_dict.update(
                get_sections_from_div(sub_el, sp, child_parent, faux_max))
        elif sub_el.name == 'p' or sub_el.name == 'proof':
            sub_id = sub_el.get('id', None)
            if sub_id:
                # Fall back to the <hi> child's number only when the element
                # has none itself, and only if there is a <hi> child at all.
                sec_num = sub_el.get('id-text', None)
                if sec_num is None and sub_el.hi is not None:
                    sec_num = sub_el.hi.get('id-text', None)
                sub_el_ref_id = _section_ref_id(sub_id)
                sub_el['s2orc_id'] = sub_el_ref_id
                # Key the entry by its own ref id so that it neither
                # overwrites the enclosing div's entry nor lands under None.
                sec_map_dict[sub_el_ref_id] = {
                    "num":
                    sec_num,
                    "text":
                    sub_el.head.text
                    if sub_el.head else sub_el.hi.text if sub_el.hi else "",
                    "ref_id":
                    sub_el_ref_id,
                    "parent":
                    child_parent
                }
    return sec_map_dict
示例#9
0
def _parse_link(el: bs4.element.Tag) -> Link:
    """Build a Link from the element's text and its ``onclick`` attribute.

    ``onclick`` is ``None`` when the attribute is absent; ``href`` is always
    ``None``.
    """
    # Tag.get already yields None for a missing attribute.
    handler = el.get('onclick')
    return Link(text=el.text, href=None, onclick=handler)