Example #1
def replace_cell_with_select(cell: Tag, names: list, values: list, attrs=None):
    """
    Replaces the contents of a table cell with a select element with options
    using the given list of names and values.

    If the string content already in the cell matches one of the values given,
    then that option will be marked as selected.
    attrs is a dictionary of additional attributes that will be applied to the
    select element.
    If the cell is marked as 'contenteditable' or has a class 'editable',
    those attributes will be removed.
    """
    soup = page_builder.soup_from_text("<select></select>")
    # soup.select is bs4's CSS-selector method, so fetch the element with find() instead
    select = soup.find('select')
    for key, value in (attrs or {}).items():
        select[key] = value

    string = cell.string
    options = build_option_list(names,
                                value_accessor=(lambda n, i: values[i]),
                                selector=(lambda n, i, v: v == string))

    cell.string = ''
    if 'contenteditable' in cell.attrs:
        del cell.attrs['contenteditable']
    # bs4 stores class as a list of values, so test membership rather than equality
    if 'editable' in cell.attrs.get('class', []):
        del cell.attrs['class']

    select.append(options)
    cell.append(select)
Example #2
def form_service_gen_multiple_values(_id, params, service) -> Union[Tag, str]:
    script = Tag(name="script", attrs={"defer": True})
    values = []
    for env in sorted(
            service,
            reverse=True,
            key=lambda x: int(re.search(r"\d+$", x).group())
            if x[-1].isdigit() else 0,
    ):
        for param, param_value in params.items():
            if env.startswith(param):
                suffix = env.replace(param, "")
                values.append({
                    "default": service.get(f"{param}{suffix}", param_value["default"]),
                    "env": param,
                    "help": param_value["help"],
                    "id": param_value["id"],
                    "label": param_value["label"],
                    "selects": param_value.get("selects", []),
                    "type": param_value["type"],
                })

        if len(values) >= len(params):
            script.append(f"addMultiple('{_id}', '{json.dumps(values)}');")
            values = []

    # Tag.children is a generator and therefore always truthy; test contents instead
    return script if script.contents else ""
Example #3
    def megakills_to_html(self, megakills):

        if megakills is None:
            return None
        columns = megakills.columns

        soup = BeautifulSoup("", "html.parser")
        table = Tag(soup, name="table")
        table["class"] = "blueTable"
        table["id"] = "divmegakills"
        soup.append(table)
        tr = Tag(soup, name="tr")
        table.append(tr)

        for col in columns:
            th = Tag(soup, name="th")
            tr.append(th)
            th.append(col)
        for index, row in megakills.iterrows():
            tr = Tag(soup, name="tr")
            for col in columns:
                td = Tag(soup, name='td')
                td.insert(1, str(row[col]))
                tr.append(td)
            # append the completed row once, not once per column
            table.append(tr)
        return soup
Example #4
def copy(element):
    """
	beautifulsoup4 객체의 요소를 복제합니다.
	
	오직 해당 요소의 정보만 복사합니다. 자식들에 대한 정보는 원본과 공유됩니다.
	
	.. bugs::
		work around bug where there is no builder set https://bugs.launchpad.net/beautifulsoup/+bug/1307471.
	
	.. 이 함수 작성에 다음 문서를 참조하였음.
		http://stackoverflow.com/questions/23057631/clone-element-with-beautifulsoup
	
	:param element: 복제 할 원소.
	:type element: bs4.Tag, bs4.NavigableString, bs4.Comment
	:return: 복제된 원소.
	"""
    if isinstance(element, (NavigableString, Comment)):
        return type(element)(element)

    clone_element = Tag(None, element.builder, element.name, element.namespace,
                        element.nsprefix)
    clone_element.attrs = dict(element.attrs)
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(clone_element, attr, getattr(element, attr))
    for child in element.contents:
        clone_element.append(child)
    return clone_element
Example #5
    def wrap_rawtext(cls, element):
        """Wrap each run of bare text nodes in the element in its own <p> tag."""
        if isinstance(element, NavigableString):
            return

        groups = []
        group = []

        for c in element.children:

            if isinstance(c, NavigableString):
                group.append(c)

            if isinstance(c, Tag):
                groups.append(group)
                group = []

        if len(group) > 0:
            groups.append(group)

        for g in groups:

            if len(g) == 0:
                continue

            par = Tag(name="p")
            g[0].wrap(par)
            for i in range(1, len(g)):
                par.append(g[i])
Example #6
def get_markdown_page_index_objects(content: Tag, url: str, page_path: str,
                                    title: str, page_type: str,
                                    page_views: int) -> List[Dict]:
    headers = ['h1', 'h2', 'h3']
    index_objects = []
    children = [
        element for element in content.children if isinstance(element, Tag)
    ]
    if children[0].name not in headers:
        return get_page_index_objects(content, url, page_path, title,
                                      page_type, page_views)
    block_title = ""
    content = []
    url_with_href = ""
    for child in children:
        if child.name in headers:
            if block_title != '':
                for ind, page_part in enumerate(get_valuable_content(content)):
                    page_info = {
                        'url': url_with_href,
                        'objectID': url_with_href + str(ind),
                        'content': page_part,
                        'headings': block_title,
                        'pageTitle': title,
                        'type': page_type,
                        'pageViews': page_views
                    }
                    index_objects.append(page_info)
            url_with_href = url + '#' + child.get('id')
            block_title = child.text
            content = []
        else:
            content.append(child)
    # flush the final block, which would otherwise never be indexed
    if block_title != '':
        for ind, page_part in enumerate(get_valuable_content(content)):
            index_objects.append({
                'url': url_with_href,
                'objectID': url_with_href + str(ind),
                'content': page_part,
                'headings': block_title,
                'pageTitle': title,
                'type': page_type,
                'pageViews': page_views
            })
    return index_objects
Example #7
def add_mathjax_call(soup):
    head = soup.find('head')
    if not head:
        msg = 'Could not find <head>'
        raise_desc(ValueError, msg, s=str(soup))

    src = 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-MML-AM_CHTML'

    config = r"""
 MathJax.Hub.Config({
    extensions: ["tex2jax.js"],
    jax: ["input/TeX", "output/HTML-CSS"],
    tex2jax: {
      inlineMath: [ ['$','$'], ],
      displayMath: [ ['$$','$$'], ["\\[","\\]"] ],
      processEscapes: true
    },
    "HTML-CSS": { availableFonts: ["TeX"] }
  });
    """
    script = Tag(name='script')
    script['type'] = 'text/x-mathjax-config'
    script.append(config)
    head.append(script)

    script = Tag(name='script')
    script.attrs['src'] = src

    head.append(script)
Example #8
def insert_html(content):
    soup = BeautifulSoup("", "lxml")
    wrap = Tag(soup, name="p")
    wrap["class"] = "text"
    wrap.append(BeautifulSoup(content, 'html.parser'))
    soup.append(wrap)
    return soup
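A quick usage sketch (assuming bs4 with the lxml parser installed; output shown is approximate):

snippet = insert_html("<b>Hello</b> world")
print(snippet)  # -> <p class="text"><b>Hello</b> world</p>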
Example #9
def clone_beautiful_soup_tag(elements):
    """
	:type element: Tag
	:rtype: Tag
	"""
    if elements is None:
        raise ElementTypeError('elements is None!')

    if isinstance(elements, (Tag, NavigableString, BeautifulSoup)):
        element = elements
        if isinstance(element, NavigableString):
            return type(element)(element)

        copy = Tag(None, element.builder, element.name, element.namespace,
                   element.nsprefix)

        # work around bug where there is no builder set
        # https://bugs.launchpad.net/beautifulsoup/+bug/1307471
        copy.attrs = dict(element.attrs)
        for attr in ('can_be_empty_element', 'hidden'):
            setattr(copy, attr, getattr(element, attr))
        for child in element.contents:
            copy.append(clone_beautiful_soup_tag(child))
        return copy
    else:
        return [clone_beautiful_soup_tag(x) for x in elements]
Example #10
 def construct_xml(self):
     soup = BeautifulSoup(etree.tostring(etree.Element('OTA_AirLowFareSearchRQ')), 'xml')
     query = soup.contents[0]
     query.attrs = {
         'xmlns':'http://www.opentravel.org/OTA/2003/05',
         'xmlns:xsi':'http://www.w3.org/2001/XMLSchema-instance',
         'PrimaryLangId':'en',
         'Version':'2.001',
         'TimeStamp':str(datetime.datetime.now().isoformat()),
         'EchoToken':str(time.mktime(time.gmtime())),
         'xsi:schemaLocation':'http://www.opentravel.org/2006A/OTA_AirLowFareSearchRQ.xsd',
     }
     
     t_pos = Tag(name='POS')
     t_source = Tag(name='Source')
     t_req = Tag(name='RequestorID')
     t_req.attrs = {
         'ID':'weathersick',
         'URL':'http://www.weathersick.com',
         'Type':'18',
     }
     t_source.append(t_req)
     t_pos.append(t_source)
     query.append(t_pos)
     
     t_odinf = Tag(name='OriginDestinationInformation')
     t_odinf.attrs = {'RPH': '1'}
     t_deptime = Tag(name='DepartureDateTime')
     # the original snippet trails off here; a minimal plausible completion,
     # since DepartureDateTime nests inside OriginDestinationInformation in the OTA schema:
     t_odinf.append(t_deptime)
     query.append(t_odinf)
Example #11
def insert_header(text, size):
    soup = BeautifulSoup("", "lxml")
    header1 = Tag(soup, name="h" + str(size))
    header1["class"] = "header"
    header1.append(text)
    soup.append(header1)
    return soup
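Usage mirrors insert_html above; the size argument picks the heading level:

print(insert_header("Results", 2))  # -> <h2 class="header">Results</h2>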
Example #12
def deepcopy(element):
    """
	beautifulsoup4 객체의 요소를 재귀적으로 복제합니다.
	
	요소가 가진 정보 및 자식의 모든 정보를 재귀적으로 복제합니다.
	
	이 함수를 통하여, beautifulsoup4 4.0.2에서 append 함수 사용시 발생하는 DOM 깨짐현상(기존에 존재하던 태그에 접근 불가능해지거나, 각 메소드들 별로 원래 정상적으로 접근하게될 결과의 일부분만 얻게 되는 현상)을 피할 수 있습니다.
	
	.. 이 함수 작성에 다음 문서를 참조하였음.
		http://stackoverflow.com/questions/23057631/clone-element-with-beautifulsoup
	
	:param element: 복제 할 원소.
	:type element: bs4.Tag, bs4.NavigableString, bs4.Comment
	:return: 복제된 원소.
	"""
    if isinstance(element, (NavigableString, Comment)):
        return type(element)(element)

    clone_element = Tag(None, element.builder, element.name, element.namespace,
                        element.nsprefix)
    # work around bug where there is no builder set
    # https://bugs.launchpad.net/beautifulsoup/+bug/1307471
    clone_element.attrs = dict(element.attrs)
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(clone_element, attr, getattr(element, attr))
    for child in element.contents:
        clone_element.append(deepcopy(child))
    return clone_element
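A minimal usage sketch of the deepcopy above (recent bs4 releases also support copy.copy() on tags natively):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div id="a"><b>hi</b></div>', 'html.parser')
duplicate = deepcopy(soup.div)
duplicate['id'] = 'b'      # mutating the copy...
print(soup.div['id'])      # 'a' -- ...leaves the original untouched
print(duplicate.b.string)  # 'hi' -- children were copied recursively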
Example #13
    def renames_to_html(self, table_renames):
        renamesdf = table_renames[0]
        columns = renamesdf.columns
        soup = BeautifulSoup("", "html.parser")

        table = Tag(soup, name="table")
        table["class"] = "blueTable"
        table["id"] = "divrenames"

        tr = Tag(soup, name="tr")
        table.append(tr)

        for col in columns:
            th = Tag(soup, name="th")
            tr.append(th)
            th.append(col)
        for index, row in renamesdf.iterrows():
            tr = Tag(soup, name="tr")
            for col in columns:
                td = Tag(soup, name='td')
                td.insert(1, str(row[col]))
                tr.append(td)
            table.append(tr)

        soup.append(table)
        return soup
Example #14
 def medal_html_string(self, color, count):
     soup = BeautifulSoup("", "html.parser")
     if color == "g":
         medal = self.gold_medal_emoji_html
     elif color == "s":
         medal = self.silver_medal_emoji_html
     elif color == "b":
         medal = self.bronze_medal_emoji_html
     elif color == "p":
         medal = self.poop_emoji_html
     elif color == "q":
         medal = self.cup_emoji_html
     elif color == "a":
         medal = self.amphora_emoji_html
     elif color == "l":
         medal = self.leaf_emoji_html
     elif color == "d":
         medal = self.diamond_emoji_html
     elif color == "f":
         medal = self.silverware_emoji_html
     elif color == "r":
         medal = self.springfling_emoji_html
     elif color == "t":
         medal = self.trident_emoji_html
     for i in range(0, count):
         medal_span = Tag(soup, name='span')
         medal_span["style"] = "font-size:10px;"
         medal_html = BeautifulSoup(medal, 'html.parser')
         medal_span.append(medal_html)
         soup.append(medal_span)
     return soup
Example #15
    def feuds_to_html(self, top_feuds):
        feuds = top_feuds[0]
        columns = top_feuds[1]

        soup = BeautifulSoup("", "html.parser")
        table = Tag(soup, name="table")
        table["class"] = "blueTable"
        table["id"] = "divfeuds"
        soup.append(table)
        tr = Tag(soup, name="tr")
        table.append(tr)

        for col in columns:
            th = Tag(soup, name="th")
            tr.append(th)
            th.append(col)
        for index, row in feuds.iterrows():
            tr = Tag(soup, name="tr")
            for col in feuds.columns:
                td = Tag(soup, name="td")
                td.insert(1, str(row[col]))
                tr.append(td)
            # append the completed row once, not once per column
            table.append(tr)
        return soup
Example #16
def merge_row_elements(element: Tag) -> None:
    """
    If an element is an 'mrow' produced by KaTeX, its children are probably needlessly fragmented. For
    instance, the word 'true' will contain four '<mi>' elements, one for 't', 'r', 'u', and 'e'
    each. Merge such elements into single elements.
    """

    if element.name != "mrow":
        return

    elements = [e for e in element.children if isinstance(e, Tag)]
    merger = MathMlElementMerger()
    merged = merger.merge(elements)

    # If the 'mrow' only contains one element after its children are merged, simplify the
    # MathML tree by replacing this node with its merged child. Preserve the start and end
    # position of the row element if it is specified, because this often means that a styling
    # macro was applied to the children, and the start and end positions of the row include
    # the control sequence and braces for the styling macro.
    if len(merged) == 1:
        start = element.attrs.get("s2:start")
        end = element.attrs.get("s2:end")
        if start and end:
            merged[0].attrs["s2:start"] = start
            merged[0].attrs["s2:end"] = end
        if "s2:style-start" in element.attrs and "s2:style-end" in element.attrs:
            merged[0].attrs["s2:style-start"] = element.attrs["s2:style-start"]
            merged[0].attrs["s2:style-end"] = element.attrs["s2:style-end"]
        element.replace_with(merged[0])
    else:
        for e in elements:
            e.extract()
        for m in merged:
            element.append(m)
Example #17
 def insert_link(self, url, link_text):
     # <a href="url">link text</a>
     soup = BeautifulSoup("", "html.parser")
     link = Tag(soup, name="a")
     link["href"] = url
     link.append(link_text)
     soup.append(link)
     return soup
Example #18
    def insert_text(self, content):
        soup = BeautifulSoup("", "html.parser")
        text = Tag(soup, name="p")
        text["class"] = "text"

        text.append(content)
        soup.append(text)
        return soup
Example #19
 def as_tag(self) -> Tag:
     tag = Tag(name='event',
               attrs={
                   'start': 'T' + str(self.start),
                   'end': 'T' + str(self.end)
               })
     tag.append(self.value)
     return tag
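The same Tag-building pattern works standalone; a small sketch with placeholder start/end values:

from bs4 import Tag

tag = Tag(name='event', attrs={'start': 'T0.0', 'end': 'T1.5'})
tag.append('hello')
print(tag)  # an <event> element with start/end attributes wrapping 'hello'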
Example #20
def heading2table(soup, table, row):
    """add heading row to table"""
    tr = Tag(soup, name="tr")
    table.append(tr)
    for attr in row:
        th = Tag(soup, name="th")
        tr.append(th)
        th.append(attr)
Example #21
 def tag(self):
     tt = Tag(name='table')
     for r in self.cells:
         rt = Tag(name='tr')
         for c in r:
             rt.append(c.tag())
         tt.append(rt)
     return tt
Example #23
def row2table(soup, table, row):
    """ad a row to the table"""
    tr = Tag(soup, name="tr")
    table.append(tr)
    for attr in row:
        td = Tag(soup, name="td")
        tr.append(td)
        td.append(attr)
Example #24
 def tag(self):
     tt = Tag(name='table')
     for r in self.cells:
         rt = Tag(name='tr')
         for c in r:
             rt.append(c.tag())
         tt.append(rt)
     return tt
Example #26
def nest_tags(names):
    current = Tag(name=names[0])
    root = current
    for i in range(1, len(names)):
        new_tag = Tag(name=names[i])
        current.append(new_tag)
        current = new_tag
    return root
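For example, a quick check of the nesting produced by nest_tags:

chain = nest_tags(['html', 'body', 'div'])
print(chain)  # -> <html><body><div></div></body></html>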
Example #27
def insert_text(content):
    soup = BeautifulSoup("", "lxml")
    text = Tag(soup, name="p")
    text["class"] = "text"

    text.append(content)
    soup.append(text)
    return soup
Example #28
 def initialize_framework(self, head: Tag, tags: List[Tag]):
     """
     Applies the header tags to the head
     :param head:
     :param tags:
     :return:
     """
     for tag in tags:
         head.append(tag)
Example #29
 def insert_toggle(self, toggle_div):
     soup = BeautifulSoup("", "html.parser")
     link = Tag(soup, name="a")
     link["href"] = "#"
     link["id"] = toggle_div
     link["class"] = "text"
     link.append("+/-")
     soup.append(link)
     return soup
Example #30
def _add_element(soup, element, soup_listing):
    tag = Tag(parser=soup,
              name=element.tag_name,
              namespace=_gumbo.TAG_NAMESPACES[element.tag_namespace],
              attrs=_convert_attrs(element.attributes))
    soup_listing.append(tag)
    for child in element.children:
        tag.append(_add_node(soup, child, soup_listing))
    tag.offset = element.offset
    return tag
Example #31
def replace_dot_code(dot_code: Tag) -> None:
    svg = BeautifulSoup(dot(dot_code.text), 'xml').svg
    assert 'style' not in svg.attrs
    svg.attrs['style'] = (
        f'max-width: 100%; '
        f'width: {svg.attrs.pop("width")}; '
        f'height: {svg.attrs.pop("height")};'
    )
    dot_code.clear()
    dot_code.append(svg)
Example #32
 def tag(self):
     t = Tag(name='td')
     if self.borders:
         t['class'] = self.borders
     if self.back is not None:
         t['style'] = 'background-color: #%06x;' % self.back
     for x in self.texts:
         t.append(x.text_tag())
     for x in self.texts:
         t.append(x.div_tag())
     return t
Example #33
 def tag(self):
     t = Tag(name='td')
     if self.borders:
         t['class'] = self.borders
     if self.back is not None:
         t['style'] = 'background-color: #%06x;' % self.back
     for x in self.texts:
         t.append(x.text_tag())
     for x in self.texts:
         t.append(x.div_tag())
     return t
Example #34
def merge(roots):
    if is_atom(roots[0]):
        atom = Tag(name=roots[0].name)
        for child in roots[0].children:
            atom.append(copy.copy(child))
        atom['data-ver'] = merge_versions(roots)
        # if atom.name == 'tr': print('TR', roots)
        atom['class'] = 'atom-wrapper'
        return atom

    # print('name', roots[0].name)

    tree_children = [flatten_children(root) for root in roots]
    tree_children.sort(key=len, reverse=True)
    # flat_children = sum(tree_children, [])

    groups = groupby(tree_children)
    for index, group in enumerate(groups):
        for item in group:
            item['data-group'] = index
            item.group = index
    # print('groups:', groups)

    # graph = [[] for _ in groups]
    # for children in tree_children:
    #     prev = None
    #     for child in children:
    #         if prev is not None:
    #             graph[prev.group].append(child.group)
    #         prev = child

    print('C', [[child.group for child in children] for children in tree_children])
    # print('T', topsort(graph))
    print('G', groups)

    # sorted_groups = [groups[i] for i in topsort(graph)]
    # # if roots[0].get('class') == 'text-wrapper': print('wrapper:', graph, topsort(graph))
    # sorted_groups = [merge(group) for group in sorted_groups]
    sorted_groups = [merge(group) for group in groups]

    soup = Soup(features='html5lib')
    root = soup.new_tag(roots[0].name)
    for x in roots:
        root.attrs.update(x.attrs)
    root['data-ver'] = merge_versions(roots)
    for item in sorted_groups:
        root.append(copy.copy(item))

    return root
Example #35
def clone(el):
    if isinstance(el, NavigableString):
        return type(el)(el)

    copy = Tag(None, el.builder, el.name, el.namespace, el.nsprefix)
    # work around bug where there is no builder set
    # https://bugs.launchpad.net/beautifulsoup/+bug/1307471
    copy.attrs = dict(el.attrs)
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(copy, attr, getattr(el, attr))
    for child in el.contents:
        copy.append(clone(child))
    return copy
Example #36
def pc_to_xml_helper(pc):
    # returns a list of soup Tags given a pc list
    if not pc:
        return []
    new_tag = Tag(name=pc[0])
    interior, second_part = first_split(pc)
    if interior and interior[0] not in tag_names:
        new_tag.string = ' '.join(interior)
    elif interior and interior[0] in tag_names:
        for child in pc_to_xml_helper(interior):
            new_tag.append(child)
    return [new_tag] + pc_to_xml_helper(second_part)
Example #37
def clone(el):
    if isinstance(el, NavigableString):
        return type(el)(el)

    copy = Tag(None, el.builder, el.name, el.namespace, el.nsprefix)
    # work around bug where there is no builder set
    # https://bugs.launchpad.net/beautifulsoup/+bug/1307471
    copy.attrs = dict(el.attrs)
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(copy, attr, getattr(el, attr))
    for child in el.contents:
        copy.append(clone(child))
    return copy
Example #38
 def soup(self):
     '''
         Returns HTML as a BeautifulSoup element.
     '''
     components_soup = Tag(name=self.tagname, builder=BUILDER)
     components_soup.attrs = self.attributes
     for c in flatten(self.components):
         if hasattr(c, 'soup'):
             components_soup.append(c.soup())
         elif isinstance(c, str):
             # components_soup.append(BeautifulSoup(str(c)))
             components_soup.append(str(c))
         # else:
             # Component should not be integrated
             # pass
     return components_soup
Example #39
def clone_bs4_elem(el):
    """Clone a bs4 tag before modifying it.

    Code from `http://stackoverflow.com/questions/23057631/clone-element-with-beautifulsoup`
    """
    if isinstance(el, NavigableString):
        return type(el)(el)

    copy = Tag(None, el.builder, el.name, el.namespace, el.nsprefix)
    # work around bug where there is no builder set
    # https://bugs.launchpad.net/beautifulsoup/+bug/1307471
    copy.attrs = dict(el.attrs)
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(copy, attr, getattr(el, attr))
    for child in el.contents:
        copy.append(clone_bs4_elem(child))
    return copy
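A short usage sketch of clone_bs4_elem (hypothetical markup):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<ul><li>one</li></ul>', 'html.parser')
copy_li = clone_bs4_elem(soup.li)
copy_li.string = 'two'  # safe to modify: the clone is detached from the tree
soup.ul.append(copy_li)
print(soup)  # -> <ul><li>one</li><li>two</li></ul>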
Example #40
def get_markdown_page_index_objects(content: Tag, url: str, page_path: str, title: str, page_type: str,
                                    page_views: int) -> List[Dict]:
    headers = ['h1', 'h2', 'h3']
    index_objects = []
    children = [element for element in content.children if isinstance(element, Tag)]
    if children[0].name not in headers:
        return get_page_index_objects(content, url, page_path, title, page_type, page_views)
    block_title = ""
    content = []
    url_with_href = ""
    for child in children:
        if child.name in headers:
            if block_title != '':
                for ind, page_part in enumerate(get_valuable_content(content)):
                    page_info = {'url': url_with_href, 'objectID': url_with_href + str(ind), 'content': page_part,
                                 'headings': block_title, 'pageTitle': title, 'type': page_type,
                                 'pageViews': page_views}
                    index_objects.append(page_info)
            url_with_href = url + '#' + child.get('id')
            block_title = child.text
            content = []
        else:
            content.append(child)
    # flush the final block, which would otherwise never be indexed
    if block_title != '':
        for ind, page_part in enumerate(get_valuable_content(content)):
            index_objects.append({'url': url_with_href, 'objectID': url_with_href + str(ind),
                                  'content': page_part, 'headings': block_title,
                                  'pageTitle': title, 'type': page_type, 'pageViews': page_views})
    return index_objects
Example #41
def printhtml(csvdiffs):
    """print the html"""
    soup = BeautifulSoup("", "html.parser")  # explicit parser avoids bs4's no-parser warning
    html = Tag(soup, name="html")
    para1 = Tag(soup, name="p")
    para1.append(csvdiffs[0][0])
    para2 = Tag(soup, name="p")
    para2.append(csvdiffs[1][0])
    table = Tag(soup, name="table")
    table.attrs.update(dict(border="1"))

    soup.append(html)
    html.append(para1)
    html.append(para2)
    html.append(table)
    heading2table(soup, table, csvdiffs[3])
    for row in csvdiffs[4:]:
        row = [str(cell) for cell in row]
        row2table(soup, table, row)
    # print(soup.prettify())
    print(soup)
Example #42
		if len(divFigures) != 0:
			
			for fig in divFigures:
				
				figCaption = fig.p
				# Turn the caption into span for CSS formatting

				#note the games chapter needs some caption work
				if figCaption is not None: 
					figCaption.name = "span"

				# [zach] -- this is to make images that are not full width, have captions below the image
				
				div = Tag(soup, None, "div")
				div['style'] = "clear:both"
				div.append(clone(fig.img))
				
				fig.img.replace_with(div)
				# Images have been stored in ./CHAPTER_NAME/images/ relative 
				# to the chapter html, but image references in the html are 
				# to ./images/.  Modify the image tags:
				div.img["src"] = internalImagesPath + "/" + div.img["src"]
		
		# Make all hyperlinks in the chapter target a new window/tab
		hyperlinkTags = soup.find_all("a")
		for hyperlinkTag in hyperlinkTags:
			hyperlinkTag["target"]= "_blank"

		html = str(soup)
		with open(destChapterPath, "wb") as file:
			file.write(html)
Example #43
def _extract_article_body(page):
    article = page.find(id='artikel').find(class_='content')

    body = Tag(name='temporary_tag')

    # Snapshot the children with list(...) first: extract() below mutates the
    # tree mid-loop, so a live iterator over article.children (or iterating
    # article.contents directly) would silently skip elements.
    for element in list(article.children):
        # Ignore the comment form
        if element.name == 'form':
            continue

        # Ignore whitespace
        if element.name is None and re.search(r'\S', str(element)) is None:
            continue

        # Nor div, nor form, nor whitespace: probably article content
        if element.name != 'div':
            body.append(element.extract())
            continue

        # TODO uncomment me when the app is ready to support subtitles
        # Oh, and change the next if with an elif
        #  if 'field-field-ondertitel' in element['class']:
        #      paragraph = _extract_paragraph(element, 'subtitle')
        #      body.append(paragraph)

        if 'field-field-inleiding' in element['class']:
            paragraph = _extract_paragraph(element, 'introduction')
            body.append(paragraph)

        elif 'field-field-img-regulier' in element['class']:
            images_div = Tag(name='div', attrs={'class': 'image'})
            for image_and_caption in element(id='image-and-caption'):
                image = image_and_caption.img
                caption = image_and_caption.find(class_='caption-text')

                paragraph = Tag(name='p')
                paragraph.append(image)
                if caption is not None:
                    paragraph.append(caption.text)

                images_div.append(paragraph)
            body.append(images_div)

        elif 'field-field-website' in element['class']:
            label = element.find(class_='field-label').text
            label_p = Tag(name='p')
            label_s = Tag(name='strong')
            label_s.append(label)
            label_p.append(label_s)
            body.append(label_p)

            websites = element.find(class_='field-item').contents
            for website in list(websites):
                body.append(website)

        else:
            # Ignore other divs
            pass

    return body
Example #44
			for fig in divFigures:

				figCaption = fig.p
				# Turn the caption into span for CSS formatting

				#note the games chapter needs some caption work
				if figCaption is not None:
					figCaption.name = "div"

				# [zach] -- this is to make images that are not full width, have captions below the image

				div = Tag(soup, None, "div")

				div['style'] = "image" #"clear:both"

				div.append(clone(fig.img))

				fig.img.replace_with(div)
				# Images have been stored in ./CHAPTER_NAME/images/ relative
				# to the chapter html, but image references in the html are
				# to ./images/.  Modify the image tags:
				div.img["src"] = internalImagesPath + "/" + div.img["src"]
				# Turn the figure image into a hyperlink that points to the
				# full resolution version of the image
				imgHyperlink = soup.new_tag("a", href=fig.img["src"])
				fig.img.wrap(imgHyperlink)


				fig['class'] = "inner"
				divWhole = Tag(soup, None, "div")
				divWhole['class'] = "figure"
Example #45
def format_links(html):
    '''
    This monster of a function takes in the html from a post and returns a dict
        containing html, text, summary.
    Uses opengraph to try to get titles for all untitled links, and tries to hyperlink everything.
    '''
    edit_html = html
    html = html.replace('&#34;', '"')
    soup = BeautifulSoup(re.sub(r'&(?!amp;)', r'&amp;', html))

    reformat_str = ''.join(random.sample(string.ascii_uppercase, 10)) + '__'
    reformat_dict = {}
    videos = []
    image = None

    # Set aside all <img> tags, because we need to treat them special and will add them in later.
    for tag_index, img_tag in enumerate(soup.find_all('img')):
        key = reformat_str + 'img' + str(tag_index)
        img_tag.replace_with(key)

        # handle the shitty case where a user inputs a non-http link
        if img_tag.has_attr('src') and not img_tag['src'].startswith('http'):
            new_src = 'http://' + img_tag['src']
            img_tag['src'] = new_src
        if not image:
            image = img_tag['src']

        reformat_dict[key] = img_tag

    # Set aside all <a> tags, because we need to treat them special and will add them in later.
    for tag_index, a_tag in enumerate(soup.find_all('a')):
        key = reformat_str + 'a' + str(tag_index)
        a_tag.replace_with(key)

        # handle the shitty case where a user inputs a non-http link
        if a_tag.has_attr('href'):
            new_href = a_tag['href'].strip()
            if not new_href.startswith('http'):
                new_href = 'http://' + a_tag['href']
            a_tag['href'] = new_href
            embed_link = get_embed_link(new_href)
            if embed_link:
                videos.append(embed_link)

        a_tag['target'] = '_blank'

        try:
            if a_tag.string and a_tag['href'] and a_tag.string in a_tag['href']:
                og_title = get_opengraph(a_tag['href'], params=['title']).get('title')
                a_tag.string = og_title.strip()
        except Exception:
            pass

        reformat_dict[key] = a_tag

    mentions = []
    # Find all mentions and format them
    mention_regex = re.compile(r'(@\S+(?:\s\S+)?)')
    for mention_index, mention_str in enumerate(soup(text=mention_regex)):
        key = reformat_str + 'm' + str(mention_index)
        mention_split_list = mention_regex.split(mention_str)
        parent_tag = Tag(name='span')

        for piece in mention_split_list:
            if type(piece) in [unicode, str]:
                s = mention_regex.search(piece)
                if s:
                    first_letter = re.search(r"@\S+", piece).group()[1]
                    names = [u.name for u in User.objects(name__istartswith=first_letter)]
                    for i in range(len(piece) - 1):
                        query_name = re.compile(piece[1:len(piece) - i], flags=re.IGNORECASE)
                        matches = len([name for name in names if query_name.match(name)])
                        if matches == 1:
                            a_tag = Tag(name='a')
                            target_user = User.objects(name=query_name).get()
                            a_tag['href'] = '/profile/%s' % str(target_user.id)
                            a_tag['target'] = '_blank'
                            a_tag['mention'] = 'Yes'
                            a_tag.string = '@' + query_name.pattern
                            parent_tag.append(a_tag)
                            parent_tag.append(NavigableString(piece[len(piece) - i:]))
                            mentions.append(str(target_user.id))
                            break
                    else:
                        # for/else structure
                        # catch an @ that didn't match any users
                        parent_tag.append(NavigableString(piece))
                else:
                    parent_tag.append(NavigableString(piece))

        reformat_dict[key] = parent_tag
        mention_str.replace_with(key)

    opengraph_index = 0
    opengraph_objects = []

    # Find all plaintext links and format them.
    for p in soup.find_all('p'):
        p_text = unicode(p.text)
        if link_regex.search(p_text):
            new_p = Tag(name='p')
            opengraph_only = False
            p_opengraph_objects = []

            link_split_list = link_regex.split(p_text)
            for piece in link_split_list:
                if type(piece) in [unicode, str]:
                    s = link_regex.search(piece)
                    if s:
                        link_text = s.group().strip()
                        if not link_text.startswith('http'):
                            link_text = 'http://' + link_text
                        opengraph = get_opengraph(link_text)
                        a_tag = Tag(name='a')
                        a_tag.string = opengraph.get('title', link_text) or link_text
                        a_tag['href'] = link_text
                        a_tag['target'] = '_blank'

                        if not image and opengraph['image']:
                            image = opengraph['image']

                        embed_link = get_embed_link(link_text)
                        if embed_link:
                            videos.append(embed_link)
                        else:
                            num_items = 0
                            for item in link_split_list:
                                if item and not re.match(r'^<.+>$', item):
                                    num_items += 1
                            if num_items == 1:
                                opengraph_objects.append(opengraph)
                                p_opengraph_objects.append(opengraph)
                                opengraph_only = True

                        new_p.append(a_tag)
                    else:
                        new_p.append(NavigableString(piece))

            if opengraph_only:
                new_p = Tag(name='p')

            for obj in p_opengraph_objects:
                div = Tag(name='div db-opengraph')
                div['site'] = 'comment.opengraph[%d]' % opengraph_index
                opengraph_index += 1
                new_p.append(div)

            p.replace_with(new_p)

    # Bring back all set-aside <a> and <img> tags
    for key in reformat_dict:
        soup(text=key)[0].replace_with(reformat_dict[key])

    # Extract html from soup
    html = unicode(soup)
    html = clean_html(html)

    # Anonymized html
    for mention in soup.find_all('a', attrs={'mention': 'Yes'}):
        mention.replace_with(NavigableString('@User'))
    anonymized_html = unicode(soup)
    anonymized_html = clean_html(anonymized_html)

    # Generate text
    text = MLStripper.strip_html(html)
    anonymized_text = MLStripper.strip_html(anonymized_html)

    # Generate summary
    summary = None
    first_paragraph = re.compile('<p>.+?(<br/>|</p>)').search(html)
    if first_paragraph:
        summary = MLStripper.strip_html(first_paragraph.group())
    if not summary and opengraph_objects:
        summary = opengraph_objects[0]['title']
    if not summary and text:
        summary = text
    if not summary:
        summary = ""

    # Generate anonymized summary
    anonymized_summary = None
    first_paragraph = re.compile('<p>.+?(<br/>|</p>)').search(anonymized_html)
    if first_paragraph:
        anonymized_summary = MLStripper.strip_html(first_paragraph.group())
    if not anonymized_summary and opengraph_objects:
        anonymized_summary = opengraph_objects[0]['title']
    if not anonymized_summary and text:
        anonymized_summary = text
    if not anonymized_summary:
        anonymized_summary = ""

    # Trim all the pesky double-spaces out of the summaries and truncate if necessary
    summary = summary.replace('  ', ' ')
    if len(summary) > 100:
        summary = summary[:97] + '...'
    anonymized_summary = anonymized_summary.replace('  ', ' ')
    if len(anonymized_summary) > 100:
        anonymized_summary = anonymized_summary[:97] + '...'

    return {'html': html, 'edit_html': edit_html, 'summary': summary, 'text': text,
            'anonymized_html': anonymized_html, 'anonymized_summary': anonymized_summary,
            'mentions': mentions, 'videos': videos, 'opengraph': opengraph_objects, 'image': image}
Example #46
			for fig in divFigures:
						
				figCaption = fig.p
				# Turn the caption into span for CSS formatting

				#note the games chapter needs some caption work
				if figCaption is not None: 
					figCaption.name = "div"

				# [zach] -- this is to make images that are not full width, have captions below the image
				
				div = Tag(soup, None, "div")
				
				div['style'] = "image" #"clear:both"
				
				div.append(clone(fig.img))
				
				fig.img.replace_with(div)
				# Images have been stored in ./CHAPTER_NAME/images/ relative 
				# to the chapter html, but image references in the html are 
				# to ./images/.  Modify the image tags:
				div.img["src"] = internalImagesPath + "/" + div.img["src"]
				# Turn the figure image into a hyperlink that points to the
				# full resolution version of the image
				imgHyperlink = soup.new_tag("a", href=fig.img["src"])
				fig.img.wrap(imgHyperlink)


				fig['class'] = "inner"
				divWhole = Tag(soup, None, "div")
				divWhole['class'] = "figure"
Example #47
for c in chapterTags: 
	ul = Tag(soup, None, "ul")
	li = Tag(soup, None, "li")
	a = Tag(soup, None, "a");
	a['href'] = "chapters/" + c['path'] + ".html"
	a.string = c['title']
	li.append(a)
	ul.append(li)

	#print c['title']
	#print c['path']
	if len(c['innerTags']):
		ulInner = Tag(soup, None, "ul")
		li.append(ulInner)
		for tag in c['innerTags']: 
			liInner = Tag(soup, None, "li")
			ulInner.append(liInner)
			a = Tag(soup, None, "a")
			tagNoSpaces = tag.replace(" ", "")
			a['href'] = "chapters/" + c['path'] + ".html#" + tagNoSpaces
			a['target'] = "_top"
			a.string = tag
Example #48
			for fig in divFigures:
						
				figCaption = fig.p
				# Turn the caption into span for CSS formatting

				#note the games chapter needs some caption work
				if figCaption is not None: 
					figCaption.name = "div"

				# [zach] -- this is to make images that are not full width, have captions below the image
				
				div = Tag(soup, None, "div")
				
				div['style'] = "image" #"clear:both"
				
				div.append(clone(fig.img))
				
				fig.img.replace_with(div)
				# Images have been stored in ./CHAPTER_NAME/images/ relative 
				# to the chapter html, but image references in the html are 
				# to ./images/.  Modify the image tags:
				div.img["src"] = internalImagesPath + "/" + div.img["src"]
				# Turn the figure image into a hyperlink that points to the
				# full resolution version of the image
				imgHyperlink = soup.new_tag("a", href=fig.img["src"])
				fig.img.wrap(imgHyperlink)


				fig['class'] = "inner"
				divWhole = Tag(soup, None, "div")
				divWhole['class'] = "figure"
Example #49
		if len(divFigures) != 0:
			
			for fig in divFigures:
				
				figCaption = fig.p
				# Turn the caption into span for CSS formatting

				#note the games chapter needs some caption work
				if figCaption is not None: 
					figCaption.name = "span"

				# [zach] -- this is to make images that are not full width, have captions below the image
				
				div = Tag(soup, None, "div")
				div['style'] = "clear:both"
				div.append(clone(fig.img))
				
				fig.img.replace_with(div)
				# Images have been stored in ./CHAPTER_NAME/images/ relative 
				# to the chapter html, but image references in the html are 
				# to ./images/.  Modify the image tags:
				div.img["src"] = internalImagesPath + "/" + div.img["src"]

				# Turn the figure image into a hyperlink that points to the
				# full resolution version of the image
				imgHyperlink = soup.new_tag("a", href=fig.img["src"])
				fig.img.wrap(imgHyperlink)

		
		# Make all hyperlinks in the chapter target a new window/tab
		hyperlinkTags = soup.find_all("a")
Example #50
def build_rss(url, list_selector, item_selector, ignored_qp, output, pretty=False):
    try:
        soup = BeautifulSoup('<rss version="2.0" />', "xml")
        rss = soup.rss
        has_lxml = True
    except FeatureNotFound:
        rss = BeautifulSoup('<rss version="2.0" />').rss
        has_lxml = False

    r = requests.get(url)
    list_html = (BeautifulSoup(r.text, "lxml") if has_lxml else BeautifulSoup(r.text)).html

    channel = Tag(name="channel")
    rss.append(channel)
    channel.append(new_tag("title", list_html.head.title.string))
    channel.append(new_tag("link", url))
    channel.append(new_tag("description", "--"))
    channel.append(new_tag("lastBuildDate", time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())))
    channel.append(new_tag("generator", "RSS Builder"))

    item_urls = list_html.select(list_selector)
    for item_url in map(lambda i: i["href"], item_urls):
        item_url = urlparse.urljoin(url, item_url)
        parsed = urlparse.urlparse(item_url)
        query_params = urlparse.parse_qsl(parsed.query)
        item_url = urlparse.urlunparse(
            (
                parsed.scheme,
                parsed.netloc,
                parsed.path,
                parsed.params,
                "&".join([k + "=" + v for k, v in query_params if k not in ignored_qp]),
                parsed.fragment,
            )
        )

        r = requests.get(item_url)
        item_html = (BeautifulSoup(r.text, "lxml") if has_lxml else BeautifulSoup(r.text)).html

        item = Tag(name="item")
        item.append(new_tag("title", item_html.head.title.string))
        item.append(new_tag("link", item_url))
        item.append(new_tag("description", str(item_html.select(item_selector)[0])))
        channel.append(item)

    out_func = lambda x: (x.prettify() if pretty else unicode(x)).encode("utf-8")
    if output == "-":
        out_file = sys.stdout
        close_file = lambda: None
    else:
        out_file = open(output, "w")
        close_file = out_file.close

    if has_lxml:
        out_file.write(out_func(soup))
    else:
        out_file.write('<?xml version="1.0" encoding="UTF-8" ?>\n')
        out_file.write(out_func(rss))
    out_file.write("\n")
    close_file()
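A hypothetical invocation (the URL and CSS selectors below are placeholders):

# scrape a listing page into RSS 2.0 on stdout, dropping utm_source tracking params
build_rss('http://example.com/news', 'a.headline', 'div.article-body',
          ['utm_source'], '-', pretty=True)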
Example #51
def rebuild_rss(url, output, selectors, replace = None, pretty = False, raw = False):
	source = feedparser.parse(url)

	try:
		soup = BeautifulSoup('<rss version="2.0" />', 'xml')
		rss = soup.rss
		has_lxml = True
	except FeatureNotFound:
		rss = BeautifulSoup('<rss version="2.0" />').rss
		has_lxml = False

	channel = Tag(name = 'channel')
	rss.append(channel)
	putback_elems(source.feed, channel_required, channel)
	putback_elems(source.feed, channel_optional, channel)

	build_date = Tag(name = 'lastBuildDate')
	build_date.string = time.strftime('%a, %d %b %Y %H:%M:%S +0000', time.gmtime())
	channel.append(build_date)

	generator = Tag(name = 'generator')
	generator.string = source.feed.generator + ' & RSS Rebuilder' if hasattr(source.feed, 'generator') else 'RSS Rebuilder'
	channel.append(generator)

	if replace:
		regexp = re.compile(replace[0])

	for entry in source.entries:
		item = Tag(name = 'item')
		channel.append(item)

		putback_elems(entry, item_required, item)
		putback_elems(entry, item_optional, item)

		r = requests.get(entry.link)
		html = r.content if raw else r.text
		linked_html = BeautifulSoup(html, 'lxml') if has_lxml else BeautifulSoup(html)

		content = ''
		for selector in selectors:
			tags = linked_html.select(selector)
			if replace:
				tags = replace_urls(tags, regexp, replace[1])

			content = reduce(lambda s, tag: s + unicode(tag), tags, content)

		desc = Tag(name = 'description')
		desc.string = content
		item.append(desc)

	out_func = lambda x: (x.prettify() if pretty else unicode(x)).encode('utf-8')
	if output == '-':
		out_file = sys.stdout
		close_file = lambda: None
	else:
		out_file = open(output, 'w')
		close_file = out_file.close

	if has_lxml:
		out_file.write(out_func(soup))
	else:
		out_file.write('<?xml version="1.0" encoding="UTF-8" ?>\n')
		out_file.write(out_func(rss))
	out_file.write('\n')
	close_file()