Example #1
    def __init__(self, soup_obj, process_html, *args, **kwargs):
        # The title is the text of the <h3 class="h4"> inside the card
        # body, up to the first period.
        temp = clean_find(soup_obj, ['div', {'class': 'card-body'}])
        text = "".join(get_strings(clean_find(temp, ["h3", {'class': 'h4'}])))
        self.title = text.split(".")[0]

        raw_rows = get_rows_in_table(soup_obj)
        if raw_rows is not None:
            header_row_text = self.get_table_header(soup_obj)

            rows = [extract_columns(row) for row in raw_rows]
            # Every row must have the same number of cells.
            assert are_rowlens_equal(rows)

            # Reconcile a header that is one cell short by dropping the
            # empty first row, then re-check the widths.
            if len(header_row_text) == len(rows[0]) - 1:
                rows = drop_empty_first_row(rows)
            if len(header_row_text) != len(rows[0]):
                warn("Header: `" + "`, `".join(header_row_text) + "`")
                warn("Row: `" +
                     "`, `".join([get_string(e).strip()
                                  for e in rows[0]]) + "`")
                raise PageScrapeException(
                    "Header has length {0} and rows are length {1}".format(
                        len(header_row_text), len(rows[0])))
            if not process_html:
                # Keep each cell as its raw HTML string.
                rows = [[str(cell) for cell in row] for row in rows]
            else:
                raise NotImplementedError("process_html=True is not supported yet")

            pandas.DataFrame.__init__(self,
                                      *args,
                                      data=rows,
                                      columns=header_row_text,
                                      **kwargs)
        else:
            # No table on the page: fall back to a plain DataFrame.
            pandas.DataFrame.__init__(self, *args, **kwargs)
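Example #1 is the `__init__` of a class that subclasses pandas.DataFrame; the class definition and the scraper helpers (`clean_find`, `get_strings`, `get_rows_in_table`, and so on) are not shown. Judging from the call sites, `clean_find` accepts either a `[name, attrs]` pair (Examples #1-#3, #5) or a callable predicate (Example #4), so a minimal hypothetical reconstruction of it and two of the string helpers might look like:

import re

def clean_find(soup_obj, selector):
    # Hypothetical helper: forward either a callable predicate or a
    # [name, attrs] pair to BeautifulSoup's find().
    if callable(selector):
        return soup_obj.find(selector)
    name, attrs = selector
    return soup_obj.find(name, attrs=attrs)

def get_full_string(tag):
    # Hypothetical helper: all text inside the tag as one string.
    return tag.get_text()

def remove_extra_whitespace(text):
    # Hypothetical helper: collapse whitespace runs to single spaces.
    return re.sub(r"\s+", " ", text).strip()

With that reading, `clean_find(soup, ['div', {'class': 'card-body'}])` behaves like `soup.find('div', attrs={'class': 'card-body'})`.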
Example #2
def get_report_metadata(soup_obj):
    # PDF-only pages carry no scrapeable metadata.
    if is_pdf_page(soup_obj):
        return None

    def temp_get_string(tag):
        # Cleaned-up text of the tag, or None if the tag is missing.
        try:
            text = remove_extra_whitespace(get_full_string(tag))
        except AttributeError:
            text = None
        return text

    d = {}
    d["report_name"] = temp_get_string(
        clean_find(soup_obj, ["h1", {"class": "mb-2"}]))
    d["filer_name"] = temp_get_string(
        clean_find(soup_obj, ["h2", {"class": "filedReport"}]))
    filing_datetime_string = temp_get_string(
        clean_find(soup_obj, ["p", {"class": "muted font-weight-bold"}]))
    candidacy_string = temp_get_string(
        clean_find(soup_obj, ["p", {"class": "muted"}]))
    # Positional parsing: the filing date is the second token and the
    # time the last two tokens of the datetime line.
    d["date_filed"] = filing_datetime_string.split()[1]
    d["time_filed"] = " ".join(filing_datetime_string.split()[-2:])
    d["state"] = candidacy_string.split()[3]  # pretty hacky, i admit
    d["candidacy_date"] = candidacy_string.split()[-1]

    return d
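A usage sketch for `get_report_metadata`, building on the hypothetical helpers above and on `is_pdf_page` from Example #5, with made-up HTML shaped the way the positional parsing expects (live pages may word these lines differently):

from bs4 import BeautifulSoup

# Hypothetical page fragment; real markup and wording may differ.
html = """
<h1 class="mb-2">Annual Report</h1>
<h2 class="filedReport">Jane Doe</h2>
<p class="muted">Declared candidacy for Ohio on 01/01/2020</p>
<p class="muted font-weight-bold">Filed 04/15/2020 at 9:00 AM</p>
"""
soup = BeautifulSoup(html, "html.parser")
metadata = get_report_metadata(soup)
# metadata["date_filed"] == "04/15/2020", metadata["time_filed"] == "9:00 AM",
# metadata["state"] == "Ohio", metadata["candidacy_date"] == "01/01/2020"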
Example #3
def ommitted_assets(soup_obj):
    # The flag is a checkbox; it counts as set iff the "checked"
    # attribute is present on the input.
    checkbox = clean_find(soup_obj,
                          ["input", {"name": "filing_omitted_assets"}])
    try:
        return checkbox["checked"] == "checked"
    except KeyError:
        return False
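A slightly more defensive variant could use `Tag.get`, which also guards against the page having no such checkbox at all (the version above raises TypeError in that case); a sketch:

def ommitted_assets(soup_obj):
    checkbox = clean_find(soup_obj,
                          ["input", {"name": "filing_omitted_assets"}])
    # .get() returns None for an absent attribute, and the `is not None`
    # check covers clean_find finding no checkbox at all.
    return checkbox is not None and checkbox.get("checked") == "checked"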
Example #4
def get_series_id(soup_obj):
    """Requires the "category" page as the soup_obj."""
    def is_java_link(tag):
        try:
            return "javascript:showCat" in tag["href"]
        except KeyError:
            return False

    javascript_link = clean_find(soup_obj, is_java_link)
    if javascript_link:
        # The series id is the first argument of the showCat(...) call.
        series_id = (javascript_link.get("href")
                     .split("javascript:showCat(")[-1]
                     .split(",")[0])
        return series_id
    else:
        warn("Manga does not have an id")
        return None
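For concreteness, here is the href shape `get_series_id` expects, assuming the `clean_find` sketch from earlier and hypothetical markup:

from bs4 import BeautifulSoup

html = '<a href="javascript:showCat(12345, 0)">Chapters</a>'
soup = BeautifulSoup(html, "html.parser")
print(get_series_id(soup))  # prints 12345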
Example #5
def is_pdf_page(soup_obj):
    # A div#myCarousel viewer marks pages that render the filing as a PDF.
    return clean_find(soup_obj, ["div", {"id": "myCarousel"}]) is not None