Example No. 1
def _generate_entities():
    """for each scrapable page, yield an entity"""
    for url in _get_scrape_urls():
        doc = BeautifulSoup(helpers.fetch_string(url), "html.parser")
        lis = doc.find_all("li", class_="card card--horizontal")
        for li in lis:
            name = get_name(li.find("span", class_="card__name").get_text())

            try:
                mydate = get_date(
                    li.find("span", class_="card__date").get_text())
            except AttributeError:
                mydate = ''

            try:
                place = li.find("span", class_="card__place").get_text()
            except AttributeError:
                place = ""

            picture_url = urljoin(_site_url, li.find("img")["src"])

            fields = [
                {
                    "name": "Date",
                    "value": mydate
                },
                {
                    "name": "Place",
                    "value": place
                },
                {
                    "tag": "picture_url",
                    "value": picture_url
                },
            ]

            try:
                entity_url = urljoin(_entity_base_url,
                                     li.find("a", class_="card__box")["href"])
                doc2 = BeautifulSoup(helpers.fetch_string(entity_url),
                                     "html.parser")
                div = doc2.find("div", class_="article__text")
                ps = div.find_all("p")
                header = ps[0].get_text().strip()
                text = ' '.join([p.get_text().strip() for p in ps[1:]])
                fields.append({"name": header, "value": text})
            except TypeError:
                entity_url = ''

            fields.append({"tag": "url", "value": entity_url})

            yield {
                "_meta": {
                    "id": helpers.make_id(name),
                    "entity_type": "person"
                },
                "fields": fields,
                "name": name
            }
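The examples above and below rely on a project-specific helpers module plus small utilities such as get_name and get_date that are not included in these listings. A minimal sketch, assuming plausible behaviour for two of those helpers so the snippets can be read in isolation (the names exist in the examples, but these bodies are assumptions, not the real module):

# Hypothetical stand-ins for the project helpers used throughout these examples.
# The real implementations are not shown here; the behaviour below is an assumption.
import hashlib
import urllib2  # the snippets that import urllib2 target Python 2

def fetch_string(url, cache_hours=0):
    # Assumed: return the page body as a string; the cache_hours hint is ignored here.
    return urllib2.urlopen(url).read()

def make_id(name):
    # Assumed: derive a stable identifier from an entity name.
    return hashlib.sha1(name.encode('utf8')).hexdigest()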
Example No. 2
def _generate_entities():
    """for each scrapable page, yield an entity"""

    for url in _get_scrape_urls():

        doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6), "html.parser")
        div = doc.find('div', id="print_content")
        u = div.find("u").get_text().strip()
        name = u.split(':')[-1]
        year = u.split(':')[0][-4:]

        p = div.find_all("p")[1]

        text = p.get_text().strip()

        fields = [
            {"tag": "url", "value": url},
            {"tag": "text", "value": text},
            {"tag": "year", "value": year}
        ]

        yield {
            "_meta": {
                "id": helpers.make_id(name),
                "entity_type": "company"
            },
            "fields": fields,
            "name": name
        }
Example No. 3
def _custom_opener(url, linux=False):
    if linux:
        return BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    else:
        from urllib2 import urlopen

        return BeautifulSoup(urlopen(url).read())
Example No. 4
def _generate_entities():
    for url in _get_scrape_urls():
        doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6))

        # setup entity
        entity = {
            "_meta": {
                "id": re.sub(".*AppealId=", "", url),
                "entity_type": "person"
            },
            "types": ["warning"],
            "fields": []
        }

        # load details into fields
        for dt in doc.find("dl", class_="details").find_all("dt"):
            dd = dt.find_next_sibling("dd")
            if "suspect name" in dt.get_text().lower():
                entity["name"] = " ".join([w for w in dd.get_text().split() if re.match("^[a-zA-Z]+$", w)])
            else:
                entity["fields"].append({"name": dt.get_text().strip(), "value": dd.get_text().strip()})

        # load "full text" section
        for h3 in doc.find("div", class_="summary").find_all("h3"):
            p = h3.find_next_sibling("h3")
            if p is not None and "Full Text" in h3.get_text():
                entity["fields"].append({"name": "Summary", "value": p.get_text()})

        # give back entity
        if entity.get("name", "").strip().lower() not in ["", "unknown"]:
            yield entity
Example No. 5
def _generate_entities():
    """for each scrapable page, yield an entity"""

    for url in _get_scrape_urls():

        doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6),
                            "html.parser")
        div = doc.find('div', id="print_content")
        u = div.find("u").get_text().strip()
        name = u.split(':')[-1]
        year = u.split(':')[0][-4:]

        p = div.find_all("p")[1]

        text = p.get_text().strip()

        fields = [{
            "tag": "url",
            "value": url
        }, {
            "tag": "text",
            "value": text
        }, {
            "tag": "year",
            "value": year
        }]

        yield {
            "_meta": {
                "id": helpers.make_id(name),
                "entity_type": "company"
            },
            "fields": fields,
            "name": name
        }
Example No. 6
def _custom_opener(url, linux=False):
    if linux:
        return BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    else:
        from urllib2 import urlopen

        return BeautifulSoup(urlopen(url))
Example No. 7
def get_tables(url):
    objects = {'objects': []}
    main_page = BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    # main_page = BeautifulSoup(urlopen(url))
    tables = main_page.find_all('table')

    def __fill_helper(_tag):
        table_object['instance'].append({CUSTOM_TAG: _bs_to_utf(_tag), 'people': []})

    for table in tables:
        table_object = {'instance': []}
        rows = table.find_all('tr')
        for row in rows:
            tds = row.find_all('td')
            len_tds = len(tds)

            if len_tds:
                if len_tds > 1:
                    p_name = _bs_to_utf(tds[1])
                    p_name = re.sub(r"^\s+", "", p_name.split(".")[-1].strip())
                    person_info = {POSITION: _bs_to_utf(tds[0]), 'person_name': p_name}
                    if tds[0].find('a'):
                        person_info.update({'person_url': _site_ulr + tds[0].find('a')['href']})
                        if CUSTOM_TAG in table_object['instance'][-1]:
                            table_object['instance'][-1]['people'].append(person_info)
                else:
                    __fill_helper(tds[0])
            else:
                __fill_helper(row)

        objects['objects'].append(table_object)

    return objects
Example No. 8
def _generate_entities():
    """for each scrapable page, yield an entity"""
    for url in _get_scrape_urls():
        doc = BeautifulSoup(helpers.fetch_string(url), "html.parser")
        lis = doc.find_all("li", class_="card card--horizontal")
        for li in lis:
            name = get_name(li.find("span", class_="card__name").get_text())

            try:
                mydate = get_date(li.find("span", class_="card__date").get_text())
            except AttributeError:
                mydate = ''

            try:
                place = li.find("span", class_="card__place").get_text()
            except AttributeError:
                place = ""

            picture_url = urljoin(_site_url, li.find("img")["src"])

            fields = [
                {"name": "Date", "value": mydate},
                {"name": "Place", "value": place},
                {"tag": "picture_url", "value": picture_url},
            ]

            try:
                entity_url = urljoin(_entity_base_url, li.find("a", class_="card__box")["href"])
                doc2 = BeautifulSoup(helpers.fetch_string(entity_url), "html.parser")
                div = doc2.find("div", class_="article__text")
                ps = div.find_all("p")
                header = ps[0].get_text().strip()
                text = ' '.join([p.get_text().strip() for p in ps[1:]])
                fields.append({"name": header, "value": text})
            except TypeError:
                entity_url = ''

            fields.append({"tag": "url", "value": entity_url})

            yield {
                "_meta": {
                    "id": helpers.make_id(name),
                    "entity_type": "person"
                },
                "fields": fields,
                "name": name
            }
Example No. 9
def _generate_entities():
    """for each scrapable page, yield an entity"""

    for url in _get_scrape_urls():

        doc = BeautifulSoup(helpers.fetch_string(url), "html.parser")
        table = doc.find('table', class_='views-table').find('tbody')
        trs = table.find_all('tr')
        for tr in trs:
            td = tr.find_all('td')
            href = td[0].find_all('a')[1]['href']
            name = td[1].get_text().strip()
            matter_type = td[2].get_text().strip()
            matter_type = " ".join([word.capitalize() for word in matter_type.split()])
            date_failed = td[3].get_text().strip()
            date_failed = "{}-{}-{}".format(date_failed[:4], date_failed[4:6], date_failed[6:])

            fields = [{"name": "Matter Type", "value": matter_type},
                      {"name": "Docket Number", "value": href},
                      {"name": "Date Failed", "value": date_failed}]

            names = _get_name(name)
            aka = []
            if len(names) > 1:
                name = names[0]
                for aka_name in names:
                    aka.append({'name': aka_name})
            else:
                name = names[0]

            my_id = helpers.make_id(name)
            if len(my_id) > 99:
                my_id = my_id[:99]

            if any(word in name for word in company):
                entity_type = "company"
            else:
                entity_type = "person"

            if aka:
                yield {
                    "_meta": {
                        "id": my_id,
                        "entity_type": entity_type
                    },
                    "fields": fields,
                    "aka": aka,
                    "name": name,
                }
            else:
                yield {
                    "_meta": {
                        "id": my_id,
                        "entity_type": entity_type
                    },
                    "fields": fields,
                    "name": name,
                }
Example No. 10
def _generate_entities(url, name, office, years_active=None, parish=None):
    """for each scrapable page, yield an entity"""

    doc = BeautifulSoup(helpers.fetch_string(url), "html.parser")
    img = urljoin(_site_url, doc.find("img", class_="ms-rteImage-2")['src'])
    h3 = doc.find_all('h3', class_="ms-rteElement-H3")

    div = h3[0].find_next_sibling()
    current = 'Current Posts ' + div.get_text().strip()
    while div.name != 'h3':
        div = div.find_next_sibling()
        if div:
            current += ' ' + div.get_text().strip()
        else:
            break
    
    current = current.replace('dateMember', 'date Member')

    fields = [
        {"tag": "url", "value": url},
        {"tag": "Current Posts", "value": current},
        {"tag": "picture_url", "value": img},
        {"tag": "Office", "value": office},
    ]

    if years_active:
        fields.append({"tag": "Years Active", "value": years_active})

    if parish:
        fields.append({"tag": "Parish", "value": parish})

    try:
        p = h3[1].find_next_sibling()
        career = p.get_text().strip()
        while p.name == 'p':
            p = p.find_next_sibling()
            career += ' ' + p.get_text().strip()
        fields.append({"tag": "Parliamentary Career", "value": career})
    except IndexError:
        pass

    ps = doc.find_all("p", class_="ms-rteElement-P")
    for p in ps:
        p = p.get_text().strip()
        if 'Born' in p:
            fields.append({"tag": "date_of_birth", "value": get_date(p.split(':')[-1].strip())})
        elif 'Parents' in p:
            fields.append({"tag": "Parents", "value": p.split(':')[-1].strip()})
    
    return {
        "_meta": {
            "id": helpers.make_id(name),
            "entity_type": "person"
        },
        "fields": fields,
        "name": name
    }
Example No. 11
def _generate_entities():
    """for each scrapable page, yield an entity"""

    for url in _get_scrape_urls():

        doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6), "html.parser")
        section = doc.find('section', id="pageContent")
        h1 = section.find("h1").get_text().strip()

        if '(' in h1:
            h1_without_bracket = h1.split('(')[0] + h1.split(')')[-1]
            h1_without_bracket = h1_without_bracket.strip()
        else:
            h1_without_bracket = h1

        names1 = h1_without_bracket.split(',')
        names2 = []
        for name in names1:
            for new_name in divide_name(name, ' & '):
                names2.append(new_name)
        new_names = []

        for name in names2:
            for new_name in divide_name(name, ' and '):
                new_names.append(new_name)

        text = section.find("p").get_text().strip()
        fields = [
            {"tag": "url", "value": url},
            {"name": "text", "value": text}
            ]

        custom_fields = section.find_all("h2")
        for custom_field in custom_fields:
            field_name = custom_field.get_text().strip()
            if field_name == 'Defendants':
                values1 = section.find_all('div', class_="chargeDefendant")
                values2 = section.find_all('div', class_="chargeCharge")
                values = zip(values1, values2)
                field_value = ' '.join([value[0].get_text().strip() + ' ' + value[1].get_text().strip() for value in values])
            else:
                field_value = custom_field.find_next_sibling('p').get_text().strip()
            fields.append({"tag": field_name, "value": field_value})

        for name in new_names:

            name = name.strip()

            yield {
                "_meta": {
                    "id": helpers.make_id(name),
                    "entity_type": "company"
                },
                "fields": fields,
                "name": name
            }
Example No. 12
def get_rows(url):
    objects = []
    main_page = BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    rows = main_page.find('table').find_all('tr')
    for row in rows[1:]:
        name, _url = row.find_all('td')[:2]
        obj = {'name': _bs_to_utf(name),
               'picture_url': _site_ulr + _url.find('img')['src'] if _url.find('img') else _bs_to_utf(_url)}
        objects.append(obj)
    return objects
Example No. 13
def _custom_opener(url, linux=True):
    if linux:
        return BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    else:
        from urllib2 import urlopen

        try:
            return BeautifulSoup(urlopen(url).read())
        except Exception as e:
            print(e)
Example No. 14
def _custom_opener(url, linux=False):
    if linux:
        return BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    else:
        from urllib2 import urlopen

        try:
            return BeautifulSoup(urlopen(url).read())
        except Exception as e:
            print(e)
Example No. 15
def _get_scrape_urls():
    for page in itertools.count(1):
        doc = BeautifulSoup(helpers.fetch_string(_base_url.format(page), cache_hours=6))

        # find all matching tags, bail if no more
        found_urls = False
        for a in doc.find_all("a"):
            if a.has_attr("href") and "most-wanted-detail" in a["href"]:
                yield urlparse.urljoin(_base_url, a["href"])
                found_urls = True
        if not found_urls:
            break
Example No. 16
def _generate_entities():
    """for each scrapable page, yield an entity"""
    run = True
    page = 0
    while run:
        url = _base_url + str(page)
        page += 1
        doc = BeautifulSoup(helpers.fetch_string(url), "html.parser")
        div = doc.find('div', id="resultsSearchBox")
        all_h3 = div.find_all("h3", id='')

        if not all_h3:
            run = False
            return

        for h3 in all_h3:
            a = h3.find('a')
            href = urljoin(_site_url, a['href'])
            name = a.get_text().split(':')[1].strip()
            sub = h3.find_next_sibling('sub')
            spans = sub.find_all('span')
            if spans:
                published = get_date(spans[0].get_text().strip())
                modified = get_date(spans[1].get_text().strip())
            else:
                sub = sub.get_text().strip()
                published = get_date(sub[11:21])
                modified = get_date(sub[-10:])

            if any(company in name.lower() for company in companies):
                entity_type = "company"
            else:
                entity_type = "person"

            fields = [{
                "tag": "url",
                "value": href
            }, {
                "tag": "Published",
                "value": published
            }, {
                "tag": "Last Modified",
                "value": modified
            }]

            yield {
                "_meta": {
                    "id": helpers.make_id(name),
                    "entity_type": entity_type
                },
                "fields": fields,
                "name": name
            }
Example No. 17
def _get_scrape_urls():
    """find all scrapable links on main page"""

    for url in _base_url, _base_url2:

        doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6), "html.parser")
        uls = doc.find_all("ul", class_="contentListing")

        for ul in uls:
            href = ul.find_all("a")
            for link in href:
                if link:
                    yield link['href']
Example No. 18
def _get_parties(url):
    party_objects = []

    main_page = BeautifulSoup(helpers.fetch_string(url, cache_hours=6))

    table_party = main_page.find('table', {'class': 'telbogTable'}).find_all('tr')[1:-1]

    for row in table_party:
        party_url = _site_ulr + row.find('a')['href']
        party_id = row.find('a')['href'].split('=')[1].replace('{', '').replace('}', '')
        party_name = row.find('a').text.encode('utf8')
        party_objects.append({'party_url': party_url, 'party_id': party_id, 'party_name': party_name})

    return party_objects
Example No. 19
def _get_scrape_urls():
    """find all scrapable links on main page"""

    for url in _base_url, _base_url2:

        doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6),
                            "html.parser")
        uls = doc.find_all("ul", class_="contentListing")

        for ul in uls:
            href = ul.find_all("a")
            for link in href:
                if link:
                    yield link['href']
Example No. 20
def get_rows(url):
    objects = []
    main_page = BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    rows = main_page.find('table').find_all('tr')
    for row in rows[1:]:
        name, _url = row.find_all('td')[:2]
        obj = {
            'name': _bs_to_utf(name),
            'picture_url': _site_ulr + _url.find('img')['src'] if _url.find('img') else _bs_to_utf(_url)
        }
        objects.append(obj)
    return objects
Example No. 21
def _get_scrape_urls():
    """find all scrapable links on main page"""

    url = _base_url

    doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6), "html.parser")
    div = doc.find("div", id="print_content")
    uls = div.find_all('ul')

    for ul in uls:
        href = ul.find_all("a")
        for link in href:
            link = link['href']
            link = urljoin(_site_url, link)
            yield link
Example No. 22
def _generate_entities(data):
    """for each scrapable page, yield an entity"""

    i = 0
    while i < len(data):
        release_date = datetime.strptime(data[i].text, '%m/%d/%Y')
        release_date = release_date.strftime('%Y-%m-%d')
        name = data[i + 1].text
        url = data[i + 1].find_element_by_tag_name('a').get_attribute("href")

        href = data[i + 2].find_element_by_tag_name('a').get_attribute("href")
        related = []
        if href:
            doc = BeautifulSoup(helpers.fetch_string(href), "html.parser")
            tds = doc.find_all("td", class_='ms-vb')
            for td in tds:
                try:
                    related.append(td.find('a')['href'])
                except AttributeError:
                    pass

        related_documents = ' '.join(related)
        fields = [{
            "name": "Release date",
            "value": release_date
        }, {
            "tag": "url",
            "value": url
        }, {
            "name": "Related documents",
            "value": related_documents
        }]
        i += 3

        my_id = helpers.make_id(name)
        if len(my_id) > 99:
            my_id = my_id[:99]

        entity = {
            "_meta": {
                "id": my_id,
                "entity_type": "company"
            },
            "fields": fields,
            "name": name,
        }

        helpers.emit(entity)
Example No. 23
def _get_scrape_urls():
    """find all scrapable links on main page"""

    url = _base_url

    doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6),
                        "html.parser")
    div = doc.find("div", id="print_content")
    uls = div.find_all('ul')

    for ul in uls:
        href = ul.find_all("a")
        for link in href:
            link = link['href']
            link = urljoin(_site_url, link)
            yield link
Example No. 24
def custom_opener(url, linux=True):
    """
    When running on Windows, pass linux=False; change it back to linux=True before the final contribution.
    :param url: input url
    :param linux: switch between linux or windows
    """
    from bs4 import BeautifulSoup
    from helpers import fetch_string

    if linux:
        return BeautifulSoup(fetch_string(url, cache_hours=6))
    else:
        from urllib2 import urlopen

        try:
            return BeautifulSoup(urlopen(url).read())
        except Exception as e:
            print(e)
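A possible call site for the opener above, given only as an assumed usage sketch (the URL is a placeholder, not part of the original project):

# Hypothetical usage of custom_opener; the URL below is a placeholder.
soup = custom_opener('http://example.com/listing', linux=True)
for a in soup.find_all('a'):
    print(a.get('href'))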
Example No. 25
def get_tables(url):
    objects = {'objects': []}
    main_page = BeautifulSoup(helpers.fetch_string(url, cache_hours=6))
    # main_page = BeautifulSoup(urlopen(url))
    tables = main_page.find_all('table')

    def __fill_helper(_tag):
        table_object['instance'].append({
            CUSTOM_TAG: _bs_to_utf(_tag),
            'people': []
        })

    for table in tables:
        table_object = {'instance': []}
        rows = table.find_all('tr')
        for row in rows:
            tds = row.find_all('td')
            len_tds = len(tds)

            if len_tds:
                if len_tds > 1:
                    p_name = _bs_to_utf(tds[1])
                    p_name = re.sub(r"^\s+", "", p_name.split(".")[-1].strip())
                    person_info = {
                        POSITION: _bs_to_utf(tds[0]),
                        'person_name': p_name
                    }
                    if tds[0].find('a'):
                        person_info.update({
                            'person_url':
                            _site_ulr + tds[0].find('a')['href']
                        })
                        if CUSTOM_TAG in table_object['instance'][-1]:
                            table_object['instance'][-1]['people'].append(
                                person_info)
                else:
                    __fill_helper(tds[0])
            else:
                __fill_helper(row)

        objects['objects'].append(table_object)

    return objects
Example No. 26
def custom_opener(url, linux=True):
    """
    When running on Windows, pass linux=False; change it back to linux=True before the final contribution.
    :param url: input url
    :param linux: switch between linux or windows
    """
    from bs4 import BeautifulSoup
    from helpers import fetch_string

    if linux:
        return BeautifulSoup(fetch_string(url, cache_hours=6))
    else:
        from urllib2 import urlopen

        try:
            return BeautifulSoup(urlopen(url).read())
        except Exception as e:
            print(e)
Example No. 27
def _generate_entities(data):
    """for each scrapable page, yield an entity"""

    i = 0
    while i < len(data):
        release_date = datetime.strptime(data[i].text, '%m/%d/%Y')
        release_date = release_date.strftime('%Y-%m-%d')
        name = data[i+1].text
        url = data[i+1].find_element_by_tag_name('a').get_attribute("href")

        href = data[i+2].find_element_by_tag_name('a').get_attribute("href")
        related = []
        if href:
            doc = BeautifulSoup(helpers.fetch_string(href), "html.parser")
            tds = doc.find_all("td", class_='ms-vb')
            for td in tds:
                try:
                    related.append(td.find('a')['href'])
                except AttributeError:
                    pass
         
        related_documents = ' '.join(related) 
        fields = [{"name": "Release date", "value": release_date},
                  {"tag": "url", "value": url},
                  {"name": "Related documents", "value": related_documents}]
        i += 3

        my_id = helpers.make_id(name)
        if len(my_id) > 99:
            my_id = my_id[:99]

        entity = {
            "_meta": {
                "id": my_id,
                "entity_type": "company"
            },
            "fields": fields,
            "name": name,
        }

        helpers.emit(entity)
Example No. 28
def custom_opener(url):
    """
    Detect the operating system at runtime and choose the matching opener.
    :param url: input url
    """

    import platform
    from helpers import fetch_string

    _OS_LINUX = "linux" in platform.system().lower() or "unix" in platform.system().lower()

    if _OS_LINUX:
        return BeautifulSoup(fetch_string(url, cache_hours=6))
    else:
        from urllib2 import urlopen

        try:
            return BeautifulSoup(urlopen(url, timeout=20).read())
        except Exception as e:
            print(e)
Example No. 29
def _get_parties(url):
    party_objects = []

    main_page = BeautifulSoup(helpers.fetch_string(url, cache_hours=6))

    table_party = main_page.find('table', {
        'class': 'telbogTable'
    }).find_all('tr')[1:-1]

    for row in table_party:
        party_url = _site_ulr + row.find('a')['href']
        party_id = row.find('a')['href'].split('=')[1].replace('{', '').replace('}', '')
        party_name = row.find('a').text.encode('utf8')
        party_objects.append({
            'party_url': party_url,
            'party_id': party_id,
            'party_name': party_name
        })

    return party_objects
Example No. 30
def _generate_entities(url, name, office, years_active=None, parish=None):
    """for each scrapable page, yield an entity"""

    doc = BeautifulSoup(helpers.fetch_string(url), "html.parser")
    img = urljoin(_site_url, doc.find("img", class_="ms-rteImage-2")['src'])
    h3 = doc.find_all('h3', class_="ms-rteElement-H3")

    div = h3[0].find_next_sibling()
    current = 'Current Posts ' + div.get_text().strip()
    while div.name != 'h3':
        div = div.find_next_sibling()
        if div:
            current += ' ' + div.get_text().strip()
        else:
            break

    current = current.replace('dateMember', 'date Member')

    fields = [
        {
            "tag": "url",
            "value": url
        },
        {
            "tag": "Current Posts",
            "value": current
        },
        {
            "tag": "picture_url",
            "value": img
        },
        {
            "tag": "Office",
            "value": office
        },
    ]

    if years_active:
        fields.append({"tag": "Years Active", "value": years_active})

    if parish:
        fields.append({"tag": "Parish", "value": parish})

    try:
        p = h3[1].find_next_sibling()
        career = p.get_text().strip()
        while p.name == 'p':
            p = p.find_next_sibling()
            career += ' ' + p.get_text().strip()
        fields.append({"tag": "Parliamentary Career", "value": career})
    except IndexError:
        pass

    ps = doc.find_all("p", class_="ms-rteElement-P")
    for p in ps:
        p = p.get_text().strip()
        if 'Born' in p:
            fields.append({
                "tag": "date_of_birth",
                "value": get_date(p.split(':')[-1].strip())
            })
        elif 'Parents' in p:
            fields.append({
                "tag": "Parents",
                "value": p.split(':')[-1].strip()
            })

    return {
        "_meta": {
            "id": helpers.make_id(name),
            "entity_type": "person"
        },
        "fields": fields,
        "name": name
    }
Example No. 31
def _get_scrape_urls():
    """find all scrapable links on main page"""

    doc = BeautifulSoup(helpers.fetch_string(_base_url), "html.parser")
    council = doc.find('div', id='div_938d72bb-8154-4b6a-bd71-8144ca6bf1a0')
    house = doc.find('div', id='div_47793ec9-3449-46a3-9095-f2eb8c475846')

    council_lis = council.find_all("div", class_="link-item")
    house_lis = house.find_all("li", class_="dfwp-item")

    for li in council_lis:
        person = li.find("a")
        link = person["href"]
        name = _get_name(person.get_text())
        office = "Legislative Council"
        entity = _generate_entities(link, name, office)
        yield entity

    for li in house_lis:
        try:
            parish = li.find("div", class_="groupheader").get_text().strip()
        except AttributeError:
            continue
        all_div = li.find_all("div", class_="link-item")
        for div in all_div:
            person = div.find("a")
            link = person["href"]
            name = _get_name(person.get_text())
            office = "House of Keys"
            years_active = None
            entity = _generate_entities(link, name, office, years_active,
                                        parish)
            yield entity

    doc = BeautifulSoup(helpers.fetch_string(_base_url2), "html.parser")
    div = doc.find('div', id='div_a1526572-2de9-494b-a410-6fdc17d3b84e')
    trs = div.find_all('tr', class_='ms-itmhover')
    for tr in trs:
        try:
            td = tr.find_all('td')
            name = _get_name(td[1].get_text())
            office = "House of Keys"
            link = urljoin(_site_url, td[3].find('a')['href'])
            years_active = td[2].get_text().strip()
            try:
                date = int(years_active.split()[-1])
                if date < 1990:
                    continue
            except ValueError:
                pass
            if '.pdf' in link:
                fields = [{
                    "tag": "url",
                    "value": link
                }, {
                    "tag": "Years Active",
                    "value": years_active
                }, {
                    "tag": "Office",
                    "value": office
                }]
                yield {
                    "_meta": {
                        "id": helpers.make_id(name),
                        "entity_type": "person"
                    },
                    "fields": fields,
                    "name": name
                }
                continue
            entity = _generate_entities(link, name, office, years_active)
            yield entity
        except TypeError:
            pass
Example No. 32
def _get_scrape_urls():
    """find all scrapable links on main page"""

    doc = BeautifulSoup(helpers.fetch_string(_base_url), "html.parser")
    council = doc.find('div', id='div_938d72bb-8154-4b6a-bd71-8144ca6bf1a0')
    house = doc.find('div', id='div_47793ec9-3449-46a3-9095-f2eb8c475846')

    council_lis = council.find_all("div", class_="link-item")
    house_lis = house.find_all("li", class_="dfwp-item")

    for li in council_lis:
        person = li.find("a")
        link = person["href"]
        name = _get_name(person.get_text())
        office = "Legislative Council"
        entity = _generate_entities(link, name, office)
        yield entity

    for li in house_lis:
        try:             
            parish = li.find("div", class_="groupheader").get_text().strip()
        except AttributeError:
            continue
        all_div = li.find_all("div", class_="link-item")
        for div in all_div: 
            person = div.find("a")
            link = person["href"]
            name = _get_name(person.get_text())
            office = "House of Keys"
            years_active = None
            entity = _generate_entities(link, name, office, years_active, parish)
            yield entity

    doc = BeautifulSoup(helpers.fetch_string(_base_url2), "html.parser")
    div = doc.find('div', id='div_a1526572-2de9-494b-a410-6fdc17d3b84e')
    trs = div.find_all('tr', class_='ms-itmhover')
    for tr in trs:
        try:
            td = tr.find_all('td')
            name = _get_name(td[1].get_text())
            office = "House of Keys"
            link = urljoin(_site_url, td[3].find('a')['href'])
            years_active = td[2].get_text().strip()
            try:
                date = int(years_active.split()[-1])
                if date < 1990:
                    continue
            except ValueError:
                pass
            if '.pdf' in link:
                fields = [
                    {"tag": "url", "value": link},
                    {"tag": "Years Active", "value": years_active},
                    {"tag": "Office", "value": office}
                ]
                yield {
                    "_meta": {
                        "id": helpers.make_id(name),
                        "entity_type": "person"
                    },
                    "fields": fields,
                    "name": name
                }
                continue
            entity = _generate_entities(link, name, office, years_active)
            yield entity
        except TypeError:
            pass
Example No. 33
def _generate_entities():
    """for each scrapable page, yield an entity"""

    doc = BeautifulSoup(helpers.fetch_string(_base_url[0]), "html.parser")
    form = doc.find('form', {'name': 'criminalqueryeng_p2'})
    tables = form.find_all('table', {'bgcolor': '#84BD00'})

    tr = tables[0].find_all('tr')
    i = 1
    while i < len(tr):
        td = tr[i].find_all('td')
        name = _get_name(td[2].get_text().strip())
        date_filing = _get_date(td[1].get_text().strip())
        try:
            url = td[6].find('a')['href']
        except TypeError:
            url = ''
        summarized_facts = td[5].get_text().strip()

        fields = [{"name": "Summarized Facts", "value": summarized_facts},
                  {"name": "Press Release", "value": url},
                  {"name": "Date of Complaint Filing", "value": date_filing}]

        yield {
            "_meta": {
                "id": helpers.make_id(name),
                "entity_type": "company"
            },
            "fields": fields,
            "name": name,
        }
        i += 2

    tr = tables[1].find_all('tr')
    i = 1
    while i < len(tr):
        td = tr[i].find_all('td')
        name = _get_name(td[4].get_text().strip())
        date_filing = _get_date(td[1].get_text().strip())
        try:
            url = td[8].find('a')['href']
        except TypeError:
            url = ''
        summarized_facts = td[7].get_text().strip()
        baht = td[9].get_text().strip()
        section = td[5].get_text().strip()
        law = td[6].get_text().strip()
        nomer = td[3].get_text().strip()

        fields = [{"name": "Summarized Facts", "value": summarized_facts},
                  {"name": "Press Release", "value": url},
                  {"name": "Date of Complaint Filing", "value": date_filing},
                  {"name": "Amount of Fines (Baht)", "value": baht},
                  {"name": "Section", "value": section},
                  {"name": "Relevant Law", "value": law},
                  {"name": "Order Number", "value": nomer},
        ]

        yield {
            "_meta": {
                "id": helpers.make_id(name),
                "entity_type": "company"
            },
            "fields": fields,
            "name": name,
        }
        i += 2

    doc = BeautifulSoup(helpers.fetch_string(_base_url[1]), "html.parser")
    tr = doc.find_all('tr')
    i = 0
    while i < len(tr):
        try:
            td = tr[i].find_all('td')
            name = _get_name(td[1].get_text().strip())
            type_personal = td[2].get_text().strip()
            try:
                url = td[3].find('a')['href']
            except TypeError:
                url = ''
            summarized_facts = td[4].get_text().strip()
            administrative_orders = td[5].get_text().strip()
            effective_date = td[6].get_text().strip()
      
            fields = [{"name": "Type of Personal", "value": type_personal},
                      {"name": "Press Release", "value": url},
                      {"name": "Date of Complaint Filing", "value": date_filing},
                      {"name": "Administrative Orders", "value": administrative_orders},
                      {"name": "Summarized Facts", "value": summarized_facts},
                      {"name": "Effective Date", "value": effective_date},
            ]
      
            my_id = helpers.make_id(name)
            if len(my_id) > 99:
                my_id = my_id[:99]
            
            yield {
                "_meta": {
                    "id": my_id,
                    "entity_type": "company"
                },
                "fields": fields,
                "name": name,
            }
            i += 2
        except:
            i += 1

    doc = BeautifulSoup(helpers.fetch_string(_base_url[2]), "html.parser")
    tr = doc.find_all('tr')
    i = 0
    while i < len(tr):
        try:
            td = tr[i].find_all('td')
            name = _get_name(td[3].get_text().strip())
            sanction = _get_date(td[1].get_text().strip())
            summarized_facts = td[7].get_text().strip()
            nomer = td[2].get_text().strip()
            types_business = td[4].get_text().strip()
            relevant_law = td[5].get_text().strip()
            section = td[6].get_text().strip()
            baht = td[10].get_text().strip()
    
            fields = [{"name": "Date of Imposing the Administrative Sanction", "value": sanction},
                      {"name": "Types of Business", "value": types_business},
                      {"name": "Summarized Facts", "value": summarized_facts},
                      {"name": "Order Number", "value": nomer},
                      {"name": "Relevant Law", "value": relevant_law},
                      {"name": "Section", "value": section},
                      {"name": "Amount of Fines (Baht)", "value": baht},
            ]
            
            my_id = helpers.make_id(name)
            if len(my_id) > 99:
                my_id = my_id[:99]
    
            yield {
                "_meta": {
                    "id": my_id,
                    "entity_type": "company"
                },
                "fields": fields,
                "name": name,
            }
            i += 2
        except:
            i += 1
Example No. 34
def _get_people(party_obj):
    people_obj = []
    for party in party_obj:
        modified_url = party['party_url'] + PAGINATION
        modified_url = quote(modified_url, safe=SAFE_QUOTE)

        page_object = BeautifulSoup(helpers.fetch_string(modified_url, cache_hours=6))

        table_party = page_object.find('table', {'class': 'telbogTable'}).find_all('tr')[1:]
        for person in table_party:
            person_url = _site_ulr + person['onclick'].replace('document.location=(\'', '').replace('\')', '')
            person_id = person_url.split('/')[-1].strip('.aspx')
            all_td = person.find_all('td')
            person_name = ' '.join([_.text for _ in all_td[:2]])
            position = all_td[3].text
            phone = all_td[4].text.split(':')[-1].strip()

            try:
                profile_pic = _site_ulr + '/' + all_td[-1].find('img')['src']
            except TypeError:
                profile_pic = None

            people_entity = _create_entity()

            people_entity['_meta']['entity_type'] = 'person'
            people_entity['name'] = person_name
            conc_names = person_name + position + person_id
            people_entity['_meta']['id'] = _create_id(conc_names)

            fields = [{'tag': 'political_party', 'value': party['party_name']},
                      {'tag': 'url', 'value': person_url},
                      {'tag': 'position', 'value': position},
                      {'tag': 'phone_number', 'value': phone.strip('View biography')},
                      {'tag': 'country', 'value': 'Denmark'},
                      {'tag': 'person_name', 'value': person_name}]
            if profile_pic:
                fields.append(
                    {
                        'tag': 'picture_url',
                        'value': profile_pic.replace('~/media/', 'Members/~/media/').replace('84', '133').replace('x84',
                                                                                                                  'x133')
                    }
                )

            open_person_url = BeautifulSoup(helpers.fetch_string(quote(person_url, safe=SAFE_QUOTE), cache_hours=6))

            bio = open_person_url.find('div', {'class': 'tabContent clearfix'})
            first_block = bio.find('p').text
            regexp_born_in_place = re.compile('born (.+),')
            regexp_born = re.compile(r'born (.+).')

            try:
                born_string = regexp_born_in_place.search(first_block).group(0).split(',')[0].strip('born ')
            except AttributeError:
                born_string = regexp_born.search(first_block).group(0).split(',')[0].split('.')[0].strip('born ')

            if 'in' in born_string or ' at ' in born_string or ' on ' in born_string:
                try:
                    date, place = born_string.split(' in ')
                except ValueError:
                    try:
                        date, place = born_string.split(' at ')
                    except ValueError:
                        date, place = born_string.split(' on ')

                fields.append({'tag': 'date_of_birth', 'value': str(parser.parse(date)).split(' ')[0]})
                fields.append({'tag': 'place_of_birth', 'value': place})
            else:
                try:
                    fields.append({'tag': 'date_of_birth', 'value': str(parser.parse(born_string)).split(' ')[0]})
                except ValueError:
                    fields.append(
                        {'tag': 'date_of_birth', 'value': str(parser.parse(born_string.split('.')[0])).split(' ')[0]})

            people_entity['fields'] = fields
            people_obj.append(people_entity)
    return people_obj
Example No. 35
def _get_people(party_obj):
    people_obj = []
    for party in party_obj:
        modified_url = party['party_url'] + PAGINATION
        modified_url = quote(modified_url, safe=SAFE_QUOTE)

        page_object = BeautifulSoup(
            helpers.fetch_string(modified_url, cache_hours=6))

        table_party = page_object.find('table', {
            'class': 'telbogTable'
        }).find_all('tr')[1:]
        for person in table_party:
            person_url = _site_ulr + person['onclick'].replace(
                'document.location=(\'', '').replace('\')', '')
            person_id = person_url.split('/')[-1].strip('.aspx')
            all_td = person.find_all('td')
            person_name = ' '.join([_.text for _ in all_td[:2]])
            position = all_td[3].text
            phone = all_td[4].text.split(':')[-1].strip()

            try:
                profile_pic = _site_ulr + '/' + all_td[-1].find('img')['src']
            except TypeError:
                profile_pic = None

            people_entity = _create_entity()

            people_entity['_meta']['entity_type'] = 'person'
            people_entity['name'] = person_name
            conc_names = person_name + position + person_id
            people_entity['_meta']['id'] = _create_id(conc_names)

            fields = [{
                'tag': 'political_party',
                'value': party['party_name']
            }, {
                'tag': 'url',
                'value': person_url
            }, {
                'tag': 'position',
                'value': position
            }, {
                'tag': 'phone_number',
                'value': phone.strip('View biography')
            }, {
                'tag': 'country',
                'value': 'Denmark'
            }, {
                'tag': 'person_name',
                'value': person_name
            }]
            if profile_pic:
                fields.append({
                    'tag': 'picture_url',
                    'value': profile_pic.replace('~/media/', 'Members/~/media/')
                                        .replace('84', '133')
                                        .replace('x84', 'x133')
                })

            open_person_url = BeautifulSoup(
                helpers.fetch_string(quote(person_url, safe=SAFE_QUOTE),
                                     cache_hours=6))

            bio = open_person_url.find('div', {'class': 'tabContent clearfix'})
            first_block = bio.find('p').text
            regexp_born_in_place = re.compile('born (.+),')
            regexp_born = re.compile(r'born (.+).')

            try:
                born_string = regexp_born_in_place.search(first_block).group(
                    0).split(',')[0].strip('born ')
            except AttributeError:
                born_string = regexp_born.search(first_block).group(0).split(
                    ',')[0].split('.')[0].strip('born ')

            if 'in' in born_string or ' at ' in born_string or ' on ' in born_string:
                try:
                    date, place = born_string.split(' in ')
                except ValueError:
                    try:
                        date, place = born_string.split(' at ')
                    except ValueError:
                        date, place = born_string.split(' on ')

                fields.append({
                    'tag': 'date_of_birth',
                    'value': str(parser.parse(date)).split(' ')[0]
                })
                fields.append({'tag': 'place_of_birth', 'value': place})
            else:
                try:
                    fields.append({
                        'tag':
                        'date_of_birth',
                        'value':
                        str(parser.parse(born_string)).split(' ')[0]
                    })
                except ValueError:
                    fields.append({
                        'tag':
                        'date_of_birth',
                        'value':
                        str(parser.parse(
                            born_string.split('.')[0])).split(' ')[0]
                    })

            people_entity['fields'] = fields
            people_obj.append(people_entity)
    return people_obj
Example No. 36
def _generate_entities():
    """for each scrapable page, yield an entity"""

    for url in _get_scrape_urls():

        doc = BeautifulSoup(helpers.fetch_string(url, cache_hours=6),
                            "html.parser")
        section = doc.find('section', id="pageContent")
        h1 = section.find("h1").get_text().strip()

        if '(' in h1:
            h1_without_bracket = h1.split('(')[0] + h1.split(')')[-1]
            h1_without_bracket = h1_without_bracket.strip()
        else:
            h1_without_bracket = h1

        names1 = h1_without_bracket.split(',')
        names2 = []
        for name in names1:
            for new_name in divide_name(name, ' & '):
                names2.append(new_name)
        new_names = []

        for name in names2:
            for new_name in divide_name(name, ' and '):
                new_names.append(new_name)

        text = section.find("p").get_text().strip()
        fields = [{
            "tag": "url",
            "value": url
        }, {
            "name": "text",
            "value": text
        }]

        custom_fields = section.find_all("h2")
        for custom_field in custom_fields:
            field_name = custom_field.get_text().strip()
            if field_name == 'Defendants':
                values1 = section.find_all('div', class_="chargeDefendant")
                values2 = section.find_all('div', class_="chargeCharge")
                values = zip(values1, values2)
                field_value = ' '.join([
                    value[0].get_text().strip() + ' ' +
                    value[1].get_text().strip() for value in values
                ])
            else:
                field_value = custom_field.find_next_sibling(
                    'p').get_text().strip()
            fields.append({"tag": field_name, "value": field_value})

        for name in new_names:

            name = name.strip()

            yield {
                "_meta": {
                    "id": helpers.make_id(name),
                    "entity_type": "company"
                },
                "fields": fields,
                "name": name
            }