Example #1
    def test_divine_interp_tag(self):

        # HD1 elements
        HD = bS('<HD1>Introduction\n</HD1>', 'lxml-xml').find('HD1')
        self.assertEqual(divine_interp_tag_use(HD, '1002'), 'intro')
        HD = bS('<HD1>Appendix X - X-rays\n</HD1>', 'lxml-xml').find('HD1')
        self.assertEqual(divine_interp_tag_use(HD, '1002'), 'appendix')
        HD = bS('<HD1>Appendices G & H - Cane\n</HD1>', 'lxml-xml').find('HD1')
        self.assertEqual(divine_interp_tag_use(HD, '1002'), 'appendices')
        HD = bS('<HD1>Section 1002.4 - Known\n</HD1>', 'lxml-xml').find('HD1')
        self.assertEqual(divine_interp_tag_use(HD, '1002'), 'section')
        HD = bS('<HD1>Inevitable Random HD1\n</HD1>', 'lxml-xml').find('HD1')
        self.assertEqual(divine_interp_tag_use(HD, '1002'), '')
        # HD2 elements
        HD = bS('<HD2>Section 1002.4 - Known\n</HD2>', 'lxml-xml').find('HD2')
        self.assertEqual(divine_interp_tag_use(HD, '1002'), 'section')
        HD = bS('<HD2>2(b) Application\n</HD2>', 'lxml-xml').find('HD2')
        self.assertEqual(divine_interp_tag_use(HD, '1002'), 'graph_id')
        # HD3 elements
        HD = bS('<HD3>Section 1002.4 - Known\n</HD3>', 'lxml-xml').find('HD3')
        self.assertEqual(divine_interp_tag_use(HD, '1002'), 'section')
        HD = bS('<HD3>2(b) Application\n</HD3>', 'lxml-xml').find('HD3')
        self.assertEqual(divine_interp_tag_use(HD, '1002'), 'graph_id')
        HD = bS('<HD3>(b) Application\n</HD3>', 'lxml-xml').find('HD3')
        self.assertEqual(
            divine_interp_tag_use(HD, '1030'),
            'graph_id_inferred_section')
 def test_parse_interp_graph_reference(self):
     valid_graph_element = bS("<HD3>Paragraph 2(c)(1)</HD3>", 'lxml-xml')
     self.assertEqual(
         parse_interp_graph_reference(valid_graph_element, '1002', '2'),
         "2-c-1-Interp")
     invalid_graph_element = bS("<HD3>Paragraph X(5)(a)</HD3>", 'lxml-xml')
     self.assertEqual(
         parse_interp_graph_reference(invalid_graph_element, '1002', '2'),
         "")
     valid_inferred_section_graph_element = bS(
         "<HD3>Paragraph (c)(1)</HD3>", 'lxml-xml')
     self.assertEqual(
         parse_interp_graph_reference(
             valid_inferred_section_graph_element, '1030', '2'),
         "2-c-1-Interp")
Example #4
 def test_parse_interp_graph_reference(self):
     valid_graph_element = bS("<HD3>Paragraph 2(c)(1)</HD3>", 'lxml-xml')
     self.assertEqual(ecfr_importer.parse_interp_graph_reference(
         valid_graph_element, '1002', '2'),
         "2-c-1-Interp")
     invalid_graph_element = bS("<HD3>Paragraph X(5)(a)</HD3>", 'lxml-xml')
     self.assertEqual(ecfr_importer.parse_interp_graph_reference(
         invalid_graph_element, '1002', '2'),
         "")
     valid_inferred_section_graph_element = bS(
         "<HD3>Paragraph (c)(1)</HD3>", 'lxml-xml')
     self.assertEqual(
         ecfr_importer.parse_interp_graph_reference(
             valid_inferred_section_graph_element, '1030', '2'),
         "2-c-1-Interp")
Example #5
def index():
    if request.method == 'POST':
        try:
            searchstring = request.form['content'].replace(" ", "%20")
            flipkart_url = "https://www.flipkart.com/search?q=" + searchstring
            uclient = uReq(flipkart_url)
            flipkartpage = uclient.read()
            uclient.close()
            flipkart_html = bS(flipkartpage, "html.parser")
            bigboxes = flipkart_html.findAll("div",
                                             {"class": "bhgxx2 col-12-12"})

            del bigboxes[0:3]
            del bigboxes[-5:]
            alldata = {}

            for boxes in bigboxes:

                productname = boxes.find("div", {"class": "_3wU53n"})
                productLinks = "https://www.flipkart.com" + boxes.div.div.div.a[
                    'href']
                alldata[productname.text] = productLinks
            with open('url.txt',
                      'w') as file:  #dumping the url dict in a txt file
                file.write(json.dumps(alldata))

            return render_template("links.html", alldata=alldata)
        except Exception as e:
            print('The Exception message is: ', e)
            return 'something is wrong'
        return render_template("links.html")
    else:
        return render_template("index.html")
Example #6
    def get_url_content(self, news_source):
        """
        Returns the content of the called RSS Feed

        We might need this part in order to verify the validity of the request

        user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
        headers = {'User-Agent':user_agent}
        But I am not sure how to incorporate this into the feedparser.

        :param news_source: nos, rtl or nu
        :return: list of (source-label, text) tuples with the cleaned text of each feed entry.
        """
        import feedparser  # Useful for parsing RSS feeds
        from helper.miscfunction import dict_find
        from bs4 import BeautifulSoup as bS

        url_label = self.get_label(news_source)
        url_list = self.get_url_feed(news_source)
        rss_text = []
        for i_label, i_url in zip(url_label, url_list):
            url_content = feedparser.parse(i_url)
            rss_html_text = list(dict_find('value', url_content))
            for i_html in rss_html_text:
                process_html = bS(i_html, 'lxml').text.replace('\n', ' ')
                rss_text.append((news_source + '-' + i_label, process_html))
        return rss_text
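The docstring above leaves open how to attach a User-Agent when fetching the feed. A minimal sketch, assuming a recent feedparser release: feedparser.parse() is documented to accept an agent keyword, and newer versions also take request_headers (both usages below should be verified against the installed version):

import feedparser

USER_AGENT = ('Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) '
              'Gecko/2009021910 Firefox/3.0.7')

# Assumed keyword: `agent` sets the User-Agent for the feed request.
url_content = feedparser.parse('https://example.com/feed.rss', agent=USER_AGENT)

# Some versions also accept explicit per-request headers:
# url_content = feedparser.parse('https://example.com/feed.rss',
#                                request_headers={'User-Agent': USER_AGENT})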
Example #7
 def getresponse(self):
     self.response = requests.get(self.translatelink,
                                  headers={'User-Agent': 'Mozilla/5.0'})
     soup = bS(self.response.text, 'html.parser')
     b = []
     # This piece looks for the first 2 translations:
     # This next piece looks for remaining translations in <a> tag:
     a = soup.find_all('a', class_='translation')
     for i in a:
         if i.text:
             word = i.text.replace("\n", "")
             word = word.lstrip()
             if word != 'Translation':
                 b.append(word)
     # And this piece searches for remaining translations in <div> tag:
     a = soup.find_all('div', class_='translation')
     for i in a:
         if i.text:
             word = i.text.replace("\n", "")
             word = word.lstrip()
             b.append(word)
     self.translations = b
     # This piece searches the all translations and examples:
     examp = soup.find_all("div", {"class": {"src ltr", "trg ltr"}})
     res = []
     for i in examp:
         a = i.text.strip()
         res.append(a)
     self.examples = res
Example #8
def grabcharsfromfile(htmlfile: str, cur_file: int) -> List[Character]:
    with open(htmlfile) as charfile:
        char_bs = bS(charfile, "lxml")
    chartable = char_bs.table  # All characters exist in the same table
    characterblocks = chartable.findAll("p")
    numchars = len(characterblocks) // 4
    curspot = 0
    characters = []
    for x in range(0, numchars):
        mark = characterblocks[curspot].contents[1].string
        curspot += 1
        charauthorblock = characterblocks[curspot].string
        if len(charauthorblock) > 4:  # we have an author
            author = charauthorblock[4::]
        else:
            author = ""
        curspot += 1
        charworkblock = characterblocks[curspot].string
        if len(charworkblock) > 3:
            work = charworkblock[3::]
        else:
            work = ""
        curspot += 1
        characterblocks[curspot].unwrap()
        filepathname = characterblocks[curspot].attrs['id']
        work_id = filepathname[:8]
        page_id = filepathname[18:26]
        coords = splitcoordinates(filepathname[18:])
        curspot += 1
        characters.append(Character(mark, author, work, work_id, page_id, coords))
    return characters
Example #9
    def parse(self):
        for i in range(len(self.source)):
            soup = bS(self.source[i], features='lxml')
            content = soup.find_all("li", class_="")
            ref_ = [
                c.get('data-id') for c in content
                if c.get('data-id') is not None
            ]
            for j in range(len(ref_)):
                data = soup.find(attrs={'data-id': '{}'.format(ref_[j])})
                ref = ref_[j]
                topic_ = data.find("a", class_="lien-jv topic-title")
                topic = topic_.text.strip()
                t_link = os.environ.get('BASE_') + topic_.get('href')
                count = data.find("span", class_="topic-count").text.strip()
                time = data.find("span", class_="topic-date").text.strip()
                today = date.today()
                if hour.match(time):
                    date_ = str(today) + 'T' + str(time)
                else:
                    date_ = parse(time)
                topics = {'topic': topic,
                          'topic_link': t_link,
                          'ref': ref,
                          'keywords': self.search,
                          'post_count': int(count),
                          'indexed': 0,
                          'last_msg_date': date_,
                          'collection_date': datetime.now()}
                ES_.index(index=os.environ.get('INDEX_'), id=ref, body=topics)
                self.counter.append(1)
            self.refs.extend(ref_)
        CL_.close()

        self.update_index()
Example #10
def get_price_from_flip_kart(link, class_name):
    page = urllib.request.urlopen(link)

    soup = bS(page, features="html.parser")
    value = soup.find('div', class_=class_name).string
    flip_kart_price = int(value[1::])
    return flip_kart_price
Example #11
    def fetch(this):

        articles = []
        while this.url:
            
            response = requests.get(this.url)
            data = bS(response.text, "html.parser")
            cards = data.select(".card")
            
            for card in cards:
                emoji = card.select_one(".emoji").text
                textCard = card.select_one(".card-text").text
                headerText = card.select("span")[1].text
                img = urljoin(this.url,card.select_one("img").attrs['src'])
                site = this.url

                newArticle = Article(headerText, emoji, textCard, img, site)
                articles.append(newArticle)
            # End for card in cards
            
            nextSiteExists = data.select_one(".navigation a")
            if nextSiteExists is not None:
                nextPage = urljoin(this.url,nextSiteExists.attrs['href'])
                this.url = nextPage
            else:
                this.url = False
    
        return articles
Example #12
async def get_election_offices():
    """Starting point of the scraper program. Scrapes BASE_URL for election office
    information and both dumps results to a .json file and returns the results as json.

    @return: list of scraped results as json.
    """
    # Get list of county names from registrar to populate form
    # Define coroutine functions (context managers)
    async with CloudflareScraper() as session:
        async with session.get(BASE_URL) as s:
            # ClientResponse.read() is a coroutine function so it must be awaited
            text = await s.read()
        soup = bS(text, "html5lib")

        info_list = soup.findAll("area")
        counties = [info['alt'] for info in info_list]
        county_urls = [info['href'] for info in info_list]

        # Use list of counties and IDs to get county info for each county
        tasks: List[Task] = []
        num_scraped = 0
        master_list = []

        for i in range(len(counties)):
            # Create task for a future asynchronous operation and store it in task list
            tasks.append(
                asyncio.create_task(
                    scrape_one_county(session, counties[i], county_urls[i])))

        # Run the coroutines and iterate over the yielded results as they complete
        # (out-of-order). Use asyncio.gather() with a couple code modifications to
        # preserve list order
        future: Future[Tuple[str, str, str, str, str]]
        for future in asyncio.as_completed(tasks):
            # Unpack awaited result of scrape_one_county()
            (
                address,
                county_website,
                phone_number,
                email_address,
                county_name,
            ) = await future
            schema = format_data_into_schema(
                address,
                county_website,
                phone_number,
                email_address,
                county_name,
            )
            master_list.append(schema)
            num_scraped += 1
            print(f"[New York] Scraped {county_name} county: "
                  f"#{num_scraped} of {len(counties)} .... "
                  f"[{round((num_scraped / len(counties)) * 100, 2)}%]")
    master_list = sorted(master_list, key=lambda county: county['countyName'])

    with open(os.path.join(ROOT_DIR, "scrapers", "new_york", "new_york.json"),
              "w") as f:
        json.dump(master_list, f)
    return master_list
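get_election_offices() above is a coroutine, so it needs an event loop to run. A minimal sketch of a synchronous entry point, assuming it lives in the same module as the scraper (the __main__ guard is not part of the original):

import asyncio

if __name__ == "__main__":
    # Drive the scraper coroutine to completion and report the result count.
    offices = asyncio.run(get_election_offices())
    print(f"Collected {len(offices)} New York county offices")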
 def test_parse_appendix_elements_with_interp_ref(self):
     from regulations3k.scripts.ecfr_importer import PAYLOAD
     PAYLOAD.interp_refs.update(
         {'1002-A': {'1': 'see(1002-A-1-Interp)'}})
     p_soup = bS(self.test_xml, 'lxml-xml')
     appendix = p_soup.find('DIV5').find('DIV9')
     parsed_appendix = parse_appendix_elements(appendix, '1002-A')
     self.assertIn("see(1002-A-1-Interp)", parsed_appendix)
Example #15
 def test_set_table(self):
     PAYLOAD.reset()
     p_soup = bS(self.test_xml, 'lxml-xml')
     appendix = p_soup.find('DIV5').find('DIV9')
     table = appendix.find('TABLE')
     table_id = 'table-A-0'
     ecfr_importer.set_table(table, table_id)
     self.assertIn(table_id, PAYLOAD.tables.keys())
     self.assertTrue(isinstance(PAYLOAD.tables[table_id], RegTable))
Example #16
def list_sub_ids(start):
    url = 'https://movie.douban.com/top250'
    params = {"start": start}

    response = requests.get(url, params=params, headers=headers)
    assert response.status_code == 200

    soup = bS(response.text, 'html.parser')
    items = soup.find_all('div', 'item')
    return list(map(get_id_from_item, items))
 def test_parse_appendix_elements_with_interp_ref(self):
     PAYLOAD.interp_refs.update({'A': {'1': 'see(A-1-Interp)'}})
     p_soup = bS(self.test_xml, 'lxml-xml')
     appendix = p_soup.find('DIV5').find('DIV9')
     parsed_appendix = ecfr_importer.parse_appendix_elements(appendix, 'A')
     self.assertIn("{1}", parsed_appendix)
     self.assertIn("(print or type):__", parsed_appendix)
     self.assertIn("<table>", parsed_appendix)
     self.assertIn("![image-A-1]", parsed_appendix)
     self.assertIn("{table-A-0}", PAYLOAD.tables.keys())
Example #18
async def get_election_offices():
    async with aiohttp.ClientSession() as session:
        async with session.get(URL) as r:
            text = await r.read()

    soup = bS(text.decode("utf-8"), "html.parser")
    elems = soup.find_all("td")

    master_list = []

    for e in elems:
        text = [i.strip() for i in e.get_text('\n').split('\n') if i.strip()]
        if not text:
            continue

        county = text[0]
        clerk = text[1].split(":")[-1].strip()
        email = text[
            2] if county != "Daggett" else "*****@*****.**"
        street_number_name = text[3] if 'UT' in text[
            4] else f"{text[3]}, {text[4]}"
        city = text[-3].split(",")[0]
        zip_code = text[-3].split()[-1]
        phone = text[-2].split(":")[-1].strip()

        subschema = format_address_data(street_number_name, county)

        schema = {
            "countyName": county,
            "physicalAddress": {
                "city": city,
                "state": "Utah",
                "zipCode": zip_code,
                "locationName": subschema["locationName"],
            },
            "phone": phone,
            "email": email,
            "officeSupervisor": clerk,
            "website": URL,
        }

        if "poBox" in subschema:
            schema["physicalAddress"]["poBox"] = subschema["poBox"]
        if "aptNumber" in subschema:
            schema["physicalAddress"]["aptNumber"] = subschema["aptNumber"]
        if "streetNumberName" in subschema:
            schema["physicalAddress"]["streetNumberName"] = subschema[
                "streetNumberName"]

        master_list.append(schema)

    with open(os.path.join(ROOT_DIR, "scrapers", "utah", "utah.json"),
              "w") as f:
        json.dump(master_list, f)
    return master_list
def ecfr_to_regdown(part_number, file_path=None):
    """
    Extract a regulation Part from eCFR XML, and create regdown content.

    The default XML source is the latest regulation posting at www.gpo.gov,
    which gets updated every few days.

    If `file_path` is specified, a local XML file is parsed instead.

    DIV1 is a title (as in Title 12)
    DIV3 is a chapter (not used here)
    DIV5 is a part
    DIV6 is a subpart
    DIV8 is a section
    DIV9 is an appendix
    DIV9 element whose HEAD starts with 'Supplement I' is an interpretation

    To avoid mischief, we make sure the part number is on a whitelist.
    """
    PAYLOAD.reset()
    if part_number not in PART_ALLOWLIST:
        raise ValueError("Provided Part number is not a CFPB regulation.")
    starter = datetime.datetime.now()
    if file_path:
        try:
            with open(file_path, "r") as f:
                markup = f.read()
        except IOError:
            logger.info("Could not open local file {}".format(file_path))
            return
    else:
        ecfr_request = requests.get(LATEST_ECFR)
        if not ecfr_request.ok:
            logger.info(
                "ECFR request failed with code {} and reason {}".format(
                    ecfr_request.status_code, ecfr_request.reason
                )
            )
            return
        ecfr_request.encoding = "utf-8"
        markup = ecfr_request.text
    soup = bS(markup, "lxml-xml")
    parts = soup.find_all("DIV5")
    part_soup = [div for div in parts if div["N"] == part_number][0]
    PAYLOAD.get_effective_date(part_number)
    PAYLOAD.parse_part(part_soup, part_number)
    part = PAYLOAD.part
    PAYLOAD.parse_version(part_soup, part)
    # parse_subparts will create and associate sections and appendices
    parse_subparts(part_soup, part)
    msg = "Draft version of Part {} created.\n" "Parsing took {}".format(
        part_number, (datetime.datetime.now() - starter)
    )
    return msg
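A hedged usage sketch for ecfr_to_regdown(). The import path mirrors the regulations3k.scripts.ecfr_importer module referenced by the tests on this page, and the local file path is purely illustrative:

# Assumed import path and sample file path; adjust to the actual project layout.
from regulations3k.scripts.ecfr_importer import ecfr_to_regdown

# Parse the latest eCFR posting for Part 1002 (Regulation B)
print(ecfr_to_regdown('1002'))

# Or parse a previously downloaded eCFR XML file instead of fetching it
print(ecfr_to_regdown('1002', file_path='/tmp/ECFR-title12.xml'))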
Example #20
 def test_parse_appendices_creation(self):
     PAYLOAD.reset()
     self.assertEqual(len(PAYLOAD.appendices), 0)
     test_part = Part.objects.first()
     test_subpart = Subpart.objects.first()
     PAYLOAD.subparts['appendix_subpart'] = test_subpart
     PAYLOAD.interp_refs.update({'A': {'1': 'see(A-1-Interp)'}})
     soup = bS(self.test_xml, 'lxml-xml')
     test_appendices = [soup.find('DIV5').find('DIV9')]
     ecfr_importer.parse_appendices(test_appendices, test_part)
     self.assertEqual(len(PAYLOAD.appendices), 1)
Example #21
def ecfr_to_regdown(part_number, file_path=None):
    """
    Extract a regulation Part from eCFR XML, and create regdown content.

    The default XML source is the latest regulation posting at www.gpo.gov,
    which gets updated every few days.

    If `file_path` is specified, a local XML file is parsed instead.

    DIV1 is a title (as in Title 12)
    DIV3 is a chapter (not used here)
    DIV5 is a part
    DIV6 is a subpart
    DIV8 is a section
    DIV9 is an appendix
    DIV9 element whose HEAD starts with 'Supplement I' is an interpretation

    To avoid mischief, we make sure the part number is on a whitelist.
    """
    PAYLOAD.reset()
    if part_number not in PART_WHITELIST:
        raise ValueError('Provided Part number is not a CFPB regulation.')
    starter = datetime.datetime.now()
    if file_path:
        try:
            with open(file_path, 'r') as f:
                markup = f.read()
        except IOError:
            logger.info("Could not open local file {}".format(file_path))
            return
    else:
        ecfr_request = requests.get(LATEST_ECFR)
        if not ecfr_request.ok:
            logger.info(
                "ECFR request failed with code {} and reason {}".format(
                    ecfr_request.status_code, ecfr_request.reason))
            return
        ecfr_request.encoding = 'utf-8'
        markup = ecfr_request.text
    soup = bS(markup, "lxml-xml")
    parts = soup.find_all('DIV5')
    part_soup = [div for div in parts if div['N'] == part_number][0]
    PAYLOAD.get_effective_date(part_number)
    PAYLOAD.parse_part(part_soup, part_number)
    part = PAYLOAD.part
    PAYLOAD.parse_version(part_soup, part)
    # parse_subparts will create and associate sections and appendices
    parse_subparts(part_soup, part)
    msg = (
        "Draft version of Part {} created.\n"
        "Parsing took {}".format(
            part_number, (datetime.datetime.now() - starter))
    )
    return msg
 def test_appendix_graph_parsing(self):
     ls = IdLevelState()
     p_soup = bS(self.test_xml, 'lxml-xml')
     graphs = p_soup.find_all('DIV5')[1].find_all('DIV9')[1].find_all('P')
     parsed_graph2 = ls.parse_appendix_graph(graphs[2], '1002-A')
     self.assertIn("(2) To the extent not included in item 1 above:",
                   parsed_graph2)
     parsed_graph3 = ls.parse_appendix_graph(graphs[3], '1002-A')
     self.assertIn("(i) National banks", parsed_graph3)
     ecfr_importer.parse_appendix_paragraphs(graphs, 'appendix', '1002-A')
     self.assertIn('\n1(a)', p_soup.text)
Example #24
 def test_parse_appendix_elements_with_interp_ref(self):
     PAYLOAD.interp_refs.update(
         {'A': {'1': 'see(A-1-Interp)'}})
     p_soup = bS(self.test_xml, 'lxml-xml')
     appendix = p_soup.find('DIV5').find('DIV9')
     parsed_appendix = ecfr_importer.parse_appendix_elements(
         appendix, 'A')
     self.assertIn("{1}", parsed_appendix)
     self.assertIn("(print or type):__", parsed_appendix)
     self.assertIn("<table>", parsed_appendix)
     self.assertIn("![image-A-1]", parsed_appendix)
     self.assertIn("{table-A-0}", PAYLOAD.tables.keys())
Example #25
 def test_appendix_id_type_sniffer(self):
     ls = IdLevelState()
     p_soup = bS(self.test_xml, 'lxml-xml')
     appendices = p_soup.find_all('DIV5')[1].find_all('DIV9')
     appendix_0_graphs = appendices[0].find_all('P')
     appendix_0_type = ls.sniff_appendix_id_type(appendix_0_graphs)
     self.assertEqual('appendix', appendix_0_type)
     appendix_1_graphs = appendices[1].find_all('P')
     appendix_1_type = ls.sniff_appendix_id_type(appendix_1_graphs)
     self.assertEqual('section', appendix_1_type)
     appendix_2_graphs = appendices[2].find_all('P')
     appendix_2_type = ls.sniff_appendix_id_type(appendix_2_graphs)
     self.assertIs(appendix_2_type, None)
Example #27
async def request_data_for_one_county(session: ClientSession, county_data):
    session = aiohttp.ClientSession(connector=aiohttp.TCPConnector(verify_ssl=False))
    async with session.post(NEW_URL, data=county_data) as req:
        text = await req.read()
    _soup = bS(text.decode("utf-8"), "html.parser")

    office_data = _soup.find(id="pnlClerk").find(class_="card-body").text
    example = {"\t": None, "\n": " ", "\r": None}
    table = office_data.maketrans(example)
    cleaned = office_data.translate(table)

    res = format_data_into_schema(county_data["CountyName"], cleaned)
    return res
Example #28
def review(product):
    try:
        file = open("url.txt", "r")
        contents = file.read()
        dictionary = ast.literal_eval(contents)
        productlink = dictionary[product]
        prodRes = requests.get(productlink)
        prodRes.encoding = 'utf-8'
        prod_html = bS(prodRes.text, "html.parser")
        print(prod_html)
        commentboxes = prod_html.find_all('div', {'class': "_3nrCtb"})
        reviews = []
        for commentbox in commentboxes:
            try:
                # name.encode(encoding='utf-8')
                name = commentbox.div.div.find_all(
                    'p', {'class': '_3LYOAd _3sxSiS'})[0].text
            except:
                name = 'No Name'
            try:
                # rating.encode(encoding='utf-8')
                rating = commentbox.div.div.div.div.text
            except:
                rating = 'No Rating'
            try:
                # commentHead.encode(encoding='utf-8')
                commentHead = commentbox.div.div.div.p.text
            except:
                commentHead = 'No Comment Heading'
            try:
                comtag = commentbox.div.div.find_all('div', {'class': ''})
                # custComment.encode(encoding='utf-8')
                custComment = comtag[0].div.text

            except Exception as e:
                print("Exception while creating dictionary: ", e)
            mydict = {
                "Product": product,
                "Name": name,
                "Rating": rating,
                "CommentHead": commentHead,
                "Comment": custComment
            }
            reviews.append(mydict)
        return render_template('results.html',
                               reviews=reviews[0:(len(reviews) - 1)])
    except Exception as e:
        print('The Exception message is: ', e)
        return 'something is wrong'
    return render_template('results.html')
Example #29
def imdbScraping():
    celebrityNameList = []
    celebrityDetails = {}
    counter = 0

    BASE_URL = "http://m.imdb.com/feature/bornondate"

    #I'm using Selenium tool to extract the content from the IMDB page as this page is dynamic in nature
    driver = webdriver.Chrome()
    driver.get(BASE_URL)
    html = driver.page_source
    #Creating soup object to html that I have generated with webdriver.
    soup = bS(html, "html5lib")
    content = soup.find('section', 'posters list')
    bornToday = content.findChild('h1').text

    # Looping through all the 'a' tags that contain the details required about the celebrities we are looking for
    for a in content.findAll('a', 'poster', limit=10):

        celebrityDetails[counter] = {}

        # Creating a dictionary that holds the details of each celebrity
        '''
        for 0 <= counter < 10, create a celebrity details entry for each celebrity we are interested in
            celebrityDetails{counter: {"celebrityName": "name",
                                       "celebrityImage": "image",
                                       "celebrityProfession": "profession",
                                       "celebrityBestWork": "bestWork"
                                       "celebritySentimentAnalysis": "sentimentAnalysis p/n/nt",
                                       }
                             }
        '''
        #Extracting all the required details
        celebrityName = a.find('span', 'title').text
        celebrityNameList.append(celebrityName)

        celebrityDetails[counter]["celebrityName"] = celebrityName
        celebrityDetails[counter]["celebrityImage"] = a.img['src']

        Profession, bestWork = a.find('div', 'detail').text.split(",", 1)

        celebrityDetails[counter]["celebrityProfession"] = Profession
        celebrityDetails[counter]["celebrityBestWork"] = bestWork

        counter += 1
        # print(counter)

    # Returning the celebrity name list and the celebrity details.
    return celebrityNameList, celebrityDetails
Example #30
async def scrape_one_county(session, county_id, county_name):
    data = {"idTown": county_id, "SubmitCounty": "Submit", "contactType": "R"}
    async with session.post(INFO_URL, data=data) as s:
        text = await s.read()
        soup = bS(text, "html5lib")

    table = soup.find("table", {"id": "Table1"})
    rows = table.find_all("tr")

    # Get county registrar name
    registrar_name = ""
    if "County Chief Registrar" in rows[0].getText():
        registrar_name = get_county_registrar(rows[0].getText())

    # Get mailing and physical addresses
    phys_address, mail_address = "", ""

    if ("Physical Address:" in rows[0].getText()
            and "SAME AS ABOVE" not in rows[0].getText()):
        phys_info_str = str(rows[0])
        phys_address = format_address_html(phys_info_str)

    mail_info_str = str(rows[1])
    mail_address = format_address_html(mail_info_str)

    # Get phone number
    phone_number = ""
    if "Telephone: " in rows[2].getText():
        contact_info_str = rows[2].getText()
        phone_number = get_phone_number(contact_info_str)

    # Get Email
    email_address = ""
    email = soup.find("span", class_="__cf_email__")
    if email is not None:
        hex_email = email["data-cfemail"]

        # function to decode hexadecimal email strings
        # lifted this off some stackoverflow post lol
        email_address = electionsaver.decode_email(hex_email)

    return (
        registrar_name,
        phys_address,
        mail_address,
        phone_number,
        email_address,
        county_name,
    )
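The electionsaver.decode_email helper used above is not shown on this page. Cloudflare's data-cfemail attribute is conventionally decoded by XOR-ing every hex byte with the first byte, so the helper presumably does something like this sketch (decode_cf_email is a hypothetical stand-in name):

def decode_cf_email(hex_email: str) -> str:
    """Decode a Cloudflare-obfuscated email: byte 0 is the XOR key for the rest."""
    key = int(hex_email[:2], 16)
    return ''.join(
        chr(int(hex_email[i:i + 2], 16) ^ key)
        for i in range(2, len(hex_email), 2)
    )

# Example (verifiable by hand): decode_cf_email("422302206c21") == "a@b.c"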
Example #31
def getAllHeadlines(category, outputFile):
    """
    will read all the headlines and save in the
    filename specified
    :return: none
    """
    url = ["https://marathi.abplive.com/news/"]  # using this website to scrap
    url[0] += category
    allHeadlines = []
    counter = 2  # this website track pages after 2

    for link in url:
        while link:
            if counter > DATA_LEN:
                break
            htmlDoc = ''
            print(f"Getting............ {link}")
            req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
            with urlopen(req) as response:
                for line in response:
                    line = line.decode('utf-8')
                    htmlDoc = htmlDoc + line.replace('\n', '')
                soup = bS(htmlDoc, 'html.parser')
                headlineDiv = soup.find_all(
                    'div', {'class': 'uk-width-3-5 fz20 p-10 newsList_ht'})
                for headLine in headlineDiv:
                    article = headLine.text

                    article = re.sub(r"\([^)]*\)", r'', article)
                    article = re.sub(r"\[[^\]]*\]", r'', article)
                    article = re.sub(r"<[^>]*>", r'', article)
                    article = re.sub(r"^https?://.*[\r\n]*", r'', article)
                    article = re.sub(r'^http?://.*[\r\n]*', r'', article)
                    article = article.replace(u'\ufeff', '')
                    article = article.replace(u'\xa0', u'')
                    article = article.replace('  ', ' ')
                    article = article.replace(' , ', ', ')
                    article = article.replace('-', '')
                    article += "\n"

                    allHeadlines.append(article)
            link = url[0] + "/page-" + str(counter)
            counter += 1

    print(f"Total headlines collected :: {len(allHeadlines)}")
    with open(outputFile, "w") as file:
        for headline in allHeadlines:
            file.write(str(headline))
    print("Done......................")
Example #32
async def get_election_offices():
    # page is dynamic--use selenium to execute the javascript before extracting data
    driver = WTVWebDriver("Missouri").get_webdriver()
    driver.get(URL)
    time.sleep(1)

    soup = bS(driver.page_source, "html.parser")
    elems = soup.find_all(class_="group")

    master_list = []

    for e in elems:
        text = [i.strip() for i in e.get_text("\n").split("\n") if i.strip()]

        county = text[0].split(",")[0].split(" County")[0].split(" Board")[0]
        street_number_name = text[1]
        city = text[2].split(",")[0]
        zip_code = text[2].split()[-1]
        phone = text[3]
        website = URL if len(text) == 6 else text[-1]

        subschema = format_address_data(street_number_name, county)

        schema = {
            "countyName": county,
            "physicalAddress": {
                "city": city,
                "state": "Missouri",
                "zipCode": zip_code,
                "locationName": subschema["locationName"],
            },
            "phone": phone,
            "website": website,
        }

        if "poBox" in subschema:
            schema["physicalAddress"]["poBox"] = subschema["poBox"]
        if "aptNumber" in subschema:
            schema["physicalAddress"]["aptNumber"] = subschema["aptNumber"]
        if "streetNumberName" in subschema:
            schema["physicalAddress"]["streetNumberName"] = subschema[
                "streetNumberName"]

        master_list.append(schema)

    with open(os.path.join(ROOT_DIR, "scrapers", "missouri", "missouri.json"),
              "w") as f:
        json.dump(master_list, f)
    return master_list
Example #33
def main():
    url = input("Enter the URL - ")
    #url = "https://www.keepinspiring.me/quotes-about-happiness/"
    
    headerS = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36"}
    page = requests.get(url, headers = headerS)
    soup = bS(page.content , 'html.parser')
    #print(soup.prettify())
    quotes = []
    for i in soup.find_all(class_="author-quotes"):
        quotes.append(i.get_text())
    f = open("happyQuotes.txt" ,"w" , encoding="utf-8")
    for quote in quotes:
        f.write(quote+"\n")
    f.close()
 def test_table_no_head_rows(self):
     test_table = ('<DIV>\n'
                   '<TABLE class="gpotbl_table">\n'
                   '<TR>\n'
                   '<TD>\n'
                   'Cell content\n'
                   '</TD>\n'
                   '</TR>'
                   '</TABLE>'
                   '</DIV>')
     table_soup = bS(test_table, 'lxml-xml').find('TABLE')
     table_label = '{table-test-label}'
     regtable = RegTable(table_label)
     msg = regtable.parse_xml_table(table_soup)
     self.assertEqual(msg, "Table is set for {}!".format(table_label))
     self.assertNotIn('<thead>', regtable.table())
Example #35
 def test_appendix_graph_parsing(self):
     ls = IdLevelState()
     p_soup = bS(self.test_xml, 'lxml-xml')
     graphs = p_soup.find_all('DIV5')[1].find_all('DIV9')[1].find_all('P')
     parsed_graph2 = ls.parse_appendix_graph(graphs[2], '1002-A')
     self.assertIn(
         "(2) To the extent not included in item 1 above:",
         parsed_graph2
     )
     parsed_graph3 = ls.parse_appendix_graph(graphs[3], '1002-A')
     self.assertIn(
         "(i) National banks",
         parsed_graph3
     )
     ecfr_importer.parse_appendix_paragraphs(graphs, 'appendix', '1002-A')
     self.assertIn('\n1(a)', p_soup.text)
Example #36
async def get_election_offices():
    """Starting point of the scraper program. Scrapes BASE_URL for election office
    information and both dumps results to a .json file and returns the results as json.

    @return: list of scraped results as json.
    """
    # Define coroutine functions (context managers)
    async with CloudflareScraper() as session:
        async with session.get(BASE_URL) as s:
            # ClientResponse.read() is a coroutine function so it must be awaited
            text = await s.read()
        soup = bS(text.decode("utf-8"), "html.parser")

        test_county_data = get_county_codes_and_names(soup)
        county_data = sorted(test_county_data, key=lambda k: k["countyName"])
        num_scraped = 0
        master_list = []

        # Create list that will store asyncio tasks
        tasks: List[Task] = []
        for county in county_data:
            code = county["countyCode"]
            name = county["countyName"]
            # Create task for a future asynchronous operation and store it in task list
            tasks.append(asyncio.create_task(scrape_one_county(session, code, name)))

        # Run the coroutines and iterate over the yielded results as they complete
        # (out-of-order). Use asyncio.gather() with a couple code modifications to
        # preserve list order
        future: Future[Tuple[str, str, str, str]]
        for future in asyncio.as_completed(tasks):
            # Unpack awaited result of scrape_one_county()
            cleaned_string, protected_email, _, county_name = await future
            schema = format_data_into_schema(
                cleaned_string, protected_email, county_name
            )
            master_list.append(schema)
            num_scraped += 1
            print(
                f"[Florida] Scraped {county_name} county: "
                f"#{num_scraped} of {len(county_data)} .... "
                f"[{round((num_scraped / len(county_data)) * 100, 2)}%]"
            )

    with open(os.path.join(ROOT_DIR, "scrapers", "florida", "florida.json"), "w") as f:
        json.dump(master_list, f)
    return master_list
Example #37
async def scrape_one_county(session, county_code, county_name):
    url = BASE_URL + "countyInfo.asp?county=" + county_code
    # s = scraper.get(url)
    async with session.get(url) as s:
        text = await s.read()
        soup = bS(text.decode("utf-8"), "html.parser")

    # relevant info is in a random <p> with no classes
    county_info = soup.find("p", attrs={"class": None}).text
    hex_email = soup.find("span", class_="__cf_email__")["data-cfemail"]

    # clean up \t \r \n tags from string
    example = {"\t": None, "\n": " ", "\r": None}
    table = county_info.maketrans(example)
    cleaned = county_info.translate(table)

    return cleaned, hex_email, county_code, county_name
Example #38
 def test_table_no_head_rows(self):
     test_table = (
         '<DIV>\n'
         '<TABLE class="gpotbl_table">\n'
         '<TR>\n'
         '<TD>\n'
         'Cell content\n'
         '</TD>\n'
         '</TR>'
         '</TABLE>'
         '</DIV>')
     table_soup = bS(test_table, 'lxml-xml').find('TABLE')
     table_label = '{table-test-label}'
     regtable = RegTable(table_label)
     msg = regtable.parse_xml_table(table_soup)
     self.assertEqual(msg, "Table is set for {}!".format(table_label))
     self.assertNotIn('<thead>', regtable.table())
Example #39
async def scrape_one_county(session, county_name):
    county_url = BASE_URL + county_name.lower()
    async with session.get(county_url) as s:
        text = await s.read()
        soup = bS(text, "html5lib")

    p_tags = soup.findAll('p')

    address = ''
    county_website = county_url
    phone_number = ''
    email_address = ''
    director_name = ''

    # Basically need to make a state machine and parse line by line, initially
    # scraping address components, then phone number, director, email, etc. This website sucks.

    # Variable to determine whether we are still scraping address.
    scraping_address = True
    for line in p_tags[3:]:
        if (phone_number == '' and '(' in line.text and ')' in line.text
                and "Post" not in line.text and "John" not in line.text
                and "Room" not in line.text):
            raw_number = line.text
            phone_number = raw_number.replace('Phone', '').replace(
                'Office', '').replace(':', '').strip()

            # No longer on an address line, so set to false.
            scraping_address = False

        if director_name == '' and 'Director' in line.text:
            end_index = line.text.index('Director')
            director_name = line.text[:end_index].replace('-', '').strip()

        if email_address == '' and '@' in line.text:
            email_address = line.text.strip()

        if 'Board of Voter Registration' in line.text:
            county_website = line.find('a')['href']

        if scraping_address:
            address = address + ' ' + line.text

    return (address, county_website, phone_number, email_address,
            director_name, county_name)
 def test_interp_graph_parsing(self):
     soup = bS(self.interp_xml, 'lxml-xml')
     part_soup = soup.find('DIV5')
     part = parse_part(part_soup, '1002')
     version = parse_version(part_soup, part)
     interp_subpart = Subpart(
         title="Supplement I to Part {}".format(part.part_number),
         label="Official Interpretations",
         version=version)
     interp_subpart.save()
     interp = [div for div
               in part_soup.find_all('DIV9')
               if div.find('HEAD').text.startswith('Supplement I')][0]
     parse_interps(interp, part, interp_subpart)
     self.assertEqual(
         Subpart.objects.filter(title__contains='Supplement I').count(),
         1,
     )
Example #41
 def test_interp_inferred_section_graph_parsing(self):
     PAYLOAD.reset()
     self.assertEqual(PAYLOAD.interp_refs, {})
     soup = bS(self.interp_xml, 'lxml-xml')
     parts = soup.find_all('DIV5')
     part_soup = [div for div in parts if div['N'] == '1030'][0]
     PAYLOAD.parse_part(part_soup, '1030')
     part = PAYLOAD.part
     PAYLOAD.parse_version(part_soup, part)
     version = PAYLOAD.version
     interp_subpart = Subpart(
         title="Supplement I to Part {}".format(part.part_number),
         label="Official Interpretations",
         version=version)
     interp_subpart.save()
     interp = [div for div
               in part_soup.find_all('DIV9')
               if div.find('HEAD').text.startswith('Supplement I')][0]
     ecfr_importer.parse_interps(interp, part, interp_subpart)
     self.assertEqual(PAYLOAD.interp_refs['1']['c'], 'see(1-c-Interp)')
Example #42
 def test_interp_graph_parsing(self):
     soup = bS(self.interp_xml, 'lxml-xml')
     part_soup = soup.find('DIV5')
     PAYLOAD.parse_part(part_soup, '1002')
     part = PAYLOAD.part
     PAYLOAD.parse_version(part_soup, part)
     version = PAYLOAD.version
     interp_subpart = Subpart(
         title="Supplement I to Part {}".format(part.part_number),
         label="Official Interpretations",
         version=version)
     interp_subpart.save()
     interp = [div for div
               in part_soup.find_all('DIV9')
               if div.find('HEAD').text.startswith('Supplement I')][0]
     ecfr_importer.parse_interps(interp, part, interp_subpart)
     self.assertEqual(
         Subpart.objects.filter(title__contains='Supplement I').count(),
         1,
     )
Example #43
 def test_parse_appendix_elements(self):
     p_soup = bS(self.test_xml, 'lxml-xml')
     appendix = p_soup.find('DIV5').find('DIV9')
     parsed_appendix = ecfr_importer.parse_appendix_elements(
         appendix, 'A')
     self.assertIn("**1.", parsed_appendix)
Example #44
 def test_parse_interp_graph_no_id(self):
     section_graph_element_no_id = bS(
         "<P>This is a bare interp paragraph with no ID.</P>", 'lxml-xml')
     parsed_graph = ecfr_importer.parse_interp_graph(
         section_graph_element_no_id)
     self.assertTrue(parsed_graph.startswith('This is a bare interp'))
Example #45
 def test_multi_id_paragraph_parsing(self):
     soup = bS(self.test_xml, 'lxml-xml')
     graph_soup = soup.find_all('P')
     parsed_graphs = ecfr_importer.parse_section_paragraphs(graph_soup, '1')
     self.assertIn('**(a) Delivery of account disclosures**', parsed_graphs)