Example #1
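    # Scrape the Florida Health COVID-19 FAQ: questions are <h4 class="panel-title">
    # headings and answers are <div class="panel-body"> blocks.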
    def scrape(self):
        name = 'FloridaGov'
        url = 'https://floridahealthcovid19.gov/frequently-asked-questions/'
        html = requests.get(url).text
        soup = BeautifulSoup(html, "lxml")

        questions = [
            str(q) for q in soup.findAll("h4", {"class": "panel-title"})
        ]
        answers = [
            str(a) for a in soup.findAll("div", {"class": "panel-body"})
        ]

        converter = Conversion(self._filename, self._path)
        for question, answer in zip(questions, answers):
            converter.addExample({
                'sourceUrl': url,
                'sourceName': name,
                "needUpdate": True,
                "typeOfInfo": "QA",
                "isAnnotated": False,
                "responseAuthority": "",
                "question": question,
                "answer": answer,
                "hasAnswer": True,
                "targetEducationLevel": "NA",
                "topic": [],
                "extraData": {},
                "targetLocation": "Florida",
                "language": "en"
            })
        return converter.write()
Example #2
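    # Scrape the Delaware coronavirus FAQ, which uses the same
    # panel-title/panel-body accordion markup as the Florida page.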
    def scrape(self):
        name = 'Delaware State Government'
        url = 'https://coronavirus.delaware.gov/what-delawareans-can-do/#faqs'
        html = requests.get(url).text
        soup = BeautifulSoup(html, "lxml")

        questions = [
            str(q) for q in soup.findAll("h4", {"class": "panel-title"})
        ]
        answers = [
            str(a) for a in soup.findAll("div", {"class": "panel-body"})
        ]

        converter = Conversion(self._filename, self._path)
        for question, answer in zip(questions, answers):
            converter.addExample({
                'sourceUrl': url,
                'sourceName': name,
                "needUpdate": True,
                "typeOfInfo": "QA",
                "isAnnotated": False,
                "responseAuthority": "",
                "question": question,
                "answer": answer,
                "hasAnswer": True,
                "targetEducationLevel": "NA",
                "topic": [],
                "extraData": {},
                "targetLocation": "Delaware",
                "language": "en"
            })
        return converter.write()
Example #3
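    # Scrape the North Dakota FAQ: Q/A rows are nested views-row divs inside
    # the fifth view-content block, parsed by the class's extraction helpers.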
    def scrape(self):
        name = 'North Dakota State Government'
        url = 'https://ndresponse.gov/covid-19-resources/covid-19-faqs'
        html = requests.get(url).text

        soup = BeautifulSoup(html, "lxml").findAll(
            'div',
            {'class': 'view-content'})[4].findAll('div',
                                                  {'class': 'views-row'})
        soup = [x.findAll('div', {'class': 'views-row'}) for x in soup]
        soup = list(itertools.chain.from_iterable(soup))

        questions = list(map(self._extract_question, soup))
        answers = list(map(self._extract_answer, soup))
        converter = Conversion(self._filename, self._path)
        for question, answer in zip(questions, answers):
            converter.addExample({
                'sourceUrl': url,
                'sourceName': name,
                "needUpdate": True,
                "typeOfInfo": "QA",
                "isAnnotated": False,
                "responseAuthority": "",
                "question": question,
                "answer": answer,
                "hasAnswer": True,
                "targetEducationLevel": "NA",
                "topic": [],
                "extraData": {},
                "targetLocation": "North Dakota",
                "language": "en"
            })
        return converter.write()
Example #4
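    # Scrape the Kansas KDHE FAQ: each <dl> under #modulecontent holds one
    # question/answer pair.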
    def scrape(self):
        name = 'Kansas Department of Health and Environment'
        url = 'https://ks-kdhecovid19.civicplus.com/faq.aspx'
        html = requests.get(url).text
        soup = BeautifulSoup(html, "lxml").find('div', {
            'id': 'modulecontent'
        }).findAll('dl')

        questions = list(map(self._extract_question, soup))
        answers = list(map(self._extract_answer, soup))

        converter = Conversion(self._filename, self._path)
        for question, answer in zip(questions, answers):
            converter.addExample({
                'sourceUrl': url,
                'sourceName': name,
                "needUpdate": True,
                "typeOfInfo": "QA",
                "isAnnotated": False,
                "responseAuthority": "",
                "question": question,
                "answer": answer,
                "hasAnswer": True,
                "targetEducationLevel": "NA",
                "topic": [],
                "extraData": {},
                "targetLocation": "Kansas",
                "language": "en"
            })
        return converter.write()
Example #5
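 # addExample should raise a KeyError when required schema fields are missing.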
 def test_key_exception(self):
     with self.assertRaises(KeyError) as ke:
         converter = Conversion('test', '.')
         converter.addExample({
             'sourceUrl': 'example.com',
             'language': 'en',
         })
Example #6
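 # addExample should raise a ValueError when a field has the wrong type
 # (here, sourceUrl is a list instead of a string).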
 def test_value_exception(self):
     with self.assertRaises(ValueError) as ve:
         converter = Conversion('test', '.')
         converter.addExample({
             'sourceUrl': ['example.com'],
             "language": 'en',
         })
Example #7
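    # Scrape the Vermont Department of Health FAQ; the page's "Updated:"
    # subtitle supplies the last-modified timestamp.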
    def scrape(self):
        name = 'Vermont Department of Health'
        url = 'https://apps.health.vermont.gov/COVID/faq/'
        html = requests.get(url).text

        lastUpdateTime = time.mktime(
            dateparser.parse(
                BeautifulSoup(html, "lxml").find('p', {'class': 'subtitle'})
                .getText().split('Updated:')[1].strip()).timetuple())

        soup = BeautifulSoup(html, "lxml").find(
            'ul', {'class': 'topics'}).findAll('li', {'class': 'faq'})

        questions = list(map(self._extract_question, soup))
        answers = list(map(self._extract_answer, soup))

        converter = Conversion(
            self._filename,
            self._path)
        for question, answer in zip(questions, answers):
            converter.addExample({
                'sourceUrl': url,
                'sourceName': name,
                "needUpdate": True,
                "typeOfInfo": "QA",
                "isAnnotated": False,
                "responseAuthority": "",
                "question": question,
                "answer": answer,
                "hasAnswer": True,
                "targetEducationLevel": "NA",
                "topic": [],
                "extraData": {},
                "targetLocation": "Vermont",
                "language": "en"
            })
        return converter.write()
Example #8
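 # Run every registered Scrapy spider in a single CrawlerProcess, then
 # convert the pooled results into schema examples.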
 def scrape(self):
     scraper_list = [
         Arbeitsagentur,
         BAUA,
         BMAS,
         BMG,
         BMWI,
         BVF,
         BZgA,
         BerlinerSenat,
         Bundesregierung,
         CDC_Children,
         CDC_Individuals,
         CDC_Travel,
         CDC_Water,
         ECDC,
         FHM_EN,
         FHM_SV,
         GOV_pl,
         IHK,
         KVB,
         RKI,
         Salute_IT,
         #UNICEF,
         WHO,
     ]
     logger = logging.getLogger(__name__)
     logging.disable(logging.WARNING)
     process = CrawlerProcess({
         'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
         'ITEM_PIPELINES': {
             '__main__.Pipeline': 1
         }
     })
     for crawler in scraper_list:
         process.crawl(crawler)
     process.start()
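     # RESULTS is assumed to be a module-level list that Pipeline fills
     # with one DataFrame per crawler.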
     df = pd.concat(RESULTS)
     converter = Conversion(self._filename, self._path)
     for _, row in df.iterrows():
         converter.addExample({
             'sourceUrl': row.link,
             'sourceName': row.source,
             "needUpdate": True,
             "typeOfInfo": "QA",
             "isAnnotated": False,
             "responseAuthority": "",
             "question": row.question,
             "answer": row.answer_html,
             "hasAnswer": bool(row.answer),
             "targetEducationLevel": "NA",
             "topic": [],
             "extraData": {},
             "targetLocation": row.country,
             "language": row.lang,
         })
     return converter.write()
Example #9
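    # Scrape CNN's interactive coronavirus Q&A; each question <div> carries
    # its topic tags as extra CSS classes.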
    def scrape(self):
        Block = namedtuple('Block', 'content tags')
        extra_data = {}

        url = 'https://www.cnn.com/interactive/2020/health/coronavirus-questions-answers/'
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        lastUpdatedTime = time.mktime(
            dateparser.parse(' '.join(
                soup.find('div', {
                    'class': 'cnnix-timestamp'
                }).getText().split()[1:]),
                             date_formats=['%B %d, %Y, %I %p']).timetuple())

        tags = [
            tag.get('data-topic')
            for tag in soup.find_all('div', attrs={'class': 'nav-button'})
        ]

        body = soup.find_all('div', attrs={'class':
                                           'interactive-container'})[1]

        blocks = []
        for div in body.find_all('div'):
            classes = div.get('class') or []  # divs without a class yield None
            if classes and classes[0] == 'question':
                block = Block(div, classes[1:])
                blocks.append(block)

        questions, answers, topics = [], [], []
        for block in blocks:
            question = block.content.find('div', attrs={'class': 'question-q'})
            answer = block.content.find('div', attrs={'class': 'question-a'})
            questions.append(str(question))
            answers.append(str(answer))
            topics.append(block.tags)

        converter = Conversion(self._filename, self._path)
        for q, a, t in zip(questions, answers, topics):
            converter.addExample({
                'sourceUrl': url,
                'sourceName': "CNN",
                "needUpdate": True,
                "typeOfInfo": "QA",
                "isAnnotated": False,
                "responseAuthority": "",
                "question": q,
                "answer": a,
                "hasAnswer": a is not None,
                "targetEducationLevel": "NA",
                "topic": t,
                "extraData": {},
                "targetLocation": "United States",
                "language": 'en',
            })
        return converter.write()
Example #10
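    # Scrape the Cleveland Clinic FAQ: <h5> headings are questions, the
    # elements between headings form answers, and "Q:"/"A:" prefixes are stripped.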
    def scrape(self):
        name = 'Cleveland Clinic'
        url = 'https://newsroom.clevelandclinic.org/2020/03/18/frequently-asked-questions-about-coronavirus-disease-2019-covid-19/'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        r = requests.get(url, headers=headers)
        soup = BeautifulSoup(r.text, "lxml")

        faq = soup.find("div", {"class": "entry-content"})
        answers, questions = [], []

        q = ''
        a = ''
        for e in faq.findAll(recursive=False):
            if e.name == 'h5':
                if q and a:
                    questions.append(q.replace('Q:', ''))
                    answers.append(a.replace('A:', ''))

                q = str(e)
                a = ''
            else:
                a += " " + str(e)
        if q and a:
            questions.append(q.replace('Q:', ''))
            answers.append(a.replace('A:', ''))

        lastUpdateTime = time.mktime(
            dateparser.parse(
                soup.find(
                    "h3",
                    {"entry-sub-title"}).getText().strip().replace("Updated ", "")).timetuple())

        converter = Conversion(
            self._filename,
            self._path)
        for question, answer in zip(questions, answers):
            converter.addExample({
                'sourceUrl': url,
                'sourceName': name,
                "needUpdate": True,
                "typeOfInfo": "QA",
                "isAnnotated": False,
                "responseAuthority": "",
                "question": question,
                "answer": answer,
                "hasAnswer": True,
                "targetEducationLevel": "NA",
                "topic": [],
                "extraData": {},
                "targetLocation": "Cleveland",
                "language": "en"
            })
        return converter.write()
Example #11
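    # Scrape the Texas DSHS FAQ: <h3> headings mark questions, with the
    # following <p>/<ul> elements accumulated as the answer.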
    def scrape(self):
        name = 'Texas Department of State Health Services'
        url = 'https://www.dshs.state.tx.us/coronavirus/faq.aspx'
        html = requests.get(url, verify=False).text
        soup = BeautifulSoup(html, "lxml")

        # faq is in the second div
        faq = soup.find(
            "div", {
                "id": "ctl00_ContentPlaceHolder1_uxContent"}).findAll(
            "div", recursive=False)[1]
        lastUpdateTime = time.mktime(
            time.strptime(
                soup.find(
                    "span",
                    {"lastUpdatedDate"}).getText().strip(),
                "%B %d, %Y"))
        questions, answers = [], []
        a = ''
        begun = False
        for e in faq.findAll(recursive=False):
            if e.name == 'h3':
                if begun:
                    questions.append(q)
                    answers.append(a)
                q = str(e)
                a = ''
                begun = True
            elif e.name == 'p' or e.name == 'ul':
                a += str(e)
        if begun:
            questions.append(q)
            answers.append(a)

        converter = Conversion(
            self._filename,
            self._path)
        for question, answer in zip(questions, answers):
            converter.addExample({
                'sourceUrl': url,
                'sourceName': name,
                "needUpdate": True,
                "typeOfInfo": "QA",
                "isAnnotated": False,
                "responseAuthority": "",
                "question": question,
                "answer": answer,
                "hasAnswer": True,
                "targetEducationLevel": "NA",
                "topic": [],
                "extraData": {},
                "targetLocation": "Texas",
                "language": 'en'
            })
        return converter.write()
Example #12
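 # Scrape a fixed list of JHU Hub articles, delegating per-page parsing to
 # self._scrape and ANDing each write result into one success flag.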
 def scrape(self):
     hub_links_to_scrape = ['https://hub.jhu.edu/2020/03/30/andrew-pekosz-immunity-seasonality/?fbclid=IwAR2LUcjr7Ltz6koe0IjRV3gr7E3tW0K6hqlcaYPtKQz3HBmjlQ7YRGrtgHw',
                            'https://hub.jhu.edu/2020/03/23/how-to-self-quarantine-self-isolate/?mc_cid=0ed1a231a3&mc_eid=9687fd9d33']
     success = True
     for link in hub_links_to_scrape:
         faqs, lastUpdateTime = self._scrape(link)
         converter = Conversion(
             self._filename,
             self._path)
         for faq in faqs:
             converter.addExample(faq)
         success &= converter.write()
     return success
Example #13
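    # Build examples from a local TSV info sheet rather than a web page,
    # skipping rows without answers.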
    def scrape(self):
        converter = Conversion(
            self._filename,
            self._path)

        df = pd.read_csv("COVID19infosheet - Info.tsv", sep="\t")
        df = self._clean_headers(df)
        df['json'] = df.apply(self._prepare_data, axis=1)
        for obj in df['json']:
            if not obj['hasAnswer']:
                continue
            converter.addExample(obj)

        return converter.write()
Example #14
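    # The Oregon FAQ table is rendered client-side, so this scraper drives a
    # headless Chrome (via CHROME_DRIVER_PATH) and waits for the cells to load.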
    def scrape(self):
        chrome_driver_path = os.environ['CHROME_DRIVER_PATH']
        name = 'Oregon Public Health Division'
        url = 'https://www.oregon.gov/oha/PH/DISEASESCONDITIONS/DISEASESAZ/Pages/COVID19-FAQ.aspx?wp1284=l:100'
        opts = Options()
        opts.set_headless()
        driver = webdriver.Chrome(executable_path=chrome_driver_path, chrome_options=opts)
        driver.get(url)
        try:
            WebDriverWait(driver, 60).until(ec.presence_of_element_located((By.TAG_NAME, 'td')))
        except Exception:
            driver.quit()  # release the browser on timeout instead of leaking it
            return False
        html = driver.page_source
        soup = BeautifulSoup(html, 'lxml')

        questions = soup.findAll('td', {'data-title': 'Question'})
        answers = soup.findAll('td', {'data-title': 'Answer'})
        topics = soup.findAll('td', {'data-title': 'Topic'})

        lastUpdateTime = time.time()

        converter = Conversion(
            self._filename,
            self._path)

        for t, q, a in zip(topics, questions, answers):
            topic = self._extract_topic(t)
            question = self._extract_question(q)
            answer = self._extract_answer(a)

            converter.addExample({
                'sourceUrl': url,
                'sourceName': name,
                "needUpdate": True,
                "typeOfInfo": "QA",
                "isAnnotated": False,
                "responseAuthority": "",
                "question": question,
                "answer": answer,
                "hasAnswer": True,
                "targetEducationLevel": "NA",
                "topic": [topic],
                "extraData": {},
                "targetLocation": "Oregon",
                "language": "en"
            })

        driver.quit()
        return converter.write()
Example #15
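 # Scrape the WHO myth-busters page: within each content block, the <h2>
 # heading is the question and the remainder is the answer.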
 def scrape(self):
     url = 'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/advice-for-public/myth-busters'
     html = urlopen(url)
     soup = BeautifulSoup(html, "lxml")
     qas_plus_some = soup.find_all(
         'div', class_='sf-content-block content-block')
     qa_pairs = []
     for potential in qas_plus_some:
         for child in potential.children:
             if "h2" in str(
                     child):  # Super hacky ... but this seemed to be the best way for this site
                 s_child = str(child)
                 s_child = s_child.replace("\n", " ")
                 s_child = s_child.replace(u'\xa0', u' ')
                 qa = s_child.split("</h2>")
                 if len(qa) == 2:
                     question = str(qa[0])
                     answer = str(qa[1])
                 elif len(qa) == 3:  # First question is different
                     question = str(qa[1])
                     answer = str(qa[2])
                 else:
                     # Malformed split: skip it rather than reuse stale values.
                     print("ERROR: unexpected question format")
                     continue
                 qa_pairs.append((question, answer))
     converter = Conversion(
         self._filename,
         self._path)
     for pair in qa_pairs:
         converter.addExample({
             "sourceName": 'WHOMyth',
             "sourceUrl": url,
             "typeOfInfo": 'QA',
             "needUpdate": True,
             "typeOfInfo": 'QA',
             "isAnnotated": False,
             "responseAuthority": "",
             "question": pair[0],
             "answer": pair[1],
             "hasAnswer": True,
             "targetEducationLevel": 'NA',
             "topic": ["Myths"],
             "extraData": {},
             "targetLocation": "",
             "language": 'en'
         })
     return converter.write()
Example #16
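    # Scrape the Hawaii FAQ: every <h3> is a question, and the siblings up to
    # the next <div> form its answer.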
    def scrape(self):
        name = 'Hawaii State Government'
        url = 'https://health.hawaii.gov/coronavirusdisease2019/what-you-should-know/faqs/'
        html = requests.get(url).text
        soup = BeautifulSoup(html, "lxml")

        questions = [str(q)
                     for q in soup.findAll("h3")]

        answers = []
        for q in soup.findAll("h3"):
            a = ""
            for tag in q.next_siblings:
                if tag.name == "div":
                    break
                else:
                    a += str(tag)
            answers.append(a)

        lastUpdate = time.mktime(
            dateparser.parse(
                ' '.join(soup.find('em').getText().split()[1:]),
                date_formats=['%B %d, %Y, %I %p']).timetuple())

        converter = Conversion(
            self._filename,
            self._path)
        for question, answer in zip(questions, answers):
            converter.addExample({
                'sourceUrl': url,
                'sourceName': name,
                "needUpdate": True,
                "typeOfInfo": "QA",
                "isAnnotated": False,
                "responseAuthority": "",
                "question": question,
                "answer": answer,
                "hasAnswer": True,
                "targetEducationLevel": "NA",
                "topic": [],
                "extraData": {},
                "targetLocation": "Hawaii",
                "language": "en"
            })
        return converter.write()
Example #17
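    # Scrape the FDA COVID-19 FAQ accordion panels, stripping "Q:"/"A:"
    # prefixes from the headings and bodies.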
    def scrape(self):
        name = 'FDA'
        url = 'https://www.fda.gov/emergency-preparedness-and-response/mcm-issues/coronavirus-disease-2019-covid-19-frequently-asked-questions'
        html = urlopen(url)
        soup = BeautifulSoup(html, "lxml")
        questions, answers = [], []

        for panelgroup in soup.findAll("div", {"class": "panel-group"}):
            for qa in panelgroup.findAll('div', {"class": "panel"}):
                q = str(qa.find("div",
                                {"class": "panel-heading"})).replace('Q:', '')
                a = str(qa.find("div",
                                {"class": "panel-body"})).replace('A:', '')
                questions.append(q)
                answers.append(a)

        lastUpdateTime = time.mktime(
            time.strptime(
                soup.find(
                    "p",
                    {"lcds-description-list__item-text"}).getText().strip(),
                "%m/%d/%Y"))

        converter = Conversion(self._filename, self._path)
        for question, answer in zip(questions, answers):
            converter.addExample({
                'sourceUrl': url,
                'sourceName': name,
                "needUpdate": True,
                "typeOfInfo": "QA",
                "isAnnotated": False,
                "responseAuthority": "",
                "question": question,
                "answer": answer,
                "hasAnswer": True,
                "targetEducationLevel": "NA",
                "topic": [],
                "extraData": {},
                "targetLocation": 'US',
                'language': 'en'
            })
        return converter.write()
Example #18
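 # Scrape the AVMA pet-owner FAQ: <h5> siblings after the anchor heading are
 # questions, with the <p> elements between them accumulated as answers.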
 def scrape(self):
     url = 'https://www.avma.org/resources-tools/animal-health-and-welfare/covid-19/covid-19-faqs-pet-owners'
     html = requests.get(url).text
     soup = BeautifulSoup(html, 'lxml')
     faq = soup.find('h3', {'id': '1'})
     questions = []
     answers = []
     begun = False
     for e in faq.next_siblings:
         if e.name == 'h5':
             if begun:
                 questions.append(q)
                 answers.append(a)
             q = str(e)
             a = ''
             begun = True
         elif e.name == 'p':
             a += str(e)
     if begun:
         questions.append(q)
         answers.append(a)
     converter = Conversion(self._filename, self._path)
     for q, a in zip(questions, answers):
         converter.addExample({
             'sourceUrl':
             'https://www.avma.org/sites/default/files/2020-03/covid-19-faq-pet-owners.pdf',
             'sourceName': 'AVMA',
             #No dates exist on the page
             "needUpdate": True,
             "containsURLs": False,
             "typeOfInfo": "QA",
             "isAnnotated": False,
             "responseAuthority": "",
             "question": q,
             "answer": a,
             "hasAnswer": True,
             "targetEducationLevel": "NA",
             "topic": ['pets', 'animals'],
             "extraData": {},
             "targetLocation": "US",
             'language': 'en'
         })
     return converter.write()
Example #19
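 # addExample should raise a ValueError when the answer is only whitespace
 # even though hasAnswer is True.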
 def test_blank_answer_exception(self):
     with self.assertRaises(ValueError) as e:
         converter = Conversion('test', '.')
         converter.addExample({
             'sourceUrl': 'example.com',
             'sourceName': "example",
             "needUpdate": True,
             "typeOfInfo": "QA",
             "isAnnotated": False,
             "responseAuthority": "",
             "question": '<a href="example.com/dir1">What is COVID-19?</a>',
             "answer": '\n    \n',
             "hasAnswer": True,
             "targetEducationLevel": "NA",
             "topic": ['topic1', 'topic2'],
             "extraData": {
                 'hello': 'goodbye'
             },
             "targetLocation": "US",
             "language": 'en',
         })
Example #20
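    # Scrape the Johns Hopkins Medicine FAQ: <h3> questions followed by
    # <p>/<ul> siblings as answers; the last block holds the "Updated" date.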
    def scrape(self):
        url = "https://www.hopkinsmedicine.org/health/conditions-and-diseases/coronavirus/coronavirus-frequently-asked-questions"
        html = requests.get(url).text
        soup = BeautifulSoup(html, 'lxml').find_all('div', {'class': 'rtf'})
        lastUpdateTime = time.mktime(
            dateparser.parse(soup[-1].getText().strip()[7:]).timetuple())

        final_questions = []
        final_responses = []
        for section in soup:
            questions = section.find_all('h3')
            for question in questions:
                final_questions.append(question.get_text(strip=False))
                soup_iter = question
                answer = ""
                while (soup_iter.find_next_sibling()
                       and soup_iter.find_next_sibling().name in ['p', 'ul']):
                    soup_iter = soup_iter.find_next_sibling()
                    answer += " " + str(soup_iter)
                final_responses.append(answer)
        converter = Conversion(self._filename, self._path)
        for q, a in zip(final_questions, final_responses):
            converter.addExample({
                'sourceUrl': url,
                'sourceName': "JHU Medicine",
                "needUpdate": True,
                "containsURLs": False,
                "typeOfInfo": "QA",
                "isAnnotated": False,
                "responseAuthority": "",
                "question": q,
                "answer": a,
                "hasAnswer": True,
                "targetEducationLevel": "NA",
                "topic": [],
                "extraData": {},
                'targetLocation': '',
                'language': 'en'
            })
        return converter.write()
Example #21
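 # Two examples written in the same batch should share one dateLastChanged
 # timestamp.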
 def test_time_consistency(self):
     subprocess.run(
         ['touch', './schema_v0.3/test_time_consistency_v0.3.jsonl'])
     converter = Conversion('test_time_consistency', '.')
     converter.addExample({
         'sourceUrl': 'time.com',
         'sourceName': "time",
         "needUpdate": True,
         "typeOfInfo": "QA",
         "isAnnotated": False,
         "responseAuthority": "",
         "question": 'Hello, my time should match my next line?',
         "answer": 'Hello this is the example responce',
         "hasAnswer": True,
         "targetEducationLevel": "NA",
         "topic": ['topic1', 'topic2'],
         "extraData": {
             'hello': 'goodbye'
         },
         "targetLocation": "US",
         "language": 'en',
     })
     converter.addExample({
         'sourceUrl': 'time.com',
         'sourceName': "uuid",
         "needUpdate": True,
         "typeOfInfo": "QA",
         "isAnnotated": False,
         "responseAuthority": "",
         "question": 'Do I match the above line time? Please say yes!',
         "answer": 'Hello this is the example responce',
         "hasAnswer": True,
         "targetEducationLevel": "NA",
         "topic": ['topic1', 'topic2'],
         "extraData": {
             'hello': 'goodbye'
         },
         "targetLocation": "US",
         "language": 'en',
     })
     converter.write()
     with jsonlines.open(
             './schema_v0.3/test_time_consistency_v0.3.jsonl') as reader:
         line = reader.read()
         dateLastChanged_0 = line['dateLastChanged']
         line = reader.read()
         dateLastChanged_1 = line['dateLastChanged']
     self.assertEqual(dateLastChanged_0, dateLastChanged_1)
     subprocess.run(
         ['rm', './schema_v0.3/test_time_consistency_v0.3.jsonl'])
Example #22
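    # Scrape the NYTimes coronavirus tips page: each g-question-wrap div
    # holds an <h3> question and a g-answer-wrap answer.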
    def scrape(self):
        name = 'NYTimes'
        url = 'https://www.nytimes.com/interactive/2020/world/coronavirus-tips-advice.html'
        html = urlopen(url)
        soup = BeautifulSoup(html, "lxml")

        questions, answers = [], []
        for panelgroup in soup.findAll("div", {"class": "g-question-wrap"}):
            q = str(panelgroup.find('h3'))
            a = str(panelgroup.find('div', {'class': "g-answer-wrap"}))
            questions.append(q)
            answers.append(a)

        lastUpdateTime = time.mktime(
            time.strptime(
                soup.find('time').getText(),
                "Updated %B %d, %Y"))

        converter = Conversion(
            self._filename,
            self._path)
        for question, answer in zip(questions, answers):
            converter.addExample({
                'sourceUrl': url,
                'sourceName': name,
                "needUpdate": True,
                "typeOfInfo": "QA",
                "isAnnotated": False,
                "responseAuthority": "",
                "question": question,
                "answer": answer,
                "hasAnswer": True,
                "targetEducationLevel": "NA",
                "topic": [],
                "extraData": {},
                "targetLocation": "",
                "language": 'en'
            })
        return converter.write()
Example #23
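 # Happy path: a fully populated example is accepted and written.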
 def test_addExample(self):
     converter = Conversion('test', '.')
     converter.addExample({
         'sourceUrl': 'example.com',
         'sourceName': "example",
         "needUpdate": True,
         "typeOfInfo": "QA",
         "isAnnotated": False,
         "responseAuthority": "",
         "question": '<a href="example.com/dir1">What is COVID-19?</a>',
         "answer":
         '<p><a href="example.com/dir2">Coronaviruses</a> are a large family of viruses.</p>',
         "hasAnswer": True,
         "targetEducationLevel": "NA",
         "topic": ['topic1', 'topic2'],
         "extraData": {
             'hello': 'goodbye'
         },
         "targetLocation": "US",
         "language": 'en',
     })
     self.assertEqual(len(converter._examples), 1)
     self.assertEqual(converter.write(), True)
Example #24
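 # Scrape the Global Health Now expert Q&A; helper methods on the class pair
 # each <h3> header with the response text and topic that follow it.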
 def scrape(self):
     url = 'https://www.globalhealthnow.org/2020-02/coronavirus-expert-reality-check'
     html = requests.get(url).text
     lastUpdateTime = time.mktime(
         time.strptime(
             BeautifulSoup(html, 'lxml').find('div', {
                 'class': 'article-meta-wrap'
             }).getText().strip(), '%B %d, %Y'))
     soup = BeautifulSoup(html, 'lxml').find('div', {
         'property': 'schema:text'
     }).findAll('h3')
     questions_list = list(filter(self._filter_h3_headers, soup))
     questions = [x.getText().strip() for x in questions_list]
     responces = list(map(self._get_responces, questions_list[:-1]))
     responces.append(self._get_final_responce(questions_list[-1]))
     responces = list(map(self._truncate_responce, responces))
     topics = list(map(self._get_topic, questions_list))
     converter = Conversion(self._filename, self._path)
     for q, a, t in zip(questions, responces, topics):
         converter.addExample({
             'sourceUrl': url,
             'sourceName':
             "Johns Hopkins Bloomberg School of Public Health",
             "needUpdate": True,
             "typeOfInfo": "QA",
             "isAnnotated": False,
             "responseAuthority": "",
             "question": q,
             "answer": a,
             "hasAnswer": True,
             "targetEducationLevel": "College",
             "topic": [t],
             "extraData": {},
             "targetLocation": "",
             'language': 'en'
         })
     return converter.write()
Example #25
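 # Scrape the Canada COVID-19 FAQ index, following each question's link to
 # fetch its answer via self._link_to_responce.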
 def scrape(self):
     url = 'https://www.canada.ca/en/public-health/services/diseases/coronavirus-disease-covid-19.html#faq'
     html = requests.get(url).text
     soup = BeautifulSoup(html, 'lxml').find('ul', {
         'class': 'list-unstyled'
     }).findAll('a')
     lastUpdatedTime = time.mktime(
         dateparser.parse(
             BeautifulSoup(html,
                           'lxml').find('p', {
                               'class': 'text-right h3 mrgn-tp-sm'
                           }).getText()).timetuple())
     questions = [str(x) for x in soup]
     response_links = [x['href'] for x in soup]
     responses = list(map(self._link_to_responce, response_links))
     converter = Conversion(self._filename, self._path)
     for q, a in zip(questions, responses):
         if not a:  # no accompanying answer to question
             continue
         converter.addExample({
             'sourceUrl': url,
             'sourceName': "Public Health Agency of Canada",
             "needUpdate": True,
             "typeOfInfo": "QA",
             "isAnnotated": False,
             "responseAuthority": "",
             "question": q,
             "answer": a if a else "",
             "hasAnswer": a is not None,
             "targetEducationLevel": "NA",
             "topic": [],
             "extraData": {},
             "targetLocation": "Canada",
             "language": 'en',
         })
     return converter.write()
Example #26
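 # Template scraper: writes ten copies of a hard-coded example to exercise
 # the Conversion API.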
 def scrape(self):
     converter = Conversion(
         self._filename,
         self._path)
      # Put the code here that makes the examples
     for exampleNums in range(10):
         converter.addExample({
             'sourceUrl': 'example.com',
             'sourceName': "example",
             "needUpdate": True,
             "typeOfInfo": "QA",
             "isAnnotated": False,
             "responseAuthority": "",
             "question": '<a href="example.com/dir1">What is COVID-19?</a>',
             "answer": '<p><a href="example.com/dir2">Coronaviruses</a> are a large family of viruses.</p>',
             "hasAnswer": True,
             "targetEducationLevel": "NA",
             "topic": ['topic1', 'topic2'],
             "extraData": {'hello': 'goodbye'},
             "targetLocation": "US",
             "language": 'en',
         })
      # This write() will fail because the path doesn't exist
     return converter.write()
Example #27
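 # The constructor should store the file prefix and start with no examples.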
 def test_init(self):
     converter = Conversion('test', '.')
     self.assertEqual(converter._file_prefix, 'test')
     self.assertEqual(converter._examples, [])
Example #28
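 # Rewriting a file with only a new example should drop entries that were
 # not seen in the latest batch.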
 def test_remove_unseen(self):
     subprocess.run(
         ['touch', './schema_v0.3/test_remove_unseen_v0.3.jsonl'])
     converter = Conversion('test_remove_unseen', '.')
     converter.addExample({
         'sourceUrl': 'time.com',
         'sourceName': "time",
         "needUpdate": True,
         "typeOfInfo": "QA",
         "isAnnotated": False,
         "responseAuthority": "",
         "question": 'Hello, my time should match my next line?',
         "answer": 'Hello this is the example responce',
         "hasAnswer": True,
         "targetEducationLevel": "NA",
         "topic": ['topic1', 'topic2'],
         "extraData": {
             'hello': 'goodbye'
         },
         "targetLocation": "US",
         "language": 'en',
     })
     converter.write()
     converter = Conversion('test_remove_unseen', '.')
     converter.addExample({
         'sourceUrl': 'time.com',
         'sourceName': "time",
         "needUpdate": True,
         "typeOfInfo": "QA",
         "isAnnotated": False,
         "responseAuthority": "",
         "question": 'I am completely new?',
         "answer": 'I am unique! I am special! I matter!',
         "hasAnswer": True,
         "targetEducationLevel": "NA",
         "topic": ['topic1', 'topic2'],
         "extraData": {
             'hello': 'goodbye'
         },
         "targetLocation": "US",
         "language": 'en',
     })
     converter.write()
     with open('./schema_v0.3/test_remove_unseen_v0.3.jsonl') as reader:
         self.assertEqual(len(reader.readlines()), 1)
Example #29
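 # A slightly reworded question should fuzzy-match its earlier version and
 # keep the same ID across writes.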
 def test_id_preservation_fuzzy_change(self):
     subprocess.run(
         ['touch', './schema_v0.3/test_id_preservation_v0.3.jsonl'])
     converter = Conversion('test_id_preservation', '.')
     converter.addExample({
         'sourceUrl': 'uuid.com',
         'sourceName': "uuid",
         "needUpdate": True,
         "typeOfInfo": "QA",
         "isAnnotated": False,
         "responseAuthority": "",
         "question": 'Hello, this is the example question?',
         "answer": 'Hello this is the example responce',
         "hasAnswer": True,
         "targetEducationLevel": "NA",
         "topic": ['topic1', 'topic2'],
         "extraData": {
             'hello': 'goodbye'
         },
         "targetLocation": "US",
         "language": 'en',
     })
     converter.write()
     with jsonlines.open(
             './schema_v0.3/test_id_preservation_v0.3.jsonl') as reader:
         line = reader.read()
         id = line['ID']
     converter = Conversion('test_id_preservation', '.')
     converter.addExample({
         'sourceUrl': 'uuid.com',
         'sourceName': "uuid",
         "needUpdate": True,
         "typeOfInfo": "QA",
         "isAnnotated": False,
         "responseAuthority": "",
         "question": 'Hello, but this is the example question?',
         "answer": 'Hello this is the example responce',
         "hasAnswer": True,
         "targetEducationLevel": "NA",
         "topic": ['topic1', 'topic2'],
         "extraData": {
             'hello': 'goodbye'
         },
         "targetLocation": "US",
         "language": 'en',
     })
     converter.write()
     with jsonlines.open(
             './schema_v0.3/test_id_preservation_v0.3.jsonl') as reader:
         line = reader.read()
         new_id = line['ID']
     self.assertEqual(id, new_id)
     subprocess.run(['rm', './schema_v0.3/test_id_preservation_v0.3.jsonl'])
Example #30
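 # Combine the common and at-risk FAQ crawls into a single output file.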
 def scrape(self):
     examples = self._crawl_common() + self._crawl_at_risk()
     converter = Conversion(self._filename, self._path)
     for example in examples:
         converter.addExample(example)
     return converter.write()