import re
from urllib import request

from bs4 import BeautifulSoup
# `stemmer` is used below but never defined in this snippet; NLTK's
# PorterStemmer is an assumed stand-in.
from nltk.stem import PorterStemmer

import hw2

stemmer = PorterStemmer()


def word_list(link):
    """Fetch a page and return its stemmed, stopword-free word tokens."""
    stopwords = hw2.read_stopwords('common_words')
    regex = re.compile(r'[@_!#$%^&*()<>?/\|}{~:]')
    url = link
    try:
        html = request.urlopen(url).read()
    except Exception:
        print('cannot open page')
        return []
    soup = BeautifulSoup(html, 'html.parser')
    # Kill all script and style elements.
    for script in soup(["script", "style"]):
        script.extract()  # rip it out
    # Get the visible text.
    text = soup.get_text()
    word = []
    # Break into lines and remove leading and trailing space on each.
    lines = (line.strip() for line in text.splitlines())
    for line in lines:
        for w in line.split(' '):
            w = w.lower()
            # Skip empties, stopwords, tokens with special characters,
            # and the literal 'cdc'.
            if w != '' and w not in stopwords and regex.search(w) is None and w != 'cdc':
                f = re.sub(r'[^\w\s]', '', w)
                word.append(stemmer.stem(f))
    return word
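
# Hedged usage sketch for word_list(): the URL below is an illustrative CDC
# page, not taken from this module, and running it assumes network access
# plus a readable 'common_words' file for hw2.read_stopwords().
if __name__ == '__main__':
    sample_tokens = word_list('https://www.cdc.gov/flu/index.html')
    print(sample_tokens[:10])  # spot-check the first few stemmed tokens
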
def process_string(sent):
    """Tokenize, filter, and stem a raw sentence; prints and returns the result."""
    regex = re.compile(r'[@_!#$%^&*()<>?/\|}{~:]')
    stopwords = hw2.read_stopwords('common_words')
    words = sent.split(' ')
    final = []
    for w in words:
        w = w.lower()
        if w != '' and w not in stopwords and regex.search(w) is None:
            f = re.sub(r'[^\w\s]', '', w)
            final.append(stemmer.stem(f))
    print(final)
    return final
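
# Hedged usage sketch for process_string(); the sample sentence is invented.
# Exact output depends on the stemmer and the contents of 'common_words'.
if __name__ == '__main__':
    stems = process_string('Fever and persistent cough are common flu symptoms.')
    # process_string() both prints and returns the stemmed, stopword-free tokens.
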
def extract_medical_terms():
    """Scrape Wikipedia's list of medical symptoms and write the stemmed
    terms to a local 'medical_terms' file, one per line."""
    stopwords = hw2.read_stopwords('common_words')
    html = request.urlopen('https://en.wikipedia.org/wiki/List_of_medical_symptoms').read()
    soup = BeautifulSoup(html, 'html.parser')
    word = []
    for t in soup.find_all('li'):
        for w in t.text.split(' '):
            w = w.lower()
            # Skip stopwords and tokens containing slashes or parentheses.
            # (The original tested '(' twice; ')' is the likely intent.)
            if w not in stopwords and '(' not in w and ')' not in w and '/' not in w:
                stripped = re.sub(r'[^\w\s]', '', w)
                word.append(stemmer.stem(stripped))
    with open('medical_terms', 'w', encoding='utf-8') as f:
        for item in word:
            f.write("%s\n" % item)
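
# Hedged usage sketch for extract_medical_terms(): it takes no arguments,
# fetches the Wikipedia symptom list over the network, and writes one stemmed
# term per line to a local 'medical_terms' file.
if __name__ == '__main__':
    extract_medical_terms()
    with open('medical_terms', encoding='utf-8') as fh:
        print(fh.readline().strip())  # spot-check the first extracted term
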
import hw2, hw4
import util
import mechanize
import sqlite3
from queue import Queue, PriorityQueue
from bs4 import BeautifulSoup
from string import ascii_lowercase
from urllib import parse, request
from urllib.parse import urlparse
from collections import defaultdict
import csv
import operator
import json
from datetime import date, datetime

medical_terms = hw2.read_stopwords('medical_terms')


def renew_data():
    """Crawl the CDC A-Z disease index and collect (name, href) pairs."""
    base = 'https://www.cdc.gov/diseasesconditions/az/'
    disease_pages_alpha = []
    disease_pages = []
    for a in ascii_lowercase:
        link = base + a + '.html'
        disease_pages_alpha.append(link)
        res = request.urlopen(link)
        html = res.read()
        soup = BeautifulSoup(html, 'html.parser')
        target = soup.find("div", {"class": "az-content"})
        for anchor in target.findChildren("a"):
            disease = [anchor.text, anchor.get('href')]
            # The snippet is truncated here; appending each pair and returning
            # the list is an assumed completion.
            disease_pages.append(disease)
    return disease_pages
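
# Hedged usage sketch for renew_data(): crawling all 26 index pages is slow
# and network-bound, so this only prints a small sample of the scraped pairs.
# The returned list is the assumed completion noted in the function body.
if __name__ == '__main__':
    pages = renew_data()
    for name, href in pages[:5]:
        print(name, '->', href)
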
import hw2
import util
import sqlite3
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from queue import Queue
from bs4 import BeautifulSoup
from string import ascii_lowercase
from urllib import parse, request
from urllib.parse import urlparse
from collections import defaultdict
import json
from datetime import date, datetime

medical_terms = hw2.read_stopwords('medical_terms')
disease_list = hw2.read_stopwords('disease_list')
print(disease_list)
symptom_list = util.load_symptoms()


def renew_data_pubmed():
    """Map each disease name to the PubMed article links found by searching for it."""
    disease_to_article_links = defaultdict()
    base = 'https://www.ncbi.nlm.nih.gov/pubmed/'
    for idx, d in enumerate(disease_list):
        print(d)
        link = base + '?term=' + d
        # get_all_article_links() is defined elsewhere in this module.
        links = get_all_article_links(link)
        disease_to_article_links[d] = links
    return dict(disease_to_article_links)
    # store_csv('disease_pages.csv', disease_pages)
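
# Hedged usage sketch for renew_data_pubmed(): it issues one PubMed search
# per entry in disease_list, so expect many network requests. It assumes
# get_all_article_links() (defined elsewhere in this module) returns a list
# of article URLs per search page.
if __name__ == '__main__':
    mapping = renew_data_pubmed()
    for disease, links in list(mapping.items())[:3]:
        print(disease, len(links), 'article links')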