Example #1
def word_list(link):
    stopwords = hw2.read_stopwords('common_words')
    # raw string: the backslash in the character class is otherwise an invalid escape
    regex = re.compile(r'[@_!#$%^&*()<>?/\\|}{~:]')
    try:
        html = request.urlopen(link).read()
    except OSError:  # URLError/HTTPError are subclasses of OSError
        print('cannot open page')
        return []
    soup = BeautifulSoup(html, 'html.parser')

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()
    word = []
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    for line in lines:
        li = line.split(' ')
        for w in li:
            w = w.lower()
            if w and w not in stopwords and regex.search(w) is None and w != 'cdc':
                f = re.sub(r'[^\w\s]', '', w)
                final = stemmer.stem(f)
                word.append(final)
    return word
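
The function above relies on module-level names the excerpt never defines (re, hw2, request, BeautifulSoup, stemmer). A minimal setup sketch, assuming the stemmer is NLTK's PorterStemmer (the excerpt never shows which stemmer the project actually uses):

import re
import hw2  # course helper module providing read_stopwords()
from urllib import request
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()  # assumption: any object with a .stem(str) method works

# Hypothetical usage: collect stemmed, stopword-free tokens from a CDC page.
tokens = word_list('https://www.cdc.gov/flu/index.htm')
print(tokens[:10])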
Example #2
def process_string(sent):
    # raw string, as in Example #1, to avoid the invalid \| escape
    regex = re.compile(r'[@_!#$%^&*()<>?/\\|}{~:]')
    stopwords = hw2.read_stopwords('common_words')
    words = sent.split(' ')
    final = []
    for w in words:
        w = w.lower()
        if w and w not in stopwords and regex.search(w) is None:
            f = re.sub(r'[^\w\s]', '', w)
            final.append(stemmer.stem(f))
    print(final)
    return final
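
A quick usage sketch for process_string; the exact output depends on the common_words list and the stemmer, so the tokens shown are only illustrative:

terms = process_string('Fever and persistent cough in adults')
# roughly ['fever', 'persist', 'cough', 'adult'], assuming 'and'/'in'
# are in common_words and a Porter-style stemmer is in use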
Example #3
def extract_medical_terms():
    stopwords = hw2.read_stopwords('common_words')
    html = request.urlopen('https://en.wikipedia.org/wiki/List_of_medical_symptoms').read()
    soup = BeautifulSoup(html, 'html.parser')
    word = []
    for t in soup.find_all('li'):
        for w in t.text.split(' '):
            w = w.lower()
            # skip stopwords and tokens containing '(', ')' or '/'
            # (the original tested '(' twice; the second test was presumably meant to be ')')
            if w not in stopwords and '(' not in w and ')' not in w and '/' not in w:
                f = re.sub(r'[^\w\s]', '', w)
                final = stemmer.stem(f)
                word.append(final)

    with open('medical_terms', 'w', encoding='utf-8') as f:
        for item in word:
            f.write("%s\n" % item)
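
Examples #4 and #5 read this file back with hw2.read_stopwords. Since hw2 is not shown anywhere in these excerpts, here is a plain-Python stand-in for that round-trip, assuming one stemmed term per line as written above:

def load_terms(path='medical_terms'):
    # hypothetical stand-in for hw2.read_stopwords: one term per line -> set
    with open(path, encoding='utf-8') as fh:
        return {line.strip() for line in fh if line.strip()}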
Example #4
File: engine.py Project: pli28/Medhacks19
import hw2, hw4
import util
import mechanize
import sqlite3
from queue import Queue, PriorityQueue
from bs4 import BeautifulSoup
from string import ascii_lowercase
from urllib import parse, request
from urllib.parse import urlparse
from collections import defaultdict
import csv
import operator
import json
from datetime import date, datetime

medical_terms = hw2.read_stopwords('medical_terms')


def renew_data():
    base = 'https://www.cdc.gov/diseasesconditions/az/'
    disease_pages_alpha = []
    disease_pages = []
    for a in ascii_lowercase:
        link = base + a + '.html'
        disease_pages_alpha.append(link)
        res = request.urlopen(link)
        html = res.read()
        soup = BeautifulSoup(html, 'html.parser')
        target = soup.find("div", {"class": "az-content"})
        # rename the loop variable so it does not shadow `link` above
        for anchor in target.findChildren("a"):
            disease_pages.append([anchor.text, anchor.get('href')])
    # the excerpt ended here; return the collected (name, href) pairs so they are usable
    return disease_pages
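
Example #5 ends with a commented-out store_csv(...) call, but no definition appears in either excerpt. A sketch of what such a helper might look like, using the csv module imported above (the name and signature are guesses from that call site):

def store_csv(path, rows):
    # hypothetical helper matching the commented-out store_csv(...) call in Example #5
    with open(path, 'w', newline='', encoding='utf-8') as fh:
        csv.writer(fh).writerows(rows)

# e.g. store_csv('disease_pages.csv', renew_data())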
Example #5
import hw2
import util
import sqlite3
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from queue import Queue
from bs4 import BeautifulSoup
from string import ascii_lowercase
from urllib import parse, request
from urllib.parse import urlparse
from collections import defaultdict
import json
from datetime import date, datetime

medical_terms = hw2.read_stopwords('medical_terms')
disease_list = hw2.read_stopwords('disease_list')
print(disease_list)
symptom_list = util.load_symptoms()


def renew_data_pubmed():
    # defaultdict() with no factory behaves like a plain dict, so use one directly
    disease_to_article_links = {}
    base = 'https://www.ncbi.nlm.nih.gov/pubmed/'
    for d in disease_list:
        print(d)
        # URL-encode the disease name so spaces and punctuation survive the query string
        link = base + '?term=' + parse.quote_plus(d)
        links = get_all_article_links(link)  # defined elsewhere in this project
        disease_to_article_links[d] = links
    return disease_to_article_links
    # store_csv('disease_pages.csv', disease_pages)
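
Since json is already imported in this excerpt, the returned mapping can be persisted directly. A minimal usage sketch (the output filename is an assumption):

if __name__ == '__main__':
    mapping = renew_data_pubmed()  # {disease name: list of PubMed article links}
    with open('disease_articles.json', 'w', encoding='utf-8') as fh:
        json.dump(mapping, fh, indent=2)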