def scrapeEmol(sourceName=None):
    """Fetch the emol.com front page and store the text of its long links.

    Every ``<a>`` element whose text is longer than 50 characters is passed
    to ``SQLDBService.storeTemporaryEncounter`` together with ``sourceName``.

    Args:
        sourceName: Label forwarded to ``storeTemporaryEncounter``. The
            original function referenced ``sourceName`` without defining it,
            which raises ``NameError`` unless a module-level ``sourceName``
            exists. When omitted, a module-level ``sourceName`` is used if
            present; otherwise the label falls back to ``"emol"``.
    """
    if sourceName is None:
        # Keep the old behaviour if the module defines a global sourceName;
        # otherwise avoid the NameError with a default label.
        # NOTE(review): "emol" is a placeholder label - confirm the value the
        # database expects for this source.
        sourceName = globals().get("sourceName", "emol")

    r = requests.get("http://www.emol.com")
    html_doc = r.text
    soup = BeautifulSoup(html_doc, 'html.parser')

    for emol_link in soup.find_all("a"):
        link_text = emol_link.getText()
        # Short link texts are skipped; only texts above 50 characters are stored.
        if len(link_text) > 50:
            SQLDBService.storeTemporaryEncounter(link_text, sourceName)
def scrape(self, source, sourceName):
    """Store the long ``alt`` texts of images on the ElMostrador front page.

    Every ``<img>`` element whose ``alt`` attribute is longer than 50
    characters is printed and passed to
    ``SQLDBService.storeTemporaryEncounter`` with curly double quotes
    (U+201C / U+201D) replaced by plain ``"``.

    Args:
        source: Unused; the ElMostrador URL is hard-coded. Kept so the
            signature matches the other ``scrape`` implementations.
        sourceName: Label forwarded to ``storeTemporaryEncounter``.
    """
    # ElMostrador scraper does not really require source!
    r = requests.get("http://www.elmostrador.cl")
    html_doc = r.text
    soup = BeautifulSoup(html_doc, 'html.parser')

    for high_impact_article in soup.find_all("img"):
        alt_text = high_impact_article.get('alt')
        # Images without an alt attribute yield None; the original called
        # len() on it and crashed with TypeError, aborting the whole scrape.
        if alt_text is None or len(alt_text) <= 50:
            continue
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(alt_text)
        SQLDBService.storeTemporaryEncounter(
            alt_text.replace(u'\u201c', '"').replace(u'\u201d', '"'),
            sourceName)
def scrape(self, source, sourceName):
    """Store the long link texts found on an arbitrary page.

    Downloads ``source``, parses it as HTML and, for every ``<a>`` element
    whose text is longer than 50 characters, passes that text (with curly
    double quotes U+201C / U+201D replaced by plain ``"``) to
    ``SQLDBService.storeTemporaryEncounter`` together with ``sourceName``.

    Args:
        source: URL to download. It is assumed to be well-formed; any
            ``requests`` failure is caught below.
        sourceName: Label forwarded to ``storeTemporaryEncounter``.

    Returns:
        ``2`` when the request raises a ``RequestException`` (the exception
        is printed); otherwise ``None`` after all links were processed.
    """
    try:
        response = requests.get(source)
    except requests.exceptions.RequestException as request_error:
        print(request_error)
        return 2

    parsed_page = BeautifulSoup(response.text, 'html.parser')

    # The generic scraper simply collects link texts; anything of 50
    # characters or fewer is treated as uninteresting and skipped.
    for anchor in parsed_page.find_all("a"):
        link_text = anchor.getText()
        if len(link_text) <= 50:
            continue
        normalized_text = link_text.replace(u'\u201c', '"').replace(u'\u201d', '"')
        SQLDBService.storeTemporaryEncounter(normalized_text, sourceName)