def scrapeArticle(URL):
    #Scrapes an Al Jazeera America article: title, date, body text,
    #author, and lead image when available.
    #Does not work on galleries and video.
    soup = BeautifulSoup(urllib.urlopen(URL))

    #Body text lives in one or more 'text section' divs; gather every <p>.
    pieces = []
    for section in soup.find_all('div', attrs={'class': 'text section'}):
        for para in BeautifulSoup(str(section)).find_all('p'):
            pieces.append(para.get_text().encode('utf-8').strip() + '\n\n')
    bodyText = ''.join(pieces)

    ArticleDict = {
        'title': checkElement(
            soup.find('div', attrs={'class': 'articleOpinion-title--container'}),
            'title'),
        #Collapse runs of whitespace inside the byline.
        'author': ' '.join(checkElement(
            soup.find('span', attrs={'class': 'articleOpinion-byline'}),
            'author').split()),
        'body_text': bodyText,
        'source': 'Al Jazeera',
        'URL': URL,
    }

    dateTag = soup.find('span', attrs={'class': 'date'})
    timeTag = soup.find('span', attrs={'class': 'time'})
    if dateTag is None:
        #No date on the page: fall back to scrape time.
        ArticleDict['date'] = 'Unknown'
        ArticleDict['timestamp'] = datetime.datetime.now()
    elif timeTag is None:
        ArticleDict['date'] = dateTag.get_text().encode('utf-8')
        ArticleDict['timestamp'] = getTime(
            ArticleDict['date'].replace('AM', ' AM ').replace('PM', ' PM '),
            [','], [':'], '%B %d %Y')
    else:
        #Date and time both present; pad AM/PM so the tokens parse.
        ArticleDict['date'] = (dateTag.get_text() + ' '
                               + timeTag.get_text()).encode('utf-8')
        ArticleDict['timestamp'] = getTime(
            ArticleDict['date'].replace('AM', ' AM ').replace('PM', ' PM '),
            [','], [':'], '%B %d %Y %I %M %p %Z')

    #Lead image: last raw markup line with an inline background-image at
    #the 1460px size; pull the single-quoted URL out of it.
    imageUrl = ''
    for rawLine in soup.prettify().split('\n'):
        if 'background-image' in rawLine and '1460' in rawLine:
            imageUrl = rawLine
    imageUrl = imageUrl[imageUrl.find("'") + 1:]
    imageUrl = imageUrl[:imageUrl.find("'")]
    if imageUrl.strip() != '':
        ArticleDict['image'] = 'http://america.aljazeera.com' + imageUrl
    return ArticleDict
def scrapeArticle(URL):
    #Indexes title, date, body, and author of given article
    #returns values in a dictionary
    bText = ''
    soup = BeautifulSoup(urllib.urlopen(URL))
    #Text of every link on the page; used to drop paragraphs that are
    #nothing but a link (related-story teasers etc.).
    linkList = map(lambda x: x.get_text(), soup.find_all('a'))
    #Follow the "single page" link, if present, so the whole article is
    #fetched at once.
    pageStatus = soup.find('div', attrs = {'class':'singlepage'})
    if pageStatus != None:
        soup = BeautifulSoup(urllib.urlopen('http://abcnews.go.com' + pageStatus.a.get('href')))
    for paragraph in soup.find_all('p', attrs = {'itemprop':'articleBody'}):
        #BUG FIX: the original tested `paragraph.a not in linkList`, which
        #compares a Tag (or None) against a list of strings and never
        #matches, so the link filter was a no-op. Compare the paragraph's
        #text instead (same pattern as the CNN scraper in this project):
        #skip paragraphs whose entire text is one of the page's links.
        if paragraph.get_text() not in linkList:
            bText += paragraph.get_text()
    date = checkElement(soup.find('div', attrs = {'class':'date'}), 'date')
    image = soup.find('div', attrs = {'class' : 'main_media'})
    ArticleDict = {'title' : checkElement(soup.find('h1', True), 'title'),
                   #Title-case the first byline line, then normalize the
                   #"And"/"Abc" artifacts that title-casing introduces.
                   'author' : " ".join(checkElement(
                        soup.find('div', attrs = {'class':'byline'}), 'author')
                        .split('\n\n')[0].lower().title().split())
                        .replace('And', 'and').replace('Abc', 'Source'),
                   'body_text' : bText.replace('\n', '\n\n').strip(),
                   'URL' : URL,
                   'source' : 'ABC News',
                   'date' : date,
                   'timestamp' : getTime(date, [',',':','.'], [], '%b %d %Y')}
    if image != None and image.img != None:
        if image.img.get('src') != None:
            ArticleDict['image'] = image.img.get('src')
    return ArticleDict
def scrapeArticle(URL):
    """Scrape a CBS article into a dict with title, author, date, body
    text, timestamp, source and (when present) image URL.

    Returns None when one usable paragraph or fewer is found."""
    sList = []
    soup = BeautifulSoup(urllib.urlopen(URL))
    mainStory = BeautifulSoup(str(soup.find("div", attrs={"class": "entry"})))
    for paragraph in mainStory.find_all("p"):
        #Unclassed, non-empty <p> tags hold the story text; collapse
        #whitespace and replace the network's name.
        if not paragraph.has_attr("class") and paragraph.get_text().strip() != "":
            sList.append(" ".join(paragraph.get_text().replace("CBS", "Source").strip().split()))
    if len(sList) <= 1:
        return None
    #BUG FIX: the original appended "\n\n" after every paragraph and then
    #sliced bText[:-3]; the separator is only 2 characters, so the body's
    #final character was chopped off. Joining keeps the text intact.
    bText = "\n\n".join(sList)
    date = checkElement(soup.find("span", attrs={"class": "time"}), "date")
    ArticleDict = {
        "title": checkElement(soup.find("h1", attrs={"class": "title"}), "title"),
        "author": checkElement(soup.find("span", attrs={"class": "author"}), "author"),
        "body_text": bText,
        "URL": URL,
        "date": date,
        #Pad AM/PM with spaces so the strptime-style tokens line up.
        "timestamp": getTime(
            date.replace("AM", " AM ").replace("PM", " PM ").strip(),
            [",", ":"], [], "%B %d %Y %I %M %p"
        ),
    }
    source = soup.find("span", attrs={"class": "source"})
    if source != None:
        ArticleDict["source"] = source.get_text()
    else:
        ArticleDict["source"] = "CBS News"
    image = soup.find("div", attrs={"class": "article-image"})
    if image != None and image.img != None:
        ArticleDict["image"] = image.img.get("src")
    return ArticleDict
def scrapeArticle(URL):
    #Scrapes a CBS article: title, author, date, body text, source and
    #image (when present). Returns None when one usable paragraph or
    #fewer is found.
    sList = []
    soup = BeautifulSoup(urllib.urlopen(URL))
    mainStory = BeautifulSoup(str(soup.find('div', attrs = {'class': 'entry'})))
    for paragraph in mainStory.find_all('p'):
        #Unclassed, non-empty <p> tags hold the story text; collapse
        #whitespace and replace the network's name.
        if paragraph.has_attr('class') != True and paragraph.get_text().strip() != '':
            sList.append(' '.join(paragraph.get_text()
                                  .replace('CBS', 'Source').strip().split()))
    if len(sList) <= 1:
        return None
    #BUG FIX: the original appended '\n\n' after every paragraph and then
    #sliced bText[:-3]; the separator is only 2 characters long, so the
    #final character of the body was lost. Join with the separator instead.
    bText = '\n\n'.join(sList)
    date = checkElement(soup.find('span', attrs = {'class': 'time'}), 'date')
    ArticleDict = {'title' : checkElement(soup.find('h1', attrs = {'class': 'title'}), 'title'),
                   'author' : checkElement(soup.find('span', attrs = {'class':'author'}), 'author'),
                   'body_text' : bText,
                   'URL' : URL,
                   'date' : date,
                   #Pad AM/PM with spaces so the format tokens parse.
                   'timestamp' : getTime(date.replace('AM', ' AM ').replace('PM', ' PM ').strip(),
                                         [',',':'], [], '%B %d %Y %I %M %p')}
    source = soup.find('span', attrs = {'class': 'source'})
    if source != None:
        ArticleDict['source'] = source.get_text()
    else:
        ArticleDict['source'] = 'CBS News'
    image = soup.find('div', attrs = {'class':'article-image'})
    if image != None and image.img != None:
        ArticleDict['image'] = image.img.get('src')
    return ArticleDict
def scrapeArticle(URL):
    #Scrapes a Forbes article: title, author, date, body text.
    #Returns the values in a dictionary.
    paraList = []
    ArticleDict = {}
    soup = BeautifulSoup(urllib.urlopen(URL))
    #BUG FIX: the original had an else clause that reset the date to
    #'Unknown' on every raw line NOT containing 'published_time', which
    #clobbered a date found earlier in the markup. Set the default once,
    #then stop at the first match.
    ArticleDict['date'] = 'Unknown'
    for cell in soup.prettify().split('\n'):
        if 'published_time' in cell:
            #Pull the value of the content="..." attribute.
            cell = cell[cell.find('content=') + 9:]
            cell = cell[:cell.find('"')]
            p = cell.split('-')  #ISO yyyy-mm-dd -> mm/dd/yyyy
            ArticleDict['date'] = p[1] + '/' + p[2] + '/' + p[0]
            break
    ArticleDict['timestamp'] = getTime(ArticleDict['date'], [','], ['/'], '%m %d %Y')
    soup2 = BeautifulSoup(str(soup.find('div', attrs={'class': 'body'})))
    for line in soup2.find_all('p'):
        #Unclassed paragraphs hold the article text.
        if line.has_attr('class') == False:
            paraList.append(line.get_text())
    ArticleDict['author'] = checkElement(soup.find('p'), 'author')
    #Drop the last two paragraphs - presumably trailing boilerplate
    #(author bio / follow links); TODO confirm against live pages.
    ArticleDict['body_text'] = '\n\n'.join(paraList[:-2])
    ArticleDict['title'] = checkElement(soup.find('h1'), 'title')
    ArticleDict['source'] = 'Forbes'
    ArticleDict['URL'] = URL
    return ArticleDict
def scrapeArticle(URL):
    #Goes through an article indexes the title, date, body text and author if avaliable
    #Does not work on galleries and video
    #Strips name Aljazeera from body text
    inText = ''
    curImage = ''
    soup = BeautifulSoup(urllib.urlopen(URL))
    #Body text lives in one or more 'text section' divs; gather every <p>.
    for section in soup.find_all('div', attrs = {'class':'text section'}):
        bodySoup = BeautifulSoup(str(section))
        for paragraph in bodySoup.find_all('p'):
            inText += paragraph.get_text().encode('utf-8').strip() + '\n\n'
    ArticleDict = {'title' : checkElement(
                        soup.find('div', attrs = {'class':'articleOpinion-title--container'}),'title'),
                   #Collapse runs of whitespace inside the byline.
                   'author' : ' '.join(checkElement(
                        soup.find('span', attrs = {'class':'articleOpinion-byline'}),
                        'author').split()),
                   'body_text' : inText,
                   'source' : 'Al Jazeera',
                   'URL' : URL}
    date = soup.find('span', attrs = {'class':'date'})
    time = soup.find('span', attrs = {'class':'time'})
    if date != None:
        if time != None:
            #Date and time both present; pad AM/PM so '%I %M %p' parses.
            ArticleDict['date'] = (date.get_text() + ' ' + time.get_text()).encode('utf-8')
            ArticleDict['timestamp'] = getTime(ArticleDict['date'].replace('AM', ' AM ').replace('PM', ' PM '),
                                               [','], [':'], '%B %d %Y %I %M %p %Z')
        else:
            #Date only: parse without the time-of-day tokens.
            ArticleDict['date'] = date.get_text().encode('utf-8')
            ArticleDict['timestamp'] = getTime(ArticleDict['date'].replace('AM', ' AM ').replace('PM', ' PM '),
                                               [','], [':'], '%B %d %Y')
    else:
        #No date on the page: fall back to the scrape time.
        ArticleDict['date'] = 'Unknown'
        ArticleDict['timestamp'] = datetime.datetime.now()
    #Lead image: last raw markup line with an inline background-image at
    #the 1460px size.
    imageList = soup.prettify().split('\n')
    for cell in imageList:
        if 'background-image' in cell and '1460' in cell:
            curImage = cell
    #Extract the single-quoted URL; a harmless no-op (stays '') when no
    #line matched above.
    curImage = curImage[(curImage.find("'") + 1):]
    curImage = curImage[:curImage.find("'")]
    if curImage.strip() != '':
        ArticleDict['image'] = 'http://america.aljazeera.com' + curImage
    return ArticleDict
def scrapeArticle(URL):
    #scrapes CNN article
    #removes links that are not part of the main article
    #returns author, last updated date, title, image link, and the body text in a dictionary
    soup = BeautifulSoup(urllib.urlopen(URL))
    #Text of every link on the page; used to drop paragraphs that are
    #nothing but a link.
    linkList = map(lambda x: x.get_text(), soup.find_all('a'))
    inText = ''
    for para in soup.find_all('p', attrs = {'class': 'zn-body__paragraph'}):
        if para.get_text() not in linkList:
            #Skip quote and emphasis blocks.
            if para.q == None and para.em == None:
                inText += (para.get_text().strip() + '\n\n')
    date = checkElement(soup.find('div', attrs = {'class': 'cnn_strytmstmp'}), 'date')
    ArticleDict = {'title' : checkElement(soup.find('h2', attrs = {'class': 'pg-headline'}), 'title'),
                   'author' : checkElement(
                        soup.find('span', attrs = {'class': 'metadata__byline__author'}),
                        'author'),
                   'source' : 'CNN',
                   'body_text' : inText,
                   'date' : date,
                   'URL' : URL,
                   #Leading '0' pads the hour so '%I' parses single digits.
                   'timestamp' : getTime('0' + date.replace('Update', ''),
                                         [','] ,[':'], '%I %M %p %Z %a %B %d %Y')}
    image = soup.find('div', attrs = {'class' : 'cnn_stryimg640captioned'})
    if image != None:
        #Captioned story image: pull the src attribute out of the markup.
        image = BeautifulSoup(str(image)).prettify()
        for cell in image.split(' '):
            if 'src' in cell:
                ArticleDict['image'] = cell[cell.find('http'):].replace('"', '')
    elif soup.find('div', attrs = {'class':'cnnStryVidCont'}) != None:
        #Video page: pull the player's thumbnail URL from the raw markup.
        #BUG FIX: the original reused one variable for both the list of
        #raw lines and the matched substring, then unconditionally called
        #.find() on it - raising AttributeError (list has no .find) when
        #no 'thumb:' line existed. Track the matched line separately and
        #only slice when a match was found.
        thumbLine = None
        for cell in soup.prettify().split('\n'):
            if 'thumb:' in cell:
                thumbLine = cell[cell.find('http'):]
        if thumbLine != None:
            #Trim at the closing quote (equivalent to the original's
            #arithmetic slice); empty string when the quote is missing.
            quote = thumbLine.find("'")
            ArticleDict['image'] = thumbLine[:quote] if quote != -1 else ''
    else:
        #Gallery page: take the src from the gallery photo container.
        image = soup.find('div', attrs = {'class' : 'cnnArticleGalleryPhotoContainer'})
        if image != None:
            image = BeautifulSoup(str(image)).prettify().split(' ')
            for cell in image:
                if 'src' in cell:
                    ArticleDict['image'] = cell[cell.find('http'):].replace('"', '')
    return ArticleDict
def scrapeArticle(URL):
    #Indexes title, date, body, and author of given article
    #returns values in a dictionary
    bText = ''
    soup = BeautifulSoup(urllib.urlopen(URL))
    #Text of every link on the page (Python 2 map -> list).
    linkList = map(lambda x: x.get_text(), soup.find_all('a'))
    #Follow the "single page" link, if present, so the whole article is
    #fetched at once.
    pageStatus = soup.find('div', attrs={'class': 'singlepage'})
    if pageStatus != None:
        soup = BeautifulSoup(
            urllib.urlopen('http://abcnews.go.com' + pageStatus.a.get('href')))
    for paragraph in soup.find_all('p', attrs={'itemprop': 'articleBody'}):
        #NOTE(review): paragraph.a is a Tag (or None) while linkList holds
        #strings, so this membership test looks like it can never match -
        #i.e. the link filter is a no-op. Probably intended to compare
        #paragraph.get_text(); confirm before changing.
        if paragraph.a not in linkList:
            bText += paragraph.get_text()
    date = checkElement(soup.find('div', attrs={'class': 'date'}), 'date')
    image = soup.find('div', attrs={'class': 'main_media'})
    ArticleDict = {
        'title': checkElement(soup.find('h1', True), 'title'),
        #Title-case the first byline line, then normalize the "And"/"Abc"
        #artifacts that title-casing introduces.
        'author': " ".join(
            checkElement(
                soup.find('div', attrs={'class': 'byline'}),
                'author').split('\n\n')[0].lower().title().split()).replace(
            'And', 'and').replace('Abc', 'Source'),
        'body_text': bText.replace('\n', '\n\n').strip(),
        'URL': URL,
        'source': 'ABC News',
        'date': date,
        'timestamp': getTime(date, [',', ':', '.'], [], '%b %d %Y')
    }
    if image != None and image.img != None:
        if image.img.get('src') != None:
            ArticleDict['image'] = image.img.get('src')
    return ArticleDict
def scrapeArticle(URL):
    #Scrapes article at given url.
    #Returns title, author, source, date, body_text (and image when
    #present) in a dictionary.
    soup = BeautifulSoup(urllib.urlopen(URL))
    #The 'source' span appears to hold "-SOURCE - date": collapse the
    #whitespace, drop the leading character, then split on the dash.
    #TODO(review): confirm this format against live pages.
    rawSource = ' '.join(soup.find('span', attrs = {'class' : 'source'}).get_text().split())
    date_source = rawSource[1:].split('-')
    sourceName = date_source[0].strip()
    dateText = date_source[1].strip()
    linkTexts = map(lambda a: a.get_text(), soup.find_all('a'))

    articleDict = {
        'title' : checkElement(soup.find('h1', attrs = {'class' : 'page-headline'}), 'title'),
        'author' : checkElement(soup.find('span', attrs = {'class':'byline'}), 'author'),
        'source' : sourceName,
        'date' : dateText,
        'timestamp' : getTime(dateText, [','], [], '%A %B %d %Y'),
        'URL' : URL,
    }

    #Body: every <p> in the article-text div whose text is not merely one
    #of the page's links.
    body_soup = BeautifulSoup(str(soup.find('div', attrs = {'class' : 'article-text'})))
    pieces = []
    for para in body_soup.find_all('p'):
        if para.get_text() not in linkTexts:
            pieces.append(para.get_text().strip() + '\n\n')
    articleDict['body_text'] = ''.join(pieces)

    image = soup.find('div', attrs = {'class' : 'photo'})
    if image != None and image.img != None and image.img.get('src') != None:
        articleDict['image'] = image.img.get('src')
    return articleDict
def scrapeArticle(URL):
    #Scrapes a Forbes article: title, author, date, body text.
    #Returns the values in a dictionary.
    paraList = []
    ArticleDict = {}
    soup = BeautifulSoup(urllib.urlopen(URL))
    rawList = soup.prettify().split('\n')
    #BUG FIX: the original's else clause reset the date to 'Unknown' on
    #every raw line NOT containing 'published_time', so a date found
    #earlier in the markup was overwritten. Default once, break on match.
    ArticleDict['date'] = 'Unknown'
    for cell in rawList:
        if 'published_time' in cell:
            #Pull the value of the content="..." attribute.
            cell = cell[cell.find('content=') + 9:]
            cell = cell[:cell.find('"')]
            p = cell.split('-')  #ISO yyyy-mm-dd -> mm/dd/yyyy
            ArticleDict['date'] = p[1] + '/' + p[2] + '/' + p[0]
            break
    ArticleDict['timestamp'] = getTime(ArticleDict['date'], [','], ['/'], '%m %d %Y')
    soup2 = BeautifulSoup(str(soup.find('div', attrs = {'class':'body'})))
    for line in soup2.find_all('p'):
        #Unclassed paragraphs hold the article text.
        if line.has_attr('class') == False:
            paraList.append(line.get_text())
    ArticleDict['author'] = checkElement(soup.find('p'),'author')
    #Drop the last two paragraphs - presumably trailing boilerplate
    #(author bio / follow links); TODO confirm against live pages.
    ArticleDict['body_text'] = '\n\n'.join(paraList[:-2])
    ArticleDict['title'] = checkElement(soup.find('h1'), 'title')
    ArticleDict['source'] = 'Forbes'
    ArticleDict['URL'] = URL
    return ArticleDict
def scrapeMoney(URL):
    #scrapes CNN Money article
    #returns author, last updated date, title, and body text in a dictionary
    #strips the links that are not part of the main article
    soup = BeautifulSoup(urllib.urlopen(URL))
    image = soup.find('div', attrs = {'id': 'ie_dottop'})
    #Text of every link on the page; used to drop paragraphs that are
    #nothing but a link.
    linkList = map(lambda x: x.get_text(), soup.find_all('a'))
    body_soup = BeautifulSoup(str(soup.find('div', attrs = {'id': 'storytext'})))
    date = checkElement(soup.find('span', attrs = {'class' : 'cnnDateStamp'}), 'date')
    #The <h2> holds the article's lead/subhead; start the body with it.
    inText = checkElement(soup.find('h2'), 'header') + '\n\n'
    for paragraph in body_soup.find_all('p')[1:]:
        #Paragraphs containing a <span> are chrome, not story text.
        if paragraph.find('span') == None:
            #BUG FIX: the original compared a string against the whole
            #list ("paragraph.a.get_text() != linkList"), which is always
            #true, so the link filter never removed anything. Match the
            #CNN scraper's pattern: skip paragraphs whose entire text is
            #one of the page's links.
            if paragraph.a == None or paragraph.get_text() not in linkList:
                #[1:] drops the leading character of each paragraph.
                inText += paragraph.get_text()[1:] + '\n\n'
    in_soup = BeautifulSoup(str(soup.find('div', attrs = {'id': 'storycontent'})))
    moneyDict = {'title' : checkElement(in_soup.find('h1'), 'title'),
                 'author' : checkElement(
                      soup.find('span', attrs = {'class' : 'byline'}), 'author'),
                 'date' : date,
                 'source' : 'CNN Money',
                 'body_text' : inText,
                 'URL' : URL,
                 'timestamp' : getTime(date, [','] ,[':'], '%B %d %Y %I %M %p %Z')}
    if image != None and image.img != None:
        src = image.img.get('src')
        if src != None:
            moneyDict['image'] = src
        else:
            #Fall back to the large body figure.
            img2 = soup.find('figure', attrs = {'class': 'body_img body_img--620'})
            #BUG FIX: also guard img2.img - the original dereferenced
            #img2.img.get('src') and crashed when the figure had no <img>.
            if img2 != None and img2.img != None:
                moneyDict['image'] = img2.img.get('src')
    return moneyDict