def process_page(self, url, soup):
    """Extract one article page into (path_suffix, index record, payload).

    Parameters:
        url: page URL; its last two path segments (query string and
            '.html' suffix stripped) name the output file.
        soup: parsed BeautifulSoup tree of the page.

    Returns:
        (path_suffix, record, payload) where payload maps the articles/
        abstracts output directories to the text to write there.

    Raises:
        Exception: when the main content container is missing.
    """
    # Drop all javascript and stylesheet nodes so .text yields clean prose.
    for script in soup(["script", "style"]):
        script.extract()

    content = soup.find(class_='db-contentScn')
    if not content:
        log.error('content extraction failed')
        verbose('content extraction failed')
        log.error('{}'.format(url))
        # Carry the failing URL in the exception instead of a bare Exception.
        raise Exception('content extraction failed: {}'.format(url))

    verbose(' Content:=')
    # NOTE(review): len() of a bs4 Tag counts direct children, not characters.
    verbose(' size: {}'.format(len(content)))

    year, month = self.extract_year_month(url, soup)
    log.info('year, month = {}, {}'.format(year, month))
    verbose(' year/month: {}/{}'.format(year, month))

    # File name: last two URL path segments joined with '___'.
    name = '___'.join(url.split('?')[0].split('/')[-2:]).replace('.html', '')

    log.debug(content)
    paras = content.findAll('p')
    log.debug(pformat(paras))

    path_suffix = '{}/{}/{}.txt'.format(year, self.month_alias[month], name)
    for d in self.SUBDIRS:
        mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

    page_content = '\n'.join(p.text for p in paras)
    # The first paragraph doubles as the abstract.
    page_abstract = paras[0].text.strip()

    title = soup.find('h1')
    breadcrumbs = soup.find(class_='breadCrums').findAll('a')
    breadcrumbs = ','.join(b.text.replace('\n', '').replace('\r', '')
                           for b in breadcrumbs)
    tags = soup.find(class_='tglst').findAll('a')
    tags = ','.join(b.text.replace('\n', '').replace('\r', '')
                    for b in tags)

    log.info(title.text)
    log.info(breadcrumbs)
    log.info(tags)

    record = '{}|{}|{}|{}|{}'.format(path_suffix.strip(), url,
                                     title.text.strip(), breadcrumbs, tags)
    return (path_suffix, record, {
        self.ARTICLES_DIR: page_content,
        self.ABSTRACTS_DIR: page_abstract,
    })
def process_page(self, page_name, soup):
    """Extract one article page into (path_suffix, index record, payload).

    The output file name is the URL slug matched from
    ``ROOT_URL/<...>/<slug>.html``; when the URL does not match, a global
    running counter ``uid_`` supplies a unique numeric name instead.

    Parameters:
        page_name: page URL.
        soup: parsed BeautifulSoup tree of the page.

    Returns:
        (path_suffix, record, payload) where payload maps the articles/
        abstracts output directories to the text to write there.

    Raises:
        Exception: when the main content container is missing.
    """
    global uid_
    # Drop all javascript and stylesheet nodes so .text yields clean prose.
    for script in soup(["script", "style"]):
        script.extract()

    content = soup.find(class_='article')
    if not content:
        log.error('content extraction failed')
        verbose('content extraction failed')
        log.error('{}'.format(page_name))
        # Carry the failing URL in the exception instead of a bare Exception.
        raise Exception('content extraction failed: {}'.format(page_name))

    verbose(' Content:=')
    # NOTE(review): len() of a bs4 Tag counts direct children, not characters.
    verbose(' size: {}'.format(len(content)))

    year, month = self.extract_year_month(page_name, soup)
    log.info('year, month = {}, {}'.format(year, month))
    verbose(' year/month: {}/{}'.format(year, month))

    # Fixed regex: the '.' before 'html' is now escaped (it previously
    # matched any character) and ROOT_URL is re.escape()d so dots in the
    # domain match literally instead of acting as wildcards.
    m = re.search(r'{}/.*/([^/]+)\.html'.format(re.escape(self.ROOT_URL)),
                  page_name)
    if m:
        log.debug(pformat(m))
        name = m.group(1)
    else:
        uid_ += 1
        name = '{}'.format(uid_)

    log.debug(content)
    paras = content.findAll('p')
    log.debug(pformat(paras))

    path_suffix = '{}/{}/{}.txt'.format(year, self.month_alias[month], name)
    for d in self.SUBDIRS:
        mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

    page_content = '\n'.join(p.text for p in paras)
    # The first paragraph doubles as the abstract.
    page_abstract = paras[0].text.strip()

    title = soup.find(class_='headline')
    record = '{}|{}'.format(path_suffix.strip(), title.text.strip())
    log.info(title.text)

    return (path_suffix, record, {
        self.ARTICLES_DIR: page_content,
        self.ABSTRACTS_DIR: page_abstract,
    })
def process_page(self, url, soup):
    """Extract one article page into (path_suffix, index record, payload).

    Best-effort: any error during extraction is logged and the method
    returns None (callers are expected to tolerate a missing result).

    Parameters:
        url: page URL; its last two path segments name the output file.
        soup: parsed BeautifulSoup tree of the page.

    Returns:
        (path_suffix, record, payload) on success, None on processing error.

    Raises:
        Exception: when the main content container is missing.
    """
    # Drop javascript, stylesheet and iframe nodes so .text is clean prose.
    for script in soup(["script", "style", "iframe"]):
        script.extract()

    content = soup.find(class_='_picCon _disable_copy _munchDiscuss')
    if not content:
        log.error('content extraction failed')
        verbose('content extraction failed')
        log.error('{}'.format(url))
        # Carry the failing URL in the exception instead of a bare Exception.
        raise Exception('content extraction failed: {}'.format(url))

    try:
        verbose('content extraction Success')
        verbose(' Content:=')
        # NOTE(review): len() of a bs4 Tag counts children, not characters.
        verbose(' size: {}'.format(len(content)))

        year, month = self.extract_year_month(url, soup)
        log.info('year, month = {}, {}'.format(year, month))
        verbose(' year/month: {}/{}'.format(year, month))

        # File name: last two URL path segments joined with '___'.
        name = '___'.join(url.split('?')[0].split('/')[-2:]).replace(
            '.html', '')
        log.debug(content)

        path_suffix = '{}/{}/{}.txt'.format(year, self.month_alias[month],
                                            name)
        for d in self.SUBDIRS:
            mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

        # Whole-container text; strip non-breaking spaces.
        page_content = content.text.replace(u'\xa0', '')
        page_abstract = soup.find(class_='small_intro').text.strip()

        title = soup.find(class_='arH LineHiet')
        breadcrumbs = soup.find(class_='breadcrumbs').findAll('a')
        breadcrumbs = ','.join(b.text.replace('\n', '').replace('\r', '')
                               for b in breadcrumbs)
        tags = soup.find(class_='_tag pb-0 pb-md-3').findAll('a')
        tags = ','.join(b.text.replace('\n', '').replace('\r', '')
                        for b in tags)

        log.info(title.text)
        log.info(breadcrumbs)
        log.info(tags)

        record = '{}|{}|{}|{}|{}'.format(path_suffix.strip(), url,
                                         title.text.strip(), breadcrumbs,
                                         tags)
        return (path_suffix, record, {
            self.ARTICLES_DIR: page_content,
            self.ABSTRACTS_DIR: page_abstract,
        })
    except Exception:
        # Narrowed from a bare 'except:' (which also swallowed
        # KeyboardInterrupt/SystemExit); log the traceback, keep the
        # original best-effort behavior of returning None.
        log.exception('error while processing {}'.format(url))
        verbose("Error while processing")
def process_page(self, url, soup):
    """Extract one article page into (path_suffix, index record, payload).

    Best-effort: any error during extraction is logged and the method
    returns None (callers are expected to tolerate a missing result).

    Parameters:
        url: page URL; path segments [-5:-2] name the output file.
        soup: parsed BeautifulSoup tree of the page.

    Returns:
        (path_suffix, record, payload) on success, None on processing error.

    Raises:
        Exception: when the article container or its text node is missing.
    """
    # Drop all javascript and stylesheet nodes so .text yields clean prose.
    for script in soup(["script", "style"]):
        script.extract()

    # Fixed: the original chained .find() calls raised AttributeError
    # before the guard whenever the 'article' container was absent; now
    # both missing-container cases flow into the guard below.
    article = soup.find(class_='article')
    content = article.find('arttextxml') if article else None
    if not content:
        log.error('content extraction failed')
        verbose('content extraction failed')
        log.error('{}'.format(url))
        # Carry the failing URL in the exception instead of a bare Exception.
        raise Exception('content extraction failed: {}'.format(url))

    try:
        verbose('content extraction Success')
        verbose(' Content:=')
        # NOTE(review): len() of a bs4 Tag counts children, not characters.
        verbose(' size: {}'.format(len(content)))

        year, month = self.extract_year_month(url, soup)
        log.info('year, month = {}, {}'.format(year, month))
        verbose(' year/month: {}/{}'.format(year, month))

        # File name: URL path segments [-5:-2] joined with '___'.
        name = '___'.join(url.split('?')[0].split('/')[-5:-2]).replace(
            '.html', '')
        log.debug(content)

        path_suffix = '{}/{}/{}.txt'.format(year, self.month_alias[month],
                                            name)
        for d in self.SUBDIRS:
            mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

        page_content = content.text
        page_abstract = soup.find(class_='artsyn').text

        title = soup.find(class_='leftmain').findAll('h1')[0]
        verbose(title)
        breadcrumbs = soup.find(class_='breadcrumb').findAll('li')
        breadcrumbs = ','.join(b.text.replace('\n', '').replace('\r', '')
                               for b in breadcrumbs)
        # '|' is the record field separator, so strip it out of tag text.
        tags = soup.find(class_='keyinfo').findAll('a')
        tags = ','.join(
            b.text.replace('\n', '').replace('\r', '').replace('|', '')
            for b in tags)

        log.info(title.text)
        log.info(breadcrumbs)
        log.info(tags)

        record = '{}|{}|{}|{}|{}'.format(path_suffix.strip(), url,
                                         title.text.strip(), breadcrumbs,
                                         tags)
        return (path_suffix, record, {
            self.ARTICLES_DIR: page_content,
            self.ABSTRACTS_DIR: page_abstract,
        })
    except Exception:
        # Narrowed from a bare 'except:' (which also swallowed
        # KeyboardInterrupt/SystemExit); log the traceback, keep the
        # original best-effort behavior of returning None.
        log.exception('error while processing {}'.format(url))
        verbose("Error while processing")
def process_page(self, url, soup):
    """Extract one article page into (path_suffix, index record, payload).

    Best-effort: any error during extraction is logged and the method
    returns None (callers are expected to tolerate a missing result).
    This site variant exposes no tag list, so the tags field is empty.

    Parameters:
        url: page URL; its last two path segments name the output file.
        soup: parsed BeautifulSoup tree of the page.

    Returns:
        (path_suffix, record, payload) on success, None on processing error.

    Raises:
        Exception: when the main content container is missing.
    """
    # Drop all javascript and stylesheet nodes so .text yields clean prose.
    for script in soup(["script", "style"]):
        script.extract()

    content = soup.find(class_='rightsec')
    if not content:
        log.error('content extraction failed')
        verbose('content extraction failed')
        log.error('{}'.format(url))
        # Carry the failing URL in the exception instead of a bare Exception.
        raise Exception('content extraction failed: {}'.format(url))

    try:
        verbose('content extraction Success')
        verbose(' Content:=')
        # NOTE(review): len() of a bs4 Tag counts children, not characters.
        verbose(' size: {}'.format(len(content)))

        year, month = self.extract_year_month(url, soup)
        log.info('year, month = {}, {}'.format(year, month))
        verbose(' year/month: {}/{}'.format(year, month))

        # File name: last two URL path segments joined with '___'.
        name = '___'.join(url.split('?')[0].split('/')[-2:]).replace(
            '.html', '')
        log.debug(content)

        paras = content.findAll('p')
        log.debug(pformat(paras))

        path_suffix = '{}/{}/{}.txt'.format(year, self.month_alias[month],
                                            name)
        for d in self.SUBDIRS:
            mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

        page_content = '\n'.join(p.text for p in paras)
        page_abstract = soup.find(class_='synopsis').text

        title = soup.find(class_='storytop').find('h1')
        breadcrumbs = soup.find_all(
            'div', class_=['breadcrumb', 'MT30'])[0].find('ul').find_all('li')
        # Strip newlines and the '»' separators rendered between crumbs.
        breadcrumbs = ','.join(
            b.text.replace('\n', '').replace('\r', '').replace('»', '').strip()
            for b in breadcrumbs)
        tags = ""  # No tags on this site variant.

        log.info(title.text)
        log.info(breadcrumbs)
        log.info(tags)

        record = '{}|{}|{}|{}|{}'.format(path_suffix.strip(), url,
                                         title.text.strip(), breadcrumbs,
                                         tags)
        return (path_suffix, record, {
            self.ARTICLES_DIR: page_content,
            self.ABSTRACTS_DIR: page_abstract,
        })
    except Exception:
        # Narrowed from a bare 'except :' (which also swallowed
        # KeyboardInterrupt/SystemExit); log the traceback, keep the
        # original best-effort behavior of returning None.
        log.exception('error while processing {}'.format(url))
        verbose("Error while processing")
sys.setrecursionlimit(10000) # Open txt with number of lo and creates and output file with all the urls. # This is important because some of the URL's need to be changed by hand. file_lo_txt = "lo.txt" output_url_file = "url_lo.txt" # Need to generate an exception list in the form lo_id -> url exceptions = [("ca3", "https://www.rcophth.ac.uk/learningoutcomes/12658/"), ("ps24", "https://www.rcophth.ac.uk/learningoutcomes/ps24-2/")] url_array = url_generator(file_lo_txt, output_url_file, exceptions) # Creating a path to html tables pwd = Path(os.getcwd()) html_path = pwd / "html_tables" xls_path = pwd / "xls" mkdir(pwd, "html_tables") mkdir(pwd, "xls") lo_content = [] print( f"Would you like to collect all {len(url_array)} learning outcomes?:[y/n]") answer = input() pick_content = [] pick_headings = [] pick_title = [] pick_synopsis = [] if answer == "y": # begin of loop to access all the learning outcomes.