def process_page(self, url, soup):
        global uid_
        # remove all javascript and stylesheet code
        for script in soup(["script", "style"]): 
            script.extract()

        content = soup.find(class_='db-contentScn')

        if not content:
            log.error('content extraction failed')
            verbose('content extraction failed')
            log.error('{}'.format(url))
            raise Exception('content extraction failed: {}'.format(url))
        else:
            verbose(' Content:=')
            verbose('  size: {}'.format(len(content)))        
            year, month = self.extract_year_month(url, soup)
            log.info('year, month = {}, {}'.format(year, month))

            verbose('  year/month: {}/{}'.format(year, month))
            name = '___'.join(
                url.split('?')[0].split('/')[-2:]
            ).replace('.html', '')

            log.debug(content)
            paras = content.findAll('p')
            log.debug(pformat(paras))

            path_suffix = '{}/{}/{}.txt'.format(year, self.month_alias[month], name)

            for d in self.SUBDIRS:
                mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

            
            page_content  = '\n'.join(p.text for p in paras)
            page_abstract = paras[0].text.strip()
            title         = soup.find('h1')

            breadcrumbs  = soup.find(class_='breadCrums').findAll('a')
            breadcrumbs  = ','.join([b.text.replace('\n', '').replace('\r', '')
                                for b in breadcrumbs])

            tags = soup.find(class_='tglst').findAll('a')
            tags = ','.join([b.text.replace('\n', '').replace('\r', '')
                                for b in tags])
                                
            log.info(title.text)
            log.info(breadcrumbs)
            log.info(tags)

            record = '{}|{}|{}|{}|{}'.format(path_suffix.strip(), url, title.text.strip(),
                                breadcrumbs, tags)

            return (path_suffix,
                    record, 
                    {
                        self.ARTICLES_DIR : page_content
                        , self.ABSTRACTS_DIR: page_abstract
                    }
            )
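# Every process_page variant in this collection returns the same tuple shape:
# (path_suffix, record, {subdir: text}). The caller below is not part of the
# original code. It is a minimal sketch, assuming the record lines go into a
# pipe-separated index file and each payload is written under its subdirectory;
# the name save_page and the index filename are made up for illustration.
def save_page(result, index_file='index.psv'):
    path_suffix, record, dir_to_text = result
    # append the pipe-separated metadata record to a running index file
    with open(index_file, 'a', encoding='utf-8') as index:
        index.write(record + '\n')
    # write each payload (article body, abstract, ...) under its target subdir
    for subdir, text in dir_to_text.items():
        with open('{}/{}'.format(subdir, path_suffix), 'w', encoding='utf-8') as out:
            out.write(text)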
    def process_page(self, page_name, soup):
        global uid_
        # remove all javascript and stylesheet code
        for script in soup(["script", "style"]): 
            script.extract()

        content = soup.find(class_='article')

        if not content:
            log.error('content extraction failed')
            verbose('content extraction failed')
            log.error('{}'.format(page_name))
            raise Exception('content extraction failed: {}'.format(page_name))
        else:
            verbose(' Content:=')
            verbose('  size: {}'.format(len(content)))        
            year, month = self.extract_year_month(page_name, soup)
            log.info('year, month = {}, {}'.format(year, month))

            verbose('  year/month: {}/{}'.format(year, month))
            m = re.search(r'{}/.*/([^/]+)\.html'.format(self.ROOT_URL), page_name)
            if m:
                log.debug(pformat(m))
                name = m.group(1)
            else:
                uid_ += 1
                name = '{}'.format(uid_)

            log.debug(content)
            paras = content.findAll('p')
            log.debug(pformat(paras))

            path_suffix = '{}/{}/{}.txt'.format(year, self.month_alias[month], name)

            for d in self.SUBDIRS:
                mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

            
            page_content  = '\n'.join(p.text for p in paras)
            page_abstract = paras[0].text.strip()
            title         = soup.find(class_='headline')
            record        = '{}|{}'.format(path_suffix.strip(), title.text.strip())

            log.info(title.text)

            return (path_suffix,
                    record, 
                    {
                        self.ARTICLES_DIR : page_content
                        , self.ABSTRACTS_DIR: page_abstract
                    }
            )
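# extract_year_month and month_alias are used by every variant but are not
# shown in this collection. A hypothetical sketch follows, assuming the date
# is embedded in the URL as .../YYYY/MM/...; the real helpers may instead
# parse the date out of the page markup.
import re

month_alias = {i: name for i, name in enumerate(
    ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
     'jul', 'aug', 'sep', 'oct', 'nov', 'dec'], start=1)}

def extract_year_month(url, soup):
    m = re.search(r'/(\d{4})/(\d{1,2})/', url)
    if not m:
        raise ValueError('no date found in {}'.format(url))
    return int(m.group(1)), int(m.group(2))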
Example #3
    def process_page(self, url, soup):
        global uid_
        # remove all javascript and stylesheet code
        for script in soup(["script", "style", "iframe"]):
            script.extract()

        content = soup.find(class_='_picCon _disable_copy _munchDiscuss')

        if not content:
            log.error('content extraction failed')
            verbose('content extraction failed')
            log.error('{}'.format(url))
            raise Exception('content extraction failed: {}'.format(url))
        else:
            try:
                verbose('content extraction succeeded')
                verbose(' Content:=')
                verbose('  size: {}'.format(len(content)))
                year, month = self.extract_year_month(url, soup)
                log.info('year, month = {}, {}'.format(year, month))

                verbose('  year/month: {}/{}'.format(year, month))
                name = '___'.join(url.split('?')[0].split('/')[-2:]).replace(
                    '.html', '')

                log.debug(content)
                # paras = content.findAll('p')
                # log.debug(pformat(paras))

                path_suffix = '{}/{}/{}.txt'.format(year,
                                                    self.month_alias[month],
                                                    name)

                for d in self.SUBDIRS:
                    mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

                # page_content = '\n'.join(p.text for p in paras)
                page_content = content.text.replace(u'\xa0', '')
                page_abstract = soup.find(class_='small_intro').text.strip()
                title = soup.find(class_='arH LineHiet')

                breadcrumbs = soup.find(class_='breadcrumbs').findAll('a')
                breadcrumbs = ','.join([
                    b.text.replace('\n', '').replace('\r', '')
                    for b in breadcrumbs
                ])

                tags = soup.find(class_='_tag pb-0 pb-md-3').findAll('a')
                tags = ','.join(
                    [b.text.replace('\n', '').replace('\r', '') for b in tags])
                log.info(title.text)
                log.info(breadcrumbs)
                log.info(tags)

                record = '{}|{}|{}|{}|{}'.format(path_suffix.strip(), url,
                                                 title.text.strip(),
                                                 breadcrumbs, tags)
                return (path_suffix, record, {
                    self.ARTICLES_DIR: page_content,
                    self.ABSTRACTS_DIR: page_abstract
                })
            except Exception as e:
                verbose("Error while processing")
                log.error('error while processing {}: {}'.format(url, e))
    def process_page(self, url, soup):
        global uid_
        # remove all javascript and stylesheet code
        for script in soup(["script", "style"]):
            script.extract()

        article = soup.find(class_='article')
        content = article.find('arttextxml') if article else None

        if not content:
            log.error('content extraction failed')
            verbose('content extraction failed')
            log.error('{}'.format(url))
            raise Exception('content extraction failed: {}'.format(url))
        else:
            try:
                verbose('content extraction succeeded')
                verbose(' Content:=')
                verbose('  size: {}'.format(len(content)))
                year, month = self.extract_year_month(url, soup)
                log.info('year, month = {}, {}'.format(year, month))

                verbose('  year/month: {}/{}'.format(year, month))
                name = '___'.join(url.split('?')[0].split('/')[-5:-2]).replace(
                    '.html', '')

                log.debug(content)
                # paras = content.findAll('p')
                # log.debug(pformat(paras))

                path_suffix = '{}/{}/{}.txt'.format(year,
                                                    self.month_alias[month],
                                                    name)

                for d in self.SUBDIRS:
                    mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

                page_content = content.text
                page_abstract = soup.find(class_='artsyn').text
                title = soup.find(class_='leftmain').findAll('h1')[0]
                verbose(title)
                breadcrumbs = soup.find(class_='breadcrumb').findAll('li')
                breadcrumbs = ','.join([
                    b.text.replace('\n', '').replace('\r', '')
                    for b in breadcrumbs
                ])

                tags = soup.find(class_='keyinfo').findAll('a')
                tags = ','.join([
                    b.text.replace('\n', '').replace('\r',
                                                     '').replace('|', '')
                    for b in tags
                ])
                log.info(title.text)
                log.info(breadcrumbs)
                log.info(tags)

                record = '{}|{}|{}|{}|{}'.format(path_suffix.strip(), url,
                                                 title.text.strip(),
                                                 breadcrumbs, tags)

                return (path_suffix, record, {
                    self.ARTICLES_DIR: page_content,
                    self.ABSTRACTS_DIR: page_abstract
                })
            except Exception as e:
                verbose("Error while processing")
                log.error('error while processing {}: {}'.format(url, e))
    def process_page(self, url, soup):
        global uid_
        # remove all javascript and stylesheet code
        for script in soup(["script", "style"]):
            script.extract()

        content = soup.find(class_='rightsec')

        if not content:
            log.error('content extraction failed')
            verbose('content extraction failed')
            log.error('{}'.format(url))
            raise Exception('content extraction failed: {}'.format(url))
        else:
            try:
                verbose('content extraction succeeded')
                verbose(' Content:=')
                verbose('  size: {}'.format(len(content)))
                year, month = self.extract_year_month(url, soup)
                log.info('year, month = {}, {}'.format(year, month))

                verbose('  year/month: {}/{}'.format(year, month))
                name = '___'.join(
                    url.split('?')[0].split('/')[-2:]
                ).replace('.html', '')

                log.debug(content)
                paras = content.findAll('p')
                log.debug(pformat(paras))
                
                path_suffix = '{}/{}/{}.txt'.format(year,
                                                    self.month_alias[month], name)

                for d in self.SUBDIRS:
                    mkdir('{}/{}/{}'.format(d, year, self.month_alias[month]))

                page_content = '\n'.join(p.text for p in paras)
                page_abstract = soup.find(class_='synopsis').text
                title = soup.find(class_='storytop').find('h1')
                
                breadcrumbs = soup.find_all('div', class_=['breadcrumb', 'MT30'])[0].find('ul').find_all('li')
                breadcrumbs = ','.join([b.text.replace('\n', '').replace('\r', '').replace('»', '').strip()
                                        for b in breadcrumbs])
                # tags = soup.find(class_='tag-list').findAll('a')
                # tags = ','.join([b.text.replace('\n', '').replace('\r', '')
                #                 for b in tags])
                tags = "" #No tags
                log.info(title.text)
                log.info(breadcrumbs)
                log.info(tags)

                record = '{}|{}|{}|{}|{}'.format(path_suffix.strip(), url, title.text.strip(),
                                                breadcrumbs, tags)
                return (path_suffix,
                        record,
                        {
                            self.ARTICLES_DIR: page_content, self.ABSTRACTS_DIR: page_abstract
                        }
                        )
            except Exception as e:
                verbose("Error while processing")
                log.error('error while processing {}: {}'.format(url, e))
Example #6
import os
import sys
from pathlib import Path

sys.setrecursionlimit(10000)
# Read the txt file listing the learning outcome (LO) numbers and create an
# output file with all of their URLs. This is important because some of the
# URLs need to be corrected by hand.
file_lo_txt = "lo.txt"
output_url_file = "url_lo.txt"
# Need to generate an exception list in the form lo_id -> url
exceptions = [("ca3", "https://www.rcophth.ac.uk/learningoutcomes/12658/"),
              ("ps24", "https://www.rcophth.ac.uk/learningoutcomes/ps24-2/")]
url_array = url_generator(file_lo_txt, output_url_file, exceptions)
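# url_generator is defined elsewhere and not shown. A hypothetical sketch of
# how the exceptions list could be applied, assuming a default URL is derived
# per LO id and exceptional ids override it; treat this purely as an
# illustration of the lo_id -> url mapping, not the actual generator.
def apply_exceptions(lo_id, default_url, exceptions):
    overrides = dict(exceptions)
    return overrides.get(lo_id, default_url)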

# Create paths and directories for the html tables and xls output

pwd = Path(os.getcwd())
html_path = pwd / "html_tables"
xls_path = pwd / "xls"
mkdir(pwd, "html_tables")
mkdir(pwd, "xls")

lo_content = []

print(
    f"Would you like to collect all {len(url_array)} learning outcomes? [y/n]")
answer = input()

pick_content = []
pick_headings = []
pick_title = []
pick_synopsis = []

if answer == "y":
    # Begin the loop that visits all the learning outcomes.