Example #1
File: sources.py Project: iceberg273/ACE
    def parse_article(self, html):

        soup = super(SageSource, self).parse_article(html)  # Do some preprocessing
        if not soup: return False

        # To download tables, we need the content URL and the number of tables
        content_url = soup.find('meta', {'name': 'citation_public_url'})['content']

        n_tables = len(soup.find_all('span', class_='table-label'))

        # Now download each table and parse it
        tables = []
        for i in range(n_tables):
            t_num = i+1
            url = '%s/T%d.expansion.html' % (content_url, t_num)
            table_html = scrape.get_url(url)
            table_html = self.decode_html_entities(table_html)
            table_soup = BeautifulSoup(table_html)
            tc = table_soup.find(class_='table-expansion')
            t = tc.find('table', {'id': 'table-%d' % (t_num)})
            t = self.parse_table(t)
            if t: 
                t.number = t_num
                t.title = tc.find(class_='table-label').text
                try:
                    t.caption = tc.find(class_='table-caption').get_text()
                except AttributeError:  # this table has no caption
                    pass
                try:
                    t.notes = tc.find(class_='table-footnotes').get_text()
                except AttributeError:  # this table has no footnotes
                    pass
                tables.append(t)

        self.article.tables = tables
        return self.article
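The snippet above leans on two helpers that are not shown, scrape.get_url and self.decode_html_entities. As a rough sketch only (the real ACE helper may behave differently), the entity-decoding step could be approximated with the standard library:

import html

def decode_html_entities(text):
    # Hypothetical stand-in for the decode_html_entities helper used above:
    # convert entities such as &amp; or &#8211; back to plain characters
    # before the table markup is re-parsed.
    return html.unescape(text)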
Example #2
File: main.py Project: Ouasfi/JobFinder
def FindJobs(jobTitle: str, jobType: str, filterPattern: str):
    base_url = "https://www.indeed.fr"
    url = scrape.get_url(base_url, jobTitle, jobType)
    df = scrape.get_offers(base_url, url)
    df_pr = process.process_offers(df, filterPattern)
    df_pr.to_csv('offers.csv')
    return df_pr
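Here scrape.get_url is used differently: rather than fetching a page, it appears to build the Indeed search URL from the job title and contract type. A hypothetical sketch of that step (the query-string keys and the /emplois path are assumptions, not taken from the JobFinder code):

from urllib.parse import urlencode

def build_search_url(base_url: str, job_title: str, job_type: str) -> str:
    # Hypothetical sketch of what scrape.get_url(base_url, jobTitle, jobType)
    # might assemble; the parameter names 'q' and 'jt' are assumptions.
    params = {'q': job_title, 'jt': job_type}
    return '%s/emplois?%s' % (base_url, urlencode(params))

# e.g. build_search_url("https://www.indeed.fr", "data scientist", "fulltime")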
Example #3
File: sources.py Project: iceberg273/ACE
    def parse_article(self, html):
        soup = super(JournalOfCognitiveNeuroscienceSource, self).parse_article(html)
        if not soup: return False

        # To download tables, we need the DOI and the number of tables
        m = re.search(r'<meta.*content="http://dx.doi.org/(10\.1162/jocn_a_00371)["\s]+', html)
        doi = m.group(1)

        pattern = re.compile(r'^T\d+$')
        n_tables = len(soup.find_all('table', {'id': pattern }))

        tables = []

        # Now download each table and parse it
        for i in range(n_tables):
            url = 'http://www.mitpressjournals.org/action/showPopup?citid=citart1&id=T%d&doi=%s' % (i+1, doi)
            table_html = scrape.get_url(url)
            table_html = self.decode_html_entities(table_html)
            table_soup = BeautifulSoup(table_html)
            t = table_soup.find('table').find('table')  # JCogNeuro nests tables 2-deep
            t = self.parse_table(t)
            if t: tables.append(t)

        self.article.tables = tables
        return self.article
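Note that the regex here matches one specific article's DOI (10.1162/jocn_a_00371) rather than a general pattern. If a generic extraction were wanted, a sketch along the following lines could work; the exact layout of the <meta> tag is an assumption:

import re

def extract_doi(html):
    # Hypothetical generalisation of the hard-coded pattern above: capture
    # whatever DOI follows dx.doi.org in the citation <meta> tag.
    m = re.search(r'<meta[^>]*content="https?://dx\.doi\.org/(10\.\d{4,9}/[^"\s]+)"', html)
    return m.group(1) if m else None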
Example #4
File: sources.py Project: iceberg273/ACE
    def parse_article(self, html):
        soup = super(JournalOfCognitiveNeuroscienceSource,
                     self).parse_article(html)
        if not soup: return False

        # To download tables, we need the DOI and the number of tables
        m = re.search(
            r'<meta.*content="http://dx.doi.org/(10\.1162/jocn_a_00371)["\s]+',
            html)
        doi = m.group(1)

        pattern = re.compile(r'^T\d+$')
        n_tables = len(soup.find_all('table', {'id': pattern}))

        tables = []

        # Now download each table and parse it
        for i in range(n_tables):
            url = 'http://www.mitpressjournals.org/action/showPopup?citid=citart1&id=T%d&doi=%s' % (
                i + 1, doi)
            table_html = scrape.get_url(url)
            table_html = self.decode_html_entities(table_html)
            table_soup = BeautifulSoup(table_html)
            t = table_soup.find('table').find(
                'table')  # JCogNeuro nests tables 2-deep
            t = self.parse_table(t)
            if t: tables.append(t)

        self.article.tables = tables
        return self.article
Example #5
    def _download_table(self, url):
        ''' For Sources that have tables in separate files, a helper for 
        downloading and extracting the table data. Also saves to file if desired.
        '''

        delay = self.delay if hasattr(self, 'delay') else 0

        if self.table_dir is not None:
            filename = '%s/%s' % (self.table_dir, url.replace('/', '_'))
            if os.path.exists(filename):
                # Reuse a previously saved copy of the table
                with open(filename, encoding='utf-8') as f:
                    table_html = f.read()
            else:
                table_html = scrape.get_url(url, delay=delay)
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(table_html)
        else:
            table_html = scrape.get_url(url, delay=delay)

        table_html = self.decode_html_entities(table_html)
        return BeautifulSoup(table_html)
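For context, scrape.get_url itself is not shown in these examples. A minimal sketch of what a fetcher with a politeness delay might look like (the real ACE scrape module likely adds user-agent handling, retries, and caching):

import time
import urllib.request

def get_url(url, delay=0, timeout=30):
    # Hypothetical minimal stand-in for scrape.get_url: honour the
    # politeness delay, then fetch and decode the page body.
    if delay:
        time.sleep(delay)
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        return resp.read().decode('utf-8', errors='replace')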
Example #6
File: sources.py Project: NBCLab/ACE
    def _download_table(self, url):
        ''' For Sources that have tables in separate files, a helper for 
        downloading and extracting the table data. Also saves to file if desired.
        '''

        delay = self.delay if hasattr(self, 'delay') else 0

        if self.table_dir is not None:
            filename = '%s/%s' % (self.table_dir, url.replace('/', '_'))
            if os.path.exists(filename):
                # Reuse a previously saved copy of the table
                with open(filename, encoding='utf-8') as f:
                    table_html = f.read()
            else:
                table_html = scrape.get_url(url, delay=delay)
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(table_html)
        else:
            table_html = scrape.get_url(url, delay=delay)

        table_html = self.decode_html_entities(table_html)
        return BeautifulSoup(table_html)
Example #7
File: sources.py Project: iceberg273/ACE
    def parse_article(self, html):

        soup = super(SageSource,
                     self).parse_article(html)  # Do some preprocessing
        if not soup: return False

        # To download tables, we need the content URL and the number of tables
        content_url = soup.find('meta',
                                {'name': 'citation_public_url'})['content']

        n_tables = len(soup.find_all('span', class_='table-label'))

        # Now download each table and parse it
        tables = []
        for i in range(n_tables):
            t_num = i + 1
            url = '%s/T%d.expansion.html' % (content_url, t_num)
            table_html = scrape.get_url(url)
            table_html = self.decode_html_entities(table_html)
            table_soup = BeautifulSoup(table_html)
            tc = table_soup.find(class_='table-expansion')
            t = tc.find('table', {'id': 'table-%d' % (t_num)})
            t = self.parse_table(t)
            if t:
                t.number = t_num
                t.title = tc.find(class_='table-label').text
                try:
                    t.caption = tc.find(class_='table-caption').get_text()
                except AttributeError:  # this table has no caption
                    pass
                try:
                    t.notes = tc.find(class_='table-footnotes').get_text()
                except AttributeError:  # this table has no footnotes
                    pass
                tables.append(t)

        self.article.tables = tables
        return self.article