Exemplo n.º 1
0
 def find_google_scholar_page(self, prof: Professor):
     """Look up a professor's Google Scholar profile URL.

     Loads a Google Scholar search for ``author:"<name>" <school>`` and
     looks on the results page for a profile link (``h4.gs_rt2 a``).

     Args:
         prof: Professor to search for; ``simple_name()`` and ``school``
             are used to build the query.

     Returns:
         The ``href`` attribute of the profile anchor, or ``None`` when the
         results page contains no such anchor.
     """
     # Percent-encode both query parts: a school containing '&', '#' or '+'
     # would otherwise be read as URL syntax and corrupt the query.
     # str() keeps the tolerance '%s' formatting had for non-string values.
     name = urllib.parse.quote(prof.simple_name())
     school = urllib.parse.quote(str(prof.school))

     # get search results page
     self.get_page(
         'https://scholar.google.com/scholar?q=author%%3A"%s"+%s' %
         (name, school))

     # look for a matching user profile
     try:
         anchor = self.selenium_driver.find_element_by_css_selector(
             'h4.gs_rt2 a')
     except NoSuchElementException:
         # no profile link on the results page
         return None
     return anchor.get_attribute('href')
Exemplo n.º 2
0
    def scrape_search_results(self, prof: Professor) -> List[Paper]:
        """Collect papers from the Google Scholar search results for a professor.

        All articles returned by the author search are saved, even when it is
        not certain that they match the author. The search is restricted to
        ``STARTING_YEAR`` and later, to the first 100 pages of results (at most
        1000 articles), and to papers that have at least one citation in
        Google Scholar (to save us some time).

        The Google Scholar and Web of Science citation counts come from the
        search results page. The full citation comes from each article's
        "cite" page, where the Chicago-style entry is used because it has full
        first names and includes more author names before using "et al.".

        Args:
            prof: Professor to search for; ``simple_name()`` and ``school``
                are used to build the query.

        Returns:
            A list of ``Paper`` objects. Results are skipped when they have no
            Google Scholar citations, no citation id, no parsable year, no
            italicised venue (probably a working paper) or no quoted title
            (a book).
        """
        # The query is the same for every results page, so encode it once.
        # The school is percent-encoded too: characters such as '&' or '#'
        # would otherwise be read as URL syntax and corrupt the query.
        query_name = urllib.parse.quote(prof.simple_name())
        query_school = urllib.parse.quote(str(prof.school))

        papers = []
        # parse each page of results, up to at most 1000 articles (100 pages)
        for start in range(0, 1000, 10):
            # get search results page
            self.get_page(
                'https://scholar.google.com/scholar?start=%d&as_ylo=%s&q=author%%3A"%s"+%s'
                % (start, STARTING_YEAR, query_name, query_school))

            tree = tree_from_string(self.selenium_driver.page_source)
            result_row_info = []
            page_has_rows = False
            for row in css_select(tree, 'div.gs_r div.gs_ri'):
                page_has_rows = True
                scholar_citations = None
                wos_citations = None
                citation_id = None
                for link in css_select(row, 'div.gs_fl a'):
                    # guard against links without leading text
                    text = link.text or ''
                    if 'Cited by' in text:
                        scholar_citations = text.split(' ')[-1]
                    elif 'Web of Science:' in text:
                        wos_citations = text.split(': ')[-1]
                    elif 'Related articles' in text:
                        # the id is the second ':'-separated field of the href
                        href_parts = (link.get('href') or '').split(":")
                        if len(href_parts) > 1:
                            citation_id = href_parts[1]
                # Ignore papers with no citations, and rows without an id to
                # build the "cite" URL from. Skip only this row: the rest of
                # the page must still be examined.
                if not scholar_citations or not citation_id:
                    continue
                result_row_info.append({
                    'scholar_citations': scholar_citations,
                    'wos_citations': wos_citations,
                    'citation_id': citation_id
                })
            # Stop when we've gone past the end of results. This is decided by
            # the page having no rows at all, so a page whose papers are all
            # uncited does not end the search early.
            if not page_has_rows:
                break

            # fetch each citation and pick out the Chicago format
            # eg., https://scholar.google.com/scholar?q=info:J2Uvx00ui50J:scholar.google.com/&output=cite&scirp=1&hl=en
            for info in result_row_info:
                self.get_page(
                    'https://scholar.google.com/scholar?q=info:%s:scholar.google.com/'
                    '&output=cite&scirp=1&hl=en' % info['citation_id'])
                cells = self.selenium_driver.find_elements_by_css_selector(
                    'td')
                if len(cells) < 3:
                    # the page does not have the expected citation table
                    continue
                # the third row in the table contains the Chicago-style citation
                chicago_cell = cells[2]
                citation = chicago_cell.text
                year = get_year(citation)
                if not year:
                    continue
                # Look for the first period that is not part of a middle
                # initial; otherwise just take the first period, as in:
                # Al-Najjar, Nabil I. "A bayesian framework for precautionary policies." (2013).
                match = (re.search(r"\w{2}\. ", citation)
                         or re.search(r"\. ", citation))
                if not match:
                    # no author/title separator at all; nothing to parse
                    continue
                authors = citation[:match.end()]
                # venue is in italics
                try:
                    venue = chicago_cell.find_element_by_css_selector('i').text
                except NoSuchElementException:
                    # this is probably a working paper
                    continue
                # article titles are inside quotes
                titles = re.findall(r"\"(.*)\"", citation)
                if not titles:
                    # this is a book, which we don't record
                    continue
                papers.append(
                    Paper(title=titles[0],
                          authors=authors,
                          venue=venue,
                          year=year,
                          scholar_citations=info['scholar_citations'],
                          wos_citations=info['wos_citations'],
                          id=info['citation_id']))
        return papers