示例#1
0
def extract_citation_for_publication(link):
    """
    this function craws the list of articles from a given link. If it has next page, it will continue to it until there is none
    @param[in]      profile_url     the link of google scholar profile you want to crawl
    @return         the list of articles as a list where each entry is dictionary    
    """
    browser=Browser()
    citation={}
    # go the citation view
    # as the page is written is javascript, we are not able to get its content via urllib2
    # intead we will use Selenium to simulate a web browser to render the page
    # req=urllib2.Request(publication[k]['link'], headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'})
    # p=urllib2.urlopen(req)
    # sub_soup=BeautifulSoup(p.readlines()[0], 'html.parser')
    # s=sub_soup.find(id='gs_ccl')
    browser.get(link)
    while True:
        citation_root=browser.find_element_by_id('gs_ccl')
        citation_list=citation_root.find_elements_by_class_name('gs_r')
        for citation_item in citation_list:
            # title
            title=citation_item.find_element_by_class_name('gs_rt').text
            # try to get the downloading link, if there is one
            try:
                link=citation_item.find_element_by_id('gs_ggsW2')
                link=link.find_element_by_link_text(link.text).get_attribute('href')
            except:
                link=None
            # author
            author_line=citation_item.find_element_by_class_name('gs_a')
            author_name=author_line.text.split(', ')
            author={}
            # for each of the author, find its link if its exits
            for a in author_name:
                try:
                    print '.',
                    # there is a google scholar profile with author
                    item=author_line.find_element_by_link_text(a)
                    author[a]=item.get_attribute('href')
                except:
                    # there is not such profile
                    author[a]=None
            # we can also press the cite button to get the detailed citation information, skipped here
            citation[title]={'link':link, 'author': author}
        # go to next page, if there is one
        if not next_page(browser):
            break
    # close
    browser.close()
    return citation
示例#2
0
def extract_publication(profile_url, verbose=verbose_citation_list):
    """
    this function crawl the publication list from the google scholar profile
    @param[in]      profile_url     the link of google scholar profile you want to crawl
    @param[in]      verbose         the level of information you want to scrawl. By default, we will scraw the detailed citation list for each of your publicaiton
    @return         the list of pulication as a list, where each entry is a dictionary
    """
    # scholar's artical list
    browser=Browser()
    browser.get(profile_url)
    publication={}
    while True:
        publication_list=browser.find_elements_by_class_name('gsc_a_tr')
        for publication_item in publication_list:
            title=publication_item.find_element_by_class_name('gsc_a_at').text
            print title,
            author=publication_item.find_elements_by_class_name('gs_gray')[0].text.split(', ')
            vendor=publication_item.find_elements_by_class_name('gs_gray')[1].text
            try:
                citation=int(publication_item.find_element_by_class_name('gsc_a_ac').text)
                link=publication_item.find_element_by_class_name('gsc_a_ac').get_attribute('href')
            except:
                citation=0
                link=None
            try:
                year=int(publication_item.find_element_by_class_name('gsc_a_h').text)
            except:
                year=None
            if citation>0 and verbose>=verbose_citation_list:
                print 'and its citation list',
                cited_by=extract_citation_for_publication(link)
            else:
                cited_by=None    
            print 'finished'
            publication[title]={'link':link,'author':author,'vendor':vendor,'citation':citation, 'cited by': cited_by, 'year':year}
        if not next_page(browser):
            break
    browser.close()
    return publication
示例#3
0
class BaseTestCase(unittest.TestCase):
    """
        Base test case inherited by all tests. It starts a webdriver session
        in setUp and shuts it down in tearDown, saving a screenshot first
        when the test failed.

        The browser is chosen by the BROWSER_ENV environment variable:
        'chrome', 'headless', 'firefox', 'firefoxHeadless' or 'iexplorer'.
        'chrome' is used when the variable is not set.
    """

    # Fallback start URL, used when env_config does not supply one
    # (e.g. when the tests are started from PyCharm or a plain terminal).
    BASE_LINK = "https://"

    try:
        BASE_LINK = env_config.get('url')
    except SystemExit:
        # NOTE(review): env_config.get presumably exits when 'url' is not
        # configured - confirm. The fallback above is kept in that case.
        pass

    # BROWSER_ENV is not set when running outside the terminal launcher,
    # so Chrome is the default.
    browser_env = os.environ.get("BROWSER_ENV", "chrome")

    def get_base_link(self):
        """Return the configured start URL, or BASE_LINK when env_config exits."""
        try:
            return env_config.get('url')
        except SystemExit:
            return self.BASE_LINK

    def setUp(self):
        """
            Start the driver selected by browser_env, clear its cookies,
            enlarge the window and open BASE_LINK.

            Raises ValueError when browser_env is not a supported value.
        """
        if self.browser_env == 'chrome':
            # Chrome with a visible window and a custom user agent.
            options = webdriver.ChromeOptions()
            options.add_argument('--user-agent=piinctest')
            self.driver = webdriver.Chrome(
                executable_path='/usr/local/bin/chromedriver', options=options)

        elif self.browser_env == 'headless':
            # Chrome in headless mode.
            options = webdriver.ChromeOptions()
            options.add_argument('headless')
            options.add_argument("--window-size=1920x1080")
            self.driver = webdriver.Chrome(
                executable_path='/usr/local/bin/chromedriver', options=options)

        elif self.browser_env == 'firefox':
            profile = webdriver.FirefoxProfile()
            profile.set_preference("general.useragent.override", "piinctest")
            self.driver = Firefox(profile)

        elif self.browser_env == 'firefoxHeadless':
            # Firefox in headless mode.
            options = webdriver.FirefoxOptions()
            options.add_argument('-headless')
            options.add_argument("--window-size=1920x1080")
            self.driver = webdriver.Firefox(
                executable_path='/usr/local/bin/geckodriver', options=options)

        elif self.browser_env == "iexplorer":
            caps = DesiredCapabilities.INTERNETEXPLORER.copy()
            caps["ensureCleanSession"] = True
            # Suggested default driver location; change it if yours differs.
            self.driver = Ie(
                executable_path="C:/webdrivers/iedriverserver.exe",
                capabilities=caps)

        else:
            # Without this branch self.driver would stay unset and the first
            # use below would fail with an unhelpful AttributeError.
            raise ValueError(
                "Unsupported BROWSER_ENV value: %r" % (self.browser_env,))

        self.driver.delete_all_cookies()
        try:
            self.driver.maximize_window()
        except (AttributeError, WebDriverException):
            # Fall back to an explicit size when maximizing is not possible.
            self.driver.set_window_size(1920, 1200)
        self.driver.get(self.BASE_LINK)

    def _current_test_failed(self):
        """Return True when the running test has recorded an error or failure."""
        # NOTE(review): the TestResult is also stored in the module-level name
        # `result`; presumably nothing else reads it - confirm, then drop the global.
        global result
        outcome = getattr(self, '_outcome', None)
        if outcome is None:
            # No outcome object to inspect; treat the test as not failed.
            return False
        if hasattr(outcome, 'errors'):
            # Python 3.4 - 3.10: errors are buffered on the outcome object.
            result = self.defaultTestResult()
            self._feedErrorsToResult(result, outcome.errors)
            return len(result.errors) > 0 or len(result.failures) > 0
        # Python 3.11+: errors are reported straight to the shared result, which
        # also holds entries of earlier tests, so only this test's are counted.
        result = outcome.result
        recorded = list(getattr(result, 'errors', ())) + \
            list(getattr(result, 'failures', ()))
        # A sub-test entry refers to its owning test through `test_case`.
        return any(getattr(test, 'test_case', test) is self
                   for test, _ in recorded)

    def tearDown(self):
        """Save a screenshot if the test failed, then close and quit the driver."""
        try:
            if self._current_test_failed():
                fail_url = self.driver.current_url
                print(fail_url)
                now = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
                fn = os.path.join(os.path.dirname(__file__), '..', '..',
                                  'screenshots/Screenshot_%s.png' % now)
                self.driver.get_screenshot_as_file(fn)
                print("Screenshot added at path: " + fn)
        finally:
            # Shut the driver down even when the screenshot step or close() fails.
            try:
                self.driver.close()
            finally:
                self.driver.quit()
示例#4
0
            code = re.search('编号:\w+', room_type_code[i][j]).group()
            try:
                types = re.search('标准价', room_type_code[i][j]).group()
            except AttributeError:
                types = '钟点房(08:00~22:00)'
            price = re.search('\u00A5\d+', room_type_code[i][j]).group()
            info.append((code, types, price))
        room_info[room_type[i]] = info

    hotel_info_text.write('-' * 32 + hotel[1] + '-' * 32 + '\n')
    for i in room_info:
        hotel_info_text.write(i + ':' + '\n')
        for j in room_info[i]:
            code = j[0]
            types = j[1]
            price = j[2]
            hotel_info_text.write(' ' * 4 + code + ' ' + types + ' ' + price +
                                  '\n')

    ie.close()
    ie.switch_to.window(windows)
    ie.find_element_by_id("txtKeyword").send_keys(Keys.CONTROL + 'a')
    ie.find_element_by_id("txtKeyword").send_keys(Keys.DELETE)


# Crawl every hotel in turn, pausing 5 seconds between requests.
try:
    for i in hotels:
        spider(i)
        time.sleep(5)
finally:
    # Release the output file and the browser even when a crawl step raised.
    try:
        hotel_info_text.close()
    finally:
        ie.close()