Example #1
    def getElsName(self, url):
        browser = webdriver.Firefox()  # initialize the Firefox browser
        while not pageUrlQueue.empty():
            url = pageUrlQueue.get()
            try:
                browser.get(url)  # fetch the page with get()
            except TimeoutException, e:
                browser.refresh()
            if 'books' in url:
                try:
                    WebDriverWait(browser, 120, 0.5).until(
                        lambda browser: browser.find_element_by_class_name(
                            'contributor-content'))
                except TimeoutException:
                    # a timeout means the expected element never appeared
                    print "WrongPage"
                    continue

                # parse the HTML
                soup = BeautifulSoup(browser.page_source,
                                     'html.parser',
                                     from_encoding='utf-8')

                authors = soup.find_all('div', class_='contributor-content')

                book = soup.find(
                    'header',
                    class_='book-intro-header').find('h1').get_text()

                for author in authors:
                    div = author.find('div')
                    if not div or 'China' not in div.get_text():
                        continue
                    authorName = author.find('h3').get_text()
                    nameQueue.put('%s%s%s' % (authorName, '!!!', book))

            elif 'call-for-papers' in url:
                try:
                    WebDriverWait(browser, 120, 0.5).until(
                        lambda browser: browser.find_element_by_class_name(
                            'title'))
                except TimeoutException:
                    print "WrongPage"
                    continue

                # parse the HTML
                soup = BeautifulSoup(browser.page_source,
                                     'html.parser',
                                     from_encoding='utf-8')

                book = soup.find(
                    'div', class_='publication-title').find('h1').get_text()

                authors = soup.find('div',
                                    class_='article-content').find_all('p')
                for p in authors:
                    # the affiliation text is assumed to follow the <br> tag
                    br = p.find('br')
                    if br and br.next_sibling and 'China' in br.next_sibling:
                        authorName = p.find('strong').get_text()
                        university = br.next_sibling.strip()
                        nameQueue.put(
                            '%s%s%s%s%s' %
                            (authorName, '|', university, '!!!', book))
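
The hand-rolled wait lambdas above can also be expressed with Selenium's built-in expected_conditions helpers. A minimal sketch, assuming the same class name as the example; the URL is a placeholder:

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    browser = webdriver.Firefox()
    browser.get('https://www.elsevier.com/books/example')  # placeholder URL
    try:
        # poll every 0.5s, give up after 120s
        WebDriverWait(browser, 120, 0.5).until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, 'contributor-content')))
    except TimeoutException:
        print "WrongPage"
    browser.quit()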
Example #2
    def getIeeeDocuments(self, url):
        browser = webdriver.Firefox()  # initialize the Firefox browser
        while not pageUrlQueue.empty():
            browser.get(pageUrlQueue.get())  # fetch the page with get()
            # wait for the page to finish loading
            try:
                WebDriverWait(browser, 60, 0.5).until(
                    lambda browser: browser.find_element_by_class_name(
                        'c-Pagination-nodes'))
            except TimeoutException:
                continue
            # scroll down to load the full page
            for i in range(1, 5):
                browser.execute_script("window.scrollBy(0,3000)")
                time.sleep(1)

            # parse the HTML
            soup = BeautifulSoup(browser.page_source,
                                 'html.parser',
                                 from_encoding='utf-8')

            links = soup.find_all('a', href=re.compile(r"document"))
            tempSet = set()
            for link in links:
                # keep only canonical '/document/<id>/' links (assumed 18 chars)
                if len(link['href']) == 18:
                    tempSet.add(link['href'])
            for temp in tempSet:
                docUrl = '%s%s%s' % ('http://ieeexplore.ieee.org', temp,
                                     'authors?ctx=authors')
                docUrlQueue.put(docUrl)
        browser.quit()
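
The fixed loop of four 3000px scrolls above can stop before a long result list has finished lazy-loading. A common alternative, sketched here rather than taken from the original, is to keep scrolling until the document height stops growing:

    import time

    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(1)  # give lazy-loaded content a moment to render
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break  # height is stable, assume the page is complete
        last_height = new_height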
Example #3
    def getIeeePages(self, keyword):
        browser = webdriver.Firefox()  # initialize the Firefox browser
        browser.get(pageUrlQueue.get())  # fetch the page with get()
        # wait for the page to finish loading
        WebDriverWait(browser, 120, 0.5).until(
            lambda browser: browser.find_element_by_class_name(
                'c-Pagination-nodes'))
        # parse the HTML
        soup = BeautifulSoup(browser.page_source,
                             'html.parser',
                             from_encoding='utf-8')

        links = soup.find('div', class_="Dashboard-section").find('span')
        totalRecords = links.get_text().split('of')[1]

        print totalRecords
        pages = int(
            math.ceil(float(totalRecords.replace(',', '').strip()) / 25))
        # far too many results: take only the first 100 pages, i.e. 2500 records
        pages = min(pages, 100)
        for i in range(1, pages + 1):
            url = '%s%s%s%s%d%s%s' % (
                'http://ieeexplore.ieee.org/search/searchresult.jsp?',
                'queryText=', keyword, '&pageNumber=', i, rangeYear, '\n')
            pageUrlRQueue.put(url)

        browser.quit()
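
A worked example of the paging arithmetic above, using an illustrative record count:

    import math

    totalRecords = ' 2,613 '  # illustrative value taken from "1-25 of 2,613"
    pages = int(math.ceil(float(totalRecords.replace(',', '').strip()) / 25))
    print pages            # 105 result pages at 25 records per page
    print min(pages, 100)  # capped at 100 pages, i.e. 2500 records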
Example #4
    def getElsDocs(self, keyword):
        browser = webdriver.Firefox()  # initialize the Firefox browser
        browser.get(pageUrlQueue.get())  # fetch the page with get()
        # wait for the page to finish loading
        WebDriverWait(browser, 120, 0.5).until(
            lambda browser: browser.find_element_by_class_name(
                'search-result-meta'))
        # parse the HTML
        soup = BeautifulSoup(browser.page_source,
                             'html.parser',
                             from_encoding='utf-8')

        items = soup.find_all('div', class_=re.compile(r'search-result-body'))

        for item in items:
            if item.find('span', class_='category').get_text() == 'Journals':
                pageUrlRQueue.put(item.find('a')['href'])

        # far too many results: take only the first 100 pages, i.e. 2000 records
        curPage = int(
            soup.find('div', id='pagination-wrapper').find(
                'li', class_='selected').find('span').get_text())
        if curPage < 200:
            # by_class_name rejects compound class names, so use a CSS selector
            browser.find_element_by_css_selector(
                '.btn-right-arrow.btn-tertiary').click()

        browser.quit()
Example #5
    def getIeeeName(self, num):
        browser = webdriver.Firefox()  # initialize the Firefox browser
        filename = '%d%s' % (num, 'ieeeTemp.txt')
        f = open(filename, 'a')
        while not pageUrlQueue.empty():
            url = pageUrlQueue.get()
            try:
                browser.get(url)  # fetch the page with get()
            except TimeoutException, e:
                browser.refresh()
            # wait for the page to finish loading
            WebDriverWait(browser, 120, 0.5).until(
                lambda browser: browser.find_element_by_class_name(
                    'document-ft-section-header'))
            time.sleep(1)
            # parse the HTML
            soup = BeautifulSoup(browser.page_source,
                                 'html.parser',
                                 from_encoding='utf-8')
            docName = soup.find(
                'h1', class_="document-title").find('span').get_text()

            # scrape the research-area keywords
            lists = soup.find_all('li',
                                  class_="doc-all-keywords-list-item ng-scope")
            for item in lists:
                if item.find('strong').get_text() == 'Author Keywords ':
                    for keylink in item.find_all('a'):
                        keywordSet.add(keylink.get_text())
                    break
            keywords = list(keywordSet)

            authors = soup.find(
                'section', class_="document-all-authors ng-scope").find_all(
                    'div', class_='pure-u-18-24')
            for author in authors:
                area = author.find('div', class_='ng-binding').get_text()
                if 'China' in area:
                    name = author.find('span',
                                       class_='ng-binding').get_text().strip()
                    areas = re.split("[,.:()]", area)
                    university = 'None'
                    for a in areas:
                        if 'university' in a or 'University' in a:
                            university = a.strip()
                            break
                    try:
                        f.write(url)
                        f.write('%s%s%s%s%s%s%s' %
                                (name, '|', university, ','.join(keywords),
                                 '!!!', docName, '\n'))
                    except UnicodeDecodeError, e:
                        print name
                        print university
                        print docName
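
The UnicodeDecodeError handler above drops the record entirely. In Python 2 the error can be avoided by opening the file with an explicit encoding, e.g. via codecs; a sketch reusing the variables from the example:

    import codecs

    f = codecs.open(filename, 'a', encoding='utf-8')
    # unicode strings coming out of BeautifulSoup are now encoded on write
    f.write(url)
    f.write('%s%s%s%s%s%s%s' %
            (name, '|', university, ','.join(keywords), '!!!', docName, '\n'))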
Example #6
    def getACMName(self, url):
        browser = webdriver.Firefox()  # initialize the Firefox browser
        filename = url + 'acmTemp.txt'
        f = open(filename, 'a')
        while not pageUrlQueue.empty():
            try:
                browser.get(pageUrlQueue.get())  # fetch the page with get()
            except TimeoutException, e:
                browser.refresh()
            # wait for the page to finish loading
            try:
                WebDriverWait(browser, 120, 0.5).until(
                    lambda browser: browser.find_element_by_class_name(
                        'title'))
            except TimeoutException:
                print "WrongPage"
            # parse the HTML
            soup = BeautifulSoup(browser.page_source,
                                 'html.parser',
                                 from_encoding='utf-8')

            authorName = soup.find(
                'span', class_="small-text").find('strong').get_text()

            universitySet = soup.find_all('td',
                                          class_="small-text")[1].find_all('a')
            universityT = None
            # for u in universitySet:
            #     if u.get_text() in universityEnglish:
            #         universityT = u.get_text()

            if not universityT:
                continue

            titles = soup.find_all('div', class_="title")
            docs = []
            for title in titles:
                titlelink = title.find('a').get_text()
                docs.append(titlelink)
            docS = '*@*'.join(docs)
            authorName = ' '.join(authorName.split())
            # nameQueue.put('%s%s%s%s%s' % (authorName, '|', universityT, '!!!', docS))

            f.write('%s%s%s%s%s' % (authorName, '|', universityT, '!!!', docS))
Example #7
    def getACMPages(self, keyword):
        browser = webdriver.Firefox()  # initialize the Firefox browser
        browser.get(pageUrlQueue.get())  # fetch the page with get()
        # wait for the page to finish loading
        WebDriverWait(browser, 120, 0.5).until(
            lambda browser: browser.find_element_by_id('searchtots'))
        # parse the HTML
        soup = BeautifulSoup(browser.page_source,
                             'html.parser',
                             from_encoding='utf-8')

        totalRecords = soup.find('div',
                                 id="searchtots").find('strong').get_text()
        totalRecords = int(totalRecords.replace(',', ''))

        # far too many results: take only the first 100 pages, i.e. 2000 records
        pages = min(totalRecords, 100 * 20)
        for i in range(0, pages, 20):
            url = '%s%s%s%d%s' % ('http://dl.acm.org/results.cfm?query=',
                                  keyword, '&srt=_score&start=', i, '\n')
            pageUrlRQueue.put(url)

        browser.quit()
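
A multi-word keyword would leave a raw space in the query string built above; URL-encoding the keyword first avoids that. A sketch, not from the original, with an illustrative keyword:

    import urllib

    keyword = 'deep learning'  # illustrative multi-word keyword
    url = '%s%s%s%d' % ('http://dl.acm.org/results.cfm?query=',
                        urllib.quote_plus(keyword), '&srt=_score&start=', 0)
    print url
    # http://dl.acm.org/results.cfm?query=deep+learning&srt=_score&start=0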
Example #8
    def getACMAuthors(self, url):
        browser = webdriver.Firefox()  # initialize the Firefox browser
        tempSet = set()
        while not pageUrlQueue.empty():
            try:
                browser.get(pageUrlQueue.get())  # fetch the page with get()
            except TimeoutException, e:
                browser.refresh()
            # wait for the page to finish loading
            try:
                WebDriverWait(browser, 60, 0.5).until(
                    lambda browser: browser.find_element_by_class_name(
                        'authors'))
            except TimeoutException:
                continue

            # parse the HTML
            soup = BeautifulSoup(browser.page_source,
                                 'html.parser',
                                 from_encoding='utf-8')

            authors = soup.find_all('div', class_='authors')
            for author in authors:
                links = author.find_all('a')
                for link in links:
                    tempSet.add(link['href'].split('&')[0])
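
This fragment fills tempSet but never drains it; presumably the deduplicated author links are queued for later crawling, as in Example #2. A sketch of that missing step, in which the queue name and the URL join are assumptions:

    for href in tempSet:
        # href is the author link with its tracking parameters stripped
        docUrlQueue.put('%s%s' % ('http://dl.acm.org/', href))
    browser.quit()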
Example #9
    def googleScholarSearch(self, num):
        # profile = webdriver.FirefoxProfile()
        # profile.set_preference('network.proxy.type', 1)
        # profile.set_preference('network.proxy.http', '127.0.0.1')
        # profile.set_preference('network.proxy.http_port', 17890)  # int
        # profile.update_preferences()
        # browser = webdriver.Firefox(firefox_profile=profile)

        browser = webdriver.Firefox()  # initialize the Firefox browser
        filename = '%d%s' % (num, 'nameIeeeNum.txt')
        f = open(filename, 'a')
        while not pageUrlQueue.empty():
            nameDs = pageUrlQueue.get().split('!!!')
            parts = nameDs[0].split('|')
            tempProfe = professor.Professor(parts[0])
            tempProfe.university = parts[1]
            tempProfe.ieeeNum = parts[2]
            tempProfe.citations = parts[3]
            tempProfe.h_index = parts[4]
            tempProfe.studyArea = parts[5]
            papers = nameDs[1].split('*@*')

            authorFind = False
            for paper in papers:
                paper = paper.strip()
                try:
                    browser.get(googleSchalarUrl)  # fetch the page with get()
                    browser.find_element_by_class_name('gs_in_txt').send_keys(
                        paper)
                    browser.find_element_by_id('gs_hp_tsb').click()
                except TimeoutException, e:
                    browser.get(googleSchalarUrl)
                    browser.find_element_by_class_name('gs_in_txt').send_keys(
                        paper)
                    browser.find_element_by_id('gs_hp_tsb').click()
                except WebDriverException, e:
                    continue

                try:
                    WebDriverWait(browser, 30, 0.5).until(
                        lambda browser: browser.find_element_by_class_name(
                            'gs_r'))
                except TimeoutException:
                    # print "WrongPaper"
                    continue

                # parse the HTML
                soup = BeautifulSoup(browser.page_source,
                                     'html.parser',
                                     from_encoding='utf-8')

                items = soup.find_all('div', class_='gs_a')
                authors = None
                if items:
                    # the first gs_a block belongs to the top search result
                    authors = items[0].find_all('a')

                if not authors:
                    continue

                for author in authors:
                    url = '%s%s' % (personPage, author['href'])
                    try:
                        browser.get(url)  # fetch the page with get()
                    except TimeoutException, e:
                        browser.get(url)

                    try:
                        WebDriverWait(browser, 30, 0.5).until(
                            lambda browser: browser.find_element_by_class_name(
                                'gsc_rsb_std'))
                    except TimeoutException:
                        # print "WrongPaper"
                        continue

                    if browser.find_element_by_id(
                            'gsc_prf_in').text == tempProfe.englishName:
                        authorFind = True
                        # parse the HTML
                        soup = BeautifulSoup(browser.page_source,
                                             'html.parser',
                                             from_encoding='utf-8')

                        details = soup.find_all('div', class_='gsc_prf_il')

                        try:
                            tempProfe.university = details[0].find(
                                'a').get_text()
                        except AttributeError:
                            # no link inside the first affiliation line
                            tempProfe.university = details[0].get_text()
                        tempKeywords = []
                        keywords = details[1].find_all('a')
                        for keyword in keywords:
                            tempKeywords.append(keyword.get_text())

                        tempProfe.studyArea = ','.join(tempKeywords)

                        # gsc_rsb_std cells: citations (all), citations (recent),
                        # h-index (all), h-index (recent), ...
                        datas = soup.find_all('td', class_='gsc_rsb_std')
                        tempProfe.citations = datas[0].get_text()
                        tempProfe.h_index = datas[2].get_text()
Example #10
class PaperInfoCrew:
    def __init__(self):
        self.email = '*****@*****.**'
        self.password = '******'

    # the number of published IEEE Trans. papers can be counted via DBLP
    def DBLPcrew(self, num):
        browser = webdriver.Firefox()  # initialize the Firefox browser
        filename = '%d%s' % (num, 'nameIeeeNum.txt')
        ieeeNum = 'not found'
        f = open(filename, 'a')
        while not pageUrlQueue.empty():
            nameDs = pageUrlQueue.get().split('!!!')
            author = nameDs[0].split('|')[0]
            papers = nameDs[1].split('*@*')
            # find the author via the paper titles
            for paper in papers:
                paper = paper.strip()
                url = '%s%s' % ('http://dblp.uni-trier.de/search?q=', paper)
                try:
                    browser.get(url)  # fetch the page with get()
                except TimeoutException, e:
                    browser.refresh()
                # wait for the page to finish loading
                try:
                    WebDriverWait(browser, 30, 0.5).until(
                        lambda browser: browser.find_element_by_class_name(
                            'title'))
                except TimeoutException:
                    # print "WrongPaper"
                    continue

                # parse the HTML
                soup = BeautifulSoup(browser.page_source,
                                     'html.parser',
                                     from_encoding='utf-8')

                datas = soup.find_all('div', class_='data')
                detailLink = None
                for d in datas:
                    if paper == d.find('span',
                                       class_='title').get_text().strip('.'):
                        links = d.find_all('a')
                        for link in links:
                            if link.find('span') and link.find(
                                    'span').get_text() == author:
                                detailLink = link['href']
                                break
                    if detailLink:
                        break
                # after finding the author, follow the link to look up ieeeNum
                if not detailLink:
                    continue
                try:
                    browser.get(detailLink)  # fetch the page with get()
                except TimeoutException, e:
                    browser.refresh()
                except WebDriverException, e:
                    pass
                try:
                    WebDriverWait(browser, 60, 0.5).until(
                        lambda browser: browser.find_element_by_id(
                            'max-record-info'))
                except TimeoutException:
                    # print 'WrongPage'
                    continue
                # the dynamic check did not work out, so wait unconditionally
                time.sleep(4)
                soup = BeautifulSoup(browser.page_source,
                                     'html.parser',
                                     from_encoding='utf-8')
                ieeeNum = soup.find('div', id='authorpage-refine').find(
                    'span', id='max-record-count').get_text()

                # if profeMap.has_key(author):
                #     profeMap[author].ieeeNum = ieeeNum
                # else:
                #     profe = professor.Professor(author)
                #     profe.ieeeNum = ieeeNum
                #     profeMap[author] = profe
                # f.write(author)
                if ieeeNum != 'not found':
                    break
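
The hard-coded time.sleep(4) above compensates for the counter element existing before its text is populated. Waiting for non-empty text instead, sketched here with the element id from the example, removes the fixed delay:

    WebDriverWait(browser, 60, 0.5).until(
        lambda b: b.find_element_by_id('max-record-count').text.strip() != '')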
Example #11
                               class_='publication-title js-publication-title'
                               ).get_text() == paper:
                        # open the paper page
                        paperLink = '%s%s' % (
                            'https://www.researchgate.net/',
                            li.find('a',
                                    class_=re.compile(
                                        r'js-publication-title-link'))['href'])
                        try:
                            browser.get(paperLink)  # fetch the page with get()
                        except TimeoutException, e:
                            browser.refresh()

                        try:
                            WebDriverWait(browser, 30, 0.5).until(
                                lambda browser:
                                browser.find_element_by_class_name(
                                    'publication-author-position'))
                        except TimeoutException:
                            # print "WrongPaper"
                            pass
                        # expand the full author list
                        try:
                            btn = browser.find_element_by_class_name(
                                'author-list-action-button')
                            if btn.text != 'Hide':
                                btn.click()
                                time.sleep(2)
                        except NoSuchElementException, e:
Example #12
            university = tempProfe.chiUniversity
        else:
            university = tempProfe.university
        url = '%s%s%s%s' % (baiduUrl, name, ' ', university)

        if university == 'None':
            # append '计算机' ("computer science") to narrow the search
            url = '%s%s' % (url, ' 计算机')

        # url = 'https://www.baidu.com/s?wd=' + university
        try:
            browser.get(url)  # fetch the page with get()
        except TimeoutException, e:
            browser.get(url)  # retry once after the timeout

        try:
            WebDriverWait(browser, 30, 0.5).until(
                lambda browser: browser.find_element_by_class_name('n'))
        except TimeoutException:
            # print "WrongPaper"
            return

        # parse the HTML
        soup = BeautifulSoup(browser.page_source,
                             'html.parser',
                             from_encoding='utf-8')
        details1 = soup.find_all('a', class_="c-showurl")
        # details2 = None
        # try:
        #     browser.find_element_by_class_name('n').click()
        #     soup = BeautifulSoup(browser.page_source, 'html.parser', from_encoding='utf-8')
        #     details2 = soup.find_all('a', class_="c-showurl")
        # except Exception, e: