def getLnks(dv, nReq, nRst=None):
    '''
    Generator producing paper links from a search-result page.

    Parameters
    ----------
    dv : webdriver whose current handle is the search-result page
    nReq : number of papers requested
    nRst : number of search results; obtained from getNumOfRst(dv)
        when None

    Yields
    ------
    getLnks : __getLnk(index, pattern) for every index from 1 up to
        min(nReq, nRst, MAX_DOC), where pattern is derived from the href
        of the first paper link found on the page
    '''
    total = getNumOfRst(dv) if nRst is None else nRst
    # XPath of the paper links on the result page
    paperXpath = '//a[@class="smallV110 snowplow-full-record"]'
    # Wait for the page to open
    waitTillOpen(dv, value=paperXpath)
    firstHref = dv.find_element_by_xpath(paperXpath).get_attribute('href')
    template = __getPattern(firstHref)
    # Never go past the request, the result count or the MAX_DOC cap
    limit = min(nReq, total, MAX_DOC)
    yield from (__getLnk(docNo, template) for docNo in range(1, 1 + limit))
def sortResults(dv, sid, qid, sortReq=''):
    '''
    Sort the search results as requested.

    Parameters
    ----------
    dv : webdriver whose current handle is the search-result page
    sid : sid
    qid : qid
    sortReq : sort request; the default or an invalid request is treated
        as "date descending"
    '''
    # Wait for the page to open
    waitTillOpen(dv)
    # Anything that is not exactly a str counts as an empty request
    key = sortReq.upper() if type(sortReq) is str else ''
    # Requests missing from str2SortId fall back to SortId.PYD
    chosen = str2SortId.get(key, SortId.PYD)
    script = __getSortJs(sid, qid, chosen)
    dv.execute_script(script)
def getNumOfRst(dv):
    '''
    Read the number of search results from the page.

    Parameters
    ----------
    dv : webdriver whose current handle is the search-result page

    Returns
    -------
    getNumOfRst : number of search results as an int
    '''
    countXpath = '//h3[@class="title4"]/*'
    # Wait for the page to open
    waitTillOpen(dv, value=countXpath)
    shown = dv.find_element_by_xpath(countXpath).text
    # Commas are stripped before conversion, e.g. "1,234" -> 1234
    digits = shown.replace(',', '')
    return int(digits)
def getIds(dv):
    '''
    Fetch sid and qid from the page.

    sid identifies the user session and qid the search number; together
    they determine the search results.

    Parameters
    ----------
    dv : webdriver whose current handle is the search-result page

    Returns
    -------
    getIds : the id pair (sid, qid)
    '''
    # Wait for the page to open
    waitTillOpen(dv)
    # Both values are read from the page's own JavaScript, sid first
    snippets = ('return SID', 'return qid.value')
    return tuple(dv.execute_script(js) for js in snippets)
def __getSynonyms(this, s: str, mode):
    '''
    Collect a set of synonyms for a string using Youdao Translate.

    Parameters
    ----------
    s : text typed into the translator's input area
    mode : 'ch2en' switches to this.__ch2en(); any other value switches
        to this.__auto()

    Returns
    -------
    __getSynonyms : set of the texts found in the translation answer, the
        suggestion list and the related-dictionary section

    Raises
    ------
    AssertionError : an old answer is still displayed after 10 attempts
        to clear the translation area
    '''
    dv = this.__dv
    pathInput = '//textarea[@class="input__original__area"]'
    pathAnswer = '//div[contains(@class,"input__target__text")]/p/span'
    pathSuggestWait = '//*[@class="suggest__title"]/../../*[contains(@style,"block")]'
    pathSuggest = '//*[@class="suggest__title"]/../ul/*'
    pathRelative = '//div[@class="dict__relative"]/*'
    pathTrans = '//a[@id="transMachine"]'
    # Translation language
    if mode == 'ch2en':
        this.__ch2en()
    else:
        this.__auto()
    # Input
    waitTillOpen(dv, 10, value=pathInput)
    ipts = dv.find_elements_by_xpath(pathInput)
    this.__iknow()
    ipt = ipts[0]
    ipt.clear()
    # Click the translate link until no previous answer is displayed, so
    # stale text cannot be mistaken for the answer to `s`
    for _ in range(10):
        if not dv.find_elements_by_xpath(pathAnswer):
            break
        dv.find_element_by_xpath(pathTrans).click()
        sleep(0.1)
    else:
        # Raised explicitly: an `assert` statement is removed under
        # `python -O`, which would silently skip this check
        raise AssertionError('translate area not cleared')
    ipt.send_keys(s)
    rst = set()
    # Translation result
    waitTillOpen(dv, value=pathAnswer)
    ans = dv.find_elements_by_xpath(pathAnswer)
    rst |= {x.text for x in ans}
    if ans:
        # Improved translation results, looked up after clicking the
        # first answer span
        ans[0].click()
        try:
            waitTillOpen(dv, 10, value=pathSuggestWait)
            sug = dv.find_elements_by_xpath(pathSuggest)
            rst |= {x.text for x in sug}
        except TimeoutException:
            # Best effort: carry on without suggestions if none show up
            pass
    # Related translation results
    # NOTE(review): nesting was rebuilt from whitespace-collapsed source;
    # this lookup is assumed to run whether or not `ans` is empty - confirm
    relative = dv.find_elements_by_xpath(pathRelative)
    rst |= {x.text for x in relative}
    # Switch back to the automatic mode and clear the input
    this.__auto()
    ipt.clear()
    dv.find_element_by_xpath(pathTrans).click()
    return rst
def __waitTillOpen(this):
    '''
    Wait for a paper page to load, retrying after each timeout.

    waitTillOpen(this.driver, 10) is attempted up to 6 times. A page that
    cannot be opened for that long is taken to mean that access has been
    blocked by the anti-crawling system.

    Raises
    ------
    Exception : the paper page did not open in any attempt
    '''
    attempt = 0
    while attempt < 6:
        attempt += 1
        try:
            waitTillOpen(this.driver, 10)
        except TimeoutException:
            # '\r' keeps successive failure messages on a single line
            print('ERROR : failed to open the page of paper, tried %dth' % attempt, end='\r')
            continue
        # Only report success when at least one earlier attempt failed
        if attempt > 1:
            print('INFO : open succeed, tried %dth' % attempt, ' ' * 20)
        return
    # Every attempt timed out: finish the '\r' line, then give up
    print()
    raise Exception('maybe banned by wos, please check')
def __clickIfNotActive(this, element, secWait, pathWait, reClick=0):
    '''
    Click an element unless it is already active, then wait until an
    element matching a path appears.

    Parameters
    ----------
    element : element to click
    secWait : longest wait for a single click
    pathWait : path used to decide that loading has finished
    reClick : number of repeated clicks, guarding against a citation-count
        sort click that gets no response; 0 by default
    '''
    dv = this.__dv

    def clickAndWait():
        # Click through JavaScript, pause, then wait for pathWait
        dv.execute_script('arguments[0].click();', element)
        sleep(waitUnit)
        waitTillOpen(dv, secWait, value=pathWait)

    # get_attribute returns None when the element has no class attribute;
    # treat that as "not active" rather than failing on `in None`
    classes = element.get_attribute('class') or ''
    if 'active' in classes:
        return
    clickAndWait()
    # NOTE(review): nesting was rebuilt from whitespace-collapsed source;
    # the repeated clicks are assumed to belong to the "not active"
    # branch - confirm
    for _ in range(reClick):
        clickAndWait()
def __auto(this):
    '''
    Switch the translator's language selection to the "AUTO" option.

    Nothing is done when this.__getMode() already returns 'AUTO'.
    '''
    if this.__getMode() == 'AUTO':
        return
    selectorXpath = '//*[@class="select-text"]'
    optionXpath = '//*[@data-value="AUTO"]/../../*[contains(@style,"block")]/*[@data-value="AUTO"]/a'
    doneXpath = '//*[@data-value="AUTO"]/../../*[contains(@style,"none")]/*[@data-value="AUTO"]'
    dv = this.__dv
    # Click the selector first, then the AUTO entry once it is present
    for xpath in (selectorXpath, optionXpath):
        waitTillOpen(dv, 10, value=xpath)
        dv.find_element_by_xpath(xpath).click()
    # Finish once the option sits under a container styled "none"
    # (presumably the option list being hidden again)
    waitTillOpen(dv, 10, value=doneXpath)
def __ch2en(this):
    '''
    Switch the translator's language selection to the "zh-CHS2en" option.

    Nothing is done when this.__getMode() already returns 'zh-CHS2en'.
    '''
    if this.__getMode() == 'zh-CHS2en':
        return
    selectorXpath = '//*[@class="select-text"]'
    optionXpath = '//*[@data-value="zh-CHS2en"]/../../*[contains(@style,"block")]/*[@data-value="zh-CHS2en"]/a'
    doneXpath = '//*[@data-value="zh-CHS2en"]/../../*[contains(@style,"none")]/*[@data-value="zh-CHS2en"]'
    dv = this.__dv
    # Click the selector first, then the zh-CHS2en entry once it is present
    for xpath in (selectorXpath, optionXpath):
        waitTillOpen(dv, 10, value=xpath)
        dv.find_element_by_xpath(xpath).click()
    # Finish once the option sits under a container styled "none"
    # (presumably the option list being hidden again)
    waitTillOpen(dv, 10, value=doneXpath)