def get_tc_data(self, report_link, paper_data_id, tc_data):
    '''
    Fetch the citation report page and extract its per-year citation data
    (used by start()).

    The result is not returned; it is stored in ``tc_data['tc_dict']`` so the
    caller can read it back from the dict it passed in.

    :param report_link: sequence whose first element supports ``['href']``
        (the citation-report link); only element 0 is used
    :param paper_data_id: not used by this method
    :param tc_data: mutable mapping that receives the parsed data under the
        key ``'tc_dict'``
    :return: None
    :raises AttributeError: if the page has no ``<script id="raw_tc_data">``
        element (``select_one`` returns None in that case)
    '''
    self.ui_stream.push(command='log', msg='인용년도 정보를 가져옵니다.')

    # Use a private session that carries the cookies of the shared one, so
    # this request does not touch self.session itself.
    session = requests.Session()
    try:
        session = sju_utiles.set_user_agent(session)
        session.cookies.update(self.session.cookies)

        # [Step 2/3] Look up the citation years (parallel setup)
        #####################################################################
        first_link = report_link[0]
        http_res = session.get(self.base_url + first_link['href'])
        # Read the body while the session is still open.
        target_content = http_res.content
    finally:
        # The session is only needed for this single request; close it so
        # its pooled connections are released instead of leaking per call.
        session.close()

    soup = BeautifulSoup(target_content, 'html.parser')
    raw_tc_data = soup.select_one('script#raw_tc_data')

    # Every "<digits>=<digits>" pair in the script text becomes one entry.
    # Keys and values stay strings; a repeated key keeps its last value.
    # NOTE(review): presumably key = year and value = citation count -
    # confirm against the page markup.
    tc_tuple_list = re.findall(r'([0-9]+)\=([0-9]+)', raw_tc_data.text)
    tc_data['tc_dict'] = dict(tc_tuple_list)
def set_session(self, cookies=None):
    '''
    Renew the HTTP session stored on ``self.session``.

    When ``cookies`` is given, a new session is built from those cookies
    without any network request. Otherwise the service is contacted to obtain
    fresh ``SID`` / ``JSESSIONID`` cookies, retrying up to five times when the
    server answers 403.

    On success ``self.SID``, ``self.jsessionid`` and ``self.session`` are
    updated and any previous ``self.session`` is closed. ``self.qid`` is
    reset to 0 in every case.

    :param cookies: cookie values (must contain ``SID`` and ``JSESSIONID``)
        to reuse instead of requesting new ones
    :return: None
    :raises sju_exceptions.LoginRequired: if the first response URL contains
        ``'login'``
    :raises sju_exceptions.InitSessionError: on an unexpected status code or
        when all retries are used up
    '''
    # Function-scope import: only the retry path needs it, and this keeps
    # the method independent of the file's top-level import list.
    import time

    MAX_TRIES = 5
    RETRY_DELAY_SEC = 2  # matches the delay announced in the retry log message
    self.qid = 0
    ui_stream = self.ui_stream
    tries = 0
    session = requests.Session()

    try:
        # Case: SID and JSESSIONID were supplied by the caller
        if cookies:
            session = sju_utiles.set_user_agent(session)
            session.cookies.update(cookies)
            self.SID = session.cookies['SID'].replace("\"", "")
            self.jsessionid = session.cookies['JSESSIONID']
            ui_stream.push(command='log', msg='SID : %s' % self.SID)
            ui_stream.push(command='log', msg='JSESSIONID : %s' % self.jsessionid)

        while tries < MAX_TRIES and not cookies:
            # Renew the session
            ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1001])
            session = sju_utiles.set_user_agent(session)

            # Request the session SID and JSESSIONID
            ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1101])
            res = session.get('http://apps.webofknowledge.com', allow_redirects=False)

            # Follow the redirect chain by hand and stop as soon as the SID
            # cookie has been set; the remaining hops are not needed.
            for _ in session.resolve_redirects(res, res.request):
                if 'SID' in session.cookies.keys():
                    break

            # Classify the outcome of the SID request (status of the first,
            # un-redirected response)
            if res.status_code in (requests.codes.FOUND, requests.codes.OK):
                if res.url.find('login') > -1:
                    raise sju_exceptions.LoginRequired()

                self.SID = session.cookies['SID'].replace("\"", "")
                self.jsessionid = session.cookies['JSESSIONID']

                ui_stream.push(command='log', msg='SID : %s' % self.SID)
                ui_stream.push(command='log', msg='JSESSIONID : %s' % self.jsessionid)

                # Request succeeded
                ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1201])
                break
            elif res.status_code == 403:
                tries += 1
                ui_stream.push(command='log', msg='서버에서 거부하여 2초 뒤 재시도합니다. [%d/%d]' % (tries, MAX_TRIES))
                # Actually wait before retrying, as the message promises;
                # skip the wait when no further attempt will be made.
                if tries < MAX_TRIES:
                    time.sleep(RETRY_DELAY_SEC)
                continue
            else:
                # Request failed
                ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1301][0])
                raise sju_exceptions.InitSessionError()

        if tries >= MAX_TRIES:
            raise sju_exceptions.InitSessionError()
    except Exception:
        # The new session never became self.session, so nobody else would
        # close it; release its connections before propagating the error.
        session.close()
        raise

    # Swap in the new session only after it is fully initialised.
    if self.session:
        self.session.close()
    self.session = session