def start(self, query, start_year, end_year, gubun):
    '''
    Run a fast keyword search and push up to 5000 result records to the UI.

    The organization part of the query is always replaced by 'Sejong Univ'.
    Every outcome (progress logs, errors, the final record list) is
    reported through ``self.ui_stream.push``; nothing is returned.

    :param query: sequence of (keyword, p_authors, organization) strings;
        only the first two items are read, the third is overwritten
    :param start_year: first year of the search range (sent as ``startYear``)
    :param end_year: last year of the search range (sent as ``endYear``)
    :param gubun: search field category (sent as ``value(select1)``)
    :return: None
    '''
    # The organization is pinned to Sejong Univ regardless of the input.
    query = (query[0], query[1], 'Sejong Univ')

    session = self.session
    base_url = self.base_url
    ui_stream = self.ui_stream

    keyword = query[0]
    organization = query[2]

    # [Step 1/3] Initial search
    #########################################################################
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1002])
    ui_stream.push(command='log', msg='검색어 : %s' % keyword)

    # Keywords containing '=' are rejected before any request is made.
    if keyword.find('=') != -1:
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1300][0])
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': sju_CONSTANTS.STATE_MSG[1300][0]
        })
        return

    action_url = '/WOS_GeneralSearch.do'
    form_data = {
        'action': 'search',
        'product': 'WOS',
        'search_mode': 'GeneralSearch',
        'sa_params': 'WOS||%s|http://apps.webofknowledge.com|\'' % self.SID,
        'SID': self.SID,
        'value(input1)': keyword,
        'value(select1)': gubun,
        'startYear': start_year,
        'endYear': end_year,
    }
    if organization != '':
        # Second search row: AND <organization> in the address (AD) field.
        form_data.update({
            'limitStatus': 'expanded',
            'value(bool_1_2)': 'AND',
            'value(input2)': organization,
            'value(select2)': 'AD',
            'fieldCount': '2',
        })

    # Send the search request.
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1102])
    url = base_url + action_url

    # NOTE: sju_utiles.get_form_data() is deliberately not applied to this
    # request. On the Sejong WiFi some characters in the field names were
    # not encoded correctly; the offending character is still unidentified.
    self.qid += 1
    # NOTE(review): the meaning of the literal 5 is defined by sju_post
    # (presumably a retry count or timeout) - confirm against sju_utiles.
    http_res = sju_utiles.sju_post(session, url, form_data, 5, query)

    # Access denied
    if http_res.status_code == 403:
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': '검색을 요청했으나 서버가 접근 권한 없음을 반환했습니다.'
        })
        return

    soup = BeautifulSoup(http_res.content, 'html.parser')

    # No full-record link on the result page means the search found nothing.
    if soup.select_one('a.snowplow-full-record') is None:
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][0])
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': sju_CONSTANTS.STATE_MSG[1302][0]
        })
        return

    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1202])

    # [Step 3/3] Download the whole result set through the "Fast 5000" export
    #########################################################################
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1204])

    # The result page carries the query id the server assigned; adopt it.
    self.qid = int(soup.select('input#qid')[0].attrs['value'])

    action_url = '/OutboundService.do?action=go&&'
    form_data = {
        'qid': str(self.qid),
        'SID': self.SID,
        'mark_to': '5000',
        'markTo': '5000',
    }
    form_data = sju_utiles.get_form_data(action_url, form_data)

    url = base_url + action_url
    http_res = sju_utiles.sju_post(session, url, form_data, 5, query)
    self.qid += 1

    # Access denied
    if http_res.status_code == 403:
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': '인용 논문 자료 다운로드를 요청했으나 서버가 접근 권한 없음을 반환했습니다.'
        })
        return

    # Parse the export: first line is the tab-separated header, the rest
    # are records.
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1404])
    export_lines = http_res.content.decode('utf-8').replace('\r', '').split('\n')
    keys = export_lines[0].split('\t')

    articles = []
    used_ids = set()
    for row in export_lines[1:]:
        # Blank lines (including the trailing one) carry no record.
        if not row:
            continue
        # zip() tolerates rows with fewer fields than the header instead of
        # raising IndexError.
        article = dict(zip(keys, row.split('\t')))

        # 32-bit ids, re-drawn on collision, so every record gets a distinct
        # id (8 bits could not give 5000 rows unique ids).
        article_id = str(random.getrandbits(32))
        while article_id in used_ids:
            article_id = str(random.getrandbits(32))
        used_ids.add(article_id)
        article['id'] = article_id

        articles.append(article)

    # Renew the session once the query counter passes 180.
    if self.qid > 180:
        self.set_session()

    ui_stream.push(command='res', target='fast_5000', res=articles)
    return
def start(self, query_string_url, session, p_authors):
    '''
    Fetch one paper's detail page and its citing articles, and push both
    to the UI.

    Every outcome (progress logs, errors, ``paperData`` and
    ``citingArticles`` results) is reported through
    ``self.ui_stream.push``; nothing is returned.

    :param query_string_url: full URL of the paper's detail page; the SID is
        sliced out of it, which assumes the text ``SID=<value>`` is directly
        followed by a one-character separator and ``page``
    :param session: HTTP session used for every request in this call
    :param p_authors: reference authors as one ';'-separated string, or ''
        to skip the self/others classification
    :return: None
    '''
    ui_stream = self.ui_stream
    base_url = self.base_url

    sid = query_string_url[query_string_url.find('SID') + 4:
                           query_string_url.find('page') - 1]
    paper_data_id = str(random.getrandbits(32))

    # Error reports identify the paper by the URL tail starting at 'page'.
    page_ref = query_string_url[query_string_url.find('page'):]

    # Request the detail page.
    http_res = session.get(query_string_url)

    # Access denied
    if http_res.status_code == 403:
        ui_stream.push(command='res', target='errQuery', res={
            'query': page_ref,
            'msg': '해당 논문의 상세 정보를 요청했으나 서버가 접근 권한 없음을 반환했습니다.'
        })
        return

    # Fetch the paper data and its citation link
    #########################################################################
    try:
        paper_data, cnt_link = sju_utiles.parse_paper_data(
            http_res.content, paper_data_id, "dupl")
    except Exception:
        # Any parsing failure is reported to the UI rather than propagated.
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[4302][2])
        ui_stream.push(command='res', target='errQuery', res={
            'query': page_ref,
            'msg': sju_CONSTANTS.STATE_MSG[4302][2]
        })
        return

    ui_stream.push(command='res', target='paperData', res=paper_data)
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4203])

    # 'query' is only used as the identifier in later error reports.
    try:
        query = [paper_data['title'], p_authors, paper_data['researchAreas']]
    except Exception:
        query = [paper_data['title'], '', '']

    # Citing articles
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4004])

    # Branch on the citation count.
    if not cnt_link:
        # The paper has no citing articles.
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4304][0])
        return
    if int(paper_data['timesCited']) > 4999:
        # 5000 or more citing articles cannot be exported in one request.
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[4304][1])
        ui_stream.push(command='res', target='errQuery', res={
            'query': page_ref,
            'msg': sju_CONSTANTS.STATE_MSG[4304][1]
        })
        ui_stream.push(command='log', msg='[url]: %s' % query_string_url)
        return

    # Request the citing-articles list.
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4104])
    http_res = session.get(base_url + cnt_link['href'])

    # Access denied
    if http_res.status_code == 403:
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': '인용 리포트를 요청했으나 서버가 접근 권한 없음을 반환했습니다.'
        })
        return

    soup = BeautifulSoup(http_res.content, 'html.parser')
    page_text = soup.text

    # The citation link exists, but following it shows a "no results" page.
    if (page_text.find('Your search found no records') != -1
            or page_text.find(
                'None of the Citing Articles are in your subscription') != -1):
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4304][3])
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': sju_CONSTANTS.STATE_MSG[4304][3]
        })
        return

    # Query id of the citing-articles result set, taken from the page itself.
    citing_qid = int(soup.select('input#qid')[0].attrs['value'])
    times_cited = paper_data['timesCited']

    # Request and download the "Fast 5000" export.
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4204])
    action_url = '/OutboundService.do?action=go&&'
    form_data = {
        'qid': str(citing_qid),
        'SID': sid,
        'mark_to': times_cited,
        'markTo': times_cited,
    }
    form_data = sju_utiles.get_form_data(action_url, form_data)
    http_res = session.post(base_url + action_url, form_data)

    # Access denied
    if http_res.status_code == 403:
        ui_stream.push(command='res', target='errQuery', res={
            'query': page_ref,
            'msg': '인용 논문 자료 다운로드를 요청했으나 서버가 접근 권한 없음을 반환했습니다.'
        })
        return

    # Parse the export: first line is the tab-separated header, the rest
    # are records.
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4404])
    export_lines = http_res.content.decode('utf-8').replace('\r', '').split('\n')
    keys = export_lines[0].split('\t')
    # Blank lines carry no record; zip() tolerates rows shorter than the
    # header instead of raising IndexError.
    citing_articles = [
        dict(zip(keys, row.split('\t')))
        for row in export_lines[1:] if row
    ]

    # Convert to the shape the UI expects
    ##############################################################################
    citingArticles = {
        'id': paper_data['id'],
        'selfCitation': 0,
        'othersCitation': 0,
        'titles': [],
        'authors': [],
        'isSelf': []
    }

    # Reference authors: spaces and commas are stripped so that
    # "Kim, J" is compared as "KimJ". Empty entries (e.g. from a trailing
    # ';') are dropped because an empty pattern would match every article,
    # and names are escaped so they are matched literally, not as regexes.
    author_patterns = []
    for raw_name in (p_authors or '').split(';'):
        name = raw_name.replace(' ', '').replace(',', '')
        if name:
            author_patterns.append(
                re.compile(re.escape(name), flags=re.IGNORECASE))
    if author_patterns:
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1504])

    for article in citing_articles:
        article_authors = article.get('AU', '')
        citingArticles['titles'].append(article.get('TI', ''))
        citingArticles['authors'].append(article_authors)

        if not author_patterns:
            # No reference authors: classification is not applicable.
            citingArticles['isSelf'].append('-')
            continue

        compact_authors = article_authors.replace(' ', '').replace(',', '')
        if any(p.search(compact_authors) for p in author_patterns):
            citingArticles['selfCitation'] += 1
            citingArticles['isSelf'].append('Self')
        else:
            citingArticles['othersCitation'] += 1
            citingArticles['isSelf'].append('Others\'')

    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4404])
    ui_stream.push(command='res', target='citingArticles', res=citingArticles)
    return
def start(self, query, start_year, end_year, gubun):
    '''
    Search for exactly one paper and push its details, per-year citation
    data and citing articles to the UI.

    The organization part of the query is always replaced by 'Sejong Univ'.
    Every outcome (progress logs, errors, ``paperData``, ``tc_data`` and
    ``citingArticles`` results) is reported through
    ``self.ui_stream.push``; nothing is returned.

    :param query: sequence of (keyword, p_authors, organization) strings;
        only the first two items are read, the third is overwritten.
        p_authors is a ';'-separated list of reference authors, or ''
    :param start_year: first year of the search range (sent as ``startYear``)
    :param end_year: last year of the search range (sent as ``endYear``)
    :param gubun: search field category (sent as ``value(select1)``)
    :return: None
    '''
    # The organization is pinned to Sejong Univ regardless of the input.
    query = (query[0], query[1], 'Sejong Univ')

    session = self.session
    base_url = self.base_url
    ui_stream = self.ui_stream

    keyword = query[0]
    p_authors = query[1]
    organization = query[2]
    paper_data_id = str(random.getrandbits(32))

    # [Step 1/3] Initial search
    #########################################################################
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1002])
    ui_stream.push(command='log', msg='검색어 : %s' % keyword)

    # Keywords containing '=' are rejected before any request is made.
    if keyword.find('=') != -1:
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1300][0])
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': sju_CONSTANTS.STATE_MSG[1300][0]
        })
        return

    action_url = '/WOS_GeneralSearch.do'
    form_data = {
        'action': 'search',
        'product': 'WOS',
        'search_mode': 'GeneralSearch',
        'sa_params': 'WOS||%s|http://apps.webofknowledge.com|\'' % self.SID,
        'SID': self.SID,
        'value(input1)': keyword,
        'value(select1)': gubun,
        'startYear': start_year,
        'endYear': end_year,
    }
    if organization != '':
        # Second search row: AND <organization> in the address (AD) field.
        form_data.update({
            'limitStatus': 'expanded',
            'value(bool_1_2)': 'AND',
            'value(input2)': organization,
            'value(select2)': 'AD',
            'fieldCount': '2',
        })

    # Send the search request.
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1102])
    url = base_url + action_url

    # NOTE: sju_utiles.get_form_data() is deliberately not applied to this
    # request. On the Sejong WiFi some characters in the field names were
    # not encoded correctly; the offending character is still unidentified.
    self.qid += 1
    http_res = session.post(url, form_data)

    # Access denied
    if http_res.status_code == 403:
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': '검색을 요청했으나 서버가 접근 권한 없음을 반환했습니다.'
        })
        return

    soup = BeautifulSoup(http_res.content, 'html.parser')
    atag_list = soup.select('a.snowplow-full-record')
    report_link = soup.select('a.citation-report-summary-link')

    # This flow requires exactly one hit on the result page.
    if not atag_list:
        # No search result.
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][0])
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': sju_CONSTANTS.STATE_MSG[1302][0]
        })
        return
    if len(atag_list) > 1:
        # Two or more search results.
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][1])
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': sju_CONSTANTS.STATE_MSG[1302][1]
        })
        return

    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1202])

    # [Step 2/3] Fetch the detail page; look up citation years in a thread
    #########################################################################
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1003])

    # The worker thread receives this dict as an argument; 'tc_dict' is read
    # back after join() (presumably filled in by get_tc_data - confirm).
    tc_data = {'tc_dict': []}
    tc_parsing_thread = None

    # Only start the lookup when the citation report link was found.
    if report_link:
        tc_parsing_thread = threading.Thread(
            target=self.get_tc_data,
            args=(report_link, paper_data_id, tc_data))
        tc_parsing_thread.start()

    # Reach the detail page through the link on the result list page.
    query_string = atag_list[0]['href']

    # Request the detail page.
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1103])
    http_res = session.get(base_url + query_string)

    # Access denied
    if http_res.status_code == 403:
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': '해당 논문의 상세 정보를 요청했으나 서버가 접근 권한 없음을 반환했습니다.'
        })
        return

    # Parse the detail page.
    try:
        paper_data, cnt_link = sju_utiles.parse_paper_data(
            http_res.content, paper_data_id, "single")
    except sju_exceptions.NoPaperDataError:
        # No search result.
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][0])
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': sju_CONSTANTS.STATE_MSG[1302][0]
        })
        return
    except sju_exceptions.MultiplePaperDataError:
        # Two or more search results.
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][1])
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': sju_CONSTANTS.STATE_MSG[1302][1]
        })
        return
    except Exception:
        # Any other parsing failure is reported to the UI, not propagated.
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][2])
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': sju_CONSTANTS.STATE_MSG[1302][2]
        })
        return

    ui_stream.push(command='res', target='paperData', res=paper_data)

    # Wait for the citation-year lookup to finish.
    # NOTE(review): the completion log is emitted only when a lookup thread
    # was actually started - confirm this matches the intended behaviour.
    if tc_parsing_thread:
        tc_parsing_thread.join()
        ui_stream.push(command='log', msg='인용 년도 조회가 완료되었습니다.')

    # Publish the per-year data only when the lookup produced something.
    tc_dict = tc_data['tc_dict']
    if len(tc_dict) > 0:
        ui_stream.push(command='res', target='tc_data', res={
            'id': paper_data_id,
            'tc_data': tc_dict
        })

    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1203])

    # [Step 3/3] Citing articles
    #########################################################################
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1004])

    # Branch on the citation count.
    if not cnt_link:
        # The paper has no citing articles.
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1304][0])
        self.qid += 1
        return
    if int(paper_data['timesCited']) > 4999:
        # 5000 or more citing articles cannot be exported in one request.
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1304][1])
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': sju_CONSTANTS.STATE_MSG[1304][1]
        })
        self.qid += 1
        return

    # Request the citing-articles list.
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1104])
    http_res = session.get(base_url + cnt_link['href'])

    # Access denied
    if http_res.status_code == 403:
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': '인용 리포트를 요청했으나 서버가 접근 권한 없음을 반환했습니다.'
        })
        return

    soup = BeautifulSoup(http_res.content, 'html.parser')
    page_text = soup.text

    # The citation link exists, but following it shows a "no results" page.
    if (page_text.find('Your search found no records') != -1
            or page_text.find(
                'None of the Citing Articles are in your subscription') != -1):
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1304][3])
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': sju_CONSTANTS.STATE_MSG[1304][3]
        })
        return

    # The page carries the query id the server assigned; adopt it.
    self.qid = int(soup.select('input#qid')[0].attrs['value'])
    times_cited = paper_data['timesCited']

    # Request and download the "Fast 5000" export.
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1204])
    action_url = '/OutboundService.do?action=go&&'
    form_data = {
        'qid': str(self.qid),
        'SID': self.SID,
        'mark_to': times_cited,
        'markTo': times_cited,
    }
    form_data = sju_utiles.get_form_data(action_url, form_data)
    http_res = session.post(base_url + action_url, form_data)
    self.qid += 1

    # Access denied
    if http_res.status_code == 403:
        ui_stream.push(command='res', target='errQuery', res={
            'query': query,
            'msg': '인용 논문 자료 다운로드를 요청했으나 서버가 접근 권한 없음을 반환했습니다.'
        })
        return

    # Parse the export: first line is the tab-separated header, the rest
    # are records.
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1404])
    export_lines = http_res.content.decode('utf-8').replace('\r', '').split('\n')
    keys = export_lines[0].split('\t')
    # Blank lines carry no record; zip() tolerates rows shorter than the
    # header instead of raising IndexError.
    citing_articles = [
        dict(zip(keys, row.split('\t')))
        for row in export_lines[1:] if row
    ]

    # Convert to the shape the UI expects.
    citingArticles = {
        'id': paper_data['id'],
        'selfCitation': 0,
        'othersCitation': 0,
        'titles': [],
        'authors': [],
        'isSelf': []
    }

    # Reference authors: spaces and commas are stripped so that
    # "Kim, J" is compared as "KimJ". Empty entries (e.g. from a trailing
    # ';') are dropped because an empty pattern would match every article,
    # and names are escaped so they are matched literally, not as regexes.
    author_patterns = []
    for raw_name in (p_authors or '').split(';'):
        name = raw_name.replace(' ', '').replace(',', '')
        if name:
            author_patterns.append(
                re.compile(re.escape(name), flags=re.IGNORECASE))
    if author_patterns:
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1504])

    for article in citing_articles:
        article_authors = article.get('AU', '')
        citingArticles['titles'].append(article.get('TI', ''))
        citingArticles['authors'].append(article_authors)

        if not author_patterns:
            # No reference authors: classification is not applicable.
            citingArticles['isSelf'].append('-')
            continue

        compact_authors = article_authors.replace(' ', '').replace(',', '')
        if any(p.search(compact_authors) for p in author_patterns):
            citingArticles['selfCitation'] += 1
            citingArticles['isSelf'].append('Self')
        else:
            citingArticles['othersCitation'] += 1
            citingArticles['isSelf'].append('Others\'')

    ui_stream.push(command='res', target='citingArticles', res=citingArticles)

    # [End] Single detailed search
    #########################################################################
    # Renew the session before the search-history limit is hit.
    if self.qid > 180:
        self.set_session()

    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1200][0])
    return