def get_name(input_data):
    """Look up each author's preferred name via the Scopus author API and write it to scopus_author_name."""
    count = 0
    while count < len(input_data):
        all_person_data_list = []
        try:
            for i in range(count, len(input_data)):
                print('Current progress: %s / %s' % (i + 1, len(input_data)))
                url = base_url % input_data[i][1]
                author_info = requests.get(url=url, proxies=proxies, headers=headers).json()
                preferred = author_info['search-results']['entry'][0]['preferred-name']
                author_name = preferred['surname'] + ' ' + preferred.get('initials', '')
                all_person_data_list.append(
                    [input_data[i][0], input_data[i][1], author_name.strip()])
            count = len(input_data)
        # On error, break at the failing index and resume from there on the next pass
        except Exception as err:
            print('ERROR:%s' % err)
            count = i
        if all_person_data_list:
            result_df = pd.DataFrame(
                data=all_person_data_list,
                columns=['person_id', 'scopus_id', 'name'])
            write2sql([['scopus_author_name', result_df]])
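# The crawlers in this collection all persist their results through `write2sql`, whose
# implementation is not shown here. The block below is purely a hedged sketch of what such
# a helper might look like, assuming a SQLAlchemy-style connection string; the name
# `write2sql_sketch` and the connection string are hypothetical, and the real helper may
# handle engines, dtypes and retries differently.
from sqlalchemy import create_engine


def write2sql_sketch(table_df_pairs,
                     conn_str='mysql+pymysql://user:password@host:3306/db'):
    """Append each DataFrame in [['table_name', df], ...] to its target table."""
    engine = create_engine(conn_str)
    for table_name, df in table_df_pairs:
        # pandas generates the INSERT statements; if_exists='append' keeps existing rows
        df.to_sql(table_name, con=engine, if_exists='append', index=False)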
def get_cite_data(input_data):
    """Collect the citing journals for each journal id and write per-journal citation counts to scopus_cite_data0810."""
    count = 0
    while count < len(input_data):
        result_df_list = []
        # Launch the browser and grab cookies
        driver = start_driver()
        cookies = get_cookies(driver)
        driver.close()
        try:
            for i in range(count, len(input_data)):
                print('Current progress: %s / %s' % (i + 1, len(input_data)))
                cite_journal_list = []
                cite_journal_id_list = []
                get_page_url = base_url % (input_data[i][0], str(1))
                page_info = requests.get(get_page_url,
                                         proxies=proxies,
                                         headers=headers,
                                         timeout=300,
                                         cookies=cookies).json()
                cite_journal_list += [k['srctitle'] for k in page_info['docs']]
                cite_journal_id_list += [k['srcid'] for k in page_info['docs']]
                # Remaining result pages
                for j in range(2, int(page_info['Pages']) + 1):
                    url = base_url % (input_data[i][0], str(j))
                    cite_info = requests.get(url,
                                             proxies=proxies,
                                             headers=headers,
                                             timeout=300,
                                             cookies=cookies)
                    cite_info_dict = cite_info.json()
                    cite_journal_list += [k['srctitle'] for k in cite_info_dict['docs']]
                    cite_journal_id_list += [k['srcid'] for k in cite_info_dict['docs']]
                cite_df_temp = pd.DataFrame(
                    data={
                        'cite_journal': cite_journal_list,
                        'cite_journal_id': cite_journal_id_list
                    })
                # Count citations per citing journal
                cite_df_temp['cite_num'] = 1
                cite_journal_data = cite_df_temp.groupby(
                    by=['cite_journal', 'cite_journal_id'], as_index=False).sum()
                cite_journal_data['scopus_journal_id'] = input_data[i][0]
                result_df_list.append(cite_journal_data)
            count = len(input_data)
        # On error, break at the failing index and resume from there
        except Exception as err:
            print('ERROR:%s' % err)
            count = i
        if result_df_list:
            all_data = pd.concat(result_df_list)
            write2sql([['scopus_cite_data0810', all_data]])
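# `start_driver` and `get_cookies` are defined elsewhere in these scripts. A minimal sketch
# of the assumed behaviour: open a Selenium browser so the site issues session cookies,
# then convert them into the {name: value} dict that `requests` accepts. The `_sketch`
# function names and the landing URL are assumptions, not the original implementation.
from selenium import webdriver


def start_driver_sketch():
    driver = webdriver.Chrome()           # assumes chromedriver is on the PATH
    driver.get('https://www.scopus.com')  # hypothetical landing page
    return driver


def get_cookies_sketch(driver):
    # Selenium returns a list of cookie dicts; requests wants a plain {name: value} mapping
    return {c['name']: c['value'] for c in driver.get_cookies()}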
def get_doc_data(input_data, school, value):
    """Post per-subject, per-year queries for one institution and write the document counts to wos_doc_data_copy."""
    headers['content-type'] = 'application/json'
    post_url = post_url_base % value[1]
    if school == 'Chinese Academy of Medical Sciences - Peking Union Medical College':
        del post_data['filters']['orgtype']
    count = 0
    while count < len(input_data):
        result_df_list = []
        try:
            session = requests.session()
            session.post(url=login_url,
                         data=login_data,
                         headers=headers_login,
                         timeout=300)
            for i in range(count, len(input_data)):
                print('Current progress: %s / %s' % (i + 1, len(input_data)))
                for year in range(2015, 2020):
                    post_data['filters']['sbjname']['is'] = input_data[i]
                    post_data['filters']['period']['is'] = [year, year]
                    doc_data = session.post(url=post_url,
                                            data=json.dumps(post_data),
                                            headers=headers,
                                            timeout=300).json()
                    if doc_data['items']:
                        for item in doc_data['items']:
                            item['doc_num'] = item['jifdocsq1']['value']
                            del item['jifdocsq1']
                            del item['wosDocuments']
                        data_df = pd.DataFrame(data=doc_data['items'])
                        data_df['year'] = year
                        # The category string carries the id in its first 4 characters
                        # and the name after the separating space
                        data_df['category_id'] = input_data[i][:4]
                        data_df['category_name'] = input_data[i][5:]
                        result_df_list.append(data_df)
            count = len(input_data)
            session.close()
        # On error, break at the failing index and resume from there
        except Exception as err:
            print('ERROR:%s' % err)
            session.close()
            count = i
        if result_df_list:
            all_data_df = pd.concat(result_df_list)
            all_data_df = all_data_df.loc[all_data_df['orgName'] == value[0]]
            all_data_df['orgName'] = school
            write2sql([['wos_doc_data_copy', all_data_df]])
def get_article(input_data):
    """Page through each author's Scopus document list and write the article identifiers to scopus_author_article."""
    count = 0
    while count < len(input_data):
        all_person_data_list = []
        try:
            for i in range(count, len(input_data)):
                print('Current progress: %s / %s' % (i + 1, len(input_data)))
                # Page through the results, 25 records per request
                for j in range(1000):
                    url = base_url.format(input_data[i][1], input_data[i][2], str(j * 25))
                    author_article_info = requests.get(url=url,
                                                       proxies=proxies,
                                                       headers=headers).json()
                    if 'service-error' in author_article_info:
                        print(input_data[i][2])
                        break
                    if 'entry' not in author_article_info['search-results']:
                        break
                    if 'error' in author_article_info['search-results']['entry'][0]:
                        break
                    for article_dict in author_article_info['search-results']['entry']:
                        doi = article_dict.get('prism:doi', '')
                        eid = article_dict.get('eid', '')
                        # Strip the 'SCOPUS_ID:' prefix from the identifier
                        scopus_article_id = article_dict['dc:identifier'][10:]
                        publish_year = article_dict.get('prism:coverDate', '')
                        all_person_data_list.append([
                            input_data[i][0], input_data[i][1], doi,
                            publish_year, eid, scopus_article_id
                        ])
            count = len(input_data)
        # On error, break at the failing index and resume from there
        except Exception as err:
            print('ERROR:%s' % err)
            count = i
        if all_person_data_list:
            result_df = pd.DataFrame(data=all_person_data_list,
                                     columns=[
                                         'person_id', 'scopus_id', 'doi',
                                         'publish_year', 'eid',
                                         'scopus_article_id'
                                     ])
            write2sql([['scopus_author_article', result_df]])
def get_doc_data(input_data):
    """Post per-subject detail queries for each organisation and write the results to wos_doc_data_detail."""
    headers['content-type'] = 'application/json'
    count = 0
    while count < len(input_data):
        result_df_list = []
        try:
            session = requests.session()
            session.post(url=login_url,
                         data=login_data,
                         headers=headers_login,
                         timeout=300)
            for i in range(count, len(input_data)):
                print('Current progress: %s / %s' % (i + 1, len(input_data)))
                post_url = post_url_base.format(input_data[i][1])
                post_data_detail['filters']['sbjname']['is'] = \
                    input_data[i][3] + ' ' + input_data[i][4]
                doc_data = session.post(url=post_url,
                                        data=json.dumps(post_data_detail),
                                        headers=headers,
                                        timeout=300).json()
                if doc_data['items']:
                    for item in doc_data['items']:
                        item['title'] = item['a']['title']
                        del item['a']
                    data_df = pd.DataFrame(data=doc_data['items'])
                    data_df['category_id'] = input_data[i][3]
                    data_df['category_name'] = input_data[i][4]
                    data_df['orgName'] = input_data[i][0]
                    result_df_list.append(data_df)
            count = len(input_data)
            session.close()
        # On error, break at the failing index and resume from there
        except Exception as err:
            print('ERROR:%s' % err)
            session.close()
            count = i
        if result_df_list:
            all_data_df = pd.concat(result_df_list)
            write2sql([['wos_doc_data_detail', all_data_df]])
def main_prog(driver, subject_code, type_code):
    """Query the NSFC award search for one grant type and subject code and write the results to nsfc_data."""
    driver.get(basic_url)
    time.sleep(0.5)
    # Select the grant type
    driver.find_element_by_id('f_grantCode').click()
    time.sleep(0.5)
    driver.find_element_by_xpath(
        '/html/body/div[4]/div[1]/div[2]/div/table/tbody/tr[1]/td/table/tbody/tr[6]/td[2]/select/option[%s]'
        % type_code).click()
    time.sleep(0.5)
    # Select the year
    # driver.find_element_by_id('f_year').click()
    # time.sleep(0.5)
    # driver.find_element_by_xpath('/html/body/div[4]/div[1]/div[2]/div/table/tbody/tr[1]/td/table/tbody/tr[10]/td[2]/select/option[1]').click()
    # Enter the application (subject) code
    driver.find_element_by_name('subjectCode').send_keys(subject_code)
    time.sleep(2)
    driver.find_element_by_xpath('/html/body/div[6]/ul/li[1]').click()
    time.sleep(0.5)
    # Enter the captcha; keep retrying until the results page (which contains '递减') loads
    while '递减' not in driver.page_source:
        # Run OCR on the captcha
        check_code = check_code_ocr(driver)
        # Fill in the captcha
        driver.find_element_by_id('f_checkcode').clear()
        driver.find_element_by_id('f_checkcode').send_keys(check_code)
        time.sleep(0.5)
        # Submit the query
        driver.find_element_by_id('searchBt').click()
        time.sleep(0.5)
    result_list = get_data(driver)
    if result_list:
        result_df = pd.DataFrame(data=result_list,
                                 columns=[
                                     'id', '项目批准号', '申请代码', '项目名称',
                                     '项目负责人', '依托单位', '批准金额', '项目起止年月'
                                 ])
        write2sql([['nsfc_data', result_df]])
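# `check_code_ocr` is defined elsewhere. One possible shape for it, sketched under the
# assumptions that the captcha is an <img> element (the id 'checkcodeImg' is hypothetical)
# and that Tesseract is installed locally; the real routine may use a different OCR approach.
import io

import pytesseract
from PIL import Image


def check_code_ocr_sketch(driver):
    img_element = driver.find_element_by_id('checkcodeImg')  # hypothetical element id
    png_bytes = img_element.screenshot_as_png                # screenshot only the captcha
    image = Image.open(io.BytesIO(png_bytes))
    return pytesseract.image_to_string(image).strip()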
def crawl_h_index(input_data):
    """Scrape each author's h-index from the Scopus author detail page and write it to the h_index table."""
    count = 0
    while count < len(input_data):
        # Launch the browser and grab cookies
        driver = start_driver()
        cookies = get_cookies(driver)
        driver.close()
        result_list = []
        try:
            # Match each scholar on Scopus and collect their information
            for i in range(count, len(input_data)):
                print('Current progress: %s / %s' % (i + 1, len(input_data)))
                url = 'https://www.scopus.com/authid/detail.uri?authorId=%s' % input_data[i][1]
                detail_page = requests.get(url,
                                           proxies=proxies,
                                           headers=headers,
                                           timeout=300,
                                           cookies=cookies)
                soup = bs(detail_page.text, 'lxml')
                h_index = soup.find(id='authorDetailsHindex')
                element = h_index.find(class_='fontLarge')
                if element:
                    result_list.append(
                        [input_data[i][0], input_data[i][1], element.text])
            # End the loop
            count = len(input_data)
        # On error, break at the failing index and resume from there
        except Exception as err:
            print('ERROR:%s' % err)
            count = i
        # Write whatever has been collected so far
        result_df = pd.DataFrame(data=result_list,
                                 columns=['person_id', 'scopus_id', 'h_index'])
        write2sql([['h_index', result_df]])
def get_detail_data(input_data):
    """Collect article/citation data, h-index and affiliation history for each scholar from Scopus."""
    # Batch id: timestamp plus two random digits
    data_no = datetime.datetime.now().strftime('%Y%m%d%H%M%S') + str(random.randint(10, 99))
    count = 0
    t_index = 4  # column of input_data holding the Scopus author id
    while count < len(input_data):
        # Launch the browser and grab cookies
        driver = start_driver()
        cookies = get_cookies(driver)
        driver.close()
        ins_data_list = []
        hindex_data_list = []
        article_data_list = []
        try:
            for i in range(count, len(input_data)):
                print('Current progress: %s / %s' % (i + 1, len(input_data)))
                # Fetch the scholar's article and citation data
                article_data_df = catch_info(input_data[i][t_index], cookies)
                article_data_df['name_zh'] = input_data[i][2]
                article_data_df['person_id'] = input_data[i][0]
                article_data_list.append(article_data_df)
                # Fetch the scholar's h-index
                detail_url = detail_base_url % input_data[i][t_index]
                detail_page = requests.get(detail_url,
                                           proxies=proxies,
                                           headers=headers,
                                           timeout=300,
                                           cookies=cookies)
                detail_soup = bs(detail_page.text, 'lxml')
                h_index = detail_soup.find(id='authorDetailsHindex')
                h_index_element = h_index.find(class_='fontLarge')
                if h_index_element:
                    hindex_data_list.append(
                        [input_data[i][0], input_data[i][t_index], h_index_element.text])
                # Fetch the scholar's affiliation history
                ins_url = ins_base_url % input_data[i][t_index]
                ins_page = requests.get(ins_url,
                                        proxies=proxies,
                                        headers=headers,
                                        timeout=300,
                                        cookies=cookies)
                ins_data = eval(ins_page.text)
                for ins in ins_data:
                    # print(len(ins['affiliationName']), ins['affiliationName'])
                    ins['start_year'] = ins['dateRange'][0]
                    ins['end_year'] = ins['dateRange'][1]
                    ins.pop('dateRange')
                ins_data_df = pd.DataFrame(ins_data)
                rename_dict = {
                    'affiliationCity': 'aff_city',
                    'affiliationName': 'aff_name',
                    'affiliationCountry': 'aff_country',
                    'affiliationId': 'aff_id',
                    'affiliationUrl': 'aff_url'
                }
                ins_data_df.rename(columns=rename_dict, inplace=True)
                ins_data_df['scopus_id'] = input_data[i][t_index]
                ins_data_df['person_id'] = input_data[i][0]
                ins_data_df['name_zh'] = input_data[i][2]
                ins_data_list.append(ins_data_df)
            count = len(input_data)
        # On error, break at the failing index and resume from there
        except Exception as err:
            print('ERROR:%s' % err)
            count = i
        # Guard each concat/DataFrame build against its own empty list
        all_ins_data = pd.DataFrame() if not ins_data_list else pd.concat(ins_data_list)
        all_hindex_data = pd.DataFrame() if not hindex_data_list else pd.DataFrame(
            data=hindex_data_list, columns=['person_id', 'scopus_id', 'h_index'])
        all_article_data = pd.DataFrame() if not article_data_list else pd.concat(article_data_list)
        for _df in [all_ins_data, all_article_data]:
            if len(_df) > 0:
                _df['data_no'] = data_no
                _df['flag'] = 2
        all_hindex_data['flag'] = 2
        write2sql([['author_info_new', all_article_data],
                   ['author_exp', all_ins_data],
                   ['h_index', all_hindex_data]])
try:
    for i in range(count, len(data_list)):
        print(i)
        data = data_list[i]
        url = data[1]
        # page = requests.get(url=url, headers=headers, proxies=proxies, cookies=cookies_dict)
        page = requests.get(url=url, headers=headers, cookies=cookies_dict)
        source = re.findall(r'''<div class="i-list_baokan t1_12">[\s\S]*?</div>''', page.text)
        if source:
            link_list = re.findall(r'''<li style=".*?"><a title=".*?" href="(.*?)">''', source[0])
            title_list = re.findall(r'''<li style=".*?"><a title="(.*?)" href=".*?''', source[0])
            domain_url = 'http://www.apabi.com/sjtu/'
            link_list = [domain_url + link for link in link_list]
            df = pd.DataFrame(data={'title': title_list, 'text_link': link_list})
            df['text_date'] = data[2]
            df['page_link'] = data[1]
            df['page_name'] = data[0]
            result_list.append(df)
    # End the loop
    count = len(data_list)
# On error, break at the failing index and resume from there
except Exception as err:
    print('ERROR:%s' % err)
    print('Current progress: %s / %s' % (i + 1, len(data_list)))
    count = i
if result_list:
    all_data_df = pd.concat(result_list)
    write2sql([['rmrb_copy', all_data_df]])
def get_detail_data(input_data):
    """Open each author's Scopus citation overview and aggregate per-year citation counts into article_cite_data."""
    # Batch id: timestamp plus two random digits
    data_no = datetime.datetime.now().strftime('%Y%m%d%H%M%S') + str(random.randint(10, 99))
    count = 0
    while count < len(input_data):
        all_person_data_list = []
        driver = start_driver()
        try:
            for i in range(count, len(input_data)):
                print('Current progress: %s / %s' % (i + 1, len(input_data)))
                url = detail_base_url % input_data[i][1]
                driver.get(url)
                try:
                    driver.find_element_by_id('authViewCitOver').click()
                except Exception as err:
                    print('ERROR:%s' % err)
                    continue
                time.sleep(10)
                cto_id1 = re.findall(r'CTOF_([0-9]+)', driver.current_url)
                cto_id2 = re.findall(r'var ctoId = "CTODS_([0-9]+)"', driver.page_source)
                one_person_data_list = []
                # Loop over result pages, 200 rows per page
                for j in range(5):
                    url2 = article_url % (cto_id2[0], cto_id1[0], 200 * j)
                    driver.get(url2)
                    if 'Error 500' in driver.page_source:
                        break
                    # Replace the &nbsp; placeholders with 0 so the empty cells parse as integers
                    soup = bs(driver.page_source.replace('&nbsp;', '0'), 'lxml')
                    publish_year = re.findall(r'<span>([0-9]{4})</span>', driver.page_source)
                    previous_cell_count = [
                        int(k.text) for k in soup.find_all(class_='previousCellCount')
                    ]
                    previous_years = [
                        int(k.text) for k in soup.find_all(class_='previousYears')
                    ]
                    sub_total = [int(k.text) for k in soup.find_all(class_='subTotal')]
                    all_total = [int(k.text) for k in soup.find_all(class_='subtotal')]
                    prev_latest = [
                        int(k.text) for k in soup.find_all(class_='prevLatestYears')
                    ]
                    # The per-year cells arrive as one flat list, 15 years per article row
                    one_page_data = pd.DataFrame({
                        'publish_year': publish_year,
                        'previous_cell_count': previous_cell_count,
                        '2006': previous_years[0::15],
                        '2007': previous_years[1::15],
                        '2008': previous_years[2::15],
                        '2009': previous_years[3::15],
                        '2010': previous_years[4::15],
                        '2011': previous_years[5::15],
                        '2012': previous_years[6::15],
                        '2013': previous_years[7::15],
                        '2014': previous_years[8::15],
                        '2015': previous_years[9::15],
                        '2016': previous_years[10::15],
                        '2017': previous_years[11::15],
                        '2018': previous_years[12::15],
                        '2019': previous_years[13::15],
                        '2020': previous_years[14::15],
                        'sub_total': sub_total,
                        'all_total': all_total,
                        'prev_latest': prev_latest
                    })
                    one_person_data_list.append(one_page_data)
                if one_person_data_list:
                    one_person_data = pd.concat(one_person_data_list)
                    one_person_data['person_id'] = input_data[i][0]
                    one_person_data['scopus_id'] = input_data[i][1]
                    all_person_data_list.append(one_person_data)
            count = len(input_data)
            driver.close()
        # On error, break at the failing index and resume from there
        except Exception as err:
            print('ERROR:%s' % err)
            count = i
            driver.close()
        if all_person_data_list:
            all_person_data = pd.concat(all_person_data_list)
            all_person_data['article_num'] = 1
            final_data = all_person_data.groupby(
                by=['person_id', 'scopus_id', 'publish_year'], as_index=False).sum()
            final_data['data_no'] = data_no
            write2sql([['article_cite_data', final_data]])
def main_prog(input_data):
    '''
    :param input_data: [{'person_id': 1234564,
                         'name': 'liu bo',
                         'ins': ['fudan university', 'xx university', 'xxx university'],
                         'ins_id': [111, 222, 333],
                         'name_zh': '刘博'},
                        {...}]
    :return:
    '''
    # Batch id: timestamp plus two random digits
    data_no = datetime.datetime.now().strftime('%Y%m%d%H%M%S') + str(random.randint(10, 99))
    logger.info('Data batch no.: %s' % data_no)
    count = 0
    while count < len(input_data):
        all_aff_df = pd.DataFrame(data=None, columns=None)
        basic_info_df = pd.DataFrame(data=None, columns=None)
        mult_result_df = pd.DataFrame(data=None, columns=None)
        not_matched_df = pd.DataFrame(data=None, columns=None)
        # Launch the browser and grab cookies
        driver = start_driver()
        cookies = get_cookies(driver)
        driver.close()
        try:
            # Match each scholar on Scopus and collect their information
            for i in range(count, len(input_data)):
                person_id = input_data[i]['person_id']
                author_name = input_data[i]['name']
                author_name_zh = input_data[i]['name_zh']
                author_ins = input_data[i]['ins']
                author_ins_id = input_data[i]['ins_id']
                logger.info('Current progress: Ruanke id: %s, name: %s, %s' %
                            (person_id, author_name_zh, author_name[0]))
                # Lower-case all English institution names
                author_ins = [ins.lower() for ins in author_ins]
                # TODO: temporary change, 06/24
                # authorID_list = get_id(person_id, author_name, author_name_zh, author_ins[0])
                authorID_list = []
                for _name in author_name:
                    for _ins in author_ins:
                        authorID_list.extend(get_id(person_id, _name, author_name_zh, _ins))
                        if authorID_list:
                            break
                    if authorID_list:
                        break
                # Match against the Scopus ids found for the institution
                aff_df, basic_info, mult_re, not_match = match(
                    cookies, person_id, author_name, author_name_zh,
                    author_ins_id, authorID_list)
                all_aff_df = all_aff_df.append(aff_df, ignore_index=True)
                basic_info_df = basic_info_df.append(basic_info, ignore_index=True)
                mult_result_df = mult_result_df.append(mult_re, ignore_index=True)
                not_matched_df = not_matched_df.append(not_match, ignore_index=True)
            # End the loop
            count = len(input_data)
        # On error, break at the failing index and resume from there
        except Exception as err:
            logger.info('ERROR:%s' % err)
            logger.info('Current progress: %s / %s' % (i + 1, len(input_data)))
            count = i
        # Add the data_no field
        for df in [all_aff_df, basic_info_df, mult_result_df, not_matched_df]:
            if len(df) > 0:
                df['data_no'] = data_no
        # Write whatever has been collected so far
        write2sql([['author_info_new', basic_info_df],
                   ['author_exp', all_aff_df],
                   ['mult_matched_author', mult_result_df],
                   ['not_matched_author', not_matched_df]])
# df = pd.read_excel('C:/Users/Administrator/Desktop/校名映射表.xlsx')
# data = pd.merge(data, df, how='left', on='orgName')
#
# print('merge done')
# print('start writing to database')
# write2sql([['wos_doc_data0930', data]])
# print('finished writing to database')

dbutil = DBUtil(host, port, database, username, password)
sql = 'select * from wos_doc_data_detail'
data = dbutil.get_allresult(sql, 'df')
print('Finished reading source data')
# Attach the school-name mapping
df = pd.read_excel('C:/Users/Administrator/Desktop/校名映射表.xlsx')
data = pd.merge(data, df, how='left', on='orgName')
# Write in chunks of 500,000 rows
data1 = data.loc[0:499999]
data2 = data.loc[500000:999999]
data3 = data.loc[1000000:1499999]
data4 = data.loc[1500000:1999999]
data5 = data.loc[2000000:2499999]
data6 = data.loc[2500000:2999999]
data7 = data.loc[3000000:]
data_list = [data1, data2, data3, data4, data5, data6, data7]
i = 0
for data in data_list:
    print(i)
    write2sql([['wos_doc_data_detail0930', data]])
    i += 1
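# The manual data1..data7 slices above could also be expressed as a loop; a rough
# equivalent (same 500,000-row chunks, same target table) is sketched here, commented out
# so it does not write the data a second time:
# chunk_size = 500000
# for chunk_no, start in enumerate(range(0, len(data), chunk_size)):
#     print(chunk_no)
#     chunk = data.iloc[start:start + chunk_size]
#     write2sql([['wos_doc_data_detail0930', chunk]])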
try:
    for i in range(count, len(data_list)):
        print(i)
        data = data_list[i]
        url = data[1]
        # page = requests.get(url=url, headers=headers, proxies=proxies, cookies=cookies_dict)
        page = requests.get(url=url, headers=headers, cookies=cookies_dict)
        source = re.findall(r'''<div class="i-list_baokan t1_12">[\s\S]*?</div>''', page.text)
        if source:
            link_list = re.findall(r'''<li style=".*?"><a title=".*?" href="(.*?)">''', source[0])
            title_list = re.findall(r'''<li style=".*?"><a title="(.*?)" href=".*?''', source[0])
            domain_url = 'http://www.apabi.com/sjtu/'
            link_list = [domain_url + link for link in link_list]
            df = pd.DataFrame(data={'title': title_list, 'text_link': link_list})
            df['text_date'] = data[2]
            df['page_link'] = data[1]
            df['page_name'] = data[0]
            result_list.append(df)
    # End the loop
    count = len(data_list)
# On error, break at the failing index and resume from there
except Exception as err:
    print('ERROR:%s' % err)
    print('Current progress: %s / %s' % (i + 1, len(data_list)))
    count = i
if result_list:
    all_data_df = pd.concat(result_list)
    write2sql([['rmrb', all_data_df]])
def get_doc_data(input_data, author_position, period):
    """Pull per-author document metrics for the given author position and period and write them to incites_author_data1008."""
    if period == '近10年':
        post_data['filters']['period']['is'] = [2010, 2019]
    if period == '近5年' or period == '近5年Q1':
        post_data['filters']['period']['is'] = [2015, 2019]
    if 'authorposition' in post_data['filters']:
        post_data['filters']['authorposition']['is'] = [author_position]
        if author_position == 'First & Corresponding':
            post_data['filters']['authorposition']['is'] = ['First', 'Corresponding']
        if author_position == 'All':
            del post_data['filters']['authorposition']
    # post_data is module-level, so re-add the filter in case an earlier 'All' run removed it
    if author_position == 'First & Corresponding':
        post_data['filters']['authorposition'] = {"is": ['First', 'Corresponding']}
    a = post_data  # leftover debugging alias, unused
    count = 0
    no_result = []
    while count < len(input_data):
        result_df_list = []
        try:
            session = requests.session()
            session.post(url=login_url,
                         data=login_data,
                         headers=headers_login,
                         timeout=300)
            for i in range(count, len(input_data)):
                print('Current progress: %s / %s' % (i + 1, len(input_data)))
                post_data['filters']['personId']['is'] = input_data[i][2]
                author_data = session.post(url=post_url,
                                           data=json.dumps(post_data),
                                           headers=headers,
                                           timeout=300).json()
                if author_data['items']:
                    for item in author_data['items']:
                        item['doc_num'] = item['wosDocuments']['value']
                        item['highly_cited_paper'] = item['highlyCitedPapers']['value']
                        item['inter_colla'] = item['intCollaborations']['value']
                        del item['wosDocuments']
                        del item['highlyCitedPapers']
                        del item['intCollaborations']
                    data_df = pd.DataFrame(data=author_data['items'])
                    data_df['person_id'] = input_data[i][0]
                    data_df['name_zh'] = input_data[i][1]
                    result_df_list.append(data_df)
                else:
                    no_result.append([
                        input_data[i][0], input_data[i][1], author_position, 0
                    ])
            count = len(input_data)
            session.close()
        # On error, break at the failing index and resume from there
        except Exception as err:
            print('ERROR:%s' % err)
            session.close()
            count = i
        if result_df_list:
            all_data_df = pd.concat(result_df_list)
            all_data_df['author_position'] = author_position
            all_data_df['period'] = period
            write2sql([['incites_author_data1008', all_data_df]])
        if no_result:
            no_result_df = pd.DataFrame(data=no_result,
                                        columns=['人才编号', '姓名', '作者类型', '文献数量'])
            no_result_df.to_excel(
                'C:/Users/Administrator/Desktop/no_result1008.xlsx', index=False)
def wos_article(input_data):
    """Search Web of Science for each author and download the tab-separated records in 500-row pages."""
    count = 0
    page_count = 33  # resume point within the current author's result pages (manually set)
    while count < len(input_data):
        article_data_all = []
        try:
            # Open a session and extract the SID
            session = requests.session()
            connection = session.get(url, headers=headers, proxies=proxies)
            sid = re.findall(r'SID=(\w+)&', connection.url)[0]
            # Start fetching data
            for i in range(count, len(input_data)):
                print('Current progress: %s / %s' % (i + 1, len(input_data)))
                print(input_data[i])
                name = input_data[i][0]
                data = data_ini.format(sid, sid, name)
                search_page = session.post(post_url,
                                           data=data,
                                           proxies=proxies,
                                           headers=headers,
                                           timeout=300)
                qid = re.findall(r'qid=([0-9]+)&', search_page.text)[0]
                search_result_num = re.findall(
                    r'FINAL_DISPLAY_RESULTS_COUNT = ([0-9]+);', search_page.text)[0]
                print(search_result_num)
                page_num = math.ceil(int(search_result_num) / 500)
                for k in range(page_count, page_num):
                    print(f'{k}/{page_num}')
                    data_dl = data_dl_ini.format(qid, sid, qid, sid, k * 500 + 500,
                                                 k * 500 + 1, name, k * 500 + 1,
                                                 k * 500 + 500)
                    dl_page = session.post(dl_post_url,
                                           data=data_dl,
                                           proxies=proxies,
                                           headers=headers,
                                           timeout=300)
                    if ('PT' not in dl_page.text) and ('AU' not in dl_page.text):
                        raise Exception('error')
                    data_list = dl_page.text.strip().split('\n')
                    # First line is the header row; the rest are tab-separated records
                    article_data_temp = [j.strip().split('\t') for j in data_list[1:]]
                    df_temp = pd.DataFrame(data=article_data_temp, columns=columns)
                    df_temp['name_zh'] = input_data[i][1]
                    article_data_all.append(df_temp)
                page_count = 0
            # End the loop
            count = len(input_data)
            session.close()
        # On error, break at the failing index and page, then resume from there
        except Exception as err:
            print('ERROR:%s' % err)
            session.close()
            count = i
            page_count = k
        if article_data_all:
            article_data_df = pd.concat(article_data_all)
            write2sql([['wos_article_data', article_data_df]])
                find_df_list.append(temp_df)
            else:
                not_find.append(source_list[i][4])
        else:
            not_find.append(source_list[i][4])
    # End the loop
    count = len(source_list)
# On error, break at the failing index and resume from there
except Exception as err:
    print('ERROR:%s' % err)
    print('Current progress: %s / %s' % (i + 1, len(source_list)))
    count = i
    # Skip the record entirely on UnicodeEncodeError and mark it as not found
    if str(type(err)) == "<class 'UnicodeEncodeError'>":
        count += 1
        not_find.append(source_list[i][4])
if len(find_df_list) > 0:
    all_find = pd.concat(find_df_list)
else:
    all_find = pd.DataFrame()
if len(not_find) > 0:
    all_not_find = pd.DataFrame(data=not_find, columns=['person_id'])
else:
    all_not_find = pd.DataFrame()
write2sql([['find_result', all_find], ['not_find', all_not_find]])
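# Every crawler above repeats the same resume pattern: a while loop over `count`, a try/for
# that walks the remaining items, and an except that resets `count` to the failing index so
# the next pass restarts there. Below is a hedged sketch of that pattern factored into a
# reusable helper; the scripts above do not actually use it, and the names `resume_loop`,
# `process_one` and `flush` are illustrative only.
def resume_loop(items, process_one, flush):
    """Apply process_one(i, item) to every item, resuming from the failing index on error."""
    count = 0
    while count < len(items):
        results = []
        i = count
        try:
            for i in range(count, len(items)):
                results.append(process_one(i, items[i]))
            count = len(items)
        except Exception as err:
            # Like the scripts above, this retries the failing item on the next pass
            print('ERROR:%s' % err)
            count = i
        # Flush whatever was collected before the error
        if results:
            flush(results)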