def deal31():
    dbutil = DBUtil(host, port, database, username, password)
    sql = 'select * from author_info_new'
    author_info_new0910 = dbutil.get_allresult(sql, 'df')
    sql = 'select * from h_index'
    h_index0910 = dbutil.get_allresult(sql, 'df')
    sql = 'select * from article_cite_data'
    article_cite_data0910 = dbutil.get_allresult(sql, 'df')
    sql = 'select scopus_id, person_id, publish_year, count(*) as first_au_doc_num from scopus_author_article ' \
          'group by scopus_id, person_id, publish_year'
    scopus_author_article0910 = dbutil.get_allresult(sql, 'df')
    physics_list = pd.read_excel(
        'C:/Users/Administrator/Desktop/物理学人才清单_20200908.xlsx',
        sheet_name='data')
    physics_list['人才编号'] = physics_list['人才编号'].astype('str')
    author_info_new0910 = author_info_new0910.loc[
        author_info_new0910['person_id'].isin(list(physics_list['人才编号']))]
    h_index0910 = h_index0910.loc[h_index0910['person_id'].isin(
        list(physics_list['人才编号']))]
    article_cite_data0910 = article_cite_data0910.loc[
        article_cite_data0910['person_id'].isin(list(physics_list['人才编号']))]
    scopus_author_article0910 = scopus_author_article0910.loc[
        scopus_author_article0910['person_id'].isin(list(
            physics_list['人才编号']))]
    dbutil.df_insert('author_info_new0910', author_info_new0910)
    dbutil.df_insert('h_index0910', h_index0910)
    dbutil.df_insert('article_cite_data0910', article_cite_data0910)
    dbutil.df_insert('scopus_author_article0910', scopus_author_article0910)
def deal6():
    dbutil = DBUtil(host, port, database, username, password)
    sql = "select person_id, name from not_matched_author where data_no='2020060819255418'"
    df = dbutil.get_allresult(sql, 'df')
    dbutil.close()
    df2 = pd.read_excel(
        'C:/Users/Administrator/Desktop/test_data/test_data2.xlsx')
    df2['学者代码'] = df2['学者代码'].astype('str')
    result = pd.merge(df, df2, how='left',
                      left_on='person_id', right_on='学者代码')
    result.sort_values(by=['学者代码'], inplace=True, ignore_index=True)
    # build a Scopus author-search URL for manual checking
    # (note: '&currentPage=1' restored here; it had been mangled to '¤tPage=1'
    # by an unescaped HTML entity)
    search_url = 'https://www.scopus.com/results/authorNamesList.uri?sort=count-f&src=al' \
                 '&st1={}' \
                 '&st2={}' \
                 '&orcidId=&affilName={}' \
                 '&sot=anl&sdt=anl&sl=64&resultsPerPage=200&offset=1&jtp=false&currentPage=1&exactAuthorSearch=true'
    for i in range(len(result)):
        first_name = result.loc[i, 'name'].split(' ')[0].lower()
        last_name = result.loc[i, 'name'].split(' ')[1].lower()
        ins_name = '+'.join(result.loc[i, 'ins_en'].split(' ')).lower()
        result.loc[i, '搜索链接'] = search_url.format(first_name, last_name,
                                                  ins_name)
    result.to_excel('C:/Users/Administrator/Desktop/0609not.xlsx',
                    sheet_name='Sheet1', index=False)
def rf_cal():
    dbutil = DBUtil(host, port, database, username, password)
    sql = 'select scopus_journal_id, cite_num+0 as cite_num, cite_journal_id from scopus_cite_data ' \
          'where scopus_journal_id!=cite_journal_id'
    all_cite_data = dbutil.get_allresult(sql, 'df')
    # with open('core_journal.json', 'r', encoding='utf-8') as cj:
    #     core_journal = json.load(cj)
    core_journal = {}
    already_data = pd.read_excel('C:/Users/Administrator/Desktop/分学科结果.xlsx')
    already_data['scopus_journal_id'] = already_data[
        'scopus_journal_id'].astype('str')
    for subject, sub_df in already_data.groupby('subject'):
        core_journal[subject] = list(sub_df['scopus_journal_id'])
    rf_cited_list = []
    rf_citing_list = []
    for subject, sub_journal in core_journal.items():
        print(subject)
        # citations each journal receives from this subject's core journals
        sub_cited_df_temp = all_cite_data.loc[
            all_cite_data['cite_journal_id'].isin(sub_journal)]
        sub_cited_df = sub_cited_df_temp.groupby(
            'scopus_journal_id', as_index=False)['cite_num'].sum()
        # citations each journal gives to this subject's core journals
        sub_citing_df_temp = all_cite_data.loc[
            all_cite_data['scopus_journal_id'].isin(sub_journal)]
        sub_citing_df = sub_citing_df_temp.groupby(
            'cite_journal_id', as_index=False)['cite_num'].sum()
        sub_cited_df['subject'] = subject
        rf_cited_list.append(sub_cited_df)
        sub_citing_df['subject'] = subject
        rf_citing_list.append(sub_citing_df)
    rf_cited_df = pd.concat(rf_cited_list)
    rf_citing_df = pd.concat(rf_citing_list)
    # dbutil.df_insert('rf_cited_data', rf_cited_df)
    # dbutil.df_insert('rf_citing_data', rf_citing_df)
    select_journal = pd.read_excel(
        'C:/Users/Administrator/Desktop/未被分学科期刊.xlsx')
    select_journal['Scopus Source ID'] = select_journal[
        'Scopus Source ID'].astype('str')
    rf_cited_df = rf_cited_df.loc[rf_cited_df['scopus_journal_id'].isin(
        list(select_journal['Scopus Source ID']))]
    rf_citing_df = rf_citing_df.loc[rf_citing_df['cite_journal_id'].isin(
        list(select_journal['Scopus Source ID']))]
    dbutil.df_insert('rf_cited_data_last', rf_cited_df)
    dbutil.df_insert('rf_citing_data_last', rf_citing_df)
    dbutil.close()
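# For reference, a minimal sketch of the mapping rf_cal() builds above (and that
# core_journal.json, read by last_process() below, is assumed to hold): subject
# name -> list of Scopus journal IDs kept as strings. The subject names and IDs
# shown here are illustrative placeholders, not real data.
#
#     core_journal = {
#         'Physics and Astronomy': ['12345', '67890'],
#         'Chemistry': ['24680'],
#     }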
def deal10():
    dbutil = DBUtil(host, port, database, username, password)
    sql = "select person_id+0 as person_id from (select DISTINCT person_id from find_result UNION select " \
          "person_id from not_find) a ORDER BY person_id"
    df = dbutil.get_allresult(sql, 'df')
    df2 = pd.read_excel('C:/Users/Administrator/Desktop/0618webofscience.xlsx')
    df3 = df2.loc[~df2['person_id'].isin(list(df['person_id']))]
    df3.to_excel('C:/Users/Administrator/Desktop/data_left.xlsx',
                 sheet_name='Sheet1', index=False)
def deal8():
    df = pd.read_excel('C:/Users/Administrator/Desktop/0618webofscience.xlsx')
    dbutil = DBUtil(host, port, database, username, password)
    sql = "select distinct (person_id+0) as person_id from not_find " \
          "where person_id not in (select person_id from find_result)"
    not_find = dbutil.get_allresult(sql, 'df')
    dbutil.close()
    result = pd.merge(not_find, df, how='left', on='person_id')
    result.to_excel('C:/Users/Administrator/Desktop/未搜索到结果清单.xlsx',
                    sheet_name='Sheet1', index=False)
def data_prepare():
    dbutil = DBUtil(host, port, database, username, password)
    # total citations received by each journal
    sql = 'select scopus_journal_id, sum(cite_num+0) as cite_num_all from scopus_cite_data group by scopus_journal_id'
    df1 = dbutil.get_allresult(sql, 'df')
    # self-citations (journal citing itself)
    sql = 'SELECT scopus_journal_id, cite_num+0 as cite_num_self from scopus_cite_data ' \
          'where scopus_journal_id=cite_journal_id'
    df2 = dbutil.get_allresult(sql, 'df')
    df1 = pd.merge(df1, df2, how='outer', on='scopus_journal_id')
    df1.fillna(0, inplace=True)
    df1['cite_num_noself'] = df1['cite_num_all'] - df1['cite_num_self']
    df1.to_excel('C:/Users/Administrator/Desktop/去除自引被引数据.xlsx', index=False)
    # total citations given by each journal
    sql = 'select cite_journal_id as scopus_journal_id, sum(cite_num+0) as cite_num_all ' \
          'from scopus_cite_data group by cite_journal_id'
    df3 = dbutil.get_allresult(sql, 'df')
    df3 = pd.merge(df3, df2, how='outer', on='scopus_journal_id')
    df3.fillna(0, inplace=True)
    df3['cite_num_noself'] = df3['cite_num_all'] - df3['cite_num_self']
    df3.to_excel('C:/Users/Administrator/Desktop/去除自引引用数据.xlsx', index=False)
def temp(input_data):
    from src.config.DBUtil import DBUtil
    from src.Scopus_Crawler.scopus_config import host, port, database, username, password
    dbutil = DBUtil(host, port, database, username, password)
    sql = "SELECT person_id FROM author_info_new UNION SELECT person_id FROM mult_matched_author UNION SELECT " \
          "person_id FROM not_matched_author"
    df = dbutil.get_allresult(sql, 'df')
    dbutil.close()
    # drop records whose person_id has already been processed into any of the
    # three result tables
    input_list = [str(i['person_id']) for i in input_data]
    diff = set(input_list).difference(set(df['person_id']))
    out_data = [i for i in input_data if str(i['person_id']) in diff]
    return out_data
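# A minimal usage sketch for temp(), assuming input_data is a list of dicts that
# each carry a person_id key (the records below are illustrative, not real data):
#
#     pending = [{'person_id': '1001', 'name': 'Zhang San'},
#                {'person_id': '1002', 'name': 'Li Si'}]
#     todo = temp(pending)  # keeps only ids absent from all three tables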
def deal9():
    dbutil = DBUtil(host, port, database, username, password)
    sql = "select person_id+0 as person_id, find_num, ins from find_result"
    find_result = dbutil.get_allresult(sql, 'df')
    find_result['s_num'] = '0'
    for i in range(len(find_result)):
        # 'ins' looks like 'Institution Name (123)'; split the name and the
        # trailing count apart
        temp_list = find_result.loc[i, 'ins'][:-1].split(' (')
        find_result.loc[i, 'ins'] = temp_list[0]
        find_result.loc[i, 's_num'] = temp_list[1]
    df = pd.read_excel('C:/Users/Administrator/Desktop/0618webofscience.xlsx')
    result = pd.merge(find_result, df, on='person_id', how='left')
    result.set_index(['person_id', 'JCR期刊列表', '姓名', 'find_num', 'ins_x'],
                     inplace=True)
    result.to_excel('C:/Users/Administrator/Desktop/有搜索结果的数据.xlsx',
                    sheet_name='Sheet1')
def deal1():
    dbutil = DBUtil(host, port, database, username, password)
    sql = "select DISTINCT person_id, name, scopus_id from author_info_new where data_no='2020060819255418'"
    author_list1 = dbutil.get_allresult(sql, 'df')
    dbutil.close()
    author_list2 = pd.read_excel(
        'C:/Users/Administrator/Desktop/test_data/test_data2.xlsx',
        sheet_name='Sheet1')
    author_list2['学者代码'] = author_list2['学者代码'].astype('str')
    author_list = pd.merge(author_list1, author_list2, how='left',
                           left_on='person_id', right_on='学者代码')
    author_list.drop_duplicates(subset=['person_id'], inplace=True,
                                keep='first')
    author_list.reset_index(drop=True, inplace=True)
    # placeholder columns, filled per row below
    author_list['详细链接'] = 'a'
    author_list['百度搜索链接'] = 'b'
    author_list['搜索链接'] = 'c'
    # '&currentPage=1' restored here; it had been mangled to '¤tPage=1' by an
    # unescaped HTML entity
    search_url = 'https://www.scopus.com/results/authorNamesList.uri?sort=count-f&src=al' \
                 '&st1={}' \
                 '&st2={}' \
                 '&orcidId=&affilName={}' \
                 '&sot=anl&sdt=anl&sl=64&resultsPerPage=200&offset=1&jtp=false&currentPage=1&exactAuthorSearch=true'
    for i in range(len(author_list)):
        author_list.loc[i, '详细链接'] = \
            'https://www.scopus.com/authid/detail.uri?authorId=%s' % \
            author_list.loc[i, 'scopus_id']
        first_name = author_list.loc[i, 'name'].split(' ')[0].lower()
        last_name = author_list.loc[i, 'name'].split(' ')[1].lower()
        ins_name = '+'.join(author_list.loc[i, 'ins_en'].split(' ')).lower()
        author_list.loc[i, '搜索链接'] = search_url.format(first_name, last_name,
                                                       ins_name)
        author_list.loc[i, '百度搜索链接'] = \
            'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=%s' \
            % author_list.loc[i, '头衔当选单位'] + author_list.loc[i, '姓名']
    author_list.to_excel('C:/Users/Administrator/Desktop/check_data.xlsx',
                         index=False, encoding='utf-8')
def deal30():
    dbutil = DBUtil(host, port, database, username, password)
    sql = 'select distinct person_id from author_info_new'
    df = dbutil.get_allresult(sql, 'df')
    input_df = pd.read_excel(
        'C:/Users/Administrator/Desktop/物理学人才清单_20200908.xlsx',
        sheet_name='Sheet2')
    input_df['人才编号'] = input_df['人才编号'].astype('str')
    # keep only the people not yet present in author_info_new, then build
    # Baidu / Baidu Scholar search links for manual lookup
    input_df = input_df.loc[~input_df['人才编号'].isin(list(df['person_id']))]
    input_df['百度搜索'] = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=' + \
        input_df['当选单位信息'] + input_df['姓名']
    input_df['百度学术搜索'] = 'https://xueshu.baidu.com/usercenter/data/authorchannel?cmd=inject_page&author=' + \
        input_df['姓名'] + '&affiliate=' + input_df['当选单位信息']
    input_df.to_excel('C:/Users/Administrator/Desktop/人工查找0909.xlsx',
                      index=False)
def deal21():
    dbutil = DBUtil(host, port, database, username, password)
    sql = 'select * from wos_doc_data'
    df = dbutil.get_allresult(sql, 'df')
    df['doc_num'] = df['doc_num'].astype('int')
    df2 = pd.read_excel(
        'C:/Users/Administrator/Desktop/cssc-category-mapping.xlsx',
        sheet_name='Sheet1')
    # left-pad ids back to four digits (leading zeros are lost when Excel
    # stores them as numbers); zfill replaces the original single-'0' prepend,
    # which under-padded ids shorter than three digits
    df2['id'] = df2['id'].astype('str').str.zfill(4)
    df = pd.merge(df, df2, on='category_id')
    result = df.groupby(by=['orgName', 'year', 'id', 'Description'],
                        as_index=False)['doc_num'].sum()
    dbutil.df_insert('doc_data', result)
def deal5():
    dbutil = DBUtil(host, port, database, username, password)
    sql = "select DISTINCT person_id from mult_matched_author where data_no='2020052716115197'"
    df = dbutil.get_allresult(sql, 'df')
    dbutil.close()
    df2 = pd.read_excel(
        'C:/Users/Administrator/Desktop/test_data/test_data2.xlsx')
    df2['学者代码'] = df2['学者代码'].astype('str')
    result = pd.merge(df, df2, how='left', left_on='person_id',
                      right_on='学者代码')
    result.drop_duplicates(subset=['person_id'], inplace=True,
                           ignore_index=True)
    result.to_excel('C:/Users/Administrator/Desktop/0605.xlsx',
                    sheet_name='Sheet1', index=False)
def last_process():
    dbutil = DBUtil(host, port, database, username, password)
    # sql = 'select scopus_journal_id, cite_num as rf_cited_num, subject from rf_cited_data'
    # rf_cited_data = dbutil.get_allresult(sql, 'df')
    #
    # sql = 'select cite_journal_id as scopus_journal_id, cite_num as rf_citing_num, subject from rf_citing_data'
    # rf_citing_data = dbutil.get_allresult(sql, 'df')
    sql = 'select scopus_journal_id, cite_num as rf_cited_num, subject from rf_cited_data_last'
    rf_cited_data = dbutil.get_allresult(sql, 'df')
    sql = 'select cite_journal_id as scopus_journal_id, cite_num as rf_citing_num, subject from rf_citing_data_last'
    rf_citing_data = dbutil.get_allresult(sql, 'df')
    sql = 'select * from cited_data'
    sum_cited_data = dbutil.get_allresult(sql, 'df')
    sql = 'select * from citing_data'
    sum_citing_data = dbutil.get_allresult(sql, 'df')
    # rf_cited_value: share of a journal's total received citations that come
    # from a subject's core journals; rf_cited_percent normalizes that share
    # across all subjects for the journal
    rf_cited_data = pd.merge(rf_cited_data, sum_cited_data, how='left',
                             on='scopus_journal_id')
    rf_cited_data['rf_cited_value'] = rf_cited_data[
        'rf_cited_num'] / rf_cited_data['cited_num']
    rf_cited_sum = rf_cited_data.groupby(
        'scopus_journal_id', as_index=False)['rf_cited_value'].sum()
    rf_cited_sum.rename(columns={'rf_cited_value': 'rf_cited_sum'},
                        inplace=True)
    rf_cited_data = pd.merge(rf_cited_data, rf_cited_sum,
                             on='scopus_journal_id')
    rf_cited_data['rf_cited_percent'] = rf_cited_data[
        'rf_cited_value'] / rf_cited_data['rf_cited_sum']
    # same construction on the citing side
    rf_citing_data = pd.merge(rf_citing_data, sum_citing_data, how='left',
                              on='scopus_journal_id')
    rf_citing_data['rf_citing_value'] = rf_citing_data[
        'rf_citing_num'] / rf_citing_data['citing_num']
    rf_citing_sum = rf_citing_data.groupby(
        'scopus_journal_id', as_index=False)['rf_citing_value'].sum()
    rf_citing_sum.rename(columns={'rf_citing_value': 'rf_citing_sum'},
                         inplace=True)
    rf_citing_data = pd.merge(rf_citing_data, rf_citing_sum,
                              on='scopus_journal_id')
    rf_citing_data['rf_citing_percent'] = rf_citing_data[
        'rf_citing_value'] / rf_citing_data['rf_citing_sum']
    rf_data = pd.merge(rf_cited_data, rf_citing_data, how='outer',
                       on=['scopus_journal_id', 'subject'])
    rf_data.drop(columns=['cited_num', 'citing_num'], inplace=True)
    rf_data = pd.merge(rf_data, sum_cited_data, how='left',
                       on='scopus_journal_id')
    rf_data = pd.merge(rf_data, sum_citing_data, how='left',
                       on='scopus_journal_id')
    rf_data.fillna(0, inplace=True)
    rf_data['scopus_journal_id'] = rf_data['scopus_journal_id'].astype('int64')
    journal_name_data = pd.read_excel(
        'C:/Users/Administrator/Desktop/Journal Citation Score 2019带一级学科信息20200726.xlsx'
    )
    journal_name_data.drop_duplicates(subset=['Scopus Source ID'],
                                      inplace=True, ignore_index=True)
    with open('core_journal.json', 'r', encoding='utf-8') as cj:
        core_journal = json.load(cj)
    journal_name_data_temp = journal_name_data.loc[:, [
        'Scopus Source ID', 'Scholarly Output'
    ]]
    journal_name_data_temp['Scopus Source ID'] = journal_name_data_temp[
        'Scopus Source ID'].astype('str')
    # total article output of each subject's core journals
    core_journal_article_num = {}
    for subject, value in core_journal.items():
        core_journal_article_num[subject] = sum(
            journal_name_data_temp[journal_name_data_temp['Scopus Source ID'].
                                   isin(value)]['Scholarly Output'])
    rf_data['cited_citing_sum'] = rf_data['citing_num'] + rf_data['cited_num']
    rf_data['cited_citing_percent_sum'] = rf_data[
        'rf_cited_percent'] + rf_data['rf_citing_percent']
    # attach Chinese journal names
    journal_name_zh = pd.read_excel(
        'C:/Users/Administrator/Desktop/期刊翻译结果.xlsx')
    rf_data = pd.merge(rf_data, journal_name_zh, how='left',
                       on='scopus_journal_id')
    # rf_data.to_excel('C:/Users/Administrator/Desktop/rf_data.xlsx', index=False)
    rf_data.to_excel('C:/Users/Administrator/Desktop/rf_data_last.xlsx',
                     index=False)
    rf_data_result = rf_data
    core_journal_num_dict = {}
    for subject, value in core_journal.items():
        core_journal_num_dict[subject] = len(value)
    rf_data_result['core_journal数量'] = rf_data_result['subject']
    rf_data_result['core_journal数量'].replace(core_journal_num_dict,
                                             inplace=True)
    rf_data_result['core_journal数量'] = rf_data_result['core_journal数量'].astype(
        'int64')
    rf_data_result['core_journal发文数量'] = rf_data_result['subject']
    rf_data_result['core_journal发文数量'].replace(core_journal_article_num,
                                               inplace=True)
    rf_data_result['core_journal发文数量'] = rf_data_result[
        'core_journal发文数量'].astype('int64')
    rf_data_result1 = copy.deepcopy(rf_data_result)
    rf_data_result2 = copy.deepcopy(rf_data_result)
    # per-subject results (raw); merge in the core_journal subject assignments
    select_core_journal = pd.read_excel(
        'C:/Users/Administrator/Desktop/core_journal-new.xlsx')
    select_core_journal = select_core_journal.loc[
        select_core_journal['Core journal'].notnull(),
        ['scopus_journal_id', 'Core journal']]
    rf_data_result2 = pd.merge(rf_data_result2, select_core_journal,
                               how='left', on='scopus_journal_id')
    # keep a journal in a subject if at least 40% of its citation traffic
    # involves that subject's core journals, or it is itself a core journal
    rf_data_result2 = rf_data_result2.loc[
        (rf_data_result2['cited_citing_percent_sum'] >= 0.4) |
        (rf_data_result2['subject'] == rf_data_result2['Core journal'])]
    # rf_data_result2.to_excel('C:/Users/Administrator/Desktop/分学科结果.xlsx', index=False)
    rf_data_result2.to_excel('C:/Users/Administrator/Desktop/分学科结果last.xlsx',
                             index=False)
            element = h_index.find(class_='fontLarge')
            if element:
                result_list.append(
                    [input_data[i][0], input_data[i][1], element.text])
        # loop finished without errors
        count = len(input_data)
    # on error, record the failing index so the crawl can resume from there
    except Exception as err:
        print('ERROR:%s' % err)
        count = i
    # write whatever has been collected so far
    result_df = pd.DataFrame(data=result_list,
                             columns=['person_id', 'scopus_id', 'h_index'])
    write2sql([['h_index', result_df]])


if __name__ == '__main__':
    # for standalone testing
    dbutil = DBUtil(host, port, database, username, password)
    sql = 'select distinct person_id, scopus_id from author_info_new where ' \
          'person_id not in (select person_id from h_index)'
    author_id_df = dbutil.get_allresult(sql, 'df')
    dbutil.close()
    input_data = author_id_df.values.tolist()
    crawl_h_index(input_data)
    print('****** END ******')
# dbutil = DBUtil(host, port, database, username, password)
# sql = 'select * from wos_doc_data_copy'
# data = dbutil.get_allresult(sql, 'df')
# print('source data loaded')
# df = pd.read_excel('C:/Users/Administrator/Desktop/校名映射表.xlsx')
# data = pd.merge(data, df, how='left', on='orgName')
#
# print('merge done')
# print('writing to database')
# write2sql([['wos_doc_data0930', data]])
# print('database write done')
dbutil = DBUtil(host, port, database, username, password)
sql = 'select * from wos_doc_data_detail'
data = dbutil.get_allresult(sql, 'df')
print('source data loaded')
df = pd.read_excel('C:/Users/Administrator/Desktop/校名映射表.xlsx')
data = pd.merge(data, df, how='left', on='orgName')
# split into 500k-row chunks so each database write stays manageable
data1 = data.loc[0:499999]
data2 = data.loc[500000:999999]
data3 = data.loc[1000000:1499999]
data4 = data.loc[1500000:1999999]
data5 = data.loc[2000000:2499999]
data6 = data.loc[2500000:2999999]
data7 = data.loc[3000000:]
data_list = [data1, data2, data3, data4, data5, data6, data7]
i = 0
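# The script is truncated here. A minimal sketch of the presumed write loop,
# assuming write2sql takes [[table_name, dataframe], ...] pairs as it does
# elsewhere in this file; the target table name is a hypothetical placeholder:
#
#     for chunk in data_list:
#         i += 1
#         print('writing chunk %d of %d' % (i, len(data_list)))
#         write2sql([['wos_doc_data_detail_mapped', chunk]])  # hypothetical table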
            })
            cite_df_temp['cite_num'] = 1
            cite_journal_data = cite_df_temp.groupby(
                by=['cite_journal', 'cite_journal_id'], as_index=False).sum()
            cite_journal_data['scopus_journal_id'] = input_data[i][0]
            result_df_list.append(cite_journal_data)
        count = len(input_data)
    # on error, record the failing index so the crawl can resume from there
    except Exception as err:
        print('ERROR:%s' % err)
        count = i
    if result_df_list:
        all_data = pd.concat(result_df_list)
        write2sql([['scopus_cite_data0810', all_data]])


if __name__ == '__main__':
    dbutil = DBUtil(host, port, database, username, password)
    sql = 'select scopus_journal_id from scopus_journal_id where scopus_journal_id not ' \
          'in (select distinct scopus_journal_id from scopus_cite_data0810) and cite_num!="0"'
    journal_id_df = dbutil.get_allresult(sql, 'df')
    dbutil.close()
    input_data = journal_id_df.values.tolist()
    # input_data = input_data[:5]
    get_cite_data(input_data)
#! /usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd

from src.config.DBUtil import DBUtil
from src.Scopus_Crawler.scopus_config import host, port, database, username, password

# deduplicate wos_article_data on the UT (Web of Science accession number)
# field and write the result to a new table
dbutil = DBUtil(host, port, database, username, password)
sql = 'select * from wos_article_data'
already_df = dbutil.get_allresult(sql, 'df')
already_df.drop_duplicates(subset=['UT'], ignore_index=True, inplace=True)
dbutil.df_insert('wos_article_data_final', already_df)
dbutil.close()
            data_df['orgName'] = input_data[i][0]
            result_df_list.append(data_df)
        count = len(input_data)
        session.close()
    # on error, record the failing index so the crawl can resume from there
    except Exception as err:
        print('ERROR:%s' % err)
        session.close()
        count = i
    if result_df_list:
        all_data_df = pd.concat(result_df_list)
        write2sql([['wos_doc_data_detail', all_data_df]])


if __name__ == '__main__':
    dbutil = DBUtil(host, port, database, username, password)
    sql = 'select orgName, `key`, year, category_id, category_name from wos_doc_data_copy'
    input_df = dbutil.get_allresult(sql, 'df')
    input_df.drop_duplicates(
        subset=['orgName', 'key', 'category_id', 'category_name'],
        inplace=True)
    dbutil.close()
    input_data = input_df.values.tolist()
    # process in batches of 5,000 organizations per run
    for rdd in range(9):
        get_doc_data(input_data[5000 * rdd:5000 * rdd + 5000])