def deal6():
    """Export not-matched authors (data_no 2020060819255418) joined with the
    scholar Excel sheet, adding a Scopus author-name search URL per row.

    Side effects: reads MySQL and an Excel file; writes 0609not.xlsx.
    """
    dbutil = DBUtil(host, port, database, username, password)
    sql = "select person_id, name from not_matched_author where data_no='2020060819255418'"
    df = dbutil.get_allresult(sql, 'df')
    dbutil.close()
    df2 = pd.read_excel(
        'C:/Users/Administrator/Desktop/test_data/test_data2.xlsx')
    # person_id arrives as str from the DB; align dtypes before merging.
    df2['学者代码'] = df2['学者代码'].astype('str')
    result = pd.merge(df, df2, how='left', left_on='person_id',
                      right_on='学者代码')
    result.sort_values(by=['学者代码'], inplace=True, ignore_index=True)
    # BUG FIX: the URL contained the mojibake '¤tPage=1' — '&curren' had been
    # decoded as the HTML entity '&curren;' (U+00A4). Restore '&currentPage=1'.
    search_url = 'https://www.scopus.com/results/authorNamesList.uri?sort=count-f&src=al' \
                 '&st1={}' \
                 '&st2={}' \
                 '&orcidId=&affilName={}' \
                 '&sot=anl&sdt=anl&sl=64&resultsPerPage=200&offset=1&jtp=false&currentPage=1&exactAuthorSearch=true'
    for i in range(len(result)):
        # NOTE(review): assumes 'name' is exactly 'first last'; a single-token
        # name would raise IndexError — confirm upstream format.
        first_name = result.loc[i, 'name'].split(' ')[0].lower()
        last_name = result.loc[i, 'name'].split(' ')[1].lower()
        ins_name = '+'.join(result.loc[i, 'ins_en'].split(' ')).lower()
        result.loc[i, '搜索链接'] = search_url.format(first_name, last_name,
                                                  ins_name)
    result.to_excel('C:/Users/Administrator/Desktop/0609not.xlsx',
                    sheet_name='Sheet1', index=False)
def write2sql(rows):
    """Insert each DataFrame into its target table over one connection.

    :param rows: [[table_name, dataframe], [...]]
    :return: None
    """
    dbutil = DBUtil(host, port, database, username, password)
    for entry in rows:
        table_name, frame = entry[0], entry[1]
        dbutil.df_insert(table_name, frame)
    dbutil.close()
def rf_cal():
    """Build per-subject cited/citing citation-sum tables from
    scopus_cite_data, restrict them to journals listed in the
    "not yet classified" Excel sheet, and persist the results.

    Side effects: reads MySQL and two Excel files; inserts into
    rf_cited_data_last and rf_citing_data_last.
    """
    dbutil = DBUtil(host, port, database, username, password)
    # cite_num+0 coerces the column to numeric on the MySQL side; rows where a
    # journal cites itself are excluded.
    sql = 'select scopus_journal_id, cite_num+0 as cite_num, cite_journal_id from scopus_cite_data ' \
          'where scopus_journal_id!=cite_journal_id'
    all_cite_data = dbutil.get_allresult(sql, 'df')
    # with open('core_journal.json', 'r', encoding='utf-8') as cj:
    #     core_journal = json.load(cj)
    core_journal = {}
    already_data = pd.read_excel('C:/Users/Administrator/Desktop/分学科结果.xlsx')
    already_data['scopus_journal_id'] = already_data[
        'scopus_journal_id'].astype('str')
    # subject -> list of journal ids already classified under that subject
    for subject, sub_df in already_data.groupby('subject'):
        core_journal[subject] = list(sub_df['scopus_journal_id'])
    rf_cited_list = []
    rf_citing_list = []
    for subject, sub_journal in core_journal.items():
        print(subject)
        # Rows whose cite_journal_id belongs to this subject's core set,
        # with cite_num summed per scopus_journal_id.
        sub_cited_df_temp = all_cite_data.loc[
            all_cite_data['cite_journal_id'].isin(sub_journal)]
        sub_cited_df = sub_cited_df_temp.groupby(
            'scopus_journal_id', as_index=False)['cite_num'].sum()
        # Mirror image: rows whose scopus_journal_id belongs to the core set,
        # with cite_num summed per cite_journal_id.
        sub_citing_df_temp = all_cite_data.loc[
            all_cite_data['scopus_journal_id'].isin(sub_journal)]
        sub_citing_df = sub_citing_df_temp.groupby(
            'cite_journal_id', as_index=False)['cite_num'].sum()
        # Tag each partial result with its subject before stacking.
        sub_cited_df['subject'] = subject
        rf_cited_list.append(sub_cited_df)
        sub_citing_df['subject'] = subject
        rf_citing_list.append(sub_citing_df)
    rf_cited_df = pd.concat(rf_cited_list)
    rf_citing_df = pd.concat(rf_citing_list)
    # dbutil.df_insert('rf_cited_data', rf_cited_df)
    # dbutil.df_insert('rf_citing_data', rf_citing_df)
    # Keep only journals present in the "not yet classified" sheet.
    select_journal = pd.read_excel(
        'C:/Users/Administrator/Desktop/未被分学科期刊.xlsx')
    select_journal['Scopus Source ID'] = select_journal[
        'Scopus Source ID'].astype('str')
    rf_cited_df = rf_cited_df.loc[rf_cited_df['scopus_journal_id'].isin(
        list(select_journal['Scopus Source ID']))]
    rf_citing_df = rf_citing_df.loc[rf_citing_df['cite_journal_id'].isin(
        list(select_journal['Scopus Source ID']))]
    dbutil.df_insert('rf_cited_data_last', rf_cited_df)
    dbutil.df_insert('rf_citing_data_last', rf_citing_df)
    dbutil.close()
def deal8():
    """Export person_ids that were searched but never found, left-joined
    against the Web of Science Excel sheet.

    Side effects: reads MySQL and an Excel file; writes 未搜索到结果清单.xlsx.
    """
    wos_df = pd.read_excel('C:/Users/Administrator/Desktop/0618webofscience.xlsx')
    dbutil = DBUtil(host, port, database, username, password)
    sql = ("select distinct (person_id+0) as person_id from not_find "
           "where person_id not in (select person_id from find_result)")
    missing = dbutil.get_allresult(sql, 'df')
    dbutil.close()
    merged = pd.merge(missing, wos_df, how='left', on='person_id')
    merged.to_excel('C:/Users/Administrator/Desktop/未搜索到结果清单.xlsx',
                    sheet_name='Sheet1', index=False)
def temp(input_data):
    """Filter *input_data* down to records whose person_id does not yet
    appear in any of the three author tables.

    :param input_data: list of dicts, each carrying a 'person_id' key
    :return: the unprocessed subset of input_data, original order preserved
    """
    from src.config.DBUtil import DBUtil
    from src.Scopus_Crawler.scopus_config import host, port, database, username, password
    dbutil = DBUtil(host, port, database, username, password)
    sql = "SELECT person_id FROM author_info_new UNION SELECT person_id FROM mult_matched_author UNION SELECT " \
          "person_id FROM not_matched_author"
    known_df = dbutil.get_allresult(sql, 'df')
    dbutil.close()
    # Compare as strings on both sides, matching the DB's string ids.
    known_ids = set(known_df['person_id'])
    return [record for record in input_data
            if str(record['person_id']) not in known_ids]
def deal1():
    """Build a manual-check spreadsheet for matched authors: a Scopus detail
    link, a Scopus name-search link and a Baidu search link per scholar.

    Side effects: reads MySQL and an Excel file; writes check_data.xlsx.
    """
    dbutil = DBUtil(host, port, database, username, password)
    sql = "select DISTINCT person_id, name, scopus_id from author_info_new where data_no='2020060819255418'"
    author_list1 = dbutil.get_allresult(sql, 'df')
    dbutil.close()
    author_list2 = pd.read_excel(
        'C:/Users/Administrator/Desktop/test_data/test_data2.xlsx',
        sheet_name='Sheet1')
    # person_id arrives as str from the DB; align dtypes before merging.
    author_list2['学者代码'] = author_list2['学者代码'].astype('str')
    author_list = pd.merge(author_list1, author_list2, how='left',
                           left_on='person_id', right_on='学者代码')
    author_list.drop_duplicates(subset=['person_id'], inplace=True,
                                keep='first')
    author_list.reset_index(drop=True, inplace=True)
    # Placeholder columns, filled row-by-row below.
    author_list['详细链接'] = 'a'
    author_list['百度搜索链接'] = 'b'
    author_list['搜索链接'] = 'c'
    # BUG FIX: the URL contained the mojibake '¤tPage=1' — '&curren' had been
    # decoded as the HTML entity '&curren;' (U+00A4). Restore '&currentPage=1'.
    search_url = 'https://www.scopus.com/results/authorNamesList.uri?sort=count-f&src=al' \
                 '&st1={}' \
                 '&st2={}' \
                 '&orcidId=&affilName={}' \
                 '&sot=anl&sdt=anl&sl=64&resultsPerPage=200&offset=1&jtp=false&currentPage=1&exactAuthorSearch=true'
    for i in range(len(author_list)):
        author_list.loc[i, '详细链接'] = \
            'https://www.scopus.com/authid/detail.uri?authorId=%s' \
            % author_list.loc[i, 'scopus_id']
        # NOTE(review): assumes 'name' is exactly 'first last'; a single-token
        # name would raise IndexError — confirm upstream format.
        first_name = author_list.loc[i, 'name'].split(' ')[0].lower()
        last_name = author_list.loc[i, 'name'].split(' ')[1].lower()
        ins_name = '+'.join(author_list.loc[i, 'ins_en'].split(' ')).lower()
        author_list.loc[i, '搜索链接'] = search_url.format(first_name,
                                                       last_name, ins_name)
        # '%' binds before '+': the institution fills wd=%s, then the person's
        # name is appended directly after it in the query string.
        author_list.loc[i, '百度搜索链接'] = \
            'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=%s' \
            % author_list.loc[i, '头衔当选单位'] + author_list.loc[i, '姓名']
    # NOTE(review): the 'encoding' kwarg of to_excel was removed in pandas 1.5+;
    # drop it if the environment's pandas is upgraded.
    author_list.to_excel('C:/Users/Administrator/Desktop/check_data.xlsx',
                         index=False, encoding='utf-8')
def deal5():
    """Export distinct multiply-matched person_ids (data_no 2020052716115197)
    joined with the scholar Excel sheet.

    Side effects: reads MySQL and an Excel file; writes 0605.xlsx.
    """
    dbutil = DBUtil(host, port, database, username, password)
    sql = "select DISTINCT person_id from mult_matched_author where data_no='2020052716115197'"
    matched = dbutil.get_allresult(sql, 'df')
    dbutil.close()
    scholars = pd.read_excel(
        'C:/Users/Administrator/Desktop/test_data/test_data2.xlsx')
    # person_id arrives as str from the DB; align dtypes before merging.
    scholars['学者代码'] = scholars['学者代码'].astype('str')
    merged = pd.merge(matched, scholars, how='left',
                      left_on='person_id', right_on='学者代码')
    merged.drop_duplicates(subset=['person_id'], inplace=True,
                           ignore_index=True)
    merged.to_excel('C:/Users/Administrator/Desktop/0605.xlsx',
                    sheet_name='Sheet1', index=False)
#! /usr/bin/python
# -*- coding: utf-8 -*-
import pandas as pd

from src.config.DBUtil import DBUtil
from src.Scopus_Crawler.scopus_config import host, port, database, username, password

# De-duplicate wos_article_data on the 'UT' column and persist the cleaned
# rows into wos_article_data_final.
dbutil = DBUtil(host, port, database, username, password)
articles = dbutil.get_allresult('select * from wos_article_data', 'df')
articles.drop_duplicates(subset=['UT'], ignore_index=True, inplace=True)
dbutil.df_insert('wos_article_data_final', articles)
dbutil.close()