def write2sql(rows): ''' :param rows: [[表名, df名称], [...]] :return: ''' dbutil = DBUtil(host, port, database, username, password) for row in rows: dbutil.df_insert(row[0], row[1]) dbutil.close()
def rf_cal(): dbutil = DBUtil(host, port, database, username, password) sql = 'select scopus_journal_id, cite_num+0 as cite_num, cite_journal_id from scopus_cite_data ' \ 'where scopus_journal_id!=cite_journal_id' all_cite_data = dbutil.get_allresult(sql, 'df') # with open('core_journal.json', 'r', encoding='utf-8') as cj: # core_journal = json.load(cj) core_journal = {} already_data = pd.read_excel('C:/Users/Administrator/Desktop/分学科结果.xlsx') already_data['scopus_journal_id'] = already_data[ 'scopus_journal_id'].astype('str') for subject, sub_df in already_data.groupby('subject'): core_journal[subject] = list(sub_df['scopus_journal_id']) rf_cited_list = [] rf_citing_list = [] for subject, sub_journal in core_journal.items(): print(subject) sub_cited_df_temp = all_cite_data.loc[ all_cite_data['cite_journal_id'].isin(sub_journal)] sub_cited_df = sub_cited_df_temp.groupby( 'scopus_journal_id', as_index=False)['cite_num'].sum() sub_citing_df_temp = all_cite_data.loc[ all_cite_data['scopus_journal_id'].isin(sub_journal)] sub_citing_df = sub_citing_df_temp.groupby( 'cite_journal_id', as_index=False)['cite_num'].sum() sub_cited_df['subject'] = subject rf_cited_list.append(sub_cited_df) sub_citing_df['subject'] = subject rf_citing_list.append(sub_citing_df) rf_cited_df = pd.concat(rf_cited_list) rf_citing_df = pd.concat(rf_citing_list) # dbutil.df_insert('rf_cited_data', rf_cited_df) # dbutil.df_insert('rf_citing_data', rf_citing_df) select_journal = pd.read_excel( 'C:/Users/Administrator/Desktop/未被分学科期刊.xlsx') select_journal['Scopus Source ID'] = select_journal[ 'Scopus Source ID'].astype('str') rf_cited_df = rf_cited_df.loc[rf_cited_df['scopus_journal_id'].isin( list(select_journal['Scopus Source ID']))] rf_citing_df = rf_citing_df.loc[rf_citing_df['cite_journal_id'].isin( list(select_journal['Scopus Source ID']))] dbutil.df_insert('rf_cited_data_last', rf_cited_df) dbutil.df_insert('rf_citing_data_last', rf_citing_df) dbutil.close()
def deal31(): dbutil = DBUtil(host, port, database, username, password) sql = 'select * from author_info_new' author_info_new0910 = dbutil.get_allresult(sql, 'df') sql = 'select * from h_index' h_index0910 = dbutil.get_allresult(sql, 'df') sql = 'select * from article_cite_data' article_cite_data0910 = dbutil.get_allresult(sql, 'df') sql = 'select scopus_id, person_id, publish_year, count(*) as first_au_doc_num from scopus_author_article ' \ 'group by scopus_id, person_id, publish_year' scopus_author_article0910 = dbutil.get_allresult(sql, 'df') physics_list = pd.read_excel( 'C:/Users/Administrator/Desktop/物理学人才清单_20200908.xlsx', sheet_name='data') physics_list['人才编号'] = physics_list['人才编号'].astype('str') author_info_new0910 = author_info_new0910.loc[ author_info_new0910['person_id'].isin(list(physics_list['人才编号']))] h_index0910 = h_index0910.loc[h_index0910['person_id'].isin( list(physics_list['人才编号']))] article_cite_data0910 = article_cite_data0910.loc[ article_cite_data0910['person_id'].isin(list(physics_list['人才编号']))] scopus_author_article0910 = scopus_author_article0910.loc[ scopus_author_article0910['person_id'].isin(list( physics_list['人才编号']))] dbutil.df_insert('author_info_new0910', author_info_new0910) dbutil.df_insert('h_index0910', h_index0910) dbutil.df_insert('article_cite_data0910', article_cite_data0910) dbutil.df_insert('scopus_author_article0910', scopus_author_article0910)
def deal21(): dbutil = DBUtil(host, port, database, username, password) sql = 'select * from wos_doc_data' df = dbutil.get_allresult(sql, 'df') df['doc_num'] = df['doc_num'].astype('int') df2 = pd.read_excel( 'C:/Users/Administrator/Desktop/cssc-category-mapping.xlsx', sheet_name='Sheet1') df2['id'] = df2['id'].astype('str') for i in range(len(df2)): if len(df2.loc[i, 'id']) < 4: df2.loc[i, 'id'] = '0' + df2.loc[i, 'id'] df = pd.merge(df, df2, on='category_id') result = df.groupby(by=['orgName', 'year', 'id', 'Description'], as_index=False)['doc_num'].sum() dbutil.df_insert('doc_data', result)
#! /usr/bin/python # -*- coding: utf-8 -*- import pandas as pd from src.config.DBUtil import DBUtil from src.Scopus_Crawler.scopus_config import host, port, database, username, password dbutil = DBUtil(host, port, database, username, password) sql = 'select * from wos_article_data' already_df = dbutil.get_allresult(sql, 'df') already_df.drop_duplicates(subset=['UT'], ignore_index=True, inplace=True) dbutil.df_insert('wos_article_data_final', already_df) dbutil.close()