예제 #1
0
def write2sql(rows):
    '''

    :param rows: [[表名, df名称], [...]]
    :return:
    '''
    dbutil = DBUtil(host, port, database, username, password)
    for row in rows:
        dbutil.df_insert(row[0], row[1])

    dbutil.close()
def rf_cal():
    dbutil = DBUtil(host, port, database, username, password)
    sql = 'select scopus_journal_id, cite_num+0 as cite_num, cite_journal_id from scopus_cite_data ' \
          'where scopus_journal_id!=cite_journal_id'
    all_cite_data = dbutil.get_allresult(sql, 'df')

    # with open('core_journal.json', 'r', encoding='utf-8') as cj:
    #     core_journal = json.load(cj)

    core_journal = {}
    already_data = pd.read_excel('C:/Users/Administrator/Desktop/分学科结果.xlsx')
    already_data['scopus_journal_id'] = already_data[
        'scopus_journal_id'].astype('str')
    for subject, sub_df in already_data.groupby('subject'):
        core_journal[subject] = list(sub_df['scopus_journal_id'])

    rf_cited_list = []
    rf_citing_list = []
    for subject, sub_journal in core_journal.items():
        print(subject)
        sub_cited_df_temp = all_cite_data.loc[
            all_cite_data['cite_journal_id'].isin(sub_journal)]
        sub_cited_df = sub_cited_df_temp.groupby(
            'scopus_journal_id', as_index=False)['cite_num'].sum()
        sub_citing_df_temp = all_cite_data.loc[
            all_cite_data['scopus_journal_id'].isin(sub_journal)]
        sub_citing_df = sub_citing_df_temp.groupby(
            'cite_journal_id', as_index=False)['cite_num'].sum()

        sub_cited_df['subject'] = subject
        rf_cited_list.append(sub_cited_df)

        sub_citing_df['subject'] = subject
        rf_citing_list.append(sub_citing_df)

    rf_cited_df = pd.concat(rf_cited_list)
    rf_citing_df = pd.concat(rf_citing_list)

    # dbutil.df_insert('rf_cited_data', rf_cited_df)
    # dbutil.df_insert('rf_citing_data', rf_citing_df)

    select_journal = pd.read_excel(
        'C:/Users/Administrator/Desktop/未被分学科期刊.xlsx')
    select_journal['Scopus Source ID'] = select_journal[
        'Scopus Source ID'].astype('str')
    rf_cited_df = rf_cited_df.loc[rf_cited_df['scopus_journal_id'].isin(
        list(select_journal['Scopus Source ID']))]
    rf_citing_df = rf_citing_df.loc[rf_citing_df['cite_journal_id'].isin(
        list(select_journal['Scopus Source ID']))]

    dbutil.df_insert('rf_cited_data_last', rf_cited_df)
    dbutil.df_insert('rf_citing_data_last', rf_citing_df)

    dbutil.close()
def deal31():
    dbutil = DBUtil(host, port, database, username, password)

    sql = 'select * from author_info_new'
    author_info_new0910 = dbutil.get_allresult(sql, 'df')

    sql = 'select * from h_index'
    h_index0910 = dbutil.get_allresult(sql, 'df')

    sql = 'select * from article_cite_data'
    article_cite_data0910 = dbutil.get_allresult(sql, 'df')

    sql = 'select scopus_id, person_id, publish_year, count(*) as first_au_doc_num from scopus_author_article ' \
          'group by scopus_id, person_id, publish_year'
    scopus_author_article0910 = dbutil.get_allresult(sql, 'df')

    physics_list = pd.read_excel(
        'C:/Users/Administrator/Desktop/物理学人才清单_20200908.xlsx',
        sheet_name='data')
    physics_list['人才编号'] = physics_list['人才编号'].astype('str')

    author_info_new0910 = author_info_new0910.loc[
        author_info_new0910['person_id'].isin(list(physics_list['人才编号']))]
    h_index0910 = h_index0910.loc[h_index0910['person_id'].isin(
        list(physics_list['人才编号']))]
    article_cite_data0910 = article_cite_data0910.loc[
        article_cite_data0910['person_id'].isin(list(physics_list['人才编号']))]
    scopus_author_article0910 = scopus_author_article0910.loc[
        scopus_author_article0910['person_id'].isin(list(
            physics_list['人才编号']))]

    dbutil.df_insert('author_info_new0910', author_info_new0910)
    dbutil.df_insert('h_index0910', h_index0910)
    dbutil.df_insert('article_cite_data0910', article_cite_data0910)
    dbutil.df_insert('scopus_author_article0910', scopus_author_article0910)
def deal21():
    dbutil = DBUtil(host, port, database, username, password)
    sql = 'select * from wos_doc_data'
    df = dbutil.get_allresult(sql, 'df')
    df['doc_num'] = df['doc_num'].astype('int')

    df2 = pd.read_excel(
        'C:/Users/Administrator/Desktop/cssc-category-mapping.xlsx',
        sheet_name='Sheet1')
    df2['id'] = df2['id'].astype('str')
    for i in range(len(df2)):
        if len(df2.loc[i, 'id']) < 4:
            df2.loc[i, 'id'] = '0' + df2.loc[i, 'id']

    df = pd.merge(df, df2, on='category_id')
    result = df.groupby(by=['orgName', 'year', 'id', 'Description'],
                        as_index=False)['doc_num'].sum()
    dbutil.df_insert('doc_data', result)
#! /usr/bin/python
# -*- coding: utf-8 -*-

import pandas as pd

from src.config.DBUtil import DBUtil
from src.Scopus_Crawler.scopus_config import host, port, database, username, password

dbutil = DBUtil(host, port, database, username, password)
sql = 'select * from wos_article_data'
already_df = dbutil.get_allresult(sql, 'df')

already_df.drop_duplicates(subset=['UT'], ignore_index=True, inplace=True)

dbutil.df_insert('wos_article_data_final', already_df)
dbutil.close()