def deal6():
    """Export unmatched authors for batch 2020060819255418, joined with the
    scholar roster spreadsheet, adding a Scopus author-search URL per row.

    Side effects: queries the database, reads test_data2.xlsx and writes
    C:/Users/Administrator/Desktop/0609not.xlsx. Returns None.
    """
    dbutil = DBUtil(host, port, database, username, password)
    sql = "select person_id, name from not_matched_author where data_no='2020060819255418'"
    try:
        df = dbutil.get_allresult(sql, 'df')
    finally:
        # Release the connection even if the query raises.
        dbutil.close()

    df2 = pd.read_excel(
        'C:/Users/Administrator/Desktop/test_data/test_data2.xlsx')

    # person_id comes back as text; align the roster key dtype before merging.
    df2['学者代码'] = df2['学者代码'].astype('str')
    result = pd.merge(df,
                      df2,
                      how='left',
                      left_on='person_id',
                      right_on='学者代码')
    result.sort_values(by=['学者代码'], inplace=True, ignore_index=True)

    search_url = 'https://www.scopus.com/results/authorNamesList.uri?sort=count-f&src=al' \
                 '&st1={}' \
                 '&st2={}' \
                 '&orcidId=&affilName={}' \
                 '&sot=anl&sdt=anl&sl=64&resultsPerPage=200&offset=1&jtp=false&currentPage=1&exactAuthorSearch=true'

    for i in range(len(result)):
        # Guard: a single-token name previously raised IndexError on [1].
        name_parts = str(result.loc[i, 'name']).split(' ')
        first_name = name_parts[0].lower()
        last_name = name_parts[1].lower() if len(name_parts) > 1 else ''
        # ins_en may be NaN after the left merge; treat missing as empty.
        ins_raw = result.loc[i, 'ins_en']
        ins_name = ('+'.join(str(ins_raw).split(' ')).lower()
                    if pd.notna(ins_raw) else '')
        result.loc[i, '搜索链接'] = search_url.format(first_name, last_name,
                                                  ins_name)

    result.to_excel('C:/Users/Administrator/Desktop/0609not.xlsx',
                    sheet_name='Sheet1',
                    index=False)
# 예제 #2  (example separator left over from the scraping source; commented
# out so the file parses as Python)
# 0
def write2sql(rows):
    """Persist a batch of DataFrames to the database.

    :param rows: list of pairs ``[[table_name, dataframe], ...]``
    :return: None
    """
    dbutil = DBUtil(host, port, database, username, password)
    for row in rows:
        table_name, frame = row[0], row[1]
        dbutil.df_insert(table_name, frame)
    dbutil.close()
def rf_cal():
    """Aggregate citation counts per subject ("relative frequency" data) and
    store the filtered results in the rf_*_data_last tables.

    Pipeline: load the journal-to-journal citation edge list from the DB,
    build a subject -> core-journal-id mapping from a local spreadsheet,
    aggregate cited/citing totals per subject, keep only journals that are
    not yet classified, and insert the two result frames.

    Side effects: DB reads/inserts, reads two local Excel files, prints each
    subject name as progress output. Returns None.
    """
    dbutil = DBUtil(host, port, database, username, password)
    # cite_num+0: presumably coerces a text column to numeric in SQL — confirm
    # against the scopus_cite_data schema. Self-citations are excluded.
    sql = 'select scopus_journal_id, cite_num+0 as cite_num, cite_journal_id from scopus_cite_data ' \
          'where scopus_journal_id!=cite_journal_id'
    all_cite_data = dbutil.get_allresult(sql, 'df')

    # with open('core_journal.json', 'r', encoding='utf-8') as cj:
    #     core_journal = json.load(cj)

    # subject -> list of core journal ids, from the already-classified
    # spreadsheet (分学科结果 = "by-subject results").
    core_journal = {}
    already_data = pd.read_excel('C:/Users/Administrator/Desktop/分学科结果.xlsx')
    already_data['scopus_journal_id'] = already_data[
        'scopus_journal_id'].astype('str')
    for subject, sub_df in already_data.groupby('subject'):
        core_journal[subject] = list(sub_df['scopus_journal_id'])

    rf_cited_list = []
    rf_citing_list = []
    for subject, sub_journal in core_journal.items():
        print(subject)
        # Edges whose target is in this subject's core set: per source
        # journal, the total citations it gives to the subject.
        sub_cited_df_temp = all_cite_data.loc[
            all_cite_data['cite_journal_id'].isin(sub_journal)]
        sub_cited_df = sub_cited_df_temp.groupby(
            'scopus_journal_id', as_index=False)['cite_num'].sum()
        # Edges whose source is in this subject's core set: per target
        # journal, the total citations it receives from the subject.
        sub_citing_df_temp = all_cite_data.loc[
            all_cite_data['scopus_journal_id'].isin(sub_journal)]
        sub_citing_df = sub_citing_df_temp.groupby(
            'cite_journal_id', as_index=False)['cite_num'].sum()

        sub_cited_df['subject'] = subject
        rf_cited_list.append(sub_cited_df)

        sub_citing_df['subject'] = subject
        rf_citing_list.append(sub_citing_df)

    rf_cited_df = pd.concat(rf_cited_list)
    rf_citing_df = pd.concat(rf_citing_list)

    # dbutil.df_insert('rf_cited_data', rf_cited_df)
    # dbutil.df_insert('rf_citing_data', rf_citing_df)

    # Keep only journals that still lack a subject assignment
    # (未被分学科期刊 = "journals not yet classified by subject").
    select_journal = pd.read_excel(
        'C:/Users/Administrator/Desktop/未被分学科期刊.xlsx')
    select_journal['Scopus Source ID'] = select_journal[
        'Scopus Source ID'].astype('str')
    rf_cited_df = rf_cited_df.loc[rf_cited_df['scopus_journal_id'].isin(
        list(select_journal['Scopus Source ID']))]
    rf_citing_df = rf_citing_df.loc[rf_citing_df['cite_journal_id'].isin(
        list(select_journal['Scopus Source ID']))]

    dbutil.df_insert('rf_cited_data_last', rf_cited_df)
    dbutil.df_insert('rf_citing_data_last', rf_citing_df)

    dbutil.close()
def deal8():
    """Join the manual Web of Science spreadsheet onto person_ids that were
    never resolved, then export the still-missing rows to Excel.

    Side effects: one DB query, one Excel read, one Excel write. Returns None.
    """
    wos_df = pd.read_excel('C:/Users/Administrator/Desktop/0618webofscience.xlsx')

    dbutil = DBUtil(host, port, database, username, password)
    sql = "select distinct (person_id+0) as person_id from not_find where person_id not in (select person_id from find_result)"
    missing = dbutil.get_allresult(sql, 'df')
    dbutil.close()

    merged = pd.merge(missing, wos_df, how='left', on='person_id')
    merged.to_excel('C:/Users/Administrator/Desktop/未搜索到结果清单.xlsx',
                    sheet_name='Sheet1',
                    index=False)
# 예제 #5  (example separator left over from the scraping source; commented
# out so the file parses as Python)
# 0
def temp(input_data):
    """Filter *input_data* down to records whose person_id does not yet
    appear in any of the three author tables.

    :param input_data: iterable of dict-like records with a 'person_id' key
    :return: list of the records not present in the database
    """
    from src.config.DBUtil import DBUtil
    from src.Scopus_Crawler.scopus_config import host, port, database, username, password

    dbutil = DBUtil(host, port, database, username, password)
    sql = "SELECT person_id FROM author_info_new UNION SELECT person_id FROM mult_matched_author UNION SELECT " \
          "person_id FROM not_matched_author"
    known_df = dbutil.get_allresult(sql, 'df')
    dbutil.close()

    # Compare as strings: incoming ids may not be str yet.
    candidate_ids = {str(record['person_id']) for record in input_data}
    unseen = candidate_ids - set(known_df['person_id'])
    return [record for record in input_data
            if str(record['person_id']) in unseen]
def deal1():
    """Build the manual-check spreadsheet for batch 2020060819255418: distinct
    matched authors joined with the scholar roster, plus a Scopus detail link,
    a Scopus author-search link and a Baidu search link per row.

    Side effects: one DB query, reads test_data2.xlsx, writes
    C:/Users/Administrator/Desktop/check_data.xlsx. Returns None.
    """
    dbutil = DBUtil(host, port, database, username, password)
    sql = "select DISTINCT person_id, name, scopus_id from author_info_new where data_no='2020060819255418'"
    try:
        author_list1 = dbutil.get_allresult(sql, 'df')
    finally:
        # Release the connection even if the query raises.
        dbutil.close()

    author_list2 = pd.read_excel(
        'C:/Users/Administrator/Desktop/test_data/test_data2.xlsx',
        sheet_name='Sheet1')
    # Align the roster key dtype with the string person_id before merging.
    author_list2['学者代码'] = author_list2['学者代码'].astype('str')

    author_list = pd.merge(author_list1,
                           author_list2,
                           how='left',
                           left_on='person_id',
                           right_on='学者代码')
    author_list.drop_duplicates(subset=['person_id'],
                                inplace=True,
                                keep='first')
    author_list.reset_index(drop=True, inplace=True)

    # Pre-create the link columns (placeholder values are overwritten below).
    author_list['详细链接'] = 'a'
    author_list['百度搜索链接'] = 'b'
    author_list['搜索链接'] = 'c'

    search_url = 'https://www.scopus.com/results/authorNamesList.uri?sort=count-f&src=al' \
                 '&st1={}' \
                 '&st2={}' \
                 '&orcidId=&affilName={}' \
                 '&sot=anl&sdt=anl&sl=64&resultsPerPage=200&offset=1&jtp=false&currentPage=1&exactAuthorSearch=true'

    for i in range(len(author_list)):
        author_list.loc[i, '详细链接'] = (
            'https://www.scopus.com/authid/detail.uri?authorId=%s'
            % author_list.loc[i, 'scopus_id'])
        # Guard: a single-token name previously raised IndexError on [1].
        name_parts = str(author_list.loc[i, 'name']).split(' ')
        first_name = name_parts[0].lower()
        last_name = name_parts[1].lower() if len(name_parts) > 1 else ''
        # ins_en may be NaN after the left merge; treat missing as empty.
        ins_raw = author_list.loc[i, 'ins_en']
        ins_name = ('+'.join(str(ins_raw).split(' ')).lower()
                    if pd.notna(ins_raw) else '')
        author_list.loc[i, '搜索链接'] = search_url.format(first_name, last_name,
                                                       ins_name)
        # Parentheses make the original precedence explicit: (url % unit) + name.
        author_list.loc[i, '百度搜索链接'] = (
            'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=%s'
            % author_list.loc[i, '头衔当选单位']) + author_list.loc[i, '姓名']

    # The former `encoding='utf-8'` kwarg was dropped: it was ignored by xlsx
    # writers and removed from DataFrame.to_excel in pandas 2.0 (TypeError).
    author_list.to_excel('C:/Users/Administrator/Desktop/check_data.xlsx',
                         index=False)
def deal5():
    """Export the distinct multi-matched authors of batch 2020052716115197,
    enriched with roster columns from the local spreadsheet.

    Side effects: one DB query, one Excel read, writes 0605.xlsx. Returns None.
    """
    dbutil = DBUtil(host, port, database, username, password)
    sql = "select DISTINCT person_id from mult_matched_author where data_no='2020052716115197'"
    person_df = dbutil.get_allresult(sql, 'df')
    dbutil.close()

    roster = pd.read_excel(
        'C:/Users/Administrator/Desktop/test_data/test_data2.xlsx')
    # Align the roster key dtype with the string person_id before merging.
    roster['学者代码'] = roster['学者代码'].astype('str')

    merged = pd.merge(person_df,
                      roster,
                      how='left',
                      left_on='person_id',
                      right_on='学者代码')
    merged.drop_duplicates(subset=['person_id'],
                           inplace=True,
                           ignore_index=True)

    merged.to_excel('C:/Users/Administrator/Desktop/0605.xlsx',
                    sheet_name='Sheet1',
                    index=False)
#! /usr/bin/python
# -*- coding: utf-8 -*-

import pandas as pd

from src.config.DBUtil import DBUtil
from src.Scopus_Crawler.scopus_config import host, port, database, username, password

# Script body: deduplicate the raw WoS article dump and write the cleaned
# rows to the final table.
dbutil = DBUtil(host, port, database, username, password)
sql = 'select * from wos_article_data'
already_df = dbutil.get_allresult(sql, 'df')

# UT is the Web of Science accession number — presumably unique per article,
# so it serves as the dedup key; verify against the table schema.
already_df.drop_duplicates(subset=['UT'], ignore_index=True, inplace=True)

dbutil.df_insert('wos_article_data_final', already_df)
dbutil.close()