def get_name(input_data):
    count = 0
    while count < len(input_data):
        all_person_data_list = []
        try:
            for i in range(count, len(input_data)):
                print('Progress: %s / %s' % (i + 1, len(input_data)))
                url = base_url % input_data[i][1]
                author_info = requests.get(url=url,
                                           proxies=proxies,
                                           headers=headers).json()
                author_name = author_info['search-results']['entry'][0]['preferred-name']['surname'] + ' ' + \
                              author_info['search-results']['entry'][0]['preferred-name'].get('initials', '')
                all_person_data_list.append(
                    [input_data[i][0], input_data[i][1],
                     author_name.strip()])

            count = len(input_data)

        # On error, break out of the loop and resume from the failed index
        except Exception as err:
            print('ERROR:%s' % err)
            count = i

        if all_person_data_list:
            result_df = pd.DataFrame(
                data=all_person_data_list,
                columns=['person_id', 'scopus_id', 'name'])
            write2sql([['scopus_author_name', result_df]])
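
# Every example here hands write2sql a list of [table_name, DataFrame] pairs;
# the helper itself is defined elsewhere. A minimal sketch, assuming a MySQL
# target and pandas' to_sql (the connection string is a placeholder):
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://user:password@host:3306/database')

def write2sql(table_df_pairs):
    # append each DataFrame to its target table
    for table_name, df in table_df_pairs:
        df.to_sql(table_name, con=engine, if_exists='append', index=False)
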
def get_cite_data(input_data):
    count = 0
    while count < len(input_data):
        result_df_list = []
        # Launch a browser and grab its cookies
        driver = start_driver()
        cookies = get_cookies(driver)
        driver.close()
        try:
            for i in range(count, len(input_data)):
                print('Progress: %s / %s' % (i + 1, len(input_data)))
                cite_journal_list = []
                cite_journal_id_list = []
                get_page_url = base_url % (input_data[i][0], str(1))
                page_info = requests.get(get_page_url,
                                         proxies=proxies,
                                         headers=headers,
                                         timeout=300,
                                         cookies=cookies).json()
                cite_journal_list += [k['srctitle'] for k in page_info['docs']]
                cite_journal_id_list += [k['srcid'] for k in page_info['docs']]
                for j in range(2, int(page_info['Pages']) + 1):
                    url = base_url % (input_data[i][0], str(j))
                    cite_info = requests.get(url,
                                             proxies=proxies,
                                             headers=headers,
                                             timeout=300,
                                             cookies=cookies)
                    cite_info_dict = cite_info.json()
                    cite_journal_list += [
                        k['srctitle'] for k in cite_info_dict['docs']
                    ]
                    cite_journal_id_list += [
                        k['srcid'] for k in cite_info_dict['docs']
                    ]

                cite_df_temp = pd.DataFrame(
                    data={
                        'cite_journal': cite_journal_list,
                        'cite_journal_id': cite_journal_id_list
                    })
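                # cite_num is 1 per citing document, so the groupby sum below
                # counts citations per citing journal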
                cite_df_temp['cite_num'] = 1
                cite_journal_data = cite_df_temp.groupby(
                    by=['cite_journal',
                        'cite_journal_id'], as_index=False).sum()
                cite_journal_data['scopus_journal_id'] = input_data[i][0]
                result_df_list.append(cite_journal_data)

            count = len(input_data)

        # On error, break out of the loop and resume from the failed index
        except Exception as err:
            print('ERROR:%s' % err)
            count = i

        if result_df_list:
            all_data = pd.concat(result_df_list)
            write2sql([['scopus_cite_data0810', all_data]])
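
# start_driver/get_cookies are defined elsewhere; a minimal sketch, assuming
# Selenium is used only to obtain cookies that requests then replays (the
# browser options and warm-up URL are assumptions):
from selenium import webdriver

def start_driver():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    driver.get('https://www.scopus.com')  # warm up the session
    return driver

def get_cookies(driver):
    # convert Selenium's list-of-dicts cookies into the name -> value dict
    # accepted by requests' cookies= argument
    return {c['name']: c['value'] for c in driver.get_cookies()}
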
def get_doc_data(input_data, school, value):
    headers['content-type'] = 'application/json'
    post_url = post_url_base % value[1]
    if school == 'Chinese Academy of Medical Sciences - Peking Union Medical College':
        del post_data['filters']['orgtype']
    count = 0
    while count < len(input_data):
        result_df_list = []
        try:
            session = requests.session()
            session.post(url=login_url,
                         data=login_data,
                         headers=headers_login,
                         timeout=300)
            for i in range(count, len(input_data)):
                print('Progress: %s / %s' % (i + 1, len(input_data)))
                for year in range(2015, 2020):
                    post_data['filters']['sbjname']['is'] = input_data[i]
                    post_data['filters']['period']['is'] = [year, year]
                    doc_data = session.post(url=post_url,
                                            data=json.dumps(post_data),
                                            headers=headers,
                                            timeout=300).json()

                    if doc_data['items']:
                        for item in doc_data['items']:
                            item['doc_num'] = item['jifdocsq1']['value']
                            del item['jifdocsq1']
                            del item['wosDocuments']

                        data_df = pd.DataFrame(data=doc_data['items'])
                        data_df['year'] = year
                        data_df['category_id'] = input_data[i][:4]
                        data_df['category_name'] = input_data[i][5:]
                        result_df_list.append(data_df)

            count = len(input_data)
            session.close()

        # On error, break out of the loop and resume from the failed index
        except Exception as err:
            print('ERROR:%s' % err)
            session.close()
            count = i

        if result_df_list:
            all_data_df = pd.concat(result_df_list)
            all_data_df = all_data_df.loc[all_data_df['orgName'] == value[0]]
            all_data_df['orgName'] = school
            write2sql([['wos_doc_data_copy', all_data_df]])
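
# post_data is a module-level dict defined elsewhere; an illustrative shape,
# inferred only from the keys the code above touches (everything else about
# the real payload is an assumption):
post_data = {
    'filters': {
        'sbjname': {'is': ''},              # subject category, set per query
        'period': {'is': [2015, 2019]},     # [start_year, end_year]
        'orgtype': {'is': ['University']},  # deleted above for CAMS-PUMC
    },
    # ... remaining query fields not shown in the snippet
}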
Example #4
def get_article(input_data):
    count = 0
    while count < len(input_data):
        all_person_data_list = []
        try:
            for i in range(count, len(input_data)):
                print('Progress: %s / %s' % (i + 1, len(input_data)))
                for j in range(1000):  # page through results, 25 records per request
                    url = base_url.format(input_data[i][1], input_data[i][2],
                                          str(j * 25))
                    author_article_info = requests.get(url=url,
                                                       proxies=proxies,
                                                       headers=headers).json()
                    if 'service-error' in author_article_info:
                        print(input_data[i][2])
                        break

                    if 'entry' not in author_article_info['search-results']:
                        break

                    if 'error' in author_article_info['search-results'][
                            'entry'][0]:
                        break

                    for article_dict in author_article_info['search-results'][
                            'entry']:
                        doi = article_dict.get('prism:doi', '')
                        eid = article_dict.get('eid', '')
                        scopus_article_id = article_dict['dc:identifier'][10:]  # strip the 'SCOPUS_ID:' prefix
                        publish_year = article_dict.get('prism:coverDate', '')
                        all_person_data_list.append([
                            input_data[i][0], input_data[i][1], doi,
                            publish_year, eid, scopus_article_id
                        ])

            count = len(input_data)

        # On error, break out of the loop and resume from the failed index
        except Exception as err:
            print('ERROR:%s' % err)
            count = i

        if all_person_data_list:
            result_df = pd.DataFrame(data=all_person_data_list,
                                     columns=[
                                         'person_id', 'scopus_id', 'doi',
                                         'publish_year', 'eid',
                                         'scopus_article_id'
                                     ])
            write2sql([['scopus_author_article', result_df]])
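
# base_url is defined elsewhere and takes three placeholders. Judging by the
# public Scopus Search API and the [10:] slice that strips 'SCOPUS_ID:', one
# plausible shape (mapping the slots to author id, API key and offset is an
# assumption) is:
base_url = ('https://api.elsevier.com/content/search/scopus'
            '?query=AU-ID({})&apiKey={}&start={}&count=25')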
def get_doc_data(input_data):
    headers['content-type'] = 'application/json'
    count = 0
    while count < len(input_data):
        result_df_list = []
        try:
            session = requests.session()
            session.post(url=login_url,
                         data=login_data,
                         headers=headers_login,
                         timeout=300)
            for i in range(count, len(input_data)):
                print('Progress: %s / %s' % (i + 1, len(input_data)))
                post_url = post_url_base.format(input_data[i][1])
                post_data_detail['filters']['sbjname'][
                    'is'] = input_data[i][3] + ' ' + input_data[i][4]
                doc_data = session.post(url=post_url,
                                        data=json.dumps(post_data_detail),
                                        headers=headers,
                                        timeout=300).json()
                if doc_data['items']:
                    for item in doc_data['items']:
                        item['title'] = item['a']['title']
                        del item['a']

                    data_df = pd.DataFrame(data=doc_data['items'])
                    data_df['category_id'] = input_data[i][3]
                    data_df['category_name'] = input_data[i][4]
                    data_df['orgName'] = input_data[i][0]
                    result_df_list.append(data_df)

            count = len(input_data)
            session.close()

        # On error, break out of the loop and resume from the failed index
        except Exception as err:
            print('ERROR:%s' % err)
            session.close()
            count = i

        if result_df_list:
            all_data_df = pd.concat(result_df_list)
            write2sql([['wos_doc_data_detail', all_data_df]])
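
# login_url, login_data and headers_login are module-level globals shared by
# the get_doc_data variants above; an illustrative shape only, with a
# placeholder URL and field names (the real endpoint is not shown in these
# snippets):
login_url = 'https://example.com/login'  # placeholder
headers_login = {'content-type': 'application/x-www-form-urlencoded'}
login_data = {'username': 'user', 'password': 'secret'}  # placeholders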
def main_prog(driver, subject_code, type_code):
    driver.get(basic_url)
    time.sleep(0.5)
    # Select the funding category
    driver.find_element_by_id('f_grantCode').click()
    time.sleep(0.5)
    driver.find_element_by_xpath(
        '/html/body/div[4]/div[1]/div[2]/div/table/tbody/tr[1]/td/table/tbody/tr[6]/td[2]/select/option[%s]'
        % type_code).click()
    time.sleep(0.5)
    # Select the year
    # driver.find_element_by_id('f_year').click()
    # time.sleep(0.5)
    # driver.find_element_by_xpath('/html/body/div[4]/div[1]/div[2]/div/table/tbody/tr[1]/td/table/tbody/tr[10]/td[2]/select/option[1]').click()
    # Select the application code
    driver.find_element_by_name('subjectCode').send_keys(subject_code)
    time.sleep(2)
    driver.find_element_by_xpath('/html/body/div[6]/ul/li[1]').click()
    time.sleep(0.5)
    # Enter the captcha; retry until recognition succeeds
    while '递减' not in driver.page_source:  # '递减' ("descending") appears only on the results page
        # OCR the captcha
        check_code = check_code_ocr(driver)
        # Fill in the captcha
        driver.find_element_by_id('f_checkcode').clear()
        driver.find_element_by_id('f_checkcode').send_keys(check_code)
        time.sleep(0.5)
        # Run the query
        driver.find_element_by_id('searchBt').click()
        time.sleep(0.5)

    result_list = get_data(driver)
    if result_list:
        result_df = pd.DataFrame(data=result_list,
                                 # Chinese column names kept to match the
                                 # target table: grant no., application code,
                                 # project title, PI, host institution,
                                 # approved amount, project start/end dates
                                 columns=[
                                     'id', '项目批准号', '申请代码', '项目名称', '项目负责人',
                                     '依托单位', '批准金额', '项目起止年月'
                                 ])
        write2sql([['nsfc_data', result_df]])
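
# check_code_ocr is defined elsewhere; a minimal sketch using Tesseract on a
# screenshot of the captcha element (the element id and the lack of any image
# preprocessing are assumptions):
import io

import pytesseract
from PIL import Image

def check_code_ocr(driver):
    captcha = driver.find_element_by_id('checkcodeImg')  # assumed element id
    image = Image.open(io.BytesIO(captcha.screenshot_as_png))
    return pytesseract.image_to_string(image).strip()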
Example #7
def crawl_h_index(input_data):
    count = 0
    while count < len(input_data):
        # Launch a browser and grab its cookies
        driver = start_driver()
        cookies = get_cookies(driver)
        driver.close()
        result_list = []
        try:
            # Match each scholar on Scopus and fetch their info
            for i in range(count, len(input_data)):
                print('Progress: %s / %s' % (i + 1, len(input_data)))
                url = 'https://www.scopus.com/authid/detail.uri?authorId=%s' % input_data[
                    i][1]
                detail_page = requests.get(url,
                                           proxies=proxies,
                                           headers=headers,
                                           timeout=300,
                                           cookies=cookies)
                soup = bs(detail_page.text, 'lxml')
                h_index = soup.find(id='authorDetailsHindex')
                element = h_index.find(class_='fontLarge')
                if element:
                    result_list.append(
                        [input_data[i][0], input_data[i][1], element.text])

            # End the loop
            count = len(input_data)

        # On error, break out of the loop and resume from the failed index
        except Exception as err:
            print('ERROR:%s' % err)
            count = i

        # Write whatever has been completed so far
        result_df = pd.DataFrame(data=result_list,
                                 columns=['person_id', 'scopus_id', 'h_index'])
        write2sql([['h_index', result_df]])
Example #8
def get_detail_data(input_data):
    data_no = datetime.datetime.now().strftime('%Y%m%d%H%M%S') + str(random.randint(10, 99))
    count = 0
    t_index = 4
    while count < len(input_data):
        # Launch a browser and grab its cookies
        driver = start_driver()
        cookies = get_cookies(driver)
        driver.close()
        ins_data_list = []
        hindex_data_list = []
        article_data_list = []
        try:
            for i in range(count, len(input_data)):
                print('Progress: %s / %s' % (i + 1, len(input_data)))
                # Fetch the scholar's articles and citation data
                article_data_df = catch_info(input_data[i][t_index], cookies)
                article_data_df['name_zh'] = input_data[i][2]
                article_data_df['person_id'] = input_data[i][0]
                article_data_list.append(article_data_df)

                # Fetch the scholar's h-index
                detail_url = detail_base_url % input_data[i][t_index]
                detail_page = requests.get(detail_url, proxies=proxies, headers=headers, timeout=300, cookies=cookies)
                detail_soup = bs(detail_page.text, 'lxml')
                h_index = detail_soup.find(id='authorDetailsHindex')
                h_index_element = h_index.find(class_='fontLarge')
                if h_index_element:
                    hindex_data_list.append([input_data[i][0], input_data[i][t_index], h_index_element.text])

                # Fetch the scholar's affiliation history
                ins_url = ins_base_url % input_data[i][t_index]
                ins_page = requests.get(ins_url, proxies=proxies, headers=headers, timeout=300, cookies=cookies)
                ins_data = ins_page.json()  # parse the JSON payload rather than eval() it
                for ins in ins_data:
                    # print(len(ins['affiliationName']), ins['affiliationName'])
                    ins['start_year'] = ins['dateRange'][0]
                    ins['end_year'] = ins['dateRange'][1]
                    ins.pop('dateRange')
                ins_data_df = pd.DataFrame(ins_data)
                rename_dict = {'affiliationCity': 'aff_city',
                               'affiliationName': 'aff_name',
                               'affiliationCountry': 'aff_country',
                               'affiliationId': 'aff_id',
                               'affiliationUrl': 'aff_url'}
                ins_data_df.rename(columns=rename_dict, inplace=True)
                ins_data_df['scopus_id'] = input_data[i][t_index]
                ins_data_df['person_id'] = input_data[i][0]
                ins_data_df['name_zh'] = input_data[i][2]
                ins_data_list.append(ins_data_df)

            count = len(input_data)

        # On error, break out of the loop and resume from the failed index
        except Exception as err:
            print('ERROR:%s' % err)
            count = i

        all_ins_data = pd.DataFrame() if not ins_data_list else pd.concat(ins_data_list)
        all_hindex_data = pd.DataFrame() if not hindex_data_list else pd.DataFrame(data=hindex_data_list, columns=['person_id', 'scopus_id', 'h_index'])
        all_article_data = pd.DataFrame() if not article_data_list else pd.concat(article_data_list)

        for _df in [all_ins_data, all_article_data]:
            if len(_df) > 0:
                _df['data_no'] = data_no
                _df['flag'] = 2

        all_hindex_data['flag'] = 2

        write2sql([['author_info_new', all_article_data], ['author_exp', all_ins_data],
                   ['h_index', all_hindex_data]])
# NOTE: this snippet begins mid-script in the source; the lines below are the
# tail of a resume-on-error loop like the ones above. The scaffolding here is
# a hedged reconstruction, and data_list, headers and cookies_dict are assumed
# to be prepared upstream.
count = 0
while count < len(data_list):
    result_list = []
    try:
        for i in range(count, len(data_list)):
            print(i)
            data = data_list[i]
            url = data[1]
            # page = requests.get(url=url, headers=headers, proxies=proxies, cookies=cookies_dict)
            page = requests.get(url=url, headers=headers, cookies=cookies_dict)
            source = re.findall(r'''<div class="i-list_baokan t1_12">[\s\S]*?</div>''', page.text)
            if source:
                link_list = re.findall(r'''<li style=".*?"><a title=".*?" href="(.*?)">''', source[0])
                title_list = re.findall(r'''<li style=".*?"><a title="(.*?)" href=".*?''', source[0])
                domain_url = 'http://www.apabi.com/sjtu/'
                link_list = [domain_url + i for i in link_list]
                df = pd.DataFrame(data={'title': title_list, 'text_link': link_list})
                df['text_date'] = data[2]
                df['page_link'] = data[1]
                df['page_name'] = data[0]
                result_list.append(df)
        # End the loop
        count = len(data_list)

    # On error, break out of the loop and resume from the failed index
    except Exception as err:
        print('ERROR:%s' % err)
        print('Progress: %s / %s' % (i + 1, len(data_list)))
        count = i

    if result_list:
        all_data_df = pd.concat(result_list)
        write2sql([['rmrb_copy', all_data_df]])
def get_detail_data(input_data):
    data_no = datetime.datetime.now().strftime('%Y%m%d%H%M%S') + str(
        random.randint(10, 99))
    count = 0
    while count < len(input_data):
        all_person_data_list = []
        driver = start_driver()
        try:
            for i in range(count, len(input_data)):
                print('Progress: %s / %s' % (i + 1, len(input_data)))
                url = detail_base_url % input_data[i][1]
                driver.get(url)
                try:
                    driver.find_element_by_id('authViewCitOver').click()
                except Exception as err:
                    print('ERROR:%s' % err)
                    continue
                time.sleep(10)
                cto_id1 = re.findall(r'CTOF_([0-9]+)', driver.current_url)
                cto_id2 = re.findall(r'var ctoId = "CTODS_([0-9]+)"',
                                     driver.page_source)
                one_person_data_list = []
                # Loop over result pages
                for j in range(5):
                    url2 = article_url % (cto_id2[0], cto_id1[0], 200 * j)
                    driver.get(url2)
                    if 'Error 500' in driver.page_source:
                        break
                    # empty cells come through as '&nbsp;'; mapping them to
                    # '0' lets the int() casts below succeed
                    soup = bs(driver.page_source.replace('&nbsp;', '0'),
                              'lxml')
                    publish_year = re.findall(r'<span>([0-9]{4})</span>',
                                              driver.page_source)
                    previous_cell_count = [
                        int(k.text)
                        for k in soup.find_all(class_='previousCellCount')
                    ]
                    previous_years = [
                        int(k.text)
                        for k in soup.find_all(class_='previousYears')
                    ]
                    sub_total = [
                        int(k.text) for k in soup.find_all(class_='subTotal')
                    ]
                    all_total = [
                        int(k.text) for k in soup.find_all(class_='subtotal')
                    ]
                    prev_latest = [
                        int(k.text)
                        for k in soup.find_all(class_='prevLatestYears')
                    ]
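                    # the citation-overview table has 15 year columns
                    # (2006-2020) per article row; find_all flattens them
                    # row-major, so a stride-15 slice recovers one year column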
                    one_page_data = pd.DataFrame({
                        'publish_year': publish_year,
                        'previous_cell_count': previous_cell_count,
                        '2006': previous_years[0::15],
                        '2007': previous_years[1::15],
                        '2008': previous_years[2::15],
                        '2009': previous_years[3::15],
                        '2010': previous_years[4::15],
                        '2011': previous_years[5::15],
                        '2012': previous_years[6::15],
                        '2013': previous_years[7::15],
                        '2014': previous_years[8::15],
                        '2015': previous_years[9::15],
                        '2016': previous_years[10::15],
                        '2017': previous_years[11::15],
                        '2018': previous_years[12::15],
                        '2019': previous_years[13::15],
                        '2020': previous_years[14::15],
                        'sub_total': sub_total,
                        'all_total': all_total,
                        'prev_latest': prev_latest
                    })
                    one_person_data_list.append(one_page_data)

                if one_person_data_list:
                    one_person_data = pd.concat(one_person_data_list)
                    one_person_data['person_id'] = input_data[i][0]
                    one_person_data['scopus_id'] = input_data[i][1]
                    all_person_data_list.append(one_person_data)

            count = len(input_data)
            driver.close()

        # On error, break out of the loop and resume from the failed index
        except Exception as err:
            print('ERROR:%s' % err)
            count = i
            driver.close()

        if all_person_data_list:
            all_person_data = pd.concat(all_person_data_list)
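            # article_num is 1 per article row, so the groupby sum below
            # yields per-year citation totals plus an article count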
            all_person_data['article_num'] = 1
            final_data = all_person_data.groupby(
                by=['person_id', 'scopus_id',
                    'publish_year'], as_index=False).sum()
            final_data['data_no'] = data_no

            write2sql([['article_cite_data', final_data]])
Example #11
def main_prog(input_data):
    '''
    :param input_data: [{'person_id': 1234564, 'name': 'liu bo',
                         'ins': ['fudan university', 'xx university', 'xxx university'],
                         'ins_id': [111, 222, 333], 'name_zh': '刘博'}, {...}]
    :return:
    '''
    data_no = datetime.datetime.now().strftime('%Y%m%d%H%M%S') + str(random.randint(10, 99))
    logger.info('Data version no.: %s' % data_no)
    count = 0
    while count < len(input_data):
        all_aff_df = pd.DataFrame()
        basic_info_df = pd.DataFrame()
        mult_result_df = pd.DataFrame()
        not_matched_df = pd.DataFrame()
        # Launch a browser and grab its cookies
        driver = start_driver()
        cookies = get_cookies(driver)
        driver.close()
        try:
            # Match each scholar on Scopus and fetch their info
            for i in range(count, len(input_data)):
                person_id = input_data[i]['person_id']
                author_name = input_data[i]['name']
                author_name_zh = input_data[i]['name_zh']
                author_ins = input_data[i]['ins']
                author_ins_id = input_data[i]['ins_id']
                logger.info('Progress: ruanke_id:%s, name_zh:%s, name:%s' % (person_id, author_name_zh, author_name[0]))
                # Lowercase all English institution names
                author_ins = [i.lower() for i in author_ins]

                # TODO: temporary change (06-24)
                # authorID_list = get_id(person_id, author_name, author_name_zh, author_ins[0])
                authorID_list = []
                for _name in author_name:
                    for _ins in author_ins:
                        authorID_list.extend(get_id(person_id, _name, author_name_zh, _ins))
                        if authorID_list:
                            break
                    if authorID_list:
                        break

                # Match via the scopus_ids tied to the institution
                aff_df, basic_info, mult_re, not_match = match(cookies, person_id, author_name, author_name_zh,
                                                               author_ins_id, authorID_list)

                all_aff_df = pd.concat([all_aff_df, aff_df], ignore_index=True)
                basic_info_df = pd.concat([basic_info_df, basic_info], ignore_index=True)
                mult_result_df = pd.concat([mult_result_df, mult_re], ignore_index=True)
                not_matched_df = pd.concat([not_matched_df, not_match], ignore_index=True)

            # End the loop
            count = len(input_data)

        # On error, break out of the loop and resume from the failed index
        except Exception as err:
            logger.info('ERROR:%s' % err)
            logger.info('Progress: %s / %s' % (i + 1, len(input_data)))
            count = i

        # Add the data_no field
        for df in [all_aff_df, basic_info_df, mult_result_df, not_matched_df]:
            if len(df) > 0:
                df['data_no'] = data_no
        # Write whatever has been completed so far
        write2sql([['author_info_new', basic_info_df], ['author_exp', all_aff_df],
                   ['mult_matched_author', mult_result_df], ['not_matched_author', not_matched_df]])
Example #12
# df = pd.read_excel('C:/Users/Administrator/Desktop/校名映射表.xlsx')
# data = pd.merge(data, df, how='left', on='orgName')
#
# print('merge complete')
# print('starting database write')
# write2sql([['wos_doc_data0930', data]])
# print('database write complete')

dbutil = DBUtil(host, port, database, username, password)
sql = 'select * from wos_doc_data_detail'
data = dbutil.get_allresult(sql, 'df')
print('Finished reading source data')
df = pd.read_excel('C:/Users/Administrator/Desktop/校名映射表.xlsx')  # school-name mapping table
data = pd.merge(data, df, how='left', on='orgName')

data1 = data.loc[0:499999]
data2 = data.loc[500000:999999]
data3 = data.loc[1000000:1499999]
data4 = data.loc[1500000:1999999]
data5 = data.loc[2000000:2499999]
data6 = data.loc[2500000:2999999]
data7 = data.loc[3000000:]

data_list = [data1, data2, data3, data4, data5, data6, data7]
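
# The hand-written 500k-row slices above could be generated generically; a
# sketch using numpy (7 chunks, to match the blocks above):
# import numpy as np
# data_list = np.array_split(data, 7)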

for i, data in enumerate(data_list):
    print(i)
    write2sql([['wos_doc_data_detail0930', data]])
# NOTE: a separate truncated snippet begins here, nearly identical to the
# rmrb_copy fragment above but writing to the 'rmrb' table. The scaffolding is
# again a hedged reconstruction; this data_list (page records) is unrelated to
# the DataFrame chunks in the loop above.
count = 0
while count < len(data_list):
    result_list = []
    try:
        for i in range(count, len(data_list)):
            print(i)
            data = data_list[i]
            url = data[1]
            # page = requests.get(url=url, headers=headers, proxies=proxies, cookies=cookies_dict)
            page = requests.get(url=url, headers=headers, cookies=cookies_dict)
            source = re.findall(r'''<div class="i-list_baokan t1_12">[\s\S]*?</div>''', page.text)
            if source:
                link_list = re.findall(r'''<li style=".*?"><a title=".*?" href="(.*?)">''', source[0])
                title_list = re.findall(r'''<li style=".*?"><a title="(.*?)" href=".*?''', source[0])
                domain_url = 'http://www.apabi.com/sjtu/'
                link_list = [domain_url + i for i in link_list]
                df = pd.DataFrame(data={'title': title_list, 'text_link': link_list})
                df['text_date'] = data[2]
                df['page_link'] = data[1]
                df['page_name'] = data[0]
                result_list.append(df)
        # End the loop
        count = len(data_list)

    # On error, break out of the loop and resume from the failed index
    except Exception as err:
        print('ERROR:%s' % err)
        print('Progress: %s / %s' % (i + 1, len(data_list)))
        count = i

    if result_list:
        all_data_df = pd.concat(result_list)
        write2sql([['rmrb', all_data_df]])
def get_doc_data(input_data, author_position, period):
    if period == '近10年':  # "last 10 years"
        post_data['filters']['period']['is'] = [2010, 2019]

    if period == '近5年' or period == '近5年Q1':  # "last 5 years" / "last 5 years, Q1"
        post_data['filters']['period']['is'] = [2015, 2019]

    if 'authorposition' in post_data['filters']:
        post_data['filters']['authorposition']['is'] = [author_position]
        if author_position == 'First & Corresponding':
            post_data['filters']['authorposition']['is'] = [
                'First', 'Corresponding'
            ]
        if author_position == 'All':
            del post_data['filters']['authorposition']

    if author_position == 'First & Corresponding':
        post_data['filters']['authorposition'] = {
            "is": ['First', 'Corresponding']
        }

    count = 0
    no_result = []
    while count < len(input_data):
        result_df_list = []
        try:
            session = requests.session()
            session.post(url=login_url,
                         data=login_data,
                         headers=headers_login,
                         timeout=300)
            for i in range(count, len(input_data)):
                print('Progress: %s / %s' % (i + 1, len(input_data)))
                post_data['filters']['personId']['is'] = input_data[i][2]
                author_data = session.post(url=post_url,
                                           data=json.dumps(post_data),
                                           headers=headers,
                                           timeout=300).json()
                if author_data['items']:
                    for item in author_data['items']:
                        item['doc_num'] = item['wosDocuments']['value']
                        item['highly_cited_paper'] = item['highlyCitedPapers'][
                            'value']
                        item['inter_colla'] = item['intCollaborations'][
                            'value']
                        del item['wosDocuments']
                        del item['highlyCitedPapers']
                        del item['intCollaborations']

                    data_df = pd.DataFrame(data=author_data['items'])
                    data_df['person_id'] = input_data[i][0]
                    data_df['name_zh'] = input_data[i][1]
                    result_df_list.append(data_df)

                else:
                    no_result.append([
                        input_data[i][0], input_data[i][1], author_position, 0
                    ])

            count = len(input_data)
            session.close()

        # On error, break out of the loop and resume from the failed index
        except Exception as err:
            print('ERROR:%s' % err)
            session.close()
            count = i

        if result_df_list:
            all_data_df = pd.concat(result_df_list)
            all_data_df['author_position'] = author_position
            all_data_df['period'] = period
            write2sql([['incites_author_data1008', all_data_df]])

    if no_result:
        no_result_df = pd.DataFrame(data=no_result,
                                    # Chinese column names kept for the Excel
                                    # report: talent id, name, author
                                    # position, document count
                                    columns=['人才编号', '姓名', '作者类型', '文献数量'])
        no_result_df.to_excel(
            'C:/Users/Administrator/Desktop/no_result1008.xlsx', index=False)
def wos_article(input_data):
    count = 0
    page_count = 33  # starting page; looks like a resume point left over from an earlier interrupted run
    while count < len(input_data):
        article_data_all = []
        try:
            # Open a requests session
            session = requests.session()
            connection = session.get(url, headers=headers, proxies=proxies)
            # Extract the SID (the WoS session token in the redirect URL)
            sid = re.findall(r'SID=(\w+)&', connection.url)[0]
            # Start fetching data for each author
            for i in range(count, len(input_data)):
                print('Progress: %s / %s' % (i + 1, len(input_data)))
                print(input_data[i])

                name = input_data[i][0]
                data = data_ini.format(sid, sid, name)
                search_page = session.post(post_url,
                                           data=data,
                                           proxies=proxies,
                                           headers=headers,
                                           timeout=300)

                qid = re.findall(r'qid=([0-9]+)&', search_page.text)[0]
                search_result_num = re.findall(
                    r'FINAL_DISPLAY_RESULTS_COUNT = ([0-9]+);',
                    search_page.text)[0]
                print(search_result_num)
                page_num = math.ceil(int(search_result_num) / 500)

                for k in range(page_count, page_num):
                    print(f'{k}/{page_num}')
                    data_dl = data_dl_ini.format(qid, sid, qid, sid,
                                                 k * 500 + 500, k * 500 + 1,
                                                 name, k * 500 + 1,
                                                 k * 500 + 500)
                    dl_page = session.post(dl_post_url,
                                           data=data_dl,
                                           proxies=proxies,
                                           headers=headers,
                                           timeout=300)
                    if ('PT' not in dl_page.text) and ('AU'
                                                       not in dl_page.text):
                        raise Exception('error')
                    data_list = dl_page.text.strip().split('\n')
                    # the export is tab-separated with a header line, which is
                    # skipped via data_list[1:]; `columns` is predefined elsewhere
                    article_data_temp = [
                        j.strip().split('\t') for j in data_list[1:]
                    ]
                    df_temp = pd.DataFrame(data=article_data_temp,
                                           columns=columns)
                    df_temp['name_zh'] = input_data[i][1]
                    article_data_all.append(df_temp)

                page_count = 0

            # End the loop
            count = len(input_data)
            session.close()

        # On error, break out of the loop and resume from the failed index
        except Exception as err:
            print('ERROR:%s' % err)
            session.close()
            count = i
            page_count = k

        if article_data_all:
            article_data_df = pd.concat(article_data_all)
            write2sql([['wos_article_data', article_data_df]])
# NOTE: another truncated snippet begins here; only the tail of its matching
# loop survives in the source. The scaffolding below is a hedged
# reconstruction in the style of the other examples, and the matching logic
# that builds temp_df is not in the source, so placeholder names are used.
count = 0
while count < len(source_list):
    find_df_list = []
    not_find = []
    try:
        for i in range(count, len(source_list)):
            # elided in the source: assume it sets `matched`/`details_found`
            # and builds temp_df for matched records
            if matched:
                if details_found:
                    find_df_list.append(temp_df)
                else:
                    not_find.append(source_list[i][4])

            else:
                not_find.append(source_list[i][4])

        # End the loop
        count = len(source_list)

    # On error, break out of the loop and resume from the failed index
    except Exception as err:
        print('ERROR:%s' % err)
        print('Progress: %s / %s' % (i + 1, len(source_list)))
        count = i
        if isinstance(err, UnicodeEncodeError):
            # skip the offending record instead of retrying it forever
            count += 1
            not_find.append(source_list[i][4])

    if len(find_df_list) > 0:
        all_find = pd.concat(find_df_list)
    else:
        all_find = pd.DataFrame()

    if len(not_find) > 0:
        all_not_find = pd.DataFrame(data=not_find, columns=['person_id'])
    else:
        all_not_find = pd.DataFrame()

    write2sql([['find_result', all_find], ['not_find', all_not_find]])