    def craw_info(self):
        '''
        Crawl the detail page of each candidate officer and store the extracted data.
        :return:
        '''
        global temp, counter

        while True:
            if len(self.may_officer_info) == 0:
                break
            officer_data = self.may_officer_info.pop(0)
            raw_data = get_html_by_url(officer_data[2])
            if raw_data is None:
                continue
            soup = BeautifulSoup(raw_data, 'html.parser')
            is_officer, filter_info = self.officer_filter(soup, officer_data)
            if not is_officer:
                print('Not Officer: ', filter_info, officer_data)
                # self.other_csv_writer.writerow(officer_data)
                continue
            self.add_officer_info(soup, officer_data)

        threadLock_init.acquire()
        counter -= 1
        if counter == 0:
            self.out.close()
            self.out_other()
        threadLock_init.release()
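# The method above relies on a shared counter guarded by threadLock_init so that
# the last worker thread to finish is the one that closes the output files.  The
# sketch below is a minimal, self-contained illustration of that pattern; the
# names run_demo_workers, demo_worker and demo_tasks are illustrative only and
# are not part of the crawler.
def run_demo_workers(num_workers=4):
    import threading

    lock = threading.Lock()
    state = {'remaining': num_workers}

    def demo_worker(demo_tasks):
        while demo_tasks:
            demo_tasks.pop(0)        # stand-in for processing one officer page
        with lock:                   # decrement under the lock; the last worker cleans up
            state['remaining'] -= 1
            if state['remaining'] == 0:
                print('all workers done, outputs would be closed here')

    threads = [threading.Thread(target=demo_worker, args=(list(range(3)),))
               for _ in range(num_workers)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()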
def fetch_person_index(key, part_url, index_list):
    if part_url is None:
        return
    pro_url = 'http://ldzl.people.com.cn/dfzlk/front/' + part_url
    try:
        raw_data = get_html_by_url(pro_url)
    except Exception:
        print('province url is error: ', pro_url)
        return

    soup = BeautifulSoup(raw_data, 'html.parser')
    div_tag = soup.find('div', class_='fr p2j_reports_right title_2j sjzlk')
    city_h2_tags = div_tag.find_all('h2')
    city_div_tags = div_tag.find_all('div', class_='zlk_list')

    name_set = set()
    for i in range(len(city_h2_tags)):
        city_name = re.sub(r'\n', '', city_h2_tags[i].get_text())
        li_tags = city_div_tags[i].find_all('li')
        if len(li_tags) == 0:
            continue
        for li_tag in li_tags:
            district = del_content_blank(li_tag.find('span').get_text())
            person_name = li_tag.find('em').get_text()
            if person_name not in name_set:
                name_set.add(person_name)
                person_index = (person_name, key, city_name, district)
                print(person_index)
                index_list.append(person_index)
    return
def get_lemmid_and_pic_url(baike_url):
    try:
        raw_data = get_html_by_url(baike_url)
    except Exception:
        print('baike url is error: ', baike_url)
        return None, ''
    soup = BeautifulSoup(raw_data, 'html.parser')
    lemmid = soup.find(
        'div', class_='lemmaWgt-promotion-rightPreciseAd').get('data-lemmaid')
    pic_div = soup.find('div', class_='summary-pic')
    pic_url = ''
    if pic_div is not None:
        pic_url = pic_div.find('img').get('src')
    return lemmid, pic_url
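# Hypothetical usage of get_lemmid_and_pic_url (the Baike item URL below is a
# placeholder, not a real page):
#
#     lemmid, pic_url = get_lemmid_and_pic_url('https://baike.baidu.com/item/%E6%9F%90%E4%BA%BA')
#     if lemmid is not None:
#         print(lemmid, pic_url)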
    def add_may_officer_info(self, lemmaId):
        '''
        Add related-person information for an officer.
        :param lemmaId: id of the officer whose related persons are to be added
        :return: links to the related persons
        '''
        global total_count, success_count

        total_count += 1

        json_url = 'https://baike.baidu.com/wikiui/api/zhixinmap?lemmaId=' + lemmaId
        raw_data = get_html_by_url(json_url)

        if raw_data is None:
            print("获取相关链接:" + str(success_count) + '/' + str(total_count))
            print('may officer info is None')
            return []

        json_data = json.loads(str(raw_data, encoding='utf-8'))
        if not isinstance(json_data, list):
            print("获取相关链接:" + str(success_count) + '/' + str(total_count))
            print('json is false')
            return []

        relative_links = []
        for item1 in json_data:
            if item1['tipTitle'].find('人物') < 0 and item1['tipTitle'].find(
                    '学者') < 0:
                continue
            data = item1['data']
            for item in data:
                name = item['title']
                url = item['url']
                pic = item['pic']
                lemmaid = item['lemmaId']
                relative_links.append(url)

                threadLock_id_set_and_may_officer.acquire()

                if int(lemmaid) not in self.may_id_set:
                    self.may_id_set.add(int(lemmaid))
                    off_info = [lemmaid, name, url, pic]
                    self.may_officer_info.append(off_info)

                threadLock_id_set_and_may_officer.release()
                print('add may officer info Success!')

        success_count += 1
        print("获取相关链接:" + str(success_count) + '/' + str(total_count))

        return relative_links
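    # The parsing loop above assumes the zhixinmap API returns a JSON list shaped
    # roughly as follows (structure inferred from the code; field values are
    # illustrative only):
    #
    #     [
    #         {
    #             "tipTitle": "相关人物",
    #             "data": [
    #                 {"title": "某人", "url": "https://baike.baidu.com/item/...",
    #                  "pic": "https://.../photo.jpg", "lemmaId": 123456},
    #                 ...
    #             ]
    #         },
    #         ...
    #     ]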
def get_index_list():
    url = 'http://ldzl.people.com.cn/dfzlk/front/xian35.htm'
    raw_data = get_html_by_url(url)
    soup = BeautifulSoup(raw_data, 'html.parser')
    li_tags = soup.find('div', class_='fl p2j_reports_left').find_all('li')
    province_dict = {}
    for li_tag in li_tags:
        a_tag = li_tag.find('a')
        href = a_tag.get('href')
        if href == '#':
            continue
        province_dict[a_tag.get_text()] = href

    index_list = []
    for key in province_dict.keys():
        fetch_person_index(key, province_dict[key], index_list)
    return index_list
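# Hypothetical usage of get_index_list: fetch the (name, province, city, district)
# index and persist it to CSV (the file name is illustrative):
#
#     with open('../data/person_index.csv', 'w', newline='', encoding='utf-8') as f:
#         writer = csv.writer(f, dialect='excel')
#         for person_index in get_index_list():
#             writer.writerow(person_index)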
def get_person_baike_url(row):
    name, province, city, district = row
    url_list = list()
    name_url = "https://baike.baidu.com/search/word?word={0}".format(
        urllib.request.quote(name))
    html = get_html_by_url(name_url)
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.select(
        'body > div.body-wrapper > div.before-content > div > ul > li')
    for item in items:
        text = item.getText()[1:]
        if province in text or city in text or district in text:
            try:
                new_name_url = "https://baike.baidu.com" + item.a['href']
                url_list.append(new_name_url)
            except AttributeError:
                url_list.append(name_url)
            except TypeError:
                url_list.append(name_url)
    return url_list
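# Hypothetical usage of get_person_baike_url with one index row as produced by
# fetch_person_index (the names below are placeholders, not real data):
#
#     row = ('张三', '河北', '石家庄市', '长安区')
#     for baike_url in get_person_baike_url(row):
#         print(baike_url)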
def update_head_image():
    """
    该函数主要是更新数据库officer_message表中的数据,主要是对头像图片(head_image)字段的更新,
    根据 数据库已存的头像链接字段(head_image_url),爬取图片并将已二进制写入数据库,更新字段
    头像图片(head_image)
    :return: 无返回内容
    """
    Connection = getCon(database='cof', user='******', password='******', host='192.168.10.6')
    select_sql = "SELECT id_index, head_image_url FROM crawler.officer_message WHERE officer_message.head_image ='';"
    update_sql = "UPDATE crawler.officer_message SET head_image = {0} WHERE id_index = '{1}';"
    cur = Connection.cursor()
    cur.execute(select_sql)
    for line in cur.fetchall():
        id_index, head_image_url = line
        head_image = get_html_by_url(head_image_url)
        if head_image is None:
            continue
        # Parameterized query lets psycopg2 escape the binary image data safely.
        cur.execute(update_sql, (psycopg2.Binary(head_image), id_index))
        Connection.commit()
        print(id_index, 'head image stored')
    cur.close()
    Connection.close()
if __name__ == '__main__':
    f_in = open('../data/官员信息.csv', newline='', encoding='utf-8')
    csv_reader = csv.reader(f_in)

    f_out = open('../data/官员信息new.csv', 'w', newline='', encoding='utf-8')
    csv_writer = csv.writer(f_out, dialect='excel')

    i = 0
    for row in csv_reader:
        sleep(1)
        i += 1
        print(i)
        lemmaId = row[0]
        json_url = 'https://baike.baidu.com/wikiui/api/zhixinmap?lemmaId=' + lemmaId
        raw_data = get_html_by_url(json_url)

        if raw_data is None:
            print('may officer info is None')
            csv_writer.writerow(row)
            continue

        json_data = json.loads(str(raw_data, encoding='utf-8'))
        if not isinstance(json_data, list):
            print('json data is not a list: ', json_data)
            csv_writer.writerow(row)
            continue

        print('find relative links Success!')
        off_infos = []
        for item1 in json_data: