def update_int_gender_and_background(): """ 该函数主要是更新数据库officer_message表中的数据,主要是对 性别代号(gender_int)字段的更新 以及 学历代号(background_int)字段的更新: 性别代号说明:未知:0,男:1,女:2 学历代号说明:小学: 1, 初中: 2, 高中: 3, 中专: 4, 大专: 5, 本科: 6, 大学: 6, 学士: 6, 硕士: 7, 研究生: 7, 博士: 8 根据 字段(gender) 和 字段(educational_background)更新 :return: 无返回内容 """ Connection = getCon(database='cof', user='******', password='******', host='192.168.10.6') select_sql = "SELECT id_index, gender, educational_background FROM crawler.officer_message WHERE gender_int ISNULL;" update_sql = "UPDATE crawler.officer_message SET gender_int = {0}, edu_int = {1} WHERE id_index = '{2}';" cur = Connection.cursor() cur.execute(select_sql) for line in cur.fetchall(): id_index, gender, educational_background = line print(line) if '男' in gender: gender_int = 1 elif '女' in gender: gender_int = 2 else: gender_int = 0 edu_int = 0 for edu in edu_int_dict.keys(): if edu in educational_background: edu_int = edu_int_dict[edu] finish_update_sql = update_sql.format(gender_int, edu_int, id_index) cur.execute(finish_update_sql) Connection.commit() print(finish_update_sql) print(id_index, '的 性别编号 和 学历编号 信息已更新') cur.close() Connection.close() print('\n更新完毕')
def update_workplace(): """ 该函数主要是更新数据库officer_message表中的数据,主要是对工作地点(work_place)字段的更新, 将 当前工作的岗位(current_site) 中有工作地点的数据,将其中的工作地点提取出来,更新到表中 工作地点(work_place)字段里 :return: 无返回内容 """ Connection = getCon(database='cof', user='******', password='******', host='192.168.10.6') select_sql = "SELECT id_index, current_site FROM crawler.officer_message WHERE work_place ISNULL;" update_sql = "UPDATE crawler.officer_message SET current_site = '{0}', work_place = '{1}' WHERE id_index = '{2}';" cur = Connection.cursor() cur.execute(select_sql) for line in cur.fetchall(): id_index, current_site = line # print(line) work_place = '' new_current_site = current_site if '县' in current_site: work_place = current_site[: current_site.find('县') + 1] new_current_site = current_site[current_site.find('县') + 1:] elif '区' in current_site: work_place = current_site[: current_site.find('区') + 1] new_current_site = current_site[current_site.find('区') + 1:] elif '市' in current_site: work_place = current_site[: current_site.find('市') + 1] new_current_site = current_site[current_site.find('市') + 1:] elif '省' in current_site: work_place = current_site[: current_site.find('省') + 1] new_current_site = current_site[current_site.find('省') + 1:] if len(work_place) > 10: if '省' in current_site: work_place = current_site[: current_site.find('省') + 1] new_current_site = current_site[current_site.find('省') + 1:] finish_update_sql = update_sql.format(new_current_site, work_place, id_index) print(finish_update_sql) cur.execute(finish_update_sql) Connection.commit() print(id_index, '的 工作地 信息已更新') cur.close() Connection.close() print('\n更新完毕')
def update_head_image(): """ 该函数主要是更新数据库officer_message表中的数据,主要是对头像图片(head_image)字段的更新, 根据 数据库已存的头像链接字段(head_image_url),爬取图片并将已二进制写入数据库,更新字段 头像图片(head_image) :return: 无返回内容 """ Connection = getCon(database='cof', user='******', password='******', host='192.168.10.6') select_sql = "SELECT id_index, head_image_url FROM crawler.officer_message WHERE officer_message.head_image ='';" update_sql = "UPDATE crawler.officer_message SET head_image = {0} WHERE id_index = '{1}';" cur = Connection.cursor() cur.execute(select_sql) for line in cur.fetchall(): id_index, head_image_url = line head_image = get_html_by_url(head_image_url) finish_update_sql = update_sql.format(psycopg2.Binary(head_image), id_index) print(finish_update_sql) cur.execute(finish_update_sql) Connection.commit() print(id_index, "的头像已存入") cur.close() Connection.close()
def update_age(): """ 该函数主要是更新数据库officer_message表中的数据,主要是对年龄(age)字段的更新, 根据 字段出生日期(date_of_birth) 计算出对应的年龄,并更新字段 年龄(age) :return: 无返回内容 """ Connection = getCon(database='cof', user='******', password='******', host='192.168.10.6') select_sql = "SELECT id_index, date_of_birth FROM crawler.officer_message WHERE age ISNULL;" update_sql = "UPDATE crawler.officer_message SET age = '{0}' WHERE id_index = '{1}';" cur = Connection.cursor() cur.execute(select_sql) for line in cur.fetchall(): officer_id, date_of_birth = line date = re.findall('\d\d\d\d', date_of_birth) print(date) if len(date) != 0: age = time.gmtime().tm_year - int(date[0]) # print(age) finish_update_sql = update_sql.format(str(age), officer_id) cur.execute(finish_update_sql) Connection.commit() cur.close() Connection.close()
def update_relative_person_links(file): """ 该函数主要是更新数据库officer_message表中的数据,主要是对 相关人物链接(relative_person_links)字段的更新 根据 原csv文件中 相关人物链接的数据 更新字段 :param file: 要更新的原csv文件名 eg:'../data/官员信息-1-14.csv' :return: 无返回内容 注:后期应该添加到主模块中 """ Connection = getCon(database='cof', user='******', password='******', host='192.168.10.6') update_sql = "UPDATE crawler.officer_message SET relative_person_links = '{0}' WHERE id_index = '{1}';" cur = Connection.cursor() # file = '../data/官员信息-1-14.csv' for line in getData(file): if len(line[8]) != 0: print('开始更新:' + line[0]) finish_update_sql = update_sql.format(re.sub('\[|\]|\'', '', str(line[8])), line[0]) print(finish_update_sql) cur.execute(finish_update_sql) Connection.commit() print(line[0], '的 相关链接 信息已更新') cur.close() Connection.close() print('\n更新完毕')
def update_city(): """ 该函数主要是更新将数据库officer_message表中的数据,主要是对 四个直辖市的更新, 将 省(province) 城市(city)的字段都更新为所在地的直辖市名字 :return: 无返回 """ Connection = getCon(database='cof', user='******', password='******', host='192.168.10.6') select_sql = "SELECT id_index, province, city, place_of_birth FROM crawler.officer_message WHERE province='';" update_sql = "UPDATE crawler.officer_message SET province = '{0}', city = '{1}' WHERE id_index = '{2}';" cur = Connection.cursor() cur.execute(select_sql) for line in cur.fetchall(): id_index, province, city, place_of_birth = line for super_city in super_cities: if super_city in place_of_birth: province = super_city city = super_city finish_update_sql = update_sql.format(province, city, id_index) cur.execute(finish_update_sql) Connection.commit() print(finish_update_sql) print(id_index, '的城市信息已更新') cur.close() Connection.close()
if sp_start_time[1] > 12: sp_start_time[1] = 1 sp_start_time[2] = 1 if sp_end_time[1] > 12: sp_end_time[1] = 1 sp_end_time[2] = 1 if sp_start_time[2] > 31: sp_start_time[2] = 1 if sp_end_time[2] > 31: sp_end_time[2] = 1 return sp_start_time, sp_end_time, time_length if __name__ == '__main__': Connection = getCon(database='cof', user='******', password='******', host='192.168.10.6') select_sql = "SELECT id_index, time_and_job, message_source FROM crawler.officer_message WHERE head_image = '';" insert_sql = "INSERT INTO crawler.officer_resume (id_index, id_index_n, start_time, end_time, resume, " \ "message_source, work_age) VALUES ('{0}', '{1}', '{2}', '{3}'" \ ", '{4}', '{5}', '{6}') " cur = Connection.cursor() cur.execute(select_sql) for line in cur.fetchall(): id_index, time_and_job, message_source = line print('开始往数据库写入' + id_index + '的履历信息') split_time_job = time_and_job.split(', ') # print(split_time_job, len(split_time_job)) # try: for i in range(int((len(split_time_job) + 1) / 2)): time = split_time_job[i * 2]