def merge_from_others():
    # db_host = database.Mysql('root', '990211', 'data_policy', use_flow=True)
    # db_host_r = database.Mysql('root', '990211', 'data_policy')
    db_server_w = database.Mysql('root', '990211', 'dbpolicy', host='121.36.22.40')
    db_server_r = database.Mysql('root', '990211', 'dbpolicy', host='121.36.22.40', use_flow=True)
    i = 0
    for line in db_server_r.select('data'):
        if i % 10 == 0:  # progress marker every 10 rows
            print(i)
        i += 1
        # default to '人民政府' when the source row has no issuing government
        gov = line['gov'] if line['gov'] else '人民政府'
        date = line['date'] if line['date'] else None
        insert_dict = {
            'code': line['code'],
            'province': line['province'],
            'city': line['city'],
            'title': line['title'],
            'gov': gov,
            'date': date,
            'sub_url': line['sub_url'],
        }
        db_server_w.insert_one('links', insert_dict)
def spider(url):
    dbinfo = []
    mysql_s = database.Mysql()
    global video
    video_info = get_video_info(url)
    for i in video_info:
        real_url = get_video_true_url(i['display_url'])
        if real_url is None:
            break
        title = i['title']
        # build a fresh dict per item so inserted rows do not share state
        dic = {
            'title': title,
            'image_url': i['pc_image_url'],
            'play_count': str(i['video_play_count']),
            'ti': i['video_duration_format'],
            'video_url': real_url,
        }
        video.append([title, real_url, dic['play_count']])
        dbinfo.append(mysql_s.insertData("video", dic))
    log['video'] = video
    log['dbinfo'] = dbinfo
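# Usage sketch for spider(): it appends to the module-level `video` list and
# records results in the `log` dict, so both must exist before the call
# (the URL below is a hypothetical placeholder, not from the original code):
# video = []
# log = {}
# spider('http://example.com/video_feed')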
def update_from_local():
    db = database.Mysql('root', '990211', 'dbpolicy')
    with open('local_result.csv', 'r', encoding='utf-8-sig') as fr:
        csv_r = csv.reader(fr)
        for line in csv_r:
            insert_dict = {
                'code': line[0],
                'province': line[1],
                'city': line[2],
                'gov': line[3],
                'title': line[4],
                'date': line[5],
                'sub_url': line[6],
            }
            response = Request(insert_dict['sub_url']).text
            if not response:
                continue
            task = parse_context.MAIN_TEXT(insert_dict['sub_url'], response)
            try:
                result = task.main()
                insert_dict['main_text'] = result['content']
                insert_dict['attachment'] = ','.join(result['attachment'])
                insert_dict['img'] = ','.join(result['img'])
            except Exception:
                continue
            db.insert_one('data', insert_dict)
def migrations():
    db = database.Mysql('root', '990211', 'dbpolicy_web', host='121.36.22.40')
    with open('./data/configure_gov_v2.csv', 'r', encoding='utf-8') as fr:
        csv_r = csv.DictReader(fr)
        id = 0
        for line in csv_r:
            id += 1
            # '\ufeffcode': the first header keeps its BOM because the file
            # is opened as plain utf-8 rather than utf-8-sig
            loc_id = db.select('api_location', 'id',
                               "code={}".format(line['\ufeffcode']),
                               fetch_one=True)['id']
            insert_config = {
                'id': id,
                'gov': line['gov'],
                'target_url': line['target_url'],
                'is_active': test_url(line['target_url']),
                'item_pattern': line['item_pattern'],
                'main_text_pattern': line['main_text_pattern'],
                'date_pattern': line['date_type'],
                'zupei_pattern': line['zupei_type'],
                'source_pattern': line['source'],
                'title_pattern': line['title'],
                'next_pattern': line['next_button'],
                'action_pattern': line['action'],
                'loc_id': loc_id,
                'file_count': 0,
                'author_id': 1,
            }
            res = db.insert_one('api_config', insert_config)
            if not res:
                id -= 1  # reuse the id when the insert failed
def insertImagedb(images):
    mysql_s = database.Mysql()
    for image in images:
        print(image)
        mydict = {
            'title': image[0],
            'image_url': image[1],
            'url': image[2],
        }
        print(mydict)
        mysql_s.insertData("tb_images", mydict)
def insertNewstoDB():
    mysql_s = database.Mysql()
    data = getTopNewsimg()
    # mail out a plain-text dump of the scraped items; str() already renders
    # Unicode in Python 3, so no unicode-escape decoding is needed
    send_email.send(str(data))
    for news in data:
        print(news)
        mydict = {
            'title': news[1],
            'content': news[4],
            'url': news[2],
            'image_url': news[3],
        }
        print(mydict)
        mysql_s.insertData("tb_news", mydict)
def get_from_local():
    db = database.Mysql('root', '990211', 'data_policy')
    matrix = db.select('data')
    # the output file is opened once for the whole run
    with open('local_result.csv', 'a', encoding='utf-8-sig', newline='') as fa:
        csv_a = csv.writer(fa)
        for line in matrix:
            code = line[0]
            result = db.select('map_location',
                               condition='code="{}"'.format(code),
                               fetch_one=True)
            province = result[1]
            city = result[2]
            insert_list = [code, province, city] + list(line)[1:]
            csv_a.writerow(insert_list)
def get_video_url_range(begin, end):
    import urllib.request
    text = []
    table = "budejie_copy"
    mysql_s = database.Mysql()
    sortcontent = sort_content_tags.SortContentByTags()
    for page in range(begin, end):
        url = 'http://www.budejie.com/' + str(page)
        try:
            # request the page with a browser User-Agent to get past the
            # server's anti-crawler check
            req = urllib.request.Request(url)
            req.add_header(
                "User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")
            html = urllib.request.urlopen(req).read()
            soup = BeautifulSoup(html, 'html.parser')
            video_content = soup.find_all('div', class_=re.compile('j-video-c'))
            video_url = soup.find_all('div', class_=re.compile('^j-video$'))
            print(len(video_content))
            if len(video_content) != 0:
                # first pass: insert one row per video with its URLs
                for item in video_url:
                    dic = {
                        'video_url': str(item['data-mp4']).strip(),
                        'image_url': str(item['data-poster']).strip(),
                        'video_id': str(item['data-id']).strip(),
                        'title': " ",
                        'ti': " ",
                        'public_date': " ",
                    }
                    dbinfo = mysql_s.insertData(table, dic)
                    text.append(dbinfo)
                    print(dbinfo)
                # second pass: fill in title, date, duration and tag
                for item in video_content:
                    dic = {
                        'video_id': str(item['data-id']).strip(),
                        'public_date': str(item['data-date']).strip(),
                        'title': str(item['data-title']).strip(),
                        'ti': str(item['data-time']).strip(),
                    }
                    print(dic['title'])
                    dic['tag_id'] = sortcontent.gettagbytitle(dic['title'])
                    dbinfo = mysql_s.upData1(table, dic)
                    text.append(dbinfo)
        except Exception:
            print("No Web Pages")
    return text
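# Usage sketch: crawl list pages `begin` .. `end - 1` and collect the per-row
# DB results (the page range here is illustrative only):
# rows = get_video_url_range(1, 5)
# print(len(rows))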
def search():
    res_dict = list()
    db = database.Mysql('root', '990211', 'dbpolicy', host='121.36.22.40', use_flow=True)
    for policy in db.select('data'):
        title = policy['title']
        avi_label = is_satisfied(title)
        if avi_label:
            policy['label'] = ' '.join(avi_label)
            res_dict.append(policy)
    with open('data_with_label.csv', 'w', encoding='utf-8-sig', newline='') as fw:
        c_w = csv.DictWriter(fw, list(res_dict[0].keys()))
        c_w.writeheader()
        c_w.writerows(res_dict)
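# Contract assumed for the external is_satisfied() helper, inferred from its
# use above: it takes a title string and returns a list of matched label
# strings (empty or None when nothing matches), which search() joins with
# spaces into the 'label' column.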
import re
import ast
import requests
from bs4 import BeautifulSoup
import database
import time
import multiprocessing

head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"}
db = database.Mysql(
    '127.0.0.1', 'root', 'Mroot@123', 3306, 'mayi_news', 'utf8')[0]


def get_soup(url):
    data = requests.get(url, headers=head).content
    soup = BeautifulSoup(data, 'lxml')
    return soup


def verify_url(news_url):
    sql = "select url from news_tb where url='%s';" % news_url
    result = db.select_data(sql)
    if result == ():
        # assumption: an empty result set means the URL is not stored yet
        return True
    return False
class SortContentByTags:
    mysql_s = database.Mysql()
    tag_videos = [u"电影", u'视频', u'片', u'电视']  # 1002
    tag_society = [u'社会', u'交通', u'国家', u'人']  # 1003
    tag_entertain = [u'娱乐', u'休闲', u'乐']  # 1004
    tag_technology = [u'科技', u'数据', u'智能', u'科']  # 1005
    tag_car = [u'车', u'摩托', u'高铁']  # 1006
    tag_sports = [u'运动', u'体育', u'跑', u'跳', u'走']  # 1007
    tag_finance = [u'金融', u'钱', u'经济']  # 1008
    tag_military = [u'军', u'装备', u'机']  # 1009
    tag_global = [u'全球', u'世界', u'组织']  # 1010
    tag_episode = [u'片段', u'短片', u'集']  # 1011
    tag_funny_news = [u'笑', u'逗', u'哈']  # 1012
    tag_health = [u'健康', u'养生', u'保健', u'食物', u'医', u'药', u'病']  # 1013
    tag_shortvideo = [u'短视频', u'抖音', u'片段']  # 1014
    tag_animal = [u'动物', u'狗', u'猫', u'宠物']  # 1015
    tag_education = [u'教育', u'学', u'生']  # 1016
    tag_shopping = [u'购物', u'天猫', u'京东', u'网购', u'淘宝', u'支付宝']  # 1017
    tag_music = [u'音', u'唱片', u'歌', u'声']  # 1018
    tags_mapping = {
        '1002': tag_videos, '1003': tag_society, '1004': tag_entertain,
        '1005': tag_technology, '1006': tag_car, '1007': tag_sports,
        '1008': tag_finance, '1009': tag_military, '1010': tag_global,
        '1011': tag_episode, '1012': tag_funny_news, '1013': tag_health,
        '1014': tag_shortvideo, '1015': tag_animal, '1016': tag_education,
        '1017': tag_shopping, '1018': tag_music,
    }

    def __init__(self):
        print("initializing......")

    def gettagbytitle(self, title):
        # return the first tag id whose keyword list hits the title
        for key in self.tags_mapping:
            for tag in self.tags_mapping[key]:
                if tag in title:
                    print(title + " matched " + str(self.tags_mapping[key]) + " tagid: " + key)
                    return key
        print(title + " fell through to default tagid: 1001")
        return '1001'

    def printf(self, title_map_list):
        for key in title_map_list:
            print("title: " + key)
            print("matched tags: " + str(set(title_map_list[key])))

    def querybycontent_tagset(self, content):
        # collect every tag id whose keyword list hits the content;
        # '1001' is always included as the default tag
        tag_set = set()
        for key in self.tags_mapping:
            for tag in self.tags_mapping[key]:
                if tag in content:
                    print(content + " matched " + str(self.tags_mapping[key]) + " tagid: " + key)
                    tag_set.add(key)
        tag_set.add('1001')
        return tag_set

    def addtags_2users_by_analyse_comment(self):
        sql = "select user_id,content from tb_review_content"
        db_list = self.mysql_s.getData(sql)
        table = "tb_matching"
        for i in db_list:
            print(i[1])
            tag_set = self.querybycontent_tagset(i[1])
            print(tag_set)
            tag_ids = list(tag_set)
            while len(tag_ids) < 6:  # pad to exactly six tag slots
                tag_ids.append('')
            sql = ("update %s set tag_id1=%s, tag_id2=%s, tag_id3=%s, "
                   "tag_id4=%s, tag_id5=%s, tag_id6=%s where user_id=%s" % (
                       table,
                       '"' + tag_ids[0] + '"', '"' + tag_ids[1] + '"',
                       '"' + tag_ids[2] + '"', '"' + tag_ids[3] + '"',
                       '"' + tag_ids[4] + '"', '"' + tag_ids[5] + '"',
                       '"' + i[0] + '"'))
            self.mysql_s.upData(tag_ids, sql)
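# Usage sketch (titles are hypothetical; the single-tag result assumes the
# insertion-ordered dict iteration of Python 3.7+):
# sorter = SortContentByTags()
# sorter.gettagbytitle(u'智能手机评测')      # -> '1005' (hits u'智能')
# sorter.querybycontent_tagset(u'学跑步')    # -> {'1007', '1016', '1001'}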
def init_mysql(machine_name="dbpolicy", use_flow=False):
    return database.Mysql(MYSQL["user"], MYSQL["password"], MYSQL["database"],
                          host=MYSQL[machine_name], use_flow=use_flow)
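# The MYSQL settings dict this helper reads is assumed to look roughly like
# this (keys inferred from the lookups above; values are placeholders):
# MYSQL = {
#     "user": "root",
#     "password": "...",
#     "database": "dbpolicy",
#     "dbpolicy": "121.36.22.40",  # machine_name -> host mapping
# }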
"source_pattern": line['source'], 'title_pattern': line['title'], 'next_pattern': line['next_button'], 'action_pattern': line['action'], 'loc_id': loc_id, 'file_count': 0, 'author_id': 1, } res = db.insert_one('api_config', insert_config) if not res: id -= 1 if __name__ == '__main__': # search() # merge_from_others() # migrations() # get_from_local() # update_from_local() # migrant() db = database.Mysql('root', '990211', 'dbpolicy_web', host='121.36.22.40') for line in db.select('api_config', condition="is_active=0"): url = line['target_url'] response = Request(url, timeout=10).text if response: print(line) db.update( 'update api_config set is_active=1 where id = "{}"'.format( line['id'])) else: db.delete('api_config', 'id="{}"'.format(line['id']))