# Imports used by the helpers in this module (os/json/requests plus the project's
# db_tool, env and data_model modules).
import json
import os

import requests

import data_model
import db_tool
import env


def download_pic():
    db_session = db_tool.Session(env.connect_str)
    models = db_session.query_all(data_model.CommentImage)
    guid_tuple_list = list(map(lambda o: (o.guid, o.pic_url), models))
    guid_list = list(set(map(lambda o: o[0], guid_tuple_list)))
    total_count = len(guid_list)
    for index in range(total_count):
        # Hard-coded resume offset so an interrupted run can be continued.
        if index < 4375:
            continue
        pics = list(filter(lambda o: o[0] == guid_list[index], guid_tuple_list))
        for i in range(len(pics)):
            file_name = '{0}.jpg'.format(str(i))
            abs_file_name = os.path.join('images', guid_list[index], file_name)
            # Skip files that already exist before issuing the request.
            if os.path.exists(abs_file_name):
                continue
            image_url = pics[i][1]
            try:
                image_stream = requests.get(url=image_url).content
            except Exception:
                print('Failed to download image {0}!'.format(image_url))
                with open('error_image_url.txt', 'a', encoding='utf-8') as err_f:
                    err_f.write(image_url + '\n')
                continue
            with open(abs_file_name, 'wb') as w:
                w.write(image_stream)
        print('Finished {0}/{1}'.format(index, total_count))

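# download_pic calls requests.get with no timeout, so a single stalled connection can hang
# the whole crawl. Below is a minimal sketch of a safer fetch helper; fetch_image_bytes and
# its retry/timeout defaults are hypothetical additions, not part of the original module.
def fetch_image_bytes(image_url, retries=3, timeout=10):
    """Return the raw bytes at image_url, retrying a few times before giving up."""
    for attempt in range(retries):
        try:
            response = requests.get(image_url, timeout=timeout)
            response.raise_for_status()
            return response.content
        except requests.RequestException:
            # Re-raise on the last attempt so the caller can log the failed URL.
            if attempt == retries - 1:
                raise
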
def make_pic_dir():
    db_session = db_tool.Session(env.connect_str)
    models = db_session.query_all(data_model.CommentImage)
    guid_list = list(map(lambda o: o.guid, models))
    print(len(guid_list))
    guid_list = set(guid_list)
    print(len(guid_list))
    for item in guid_list:
        # exist_ok keeps re-runs from crashing on folders that were already created.
        os.makedirs('images/{0}'.format(item), exist_ok=True)

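# make_pic_dir has to run before download_pic, because download_pic writes into the
# per-guid folders that make_pic_dir creates. A hypothetical driver illustrating that
# order (run_image_download is not part of the original source):
def run_image_download():
    make_pic_dir()
    download_pic()
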
def parse_list_page(src, des, page_index):
    with open('config/connect_str.txt', 'r', encoding='utf-8') as f:
        connect_str = f.readline().strip('\n')
    db_session = db_tool.Session(connect_str)
    for i in range(1, page_index + 1):
        filename = r'html_pages/{1}_{2}_page_{0}.html'.format(str(i), src, des)
        models = parse_list_page_to_data_model(file_name=filename)
        db_session.db_list_writer(models)
    db_session.close_session()

def error_image_parser():
    with open('error_image_url.txt', 'r', encoding='utf-8') as reader:
        lines = reader.readlines()
    urls = []
    for line in lines:
        if line == '\n':
            continue
        urls.append(line.strip('\n'))
    db_session = db_tool.Session(env.connect_str)
    models = db_session.query_image_models_by_urls(urls)
    for model in models:
        print(model.guid)
        print(model.pic_url)

def image_parser():
    db_session = db_tool.Session(env.connect_str)
    models = db_session.query_all(data_model.CommentDetailMerged)
    with_image_models = list(filter(lambda o: o.image_count > 0, models))
    for model in with_image_models:
        model: data_model.CommentDetailMerged
        image_models = []
        for url in model.image_urls.split(','):
            image_model = data_model.CommentImage()
            image_model.guid = model.guid
            image_model.pic_url = url
            image_models.append(image_model)
        db_session.db_list_writer(image_models)

def parse_comment_main():
    db_session = db_tool.Session(env.connect_str)
    file_names = os.listdir('comment_json_pages')
    # Files are named {product_id}_{page}.json; the summary counts only need page 1.
    file_names = list(filter(lambda o: o.split('_')[1] == '1.json', file_names))
    for file_name in file_names:
        abs_filename = os.path.join('comment_json_pages', file_name)
        with open(abs_filename, 'r', encoding='utf-8') as f:
            comment_dict: dict = json.loads(f.readline().strip('\n'))
        if 'iserror' in comment_dict.keys():
            continue
        model = data_model.CommentMain()
        model.product_id = int(file_name.split('_')[0])
        model.comment_count = comment_dict['count']['allNumber']
        model.good_count = comment_dict['count']['goodNumber']
        model.mid_count = comment_dict['count']['modeNumber']
        model.bad_count = comment_dict['count']['negativeNumber']
        model.with_photo_count = comment_dict['count']['photoNumber']
        db_session.db_writer(model)
    db_session.close_session()

def get_comment_guid_distinct():
    db_session = db_tool.Session(env.connect_str)
    models = db_session.query_all(data_model.CommentDetail)
    guid_set = set(map(lambda o: o.guid, models))
    total_count = len(guid_set)
    count = 0
    for guid in guid_set:
        distinct_models = list(filter(lambda o: o.guid == guid, models))
        product_id_list = list(map(lambda o: str(o.product_id), distinct_models))
        merged_model = data_model.CommentDetailMerged()
        merged_model.product_id_group = ','.join(product_id_list)
        # Copy every public attribute of the first duplicate onto the merged model.
        distinct_model = distinct_models[0]
        for prop in dir(distinct_model):
            if prop[0] == '_':
                continue
            setattr(merged_model, prop, getattr(distinct_model, prop))
        db_session.db_writer(merged_model)
        count = count + 1
        print('Finished {0}/{1}'.format(str(count), str(total_count)))

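# Copying every non-underscore name returned by dir() also picks up ORM helpers such as
# `metadata` and bound methods, not just column values. If the data_model classes are
# SQLAlchemy declarative models (an assumption here), the copy can be restricted to mapped
# columns; copy_mapped_columns is a hypothetical helper, not part of the original code.
from sqlalchemy import inspect

def copy_mapped_columns(source, target):
    """Copy only the mapped column values from one ORM instance onto another."""
    for column in inspect(source).mapper.columns:
        setattr(target, column.key, getattr(source, column.key))
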
def parse_comment_detail():
    db_session = db_tool.Session(env.connect_str)
    for json_file in os.listdir('comment_json_pages'):
        abs_filename = 'comment_json_pages/' + json_file
        with open(abs_filename, 'r', encoding='utf-8') as reader:
            json_line = reader.readline().strip('\n')
        try:
            comment_dict: dict = json.loads(json_line)
        except json.decoder.JSONDecodeError:
            print(json_file)
            continue
        if 'iserror' in comment_dict.keys():
            continue
        comments = comment_dict['dpList']
        models = []
        for comment in comments:
            model = data_model.CommentDetail()
            model.comment_source = comment['commentSource']
            model.dp_site = comment['DPSite']
            model.walk_aim = comment['walkAim']
            model.content = comment['DPContent']
            model.rating = comment['DPRating']
            model.image_count = comment['imageCount']
            model.dp_date = comment['DPDate']
            model.guid = comment['DPGuid']
            model.is_elite = comment['DPIsElite']
            model.item_name = comment['DPItemName']
            model.prize_jiangjin = comment['DPPrize_JiangJin']
            model.product_id = int(json_file.split('_')[0])
            model.user_level = comment['DPUserLevel']
            model.user_name = comment['DPUserName']
            model.vote_count = comment['DPVoteCount']
            model.image_urls = comment['DPImagesStr']
            models.append(model)
        db_session.db_list_writer(models)
        print('Finish {0}'.format(json_file))
    db_session.close_session()

def __init__(self, login_url):
    self.__browser = webdriver.Chrome()
    self.__db_session = db_tool.Session()
    self.__login(login_url=login_url)

import db_tool
import DataModel
from sklearn.feature_extraction.text import TfidfVectorizer

session = db_tool.Session()
models = session.query_all(DataModel.JianggeWeiboWordcut)
corpus = []
for model in models:
    model: DataModel.JianggeWeiboWordcut
    # Posts are stored as ';'-separated word segments; rejoin with spaces for the vectorizer.
    weibo_line = model.post_text.replace(';', ' ')
    corpus.append(weibo_line)

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)
# Note: scikit-learn 1.2+ replaces get_feature_names() with get_feature_names_out().
words = vectorizer.get_feature_names()

# One row of the TF-IDF matrix per post: keep the three highest-weighted terms for each.
for i in range(len(corpus)):
    tuple_list = []
    for j in range(len(words)):
        word = words[j]
        tfidf_result = tfidf_matrix[i, j]
        tuple_list.append((word, tfidf_result))
    sorted_tuple_list = sorted(tuple_list, key=lambda o: o[1], reverse=True)
    # print(sorted_tuple_list[:3][0])
    # The sort works on a list of tuples, so take the first element of each of the top-3
    # tuples, build a list of strings, and join them (either a loop or map works).
    result = ';'.join(list(map(lambda o: o[0], sorted_tuple_list[:3])))
    # print(result)
    new_model = DataModel.JianggeWeiboTfidfFilter()
    db_tool.model_setter(models[i], new_model)
    new_model.sid = models[i].sid
    new_model.tfidf = result
    session.db_writer(new_model)

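# The nested loop above reads every (post, term) pair through single-element sparse lookups,
# which gets slow for large vocabularies. Below is a sketch of the same top-3 extraction
# using each sparse row directly; numpy/argsort are standard calls, but the variable names
# are illustrative only and not part of the original script.
import numpy as np

for i in range(tfidf_matrix.shape[0]):
    row = tfidf_matrix.getrow(i).toarray().ravel()
    # Indices of the three largest TF-IDF weights for this post, highest first.
    top_indices = np.argsort(row)[::-1][:3]
    top_terms = ';'.join(words[j] for j in top_indices)
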