Example #1
import os

import requests

import data_model
import db_tool
import env


def download_pic():
    db_session = db_tool.Session(env.connect_str)
    models = db_session.query_all(data_model.CommentImage)
    guid_tuple_list = [(o.guid, o.pic_url) for o in models]
    # Deduplicate the guids while keeping an indexable list.
    guid_list = list({guid for guid, _ in guid_tuple_list})
    total_count = len(guid_list)
    for index in range(total_count):
        # Manual resume point: skip guids already handled by a previous run.
        if index < 4375:
            continue
        pics = [o for o in guid_tuple_list if o[0] == guid_list[index]]
        for i, (_, image_url) in enumerate(pics):
            file_name = '{0}.jpg'.format(i)
            abs_file_name = os.path.join('images', guid_list[index], file_name)
            # Check before downloading so existing files cost no request.
            if os.path.exists(abs_file_name):
                continue
            try:
                image_stream = requests.get(url=image_url, timeout=30).content
            except requests.RequestException:
                print('Image {0} could not be downloaded!'.format(image_url))
                with open('error_image_url.txt', 'a', encoding='utf-8') as err_f:
                    err_f.write(image_url + '\n')
                continue
            with open(abs_file_name, 'wb') as w:
                w.write(image_stream)
        print('Completed {0}/{1}'.format(index + 1, total_count))
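The hard-coded 4375 above is a manual resume point. A minimal sketch of deriving it from what is already on disk instead; the images/<guid>/ layout is taken from the code above, everything else is an assumption:

import os

def resume_index(guid_list):
    # Hypothetical helper: return the index of the first guid whose folder
    # is missing or still empty, assuming folders are filled in list order.
    for index, guid in enumerate(guid_list):
        folder = os.path.join('images', guid)
        if not os.path.isdir(folder) or not os.listdir(folder):
            return index
    return len(guid_list)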
Example #2
import os

import data_model
import db_tool
import env


def make_pic_dir():
    db_session = db_tool.Session(env.connect_str)
    models = db_session.query_all(data_model.CommentImage)
    guid_list = list(map(lambda o: o.guid, models))
    print(len(guid_list))   # row count before deduplication
    guid_list = set(guid_list)
    print(len(guid_list))   # distinct guid count
    for item in guid_list:
        # exist_ok lets the function be re-run without crashing.
        os.makedirs('images/{0}'.format(item), exist_ok=True)
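Presumably this runs before Example #1, since download_pic writes into the per-guid folders created here:

make_pic_dir()   # create one images/<guid>/ folder per distinct guid
download_pic()   # then download each guid's pictures into its folder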
Example #3
import db_tool


def parse_list_page(src, des, page_index):
    with open('config/connect_str.txt', 'r', encoding='utf-8') as f:
        connect_str = f.readline().strip('\n')
    db_session = db_tool.Session(connect_str)
    for i in range(1, page_index + 1):
        # Saved pages are named <src>_<des>_page_<i>.html.
        filename = 'html_pages/{1}_{2}_page_{0}.html'.format(i, src, des)
        # parse_list_page_to_data_model is defined elsewhere.
        models = parse_list_page_to_data_model(file_name=filename)
        db_session.db_list_writer(models)
    db_session.close_session()
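A hypothetical invocation, assuming src and des are the route tokens embedded in the saved filenames (e.g. shanghai_beijing_page_1.html); the actual values depend on how the pages were scraped:

parse_list_page(src='shanghai', des='beijing', page_index=20)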
Example #4
import db_tool
import env


def error_image_parser():
    with open('error_image_url.txt', 'r', encoding='utf-8') as reader:
        # Skip blank lines and strip the trailing newline from each URL.
        urls = [line.strip('\n') for line in reader if line != '\n']
    db_session = db_tool.Session(env.connect_str)
    models = db_session.query_image_models_by_urls(urls)
    for model in models:
        print(model.guid)
        print(model.pic_url)
Example #5
import data_model
import db_tool
import env


def image_parser():
    db_session = db_tool.Session(env.connect_str)
    models = db_session.query_all(data_model.CommentDetailMerged)
    with_image_models = list(filter(lambda o: o.image_count > 0, models))

    for model in with_image_models:
        image_models = []
        model: data_model.CommentDetailMerged  # annotation for editor type support
        # One CommentImage row per URL in the comma-separated list.
        for url in model.image_urls.split(','):
            image_model = data_model.CommentImage()
            image_model.guid = model.guid
            image_model.pic_url = url
            image_models.append(image_model)
        db_session.db_list_writer(image_models)
Example #6
import json
import os

import data_model
import db_tool
import env


def parse_comment_main():
    db_session = db_tool.Session(env.connect_str)
    file_names = os.listdir('comment_json_pages')
    # Keep only each product's first page (<product_id>_1.json).
    file_names = list(filter(lambda o: o.split('_')[1] == '1.json',
                             file_names))
    for file_name in file_names:
        abs_filename = os.path.join('comment_json_pages', file_name)
        with open(abs_filename, 'r', encoding='utf-8') as f:
            comment_dict: dict = json.loads(f.readline().strip('\n'))
        if 'iserror' in comment_dict:
            continue
        model = data_model.CommentMain()
        model.product_id = int(file_name.split('_')[0])
        model.comment_count = comment_dict['count']['allNumber']
        model.good_count = comment_dict['count']['goodNumber']
        model.mid_count = comment_dict['count']['modeNumber']
        model.bad_count = comment_dict['count']['negativeNumber']
        model.with_photo_count = comment_dict['count']['photoNumber']
        db_session.db_writer(model)
    db_session.close_session()
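For reference, the count block this parser reads implies a JSON payload shaped roughly like the following; the field names come from the code above, the values are made up:

example_payload = {
    'count': {
        'allNumber': 1234,     # total comments
        'goodNumber': 1100,    # positive
        'modeNumber': 100,     # neutral
        'negativeNumber': 34,  # negative
        'photoNumber': 56,     # comments with photos
    }
}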
Example #7
import data_model
import db_tool
import env


def get_comment_guid_distinct():
    db_session = db_tool.Session(env.connect_str)
    models = db_session.query_all(data_model.CommentDetail)
    guid_set = set(map(lambda o: o.guid, models))
    total_count = len(guid_set)
    count = 0
    for guid in guid_set:
        # All rows sharing a guid are copies of one comment that appears
        # under several product ids; merge them into a single row.
        distinct_models = list(filter(lambda o: o.guid == guid, models))
        product_id_list = list(
            map(lambda o: str(o.product_id), distinct_models))
        merged_model = data_model.CommentDetailMerged()
        merged_model.product_id_group = ','.join(product_id_list)
        distinct_model = distinct_models[0]
        # Copy every public attribute from the first copy; note that dir()
        # also returns methods and ORM helpers, not just column values.
        for prop in dir(distinct_model):
            if prop[0] == '_':
                continue
            setattr(merged_model, prop, getattr(distinct_model, prop))
        db_session.db_writer(merged_model)
        count += 1
        print('Finished {0}/{1}'.format(count, total_count))
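Copying attributes via dir() also picks up methods and ORM internals. If these models are SQLAlchemy declarative classes (which db_tool.Session suggests but does not confirm), a sketch that copies only the mapped columns:

from sqlalchemy import inspect

def copy_columns(source, target):
    # Copy only mapped column attributes, skipping relationships, methods,
    # and SQLAlchemy bookkeeping such as _sa_instance_state.
    for attr in inspect(type(source)).column_attrs:
        setattr(target, attr.key, getattr(source, attr.key))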
Example #8
import json
import os

import data_model
import db_tool
import env


def parse_comment_detail():
    db_session = db_tool.Session(env.connect_str)
    for json_file in os.listdir('comment_json_pages'):
        abs_filename = os.path.join('comment_json_pages', json_file)
        with open(abs_filename, 'r', encoding='utf-8') as reader:
            json_line = reader.readline().strip('\n')
        try:
            comment_dict: dict = json.loads(json_line)
        except json.decoder.JSONDecodeError:
            print(json_file)  # log the unparsable file and move on
            continue
        if 'iserror' in comment_dict:
            continue
        comments = comment_dict['dpList']
        models = []
        for comment in comments:
            model = data_model.CommentDetail()
            model.comment_source = comment['commentSource']
            model.dp_site = comment['DPSite']
            model.walk_aim = comment['walkAim']
            model.content = comment['DPContent']
            model.rating = comment['DPRating']
            model.image_count = comment['imageCount']
            model.dp_date = comment['DPDate']
            model.guid = comment['DPGuid']
            model.is_elite = comment['DPIsElite']
            model.item_name = comment['DPItemName']
            model.prize_jiangjin = comment['DPPrize_JiangJin']
            model.product_id = int(json_file.split('_')[0])
            model.user_level = comment['DPUserLevel']
            model.user_name = comment['DPUserName']
            model.vote_count = comment['DPVoteCount']
            model.image_urls = comment['DPImagesStr']
            models.append(model)
        db_session.db_list_writer(models)
        print('Finish {0}'.format(json_file))
    db_session.close_session()
Example #9
def __init__(self, login_url):
    # Method of a crawler class (shown without its class body): open a
    # Chrome window, connect to the database, and log in immediately.
    self.__browser = webdriver.Chrome()
    self.__db_session = db_tool.Session()
    self.__login(login_url=login_url)
Example #10
import db_tool
import DataModel
from sklearn.feature_extraction.text import TfidfVectorizer

session = db_tool.Session()
models = session.query_all(DataModel.JianggeWeiboWordcut)
corpus = []

for model in models:
    model: DataModel.JianggeWeiboWordcut  # annotation for editor type support
    # post_text holds ';'-separated word-cut tokens; the vectorizer wants spaces.
    weibo_line = model.post_text.replace(';', ' ')
    corpus.append(weibo_line)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)
words = vectorizer.get_feature_names_out()  # get_feature_names() before scikit-learn 1.0
for i in range(len(corpus)):  # iterate over documents (matrix rows), not words
    tuple_list = []
    for j in range(len(words)):
        word = words[j]
        tfidf_result = tfidf_matrix[i, j]
        tuple_list.append((word, tfidf_result))
    sorted_tuple_list = sorted(tuple_list, key=lambda o: o[1], reverse=True)
    # The sort produced a list of (word, score) tuples; keep the first element
    # of the top three tuples and join them into a ';'-separated string.
    result = ';'.join(map(lambda o: o[0], sorted_tuple_list[:3]))
    new_model = DataModel.JianggeWeiboTfidfFilter()
    db_tool.model_setter(models[i], new_model)
    new_model.sid = models[i].sid
    new_model.tfidf = result
    session.db_writer(new_model)
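The per-cell loop above is O(documents × vocabulary). A sketch of the same top-3 extraction done row-wise with numpy, on a toy corpus; scikit-learn and numpy are the only assumptions:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ['jiangge weibo one', 'weibo two jiangge', 'three weibo']
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)
words = vectorizer.get_feature_names_out()
for row in tfidf_matrix.toarray():
    top = np.argsort(row)[::-1][:3]  # indices of the 3 highest scores
    print(';'.join(words[j] for j in top if row[j] > 0))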