Пример #1
0
    def __init__(self,
                 use_redis=False,
                 debug=False,
                 file_name_head="",
                 filename="",
                 analysis=False):
        """
        Set up the result-file paths and, unless ``analysis`` is set, load the data.

        :param use_redis: accepted but not used inside this constructor
        :param debug: stored on ``self.debug``
        :param file_name_head: inserted between ``BASE_DIR`` and ``'/data/result/'``
            to form the result directory
        :param filename: stored on ``self.filename``; when it is empty and
            ``file_name_head`` is not, ``mood_data.csv`` inside the result
            directory is used instead
        :param analysis: when true, ``read_data_from_csv()`` is not called here
        """
        self.debug = debug
        self.df = None
        self.filename = filename
        self.file_name_head = file_name_head

        result_dir = BASE_DIR + file_name_head + '/data/result/'
        util.check_dir_exist(result_dir)
        self.CMT_RESULT_NAMES = result_dir + 'cmt_result_names.csv'
        self.N_E_FILE_NAME = result_dir + 'n_E_mood_data.csv'

        # Fall back to the default CSV only when no explicit file was given
        # and a file name head is known.
        use_default_file = filename == '' and file_name_head != ''
        if use_default_file:
            self.filename = result_dir + 'mood_data.csv'

        if analysis:
            return
        self.read_data_from_csv()
Пример #2
0
    def __init__(self,
                 use_redis=False,
                 debug=False,
                 analysis=False,
                 recover=False,
                 username='',
                 mood_begin=0,
                 mood_num=-1,
                 stop_time='-1',
                 from_web=True,
                 nickname='',
                 no_delete=True,
                 cookie_text='',
                 export_excel=False,
                 export_csv=True):
        """
        Initialise the parent spider, then set up friend-related paths and state.

        :param use_redis: forwarded positionally to ``QQZoneSpider.__init__``
        :param debug: forwarded positionally to ``QQZoneSpider.__init__``
        :param analysis: when it equals ``False`` and ``self.g_tk`` is 0,
            ``login()`` is called
        :param recover: forwarded to ``QQZoneSpider.__init__``
        :param username: forwarded to ``QQZoneSpider.__init__``
        :param mood_begin: forwarded to ``QQZoneSpider.__init__``
        :param mood_num: forwarded to ``QQZoneSpider.__init__``
        :param stop_time: forwarded to ``QQZoneSpider.__init__``
        :param from_web: forwarded to ``QQZoneSpider.__init__``
        :param nickname: forwarded to ``QQZoneSpider.__init__``
        :param no_delete: forwarded to ``QQZoneSpider.__init__``
        :param cookie_text: forwarded to ``QQZoneSpider.__init__``
        :param export_excel: stored on ``self.export_excel``
        :param export_csv: stored on ``self.export_csv``
        """
        spider_options = dict(recover=recover,
                              username=username,
                              mood_num=mood_num,
                              mood_begin=mood_begin,
                              stop_time=stop_time,
                              from_web=from_web,
                              nickname=nickname,
                              no_delete=no_delete,
                              cookie_text=cookie_text)
        QQZoneSpider.__init__(self, use_redis, debug, **spider_options)

        # NOTE: the comparison is kept as ``== False`` (rather than
        # ``not analysis``) so that values such as None behave as before.
        login_needed = self.g_tk == 0 and analysis == False
        if login_needed:
            self.login()

        head = self.file_name_head
        friend_prefix = BASE_DIR + 'friend/' + head
        self.FRIEND_LIST_FILE_NAME = friend_prefix + '_friend_list.json'
        self.FRIEND_DETAIL_FILE_NAME = friend_prefix + '_friend_detail.json'
        self.FRIEND_DETAIL_LIST_FILE_NAME = friend_prefix + '_friend_detail_list.csv'
        self.FRIEND_DETAIL_EXCEL_FILE_NAME = friend_prefix + '_friend_detail_list.xlsx'

        # Avatars are downloaded into the web app's static folder so that
        # the web pages can reference them.
        self.FRIEND_HEADER_IMAGE_PATH = '../web/static/image/header/' + head + '/'
        util.check_dir_exist(self.FRIEND_HEADER_IMAGE_PATH)

        self.friend_detail, self.friend_list = [], []
        self.friend_df = pd.DataFrame()
        self.re = self.connect_redis()
        self.friend_thread_list = []
        self.export_excel, self.export_csv = export_excel, export_csv
Пример #3
0
 def init_log(self):
     """
     Return the logger named ``'log'``, writing to a timestamped file.

     The log file is ``<BASE_DIR>/logs/<get_now_time()>.log``; it is opened
     in append mode with UTF-8 encoding and the ``LOGGING_FORMAT`` layout.

     ``logging.getLogger('log')`` hands back the same logger object on every
     call, so a handler for a given file is attached only once. Without that
     guard each call would add another handler and every record would be
     written repeatedly.

     :return: the configured ``logging.Logger``
     """
     logging_dir = os.path.join(BASE_DIR, "logs/")
     if self.debug:
         print("logging_dir:", logging_dir)
     check_dir_exist(logging_dir)

     logger = logging.getLogger('log')
     logger.setLevel(logging.INFO)
     log_path = logging_dir + get_now_time() + ".log"

     # FileHandler stores the absolute path of its file in ``baseFilename``.
     wanted = os.path.abspath(log_path)
     already_attached = any(
         isinstance(handler, logging.FileHandler)
         and handler.baseFilename == wanted
         for handler in logger.handlers)
     if already_attached:
         return logger

     # Known issue noted by the original author: the log cannot be split
     # per day. A TimedRotatingFileHandler (suffix "%Y%m%d.log") is the
     # candidate replacement; reference:
     # https://blog.csdn.net/weixin_38107388/article/details/90639151
     fh = logging.FileHandler(log_path, encoding='utf-8', mode='a')
     fh.setLevel(logging.INFO)
     fh.setFormatter(logging.Formatter(LOGGING_FORMAT))
     logger.addHandler(fh)
     return logger
Пример #4
0
 def init_log(self):
     """
     Return the logger named ``'log'``, writing to the per-user log file.

     The file is ``<self.USER_BASE_DIR>log/<self.username>.log`` (UTF-8).
     The logger level is INFO; the file handler level is DEBUG.

     ``logging.getLogger('log')`` hands back the same logger object on every
     call, so the handler for a given file is attached only once. Without
     that guard each call would add another handler for the same file and
     every record would be written repeatedly.

     :return: the configured ``logging.Logger``
     """
     import os  # local import: only needed for the duplicate-handler check

     logging_dir = self.USER_BASE_DIR + 'log/'
     if self.debug:
         print("logging_dir:", logging_dir)
     util.check_dir_exist(logging_dir)

     log_path = logging_dir + self.username + '.log'
     logger = logging.getLogger('log')
     logger.setLevel(logging.INFO)

     # FileHandler stores the absolute path of its file in ``baseFilename``.
     wanted = os.path.abspath(log_path)
     already_attached = any(
         isinstance(handler, logging.FileHandler)
         and handler.baseFilename == wanted
         for handler in logger.handlers)
     if already_attached:
         return logger

     formatter = logging.Formatter(
         '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s'
     )
     fh = logging.FileHandler(log_path, encoding='utf-8')
     fh.setLevel(logging.DEBUG)
     fh.setFormatter(formatter)
     logger.addHandler(fh)
     return logger
Пример #5
0
    def init_file_name(self):
        """
        Configure root logging and derive the per-user data, error and image paths.

        Everything is placed under ``BASE_DIR + self.username + '/'``, which is
        stored on ``self.USER_BASE_DIR``. The log, data, error and image
        directories are each passed to ``util.check_dir_exist``.
        """
        user_dir = BASE_DIR + self.username + '/'
        self.USER_BASE_DIR = user_dir

        log_dir = user_dir + 'log/'
        print("logging_dir:", log_dir)
        util.check_dir_exist(log_dir)
        log_format = '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s'
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            datefmt='%a, %d %b %Y %H:%M:%S',
                            filename=log_dir + self.username + '.log',
                            filemode='w+')
        logging.info('file_name_head:' + self.username)

        # Downloaded data files.
        data_dir = user_dir + 'data/'
        self.CONTENT_FILE_NAME = data_dir + 'QQ_content.json'
        self.LIKE_DETAIL_FILE_NAME = data_dir + 'QQ_like_detail.json'
        self.LIKE_LIST_NAME_FILE_NAME = data_dir + 'QQ_like_list_name.json'
        self.MOOD_DETAIL_FILE_NAME = data_dir + 'QQ_mood_detail.json'

        # Error records (.json) and the matching unikey lists (.txt).
        error_dir = user_dir + 'error/'
        self.ERROR_LIKE_DETAIL_FILE_NAME = error_dir + 'QQ_like_detail_error.json'
        self.ERROR_LIKE_LIST_NAME_FILE_NAME = error_dir + 'QQ_like_list_name_error.json'
        self.ERROR_MOOD_DETAIL_FILE_NAME = error_dir + 'QQ_mood_detail_error.json'
        self.ERROR_LIKE_DETAIL_UNIKEY_FILE_NAME = error_dir + 'QQ_like_detail_error_unikey.txt'
        self.ERROR_LIKE_LIST_NAME_UNIKEY_FILE_NAME = error_dir + 'QQ_like_list_error_unikey.txt'
        self.ERROR_MOOD_DETAIL_UNIKEY_FILE_NAME = error_dir + 'QQ_mood_detail_error_unikey.txt'

        # Image directories.
        self.SMALL_IMAGE_DIR = user_dir + 'qq_image/'
        self.BIG_IMAGE_DIR = user_dir + 'qq_big_image/'

        for directory in (data_dir, error_dir,
                          self.SMALL_IMAGE_DIR, self.BIG_IMAGE_DIR):
            util.check_dir_exist(directory)
        print("Init file Name Finish:", self.USER_BASE_DIR)
Пример #6
0
    def init_analysis_path(self):
        """
        Derive the paths used by the analysis step.

        Friend files are built from ``BASE_DIR + self.username``; result,
        label and image paths are built from ``self.USER_BASE_DIR``. The
        result, label and image directories are passed to
        ``util.check_dir_exist``.
        """
        friend_prefix = BASE_DIR + self.username + '/friend/'
        self.friend_dir = friend_prefix + 'friend_detail_list.csv'
        self.history_like_agree_file_name = friend_prefix + 'history_like_list.json'

        user_dir = self.USER_BASE_DIR
        result_dir = user_dir + "data/result/"
        label_dir = user_dir + "data/label/"

        self.MOOD_DATA_FILE_NAME = result_dir + 'mood_data.csv'
        self.MOOD_DATA_EXCEL_FILE_NAME = result_dir + 'mood_data.xlsx'

        self.LABEL_FILE_CSV = label_dir + 'label_data.csv'
        self.LABEL_FILE_EXCEL = label_dir + 'label_data.xlsx'

        self.label_path = user_dir + 'data/label/'
        self.image_path = user_dir + 'image/'

        # label_dir and self.label_path are the same location; both calls are
        # kept so the sequence of directory checks is unchanged.
        for directory in (result_dir, label_dir,
                          self.label_path, self.image_path):
            util.check_dir_exist(directory)
Пример #7
0
    def init_file_name(self):
        """
        Derive the data, error and image paths under ``self.USER_BASE_DIR``.

        Logs ``self.file_name_head`` first, then stores every path on the
        instance and passes the four directories to ``util.check_dir_exist``.
        """
        logging.info('file_name_head:' + self.file_name_head)

        user_dir = self.USER_BASE_DIR

        # Downloaded data files.
        data_dir = user_dir + 'data/'
        self.CONTENT_FILE_NAME = data_dir + 'QQ_content.json'
        self.LIKE_DETAIL_FILE_NAME = data_dir + 'QQ_like_detail.json'
        self.LIKE_LIST_NAME_FILE_NAME = data_dir + 'QQ_like_list_name.json'
        self.MOOD_DETAIL_FILE_NAME = data_dir + 'QQ_mood_detail.json'

        # Error records (.json) and the matching unikey lists (.txt).
        error_dir = user_dir + 'error/'
        self.ERROR_LIKE_DETAIL_FILE_NAME = error_dir + 'QQ_like_detail_error.json'
        self.ERROR_LIKE_LIST_NAME_FILE_NAME = error_dir + 'QQ_like_list_name_error.json'
        self.ERROR_MOOD_DETAIL_FILE_NAME = error_dir + 'QQ_mood_detail_error.json'
        self.ERROR_LIKE_DETAIL_UNIKEY_FILE_NAME = error_dir + 'QQ_like_detail_error_unikey.txt'
        self.ERROR_LIKE_LIST_NAME_UNIKEY_FILE_NAME = error_dir + 'QQ_like_list_error_unikey.txt'
        self.ERROR_MOOD_DETAIL_UNIKEY_FILE_NAME = error_dir + 'QQ_mood_detail_error_unikey.txt'

        # Image directories.
        self.SMALL_IMAGE_DIR = user_dir + 'qq_image/'
        self.BIG_IMAGE_DIR = user_dir + 'qq_big_image/'

        for directory in (data_dir, error_dir,
                          self.SMALL_IMAGE_DIR, self.BIG_IMAGE_DIR):
            util.check_dir_exist(directory)
        print("Init file Name Finish:", self.USER_BASE_DIR)
Пример #8
0
    def init_file_name(self):
        """
        Initialise every file and directory name used for this user.

        Sets ``self.USER_BASE_DIR`` to ``BASE_DIR + self.username + '/'``,
        configures root logging to a file inside its ``log/`` folder, stores
        the data, error, image, friend and web-static paths on the instance,
        passes the directories to ``util.check_dir_exist`` and finally calls
        ``init_analysis_path()``.

        :return: None
        """
        user_dir = BASE_DIR + self.username + '/'
        self.USER_BASE_DIR = user_dir

        log_dir = user_dir + 'log/'
        if self.debug:
            print("logging_dir:", log_dir)
        util.check_dir_exist(log_dir)
        log_format = '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s'
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            datefmt='%a, %d %b %Y %H:%M:%S',
                            filename=log_dir + self.username + '.log',
                            filemode='w+')
        logging.info('file_name_head:' + self.username)

        # Downloaded data files.
        data_dir = user_dir + 'data/'
        self.CONTENT_FILE_NAME = data_dir + 'QQ_content.json'
        self.LIKE_DETAIL_FILE_NAME = data_dir + 'QQ_like_detail.json'
        self.LIKE_LIST_NAME_FILE_NAME = data_dir + 'QQ_like_list_name.json'
        self.MOOD_DETAIL_FILE_NAME = data_dir + 'QQ_mood_detail.json'

        # Error records (.json) and the matching unikey lists (.txt).
        error_dir = user_dir + 'error/'
        self.ERROR_LIKE_DETAIL_FILE_NAME = error_dir + 'QQ_like_detail_error.json'
        self.ERROR_LIKE_LIST_NAME_FILE_NAME = error_dir + 'QQ_like_list_name_error.json'
        self.ERROR_MOOD_DETAIL_FILE_NAME = error_dir + 'QQ_mood_detail_error.json'
        self.ERROR_LIKE_DETAIL_UNIKEY_FILE_NAME = error_dir + 'QQ_like_detail_error_unikey.txt'
        self.ERROR_LIKE_LIST_NAME_UNIKEY_FILE_NAME = error_dir + 'QQ_like_list_error_unikey.txt'
        self.ERROR_MOOD_DETAIL_UNIKEY_FILE_NAME = error_dir + 'QQ_mood_detail_error_unikey.txt'

        # Image directories.
        self.SMALL_IMAGE_DIR = user_dir + 'qq_image/'
        self.BIG_IMAGE_DIR = user_dir + 'qq_big_image/'
        for directory in (data_dir, error_dir,
                          self.SMALL_IMAGE_DIR, self.BIG_IMAGE_DIR):
            util.check_dir_exist(directory)

        # Friend files.
        util.check_dir_exist(user_dir)
        friend_dir = user_dir + 'friend/'
        self.FRIEND_LIST_FILE_NAME = friend_dir + 'friend_list.json'
        self.FRIEND_DETAIL_FILE_NAME = friend_dir + 'friend_detail.json'
        self.FRIEND_DETAIL_LIST_FILE_NAME = friend_dir + 'friend_detail_list.csv'
        self.FRIEND_DETAIL_EXCEL_FILE_NAME = friend_dir + 'friend_detail_list.xlsx'

        # Avatars are downloaded into the web app's static folder so that
        # the web pages can reference them.
        web_image_dir = BASE_PATH + '/src/web/static/image/' + self.username + '/'
        self.FRIEND_HEADER_IMAGE_PATH = web_image_dir + 'header/'
        self.web_image_bash_path = web_image_dir
        util.check_dir_exist(friend_dir)
        util.check_dir_exist(self.FRIEND_HEADER_IMAGE_PATH)

        self.init_analysis_path()
        if self.debug:
            print("Init file Name Finish:", self.USER_BASE_DIR)
Пример #9
0
    def init_file_name(self):
        """
        Initialise every file and directory name used for this user.

        Sets ``self.USER_BASE_DIR`` to ``BASE_DIR + self.username + '/'``,
        obtains a logger from ``init_log()`` (stored on ``self.logging``),
        stores the data, error, image, friend and web-static paths on the
        instance, passes the directories to ``util.check_dir_exist`` and
        finally calls ``init_analysis_path()``.

        :return: None
        """
        user_dir = BASE_DIR + self.username + '/'
        # Must be assigned before init_log() is called.
        self.USER_BASE_DIR = user_dir

        self.logging = self.init_log()
        self.logging.info('file_name_head:' + self.username)

        # Downloaded data files.
        data_dir = user_dir + 'data/'
        self.CONTENT_FILE_NAME = data_dir + 'QQ_content.json'
        self.LIKE_DETAIL_FILE_NAME = data_dir + 'QQ_like_detail.json'
        self.LIKE_LIST_NAME_FILE_NAME = data_dir + 'QQ_like_list_name.json'
        self.MOOD_DETAIL_FILE_NAME = data_dir + 'QQ_mood_detail.json'

        # Error records (.json) and the matching unikey lists (.txt).
        error_dir = user_dir + 'error/'
        self.ERROR_LIKE_DETAIL_FILE_NAME = error_dir + 'QQ_like_detail_error.json'
        self.ERROR_LIKE_LIST_NAME_FILE_NAME = error_dir + 'QQ_like_list_name_error.json'
        self.ERROR_MOOD_DETAIL_FILE_NAME = error_dir + 'QQ_mood_detail_error.json'
        self.ERROR_LIKE_DETAIL_UNIKEY_FILE_NAME = error_dir + 'QQ_like_detail_error_unikey.txt'
        self.ERROR_LIKE_LIST_NAME_UNIKEY_FILE_NAME = error_dir + 'QQ_like_list_error_unikey.txt'
        self.ERROR_MOOD_DETAIL_UNIKEY_FILE_NAME = error_dir + 'QQ_mood_detail_error_unikey.txt'

        # Image directories.
        self.SMALL_IMAGE_DIR = user_dir + 'qq_image/'
        self.BIG_IMAGE_DIR = user_dir + 'qq_big_image/'
        for directory in (data_dir, error_dir,
                          self.SMALL_IMAGE_DIR, self.BIG_IMAGE_DIR):
            util.check_dir_exist(directory)

        # Friend files.
        util.check_dir_exist(user_dir)
        friend_dir = user_dir + 'friend/'
        self.FRIEND_LIST_FILE_NAME = friend_dir + 'friend_list.json'
        self.FRIEND_DETAIL_FILE_NAME = friend_dir + 'friend_detail.json'
        self.FRIEND_DETAIL_LIST_FILE_NAME = friend_dir + 'friend_detail_list.csv'
        self.FRIEND_DETAIL_EXCEL_FILE_NAME = friend_dir + 'friend_detail_list.xlsx'

        # Avatars are downloaded into the web app's static folder so that
        # the web pages can reference them.
        web_image_dir = BASE_PATH + '/src/web/static/image/' + self.username + '/'
        self.FRIEND_HEADER_IMAGE_PATH = web_image_dir + 'header/'
        self.web_image_bash_path = web_image_dir
        util.check_dir_exist(friend_dir)
        util.check_dir_exist(self.FRIEND_HEADER_IMAGE_PATH)

        self.init_analysis_path()
        if self.debug:
            print("Init file Name Finish:", self.USER_BASE_DIR)
Пример #10
0
 def __init__(self, username):
     """
     Store ``BASE_DIR + username + '/temp/'`` on ``self.temp_dir`` and pass
     it to ``check_dir_exist``.

     :param username: inserted between ``BASE_DIR`` and ``'/temp/'``
     """
     temp_dir = BASE_DIR + username + '/temp/'
     self.temp_dir = temp_dir
     check_dir_exist(temp_dir)
Пример #11
0
    def __init__(self, use_redis=False, debug=False, mood_begin=0, mood_num=-1, stop_time='-1',
                 download_small_image=False, download_big_image=False,
                 download_mood_detail=True, download_like_detail=True, download_like_names=True, recover=False,
                 cookie_text=None, from_web=False, username='', nickname='', no_delete=True, pool_flag='127.0.0.1'):
        """
        Store the crawl options, resolve the account, configure logging and load user info.

        :param use_redis: stored on ``self.use_redis``; when truthy,
            ``connect_redis()`` is called and its result stored on ``self.re``
        :param debug: stored on ``self.debug``
        :param mood_begin: stored on ``self.mood_begin``
        :param mood_num: stored on ``self.mood_num``
        :param stop_time: ``'-1'`` stores ``-1`` on ``self.stop_time``; any
            other value is converted with ``util.get_mktime``
        :param download_small_image: stored on ``self.download_small_image``
        :param download_big_image: stored on ``self.download_big_image``
        :param download_mood_detail: stored on ``self.download_mood_detail``
        :param download_like_detail: stored on ``self.download_like_detail``
        :param download_like_names: stored on ``self.download_like_names``
        :param recover: stored on ``self.recover``
        :param cookie_text: stored on ``self.cookie_text``
        :param from_web: when truthy, ``username`` and ``nickname`` are taken
            from the arguments; otherwise the account comes from
            ``get_username_password()``
        :param username: account name used when ``from_web`` is truthy
        :param nickname: nickname used when ``from_web`` is truthy
        :param no_delete: stored on ``self.no_delete``
        :param pool_flag: stored on ``self.pool_flag``
        """
        # Download options.
        self.mood_begin = mood_begin
        self.mood_num = mood_num
        self.recover = recover
        self.download_small_image = download_small_image
        self.download_big_image = download_big_image
        self.download_mood_detail = download_mood_detail
        self.download_like_detail = download_like_detail
        self.download_like_names = download_like_names
        self.thread_num = 5
        self.thread_list = []
        self.no_delete = no_delete
        if stop_time != '-1':
            self.stop_time = util.get_mktime(stop_time)
        else:
            self.stop_time = -1
        self.begin_time = datetime.datetime.now()
        self.host = 'https://user.qzone.qq.com'
        self.h5_host = 'h5.qzone.qq.com'
        self.http_host = 'http://user.qzone.qq.com'
        self.use_redis = use_redis
        self.debug = debug
        self.cookie_text = cookie_text
        self.pool_flag = pool_flag

        if from_web:
            self.username = username
            self.file_name_head = username
            self.nickname = nickname
        else:
            # The nickname used to be stored only as ``self.nick_name`` here,
            # while the end of this constructor reads ``self.nickname``; that
            # raised AttributeError whenever from_web was false.
            (self.username, self.password,
             self.file_name_head, self.nickname) = self.get_username_password()
        # Keep the older spelling available for any code that still reads it.
        self.nick_name = self.nickname

        self.mood_host = self.http_host + '/' + self.username + '/mood/'
        # While crawling friends' feeds, username is changed to the friend's
        # QQ number, so the original value is backed up here.
        self.raw_username = deepcopy(self.username)

        self.headers = {
            'host': 'user.qzone.qq.com',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.8',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:66.0) Gecko/20100101 Firefox/66.0',
            'connection': 'keep-alive'
        }
        # Same headers for the h5 host, differing only in the 'host' entry.
        self.h5_headers = deepcopy(self.headers)
        self.h5_headers['host'] = self.h5_host

        self.USER_BASE_DIR = BASE_DIR + self.username + '/'
        logging_dir = self.USER_BASE_DIR + 'log/'
        util.check_dir_exist(logging_dir)
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                            datefmt='%a, %d %b %Y %H:%M:%S',
                            filename=logging_dir + self.username + '.log',
                            filemode='w+')
        if use_redis:
            self.re = self.connect_redis()

        # Reuse previously saved user info when load() returns one,
        # otherwise start from a fresh UserInfo.
        self.user_info = UserInfo(self.username).load()
        if self.user_info is None:
            self.user_info = UserInfo(self.username)
        self.user_info.QQ = self.username
        self.user_info.nickname = self.nickname
Пример #12
0
 def check_dirs(self):
     """
     Pass the data directory and the image download directory to
     ``check_dir_exist``.
     """
     check_dir_exist(DATA_DIR)
     # os.path.join only inserts separators between separate arguments; the
     # previous single pre-concatenated argument made the call a no-op.
     check_dir_exist(os.path.join(BASE_DIR, "download_image/"))
Пример #13
0
 def test_check_dir(self):
     """Smoke test: ``check_dir_exist`` runs on ``<BASE_DIR>/test1`` without raising."""
     check_dir_exist(os.path.join(BASE_DIR, 'test1'))
Пример #14
0
 def __init__(self):
     """
     Pass ``self.temp_dir`` to ``check_dir_exist``.

     ``temp_dir`` is not assigned here, so it must already be available on
     the instance or class when the constructor runs.
     """
     temp_dir = self.temp_dir
     check_dir_exist(temp_dir)