def search_one_account_passage_by_id(id):
    """Fetch all saved article links for one official account.

    :param id: primary key of the official account (wechat_account_list.id)
    :return: tuple of (title, passage_link, official_account_id) rows,
             or False on failure
    """
    # Open a connection and cursor for this query.
    db = connect()
    cursor = db.cursor()
    # Parameterized query: the previous str.format version was open to
    # SQL injection.
    sql = ("SELECT title, passage_link, official_account_id "
           "FROM passage_link_list WHERE official_account_id = %s")
    try:
        cursor.execute(sql, (id,))
        results = cursor.fetchall()
        for row in results:
            print(row)
        return results
    except Exception as e:
        db.rollback()  # 发生错误时回滚 -> roll back on error
        logger.error(str(e))
        logger.warning("Failed to search the history passage.")
        return False
    finally:
        # Always release the connection; the original leaked it on the
        # error path.
        cursor.close()
        db.close()
def start_scraping(self):
    """Run the configured scraping jobs (profiles and/or tweets) with a
    multiprocessing pool. Returns None without doing anything when no job
    is enabled or no user remains to scrape.
    """
    if not (self.scrape_profiles or self.scrape_tweets):
        logger.warning(f'Nothing to do. Did you forget "profiles" or "tweets" instruction?')
        return None
    if self.usersnames_df.empty:
        logger.warning(f'Nothing to do. Did you forget to set "all_users" or "users_list"? Or all users already exist?')
        return None

    # Never spawn more workers than there are users to process.
    pool_size = min(len(self.usersnames_df), self.n_processes)

    if self.scrape_profiles:
        self._populate_proxy_queue()
        print(self.usersnames_df)
        tasks = [(name,) for name in self.usersnames_df['username']]
        with mp.Pool(processes=pool_size) as pool:
            pool.starmap(self.scrape_a_user_profile, tasks)

    if self.scrape_tweets:
        self._populate_proxy_queue()
        if self.rescrape:
            # Rescrape mode: the dataframe carries one
            # (username, begin_date, end_date) period per row.
            tasks = [(name, begin, end)
                     for _, (name, begin, end) in self.usersnames_df.iterrows()]
        else:
            # Normal mode: scrape the whole configured session window.
            tasks = [(name, scraping_cfg.session_begin_date, scraping_cfg.session_end_date)
                     for name in self.usersnames_df['username']]
        with mp.Pool(processes=pool_size) as pool:
            pool.starmap(self.scrape_a_user_tweets, tasks)
def getindex(self):
    """Fetch the index page and derive the SHA1 login token from it.

    Retries with a fresh proxy on network failure, at most 3 attempts.

    :return: hex SHA1 digest of (page secret + password), or False on failure
    """
    try:
        index_info = self.sess.get(self.indexurl, headers=self.header,
                                   timeout=5, proxies=self.proxy)
        index_html = index_info.text
    except Exception:
        # Request failed: count the failure and retry with another proxy.
        self.count = self.count + 1
        logger.warning("代理ip不可用" + str(self.count) + "次")
        # Bug fix: the retry limit used to be checked only on the success
        # path, so repeated failures recursed without bound. Stop here.
        if self.count >= 3:
            return False
        self.proxy = server.get_proxy()
        return self.getindex()
    # The page embeds a UUID-like secret that ends with '-'.
    pattern = re.compile(
        '[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12}-')
    try:
        sec_str = pattern.findall(index_html)
        if not sec_str:
            # findall returns a list; an empty list means the secret was not
            # found (the old `== None` test could never be true).
            return False
        hash_str = sec_str[0] + self.password
        sha1 = hashlib.sha1()
        sha1.update(hash_str.encode('utf-8'))
        return sha1.hexdigest()
    except Exception:
        return False
def insert_wechat_account(official_account_name):
    """Insert a new WeChat official account and return its generated id.

    :param official_account_name: display name of the account
    :return: the new account id, or False on failure
    """
    db = connect()
    cursor = db.cursor()
    insert_sql = ("INSERT INTO wechat_account_list (official_account_name) "
                  "VALUES (%s)")
    # Parameterized: the old str.format SELECT was open to SQL injection.
    select_sql = ("SELECT id FROM wechat_account_list "
                  "WHERE official_account_name = %s")
    try:
        # Note the trailing comma: (x,) is a 1-tuple; the old (x) was a
        # bare string.
        cursor.execute(insert_sql, (official_account_name,))
        cursor.execute(select_sql, (official_account_name,))
        db.commit()
        results = cursor.fetchall()
        cursor.close()
        db.close()
        official_account_id = results[0][0]
        return official_account_id
    except Exception as e:
        db.rollback()  # 发生错误时回滚 -> roll back the insert on error
        logger.error(str(e))
        logger.warning("Failed to add a official account.")
        return False
def save_excel_file(path, dataframes, sheet_names=None, extension=".xlsx", check_path=False,
                    to_excel_kwargs=None, **options):
    """Write dataframes to disk.

    :param path: Excel file path (to be checked before with 'save_file' function).
    :param dataframes: dataframe or list of dataframes
    :param sheet_names: sheet name or ordered list of sheet names
    :param extension: Excel file extension
    :param check_path: check path with save_file
    :param to_excel_kwargs: keyword arguments for _write_excel function
    :param options: options for save_file function
    :return: path
    """
    if check_path:
        path = save_file(path, extension=extension, **options)
    # Accept a single dataframe as well as a list.
    if isinstance(dataframes, pd.DataFrame):
        dataframes = [dataframes]
    length = len(dataframes)
    if sheet_names is None:
        sheet_names = ["Sheet{}".format(i + 1) for i in range(length)]
    elif isinstance(sheet_names, str):
        sheet_names = [sheet_names]
    if len(sheet_names) != length:
        logger.warning("Invalid sheet names! Default sheet names will be used.")
        # Bug fix: actually fall back to the default names; the old code only
        # logged the warning and silently zip-truncated sheets downstream.
        sheet_names = ["Sheet{}".format(i + 1) for i in range(length)]
    to_excel_kwargs = {} if to_excel_kwargs is None else to_excel_kwargs
    n_path = _write_excel(Path(path), dataframes, sheet_names, **to_excel_kwargs)
    return n_path
def handle_file_error(err, func, path, args=None, kwargs=None, pos_path=0, key_path=None,
                      change_path_func=save_file, title='', msg='', return_if_ignore=None):
    """If PermissionError when opening/saving file, propose to retry, change file path or cancel

    :param err: exception
    :param func: function to execute if the user wants to retry
    :param path: file path
    :param args: args to pass to func
    :param kwargs: kwargs to pass to func
    :param pos_path: position of the positional argument path in func (only if key_path is None)
    :param key_path: name of the keyword argument path in func (if None, positional argument is used)
    :param change_path_func: function to get a new path, with no positional argument and 'initialdir' keyword argument
    :param title: title of the error
    :param msg: message of the error
    :param return_if_ignore: return if Ignore option is selected
    :return:
    """
    logger.debug(err)
    args = args or []
    kwargs = kwargs or {}
    title = title or 'File error!'
    msg = msg or "Unknown error with file '{}'. \nOriginal error: {}".format(path, err)
    logger.warning('User action needed!')
    res = messagebox.askcustomquestion(title=title, message=msg,
                                       choices=["Retry", "Rename automatically", "Change file path",
                                                "Ignore", "Debug (developer only)", "Cancel"])
    if res == "Retry":
        # Re-run func with the unchanged path.
        if key_path is not None:
            kwargs[key_path] = path
        else:
            args.insert(pos_path, path)
        return func(*args, **kwargs)
    if res == "Rename automatically":
        # Let the conflict handler pick a unique '-i' suffixed path.
        n_path = _handle_existing_file_conflict(path=path, overwrite='rename')
        if key_path is not None:
            kwargs[key_path] = n_path
        else:
            args.insert(pos_path, n_path)
        return func(*args, **kwargs)
    elif res == "Change file path":
        initialdir = Path(path).dirname if Path(path).dirname.exists else None
        if key_path is not None:
            kwargs[key_path] = change_path_func(initialdir=initialdir)
        else:
            args.insert(pos_path, change_path_func(initialdir=initialdir))
        return func(*args, **kwargs)
    elif res == "Ignore":
        logger.warning("Function ignored!")
        # Bug fix: the placeholders were never filled (missing .format args).
        logger.debug("Function '{}' with path '{}' ignored!".format(func, path))
        return return_if_ignore
    elif res == "Debug (developer only)":
        pdb.set_trace()
    elif res in [None, "Cancel"]:
        # Bug fix: instantiate UnknownError. The old code kept the class
        # itself, so `err.__class__(err)` went through the metaclass instead
        # of re-raising an exception.
        err = UnknownError() if not isinstance(err, BaseException) else err
        logger.exception(err)
        raise err.__class__(err)
    else:
        raise TypeError("Bad return of function 'messagebox.askcustomquestion': '{}'".format(res))
def rescrape_dead_periods(self, session_id=-1):
    """Switch the scraper into rescrape mode for previously failed periods.

    :param session_id: session whose dead periods should be reloaded
    :return: self (fluent interface)
    """
    self.rescrape = True
    self.scrape_tweets = True
    # Negate the session id to mark this run as a rescrape session.
    self.session_id = -self.session_id
    self.usersnames_df = get_dead_tweets_periods(session_id=session_id)
    logger.warning(f'Rescraping following periods')
    print(self.usersnames_df)
    return self
def _populate_proxy_queue(self):
    """Fill the proxy queue with shuffled proxies below the delay threshold."""
    # TODO: Throw out every proxy with problems. Reload fast based on ratio ok/fail
    proxy_df = get_proxies(max_delay=self.max_proxy_delay)
    # Shuffle the proxies, otherwise they always come in the same order.
    proxy_df = proxy_df.sample(frac=1., replace=False)
    for _, proxy in proxy_df.iterrows():
        self.proxy_queue.put({'ip': proxy['ip'], 'port': proxy['port']})
    # Typo fix in the log message ("poulates" -> "populated").
    logger.warning(f'Proxy queue populated. Contains {self.proxy_queue.qsize()} servers')
def _handle_existing_file_conflict(path: Path, overwrite='ask', backup=False, **kwargs) -> Union[Path, None]:
    """Handle conflict if a file already exist by opening adapted dialog.

    :param path: existing file path (project Path object)
    :param overwrite: 'ask', 'overwrite'/True, 'rename'/False or 'ignore'
    :param backup: if True, copy the old file to a timestamped name before overwriting
    :param kwargs: forwarded to save_file when renaming
    :return: final path, or None when the user cancels
    """
    # overwrite 'ask': ask user to modify overwrite arg into 'overwrite' (Yes) or 'rename' (No) or return None (Cancel)
    if overwrite == 'ask':
        logger.warning('User action needed!')
        res = messagebox.askyesnocancel(title="File existing",
                                        message="File {} already exists.\nDo you want to overwrite it?"
                                                "\n\nIf you select 'No', the file will be "
                                                "renamed automatically.".format(path))
        if res is None:
            logger.info("'Save file' operation cancelled by the user.".format(path))
            logger.debug("The path 'None' will be returned.")
            return None
        if res:
            overwrite = 'overwrite'
        else:
            overwrite = 'rename'
    # overwrite 'rename' or False: add '-i' at the end of the path to make it unique, where 'i' is an integer.
    if overwrite == 'rename' or overwrite is False:
        # NOTE(review): 'splitext' is accessed as a property — assumes the
        # project's custom Path class, not pathlib; confirm.
        r_path, r_ext = path.splitext
        # def rename_method1(r_path, sep='-'):  # todo
        ls_end = re.findall(r'-(\d+)$', r_path)
        if ls_end:
            # if the path already ends by '-i', change end to 'i+1'
            end = ls_end[0]
            r_path = r_path[:-(len(end) + 1)]
            added_ending = "-{}".format(int(end) + 1)
        else:
            added_ending = "-1"
        n_path = r_path + added_ending + r_ext
        logger.debug("Path {} changed to {} (renaming)".format(path, n_path))
        # Re-enter save_file so the renamed path is checked again.
        return save_file(n_path, overwrite=overwrite, backup=backup, **kwargs)
    # backup True: backup the old file
    if backup and path.isfile:
        # Timestamp + short uuid makes the backup name unique.
        suffix = datetime.datetime.now().strftime("-%Y-%m-%d_%H-%M_") + uuid.uuid4().hex[:5]
        try:
            shutil.copyfile(path, path.radix + suffix + path.ext)
        except (PermissionError, FileNotFoundError) as err:
            # Best effort: a failed backup does not abort the save.
            logger.exception(err)
            logger.error("Failed to backup previous configuration file.")
    # overwrite 'overwrite' or True: do not modify the path and make the old file writable to allow overwriting
    if overwrite == 'overwrite' or overwrite is True:
        logger.debug("File {} will be overwritten".format(path))
        _set_writable(path)
    # overwrite 'ignore': do nothing
    elif overwrite == 'ignore':
        pass
    # other case of overwrite: do nothing (same as 'ignore')
    else:
        logger.warning("Unexpected argument 'overwrite'! File {} will be overwritten".format(path))
    return path
def get_random_ua(count=0):
    """Return a random User-Agent string.

    Retries up to 5 times when fake-useragent fails to load its data.

    :param count: internal retry counter (leave at the default when calling)
    :return: a User-Agent string, or None after repeated failures
    """
    try:
        ua = UserAgent(verify_ssl=False)
        return ua.random
    except FakeUserAgentError as e:
        # Bug fix: the old local counter reset to 0 on every recursive call
        # (unbounded recursion) and the recursive result was never returned.
        if count < 5:
            return get_random_ua(count + 1)
        logger.warning(e)
async def test_proxy(self, proxy):
    """Re-score one proxy: reward high-anonymity proxies, penalise the rest."""
    try:
        address = proxy.split('-')[1]
        if len(address) > 1:
            # The anonymity check is always performed over plain http.
            anonymous = await self.is_high_anon(address.replace('https://', 'http://'))
            delta = 1 if anonymous else -self.minus_every_time
            self.redis.adjust_score(proxy, delta, key=self.key)
    except CancelledError as e:
        logger.warning('proxy: %s, %s' % (proxy, e))
def _handle_error(self, flag, e, username, proxy, fail_counter, period_begin_date=None, period_end_date=None):
    """Log a scraping failure and record the outcome against the used proxy."""
    message = f'{flag} | {username}, {period_begin_date}/{period_end_date}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}'
    logger.warning(message)
    logger.warning(e)
    # Keep per-proxy statistics up to date for future queue ordering.
    update_proxy_stats(flag, proxy)
def insert_account_passage_link(title, passage_link, official_account_id):
    """Insert an article link for an official account, skipping duplicates.

    :param title: article title
    :param passage_link: article URL (used as the uniqueness key)
    :param official_account_id: owning account id
    :return: None on success, False on failure
    """
    # 使用 cursor() 方法创建一个游标对象 cursor
    cursor = db.cursor()
    # Parameterized query: the old str.format version was open to SQL
    # injection. INSERT ... SELECT ... WHERE NOT EXISTS keeps links unique.
    sql = ("INSERT INTO passage_link_list(title, passage_link, official_account_id) "
           "select %s, %s, %s from DUAL where not exists "
           "(select title, passage_link, official_account_id from passage_link_list "
           "where passage_link = %s)")
    try:
        cursor.execute(sql, (title, passage_link, official_account_id, passage_link))
        db.commit()
    except Exception as e:
        db.rollback()  # 发生错误时回滚 -> roll back on error
        logger.error(str(e))
        logger.warning("新增公众号推文失败。")
        return False
def _populate_proxy_queue(self):
    """Refresh proxy statistics and enqueue proxies ordered by failure ratio."""
    update_proxies_ratio()
    proxy_df = get_proxies()
    columns = [
        'datetime', 'ip', 'port', 'source', 'delay', 'blacklisted',
        'scrape_n_failed', 'scrape_n_used', 'scrape_n_used_total',
        'scrape_n_failed_total', 'last_flag', 'fail_ratio'
    ]
    # Sort by ratio: the most reliable proxies enter the queue first.
    proxy_df.sort_values('fail_ratio', inplace=True)
    print(proxy_df[columns])
    for _, proxy in proxy_df.iterrows():
        self.proxy_queue.put({'ip': proxy['ip'], 'port': proxy['port']})
    # Typo fix in the log message ("poulated" -> "populated").
    logger.warning(
        f'Proxy queue populated. Contains {self.proxy_queue.qsize()} servers'
    )
def search_one_account_passage_by_id(id):
    """Fetch all saved article links for one official account (module-level db).

    :param id: official account id
    :return: tuple of (title, passage_link, official_account_id) rows, or False
    """
    # 使用 cursor() 方法创建一个游标对象 cursor
    cursor = db.cursor()
    # Parameterized query: the old str.format version was open to SQL injection.
    sql = ("SELECT title, passage_link, official_account_id FROM passage_link_list "
           "WHERE official_account_id = %s")
    try:
        cursor.execute(sql, (id,))
        results = cursor.fetchall()
        for row in results:
            print(row)
        return results
    except Exception as e:
        db.rollback()  # 发生错误时回滚 -> roll back on error
        logger.error(str(e))
        logger.warning("查询历史推文失败。")
        return False
def q_save_a_proxy(proxy):
    """Insert a freshly discovered proxy with default (untested) statistics.

    :param proxy: dict with at least 'ip' and 'port' keys
    """
    collection = get_collection()
    # Copy so the caller's dict is not mutated (the old code aliased it and
    # insert_one would also add an '_id' field to it).
    d = dict(proxy)
    # New proxies have not been tested
    d['delay'] = 999999
    d['blacklisted'] = True
    d['error_code'] = 0
    d['test_n_blacklisted'] = 0
    d['test_n_tested'] = 0
    d['scrape_success'] = True
    d['scrape_n_used'] = 0
    d['scrape_n_failed'] = 0
    d['scrape_n_used_total'] = 0
    d['scrape_n_failed_total'] = 0
    try:
        collection.insert_one(d)
    except DuplicateKeyError:
        logger.warning(f"Duplicate proxy: {proxy['ip']}:{proxy['port']}")
def choose_filedialog(dialog_type: str, multiple_paths: bool = False,
                      return_on_cancellation: str = None,
                      behavior_on_cancellation: str = 'ignore',
                      initialdir: str = None, filetypes: list = None,
                      title: str = None, **kwargs) -> Union[tuple, Path]:
    """Open a filedialog window.

    :param dialog_type: 'save', 'open' or 'open_dir'
    :param multiple_paths: for 'open', allow selecting several files
    :param return_on_cancellation: path returned when the user cancels
    :param behavior_on_cancellation: anomaly flag ('ask', 'ignore', 'warning' or 'error')
    :param initialdir: initial directory shown by the dialog
    :param filetypes: list of (label, pattern) tuples
    :param title: dialog window title
    :param kwargs: extra keyword arguments (unused)
    :return: selected Path (or tuple of Paths when multiple_paths)
    """
    # Check dialog type.
    if dialog_type == 'save':
        user_input_func = filedialog.asksaveasfilename
    elif dialog_type == 'open':
        if multiple_paths:  # selection of multiple files
            user_input_func = filedialog.askopenfilenames
        else:  # selection of a unique file
            user_input_func = filedialog.askopenfilename
    elif dialog_type == 'open_dir':
        user_input_func = filedialog.askdirectory
    else:
        # Typo fix: the message was missing the closing quote after 'save'.
        msg = "Argument 'dialog_type' must be 'save', 'open' or 'open_dir'."
        logger.error(msg)
        raise ValueError(msg)
    # Check inputs.
    if filetypes is None or not isinstance_filetypes(filetypes):
        filetypes = [("all files", "*.*")]
    if not isinstance(initialdir, str) or not Path(initialdir).isdir:
        initialdir = None
    if not isinstance(title, str):
        title = None
    # Get filenames.
    logger.warning('User action needed!')
    # askdirectory does not accept a 'filetypes' keyword.
    filetypes_kwargs = {} if dialog_type == 'open_dir' else dict(filetypes=filetypes)
    path = user_input_func(title=title, initialdir=initialdir, **filetypes_kwargs)
    if not path:  # if no file selected
        # raise an anomaly with flag behavior_on_cancellation ('ask', 'ignore', 'warning' or 'error').
        raise_no_file_selected_anomaly(flag=behavior_on_cancellation)
        # Reached only when the anomaly did not raise.
        return Path(return_on_cancellation)
    return Path(path)
def add(self, proxy, name=PROXY_ORIGINAL, score=INITIAL_SCORE):
    """Add a proxy to the sorted set with the highest initial score.

    :param score: default score value
    :param name: redis key name
    :param proxy: proxy entry, formatted as '<scheme>-<ip:port>'
    :return: result of the zadd, or None
    """
    ip_port = proxy.split('-')[1]
    if not self.pattern.match(ip_port):
        logger.warning('illegal proxy: %s' % proxy)
        return
    current_score = self.db.zscore(name, proxy)
    if not current_score:
        # Unknown proxy: register it with the given score.
        return self.db.zadd(name, {proxy: score})
    logger.info('proxy %s already exists' % proxy)
    if int(current_score) == 100:
        # Fully-scored duplicate: reset it to the initial score.
        return self.db.zadd(name, {proxy: INITIAL_SCORE})
async def is_proxy_valid(proxy, url=TEST_URL):
    """Check that a proxy can fetch the test URL.

    :param proxy: proxy URL to test
    :param url: target URL used for the check
    :return: True when the response status is 2xx, False otherwise
    """
    headers = {'User-Agent': get_random_ua()}
    try:
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(headers=headers, connector=conn) as session:
            async with session.get(url, proxy=proxy, ssl=False) as resp:
                code = resp.status
                if 200 <= code < 300:
                    logger.info('%s is valid' % proxy)
                    return True
                logger.info('%s is invalid, code: %s' % (proxy, code))
                return False
    except (CancelledError, Exception) as e:
        # CancelledError stays explicit: since Python 3.8 it derives from
        # BaseException, so `except Exception` alone would miss it. The
        # aiohttp client errors of the old tuple are Exception subclasses.
        logger.warning(e)
        return False
def selectScore(self):
    """Return the user's score for the configured term.

    Reads from the database when a fresh record exists; otherwise scrapes
    the score and retries. After more than 2 failed attempts, returns 2.
    """
    if (self.count >= 2):  # more than 2 failures: give up
        return 2
    # Load the stored score for the configured term from the database.
    term = configfile.getConfig("term", "termStr")
    scoreInfo = self.session.query(Scores).filter_by(openid=self.openidStr, termStr=term).first()
    # No score stored in the database yet.
    if (scoreInfo == None):
        logger.info("用户" + self.openidStr + "数据库中无成绩")
        # Insert the score (0 means insert).
        TrueOrFalse = self.__updateScore(0)
        # If the insert failed:
        if (TrueOrFalse == False):
            # increment the failure counter,
            self.count = self.count + 1
            logger.warning("用户" + self.openidStr + "已爬" + str(self.count) + "次")
            # and retry by calling ourselves once more.
            return self.selectScore()
    # The stored score exists but is out of date.
    elif (self.__compareTime(scoreInfo.updateTime.date()) == False):
        logger.info("用户" + self.openidStr + "使用爬虫爬取成绩")
        # Refresh via the scraper (1 means update).
        TrueOrFalse = self.__updateScore(1)
        if (TrueOrFalse == False):
            # Today's score is empty and the update failed, but the database
            # still holds the previous value: fall back to it.
            logger.info("用户" + self.openidStr + "从数据库中取成绩")
            return scoreInfo.score
        else:
            # The database was refreshed successfully: read it again.
            return self.selectScore()
    else:
        # Fresh score available: read it straight from the database.
        logger.info("用户" + self.openidStr + "从数据库中取成绩")
        return scoreInfo.score
def _write_excel(path, dataframes, sheet_names, index=False, **kwargs):
    """Save dataframes to an Excel workbook.

    :param path: output path
    :param dataframes: list of dataframes
    :param sheet_names: list of sheet names (same index as dataframes)
    :param index: if True, index names are exported
    :param kwargs: keyword arguments for pd.DataFrame.to_excel function
    :return: final output path
    """
    # NOTE(review): 'isnone' looks like a property of the project's custom
    # Path class — confirm.
    if path is None or path.isnone:
        logger.warning('No path set to write data to Excel! No file has been created.')
        return None
    try:
        logger.debug("Trying to open file '{}'".format(path))
        with pd.ExcelWriter(path) as writer:
            logger.debug("File {} opened.".format(path))
            # One sheet per dataframe, in order.
            for df, sheet_name in zip(dataframes, sheet_names):
                if isinstance(df.columns, pd.MultiIndex):
                    index = True  # index must be True if MultiIndex columns, otherwise NotImplementedError is raised
                df.to_excel(writer, sheet_name=sheet_name, index=index, **kwargs)
                logger.debug("Sheet {} written.".format(sheet_name))
        logger.debug("File {} closed.".format(path))
    except PermissionError as err:
        # File locked/read-only: delegate to the interactive handler, which
        # may re-run this writer with a new path.
        logger.exception(err)
        path = handle_permission_error(err, func=_write_excel, path=path,
                                       args=[dataframes, sheet_names, index], kwargs=kwargs,
                                       change_path_func=save_file, handle_read_only_error=True)
    except ValueError as err:
        # Unsupported extension: offer to retry with '.xlsx'.
        if str(err).startswith("No engine for filetype"):
            path = handle_bad_extension_error(err, func=_write_excel, path=path,
                                              args=[dataframes, sheet_names, index], kwargs=kwargs,
                                              change_path_func=save_file, extension=".xlsx")
        else:
            raise err
    except FileNotFoundError as err:
        # Missing parent directory (or similar): interactive handler again.
        logger.exception(err)
        path = handle_file_not_found_error(err, func=_write_excel, path=path,
                                           args=[dataframes, sheet_names, index], kwargs=kwargs,
                                           change_path_func=save_file)
    return path
async def is_high_anon(self, proxy):
    """Return True when the proxy hides the caller's real IP (high anonymity)."""
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(ANON_CHECK_URL, proxy=proxy, ssl=False,
                                   timeout=15) as resp:
                if not (200 <= resp.status < 300):
                    return False
                payload = await resp.json()
                if self.anon_check_url == ANON_CHECK_URL:
                    reported_ip = payload['origin']
                else:
                    # 根据接口自己定义 (field depends on the custom endpoint)
                    reported_ip = payload['X-Forwarded-For']
                # High anonymity means our real IP was not forwarded.
                return self.real_ip not in reported_ip
    except (ClientConnectionError, ClientHttpProxyError, TimeoutError,
            CancelledError, ClientProxyConnectionError, Exception) as e:
        logger.warning('proxy: %s, %s' % (proxy, e))
        return False
def start(self):
    """Launch the configured scraping session (profiles and/or tweets)."""
    if not (self.scrape_profiles or self.scrape_tweets):
        logger.warning(
            f'Nothing to do. Did you forget "profiles" or "tweets" instruction?'
        )
        return None
    if self.usersnames_df.empty:
        logger.warning(
            f'Nothing to do. Did you forget to set "users_all" or "users_list"? Or all users already exist?'
        )
        return None

    # Cap the pool size at the number of users to process.
    pool_size = min(len(self.usersnames_df), self.n_processes)
    logger.info(
        f'Start Twitter Scraping. | n_processes={pool_size}, session_id={self.session_id}, '
        f'session_begin_date={self.session_begin_date}, session_end_date={self.session_end_date}, timedelta={self.timedelta}, missing_dates={self.missing_dates}'
    )

    if self.scrape_profiles:
        self._populate_proxy_queue()
        jobs = [(name,) for name in self.usersnames_df['username']]
        with mp.Pool(processes=pool_size) as pool:
            pool.starmap(self.scrape_a_user_profile, jobs)

    if self.scrape_tweets:
        self._populate_proxy_queue()
        if self.rescrape:
            # Rescrape mode: one (username, begin, end) period per row.
            jobs = [(name, begin, end)
                    for _, (name, begin, end) in self.usersnames_df.iterrows()]
        else:
            jobs = [(name, self.session_begin_date, self.session_end_date)
                    for name in self.usersnames_df['username']]
        with mp.Pool(processes=pool_size) as pool:
            pool.starmap(self.scrape_a_user_tweets, jobs)
def add_file_extension(path: Union[str, Path], extension=None, replace=False, keep_existing=False,
                       force_add=False) -> Path:
    """Add a specific extension to 'path'.

    :param path: path
    :param extension: extension. None is considered as '' extension
    :param replace: replace instead of append extension
    :param keep_existing: add extension only if it doesn't already exist
    :param force_add: add extension even if the correct extension already exists (not recommended)
    :return Path object
    """
    path = Path(path)
    # An extension already exists and must be kept as-is.
    if keep_existing and path.ext:
        if path.ext != FileExt(extension):
            logger.warning("Bad extension kept: {}".format(path.ext))
        return path
    # Swap the current extension for the requested one.
    if replace:
        return path.replace_ext(extension)
    # Nothing to do: the correct extension is already there.
    if path.ext == FileExt(extension) and not force_add:
        return path
    return path.join_ext(extension)
def on_closing(self):
    """Close the main window, first giving any background thread a chance to finish.

    :return: True when the window was destroyed, False when exit was cancelled.
    """
    # Case of an active worker thread still running in the background.
    if self.active_thread and self.active_thread.is_alive():
        logger.warning("Process running in background!")
        logger.debug("Thread '{}: {}' still alive.".format(self.active_thread.name, self.active_thread))
        logger.debug("number of active threads: {}, current thread: {}, main thread: {}."
                     .format(threading.active_count(), threading.current_thread(), threading.main_thread()))
        # Give the worker one second to finish on its own.
        self.active_thread.join(timeout=1)
        if self.active_thread.is_alive():
            # Still busy after the grace period: let the user decide.
            if not self.ask_exit(self.active_thread.func_name):
                logger.debug("Exit cancelled")
                return False
    # Remove logger handler attached to tkinter frame
    self.logger_frame.quit()
    # Destroy the window
    self.destroy()
    logger.debug("Main window destroyed.")
    self.quit()
    logger.debug("Main window quited.")
    return True
def convert_dict_from_str(dico, allow_multiple=True, error='ignore', drop_none=False, drop_empty_iterable=False, ascendant=False, no_flag='ignore', inplace=False, duplicates='first', **parser_cfg): """Convert a dict of str (generated from a file for example) to a typed dict. Simple types that can be recognised: str, bool, int, float, list, tuple. Custom classes: Reference, Path Advanced types (with 'auto' flag): expressions with dict, set, bytes, None and simple types. :param dico: dictionary-like object to convert with keys and values of type 'str'. :param allow_multiple: if True, multiple flags are allowed (applied from left to right) :param error: behavior on casting error. Possible values: 'ignore' (returns the initial string), 'drop' (returns None), 'error' (raise an error if casting fails), 'auto-conversion' (try to convert automatically; if it fails, returns the initial string) :param drop_none: if True, None values are dropped :param drop_empty_iterable: if True, empty iterable objects (list, tuple) are dropped :param ascendant: if True, the flags are applied from the last to the first :param no_flag: behavior if no flag found. 'ignore', 'error', 'drop', 'auto-conversion' :param inplace: returns dico inplace :param duplicates: behavior if duplicates found. 'drop', 'first', 'last', 'error'. 
:param parser_cfg: kwargs for _parse_key function :return: dictionary-like object (same type as 'dico') # Simple test >>> test_dict = {"a": "without_flag", "@i-b": "1", "@f-c": "9.2", "@b-d": "", "@b-e": "5", "@b-f": "False"} >>> convert_dict_from_str(test_dict) {'a': 'without_flag', 'b': 1, 'c': 9.2, 'd': False, 'e': True, 'f': False} # Numbers test >>> num_dict = {"@f-d1": "1.6", "@i-d2": "1.7", "@f-@i-d3": "1.8", "@i-@f-d4": "1.9", "d5": 2.0, "@ftwsdc-d6": "2 252,9"} >>> convert_dict_from_str(num_dict) {'d1': 1.6, 'd2': '1.7', 'd3': 1, 'd4': 1.9, 'd5': 2.0, 'd6': 2252.9} # Duplicates handling test >>> dup_dict = OrderedDict([("@s-overwritten", "value1"), ("overwritten", "value2"), ("@auto-overwritten", "value3")]) >>> convert_dict_from_str(dup_dict, duplicates='rename') OrderedDict([('overwritten', 'value1'), ('overwritten_1', 'value2'), ('overwritten_2', 'value3')]) >>> convert_dict_from_str(dup_dict, duplicates='first') OrderedDict([('overwritten', 'value1')]) >>> convert_dict_from_str(dup_dict, duplicates='last') OrderedDict([('overwritten', 'value3')]) # Date test >>> date_dict = {"@date-date": "2019-04-01", "@date-date2": "04-13-2018", "@date-date3": "13/04/2018"} >>> convert_dict_from_str(date_dict) {'date': Timestamp('2019-04-01 00:00:00'), 'date2': Timestamp('2018-04-13 00:00:00'), 'date3': Timestamp('2018-04-13 00:00:00')} >>> date_special_dict = {"@date-date_std": "04/11/2018", "@datedb-date_day_before": "04/11/2018"} >>> convert_dict_from_str(date_special_dict) {'date_std': Timestamp('2018-04-11 00:00:00'), 'date_day_before': Timestamp('2018-11-04 00:00:00')} # List test >>> list_dict = {"@auto-list1": "[18, 13]", "@l-list2": "[19, 13]", "@auto-list3": "[{(18, 13): 'a'}, 'end']"} >>> convert_dict_from_str(list_dict) {'list1': [18, 13], 'list2': ['19', '13'], 'list3': [{(18, 13): 'a'}, 'end']} # Auto conversion test >>> auto_dict = {"a": "True", "b": "False", "c": "None", "d": "[{(18, 13): 'a'}, 'end']", "e": '9.9', "f": 9.9} >>> 
convert_dict_from_str(auto_dict, no_flag="auto-conversion") {'a': True, 'b': False, 'c': None, 'd': [{(18, 13): 'a'}, 'end'], 'e': 9.9, 'f': 9.9} """ # It is supposed that all keys are lower case! # If some keys are identical, only one will be retained (the last), others will be overwritten! if dico is None: logger.warning("none as dict") return None n_dico = type(dico)() for k, v in dico.items(): n_k, flags = _parse_key(k, **parser_cfg) if not allow_multiple: flags = flags[0] if flags else None if not flags: flags = _no_flag_handling(k, no_flag=no_flag) if flags is None: continue n_v = _multiple_item_conversion( v, flags, error=error, drop_none=drop_none, drop_empty_iterable=drop_empty_iterable, ascendant=ascendant) if n_v is not None or not drop_none: _handle_duplicates(n_dico, n_k, n_v, duplicates, inplace=True) if inplace: dico.clear() dico.update(n_dico) return return n_dico
def init(self, default_config: Union[dict, ConfigDict] = None, path: Union[str, Path] = None, auto_load: bool = True, default_section: str = DEFAULT_SECTION, section: str = None, conversion_dict: dict = None, force_load: bool = False, load_empty: bool = False, auto_cast: bool = False, write_flags: bool = None, ask_path: bool = True, search_in_default_config: bool = True, merge_default_how: str = 'right', **kwargs): """cf. __init__ :param path: path of the current configuration file. :param default_config: default configuration dictionary-like object with two levels. Preferred type is ConfigDict. :param auto_load: if True, configuration file is loaded at initialisation. :param default_section: string of the default section in configuration file. :param section: current section of current configuration :param conversion_dict: conversion of string values into other types. :param force_load: argument for load method. :param load_empty: if True, empty configuration can overwrite existing one :param auto_cast: if True, the read configuration values are automatically converted to basic Python types :param write_flags: if None, same as auto_cast. If True, flags are added to the written configuration keys to explicit value types :param ask_path: if True and path is None, the configuration path is asked to the user, otherwise, the path remains None. Actually, it is the argument of open_file function. :param search_in_default_config: if True, the default configuration is automatically used when a key is not found in the current configuration. If a section, which exists in default config and doesn't in current config, is tried to being accessed, it is created in the current config " todo :param merge_default_how: merge method for load method. 
Default is 'outer' (both existing and new keys are kept, values are updated) :param kwargs: other keyword arguments (not used) """ # Check arguments if kwargs: logger.warning("Keyword arguments '{}' are not valid.".format(kwargs)) # self._check_args(path=path, default_config=default_config, auto_load=auto_load, # default_section=default_section, section=section, conversion_dict=conversion_dict, # force_load=force_load, auto_cast=auto_cast, write_flags=write_flags) if not isinstance(conversion_dict, dict): conversion_dict = {} # Parameters self._auto_cast = auto_cast # if write_flags is not defined, write_flags is the same as auto_cast self._write_flags = auto_cast if write_flags is None else write_flags self._force_load = force_load self._load_empty = load_empty self._ask_path = ask_path self._search_in_default_config = search_in_default_config # todo self._conversion_dict = conversion_dict # Load default config self._default_config = ConfigDict(default_config) self._cfg = self.default_config.deepcopy() # first current configuration, before load # todo: needed? # Sections if not isinstance(default_section, str): default_section = DEFAULT_SECTION self._default_config.default_section = default_section # default section of default config self._default_config.section = self._default_config.default_section if section is None else section self.default_section = default_section # default section of current config self.section = self.default_section if section is None else section # Paths self.path = path self._default_path = self._path.copy() or Path(path) # if path is None, keep the first path # Load configuration from file if auto_load: self.load(merge_how=merge_default_how) self._init_count += 1 logger.debug("Config initialized. Number of initialization(s): {}".format(self._init_count))
def __init__(self, *args, **kwargs):
    """Create the config object; positional/keyword arguments are not supported."""
    super().__init__()
    if args or kwargs:
        logger.warning("Config object doesn't take any argument.")
# collection_names = ['profiles', 'proxies'] collection_names = ['profiles', 'proxies', 'tweets'] source_database = 'twitter_database' backup_database = f'twitter_database_backup_{datetime.now().date()}' client = MongoClient() source_db = client[source_database] backup_db = client[backup_database] for collection_name in collection_names: source_collection = source_db[collection_name] backup_collection = backup_db[collection_name] # check if source exists. If so, do nothing if collection_name in backup_db.list_collection_names(): logger.warning(f'{collection_name} already exists in {backup_db}') else: # Create indices for name, index_info in source_collection.index_information().items(): keys = index_info['key'] del (index_info['ns']) del (index_info['v']) del (index_info['key']) backup_collection.create_index(keys, name=name, **index_info) logger.info(f'Index {name} for {collection_name} created') # Copy documents i = 0 for doc in source_collection.find({}): backup_collection.insert_one(doc) i += 1
def raise_anomaly(flag="ask", error=None, title=None, message=None, use_messagebox=True):
    """Raise an 'anomaly' depending on the flag argument:
    - 'ignore': do nothing and return,
    - 'warning': show a warning (warning logger and tkinter messagebox),
    - 'error': raises an exception,
    - 'ask': ask the user either to ignore the anomaly or to raise an error.

    :param flag: must be 'ask', 'ignore', 'warning', 'error'
    :param error: Exception class to raise for the 'error' flag
    :param title: Title for message box and logger.
    :param message: Message for message box and error logger.
    :param use_messagebox: if True the tkinter messagebox will be used to show errors.
    :return:
    """
    # Incorrect flag
    if not isinstance(flag, str):
        msg = "Incorrect type '{}' for flag argument which must be a string.".format(
            type(flag))
        logger.error(msg)
        raise TypeError(msg)
    # Define unknown errors. Bug fix: the title check used to test
    # 'message is None', clobbering a valid title whenever message was None.
    if not isinstance(title, str):
        title = "Unknown error."
    if not isinstance(message, str):
        message = "Unknown error."
    # Ask flag
    if flag == 'ask':
        res = messagebox.askyesnocancel(
            title=title, message="{}\n\nDo you want to continue "
            "the program otherwise (not recommended)?".format(message),
            default='no')
        if res is None:
            logger.debug("Anomaly 'ask' raised with title '{}' "
                         "and message '{}'.".format(title, message))
            logger.info("Program stopped by the user ('Cancel' button).")
            sys.exit(0)
        if res:
            flag = 'ignore'
        else:
            flag = 'error'
            use_messagebox = False  # message has already been shown.
    # Ignore flag
    if flag == 'ignore':
        logger.debug(
            "Anomaly 'ignore' raised with title '{}' and message '{}'.".format(
                title, message))
        return
    # Warning flag
    if flag == 'warning':
        # Typo fix: the quote after the title placeholder was missing.
        logger.warning(
            "Anomaly 'warning' raised with title '{}' and message '{}'.".format(
                title, message))
        if use_messagebox:
            messagebox.showwarning(title=title, message=message)
        return
    # Error flag
    if flag == 'error':
        logger.debug("Anomaly 'error' raised with title '{}'.".format(title))
        # Bug fix: 'error' is meant to be an exception class; the old
        # isinstance check rejected every class (classes are not instances
        # of BaseException), so a passed class was always replaced by
        # UnknownError.
        if not (isinstance(error, type) and issubclass(error, BaseException)):
            error = UnknownError
        if use_messagebox:
            messagebox.showerror(title=title, message=message)
        msg = '{}\n\n{}'.format(title, message)
        logger.error(msg)
        raise error(msg)
    # Unknown flag
    msg = "Incorrect value '{}' for flag argument which must be 'ask', 'ignore', 'warning' or 'error'. " \
          "Original error was: {}\n\n{}".format(flag, title, message)
    logger.error(msg)
    raise ValueError(msg)