class Storage:
    def __init__(self, mode=StorageModes.MEMORY):
        self.mode = mode
        if self.mode == StorageModes.PERSISTENT:
            self.cache = SqliteDict('../my_db.sqlite', autocommit=True)
        elif self.mode == StorageModes.MEMORY:
            self.cache = dict()

    def set(self, k, v):
        self.cache[k] = v
        if self.mode == StorageModes.PERSISTENT:
            # need to commit manually, as autocommit only commits with
            # commit(blocking=False) and might not persist data
            self.cache.commit()

    def dump(self, k):
        self.cache.pop(k)
        if self.mode == StorageModes.PERSISTENT:
            # need to commit manually, as autocommit only commits with
            # commit(blocking=False) and might not persist data
            self.cache.commit()

    def get(self, k):
        return self.cache.get(k)

    def append(self, k, v):
        current_data = self.cache.get(k)
        if not current_data:
            self.set(k, [v])
        else:
            if not isinstance(current_data, list):
                current_data = [current_data]
            current_data.append(v)
            self.set(k, current_data)
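# Usage sketch. StorageModes is not shown in the snippet above; it is assumed
# to be an enum defined *before* the class (the default argument references it
# at class-definition time), e.g.:
#
#     class StorageModes(Enum):
#         MEMORY = "memory"
#         PERSISTENT = "persistent"
#
storage = Storage(mode=StorageModes.MEMORY)
storage.set("greeting", "hello")
storage.append("greeting", "world")   # promotes the scalar value to a list
print(storage.get("greeting"))        # ['hello', 'world']
storage.dump("greeting")              # removes the key again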
def _import_sql_data(data_dir):
    import sqlite3
    from sqlitedict import SqliteDict

    file_path = os.path.join(data_dir, DATA_FILE)
    # Find out what format we have
    with sqlite3.connect(file_path) as conn:
        try:
            conn.execute('select count(*) from zipgun_info')
            zipgun_info = SqliteDict(file_path, tablename='zipgun_info')
            version = zipgun_info.get('version', 0)
        except sqlite3.OperationalError:
            version = 0
    if version == 0:
        country_postal_codes = SqliteDict(file_path)
    elif version == 1:
        country_postal_codes = {}
        for country_code in zipgun_info['country_codes']:
            if country_code in country_postal_codes:
                raise ValueError(
                    'Duplicate entry found for {}'.format(country_code))
            country_postal_codes[country_code] = SqliteDict(
                file_path, tablename='zg_{}'.format(country_code),
                journal_mode='OFF')
        zipgun_info.close()
    else:
        raise ValueError('Unknown data file version {}'.format(version))
    return country_postal_codes
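# A hedged sketch of the matching writer: this is inferred from the reader
# above (version-1 layout with a 'zipgun_info' table plus one 'zg_<CC>' table
# per country), not taken from the project itself.
from sqlitedict import SqliteDict

def _write_sql_data_v1(file_path, country_postal_codes):
    info = SqliteDict(file_path, tablename='zipgun_info')
    info['version'] = 1
    info['country_codes'] = list(country_postal_codes)
    info.commit()
    info.close()
    for country_code, postal_codes in country_postal_codes.items():
        table = SqliteDict(file_path, tablename='zg_{}'.format(country_code))
        table.update(postal_codes)
        table.commit()
        table.close()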
class db(object):
    def __init__(self, user):
        self.user = str(user)
        self.db = SqliteDict(self.getCfgPath(), autocommit=True)

    def get(self, key=''):
        return self.db.get(key) if key else self.db.iteritems()

    def set(self, key='', data=''):
        if not key:
            key = self.user
        if data:
            self.db[key] = data
        else:
            del self.db[key]

    def getCfgPath(self):
        if os.path.isdir('hoshino'):
            if not os.path.isdir('hoshino/modules/ASF_Plus/config'):
                os.mkdir('hoshino/modules/ASF_Plus/config')
            return os.path.join(
                os.path.abspath('hoshino/modules/ASF_Plus/config'),
                f'{self.user}.sqlite')
        else:
            if not os.path.isdir('../config'):
                os.mkdir('../config')
            return os.path.join(os.path.abspath('../config'),
                                f'{self.user}.sqlite')
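# Usage sketch: instantiating the wrapper creates the config directory on
# demand; the user id is a made-up example.
user_db = db(123456789)
user_db.set('last_login', '2021-01-01')
print(user_db.get('last_login'))  # 2021-01-01
user_db.set('last_login')         # passing empty data deletes the key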
async def post(self):
    request = self.request
    data = await request.post()
    try:
        mydict = SqliteDict('./my_db.sqlite', autocommit=True)
        if mydict.get(data["url"]) is not None:
            return web.Response(text=str(mydict.get(data["url"])))
        image = await fetch(session, data["url"])
        nsfw_prob = classify(image)
        text = nsfw_prob.astype(str)
        mydict[data["url"]] = text
        return web.Response(text=text)
    except KeyError:
        return HTTPBadRequest(text="Missing `url` POST parameter")
    except OSError as e:
        if "cannot identify" in str(e):
            raise HTTPUnsupportedMediaType(text="Invalid image")
        else:
            raise
class Cache(object):
    """Cache -- key-value store for Twizzle to reduce unnecessary recomputations."""

    def __init__(self, bPersistent=False, sPathToPersistenceDB="twizzle_cache.db"):
        """Constructor of the Twizzle cache.

        Note:
            You can decide whether the cache should be persistent between
            multiple executions or just a runtime cache for one execution of a
            set of tests.

        Args:
            bPersistent (bool): Flag whether the cache should be persistent or
                not. (Note: a persistent cache is much slower because it has to
                write the data to disk.)
            sPathToPersistenceDB (str): Path to the cache DB where the cache
                should write its data.
        """
        self._cache = {}
        self._lock = Lock()
        self._persistent = bPersistent
        self._first_get = True
        if bPersistent:
            if not sPathToPersistenceDB:
                raise Exception(
                    "In persistent mode a path to the persistence database has to be defined")
            self._db = SqliteDict(sPathToPersistenceDB)

    def set(self, sKey, oValue):
        """Set a cache element by key."""
        self._lock.acquire()
        self._cache[sKey] = oValue
        if self._persistent:
            self._db[CACHE_KEY] = self._cache
            self._db.commit()
        self._lock.release()

    def get(self, sKey):
        """Get a cache element by key."""
        if self._persistent:
            self._lock.acquire()
            if self._first_get:
                self._first_get = False
                self._cache = self._db.get(CACHE_KEY, {})
            self._lock.release()
        return self._cache.get(sKey, None)

    def calc_unique_key(self, *params):
        """Create a unique key from the given parameters by converting them to
        strings and concatenating them."""
        return "".join([str(elem) for elem in params])
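# Usage sketch (in-memory mode; CACHE_KEY is only needed for the persistent
# path and is assumed to be a module-level constant):
cache = Cache()
key = cache.calc_unique_key("resize", 256, 256)
if cache.get(key) is None:
    cache.set(key, {"width": 256, "height": 256})
print(cache.get(key))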
# `db` is a module-level SqliteDict handle; `t` is assumed to be torch, and
# `get_embedding`/`default_sbert_db` come from the surrounding module.
def get_cached_embedding(s):
    global db
    if db is None:
        db = SqliteDict(default_sbert_db, autocommit=True)
        print(f'Inited db using {default_sbert_db}')
    array_str = db.get(s)
    if array_str is not None:
        return t.from_numpy(__string_to_numpy(array_str))
    else:
        print(f'Not cached: {s}')
        tensor = get_embedding(s)
        db[s] = __numpy_to_string(tensor.numpy())
        return tensor
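# The helpers above (__numpy_to_string / __string_to_numpy) are not shown.
# A plausible stand-in pair -- an assumption, not the original code --
# round-trips arrays through base64 text so they fit SqliteDict values:
import base64
import io

import numpy as np

def __numpy_to_string(arr):
    buf = io.BytesIO()
    np.save(buf, arr)  # serialize the array to raw bytes
    return base64.b64encode(buf.getvalue()).decode('ascii')

def __string_to_numpy(s):
    return np.load(io.BytesIO(base64.b64decode(s)))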
def get_annotations(dbpath, ids, options):
    # No context manager: close() can block and this is read-only
    db = SqliteDict(dbpath, flag='r', autocommit=False)
    for docid, annid in ids:
        so_key = docid + options.ann_suffix
        so = db.get(so_key)
        if so is None:
            warning('{} not found in {}, skipping'.format(so_key, dbpath))
            continue
        text_key = docid + options.text_suffix
        text = db.get(text_key)
        if text is None:
            warning('{} not found in {}, skipping'.format(text_key, dbpath))
            continue
        ann = get_annotation(so, annid)
        before = 'DOCSTART ' + text[:ann.start]
        after = text[ann.end:] + 'DOCEND'
        before = get_words(before, options.words, reverse=True)
        after = get_words(after, options.words, reverse=False)
        before = normalize_space(before)
        after = normalize_space(after)
        print('\t'.join([docid, annid, ann.type, before, ann.text, after]))
class OutputNewOrChangedEntires(beam.DoFn):
    def __init__(self, cache_file: str):
        super().__init__()
        self._cache_file = cache_file
        self._cache = None

    def start_bundle(self):
        self._cache = SqliteDict(self._cache_file, autocommit=True)

    def finish_bundle(self):
        self._cache.close()

    def process(
        self, element: Tuple[EntryId, Dict[str, Any]], *args, **kwargs
    ) -> Generator[Tuple[EntryId, Dict[str, Any]], None, None]:
        # Make the type checker happy
        assert isinstance(self._cache, SqliteDict)
        (wikidata_id, entry) = element
        cached_entry = self._cache.get(wikidata_id)
        if cached_entry is None or cached_entry != entry:
            self._cache[wikidata_id] = entry
            yield wikidata_id, entry
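# Hedged pipeline sketch (the cache path and the sample entry are made up;
# requires apache_beam):
import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('Q42', {'label': 'Douglas Adams'})])
        | beam.ParDo(OutputNewOrChangedEntires('/tmp/entries_cache.sqlite'))
    )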
class ToolDocumentCache:
    def __init__(self, cache_dir):
        self.cache_dir = cache_dir
        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir)
        self.cache_file = os.path.join(self.cache_dir, 'cache.sqlite')
        self.writeable_cache_file = None
        self._cache = None
        self.disabled = False
        self._get_cache(create_if_necessary=True)

    def close(self):
        self._cache and self._cache.close()

    def _get_cache(self, flag='r', create_if_necessary=False):
        try:
            if create_if_necessary and not os.path.exists(self.cache_file):
                # Create database if necessary using 'c' flag
                self._cache = SqliteDict(self.cache_file, flag='c',
                                         encode=encoder, decode=decoder,
                                         autocommit=False)
                if flag == 'r':
                    self._cache.flag = flag
            else:
                cache_file = self.writeable_cache_file.name if self.writeable_cache_file else self.cache_file
                self._cache = SqliteDict(cache_file, flag=flag,
                                         encode=encoder, decode=decoder,
                                         autocommit=False)
        except sqlite3.OperationalError:
            log.warning('Tool document cache unavailable')
            self._cache = None
            self.disabled = True

    @property
    def cache_file_is_writeable(self):
        return os.access(self.cache_file, os.W_OK)

    def reopen_ro(self):
        self._get_cache(flag='r')
        self.writeable_cache_file = None

    def get(self, config_file):
        try:
            tool_document = self._cache.get(config_file)
        except sqlite3.OperationalError:
            log.debug("Tool document cache unavailable")
            return None
        if not tool_document:
            return None
        if tool_document.get('tool_cache_version') != CURRENT_TOOL_CACHE_VERSION:
            return None
        if self.cache_file_is_writeable:
            for path, modtime in tool_document['paths_and_modtimes'].items():
                if os.path.getmtime(path) != modtime:
                    return None
        return tool_document

    def _make_writable(self):
        if not self.writeable_cache_file:
            self.writeable_cache_file = tempfile.NamedTemporaryFile(
                dir=self.cache_dir, suffix='cache.sqlite.tmp', delete=False)
            if os.path.exists(self.cache_file):
                shutil.copy(self.cache_file, self.writeable_cache_file.name)
            self._get_cache(flag='c')

    def persist(self):
        if self.writeable_cache_file:
            self._cache.commit()
            os.rename(self.writeable_cache_file.name, self.cache_file)
            self.reopen_ro()

    def set(self, config_file, tool_source):
        try:
            if self.cache_file_is_writeable:
                self._make_writable()
                to_persist = {
                    'document': tool_source.to_string(),
                    'macro_paths': tool_source.macro_paths,
                    'paths_and_modtimes': tool_source.paths_and_modtimes(),
                    'tool_cache_version': CURRENT_TOOL_CACHE_VERSION,
                }
                try:
                    self._cache[config_file] = to_persist
                except RuntimeError:
                    log.debug("Tool document cache not writeable")
        except sqlite3.OperationalError:
            log.debug("Tool document cache unavailable")

    def delete(self, config_file):
        if self.cache_file_is_writeable:
            self._make_writable()
            try:
                del self._cache[config_file]
            except (KeyError, RuntimeError):
                pass

    def __del__(self):
        if self.writeable_cache_file:
            try:
                os.unlink(self.writeable_cache_file.name)
            except Exception:
                pass
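# The cache above is constructed with custom `encode`/`decode` callables that
# are not shown. A minimal stand-in pair -- an assumption, not necessarily
# Galaxy's actual implementation -- stores values as zlib-compressed JSON
# blobs:
import json
import sqlite3
import zlib

def encoder(obj):
    return sqlite3.Binary(zlib.compress(json.dumps(obj).encode('utf-8')))

def decoder(blob):
    return json.loads(zlib.decompress(bytes(blob)).decode('utf-8'))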
class GoogleDrive:
    auth_url = 'https://accounts.google.com/o/oauth2/v2/auth'
    token_url = 'https://www.googleapis.com/oauth2/v4/token'
    api_url = 'https://www.googleapis.com/drive/'
    redirect_url = 'urn:ietf:wg:oauth:2.0:oob'
    scopes = ['https://www.googleapis.com/auth/drive.readonly']

    def __init__(self, config, client_id: str, client_secret: str,
                 token_path: str, cache_path: str):
        self.cfg = config
        self.client_id = client_id
        self.client_secret = client_secret
        self.token_path = token_path
        self.cache_path = cache_path
        self.cache = SqliteDict(self.cache_path, tablename='cache',
                                encode=json.dumps, decode=json.loads,
                                autocommit=False)
        self.transcodes_cache = ExpiringDict(max_len=5000,
                                             max_age_seconds=2 * (60 * 60))
        self.token = self._load_token()
        self.token_refresh_lock = Lock()
        self.http = self._new_http_object()

    ############################################################
    # CORE CLASS METHODS
    ############################################################

    def get_auth_link(self):
        auth_url, state = self.http.authorization_url(self.auth_url,
                                                      access_type='offline',
                                                      prompt='select_account')
        return auth_url

    def exchange_code(self, code: str):
        token = self.http.fetch_token(self.token_url, code=code,
                                      client_secret=self.client_secret)
        if 'access_token' in token:
            self._token_saver(token)
        return self.token

    def query(self, path: str, method: str = 'GET',
              fetch_all_pages: bool = False, callbacks={}, **kwargs):
        resp: Response = None
        pages: int = 1
        resp_json = {}
        request_url = self.api_url + path.lstrip('/') if not path.startswith('http') else path

        try:
            while True:
                resp = self._do_query(request_url, method, **kwargs)
                log.debug(f"Request URL: {resp.url}")
                log.debug(f"Request ARG: {kwargs}")
                log.debug(f'Response Status: {resp.status_code} {resp.reason}')

                if 'stream' in kwargs and kwargs['stream']:
                    return True, resp, None

                if 'Content-Type' in resp.headers and 'json' in resp.headers['Content-Type']:
                    if fetch_all_pages:
                        resp_json.pop('nextPageToken', None)
                    new_json = resp.json()
                    # does this page have changes
                    extended_changes = False
                    changes = []
                    if 'changes' in new_json:
                        if 'changes' in resp_json:
                            changes.extend(resp_json['changes'])
                        changes.extend(new_json['changes'])
                        extended_changes = True

                    resp_json.update(new_json)
                    if extended_changes:
                        resp_json['changes'] = changes
                else:
                    return False if resp.status_code != 200 else True, resp, resp.text

                # call page_token_callback to update cached page_token, if specified
                if 'page_token_callback' in callbacks:
                    if 'nextPageToken' in resp_json:
                        callbacks['page_token_callback'](resp_json['nextPageToken'])
                    elif 'newStartPageToken' in resp_json:
                        callbacks['page_token_callback'](resp_json['newStartPageToken'])

                # call data_callback if fetch_all_pages is true
                if fetch_all_pages and 'data_callback' in callbacks:
                    callbacks['data_callback'](resp.json(), callbacks)

                # handle nextPageToken
                if fetch_all_pages and 'nextPageToken' in resp_json and resp_json['nextPageToken']:
                    # there are more pages
                    pages += 1
                    log.info("Fetching extra results from page %d", pages)
                    if 'params' in kwargs:
                        kwargs['params'].update({'pageToken': resp_json['nextPageToken']})
                    elif 'json' in kwargs:
                        kwargs['json'].update({'pageToken': resp_json['nextPageToken']})
                    elif 'data' in kwargs:
                        kwargs['data'].update({'pageToken': resp_json['nextPageToken']})
                    continue

                break

            return True if resp_json and len(resp_json) else False, resp, resp_json if (
                resp_json and len(resp_json)) else resp.text

        except Exception:
            log.exception(f"Exception sending request to {request_url} with kwargs={kwargs}: ")
            return False, resp, None
    ############################################################
    # DRIVE FUNCTIONS
    ############################################################

    def validate_access_token(self):
        success, resp, data = self.query(
            '/v3/changes/startPageToken',
            params={'supportsTeamDrives': self.cfg.google.teamdrive})
        if success and resp.status_code == 200:
            if 'startPageToken' not in data:
                log.error("Failed to validate up-to-date access_token:\n\n%s\n", data)
                return False
            return True
        else:
            log.error("Error validating access token, status_code = %d, data =\n\n%s\n",
                      resp.status_code if resp is not None else 0, data)
        return False

    def get_changes(self, new_items_callback=None, removed_items_callback=None):
        callbacks = {'page_token_callback': self._page_token_saver,
                     'data_callback': self._process_changes}
        if new_items_callback:
            callbacks['new_items_callback'] = new_items_callback
        if removed_items_callback:
            callbacks['removed_items_callback'] = removed_items_callback

        success, resp, data = self.query('/v3/changes', params={
            'pageToken': self.token['page_token'] if 'page_token' in self.token else '1',
            'pageSize': 1000,
            'includeRemoved': True,
            'includeTeamDriveItems': self.cfg.google.teamdrive,
            'supportsTeamDrives': self.cfg.google.teamdrive,
            'fields': 'changes(file(md5Checksum,mimeType,modifiedTime,'
                      'name,parents,teamDriveId,trashed),'
                      'fileId,removed,teamDrive(id,name),'
                      'teamDriveId),newStartPageToken,nextPageToken'},
            fetch_all_pages=True, callbacks=callbacks)
        return

    def get_file(self, file_id, stream=True, headers=None, timeout=30):
        req_url = '/v2/files/%s' % file_id if not file_id.startswith('http') else file_id
        success, resp, data = self.query(req_url, params={
            'includeTeamDriveItems': self.cfg.google.teamdrive,
            'supportsTeamDrives': self.cfg.google.teamdrive,
            'alt': 'media'
        }, stream=stream, headers=headers, timeout=timeout)
        return resp

    def get_stream_link(self, file_id):
        # validate / refresh the current access_token
        if not self.validate_access_token():
            return ''
        log.debug("Validated access_token is current")

        # generate url
        req = Request('GET', f'{self.api_url.rstrip("/")}/v2/files/{file_id}',
                      params={'includeTeamDriveItems': self.cfg.google.teamdrive,
                              'supportsTeamDrives': self.cfg.google.teamdrive,
                              'alt': 'media',
                              'access_token': self.token['access_token']}).prepare()
        log.debug(f'Direct Stream URL: {req.url}')
        return req.url

    def get_transcodes(self, file_id):
        # do we have transcoded versions already in the temporary cache?
        cached_transcodes = self.transcodes_cache.get(file_id, None)
        if cached_transcodes is not None and len(cached_transcodes):
            log.debug(f"Loaded {len(cached_transcodes)} transcode streams from temporary cache for: {file_id}")
            return cached_transcodes

        # retrieve transcoded versions from google docs
        success, resp, data = self.query(f'https://docs.google.com/get_video_info?docid={file_id}')
        if not success or (not data or 'fmt_stream_map' not in data or 'fmt_list' not in data):
            log.error(f"Failed to find transcoded versions data for: {file_id}")
            return None

        # parse main response
        tmp = parse_qs(data)
        tmp_versions = tmp['fmt_list'][0]
        tmp_stream_map = tmp['fmt_stream_map'][0]
        drive_stream_cookie = resp.cookies.get('DRIVE_STREAM', '')

        # parse required variables
        transcode_versions = {}
        transcode_streams = {}

        # parse version list
        for version in tmp_versions.split(','):
            tmp_v = version.split('/')
            transcode_versions[tmp_v[0]] = tmp_v[1].split('x')[1]

        if not len(transcode_versions):
            log.error(f"Failed to parse transcoded versions (fmt_list) for: {file_id}")
            return None

        # parse transcode lists
        for stream in tmp_stream_map.split(','):
            tmp_s = stream.split('|')
            transcode_streams[transcode_versions[tmp_s[0]]] = tmp_s[1]

        if not len(transcode_streams):
            log.error(f"Failed to parse transcoded streams (fmt_stream_map) for: {file_id}")
            return None

        # add the transcode streams to the temporary cache
        self.transcodes_cache[file_id] = transcode_streams
        log.debug(f"Added {len(transcode_streams)} transcode streams to temporary cache for: {file_id}")
        return transcode_streams

    ############################################################
    # CACHE
    ############################################################

    def get_id_metadata(self, item_id, teamdrive_id=None):
        # return cached metadata if available
        cached_metadata = self._get_cached_metadata(item_id)
        if cached_metadata:
            return True, cached_metadata

        # does item_id match teamdrive_id?
        if teamdrive_id is not None and item_id == teamdrive_id:
            success, resp, data = self.query('v3/teamdrives/%s' % str(item_id))
            if success and resp.status_code == 200 and 'name' in data:
                # we successfully retrieved this teamdrive info, lets place a
                # mimeType key in the result so we know it needs to be cached
                data['mimeType'] = 'application/vnd.google-apps.folder'
        else:
            # retrieve file metadata
            success, resp, data = self.query('v3/files/%s' % str(item_id), params={
                'supportsTeamDrives': self.cfg.google.teamdrive,
                'fields': 'id,md5Checksum,mimeType,modifiedTime,name,parents,'
                          'trashed,teamDriveId'})

        if success and resp.status_code == 200:
            return True, data
        else:
            log.error("Error retrieving metadata for item %r:\n\n%s\n", item_id, data)
            return False, data

    def get_id_file_paths(self, item_id, teamdrive_id=None):
        file_paths = []
        added_to_cache = 0

        try:
            def get_item_paths(obj_id, path, paths, new_cache_entries, teamdrive_id=None):
                success, obj = self.get_id_metadata(obj_id, teamdrive_id)
                if not success:
                    return new_cache_entries

                teamdrive_id = teamdrive_id if 'teamDriveId' not in obj else obj['teamDriveId']

                # add item object to cache if we know its not from cache
                if 'mimeType' in obj:
                    # we know this is a new item fetched from the api, because
                    # the cache does not store this field
                    self.add_item_to_cache(obj['id'], obj['name'],
                                           [] if 'parents' not in obj else obj['parents'])
                    new_cache_entries += 1

                if path.strip() == '':
                    path = obj['name']
                else:
                    path = os.path.join(obj['name'], path)

                if 'parents' in obj and obj['parents']:
                    for parent in obj['parents']:
                        new_cache_entries += get_item_paths(parent, path, paths,
                                                            new_cache_entries, teamdrive_id)

                if (not obj or 'parents' not in obj or not obj['parents']) and len(path):
                    paths.append(path)
                    return new_cache_entries
                return new_cache_entries

            added_to_cache += get_item_paths(item_id, '', file_paths, added_to_cache, teamdrive_id)
            if added_to_cache:
                log.debug("Dumping cache due to new entries!")
                self._dump_cache()

            if len(file_paths):
                return True, file_paths
            else:
                return False, file_paths

        except Exception:
            log.exception("Exception retrieving filepaths for '%s': ", item_id)
            return False, []

    def add_item_to_cache(self, item_id, item_name, item_parents):
        if item_id not in self.cache:
            log.debug("Added '%s' to cache: %s", item_id, item_name)
        self.cache[item_id] = {'name': item_name, 'parents': item_parents}
        return

    def remove_item_from_cache(self, item_id):
        if self.cache.pop(item_id, None):
            return True
        return False

    def get_item_name_from_cache(self, item_id):
        try:
            item = self.cache.get(item_id)
            return item['name'] if isinstance(item, dict) else 'Unknown'
        except Exception:
            pass
        return 'Unknown'

    def get_item_from_cache(self, item_id):
        try:
            item = self.cache.get(item_id, None)
            return item
        except Exception:
            pass
        return None

    ############################################################
    # INTERNALS
    ############################################################

    def _do_query(self, request_url: str, method: str, **kwargs):
        tries: int = 0
        max_tries: int = 2
        lock_acquirer: bool = False
        resp: Response = None
        use_timeout: int = 30

        # override default timeout
        if 'timeout' in kwargs and isinstance(kwargs['timeout'], int):
            use_timeout = kwargs['timeout']
            kwargs.pop('timeout', None)

        # remove un-needed kwargs
        kwargs.pop('fetch_all_pages', None)
        kwargs.pop('page_token_callback', None)

        # do query
        while tries < max_tries:
            if self.token_refresh_lock.locked() and not lock_acquirer:
                log.debug("Token refresh lock is currently acquired... trying again in 500ms")
                time.sleep(0.5)
                continue

            if method == 'POST':
                resp = self.http.post(request_url, timeout=use_timeout, **kwargs)
            elif method == 'PATCH':
                resp = self.http.patch(request_url, timeout=use_timeout, **kwargs)
            elif method == 'DELETE':
                resp = self.http.delete(request_url, timeout=use_timeout, **kwargs)
            else:
                resp = self.http.get(request_url, timeout=use_timeout, **kwargs)
            tries += 1

            if resp.status_code == 401 and tries < max_tries:
                # unauthorized error, lets refresh token and retry
                self.token_refresh_lock.acquire(False)
                lock_acquirer = True
                log.warning(f"Unauthorized Response (Attempts {tries}/{max_tries})")
                # force the session to consider the token expired
                # (original used `time() - 10`; `time.time()` matches the
                # `import time` implied by `time.sleep` above)
                self.token['expires_at'] = time.time() - 10
                self.http = self._new_http_object()
            else:
                break

        return resp

    def _load_token(self):
        try:
            if not os.path.exists(self.token_path):
                return {}
            with open(self.token_path, 'r') as fp:
                return json.load(fp)
        except Exception:
            log.exception(f"Exception loading token from {self.token_path}: ")
        return {}

    def _dump_token(self):
        try:
            with open(self.token_path, 'w') as fp:
                json.dump(self.token, fp, indent=2)
            return True
        except Exception:
            log.exception(f"Exception dumping token to {self.token_path}: ")
        return False

    def _token_saver(self, token: dict):
        # update internal token dict
        self.token.update(token)
        try:
            if self.token_refresh_lock.locked():
                self.token_refresh_lock.release()
        except Exception:
            log.exception("Exception releasing token_refresh_lock: ")
        self._dump_token()
        log.info("Renewed access token!")
        return

    def _page_token_saver(self, page_token: str):
        # update internal token dict
        self.token['page_token'] = page_token
        self._dump_token()
        return

    def _new_http_object(self):
        return OAuth2Session(client_id=self.client_id,
                             redirect_uri=self.redirect_url,
                             scope=self.scopes,
                             auto_refresh_url=self.token_url,
                             auto_refresh_kwargs={'client_id': self.client_id,
                                                  'client_secret': self.client_secret},
                             token_updater=self._token_saver,
                             token=self.token)

    def _get_cached_metadata(self, item_id):
        if item_id in self.cache:
            return self.cache[item_id]
        return None

    def _dump_cache(self):
        self.cache.commit()
        return

    def _remove_unwanted_paths(self, paths_list: list, mime_type: str):
        # remove paths that are not allowed - this is always enabled
        for item_path in copy(paths_list):
            allowed_path = False
            for allowed_file_path in self.cfg.google.allowed.file_paths:
                if item_path.lower().startswith(allowed_file_path.lower()):
                    allowed_path = True
                    break
            if not allowed_path:
                log.debug("Ignoring %r because its not an allowed path", item_path)
                paths_list.remove(item_path)
                continue

        # remove disallowed extensions
        if self.cfg.google.allowed.file_extensions:
            for item_path in copy(paths_list):
                allowed_file = False
                for allowed_extension in self.cfg.google.allowed.file_extensions_list:
                    if item_path.lower().endswith(allowed_extension.lower()):
                        allowed_file = True
                        break
                if not allowed_file:
                    log.debug("Ignoring %r because it was not an allowed extension", item_path)
                    paths_list.remove(item_path)

        # remove disallowed mimes
        if self.cfg.google.allowed.mime_types:
            allowed_file = False
            for allowed_mime in self.cfg.google.allowed.mime_types_list:
                if allowed_mime.lower() in mime_type.lower():
                    if 'video' in mime_type.lower():
                        # validate this is not a .sub file, which for some
                        # reason google shows as video/MP2G
                        double_checked_allowed = True
                        for item_path in paths_list:
                            if item_path.lower().endswith('.sub'):
                                double_checked_allowed = False
                        if double_checked_allowed:
                            allowed_file = True
                            break
                    else:
                        allowed_file = True
                        break

            if not allowed_file:
                log.debug("Ignoring %s because it was not an allowed mime: %s", paths_list, mime_type)
                for item_path in copy(paths_list):
                    paths_list.remove(item_path)

    def _process_changes(self, data: dict, callbacks: dict = {}):
        removed_file_paths = {}
        added_file_paths = {}
        if not data or 'changes' not in data:
            log.error("There were no changes to process")
            return

        log.info("Processing %d changes", len(data['changes']))

        # process changes
        for change in data['changes']:
            if 'file' in change and 'fileId' in change:
                # dont consider trashed/removed events for processing
                if ('trashed' in change['file'] and change['file']['trashed']) or (
                        'removed' in change and change['removed']):
                    # store the removed file paths - only if we have this item
                    # cached, otherwise we are not interested as we would not
                    # have stored it anyway...
                    item_exists = self.get_item_from_cache(change['fileId'])
                    if item_exists is not None:
                        success, item_paths = self.get_id_file_paths(
                            change['fileId'],
                            change['file']['teamDriveId'] if 'teamDriveId' in change['file'] else None)
                        self._remove_unwanted_paths(
                            item_paths,
                            change['file']['mimeType'] if 'mimeType' in change['file'] else 'Unknown')
                        if success and len(item_paths):
                            if change['fileId'] in removed_file_paths:
                                removed_file_paths[change['fileId']].extend(item_paths)
                            else:
                                removed_file_paths[change['fileId']] = item_paths
                    # remove item from cache
                    if self.remove_item_from_cache(change['fileId']):
                        log.debug("Removed '%s' from cache: %s", change['fileId'], change['file']['name'])
                    continue

                existing_cache_item = self.get_item_from_cache(change['fileId'])
                existing_success, existing_cache_item_paths = self.get_id_file_paths(
                    change['fileId'],
                    change['file']['teamDriveId'] if 'teamDriveId' in change['file'] else None) \
                    if existing_cache_item is not None else (None, None)

                # we always want to add changes to the cache so renames etc
                # can be reflected inside the cache
                self.add_item_to_cache(change['fileId'], change['file']['name'],
                                       [] if 'parents' not in change['file'] else change['file']['parents'])

                # dont process folder events
                if 'mimeType' in change['file'] and 'vnd.google-apps.folder' in change['file']['mimeType']:
                    # ignore this change as we dont want to scan folders
                    continue

                # get this files paths
                success, item_paths = self.get_id_file_paths(
                    change['fileId'],
                    change['file']['teamDriveId'] if 'teamDriveId' in change['file'] else None)

                # remove unwanted paths
                if existing_success and len(existing_cache_item_paths):
                    self._remove_unwanted_paths(
                        existing_cache_item_paths,
                        change['file']['mimeType'] if 'mimeType' in change['file'] else 'Unknown')
                if success and len(item_paths):
                    self._remove_unwanted_paths(
                        item_paths,
                        change['file']['mimeType'] if 'mimeType' in change['file'] else 'Unknown')

                # was this an existing item?
                if (existing_cache_item is not None and existing_success and
                        len(existing_cache_item_paths)) and (success and len(item_paths)):
                    # this was an existing item, and we are re-processing it again;
                    # we need to find the differences between the before and after paths
                    existing_path_set = set(existing_cache_item_paths)
                    new_path_set = set(item_paths)

                    removed_item_paths = existing_path_set.difference(new_path_set)
                    added_item_paths = new_path_set.difference(existing_path_set)

                    if len(removed_item_paths):
                        if change['fileId'] in removed_file_paths:
                            removed_file_paths[change['fileId']].extend(list(removed_item_paths))
                        else:
                            removed_file_paths[change['fileId']] = list(removed_item_paths)
                    if len(added_item_paths):
                        if change['fileId'] in added_file_paths:
                            added_file_paths[change['fileId']].extend(list(added_item_paths))
                        else:
                            added_file_paths[change['fileId']] = list(added_item_paths)

                elif success and len(item_paths):
                    # these are new paths/files that were not already in the cache
                    if change['fileId'] in added_file_paths:
                        added_file_paths[change['fileId']].extend(item_paths)
                    else:
                        added_file_paths[change['fileId']] = item_paths

            elif 'teamDrive' in change and 'teamDriveId' in change:
                # this is a teamdrive change
                # dont consider trashed/removed events for processing
                if 'removed' in change and change['removed']:
                    # remove item from cache
                    if self.remove_item_from_cache(change['teamDriveId']):
                        log.info("Removed teamDrive '%s' from cache: %s", change['teamDriveId'],
                                 change['teamDrive']['name'] if 'name' in change['teamDrive']
                                 else 'Unknown teamDrive')
                    continue

                if 'id' in change['teamDrive'] and 'name' in change['teamDrive']:
                    # we always want to add changes to the cache so renames
                    # etc can be reflected inside the cache
                    self.add_item_to_cache(change['teamDrive']['id'], change['teamDrive']['name'], [])
                continue

        # always dump the cache after running changes
        self._dump_cache()

        log.info('%d added / %d removed', len(added_file_paths), len(removed_file_paths))

        # call further callbacks
        if len(removed_file_paths) and 'removed_items_callback' in callbacks:
            callbacks['removed_items_callback'](removed_file_paths)
        if len(added_file_paths) and 'new_items_callback' in callbacks:
            callbacks['new_items_callback'](added_file_paths)
        return
class SqliteDb:
    KEY_PREFIX = "slb:"
    KEY_SUBSCRIBED_CHANNELS = KEY_PREFIX + "subscribed_channels"
    KEY_NOTIFICATION_TASK_STORE = KEY_PREFIX + "notification_task_store"
    KEY_GUILD = KEY_PREFIX + "guild:{}"

    def __init__(self, sqlite_location: str):
        self.sqlite = SqliteDict(sqlite_location, autocommit=False)

    def init_defaults(self) -> None:
        if self.KEY_NOTIFICATION_TASK_STORE not in self.sqlite:
            self.sqlite[self.KEY_NOTIFICATION_TASK_STORE] = [False, ""]
        if self.KEY_SUBSCRIBED_CHANNELS not in self.sqlite:
            self.sqlite[self.KEY_SUBSCRIBED_CHANNELS] = set()
        self.sqlite.commit()

    def get_notification_task_store(self) -> Tuple[bool, Dict]:
        return self.sqlite[self.KEY_NOTIFICATION_TASK_STORE]

    def set_notification_task_store(self, ls_notif_sent: bool, li_embed_dict: Dict) -> None:
        self.sqlite[self.KEY_NOTIFICATION_TASK_STORE] = [ls_notif_sent, li_embed_dict]
        self.sqlite.commit()

    def set_guild_mentions(self, guild_id: int, to_mention: str) -> None:
        self.sqlite[self.KEY_GUILD.format(guild_id)] = to_mention
        self.sqlite.commit()

    def get_guild_mentions(self, guild_id: int) -> str:
        return self.sqlite.get(self.KEY_GUILD.format(guild_id), "")

    def delete_guild_mentions(self, guild_id: int) -> int:
        if self.KEY_GUILD.format(guild_id) in self.sqlite:
            del self.sqlite[self.KEY_GUILD.format(guild_id)]
            self.sqlite.commit()
            return 1
        return 0

    def get_subbed_channels(self) -> Set[int]:
        return self.sqlite[self.KEY_SUBSCRIBED_CHANNELS]

    def add_subbed_channel(self, channel_id: int) -> int:
        channels = self.sqlite[self.KEY_SUBSCRIBED_CHANNELS]
        if channel_id not in channels:
            channels.add(channel_id)
            self.sqlite[self.KEY_SUBSCRIBED_CHANNELS] = channels
            self.sqlite.commit()
            return 1
        return 0

    def remove_subbed_channel(self, channel_id: int) -> int:
        channels = self.sqlite[self.KEY_SUBSCRIBED_CHANNELS]
        if channel_id in channels:
            channels.remove(channel_id)
            self.sqlite[self.KEY_SUBSCRIBED_CHANNELS] = channels
            self.sqlite.commit()
            return 1
        return 0

    def remove_subbed_channels(self, channels_to_remove: Set[int]) -> None:
        self.sqlite[self.KEY_SUBSCRIBED_CHANNELS] = (
            self.sqlite[self.KEY_SUBSCRIBED_CHANNELS] - channels_to_remove)
        self.sqlite.commit()

    def subbed_channels_count(self) -> int:
        return len(self.sqlite[self.KEY_SUBSCRIBED_CHANNELS])

    def stop(self):
        self.sqlite.commit()
        self.sqlite.close()
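# Usage sketch (the database path and channel id are examples):
store = SqliteDb('slb.sqlite')
store.init_defaults()
store.add_subbed_channel(1234)
print(store.get_subbed_channels())  # {1234}
store.stop()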
class MockBinanceManager(BinanceAPIManager):
    def __init__(
        self,
        config: Config,
        db: Database,
        logger: Logger,
        start_date: datetime = None,
        start_balances: Dict[str, float] = None,
    ):
        super().__init__(config, db, logger)
        self.config = config
        self.datetime = start_date or datetime(2021, 1, 1)
        self.balances = start_balances or {config.BRIDGE.symbol: 100}
        self.cache = SqliteDict("data/backtest_cache.db")

    def setup_websockets(self):
        pass  # No websockets are needed for backtesting

    def increment(self, interval=1):
        self.datetime += timedelta(minutes=interval)

    def get_fee(self, origin_coin: Coin, target_coin: Coin, selling: bool):
        return 0.0075

    def get_ticker_price(self, ticker_symbol: str):
        """Get the ticker price of a specific coin."""
        target_date = self.datetime.replace(second=0, microsecond=0)
        target_date_str = self.datetime.isoformat(timespec="seconds")
        key = f"{ticker_symbol} - {target_date}"
        val = self.cache.get(key, None)
        if val is None:
            end_date = self.datetime + timedelta(minutes=1000)
            if end_date > datetime.now():
                end_date = datetime.now()
            end_date_str = end_date.isoformat(timespec="seconds")
            self.logger.info(
                f"Fetching prices for {ticker_symbol} between {target_date} and {end_date}")
            # Use the internal binance_client method because the public one
            # doesn't actually pass on limits.
            results = self.binance_client._historical_klines(
                ticker_symbol, "1m",
                start_str=target_date_str, end_str=end_date_str, limit=1000)
            prices = {}
            for result in results:
                result_date = datetime.utcfromtimestamp(result[0] / 1000)
                result_date = result_date.replace(second=0, microsecond=0)
                price = float(result[1])
                prices[f"{ticker_symbol} - {result_date}"] = price
            # Verify all intervals were returned; explicitly mark missing ones
            # so we can skip fetching them again.
            for verify_date in (target_date + timedelta(minutes=n) for n in range(1000)):
                verify_key = f"{ticker_symbol} - {verify_date}"
                self.cache[verify_key] = prices.get(verify_key, "MISSING")
            self.cache.commit()
            val = self.cache.get(key, None)
        if val == "MISSING":
            return None
        return val

    def get_currency_balance(self, currency_symbol: str, force=False):
        """Get the balance of a specific coin."""
        return self.balances.get(currency_symbol, 0)

    def buy_alt(self, origin_coin: Coin, target_coin: Coin):
        origin_symbol = origin_coin.symbol
        target_symbol = target_coin.symbol

        target_balance = self.get_currency_balance(target_symbol)
        from_coin_price = self.get_ticker_price(origin_symbol + target_symbol)

        order_quantity = self._buy_quantity(origin_symbol, target_symbol,
                                            target_balance, from_coin_price)
        target_quantity = order_quantity * from_coin_price
        self.balances[target_symbol] -= target_quantity
        self.balances[origin_symbol] = self.balances.get(origin_symbol, 0) + order_quantity * (
            1 - self.get_fee(origin_coin, target_coin, False))
        self.logger.info(
            f"Bought {origin_symbol}, balance now: {self.balances[origin_symbol]} - bridge: "
            f"{self.balances[target_symbol]}")

        event = defaultdict(lambda: None,
                            order_price=from_coin_price,
                            cumulative_quote_asset_transacted_quantity=0)
        return BinanceOrder(event)

    def sell_alt(self, origin_coin: Coin, target_coin: Coin):
        origin_symbol = origin_coin.symbol
        target_symbol = target_coin.symbol

        origin_balance = self.get_currency_balance(origin_symbol)
        from_coin_price = self.get_ticker_price(origin_symbol + target_symbol)

        order_quantity = self._sell_quantity(origin_symbol, target_symbol, origin_balance)
        target_quantity = order_quantity * from_coin_price
        self.balances[target_symbol] = self.balances.get(target_symbol, 0) + target_quantity * (
            1 - self.get_fee(origin_coin, target_coin, True))
        self.balances[origin_symbol] -= order_quantity
        self.logger.info(
            f"Sold {origin_symbol}, balance now: {self.balances[origin_symbol]} - bridge: "
            f"{self.balances[target_symbol]}")

        return {"price": from_coin_price}

    def collate_coins(self, target_symbol: str):
        total = 0
        for coin, balance in self.balances.items():
            if coin == target_symbol:
                total += balance
                continue
            if coin == self.config.BRIDGE.symbol:
                price = self.get_ticker_price(target_symbol + coin)
                if price is None:
                    continue
                total += balance / price
            else:
                price = self.get_ticker_price(coin + target_symbol)
                if price is None:
                    continue
                total += price * balance
        return total

    def close(self):
        self.cache.close()
import os

import pandas as pd
from math import inf
from sqlitedict import SqliteDict
from statsmodels.tsa.ar_model import AutoReg

cache = SqliteDict('changed.db', autocommit=True)

# A US manufacturer buys raw materials in multiple currencies
purchases = pd.read_excel('Purchases.xlsx')

# For each of those currencies, find the best model to forecast prices
best_model = {}
for currency in purchases.currency:
    print('Currency', currency)
    file_time = os.stat(f'{currency}.xlsx').st_mtime
    if cache.get(currency, (0, 0))[0] < file_time:
        data = pd.read_excel(f'{currency}.xlsx')
        data = data[data[currency] > 0]
        best_aic, best_fit = inf, None
        for lags in (3, 5, 7, 10, 14, 28, 60, 90, 120, 183, 365, 730, 1095):
            print('  Lag', lags)
            model = AutoReg(data[currency], lags=lags)
            fit = model.fit()
            if fit.aic < best_aic:
                best_aic, best_fit = fit.aic, fit
        cache[currency] = (file_time, best_fit)
    best_model[currency] = cache[currency]

# Estimate next month's price increase assuming the same volume as today
forecasted_value = 0
for index, row in purchases.iterrows():
"author": line["author"], "subreddit": line["subreddit"].lower() if k == "all" else k, "timestamp": int(line["created_utc"]) } val = tmp_dict.get(subreddit, []) val.append(dict_val) tmp_dict[subreddit] = val if count % 1000000 == 0: print(datetime.datetime.now() - now, len(list(tmp_dict.keys()))) now = datetime.datetime.now() c = 0 for key, item in tmp_dict.items(): c += 1 try: val = dict_db.get(key, []) val += item dict_db[key] = val except: print("ERROR", key) continue if count % 1000000 == 0: print(datetime.datetime.now() - now) now = datetime.datetime.now() dict_db.commit() dict_db.commit() dict_db.close()
import pandas as pd
from math import inf
from sqlitedict import SqliteDict
from statsmodels.tsa.ar_model import AutoReg

cache = SqliteDict('precompute.db', autocommit=True)

# A US manufacturer buys raw materials in multiple currencies
purchases = pd.read_excel('Purchases.xlsx')

# For each of those currencies, find the best model to forecast prices
best_model = {}
for currency in purchases.currency:
    print('Currency', currency)
    data = pd.read_excel(f'{currency}.xlsx')
    data = data[data[currency] > 0]
    best_aic, best_fit, best_lags = inf, None, None
    check_lags = cache.get(
        currency, (3, 5, 7, 10, 14, 28, 60, 90, 120, 183, 365, 730, 1095))
    for lags in check_lags:
        print('  Lags', lags)
        model = AutoReg(data[currency], lags=lags)
        fit = model.fit()
        if fit.aic < best_aic:
            best_aic, best_fit, best_lags = fit.aic, fit, lags
    cache[currency] = (best_lags, )
    best_model[currency] = best_fit

# Estimate next month's price increase assuming the same volume as today
forecasted_value = 0
for index, row in purchases.iterrows():
    fit = best_model[row.currency]
    prices = fit.predict(fit.model.nobs, fit.model.nobs + 30)
    change = prices.iloc[-1] / prices.iloc[0]
class Twizzle(object):
    """Twizzle multi-purpose benchmarking system -- base class."""

    def __init__(self, sDBPath):
        """Constructor of the Twizzle class.

        Note:
            Please pass the path of the SQLite database as parameter.

        Args:
            sDBPath (str): Path to the SQLite database.
        """
        if sDBPath is None:
            raise Exception("Path to SQL-Database has to be defined")
        self._db = SqliteDict(sDBPath)

    def add_challenge(self, sName, aOriginalObjects, aComparativeObjects,
                      aTargetDecisions, dicMetadata={}):
        """Adds a challenge under the given name to the database.

        Note:
            The three lists describe a table of the following format:

            | Original object | Comparative object | Target decision |
            |-----------------|--------------------|-----------------|
            | Img1.png        | Img1_scaled.png    | True            |
            | Img2.png        | Img2_brighter.png  | True            |
            | Img2.png        | Img9.png           | False           |

        Args:
            sName (str): The name of the challenge.
            aOriginalObjects (:obj:`list` of :obj:`str`): List of paths of the
                original objects.
            aComparativeObjects (:obj:`list` of :obj:`str`): List of paths of
                the objects that should be compared to the original objects at
                the same position in the list.
            aTargetDecisions (:obj:`list` of :obj:`bool`): List of booleans
                defining whether the objects linked in aOriginalObjects and
                aComparativeObjects being at the same position in the list are
                the same (True) or not (False).
            dicMetadata (:obj:): An object defining metadata for the
                challenge, like what printer was used or what kind of attack
                using which parameters was performed.

        Returns:
            None
        """
        # catch wrong parameters
        if (not sName) or (aOriginalObjects is None) or (
                aComparativeObjects is None) or (aTargetDecisions is None):
            raise Exception("Parameters can not be None.")
        if not (len(aOriginalObjects) == len(aComparativeObjects) == len(aTargetDecisions)):
            raise Exception(
                "Object sets and target decisions have to have the same amount of entries.")
        if not (all(isinstance(x, str) for x in aOriginalObjects)
                and all(isinstance(x, str) for x in aComparativeObjects)):
            raise Exception(
                "All objects have to be defined as path given as string.")
        # accept either a plain list of bools or a boolean numpy array
        # (the original chained condition could raise AttributeError on
        # non-arrays; this restates the intended check)
        if not (all(isinstance(x, bool) for x in aTargetDecisions)
                or (isinstance(aTargetDecisions, np.ndarray)
                    and aTargetDecisions.dtype == np.dtype("bool"))):
            raise Exception("The target decisions have to be boolean only.")

        # get current challenges from database
        aChallenges = self._db.get(DB_CHALLENGES_KEY, [])

        # test whether the name was used before
        aChallengesSameName = [ch for ch in aChallenges if ch["challenge"] == sName]
        if len(aChallengesSameName) != 0:
            raise Exception(
                "Challenge name %s is already in use. Define another one. Aborting." % sName)

        # append new challenge
        dicChallenge = {
            "challenge": sName,
            "originalObjects": aOriginalObjects,
            "comparativeObjects": aComparativeObjects,
            "targetDecisions": aTargetDecisions
        }
        # add additional information if given
        if dicMetadata:
            dicChallenge = {**dicMetadata, **dicChallenge}
        aChallenges.append(dicChallenge)
        self._db[DB_CHALLENGES_KEY] = aChallenges
        self._db.commit()

    def del_challenge(self, sName):
        """Deletes an existing challenge by its name.

        Args:
            sName (str): The name of the challenge to be deleted.

        Returns:
            None
        """
        # get current challenges from database
        aChallenges = self._db.get(DB_CHALLENGES_KEY, [])
        aMatches = [ch for ch in aChallenges if ch["challenge"] == sName]
        if len(aMatches) == 0:
            raise Exception("No challenge named %s found." % sName)
        # remove element
        aChallenges.remove(aMatches[0])
        # save new db
        self._db[DB_CHALLENGES_KEY] = aChallenges
        self._db.commit()

    def get_challenges(self):
        """Gets a list of all defined challenges.

        Returns:
            :obj:`list` of :obj:`obj`: List of all defined challenges.
        """
        return self._db.get(DB_CHALLENGES_KEY, [])

    def get_challenge(self, sChallengeName):
        """Gets a single challenge object.

        Args:
            sChallengeName (str): The name of the challenge to get.

        Returns:
            :obj:`obj`: Object defining the challenge named sChallengeName.
        """
        aChallenges = self._db.get(DB_CHALLENGES_KEY, [])
        aMatches = [ch for ch in aChallenges if ch["challenge"] == sChallengeName]
        if len(aMatches) == 0:
            raise Exception("No challenge with name %s found." % sChallengeName)
        return aMatches[0]

    def clear_challenges(self):
        """Clears all challenge entries from the database."""
        self._db[DB_CHALLENGES_KEY] = []
        self._db.commit()

    def run_test(self, sChallengeName, fnCallback, dicCallbackParameters={},
                 autosave_to_db=False):
        """Runs a single challenge as a test using the given callback function
        and optional parameters.

        Note:
            fnCallback has to fulfill the following specification:

            Parameters:
                fnCallback(aOriginalObjects, aComparativeObjects, **dicCallbackParameters)
                - aOriginalObjects: list of strings describing paths to original objects
                - aComparativeObjects: list of strings describing paths to comparative objects
                ... arbitrary number of further parameters

            Returns:
                aDecisions, dicAdditionalInformation = fnCallback(...)
                - aDecisions: list of boolean decisions describing whether the
                  algorithm has decided that the original object and the
                  comparative object are the same (True) or not (False)
                - dicAdditionalInformation: the algorithm can supply additional
                  information that can be used in the evaluation later on to
                  compare different settings

        Args:
            sChallengeName (str): The challenge that should be executed.
            fnCallback (function): Pointer to a wrapper function that tests a
                challenge on a specific algorithm and decides whether the
                objects are the same or not, depending on its decision
                algorithm.
            dicCallbackParameters (:obj:): Dictionary defining parameters for
                the function in fnCallback.

        Returns:
            dicTest: Dictionary of test results that can be saved to the db.
        """
        if not sChallengeName or not fnCallback:
            raise Exception("Parameters are not allowed to be None.")

        dicChallenge = self.get_challenge(sChallengeName)
        sChallengeName = dicChallenge["challenge"]
        aOriginalObjects = dicChallenge["originalObjects"]
        aComparativeObjects = dicChallenge["comparativeObjects"]
        aTargetDecisions = dicChallenge["targetDecisions"]

        # run challenge
        aDecisions, dicAdditionalInformation = fnCallback(
            aOriginalObjects, aComparativeObjects, **dicCallbackParameters)

        # check that the number of decisions is right
        if len(aDecisions) != len(aTargetDecisions):
            raise Exception(
                "Array of decisions is not the same size as the given set of objects. Aborting.")

        # calculate rates
        lTP = np.sum(np.logical_and(aDecisions, aTargetDecisions))
        lTN = np.sum(np.logical_and(np.logical_not(aDecisions),
                                    np.logical_not(aTargetDecisions)))
        lFP = np.sum(np.logical_and(aDecisions, np.logical_not(aTargetDecisions)))
        lFN = np.sum(np.logical_and(np.logical_not(aDecisions), aTargetDecisions))

        # true positive rate / recall -- robustness in PIH
        dTPR = lTP / (lTP + lFN) if ((lTP + lFN) > 0) else 0.
        # true negative rate -- sensitivity
        dTNR = lTN / (lTN + lFP) if ((lTN + lFP) > 0) else 0.
        # false positive rate / FAR
        dFPR = 1 - dTNR
        # false negative rate / FRR
        dFNR = 1 - dTPR
        dAccuracy = (lTP + lTN) / (lTP + lTN + lFP + lFN)
        dPrecision = lTP / (lTP + lFP) if ((lTP + lFP) > 0.) else 0.
        dF1score = 2 * ((dPrecision * dTPR) / (dPrecision + dTPR)) if (
            (dPrecision + dTPR) > 0) else 0.

        # fill test object
        dicTest = dicAdditionalInformation
        dicTest["challenge"] = sChallengeName
        # dicTest["TP"] = lTP
        # dicTest["TN"] = lTN
        # dicTest["FP"] = lFP
        # dicTest["FN"] = lFN
        dicTest["TPR"] = dTPR  # recall
        dicTest["TNR"] = dTNR
        dicTest["FPR"] = dFPR  # FAR
        dicTest["FNR"] = dFNR  # FRR
        dicTest["Accuracy"] = dAccuracy
        dicTest["Precision"] = dPrecision
        dicTest["F1_score"] = dF1score

        # save test in db
        if autosave_to_db:
            self.__save_test(dicTest)
        return dicTest

    def __save_test(self, dicTest):
        """Saves a test object to the database."""
        if not dicTest:
            raise Exception("Test object must not be None.")
        aTests = self._db.get(DB_TESTS_KEY, [])
        aTests.append(dicTest)
        self._db[DB_TESTS_KEY] = aTests
        self._db.commit()

    def save_test_threadsafe(self, dicTest, lock):
        """Saves a test object to the database, thread-safely."""
        lock.acquire()
        self.__save_test(dicTest)
        lock.release()

    def get_tests(self):
        """Gets all tests.

        Returns:
            :obj:`list` of :obj:`obj`: List of all tests executed.
        """
        return self._db.get(DB_TESTS_KEY, [])

    def clear_tests(self):
        """Deletes all tests from the database."""
        self._db[DB_TESTS_KEY] = []
        self._db.commit()
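# Usage sketch with a trivial decision callback. The database path, file
# names, and the equality rule are made up; DB_CHALLENGES_KEY/DB_TESTS_KEY are
# assumed module-level constants.
def naive_compare(aOriginalObjects, aComparativeObjects):
    decisions = [o == c for o, c in zip(aOriginalObjects, aComparativeObjects)]
    return decisions, {"algorithm": "naive_compare"}

tz = Twizzle("benchmark.db")
tz.add_challenge("identity", ["img1.png", "img2.png"],
                 ["img1.png", "img9.png"], [True, False])
result = tz.run_test("identity", naive_compare, autosave_to_db=True)
print(result["Accuracy"])  # 1.0 for this toy challenge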
class SimServer(object):
    """
    Top-level functionality for similarity services. A similarity server takes
    care of:

    1. creating semantic models
    2. indexing documents using these models
    3. finding the most similar documents in an index.

    An object of this class can be shared across network via Pyro, to answer
    remote client requests. It is thread safe. Using a server concurrently
    from multiple processes is safe for reading = answering similarity
    queries. Modifying (training/indexing) is realized via locking =
    serialized internally.
    """

    def __init__(self, basename, use_locks=False):
        """
        All data will be stored under directory `basename`. If there is a
        server there already, it will be loaded (resumed).

        The server object is stateless in RAM -- its state is defined entirely
        by its location. There is therefore no need to store the server
        object.
        """
        if not os.path.isdir(basename):
            raise ValueError("%r must be a writable directory" % basename)
        self.basename = basename
        self.use_locks = use_locks
        self.lock_update = threading.RLock() if use_locks else gensim.utils.nocm
        try:
            self.fresh_index = SimIndex.load(self.location('index_fresh'))
        except:
            logger.debug("starting a new fresh index")
            self.fresh_index = None
        try:
            self.opt_index = SimIndex.load(self.location('index_opt'))
        except:
            logger.debug("starting a new optimized index")
            self.opt_index = None
        try:
            self.model = SimModel.load(self.location('model'))
        except:
            self.model = None
        self.payload = SqliteDict(self.location('payload'), autocommit=True,
                                  journal_mode=JOURNAL_MODE)
        self.flush(save_index=False, save_model=False, clear_buffer=True)
        logger.info("loaded %s" % self)

    def location(self, name):
        return os.path.join(self.basename, name)

    @gensim.utils.synchronous('lock_update')
    def flush(self, save_index=False, save_model=False, clear_buffer=False):
        """Commit all changes, clear all caches."""
        if save_index:
            if self.fresh_index is not None:
                self.fresh_index.save(self.location('index_fresh'))
            if self.opt_index is not None:
                self.opt_index.save(self.location('index_opt'))
        if save_model:
            if self.model is not None:
                self.model.save(self.location('model'))
        self.payload.commit()
        if clear_buffer:
            if hasattr(self, 'fresh_docs'):
                try:
                    # erase all buffered documents + file on disk
                    self.fresh_docs.terminate()
                except:
                    pass
            # buffer defaults to a random location in temp
            self.fresh_docs = SqliteDict(journal_mode=JOURNAL_MODE)
        self.fresh_docs.sync()

    def close(self):
        """Explicitly close open file handles, databases etc."""
        try:
            self.payload.close()
        except:
            pass
        try:
            self.model.close()
        except:
            pass
        try:
            self.fresh_index.close()
        except:
            pass
        try:
            self.opt_index.close()
        except:
            pass
        try:
            self.fresh_docs.terminate()
        except:
            pass

    def __del__(self):
        """When the server goes out of scope, make an effort to close its DBs."""
        self.close()

    @gensim.utils.synchronous('lock_update')
    def buffer(self, documents):
        """
        Add a sequence of documents to be processed (indexed or trained on).

        Here, the documents are simply collected; real processing is done
        later, during the `self.index` or `self.train` calls.

        `buffer` can be called repeatedly; the result is the same as if it was
        called once, with a concatenation of all the partial document batches.
        The point is to save memory when sending large corpora over network:
        the entire `documents` must be serialized into RAM. See
        `utils.upload_chunked()`.
        A call to `flush()` clears this documents-to-be-processed buffer
        (`flush` is also implicitly called when you call `index()` and
        `train()`).
        """
        logger.info("adding documents to temporary buffer of %s" % (self))
        for doc in documents:
            docid = doc['id']
            # logger.debug("buffering document %r" % docid)
            if docid in self.fresh_docs:
                logger.warning("asked to re-add id %r; rewriting old value" % docid)
            self.fresh_docs[docid] = doc
        self.fresh_docs.sync()

    @gensim.utils.synchronous('lock_update')
    def train(self, corpus=None, method='auto', clear_buffer=True, params=None):
        """
        Create an indexing model. Will overwrite the model if it already
        exists. All indexes become invalid, because documents in them use a
        now-obsolete representation.

        The model is trained on documents previously entered via `buffer`, or
        directly on `corpus`, if specified.
        """
        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)
        if not self.fresh_docs:
            msg = "train called but no training corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)
        if method == 'auto':
            numdocs = len(self.fresh_docs)
            if numdocs < 1000:
                logging.warning(
                    "too few training documents; using simple log-entropy model instead of latent semantic indexing")
                method = 'logentropy'
            else:
                method = 'lsi'
        if params is None:
            params = {}
        self.model = SimModel(self.fresh_docs, method=method, params=params)
        self.flush(save_model=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def index(self, corpus=None, clear_buffer=True):
        """
        Permanently index all documents previously added via `buffer`, or
        directly index documents from `corpus`, if specified.

        The indexing model must already exist (see `train`) before this
        function is called.
        """
        if not self.model:
            msg = 'must initialize model for %s before indexing documents' % self.basename
            logger.error(msg)
            raise AttributeError(msg)
        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)
        if not self.fresh_docs:
            msg = "index called but no indexing corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)
        if not self.fresh_index:
            logger.info("starting a new fresh index for %s" % self)
            self.fresh_index = SimIndex(self.location('index_fresh'), self.model.num_features)
        self.fresh_index.index_documents(self.fresh_docs, self.model)
        if self.opt_index is not None:
            self.opt_index.delete(self.fresh_docs.keys())
        logger.info("storing document payloads")
        for docid in self.fresh_docs:
            payload = self.fresh_docs[docid].get('payload', None)
            if payload is None:
                # HACK: exit on first doc without a payload (=assume all docs
                # have payload, or none does)
                break
            self.payload[docid] = payload
        self.flush(save_index=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def optimize(self):
        """
        Precompute top similarities for all indexed documents. This speeds up
        `find_similar` queries by id (but not queries by fulltext).

        Internally, documents are moved from a fresh index (=no precomputed
        similarities) to an optimized index (precomputed similarities).
        Similarity queries always query both indexes, so this split is
        transparent to clients.

        If you add documents later via `index`, they go to the fresh index
        again. To precompute top similarities for these new documents too,
        simply call `optimize` again.
        """
        if self.fresh_index is None:
            logger.warning("optimize called but there are no new documents")
            return  # nothing to do!
        if self.opt_index is None:
            logger.info("starting a new optimized index for %s" % self)
            self.opt_index = SimIndex(self.location('index_opt'), self.model.num_features)
        self.opt_index.merge(self.fresh_index)
        self.fresh_index.terminate()  # delete old files
        self.fresh_index = None
        self.flush(save_index=True)

    @gensim.utils.synchronous('lock_update')
    def drop_index(self, keep_model=True):
        """Drop all indexed documents. If `keep_model` is False, also drop the model."""
        modelstr = "" if keep_model else "and model "
        logger.info("deleting similarity index " + modelstr + "from %s" % self.basename)

        # delete indexes
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:
                index.terminate()
        self.fresh_index, self.opt_index = None, None

        # delete payload
        if self.payload is not None:
            self.payload.close()
            fname = self.location('payload')
            try:
                if os.path.exists(fname):
                    os.remove(fname)
                    logger.info("deleted %s" % fname)
            except Exception:
                logger.warning("failed to delete %s" % fname)
        self.payload = SqliteDict(self.location('payload'), autocommit=True,
                                  journal_mode=JOURNAL_MODE)

        # optionally, delete the model as well
        if not keep_model and self.model is not None:
            self.model.close()
            fname = self.location('model')
            try:
                if os.path.exists(fname):
                    os.remove(fname)
                    logger.info("deleted %s" % fname)
            except Exception:
                logger.warning("failed to delete %s" % fname)
            self.model = None
        self.flush(save_index=True, save_model=True, clear_buffer=True)

    @gensim.utils.synchronous('lock_update')
    def delete(self, docids):
        """Delete specified documents from the index."""
        logger.info("asked to drop %i documents" % len(docids))
        for index in [self.opt_index, self.fresh_index]:
            if index is not None:
                index.delete(docids)
        self.flush(save_index=True)

    def is_locked(self):
        return self.use_locks and self.lock_update._count > 0

    def vec_by_id(self, docid):
        for index in [self.opt_index, self.fresh_index]:
            if index is not None and docid in index:
                return index.vec_by_id(docid)

    def find_similar(self, doc, min_score=0.0, max_results=100):
        """
        Find `max_results` most similar articles in the index, each having
        similarity score of at least `min_score`. The resulting list may be
        shorter than `max_results`, in case there are not enough matching
        documents.

        `doc` is either a string (=document id, previously indexed) or a dict
        containing a 'tokens' key. These tokens are processed to produce a
        vector, which is then used as a query against the index.

        The similar documents are returned in decreasing similarity order, as
        `(doc_id, similarity_score, doc_payload)` 3-tuples. The payload
        returned is identical to what was supplied for this document during
        indexing.
""" logger.debug("received query call with %r" % doc) if self.is_locked(): msg = "cannot query while the server is being updated" logger.error(msg) raise RuntimeError(msg) sims_opt, sims_fresh = None, None for index in [self.fresh_index, self.opt_index]: if index is not None: index.topsims = max_results #if isinstance(doc, basestring): if isinstance(doc, str): # query by direct document id docid = doc if self.opt_index is not None and docid in self.opt_index: sims_opt = self.opt_index.sims_by_id(docid) if self.fresh_index is not None: vec = self.opt_index.vec_by_id(docid) sims_fresh = self.fresh_index.sims_by_vec(vec, normalize=False) elif self.fresh_index is not None and docid in self.fresh_index: sims_fresh = self.fresh_index.sims_by_id(docid) if self.opt_index is not None: vec = self.fresh_index.vec_by_id(docid) sims_opt = self.opt_index.sims_by_vec(vec, normalize=False) else: raise ValueError("document %r not in index" % docid) else: if 'topics' in doc: # user supplied vector directly => use that vec = gensim.matutils.any2sparse(doc['topics']) else: # query by an arbitrary text (=tokens) inside doc['tokens'] vec = self.model.doc2vec( doc) # convert document (text) to vector if self.opt_index is not None: sims_opt = self.opt_index.sims_by_vec(vec) if self.fresh_index is not None: sims_fresh = self.fresh_index.sims_by_vec(vec) merged = merge_sims(sims_opt, sims_fresh) logger.debug( "got %s raw similars, pruning with max_results=%s, min_score=%s" % (len(merged), max_results, min_score)) result = [] for docid, score in merged: if score < min_score or 0 < max_results <= len(result): break result.append((docid, float(score), self.payload.get(docid, None))) return result #def find_similar(self, doc, min_score=0.0, max_results=100): def find_dissimilar(self, doc, max_score=1.0, max_results=100): """ Find `max_results` most similar articles in the index, each having similarity score of at least `min_score`. The resulting list may be shorter than `max_results`, in case there are not enough matching documents. `doc` is either a string (=document id, previously indexed) or a dict containing a 'tokens' key. These tokens are processed to produce a vector, which is then used as a query against the index. The similar documents are returned in decreasing similarity order, as `(doc_id, similarity_score, doc_payload)` 3-tuples. The payload returned is identical to what was supplied for this document during indexing. 
""" logger.debug("received query call with %r" % doc) if self.is_locked(): msg = "cannot query while the server is being updated" logger.error(msg) raise RuntimeError(msg) sims_opt, sims_fresh = None, None for index in [self.fresh_index, self.opt_index]: if index is not None: #index.topsims = max_results index.topsims = 10000000 #if isinstance(doc, basestring): if isinstance(doc, str): # query by direct document id docid = doc if self.opt_index is not None and docid in self.opt_index: sims_opt = self.opt_index.sims_by_id(docid) if self.fresh_index is not None: vec = self.opt_index.vec_by_id(docid) sims_fresh = self.fresh_index.sims_by_vec(vec, normalize=False) elif self.fresh_index is not None and docid in self.fresh_index: sims_fresh = self.fresh_index.sims_by_id(docid) if self.opt_index is not None: vec = self.fresh_index.vec_by_id(docid) sims_opt = self.opt_index.sims_by_vec(vec, normalize=False) else: raise ValueError("document %r not in index" % docid) else: if 'topics' in doc: # user supplied vector directly => use that vec = gensim.matutils.any2sparse(doc['topics']) else: # query by an arbitrary text (=tokens) inside doc['tokens'] vec = self.model.doc2vec( doc) # convert document (text) to vector if self.opt_index is not None: sims_opt = self.opt_index.sims_by_vec(vec) if self.fresh_index is not None: sims_fresh = self.fresh_index.sims_by_vec(vec) merged = merge_sims(sims_opt, sims_fresh) #merged.sort(reverse=False) merged.sort(key=lambda tup: tup[1], reverse=False) #logger.debug("got %s raw similars, pruning with max_results=%s, min_score=%s" % # (len(merged), max_results, min_score)) logger.debug( "got %s raw similars, pruning with max_results=%s, max_score=%s" % (len(merged), max_results, max_score)) result = [] #print("merged = ", merged) #print("len(merged) = ", len(merged)) for docid, score in merged: #if score < min_score or 0 < max_results <= len(result): #if score > max_score or 0 < max_results <= len(result): if score > max_score: if len(result) >= max_results: break #elif len(result) >= (10 * max_results): #elif len(result) >= (2 * max_results): elif len(result) >= (1 * max_results): break result.append((docid, float(score), self.payload.get(docid, None))) return result def __str__(self): return ("SimServer(loc=%r, fresh=%s, opt=%s, model=%s, buffer=%s)" % (self.basename, self.fresh_index, self.opt_index, self.model, self.fresh_docs)) def __len__(self): return sum( len(index) for index in [self.opt_index, self.fresh_index] if index is not None) def __contains__(self, docid): """Is document with `docid` in the index?""" return any(index is not None and docid in index for index in [self.opt_index, self.fresh_index]) def get_tfidf(self, *args, **kwargs): return self.model.get_tfidf(*args, **kwargs) def status(self): return str(self) def keys(self): """Return ids of all indexed documents.""" result = [] if self.fresh_index is not None: result += self.fresh_index.keys() if self.opt_index is not None: result += self.opt_index.keys() return result def memdebug(self): from guppy import hpy return str(hpy().heap())
class CharLMEmbeddings(TokenEmbeddings):
    """Contextual string embeddings of words, as proposed in Akbik et al., 2018."""

    def __init__(self, model, detach: bool = True, use_cache: bool = True, cache_directory: str = None):
        """
        initializes contextual string embeddings using a character-level language model.
        :param model: model string, one of 'news-forward', 'news-backward', 'mix-forward', 'mix-backward',
                'german-forward', 'german-backward' depending on which character language model is desired
        :param detach: if set to False, the gradient will propagate into the language model. this dramatically
                slows down training and often leads to worse results, so not recommended.
        :param use_cache: if set to False, will not write embeddings to file for later retrieval. this saves disk
                space but will not allow re-use of once computed embeddings that do not fit into memory
        :param cache_directory: if cache_directory is not set, the cache will be written to ~/.flair/embeddings.
                otherwise the cache is written to the provided directory.
        """
        super().__init__()

        # news-english-forward
        if model.lower() == 'news-forward':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-v0.2rc.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # news-english-backward
        if model.lower() == 'news-backward':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-v0.2rc.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # news-english-forward-fast
        if model.lower() == 'news-forward-fast':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # news-english-backward-fast
        if model.lower() == 'news-backward-fast':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # mix-english-forward
        if model.lower() == 'mix-forward':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-forward-v0.2rc.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # mix-english-backward
        if model.lower() == 'mix-backward':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-backward-v0.2rc.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # mix-german-forward
        if model.lower() == 'german-forward':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-forward-v0.2rc.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # mix-german-backward
        if model.lower() == 'german-backward':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-backward-v0.2rc.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # common crawl Polish forward
        if model.lower() == 'polish-forward':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-forward-v0.2.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # common crawl Polish backward
        if model.lower() == 'polish-backward':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-backward-v0.2.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        self.name = model
        self.static_embeddings = detach

        from flair.models import LanguageModel
        self.lm = LanguageModel.load_language_model(model)
        self.detach = detach

        self.is_forward_lm: bool = self.lm.is_forward_lm

        # caching variables
        self.use_cache: bool = use_cache
        self.cache = None
        self.cache_directory: str = cache_directory

        dummy_sentence: Sentence = Sentence()
        dummy_sentence.add_token(Token('hello'))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length: int = len(embedded_dummy[0].get_token(1).get_embedding())

    def __getstate__(self):
        # Copy the object's state from self.__dict__ which contains
        # all our instance attributes. Always use the dict.copy()
        # method to avoid modifying the original state.
        state = self.__dict__.copy()
        # Remove the unpicklable entries.
        state['cache'] = None
        state['use_cache'] = False
        state['cache_directory'] = None
        return state

    @property
    def embedding_length(self) -> int:
        return self.__embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:

        # this whole block is for compatibility with older serialized models TODO: remove in version 0.4
        if 'cache' not in self.__dict__ or 'cache_directory' not in self.__dict__:
            self.use_cache = False
            self.cache_directory = None
        else:
            cache_path = '{}-tmp-cache.sqllite'.format(self.name) if not self.cache_directory else os.path.join(
                self.cache_directory, '{}-tmp-cache.sqllite'.format(os.path.basename(self.name)))
            if not os.path.exists(cache_path):
                self.use_cache = False
                self.cache_directory = None

        # if cache is used, try setting embeddings from cache first
        if self.use_cache:

            # lazy initialization of cache
            if not self.cache:
                from sqlitedict import SqliteDict
                self.cache = SqliteDict(cache_path, autocommit=True)

            # try populating embeddings from cache
            all_embeddings_retrieved_from_cache: bool = True
            for sentence in sentences:
                key = sentence.to_tokenized_string()
                embeddings = self.cache.get(key)

                if not embeddings:
                    all_embeddings_retrieved_from_cache = False
                    break
                else:
                    for token, embedding in zip(sentence, embeddings):
                        token.set_embedding(self.name, torch.FloatTensor(embedding))

            if all_embeddings_retrieved_from_cache:
                return sentences

        # if this is not possible, use LM to generate embedding. First, get text sentences
        text_sentences = [sentence.to_tokenized_string() for sentence in sentences]

        longest_character_sequence_in_batch: int = len(max(text_sentences, key=len))

        # pad strings with whitespaces to longest sentence
        sentences_padded: List[str] = []
        append_padded_sentence = sentences_padded.append

        end_marker = ' '
        extra_offset = 1
        for sentence_text in text_sentences:
            pad_by = longest_character_sequence_in_batch - len(sentence_text)
            if self.is_forward_lm:
                padded = '\n{}{}{}'.format(sentence_text, end_marker, pad_by * ' ')
                append_padded_sentence(padded)
            else:
                padded = '\n{}{}{}'.format(sentence_text[::-1], end_marker, pad_by * ' ')
                append_padded_sentence(padded)

        # get hidden states from language model
        all_hidden_states_in_lm = self.lm.get_representation(sentences_padded, self.detach)

        # take first or last hidden states from language model as word representation
        for i, sentence in enumerate(sentences):
            sentence_text = sentence.to_tokenized_string()

            offset_forward: int = extra_offset
            offset_backward: int = len(sentence_text) + extra_offset

            for token in sentence.tokens:
                token: Token = token

                offset_forward += len(token.text)

                if self.is_forward_lm:
                    offset = offset_forward
                else:
                    offset = offset_backward

                embedding = all_hidden_states_in_lm[offset, i, :]

                offset_forward += 1
                offset_backward -= 1
                offset_backward -= len(token.text)

                token.set_embedding(self.name, embedding)

        if self.use_cache:
            for sentence in sentences:
                self.cache[sentence.to_tokenized_string()] = [token._embeddings[self.name].tolist()
                                                              for token in sentence]

        return sentences
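# Usage sketch (not from the original source): embedding a sentence with the
# character LM above, flair-0.3-era API. Repeated calls on the same tokenized
# string are then served from the SqliteDict cache rather than recomputed.
from flair.data import Sentence

embedder = CharLMEmbeddings('news-forward')   # downloads the LM on first use
sentence = Sentence('The grass is green .')
embedder.embed(sentence)
for token in sentence:
    print(token.text, token.get_embedding().size())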
class SimIndex(gensim.utils.SaveLoad):
    """
    An index of documents. Used internally by SimServer.

    It uses the Similarity class to persist all document vectors to disk (via mmap).
    """

    def __init__(self, fname, num_features, shardsize=SHARD_SIZE, topsims=TOP_SIMS):
        """
        Spill index shards to disk after every `shardsize` documents.
        In similarity queries, return only the `topsims` most similar documents.
        """
        self.fname = fname
        self.shardsize = int(shardsize)
        self.topsims = int(topsims)
        self.id2pos = {}  # map document id (string) to index position (integer)
        self.pos2id = {}  # reverse mapping for id2pos; redundant, for performance
        # precomputed top similar: document id -> [(doc_id, similarity)]
        self.id2sims = SqliteDict(self.fname + '.id2sims', journal_mode=JOURNAL_MODE)
        self.qindex = gensim.similarities.Similarity(self.fname + '.idx', corpus=None,
                                                     num_best=None, num_features=num_features,
                                                     shardsize=shardsize)
        self.length = 0

    def save(self, fname):
        tmp, self.id2sims = self.id2sims, None
        super(SimIndex, self).save(fname)
        self.id2sims = tmp

    @staticmethod
    def load(fname):
        result = gensim.utils.SaveLoad.load(fname)
        result.fname = fname
        result.check_moved()
        result.id2sims = SqliteDict(fname + '.id2sims', journal_mode=JOURNAL_MODE)
        return result

    def check_moved(self):
        output_prefix = self.fname + '.idx'
        if self.qindex.output_prefix != output_prefix:
            logger.info("index seems to have moved from %s to %s; updating locations" %
                        (self.qindex.output_prefix, output_prefix))
            self.qindex.output_prefix = output_prefix
            self.qindex.check_moved()

    def close(self):
        """Explicitly release important resources (file handles, db, ...)"""
        try:
            self.id2sims.close()
        except Exception:
            pass
        try:
            del self.qindex
        except Exception:
            pass

    def terminate(self):
        """Delete all files created by this index, invalidating `self`. Use with care."""
        try:
            self.id2sims.terminate()
        except Exception:
            pass
        import glob
        for fname in glob.glob(self.fname + '*'):
            try:
                os.remove(fname)
                logger.info("deleted %s" % fname)
            except Exception as e:
                logger.warning("failed to delete %s: %s" % (fname, e))
        # iterate over a copy of the keys: delattr mutates __dict__ while looping
        for val in list(self.__dict__.keys()):
            try:
                delattr(self, val)
            except Exception:
                pass

    def index_documents(self, fresh_docs, model):
        """
        Update fresh index with new documents (potentially replacing old ones with
        the same id). `fresh_docs` is a dictionary-like object (=dict, sqlitedict,
        shelve etc) that maps document_id->document.
        """
        docids = list(fresh_docs.keys())  # materialize: sqlitedict's keys() is a generator without len()
        vectors = model.docs2vecs(fresh_docs[docid] for docid in docids)
        logger.info("adding %i documents to %s" % (len(docids), self))
        self.qindex.add_documents(vectors)
        self.qindex.save()
        self.update_ids(docids)

    def update_ids(self, docids):
        """Update id->pos mapping with new document ids."""
        logger.info("updating %i id mappings" % len(docids))
        for docid in docids:
            if docid is not None:
                pos = self.id2pos.get(docid, None)
                if pos is not None:
                    logger.info("replacing existing document %r in %s" % (docid, self))
                    del self.pos2id[pos]
                self.id2pos[docid] = self.length
                try:
                    del self.id2sims[docid]
                except KeyError:
                    pass
            self.length += 1
        self.id2sims.sync()
        self.update_mappings()

    def update_mappings(self):
        """Synchronize id<->position mappings."""
        self.pos2id = dict((v, k) for k, v in self.id2pos.items())
        assert len(self.pos2id) == len(self.id2pos), "duplicate ids or positions detected"

    def delete(self, docids):
        """Delete documents (specified by their ids) from the index."""
        logger.debug("deleting %i documents from %s" % (len(docids), self))
        deleted = 0
        for docid in docids:
            try:
                del self.id2pos[docid]
                deleted += 1
                del self.id2sims[docid]
            except KeyError:
                pass
        self.id2sims.sync()
        if deleted:
            logger.info("deleted %i documents from %s" % (deleted, self))
        self.update_mappings()

    def sims2scores(self, sims, eps=1e-7):
        """Convert raw similarity vector to a list of (docid, similarity) results."""
        result = []
        if isinstance(sims, numpy.ndarray):
            sims = abs(sims)  # TODO or maybe clip? are opposite vectors "similar" or "dissimilar"?!
            for pos in numpy.argsort(sims)[::-1]:
                if pos in self.pos2id and sims[pos] > eps:  # ignore deleted/rewritten documents
                    # convert positions of resulting docs back to ids
                    result.append((self.pos2id[pos], sims[pos]))
                    if len(result) == self.topsims:
                        break
        else:
            for pos, score in sims:
                if pos in self.pos2id and abs(score) > eps:  # ignore deleted/rewritten documents
                    # convert positions of resulting docs back to ids
                    result.append((self.pos2id[pos], abs(score)))
                    if len(result) == self.topsims:
                        break
        return result

    def vec_by_id(self, docid):
        """Return indexed vector corresponding to document `docid`."""
        pos = self.id2pos[docid]
        return self.qindex.vector_by_id(pos)

    def sims_by_id(self, docid):
        """Find the most similar documents to the (already indexed) document with `docid`."""
        result = self.id2sims.get(docid, None)
        if result is None:
            self.qindex.num_best = self.topsims
            sims = self.qindex.similarity_by_id(self.id2pos[docid])
            result = self.sims2scores(sims)
        return result

    def sims_by_vec(self, vec, normalize=None):
        """
        Find the most similar documents to a given vector (=already processed document).
        """
        if normalize is None:
            normalize = self.qindex.norm
        norm, self.qindex.norm = self.qindex.norm, normalize  # store old value
        self.qindex.num_best = self.topsims
        sims = self.qindex[vec]
        self.qindex.norm = norm  # restore old value of qindex.norm
        return self.sims2scores(sims)

    def merge(self, other):
        """Merge documents from the other index. Update precomputed similarities
        in the process."""
        other.qindex.norm, other.qindex.num_best = False, self.topsims
        # update precomputed "most similar" for old documents (in case some of
        # the new docs make it to the top-N for some of the old documents)
        logger.info("updating old precomputed values")
        pos, lenself = 0, len(self.qindex)
        for chunk in self.qindex.iter_chunks():
            for sims in other.qindex[chunk]:
                if pos in self.pos2id:
                    # ignore masked entries (deleted, overwritten documents)
                    docid = self.pos2id[pos]
                    sims = self.sims2scores(sims)
                    self.id2sims[docid] = merge_sims(self.id2sims[docid], sims, self.topsims)
                pos += 1
                if pos % 10000 == 0:
                    logger.info("PROGRESS: updated doc #%i/%i" % (pos, lenself))
        self.id2sims.sync()

        logger.info("merging fresh index into optimized one")
        pos, docids = 0, []
        for chunk in other.qindex.iter_chunks():
            for vec in chunk:
                if pos in other.pos2id:  # don't copy deleted documents
                    self.qindex.add_documents([vec])
                    docids.append(other.pos2id[pos])
                pos += 1
        self.qindex.save()
        self.update_ids(docids)

        logger.info("precomputing most similar for the fresh index")
        pos, lenother = 0, len(other.qindex)
        norm, self.qindex.norm = self.qindex.norm, False
        topsims, self.qindex.num_best = self.qindex.num_best, self.topsims
        for chunk in other.qindex.iter_chunks():
            for sims in self.qindex[chunk]:
                if pos in other.pos2id:
                    # ignore masked entries (deleted, overwritten documents)
                    docid = other.pos2id[pos]
                    self.id2sims[docid] = self.sims2scores(sims)
                pos += 1
                if pos % 10000 == 0:
                    logger.info("PROGRESS: precomputed doc #%i/%i" % (pos, lenother))
        self.qindex.norm, self.qindex.num_best = norm, topsims
        self.id2sims.sync()

    def __len__(self):
        return len(self.id2pos)

    def __contains__(self, docid):
        return docid in self.id2pos

    def keys(self):
        return self.id2pos.keys()

    def __str__(self):
        return "SimIndex(%i docs, %i real size)" % (len(self), self.length)
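# Usage sketch (not from the original source): the fresh/optimized index split.
# Assumes `model` is a trained SimModel exposing `num_features`, `docs2vecs()`
# and `doc2vec()`, and `fresh_docs` is a dict-like of document_id -> document.
fresh = SimIndex('/tmp/demo_index_fresh', model.num_features)
fresh.index_documents(fresh_docs, model)   # no precomputed similarities yet

opt = SimIndex('/tmp/demo_index_opt', model.num_features)
opt.merge(fresh)        # copy vectors over and precompute top-N similarities
fresh.terminate()       # fresh index files are no longer needed

some_id = next(iter(opt.keys()))
print(opt.sims_by_id(some_id))   # answered from id2sims, no index scan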
class SqliteSparseSequence(MutableSequence[Any]):
    def __init__(self, filename: Union[str, PathLike], read_only: bool = False):
        self.table = SqliteDict(filename, "sparse_sequence", flag="r" if read_only else "c")

    def __del__(self):
        self.close()

    def __getitem__(self, i: Union[int, slice]) -> Any:
        if isinstance(i, int):
            try:
                return self.table[str(i)]
            except KeyError:
                current_length = len(self)
                if i >= current_length or current_length <= 0:
                    raise IndexError("list index out of range")
                elif i < 0 < current_length:
                    return self.__getitem__(i % current_length)
                else:
                    return None
        elif isinstance(i, slice):
            return SlicedSequence(self, i)
        else:
            raise TypeError(f"list indices must be integers or slices, not {i.__class__.__name__}")

    def __setitem__(self, i: Union[int, slice], value: Any):
        if isinstance(i, int):
            current_length = len(self)
            if i < 0:
                i %= current_length
            self.table[str(i)] = value
            # the sequence must be at least i + 1 items long once index i is set
            # (using max(i, current_length) here would under-count by one)
            self.table["_len"] = max(i + 1, current_length)
            self.table.commit()
        else:
            raise TypeError(f"list indices must be integers, not {i.__class__.__name__}")

    def __delitem__(self, i: Union[int, slice]):
        current_length = len(self)
        if isinstance(i, int):
            if i < 0:
                i %= current_length
            if i >= current_length:
                raise IndexError("list assignment index out of range")
            for index in range(i + 1, current_length):
                self.table[str(index - 1)] = self.table.get(str(index))
            del self.table[str(current_length - 1)]
            self.table["_len"] = current_length - 1
            self.table.commit()
        elif isinstance(i, slice):
            # This isn't very efficient for continuous slices.
            for index in reversed(range(*i.indices(current_length))):
                del self[index]
        else:
            raise TypeError(f"list indices must be integers or slices, not {i.__class__.__name__}")

    def extend(self, values: Iterable[Any]) -> None:
        current_length = len(self)
        index = -1
        for index, value in enumerate(values):
            self.table[str(index + current_length)] = value
        if index < 0:
            return
        self.table["_len"] = current_length + index + 1
        self.table.commit()

    def insert(self, i: int, value: Any) -> None:
        current_length = len(self)
        for index in reversed(range(i, current_length)):
            self.table[str(index + 1)] = self.table.get(str(index))
        self.table[str(i)] = value
        self.table["_len"] = current_length + 1
        self.table.commit()

    def __len__(self) -> int:
        try:
            return self.table["_len"]
        except KeyError:
            return 0

    def clear(self) -> None:
        self.table.clear()
        self.table.commit()

    def close(self) -> None:
        if self.table is not None:
            self.table.close()
            self.table = None

    def copy_to(self, target: Union[str, PathLike]):
        try:
            os.link(self.table.filename, target)
        except OSError as e:
            if e.errno == 18:  # EXDEV: cross-device link, fall back to a real copy
                shutil.copy(self.table.filename, target)
            else:
                raise
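# Usage sketch (not from the original source): SqliteSparseSequence acts as a
# disk-backed list that tolerates holes; unset positions read back as None.
seq = SqliteSparseSequence('/tmp/sparse_demo.sqlite')
seq.extend(['a', 'b'])
seq[10] = 'k'             # sparse write far past the current end
assert len(seq) == 11     # length accounting includes the new index
assert seq[5] is None     # a hole, not an IndexError
assert seq[-1] == 'k'
del seq[0]                # O(n): shifts every later element left by one
seq.close()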
class abcService(abc.ABC):
    name: str
    author: str
    version: str
    preferred_port: int | None = None
    host = 'localhost'

    def __init__(self):
        for att in ['name', 'author', 'version']:
            assert hasattr(self, att), f'Missing attribute {att}'
        self.dirs = AppDirs(
            appname=self.name,
            appauthor=self.author,
            version=self.version,
        )
        self.site_config_dir = Path(self.dirs.site_config_dir)
        os.makedirs(self.site_config_dir, exist_ok=True)
        self.db_service = SqliteDict(
            filename=self.site_config_dir / 'service.db',
            autocommit=True,
        )
        curr_pid = self.db_service.get('pid', -1)
        curr_port = self.db_service.get('port', self.preferred_port)
        self.db_service['pid'] = curr_pid
        self.db_service['port'] = curr_port

    @property
    def pid(self) -> int:
        return self.db_service['pid']

    @property
    def port(self) -> int | None:
        return self.db_service['port']

    def alive(self, check=True):
        alive = (self.pid > 0)
        if check and alive and not self._alive():
            self.db_service['pid'] = -1
            self.db_service['port'] = None
            alive = False
        return alive

    def _alive(self):
        return psutil.pid_exists(self.pid)

    def stop(self):
        if self.alive():
            stop_process(self.pid)
        else:
            print('(already stopped)')
            print(self)

    @abc.abstractmethod
    def start(self):
        raise NotImplementedError

    def cli_start(self):
        if not self.alive():
            setproctitle.setproctitle(self.name)  # just fancy
            self.db_service['pid'] = pid = os.getpid()
            self.db_service['port'] = find_free_port(self.port)
            try:
                self.start()
                procs = psutil.Process().children()
                psutil.wait_procs(procs)
            finally:
                self.db_service['pid'] = -1
                self.db_service['port'] = None
                stop_process(pid)
        else:
            print('(already active)')
            print(self)
        return

    def __repr__(self):
        out = [
            f'alive: {self.alive()}',
            f'  pid: {self.pid}',
            f' port: {self.port}',
            f'  url: http://{self.host}:{self.port}',
        ]
        return '\n'.join(out) if self.alive() else out[0]
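# Minimal subclass sketch (not from the original source; DemoService and its
# command line are hypothetical). The base class keeps pid/port in a SqliteDict,
# so alive()/stop() work from a different process than the one running cli_start().
import subprocess

class DemoService(abcService):
    name = 'demo-service'
    author = 'example'
    version = '0.1'
    preferred_port = 8080

    def start(self):
        # launch the real server as a child; cli_start() waits on child procs
        subprocess.Popen(['python3', '-m', 'http.server', str(self.port)])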
class SessionState:
    def __init__(self, cache_file: Path, cache_key: str, redis: str, user_requested=False):
        self.user_requested = user_requested
        self._cache_file = cache_file
        self._cache_key = cache_key
        self._cache: Optional[SqliteDict] = None
        random.seed()
        self._session_key = random.randint(0, 999999)
        self._redis = Redis(host=redis)
        if not user_requested:
            self._open()
            if self._cache_key != self._cache.get("_cache_key_", None):
                # cache key changed: discard the stale cache file and start fresh
                self._cache.close()
                self._cache: Optional[SqliteDict] = None
                self._cache_file.unlink()
                self._open()
                self._cache["_cache_key_"] = self._cache_key
        self.session = Session()
        # noinspection PyTypeChecker
        self.session.mount('https://', HTTPAdapter(
            max_retries=Retry(total=3, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])))
        self.sites = {}
        self.wikidata = Sparql()
        self.primary_site = self.get_site(primary_domain)

    def __enter__(self):
        return self

    def __exit__(self, typ, value, traceback):
        self.session.close()
        if self._cache is not None:
            self._cache.close()
            self._cache = None
            print(f'Closed SQL connection for {self._session_key} at {datetime.utcnow()}')

    def _open(self):
        if self._cache is None:
            print(f'Opening SQL connection for {self._session_key} at {datetime.utcnow()}')
            self._cache_file.parent.mkdir(parents=True, exist_ok=True)
            self._cache = SqliteDict(self._cache_file, autocommit=True)

    def get_site(self, domain: Domain) -> WikiSite:
        try:
            return self.sites[domain]
        except KeyError:
            # noinspection PyTypeChecker
            site = WikiSite(domain, self.session, domain == primary_domain)
            if self.user_requested:
                site.maxlag = None
            self.sites[domain] = site
            return site

    def delete_cached_items(self, prefix: str) -> None:
        self._open()
        for vv in {v for v in self._cache.keys() if v.startswith(prefix)}:
            del self._cache[vv]

    def del_obj(self, key: str) -> Any:
        self._redis.delete(self.redis_key(key))
        self._open()
        print(f"%% del {key}")
        return self._cache.pop(key, None)

    def load_obj(self, key: str, default: Any = None) -> Any:
        value = self._redis.get(self.redis_key(key))
        if value is not None:
            return loads(value)
        self._open()
        print(f"%% load {key}")
        value = self._cache.get(key, default)
        self._redis.set(self.redis_key(key), dumps(value))
        return value

    def save_obj(self, key: str, value: Any):
        self._open()
        print(f"%% save {key}")
        self._cache[key] = value
        self._redis.set(self.redis_key(key), dumps(value))

    def redis_key(self, key: str):
        return self._cache_key + key
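# Usage sketch (not from the original source): the two-tier cache above reads
# through Redis first and falls back to the SqliteDict, with every key
# namespaced by the cache version. Assumes a Redis instance on localhost and
# that primary_domain & co. are configured for the surrounding project.
with SessionState(Path('cache/session.sqlite'), cache_key='v1', redis='localhost') as state:
    state.save_obj('config', {'lang': 'en'})   # written to sqlite and redis
    print(state.load_obj('config'))            # served from redis when present
    state.del_obj('config')                    # evicted from both tiers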