def collect_customer_bg(cusid='', page=0, src='', des='',
                        out_dir=vgvars.dir_path['out_dir'],
                        output_name='customer_erp'):
    ###### initialize objects
    src = APIStorage(vgvars.erp) if not src else src
    # des = VarStorage({'data': defaultdict(list)}) if not des else des
    # des = FileStorage({'fpath': '{}{}_{}'.format(out_dir, output_name)})
    apicollector = DataCollector(src, des)
    customer_stmt_temp = vgvars.erp['customer']
    if cusid:
        customer_stmt_temp = customer_stmt_temp.format('id={}'.format(cusid))
        data = apicollector.fetch_data(customer_stmt_temp)['data']['currentItems']
        des = FileStorage({'fpath': '{}{}_cusid_{}'.format(out_dir, output_name, cusid)})
        apicollector.des = des
        apicollector.insert_data({
            'selected_format': 'json',
            'values': data
        })
    else:
        customer_stmt_temp = customer_stmt_temp.format('page={}')
        if not page:
            page_num = apicollector.fetch_data(customer_stmt_temp.format(1))['data']['totalPage']
            # page_num = 2  # Testing
            for i in range(1, page_num + 1):
                data = apicollector.fetch_data(customer_stmt_temp.format(i))['data']['currentItems']
                des = FileStorage({'fpath': '{}{}_{}'.format(out_dir, output_name, i)})
                apicollector.des = des
                apicollector.insert_data({
                    'selected_format': 'json',
                    'values': data
                })
def create(config: StorageConfiguration) -> Storage:
    storage = Storage(
        JsonFormattedHostStorage(FileStorage(config.hosts_filename)),
        JsonFormattedUserAccountStorage(
            FileStorage(config.user_accounts_filename)),
        JsonFormattedUnixAccountStorage(
            FileStorage(config.unix_accounts_filename)),
        UserAccountActivationInMemoryStorage())
    return storage
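# A hedged usage sketch for create() above: the keyword arguments mirror the
# attributes the factory reads, but StorageConfiguration's actual constructor
# signature is an assumption here, not confirmed by this snippet.
config = StorageConfiguration(
    hosts_filename='hosts.json',
    user_accounts_filename='user_accounts.json',
    unix_accounts_filename='unix_accounts.json')
storage = create(config)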
def collect_orders(start_date, end_date, eids=[], src='', des='',
                   out_dir=vgvars.dir_path['out_dir'],
                   output_name='order_data',
                   gen_stmt=get_collect_order_stmt):
    ###### prepare
    has_des = bool(des)
    ###### initialize objects
    src = DBStorage(vgvars.vgdb) if not src else src
    db_collector = VGDBCollector(src)
    ###### collecting
    # make sure that data is collected within a selected period of time,
    # like a year, a month, or a week
    date_list = convert_datetime.divide_dates(start_date, end_date)  # in months
    for start_date, end_date in date_list:
        # update destination storage
        # des = GSStorage(des)
        des = des if has_des else FileStorage({
            'fpath': '{}{}_{}_{}'.format(out_dir, output_name, start_date, end_date)
        })
        db_collector.des = des
        # collect data
        collect(start_date, end_date, db_collector, eids, gen_stmt=gen_stmt)
    return db_collector.des.data
def setup_function(function):
    global storage, flower_entry
    # truncate the storage file so each test starts from an empty state
    with open(STORAGE_FILENAME, 'w') as f:
        pass
    storage = FileStorage(STORAGE_FILENAME)
    flower_entry = FlowerEntry(1, "tree", 1, 1)
def open(self):
    self.synclock.acquire()
    try:
        self.storage = FileStorage(self.path, self.name, self.extension)
        self.storage.open()
    finally:
        self.synclock.release()
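# Equivalent sketch of the method above using the lock as a context manager;
# Python lock objects support the `with` statement, which performs the same
# acquire/release pairing as the explicit try/finally.
def open(self):
    with self.synclock:
        self.storage = FileStorage(self.path, self.name, self.extension)
        self.storage.open()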
def crawl_follows(self):
    def _crawl(parser, uid, page, num_pages=''):
        msg = 'Crawl user(%s)\'s follows-page: %s:%s' % (self.uid, num_pages, page)
        write_message(msg, self.window)
        url = 'http://weibo.com/%s/follow?page=%s' % (uid, page)
        html = self._fetch(url, query=settings.QUERY_FOLLOWS)
        try:
            pq_doc = pq(html)
            return parser.parse(pq_doc)
        except:
            return 0

    msg = 'Checking: whether user(%s) exists or not...' % self.uid
    write_message(msg, self.window)
    is_exist = self.fetcher.check_user(self.uid)
    if is_exist is None:
        return
    if not is_exist:
        msg = 'Not exist: %s.' % self.uid
        logger.info(msg)
        write_message(msg, self.window)
        return

    self.storage = FileStorage(self.uid, settings.MASK_FOLLOW, self.store_path)
    start_time = time.time()
    parser = ComFollowsParser(self.storage, uids_storage=self.uids_storage)
    num_pages = _crawl(parser, self.uid, page=1)
    if settings.PAGE_LIMIT != 0:
        if num_pages > settings.PAGE_LIMIT:
            msg = 'For sina policy, reduce page count from %s to %s' % (num_pages, settings.PAGE_LIMIT)
            write_message(msg, self.window)
            num_pages = settings.PAGE_LIMIT

    pages = [i for i in xrange(2, num_pages + 1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
        worker_manager.wait_all_complete()

    cost_time = int(time.time() - start_time)
    msg = ('Crawl user(%s)\'s follows: total page=%s,'
           ' cost time=%s sec, connections=%s'
           % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
def collect_estore_bg(eids=[], eloginnames=[], get_eids=False, get_eids_args=[],
                      get_eids_function=get_ordered_estore_id, src='', des='',
                      out_dir=vgvars.dir_path['out_dir'], output_name='e_bg',
                      gen_stmt=get_collect_estore_bg_stmt,
                      max_query=vgvars.max_vgdb_query_num):
    ###### prepare
    has_des = bool(des)
    ###### initialize objects
    src = DBStorage(vgvars.vgdb) if not src else src
    db_collector = VGDBCollector(src)
    # by default, collect estore bg for estores that have orders in a given
    # period of time
    if get_eids:
        get_eids_args.append(db_collector)
        eids = get_eids_function(*get_eids_args)
    if eloginnames or eids:
        # split the work into batches of at most max_query items
        if eloginnames:
            qnum = math.ceil(len(eloginnames) / max_query)
        else:
            qnum = math.ceil(len(eids) / max_query)
        for i in range(qnum):
            start_index = i * max_query
            end_index = start_index + max_query
            selectedloginnames = eloginnames[start_index:end_index]
            selectedeids = eids[start_index:end_index]
            des = des if has_des else FileStorage({
                'fpath': '{}{}_{}_{}'.format(out_dir, output_name, start_index, end_index - 1)
            })
            db_collector.des = des
            if selectedeids or selectedloginnames:
                logging.debug('collect estore bg from {} to {}'.format(
                    start_index, end_index - 1))
                stmt = get_collect_estore_bg_stmt(
                    estore_loginnames=selectedloginnames,
                    estore_ids=selectedeids,
                )
                db_collector.fetch_data(stmt)
                db_collector.insert_data()
    else:
        # later: collect bg when no eids and no eloginnames are provided
        pass
def __init__(self):
    self.parser = AdvertisementParser()
    self.storage = MongoStorage('adv_data') if storage_type == 'mongo' \
        else FileStorage('adv_data')
    if isinstance(self.storage, MongoStorage):
        self.links = self.storage.load('adv_links', {'flag': False})
    else:
        self.links = self.storage.load('lnk')
    self.queue = self.create_queue()
def __init__(self):
    self.db = FileStorage('./pybl.db')
    self.current_hash = None
    if self.db.empty():
        genesis = Block.genesis_block()
        self.db.put_block(genesis)
        self.tip = genesis.hash
    else:
        self.tip = self.db.get_last_hash()
def test_storage_delete_entry_by_name_false():
    # Given
    another_storage = FileStorage(STORAGE_FILENAME)
    # When
    deleted = storage.delete_entry_by_name(flower_entry)
    # Then
    assert not deleted
    assert_is_entry_in_storage(storage, flower_entry, amount=0, exists=False)
    assert_is_entry_in_storage(another_storage, flower_entry, amount=0, exists=False)
def crawl_weibos(self):
    def _crawl(parser, uid, page, num_pages=''):
        msg = 'Crawl user(%s)\'s weibos-page: %s:%s' % (self.uid, num_pages, page)
        write_message(msg, self.window)
        html = self._fetch_weibo(uid, page)
        try:
            pq_doc = pq(html)
            return parser.parse(pq_doc)
        except:
            return 0

    msg = 'Checking: whether user(%s) exists or not...' % self.uid
    write_message(msg, self.window)
    is_exist = self.fetcher.check_user(self.uid)
    if is_exist is None:
        return
    if not is_exist:
        msg = 'Not exist: %s.' % self.uid
        logger.info(msg)
        write_message(msg, self.window)
        return

    self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)
    start_time = time.time()
    parser = ComWeibosParser(self.uid, self.storage, weibos_storage=self.weibos_storage)
    num_pages = _crawl(parser, self.uid, page=1)
    pages = [i for i in xrange(2, num_pages + 1)]
    """
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
        worker_manager.wait_all_complete()
    """
    cost_time = int(time.time() - start_time)
    msg = ('Crawl user(%s)\'s weibos: total page=%s,'
           ' cost time=%s sec, connections=%s'
           % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
def test_storage_add_entry():
    # Given
    # When
    storage.add_entry(flower_entry)
    another_storage = FileStorage(STORAGE_FILENAME)
    # Then
    assert_is_entry_in_storage(storage, flower_entry, amount=1, exists=True)
    assert_is_entry_in_storage(another_storage, flower_entry, amount=1, exists=True)
def crawl_msg_reposts(self):
    def _crawl(parser, msg_id, page, num_pages=''):
        msg = 'Crawl message(%s)\'s reposts-page:%s:%s' % (self.msg_id, num_pages, page)
        write_message(msg, self.window)
        html, num_pages = self._fetch_msg_repost(msg_id, page)
        try:
            pq_doc = pq(html)
            parser.parse(pq_doc)
        except:
            pass
        return num_pages

    msg = 'Checking: whether message exists or not...'
    write_message(msg, self.window)
    msg_id = self.fetcher.check_message(self.msg_url)
    if msg_id is None:
        msg = 'Not exist: %s.' % self.msg_url
        logger.info(msg)
        write_message(msg, self.window)
        return

    self.msg_id = msg_id
    self.storage = FileStorage(self.msg_id, settings.MASK_REPOST, self.store_path)
    start_time = time.time()
    parser = ComRepostsParser(msg_id, self.storage)
    num_pages = _crawl(parser, self.msg_id, 1)
    pages = [i for i in xrange(2, num_pages + 1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)
        worker_manager.wait_all_complete()

    cost_time = int(time.time() - start_time)
    msg = ('Crawl message(%s)\'s reposts: total page=%s,'
           ' cost time=%s sec, connections=%s'
           % (self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
def collect_estore_contracts_erp(src='', des='', contract_ids=[], from_date='',
                                 serviceName='', page=0,
                                 out_dir=vgvars.dir_path['out_dir'],
                                 output_name='e_erp_contract'):
    ###### initialize objects
    src = APIStorage(vgvars.erp) if not src else src
    des = VarStorage({'data': defaultdict(list)}) if not des else des
    apicollector = DataCollector(src, des)
    ####### perform
    contract_stmt_temp = vgvars.erp['contract']
    contract_dict = defaultdict(list)
    if not page:
        page_num = apicollector.fetch_data(contract_stmt_temp.format(
            1, '', ''))['data']['totalPage']
        # page_num = 2  # TESTING
        for n in range(1, page_num + 1):
            data = apicollector.fetch_data(contract_stmt_temp.format(
                n, '', ''))['data']['currentItems']
            if from_date:
                df = DataFrame(data)
                df['createdDateTime'] = pd.to_datetime(df['createdDateTime'])
                # .copy() avoids pandas' SettingWithCopyWarning on the
                # assignment below
                selectedDf = df[df['createdDateTime'] >= from_date].copy()
                selectedDf['createdDateTime'] = selectedDf['createdDateTime'].map(
                    lambda x: x.strftime('%Y-%m-%d'))
                selectedDf = selectedDf.T
                selected_data = selectedDf.to_dict()
                group_contract_by_start_date(contract_dict, selected_data)
                # pages are ordered by date, so stop once rows fall outside
                # the requested range
                if len(selected_data) < len(data):
                    break
            else:
                group_contract_by_start_date(contract_dict, data)
    for m in contract_dict:
        apicollector.des = FileStorage(
            {'fpath': '{}{}_{}'.format(out_dir, output_name, m)})
        apicollector.insert_data({
            'selected_format': 'json',
            'values': contract_dict[m]
        })
def process_file(file_url: str) -> Tuple[str, Tuple[str, ...]]:
    """Process file with download, cache and upgrade."""
    _, file_ext = os.path.splitext(file_url)
    folder_hash = md5(file_url.encode('utf-8')).hexdigest()
    path = f"/notebooks/{folder_hash}"
    original = f"original{file_ext}"
    converted = f"converted{file_ext}"
    # TODO: delete the folder completely if `force`
    if not os.path.exists(path):
        file_content = _download_file(file_url)
        os.mkdir(path)
        with open(f"{path}/{original}", "w") as original_file:
            original_file.write(file_content)
        try:
            output = _convert_file(f"{path}/{original}", f"{path}/{converted}")
        except ConvertionException as error:
            shutil.rmtree(path)
            raise error
        with open(f"{path}/output", "w") as summary_output:
            summary_output.write('\n'.join(output))
        shutil.copy('report.txt', f"{path}/report")
        # persist `report.txt` to GCS
        storage = FileStorage()
        storage.save_file('report.txt', folder_hash)
    # found a python file, need to encode separately
    if original.endswith('.py'):
        result_filenames = []
        for py_file in [original, converted]:
            result_filenames.append(_save_ipynb_from_py(path, py_file))
        assert len(result_filenames) == 2
        return path, tuple(result_filenames)
    return path, (original, converted)
def collect_dept_bg(company_id=vgvars.erp_default['vghn'], src='', des='',
                    out_dir=vgvars.dir_path['out_dir'], output_name='dept_erp'):
    ###### initialize objects
    src = APIStorage(vgvars.erp) if not src else src
    # des = VarStorage({'data': defaultdict(list)}) if not des else des
    des = FileStorage({'fpath': '{}{}_{}'.format(out_dir, output_name, company_id)})
    apicollector = DataCollector(src, des)
    ####### perform
    dept_stmt_temp = vgvars.erp['dept']
    data = apicollector.fetch_data(dept_stmt_temp.format(company_id))['data']
    apicollector.insert_data({
        'selected_format': 'json',
        'values': data
    })
def main():
    args = docopt(__doc__)
    storage = FileStorage("entries.txt")
    bailer.init_storage(storage)
    if args.get('getall'):
        print(bailer.get_all_flowers())
    elif args.get('add'):
        print(bailer.add_flower(args.get('<flower-name>'),
                                args.get('<watering-interval>')))
    elif args.get('remove'):
        print(bailer.remove_flower(args.get('<flower-name>')))
    elif args.get('water'):
        if args.get('--force'):
            print(bailer.water_flower_force(args.get('<flower-name>')))
        else:
            print(bailer.water_flower(args.get('<flower-name>')))
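# Hedged examples of command lines the docopt branches above imply; the real
# usage patterns live in the module's __doc__, which is not shown here, and
# the script name is hypothetical:
#   python bailer.py getall
#   python bailer.py add <flower-name> <watering-interval>
#   python bailer.py remove <flower-name>
#   python bailer.py water <flower-name> --force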
def collect_estore_spending(start_date, end_date, eids=[], get_eids=False,
                            src='', des='', out_dir=vgvars.dir_path['out_dir'],
                            output_name='e_spending_data',
                            gen_stmt=get_collect_estore_spending_stmt):
    ###### prepare
    has_des = bool(des)
    ###### initialize objects
    src = DBStorage(vgvars.vgdb) if not src else src
    db_collector = VGDBCollector(src)
    ###### collecting
    # make sure that data is collected within a selected period of time,
    # like a year, a month, or a week
    date_list = convert_datetime.divide_dates(start_date, end_date)  # in months
    for start_date, end_date in date_list:
        # update destination storage
        # des = GSStorage(des)
        des = des if has_des else FileStorage({
            'fpath': '{}{}_{}_{}'.format(out_dir, output_name, start_date, end_date)
        })
        db_collector.des = des
        # not good enough, since data is checked against all estores having
        # orders in a month
        if get_eids:
            min_ord_id, max_ord_id = db_collector.get_max_min_id_by_time(
                start_date, end_date)
            eids = collect_estore_ids(min_ord_id, max_ord_id, db_collector)
        # collect data
        collect(start_date, end_date, db_collector, eids, gen_stmt=gen_stmt)
    return db_collector.des.data
def check_new_weibos(self):
    def _crawl(parser, uid, page, num_pages=''):
        msg = 'check new weibo in user(%s)\'s weibos-page: %s:%s' % (self.uid, num_pages, page)
        write_message(msg, self.window)
        html = self._fetch_weibo(uid, page)
        try:
            pq_doc = pq(html)
            return parser.parse(pq_doc)
        except:
            return 0

    msg = 'Checking: whether user(%s) exists or not...' % self.uid
    write_message(msg, self.window)
    is_exist = self.fetcher.check_user(self.uid)
    if is_exist is None:
        return
    if not is_exist:
        msg = 'Not exist: %s.' % self.uid
        logger.info(msg)
        write_message(msg, self.window)
        return

    self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)
    start_time = time.time()
    parser = ComWeibosParser(self.uid, self.storage)
    num_pages = _crawl(parser, self.uid, page=1)
    cost_time = int(time.time() - start_time)
    msg = ('Crawl user(%s)\'s weibos: total page=%s,'
           ' cost time=%s sec, connections=%s'
           % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
def crawl_infos(self):
    msg = 'Checking: whether user(%s) exists or not...' % self.uid
    write_message(msg, self.window)
    is_exist = self.fetcher.check_user(self.uid)
    if is_exist is None:
        return
    if not is_exist:
        msg = 'Not exist: %s.' % self.uid
        logger.info(msg)
        write_message(msg, self.window)
        return

    msg = 'Crawl user(%s)\'s profile' % self.uid
    logger.info(msg)
    write_message(msg, self.window)

    self.storage = FileStorage(self.uid, settings.MASK_INFO, self.store_path)
    start_time = time.time()
    url = 'http://weibo.com/%s/info' % self.uid
    parser = ComInfosParser(self.uid, self.storage)
    html = self._fetch(url, query=settings.QUERY_INFO)
    try:
        pq_doc = pq(html)
        parser.parse(pq_doc)
    except:
        pass

    cost_time = int(time.time() - start_time)
    msg = ('Crawl user(%s)\'s infos: cost time=%s sec, connections=%s'
           % (self.uid, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
def collect_staff_bg_erp(compids=['315', '319', '305', '320'], page=0, src='',
                         des='', out_dir=vgvars.dir_path['out_dir'],
                         output_name='staff_bg_erp'):
    ###### initialize objects
    src = APIStorage(vgvars.erp) if not src else src
    apicollector = DataCollector(src, '')
    ####### perform
    dept_stmt_temp = vgvars.erp['staffbg']
    if compids:
        for compid in compids:
            if not page:
                page_num = apicollector.fetch_data(
                    dept_stmt_temp.format(compid, 1, '', ''))['data']['totalPage']
                # page_num = 2  # TESTING
                datalist = []
                for n in range(1, page_num + 1):
                    data = apicollector.fetch_data(
                        dept_stmt_temp.format(compid, n, '', ''))['data']['currentItems']
                    datalist.extend(data)
                des = FileStorage(
                    {'fpath': '{}{}_{}'.format(out_dir, output_name, compid)})
                apicollector.des = des
                apicollector.insert_data({
                    'selected_format': 'json',
                    'values': datalist
                })
def __init__(self, cities=default_cities, link=base_link):
    self.cities = cities
    self.link = link
    self.storage = MongoStorage('adv_links') if storage_type == 'mongo' \
        else FileStorage('adv_links')
def crawl_msg_comments(self):
    def _crawl(parser, msg_id, page, num_pages='?'):
        msg = 'Crawl message(%s)\'s comments-page:%s:%s' % (msg_id, num_pages, page)
        write_message(msg, self.window)
        html, num_pages = self._fetch_msg_comment(msg_id, page)
        if html is None:
            return None
        try:
            pq_doc = pq(html)
            parser.parse(pq_doc)
            return num_pages
        except:
            return None

    msg = 'Checking: whether message exists or not...'
    write_message(msg, self.window)
    msg_id = self.fetcher.check_message(self.msg_url)
    if msg_id is None:  # error occurred
        msg = 'Error'
        logger.info(msg)
        write_message(msg, self.window)
        return None
    if msg_id is False:
        msg = 'Not exist: %s.' % self.msg_url
        logger.info(msg)
        write_message(msg, self.window)
        return False

    self.msg_id = msg_id
    self.storage = FileStorage(self.msg_id, settings.MASK_COMMENT, self.store_path)
    start_time = time.time()
    parser = ComCommentsParser(msg_id, self.storage)
    num_pages = _crawl(parser, self.msg_id, 1)
    if num_pages is None:  # error occurred
        msg = 'Error'
        logger.info(msg)
        write_message(msg, self.window)
        try:
            self.storage.delete(self.storage.comments_fp, self.storage.comments_f_name)
        except:
            pass
        return None

    pages = [i for i in xrange(2, num_pages + 1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)
        worker_manager.wait_all_complete()
        is_None = worker_manager.get_result()
        worker_manager.stop()
        if is_None:  # error occurred
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            try:
                self.storage.delete(self.storage.comments_fp, self.storage.comments_f_name)
            except:
                pass
            return None

    cost_time = int(time.time() - start_time)
    msg = ('Crawl message(%s)\'s comments: total page=%s,'
           ' cost time=%s sec, connections=%s'
           % (self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
    return True
def mock_storage(self):
    retriever = mock.create_autospec(PickleStorageRetriever)
    saver = mock.create_autospec(PickleStorageSaver)
    return FileStorage(retriever, saver)
def crawl_fans(self):
    def _crawl(parser, uid, page, num_pages='?'):
        msg = 'Crawl user(%s)\'s fans-page: %s:%s' % (self.uid, num_pages, page)
        write_message(msg, self.window)
        url = 'http://weibo.cn/%s/fans?page=%s' % (uid, page)
        html = self._fetch(url)
        if html is None:
            return None
        try:
            pq_doc = pq(html)
            return parser.parse(pq_doc)
        except:
            return None

    msg = 'Checking: whether user(%s) exists or not...' % self.uid
    write_message(msg, self.window)
    is_exist = self.fetcher.check_user(self.uid)
    if is_exist is None:  # error occurred
        msg = 'Error'
        logger.info(msg)
        write_message(msg, self.window)
        return None
    if not is_exist:
        msg = 'Not exist: %s.' % self.uid
        logger.info(msg)
        write_message(msg, self.window)
        return False

    self.storage = FileStorage(self.uid, settings.MASK_FAN, self.store_path)
    start_time = time.time()
    parser = CnFansParser(self.storage)
    num_pages = _crawl(parser, self.uid, page=1)
    if num_pages is None:  # error occurred
        msg = 'Error'
        logger.info(msg)
        write_message(msg, self.window)
        try:
            self.storage.delete(self.storage.fans_fp, self.storage.fans_f_name)
        except:
            pass
        return None

    pages = [i for i in xrange(2, num_pages + 1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
        worker_manager.wait_all_complete()
        is_None = worker_manager.get_result()
        worker_manager.stop()
        if is_None:  # error occurred
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            try:
                self.storage.delete(self.storage.fans_fp, self.storage.fans_f_name)
            except:
                pass
            return None

    cost_time = int(time.time() - start_time)
    msg = ('Crawl user(%s)\'s fans: total page=%s,'
           ' cost time=%s sec, connections=%s'
           % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
    return True
def setUp(self):
    self.storage = FileStorage()
    self.storage._db_path = 'todo_test_database.csv'
def __set_storage():
    if STORAGE_TYPE == 'mongo':
        return MongoStorage()
    return FileStorage()
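# Hedged usage sketch: __set_storage() presumably runs once at import time to
# pick the backend from the STORAGE_TYPE setting; the assignment below is an
# assumption for illustration, not code from the source module.
STORAGE_TYPE = 'file'
storage = __set_storage()  # FileStorage() here; MongoStorage() if 'mongo'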
from typing import Optional
from contextlib import closing
from hashlib import md5

from storage import FileStorage, ExistsError, NotFoundError
from db import Blob, Session
from exceptions import DbCorruptionError

storage_backend = FileStorage()


def withsession(fn):
    def inner(sess=None, *args, **kwargs):
        if sess is None:
            with closing(Session()) as sess:
                return fn(sess=sess, *args, **kwargs)
        # bug fixes: return the wrapped function's result, and pass sess by
        # keyword so it binds to the right parameter of fn
        return fn(sess=sess, *args, **kwargs)
    return inner


@withsession
def store(data: bytes, sess: Optional[Session] = None):
    if sess is not None:
        data_id = md5(data).hexdigest()
        existing = sess.query(Blob).get(data_id)
        if existing is not None:
            raise ExistsError
        blob = Blob(id=data_id)
        sess.add(blob)
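# Hedged usage sketch for the withsession decorator above. Because inner()
# declares sess as its first parameter, data must be passed by keyword; the
# payload bytes are illustrative.
store(data=b'example payload')               # decorator opens/closes a Session
with closing(Session()) as sess:
    store(data=b'other payload', sess=sess)  # caller-managed session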
def __init__(self, repoDir):
    if not os.path.exists(repoDir):
        raise InvalidRepo("Invalid Repo path")
    # creating required structure
    self.path = repoDir
    self.storage = FileStorage(os.path.join(repoDir, ".svcs"))
def __init__(self, folder=None, overwrite=False, encoding='utf-8', debug=False,
             default_webentity_creation_rule=None, webentity_creation_rules=None):
    # Handling encoding
    self.encoding = encoding

    # Debugging mode
    if debug:
        if not default_webentity_creation_rule:
            default_webentity_creation_rule = ''
        if not webentity_creation_rules:
            webentity_creation_rules = {}
    else:
        # Ensuring the creation rules are set
        if not isinstance(default_webentity_creation_rule, basestring):
            raise TraphException('Given default webentity creation rule is not a string!')
        if not isinstance(webentity_creation_rules, dict):
            raise TraphException('Given webentity creation rules is not a dict!')
        # TODO: check if each value is correctly a string

    # Files
    self.folder = folder
    self.lru_trie_file = None
    self.link_store_file = None
    self.lru_trie_path = None
    self.link_store_path = None

    create = overwrite
    self.in_memory = not bool(folder)

    # Solving paths
    if not self.in_memory:
        self.lru_trie_path = os.path.join(folder, 'lru_trie.dat')
        self.link_store_path = os.path.join(folder, 'link_store.dat')

        # Ensuring the given folder exists
        try:
            os.makedirs(folder)
        except OSError as exception:
            if exception.errno == errno.EEXIST and os.path.isdir(folder):
                pass
            else:
                raise

        # Testing existence of files
        lru_trie_file_exists = os.path.isfile(self.lru_trie_path)
        link_store_file_exists = os.path.isfile(self.link_store_path)

        # Checking consistency
        if lru_trie_file_exists and not link_store_file_exists:
            raise TraphException(
                'File inconsistency: `lru_trie.dat` file exists but not `link_store.dat`.'
            )
        if not lru_trie_file_exists and link_store_file_exists:
            raise TraphException(
                'File inconsistency: `link_store.dat` file exists but not `lru_trie.dat`.'
            )

        # Do we need to create the files for the first time?
        create = overwrite or (not lru_trie_file_exists and not link_store_file_exists)
        flags = 'wb+' if create else 'rb+'

        self.lru_trie_file = open(self.lru_trie_path, flags)
        self.link_store_file = open(self.link_store_path, flags)

        self.lru_trie_storage = FileStorage(
            LRU_TRIE_NODE_BLOCK_SIZE,
            self.lru_trie_file
        )
        self.links_store_storage = FileStorage(
            LINK_STORE_NODE_BLOCK_SIZE,
            self.link_store_file
        )

        # Checking for corruption
        if not create and self.lru_trie_storage.check_for_corruption():
            raise TraphException('File corrupted: `lru_trie.dat`')
        if not create and self.links_store_storage.check_for_corruption():
            raise TraphException('File corrupted: `link_store.dat`')
    else:
        self.lru_trie_storage = MemoryStorage(LRU_TRIE_NODE_BLOCK_SIZE)
        self.links_store_storage = MemoryStorage(LINK_STORE_NODE_BLOCK_SIZE)

    # LRU Trie initialization
    self.lru_trie = LRUTrie(self.lru_trie_storage, encoding=encoding)

    # Link Store initialization
    self.link_store = LinkStore(self.links_store_storage)

    # Webentity creation rules are stored in RAM
    if not debug:
        self.default_webentity_creation_rule = re.compile(
            default_webentity_creation_rule,
            re.I
        )
        self.webentity_creation_rules = {}
        for prefix, pattern in webentity_creation_rules.items():
            self.add_webentity_creation_rule(prefix, pattern, create)
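# Hedged construction sketch for the constructor above, assuming the enclosing
# class is named Traph (suggested by TraphException): on-disk mode creates or
# reopens lru_trie.dat and link_store.dat inside `folder`; omitting `folder`
# switches to MemoryStorage. The rule values are illustrative assumptions, not
# the library's real defaults.
traph = Traph(
    folder='./traph-data',
    default_webentity_creation_rule=r'.*',  # hypothetical catch-all rule
    webentity_creation_rules={}
)
in_memory_traph = Traph(debug=True)  # debug mode skips rule validation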