def collect_customer_bg(cusid='', page=0, src='', des='',
                        out_dir=vgvars.dir_path['out_dir'],
                        output_name='customer_erp'):
    ###### initialize objects
    src = APIStorage(vgvars.erp) if not src else src
    # des = VarStorage({'data': defaultdict(list)}) if not des else des
    # des = FileStorage({'fpath': '{}{}_{}'.format(out_dir, output_name)})
    apicollector = DataCollector(src, des)
    customer_stmt_temp = vgvars.erp['customer']
    if cusid:
        customer_stmt_temp = customer_stmt_temp.format('id={}'.format(cusid))
        data = apicollector.fetch_data(customer_stmt_temp)['data']['currentItems']
        des = FileStorage({'fpath': '{}{}_cusid_{}'.format(out_dir, output_name, cusid)})
        apicollector.des = des
        apicollector.insert_data({
            'selected_format': 'json',
            'values': data
        })
    else:
        customer_stmt_temp = customer_stmt_temp.format('page={}')
        if not page:
            page_num = apicollector.fetch_data(customer_stmt_temp.format(1))['data']['totalPage']
            # page_num = 2  # Testing
            for i in range(1, page_num + 1):
                data = apicollector.fetch_data(customer_stmt_temp.format(i))['data']['currentItems']
                des = FileStorage({'fpath': '{}{}_{}'.format(out_dir, output_name, i)})
                apicollector.des = des
                apicollector.insert_data({
                    'selected_format': 'json',
                    'values': data
                })
def open(self):
    self.synclock.acquire()
    try:
        self.storage = FileStorage(self.path, self.name, self.extension)
        self.storage.open()
    finally:
        self.synclock.release()
def create(config: StorageConfiguration) -> Storage:
    storage = Storage(
        JsonFormattedHostStorage(FileStorage(config.hosts_filename)),
        JsonFormattedUserAccountStorage(FileStorage(config.user_accounts_filename)),
        JsonFormattedUnixAccountStorage(FileStorage(config.unix_accounts_filename)),
        UserAccountActivationInMemoryStorage())
    return storage
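# Usage sketch for the factory above (assumptions flagged inline): it presumes
# StorageConfiguration is a plain settings object exposing the three *_filename
# attributes read by create(); the no-arg constructor and the literal paths below
# are illustrative only, not taken from the original module.
config = StorageConfiguration()          # hypothetical no-arg constructor
config.hosts_filename = 'hosts.json'
config.user_accounts_filename = 'user_accounts.json'
config.unix_accounts_filename = 'unix_accounts.json'
storage = create(config)                 # wires the JSON-formatted file stores together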
class Blockchain:
    def __init__(self):
        self.db = FileStorage('./pybl.db')
        self.current_hash = None
        if self.db.empty():
            genesis = Block.genesis_block()
            self.db.put_block(genesis)
            self.tip = genesis.hash
        else:
            self.tip = self.db.get_last_hash()

    def __iter__(self):
        self.current_hash = self.tip
        return self

    def __next__(self):
        if self.current_hash == '':
            raise StopIteration
        block = self.db.get_block(self.current_hash)
        prev_hash = block.prev_block_hash
        self.current_hash = prev_hash
        return block

    def add_block(self, data):
        last = self.db.get_last_hash()
        block = Block(data, last)
        self.db.put_block(block)
        self.tip = block.hash
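# Minimal usage sketch for the Blockchain class above. It assumes Block(data, prev_hash)
# computes its own .hash, that the genesis block's prev_block_hash is '' (which is what
# terminates __next__), and that FileStorage persists blocks under ./pybl.db.
chain = Blockchain()
chain.add_block('send 1 coin to alice')
for block in chain:            # walks from the tip back to the genesis block
    print(block.hash, '<-', block.prev_block_hash)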
class Repo:
    def __init__(self, repoDir):
        if not os.path.exists(repoDir):
            raise InvalidRepo("Invalid Repo path")
        # creating required structure
        self.path = repoDir
        self.storage = FileStorage(os.path.join(repoDir, ".svcs"))
        # create objects directory under .svcs directory
        # os.makedirs(os.path.join(wd, ".svcs", "objects"))
        # os.makedirs(os.path.join(wd, ".svcs", "tip"))

    def commit(self, commitMsg, userId, listOfFiles):
        """Store all the given files as objects and record a commit."""
        latestCommitId = None
        date = datetime.datetime.utcnow().replace(microsecond=0)
        # parent = None
        files = []
        for itm in listOfFiles:
            filePath = os.path.join(self.path, itm)
            if not os.path.exists(filePath):
                raise InvalidFile
            fd = open(filePath, "r")
            fileObj = File(fd.read())
            self.storage.store_object(fileObj)
            fd.close()
            latestCommitId = fileObj.id
            files.append([itm, fileObj.id])
        parent = self.storage.get_tip()
        if parent is not None:
            parent = parent.id
        comObj = Commit(userId, commitMsg, date, parent, files)
        self.storage.store_object(comObj)
        self.storage.update_tip(comObj)
        return latestCommitId

    def getLogs(self):
        """Return all log entries in the current repo."""
        currentTip = self.storage.get_tip()
        c = currentTip.id
        logs = []
        while c is not None:
            c = self.storage.get_object(c)
            logs.append({
                "id": c.files[0][1],
                "committer": c.committer,
                "message": c.message
            })
            c = c.parent
        return logs
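# Usage sketch for the Repo class above, assuming it lives in a module alongside the
# FileStorage, File and Commit classes it relies on. The directory and file names are
# illustrative; a real repository directory must already exist or InvalidRepo is raised.
repo = Repo("/tmp/myproject")
repo.commit("initial import", "alice", ["README.txt"])
for entry in repo.getLogs():               # newest commit first, following parent links
    print(entry["committer"], entry["message"])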
def crawl_follows(self):
    def _crawl(parser, uid, page, num_pages=''):
        msg = 'Crawl user(%s)\'s follows-page: %s:%s' % (self.uid, num_pages, page)
        write_message(msg, self.window)
        url = 'http://weibo.com/%s/follow?page=%s' % (uid, page)
        html = self._fetch(url, query=settings.QUERY_FOLLOWS)
        try:
            pq_doc = pq(html)
            return parser.parse(pq_doc)
        except:
            return 0

    msg = 'Checking: whether user(%s) exists or not...' % self.uid
    write_message(msg, self.window)
    is_exist = self.fetcher.check_user(self.uid)
    if is_exist is None:
        return
    if not is_exist:
        msg = 'Not exist: %s.' % (self.uid)
        logger.info(msg)
        write_message(msg, self.window)
        return

    self.storage = FileStorage(self.uid, settings.MASK_FOLLOW, self.store_path)
    start_time = time.time()

    parser = ComFollowsParser(self.storage, uids_storage=self.uids_storage)
    num_pages = _crawl(parser, self.uid, page=1)
    if settings.PAGE_LIMIT != 0:
        if num_pages > settings.PAGE_LIMIT:
            msg = 'For sina policy, reduce page count from %s to %s' % (num_pages, settings.PAGE_LIMIT)
            write_message(msg, self.window)
            num_pages = settings.PAGE_LIMIT

    pages = [i for i in xrange(2, num_pages + 1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
        worker_manager.wait_all_complete()

    cost_time = int(time.time() - start_time)
    msg = ('Crawl user(%s)\'s follows: total page=%s,'
           ' cost time=%s sec, connections=%s'
           % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
def collect_orders(start_date, end_date, eids=[], src='', des='',
                   out_dir=vgvars.dir_path['out_dir'],
                   output_name='order_data',
                   gen_stmt=get_collect_order_stmt):
    ###### prepare
    has_des = False if not des else True
    ###### initialize objects
    src = DBStorage(vgvars.vgdb) if not src else src
    db_collector = VGDBCollector(src)
    ###### collecting
    # make sure that data is collected within a selected period of time like a year, a month, or a week
    date_list = convert_datetime.divide_dates(start_date, end_date)  # in months
    for i in date_list:
        start_date, end_date = i
        # update destination storage
        # des = GSStorage(des)
        des = des if has_des else FileStorage({
            'fpath': '{}{}_{}_{}'.format(out_dir, output_name, start_date, end_date)
        })
        db_collector.des = des
        # collect data
        collect(start_date, end_date, db_collector, eids, gen_stmt=gen_stmt)
    return db_collector.des.data
def setup_function(function):
    global storage, flower_entry
    with open(STORAGE_FILENAME, 'w') as f:
        pass
    storage = FileStorage(STORAGE_FILENAME)
    flower_entry = FlowerEntry(1, "tree", 1, 1)
def process_file(file_url: str) -> Tuple[str, Tuple[str, ...]]:
    """Process file with download, cache and upgrade."""
    _, file_ext = os.path.splitext(file_url)
    folder_hash = md5(file_url.encode('utf-8')).hexdigest()
    path = f"/notebooks/{folder_hash}"
    original = f"original{file_ext}"
    converted = f"converted{file_ext}"
    # TODO: delete the folder completely if `force`
    if not os.path.exists(path):
        file_content = _download_file(file_url)
        os.mkdir(path)
        with open(f"{path}/{original}", "w") as original_file:
            original_file.write(file_content)
        try:
            output = _convert_file(f"{path}/{original}", f"{path}/{converted}")
        except ConvertionException as error:
            shutil.rmtree(path)
            raise error
        with open(f"{path}/output", "w") as summary_output:
            summary_output.write('\n'.join(output))
        shutil.copy('report.txt', f"{path}/report")
        # persist `report.txt` to GCS
        storage = FileStorage()
        storage.save_file('report.txt', folder_hash)
        # found a python file, need to encode separately
        if original.endswith('.py'):
            result_filenames = []
            for py_file in [original, converted]:
                result_filenames.append(_save_ipynb_from_py(path, py_file))
            assert len(result_filenames) == 2
            return path, tuple(result_filenames[:2])
    if original.endswith('.py'):
        return path, (original.replace('.py', '.ipynb'),
                      converted.replace('.py', '.ipynb'))
    return path, (original, converted)
def local(db='file', folder=None, uids=[]):
    global give_ups

    create = create_cookie_file()
    fetcher = CnFetcher(account, pwd, cookie_file if not create else None)
    if create:
        fetcher.login(cookie_filename=cookie_file)

    while give_ups > 0:
        while len(tokens) == 0:
            if give_ups > 0:
                pass
            else:
                return
        token = tokens.pop()
        cb = callback(token)

        if len(uids) == 0:
            give_ups = 0
        else:
            uid = uids.pop()
            try:
                crawler = UserCrawler(uid, is_uid=True, fetcher=fetcher,
                                      fetch_fans=False, callbacks=cb, span=False)
                uid = crawler.uid

                if db == 'file' and folder is not None:
                    storage = FileStorage(uid, folder)
                elif db == 'mongo':
                    storage = MongoStorage(uid)
                else:
                    raise ValueError('db must be "file" or "mongo"; '
                                     'when db is "file", you must also define the folder parameter.')

                if storage.crawled:
                    storage.complete()
                    cb()
                    continue
                else:
                    crawler.set_storage(storage)

                crawler.start()
            except Exception, e:
                cb()
                # raise e
                logger.exception(e)
def collect_estore_bg(eids=[], eloginnames=[], get_eids=False, get_eids_args=[],
                      get_eids_function=get_ordered_estore_id, src='', des='',
                      out_dir=vgvars.dir_path['out_dir'], output_name='e_bg',
                      gen_stmt=get_collect_estore_bg_stmt,
                      max_query=vgvars.max_vgdb_query_num):
    ###### prepare
    has_des = False if not des else True
    ###### initialize objects
    src = DBStorage(vgvars.vgdb) if not src else src
    db_collector = VGDBCollector(src)
    # by default, collect estore bg of estores that have orders in a given period of time
    if get_eids:
        get_eids_args.append(db_collector)
        eids = get_eids_function(*get_eids_args)
    if eloginnames or eids:
        if eloginnames:
            qnum = math.ceil(len(eloginnames) / max_query)
        else:
            qnum = math.ceil(len(eids) / max_query)
        for i in range(qnum):
            start_index = i * max_query
            end_index = start_index + max_query
            selectedloginnames = eloginnames[start_index:end_index]
            selectedeids = eids[start_index:end_index]
            des = des if has_des else FileStorage({
                'fpath': '{}{}_{}_{}'.format(out_dir, output_name, start_index, end_index - 1)
            })
            db_collector.des = des
            if selectedeids or selectedloginnames:
                logging.debug('collect estore bg from {} to {}'.format(
                    start_index, end_index - 1))
                stmt = get_collect_estore_bg_stmt(
                    estore_loginnames=selectedloginnames,
                    estore_ids=selectedeids,
                )
                db_collector.fetch_data(stmt)
                db_collector.insert_data()
    else:
        # later: collect bg when no eids and eloginnames are provided
        pass
def __init__(self):
    self.parser = AdvertisementParser()
    self.storage = MongoStorage('adv_data') if storage_type == 'mongo' else FileStorage('adv_data')
    if isinstance(self.storage, MongoStorage):
        self.links = self.storage.load('adv_links', {'flag': False})
    else:
        self.links = self.storage.load('lnk')
    self.queue = self.create_queue()
def test_storage_delete_entry_by_name_false():
    # Given
    another_storage = FileStorage(STORAGE_FILENAME)

    # When
    deleted = storage.delete_entry_by_name(flower_entry)

    # Then
    assert not deleted
    assert_is_entry_in_storage(storage, flower_entry, amount=0, exists=False)
    assert_is_entry_in_storage(another_storage, flower_entry, amount=0, exists=False)
def crawl_weibos(self):
    def _crawl(parser, uid, page, num_pages=''):
        msg = 'Crawl user(%s)\'s weibos-page: %s:%s' % (self.uid, num_pages, page)
        write_message(msg, self.window)
        html = self._fetch_weibo(uid, page)
        try:
            pq_doc = pq(html)
            return parser.parse(pq_doc)
        except:
            return 0

    msg = 'Checking: whether user(%s) exists or not...' % self.uid
    write_message(msg, self.window)
    is_exist = self.fetcher.check_user(self.uid)
    if is_exist is None:
        return
    if not is_exist:
        msg = 'Not exist: %s.' % self.uid
        logger.info(msg)
        write_message(msg, self.window)
        return

    self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)
    start_time = time.time()

    parser = ComWeibosParser(self.uid, self.storage, weibos_storage=self.weibos_storage)
    num_pages = _crawl(parser, self.uid, page=1)

    pages = [i for i in xrange(2, num_pages + 1)]
    """
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
        worker_manager.wait_all_complete()
    """

    cost_time = int(time.time() - start_time)
    msg = ('Crawl user(%s)\'s weibos: total page=%s,'
           ' cost time=%s sec, connections=%s'
           % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
def test_storage_add_entry():
    # Given
    # When
    storage.add_entry(flower_entry)
    another_storage = FileStorage(STORAGE_FILENAME)

    # Then
    assert_is_entry_in_storage(storage, flower_entry, amount=1, exists=True)
    assert_is_entry_in_storage(another_storage, flower_entry, amount=1, exists=True)
def crawl_msg_reposts(self):
    def _crawl(parser, msg_id, page, num_pages=''):
        msg = 'Crawl message(%s)\'s reposts-page:%s:%s' % (self.msg_id, num_pages, page)
        write_message(msg, self.window)
        html, num_pages = self._fetch_msg_repost(msg_id, page)
        try:
            pq_doc = pq(html)
            parser.parse(pq_doc)
        except:
            pass
        return num_pages

    msg = 'Checking: whether message exists or not...'
    write_message(msg, self.window)
    msg_id = self.fetcher.check_message(self.msg_url)
    if msg_id is None:
        msg = 'Not exist: %s.' % self.msg_url
        logger.info(msg)
        write_message(msg, self.window)
        return

    self.msg_id = msg_id
    self.storage = FileStorage(self.msg_id, settings.MASK_REPOST, self.store_path)
    start_time = time.time()

    parser = ComRepostsParser(msg_id, self.storage)
    num_pages = _crawl(parser, self.msg_id, 1)

    pages = [i for i in xrange(2, num_pages + 1)]
    if len(pages) > 0:
        n_threads = 5
        worker_manager = WorkerManager(n_threads)
        for pg in pages:
            worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)
        worker_manager.wait_all_complete()

    cost_time = int(time.time() - start_time)
    msg = ('Crawl message(%s)\'s reposts: total page=%s,'
           ' cost time=%s sec, connections=%s'
           % (self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
def collect_estore_contracts_erp(src='', des='', contract_ids=[], from_date='',
                                 serviceName='', page=0,
                                 out_dir=vgvars.dir_path['out_dir'],
                                 output_name='e_erp_contract'):
    ###### initialize objects
    src = APIStorage(vgvars.erp) if not src else src
    des = VarStorage({'data': defaultdict(list)}) if not des else des
    apicollector = DataCollector(src, des)
    ####### perform
    contract_stmt_temp = vgvars.erp['contract']
    contract_dict = defaultdict(list)
    if not page:
        page_num = apicollector.fetch_data(contract_stmt_temp.format(1, '', ''))['data']['totalPage']
        # page_num = 2 # TESTING
        for n in range(1, page_num + 1):
            data = apicollector.fetch_data(contract_stmt_temp.format(n, '', ''))['data']['currentItems']
            if from_date:
                df = DataFrame(data)
                origin_dates = df['createdDateTime']
                df['createdDateTime'] = pd.to_datetime(df['createdDateTime'])
                selectedDf = df[df['createdDateTime'] >= from_date]
                selectedDf['createdDateTime'] = selectedDf['createdDateTime'].map(
                    lambda x: x.strftime('%Y-%m-%d'))
                selectedDf = selectedDf.T
                selected_data = selectedDf.to_dict()
                group_contract_by_start_date(contract_dict, selected_data)
                if len(selected_data) < len(data):
                    break
            else:
                group_contract_by_start_date(contract_dict, data)
    for m in contract_dict:
        apicollector.des = FileStorage(
            {'fpath': '{}{}_{}'.format(out_dir, output_name, m)})
        apicollector.insert_data({
            'selected_format': 'json',
            'values': contract_dict[m]
        })
def collect_dept_bg(company_id=vgvars.erp_default['vghn'], src='', des='',
                    out_dir=vgvars.dir_path['out_dir'], output_name='dept_erp'):
    ###### initialize objects
    src = APIStorage(vgvars.erp) if not src else src
    # des = VarStorage({'data': defaultdict(list)}) if not des else des
    des = FileStorage({'fpath': '{}{}_{}'.format(out_dir, output_name, company_id)})
    apicollector = DataCollector(src, des)
    ####### perform
    dept_stmt_temp = vgvars.erp['dept']
    data = apicollector.fetch_data(dept_stmt_temp.format(company_id))['data']
    apicollector.insert_data({
        'selected_format': 'json',
        'values': data
    })
def main():
    args = docopt(__doc__)
    storage = FileStorage("entries.txt")
    bailer.init_storage(storage)
    if args.get('getall'):
        print(bailer.get_all_flowers())
    elif args.get('add'):
        print(bailer.add_flower(args.get('<flower-name>'),
                                args.get('<watering-interval>')))
    elif args.get('remove'):
        print(bailer.remove_flower(args.get('<flower-name>')))
    elif args.get('water'):
        if args.get('--force'):
            print(bailer.water_flower_force(args.get('<flower-name>')))
        else:
            print(bailer.water_flower(args.get('<flower-name>')))
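# main() above relies on docopt(__doc__), so the module must begin with a usage
# docstring whose commands and placeholders match the args.get(...) keys. The original
# docstring is not shown in this snippet; the following is a plausible sketch only.
"""Flower watering bailer.

Usage:
  bailer.py getall
  bailer.py add <flower-name> <watering-interval>
  bailer.py remove <flower-name>
  bailer.py water <flower-name> [--force]
"""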
def check_new_weibos(self):
    def _crawl(parser, uid, page, num_pages=''):
        msg = 'check new weibo in user(%s)\'s weibos-page: %s:%s' % (self.uid, num_pages, page)
        write_message(msg, self.window)
        html = self._fetch_weibo(uid, page)
        try:
            pq_doc = pq(html)
            return parser.parse(pq_doc)
        except:
            return 0

    msg = 'Checking: whether user(%s) exists or not...' % self.uid
    write_message(msg, self.window)
    is_exist = self.fetcher.check_user(self.uid)
    if is_exist is None:
        return
    if not is_exist:
        msg = 'Not exist: %s.' % self.uid
        logger.info(msg)
        write_message(msg, self.window)
        return

    self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)
    start_time = time.time()

    parser = ComWeibosParser(self.uid, self.storage)
    num_pages = _crawl(parser, self.uid, page=1)

    cost_time = int(time.time() - start_time)
    msg = ('Crawl user(%s)\'s weibos: total page=%s,'
           ' cost time=%s sec, connections=%s'
           % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
def collect_estore_spending(start_date, end_date, eids=[], get_eids=False,
                            src='', des='', out_dir=vgvars.dir_path['out_dir'],
                            output_name='e_spending_data',
                            gen_stmt=get_collect_estore_spending_stmt):
    ###### prepare
    has_des = False if not des else True
    ###### initialize objects
    src = DBStorage(vgvars.vgdb) if not src else src
    db_collector = VGDBCollector(src)
    ###### collecting
    # make sure that data is collected within a selected period of time like a year, a month, or a week
    date_list = convert_datetime.divide_dates(start_date, end_date)  # in months
    for d in date_list:
        start_date, end_date = d
        # update destination storage
        # des = GSStorage(des)
        des = des if has_des else FileStorage({
            'fpath': '{}{}_{}_{}'.format(out_dir, output_name, start_date, end_date)
        })
        db_collector.des = des
        # not good enough, since data is checked against all estores having orders in a month
        if get_eids:
            min_ord_id, max_ord_id = db_collector.get_max_min_id_by_time(start_date, end_date)
            eids = collect_estore_ids(min_ord_id, max_ord_id, db_collector)
        # collect data
        collect(start_date, end_date, db_collector, eids, gen_stmt=gen_stmt)
    return db_collector.des.data
def crawl_infos(self):
    msg = 'Checking: whether user(%s) exists or not...' % self.uid
    write_message(msg, self.window)
    is_exist = self.fetcher.check_user(self.uid)
    if is_exist is None:
        return
    if not is_exist:
        msg = 'Not exist: %s.' % self.uid
        logger.info(msg)
        write_message(msg, self.window)
        return

    msg = 'Crawl user(%s)\'s profile' % self.uid
    logger.info(msg)
    write_message(msg, self.window)

    self.storage = FileStorage(self.uid, settings.MASK_INFO, self.store_path)
    start_time = time.time()

    url = 'http://weibo.com/%s/info' % self.uid
    parser = ComInfosParser(self.uid, self.storage)
    html = self._fetch(url, query=settings.QUERY_INFO)
    try:
        pq_doc = pq(html)
        parser.parse(pq_doc)
    except:
        pass

    cost_time = int(time.time() - start_time)
    msg = ('Crawl user(%s)\'s infos: cost time=%s sec, connections=%s'
           % (self.uid, cost_time, self.fetcher.n_connections))
    logger.info(msg)
    write_message(msg, self.window)
def collect_staff_bg_erp(compids=['315', '319', '305', '320'], page=0, src='',
                         des='', out_dir=vgvars.dir_path['out_dir'],
                         output_name='sfaff_bg_erp'):
    ###### initialize objects
    src = APIStorage(vgvars.erp) if not src else src
    apicollector = DataCollector(src, '')
    ####### perform
    dept_stmt_temp = vgvars.erp['staffbg']
    if compids:
        for compid in compids:
            if not page:
                page_num = apicollector.fetch_data(
                    dept_stmt_temp.format(compid, 1, '', ''))['data']['totalPage']
                # page_num = 2 # TESTING
                datalist = []
                for n in range(1, page_num + 1):
                    data = apicollector.fetch_data(
                        dept_stmt_temp.format(compid, n, '', ''))['data']['currentItems']
                    datalist.extend(data)
                des = FileStorage(
                    {'fpath': '{}{}_{}'.format(out_dir, output_name, compid)})
                apicollector.des = des
                apicollector.insert_data({
                    'selected_format': 'json',
                    'values': datalist
                })
class ComWeiboCrawler(object):
    def __init__(self, fetcher, store_path, **kwargs):
        self.fetcher = fetcher
        self.store_path = store_path
        self.uid = kwargs.get('uid', None)
        self.msg_url = kwargs.get('msg_url', None)
        self.window = kwargs.get('window', None)

    def _check_page_right(self, html):
        '''
        check whether the page is got before login or after.
        '''
        if html is None:
            return False
        if len(html) == 0:
            # "Weibo has been redesigned; the info markup has changed"
            msg = u'weibo改版了,信息标签发生变化'
            logger.info(msg)
            write_message(msg, self.window)
            return None
        return not (u'<title>' in html)

    def _fetch_weibo(self, uid, page):
        html = self.fetcher.fetch_weibo(uid, page)
        page_right = self._check_page_right(html)
        if page_right is None:
            return None
        if page_right:
            return html

        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()
            sec = (tries + 1) * 10
            write_message('_fetch trying: %s, sleep: %s seconds' % (tries, sec), self.window)
            time.sleep(sec)

            html = self.fetcher.fetch_weibo(uid, page)
            page_right = self._check_page_right(html)
            if page_right:
                return html
            tries += 1
        return None

    def _fetch(self, url, query):
        html = self.fetcher.fetch(url, query)
        page_right = self._check_page_right(html)
        if page_right is None:
            return None
        if page_right:
            return html

        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()
            sec = (tries + 1) * 10
            write_message('_fetch trying: %s, sleep: %s seconds' % (tries, sec), self.window)
            time.sleep(sec)

            html = self.fetcher.fetch(url, query)
            page_right = self._check_page_right(html)
            if page_right:
                return html
            tries += 1
        return None

    def _fetch_msg_repost(self, msg_id, page=1):
        html, num_pages = self.fetcher.fetch_msg_reposts(msg_id, page)
        page_right = self._check_page_right(html)
        if page_right is None:
            return None
        if page_right:
            return html, num_pages

        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()
            sec = (tries + 1) * 10
            write_message('_fetch trying: %s, sleep: %s seconds' % (tries, sec), self.window)
            time.sleep(sec)

            html, num_pages = self.fetcher.fetch_msg_reposts(msg_id, page)
            page_right = self._check_page_right(html)
            if page_right:
                return html, num_pages
            tries += 1
        return None, None

    def _fetch_msg_comment(self, msg_id, page=1):
        html, num_pages = self.fetcher.fetch_msg_comments(msg_id, page)
        page_right = self._check_page_right(html)
        if page_right is None:
            return None
        if page_right:
            return html, num_pages

        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()
            sec = (tries + 1) * 10
            write_message('_fetch trying: %s, sleep: %s seconds' % (tries, sec), self.window)
            time.sleep(sec)

            # retry against the comments endpoint
            html, num_pages = self.fetcher.fetch_msg_comments(msg_id, page)
            page_right = self._check_page_right(html)
            if page_right:
                return html, num_pages
            tries += 1
        return None, None

    def crawl_weibos(self):
        def _crawl(parser, uid, page, num_pages='?'):
            msg = 'Crawl user(%s)\'s weibos-page: %s:%s' % (self.uid, num_pages, page)
            write_message(msg, self.window)
            html = self._fetch_weibo(uid, page)
            if html is None:
                return None
            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except:
                return None

        msg = 'Checking: whether user(%s) exists or not...' % self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        if is_exist is None:
            # error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            return None
        if not is_exist:
            msg = 'Not exist: %s.' % self.uid
            logger.info(msg)
            write_message(msg, self.window)
            return False

        self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)
        start_time = time.time()

        parser = ComWeibosParser(self.uid, self.storage)
        num_pages = _crawl(parser, self.uid, page=1)
        if num_pages is None:
            # error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            try:
                self.storage.delete(self.storage.weibos_fp, self.storage.weibos_f_name)
            except:
                pass
            return None

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5
            worker_manager = WorkerManager(n_threads)
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()
            if is_None:
                # error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)
                try:
                    self.storage.delete(self.storage.weibos_fp, self.storage.weibos_f_name)
                except:
                    pass
                return None

        cost_time = int(time.time() - start_time)
        msg = ('Crawl user(%s)\'s weibos: total page=%s,'
               ' cost time=%s sec, connections=%s'
               % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
        return True

    def crawl_follows(self):
        def _crawl(parser, uid, page, num_pages='?'):
            msg = 'Crawl user(%s)\'s follows-page: %s:%s' % (self.uid, num_pages, page)
            write_message(msg, self.window)
            url = 'http://weibo.com/%s/follow?page=%s' % (uid, page)
            html = self._fetch(url, query=settings.QUERY_FOLLOWS)
            if html is None:
                return None
            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except:
                return None

        msg = 'Checking: whether user(%s) exists or not...' % self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        if is_exist is None:
            # error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            return None
        if not is_exist:
            msg = 'Not exist: %s.' % (self.uid)
            logger.info(msg)
            write_message(msg, self.window)
            return False

        self.storage = FileStorage(self.uid, settings.MASK_FOLLOW, self.store_path)
        start_time = time.time()

        parser = ComFollowsParser(self.storage)
        num_pages = _crawl(parser, self.uid, page=1)
        if num_pages is None:
            # error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            try:
                self.storage.delete(self.storage.follows_fp, self.storage.follows_f_name)
            except:
                pass
            return None

        if settings.PAGE_LIMIT != 0:
            if num_pages > settings.PAGE_LIMIT:
                msg = 'For sina policy, reduce page count from %s to %s' % (num_pages, settings.PAGE_LIMIT)
                write_message(msg, self.window)
                num_pages = settings.PAGE_LIMIT

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5
            worker_manager = WorkerManager(n_threads)
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()
            if is_None:
                # error occur: _crawl returned None
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)
                try:
                    self.storage.delete(self.storage.follows_fp, self.storage.follows_f_name)
                except:
                    pass
                return None

        cost_time = int(time.time() - start_time)
        msg = ('Crawl user(%s)\'s follows: total page=%s,'
               ' cost time=%s sec, connections=%s'
               % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
        return True

    def crawl_fans(self):
        def _crawl(parser, uid, page, num_pages='?'):
            msg = 'Crawl user(%s)\'s fans-page: %s:%s' % (self.uid, num_pages, page)
            write_message(msg, self.window)
            url = 'http://weibo.com/%s/fans?page=%s' % (uid, page)
            html = self._fetch(url, query=settings.QUERY_FANS)
            if html is None:
                return None
            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except:
                return None

        msg = 'Checking: whether user(%s) exists or not...' % self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        if is_exist is None:
            # error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            return None
        if not is_exist:
            msg = 'Not exist: %s.' % (self.uid)
            logger.info(msg)
            write_message(msg, self.window)
            return False

        self.storage = FileStorage(self.uid, settings.MASK_FAN, self.store_path)
        start_time = time.time()

        parser = ComFansParser(self.storage)
        num_pages = _crawl(parser, self.uid, page=1)
        if num_pages is None:
            # error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            try:
                self.storage.delete(self.storage.fans_fp, self.storage.fans_f_name)
            except:
                pass
            return None

        if settings.PAGE_LIMIT != 0:
            if num_pages > settings.PAGE_LIMIT:
                msg = 'For sina policy, reduce page count from %s to %s' % (num_pages, settings.PAGE_LIMIT)
                write_message(msg, self.window)
                num_pages = settings.PAGE_LIMIT

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5
            worker_manager = WorkerManager(n_threads)
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()
            if is_None:
                # error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)
                try:
                    self.storage.delete(self.storage.fans_fp, self.storage.fans_f_name)
                except:
                    pass
                return None

        cost_time = int(time.time() - start_time)
        msg = ('Crawl user(%s)\'s fans: total page=%s,'
               ' cost time=%s sec, connections=%s'
               % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
        return True

    def crawl_infos(self):
        msg = 'Checking: whether user(%s) exists or not...' % self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        if is_exist is None:
            # error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            return None
        if not is_exist:
            msg = 'Not exist: %s.' % self.uid
            logger.info(msg)
            write_message(msg, self.window)
            return False

        msg = 'Crawl user(%s)\'s profile' % self.uid
        logger.info(msg)
        write_message(msg, self.window)

        self.storage = FileStorage(self.uid, settings.MASK_INFO, self.store_path)
        start_time = time.time()

        url = 'http://weibo.com/%s/info' % self.uid
        parser = ComInfosParser(self.uid, self.storage)
        html = self._fetch(url, query=settings.QUERY_INFO)

        cost_time = int(time.time() - start_time)
        msg = ('Crawl user(%s)\'s infos: cost time=%s sec, connections=%s'
               % (self.uid, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)

        if html is None:
            # error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            try:
                self.storage.delete(self.storage.infos_fp, self.storage.infos_f_name)
            except:
                pass
            return None
        try:
            pq_doc = pq(html)
            parser.parse(pq_doc)
            return True
        except:
            # error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            try:
                self.storage.delete(self.storage.infos_fp, self.storage.infos_f_name)
            except:
                pass
            return None

    def crawl_msg_reposts(self):
        def _crawl(parser, msg_id, page, num_pages='?'):
            msg = 'Crawl message(%s)\'s reposts-page:%s:%s' % (self.msg_id, num_pages, page)
            write_message(msg, self.window)
            html, num_pages = self._fetch_msg_repost(msg_id, page)
            if html is None:
                return None
            try:
                pq_doc = pq(html)
                parser.parse(pq_doc)
                return num_pages
            except:
                return None

        msg = 'Checking: whether message exists or not...'
        write_message(msg, self.window)
        msg_id = self.fetcher.check_message(self.msg_url)
        if msg_id is None:
            # error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            return None
        if msg_id is False:
            msg = 'Not exist: %s.' % self.msg_url
            logger.info(msg)
            write_message(msg, self.window)
            return False

        self.msg_id = msg_id
        self.storage = FileStorage(self.msg_id, settings.MASK_REPOST, self.store_path)
        start_time = time.time()

        parser = ComRepostsParser(msg_id, self.storage)
        num_pages = _crawl(parser, self.msg_id, 1)
        if num_pages is None:
            # error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            try:
                self.storage.delete(self.storage.reposts_fp, self.storage.reposts_f_name)
            except:
                pass
            return None

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5
            worker_manager = WorkerManager(n_threads)
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)
            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()
            if is_None:
                # error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)
                try:
                    self.storage.delete(self.storage.reposts_fp, self.storage.reposts_f_name)
                except:
                    pass
                return None

        cost_time = int(time.time() - start_time)
        msg = ('Crawl message(%s)\'s reposts: total page=%s,'
               ' cost time=%s sec, connections=%s'
               % (self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
        return True

    def crawl_msg_comments(self):
        def _crawl(parser, msg_id, page, num_pages='?'):
            msg = 'Crawl message(%s)\'s comments-page:%s:%s' % (msg_id, num_pages, page)
            write_message(msg, self.window)
            html, num_pages = self._fetch_msg_comment(msg_id, page)
            if html is None:
                return None
            try:
                pq_doc = pq(html)
                parser.parse(pq_doc)
                return num_pages
            except:
                return None

        msg = 'Checking: whether message exists or not...'
        write_message(msg, self.window)
        msg_id = self.fetcher.check_message(self.msg_url)
        if msg_id is None:
            # error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            return None
        if msg_id is False:
            msg = 'Not exist: %s.' % self.msg_url
            logger.info(msg)
            write_message(msg, self.window)
            return False

        self.msg_id = msg_id
        self.storage = FileStorage(self.msg_id, settings.MASK_COMMENT, self.store_path)
        start_time = time.time()

        parser = ComCommentsParser(msg_id, self.storage)
        num_pages = _crawl(parser, self.msg_id, 1)
        if num_pages is None:
            # error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            try:
                self.storage.delete(self.storage.comments_fp, self.storage.comments_f_name)
            except:
                pass
            return None

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5
            worker_manager = WorkerManager(n_threads)
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)
            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()
            if is_None:
                # error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)
                try:
                    self.storage.delete(self.storage.comments_fp, self.storage.comments_f_name)
                except:
                    pass
                return None

        cost_time = int(time.time() - start_time)
        msg = ('Crawl message(%s)\'s comments: total page=%s,'
               ' cost time=%s sec, connections=%s'
               % (self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
        return True
class Persistent(object):
    def __init__(self, name, **kw):
        super(Persistent, self).__init__()
        self.name = name
        self.storage = None
        self.synclock = RLock()
        self.path = kw.get('path', PDODIR)
        self.encode = kw.get('encode', repr)
        self.decode = kw.get('decode', eval)
        self.extension = kw.get('extension', 'dat')
        self.autopersist = kw.get('autopersist', True)
        if self.autopersist:
            self.load()

    def open(self):
        self.synclock.acquire()
        try:
            self.storage = FileStorage(self.path, self.name, self.extension)
            self.storage.open()
        finally:
            self.synclock.release()

    def close(self):
        self.synclock.acquire()
        try:
            self.storage.close()
            self.storage = None
        finally:
            self.synclock.release()

    def closed(self):
        storage = self.storage
        if storage is None:
            return True
        elif storage.closed():
            return True
        return False

    def update_storage(self):
        """
        Serialize any data associated with the object and update the storage
        record to match the serialization.
        """
        self.synclock.acquire()
        try:
            data = self.getencoded()
            self.storage.set(data)
        finally:
            self.synclock.release()

    def update_data(self):
        self.synclock.acquire()
        try:
            data = self.storage.getdata()
            self.setencoded(data)
        finally:
            self.synclock.release()

    def commit(self):
        """
        Update storage with most recent data, then commit changes.
        """
        self.synclock.acquire()
        try:
            self.update_storage()
            self.storage.commit()
            self.notify_committed()
        finally:
            self.synclock.release()

    def load(self):
        """
        Load most recently stored data, then update current data with
        loaded content.
        """
        self.synclock.acquire()
        try:
            if self.storage is None:
                self.open()
            self.storage.load()
            self.update_data()
            self.notify_loaded()
        finally:
            self.synclock.release()

    def serialize(self, data):
        if self.encode is not None:
            data = self.encode(data)
        return data

    def unserialize(self, data):
        if self.decode is not None:
            data = self.decode(data)
        return data

    def getencoded(self):
        """
        Return encoded representation of current data object.
        This method must be overridden in type-specific subclasses.
        """
        raise TypeError("Method must be overridden")

    def setencoded(self, data):
        """
        Use encoded representation of persisted data object to update
        current data object.
        This method must be overridden in type-specific subclasses.
        """
        raise TypeError("Method must be overridden")

    def notify_committed(self):
        pass

    def notify_loaded(self):
        pass
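# A minimal sketch of a type-specific subclass of Persistent above, since getencoded()
# and setencoded() must be overridden. It assumes the default encode/decode pair
# (repr/eval) and persists a plain dict; the class name and attribute are illustrative,
# not part of the original module.
class PersistentDict(Persistent):
    def __init__(self, name, **kw):
        self.data = {}
        super(PersistentDict, self).__init__(name, **kw)

    def getencoded(self):
        # serialize the current dict with the configured encoder (repr by default)
        return self.serialize(self.data)

    def setencoded(self, encoded):
        # rebuild the dict from the stored representation (eval by default)
        self.data = self.unserialize(encoded)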
class MainWindow(QDialog):
    def __init__(self, parent=None):
        QWidget.__init__(self, parent)
        self.storage = FileStorage()
        #self.languages = Languages()

        self.setWindowTitle("Find snippet")
        self.setWindowFlags(self.windowFlags() | Qt.WindowStaysOnTopHint)

        # -----------------------------------------------------------
        # Window layout
        self.input = QLineEdit(self)
        self.input.setMinimumWidth(300)
        QObject.connect(self.input, SIGNAL('returnPressed()'), self.on_return)

        self.outcome = QLabel("")

        layout = QVBoxLayout()
        layout.addWidget(self.input)
        layout.addWidget(self.outcome)
        layout.setSizeConstraint(QLayout.SetFixedSize)
        self.setLayout(layout)

        # -----------------------------------------------------------
        # In window shortcuts
        def create_shortcut(keys, slot, *args):
            shortcut = QShortcut(self)
            shortcut.setKey(keys)
            if slot:
                if args:
                    QObject.connect(shortcut, SIGNAL("activated()"), partial(slot, *args))
                else:
                    QObject.connect(shortcut, SIGNAL("activated()"), slot)

        for i in xrange(0, 10):
            create_shortcut("Ctrl+%d" % i, self.on_copy, i)
            create_shortcut("Shift+Ctrl+%d" % i, self.on_delete, i)
        create_shortcut("Esc", self.on_escape)
        create_shortcut("Ctrl+Up", self.on_page, 'prev')
        create_shortcut("Ctrl+Down", self.on_page, 'next')
        create_shortcut("Up", self.on_page, 'prev')
        create_shortcut("Down", self.on_page, 'next')

        # -----------------------------------------------------------
        # Systray and global shortcuts
        self.systray = KSystemTrayIcon(self)
        self.systray.setIcon(QIcon(icon_path()))
        self.systray.show()

        def add_action(systray, id, text, icon, shortcut, slot):
            action = systray.actionCollection().addAction(id)
            action.setText(text)
            action.setIcon(icon)
            if shortcut:
                ashortcut = KShortcut(shortcut)
                action.setShortcut(ashortcut)
                action.setGlobalShortcut(ashortcut)
            self.connect(action, SIGNAL("triggered()"), slot)
            menu = systray.contextMenu()
            menu.addAction(action)

        add_action(self.systray, 'find-snippet', "Find snippet", QIcon(icon_path()),
                   'Ctrl+Alt+B', self.on_toogle)
        add_action(self.systray, 'add-snippet', "Add snippet", QIcon(icon_path()),
                   'Ctrl+Alt+N', self.on_add)

        self.add_dialog = AddDialog(self)
        self.set_results([])

    def closeEvent(self, event):
        self.setVisible(False)
        event.ignore()

    def on_systray(self, reason):
        # QSystemTrayIcon.DoubleClick
        if reason == QSystemTrayIcon.Trigger:
            self.on_toogle()
        if reason == QSystemTrayIcon.MiddleClick:
            self.on_add()

    def on_toogle(self, *a):
        if self.isVisible():
            self.hide()
        else:
            self.show()

    def on_add(self):
        #self.add_dialog.show()
        self.add_dialog.display()

    def on_copy(self, nr):
        nr = nr - 1
        if nr < (len(self.search_results) - 10 * self.search_page):
            text = self.search_results[10 * self.search_page + nr].code
            QApplication.clipboard().setText(text)
        self.close()

    def on_delete(self, nr):
        nr = nr - 1
        if nr < (len(self.search_results) - 10 * self.search_page):
            snippet = self.search_results[10 * self.search_page + nr]
            reply = QMessageBox.question(
                self, "Delete snippet",
                "Delete this snippet?" + format_code(snippet.code, snippet.lang),
                QMessageBox.Yes | QMessageBox.Default,
                QMessageBox.No | QMessageBox.Escape
            )
            if reply:
                self.storage.delete(snippet)
        self.close()

    def on_return(self, *a):
        query_str = unicode(self.input.text())
        try:
            query_ast = parse(query_str)
            result = self.storage.search(query_ast)
            self.set_results(result)
        except ParseError, e:
            self.display_error()
class ComWeiboCrawler(object): def __init__(self, fetcher, store_path, **kwargs): self.fetcher = fetcher self.store_path = store_path self.uid = kwargs.get('uid', None) self.msg_url = kwargs.get('msg_url', None) self.window = kwargs.get('window', None) def _check_page_right(self, html): ''' check whether the page is got before login or after. ''' if html is None: return False if len(html) == 0: msg = u'weibo改版了,信息标签发生变化' logger.info(msg) write_message(msg, self.window) return None return not (u'<title>' in html) def _fetch_weibo(self, uid, page): html = self.fetcher.fetch_weibo(uid, page) page_right = self._check_page_right(html) if page_right is None: return None if page_right: return html tries = 0 while not page_right and tries <= 10: time.sleep(10) self.fetcher.check_cookie() sec = (tries + 1) * 10 write_message( '_fetch trying: %s, sleep: %s seconds' % (tries, sec), self.window) time.sleep(sec) html = self.fetcher.fetch_weibo(uid, page) page_right = self._check_page_right(html) if page_right: return html tries += 1 return None def _fetch(self, url, query): html = self.fetcher.fetch(url, query) page_right = self._check_page_right(html) if page_right is None: return None if page_right: return html tries = 0 while not page_right and tries <= 10: time.sleep(10) self.fetcher.check_cookie() sec = (tries + 1) * 10 write_message( '_fetch trying: %s, sleep: %s seconds' % (tries, sec), self.window) time.sleep(sec) html = self.fetcher.fetch(url, query) page_right = self._check_page_right(html) if page_right: return html tries += 1 return None def _fetch_msg_repost(self, msg_id, page=1): html, num_pages = self.fetcher.fetch_msg_reposts(msg_id, page) page_right = self._check_page_right(html) if page_right is None: return None if page_right: return html, num_pages tries = 0 while not page_right and tries <= 10: time.sleep(10) self.fetcher.check_cookie() sec = (tries + 1) * 10 write_message( '_fetch trying: %s, sleep: %s seconds' % (tries, sec), self.window) time.sleep(sec) html, num_pages = self.fetcher.fetch_msg_reposts(msg_id, page) page_right = self._check_page_right(html) if page_right: return html, num_pages tries += 1 return None, None def _fetch_msg_comment(self, msg_id, page=1): html, num_pages = self.fetcher.fetch_msg_comments(msg_id, page) page_right = self._check_page_right(html) if page_right is None: return None if page_right: return html, num_pages tries = 0 while not page_right and tries <= 10: time.sleep(10) self.fetcher.check_cookie() sec = (tries + 1) * 10 write_message( '_fetch trying: %s, sleep: %s seconds' % (tries, sec), self.window) time.sleep(sec) html, num_pages = self.fetcher.fetch_msg_reposts(msg_id, page) page_right = self._check_page_right(html) if page_right: return html, num_pages tries += 1 return None, None def crawl_weibos(self): def _crawl(parser, uid, page, num_pages='?'): msg = 'Crawl user(%s)\'s weibos-page: %s:%s' % (self.uid, num_pages, page) write_message(msg, self.window) html = self._fetch_weibo(uid, page) if html is None: return None try: pq_doc = pq(html) return parser.parse(pq_doc) except: return None msg = 'Checking: whether user(%s) exists or not...' % self.uid write_message(msg, self.window) is_exist = self.fetcher.check_user(self.uid) if is_exist is None: #error occur msg = 'Error' logger.info(msg) write_message(msg, self.window) return None if not is_exist: msg = 'Not exist: %s.' 
% self.uid logger.info(msg) write_message(msg, self.window) return False self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path) start_time = time.time() parser = ComWeibosParser(self.uid, self.storage) num_pages = _crawl(parser, self.uid, page=1) if num_pages is None: #error occur msg = 'Error' logger.info(msg) write_message(msg, self.window) try: self.storage.delete(self.storage.weibos_fp, self.storage.weibos_f_name) except: pass return None pages = [i for i in xrange(2, num_pages + 1)] if len(pages) > 0: n_threads = 5 worker_manager = WorkerManager(n_threads) for pg in pages: worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages) worker_manager.wait_all_complete() is_None = worker_manager.get_result() worker_manager.stop() if is_None: #error occur msg = 'Error' logger.info(msg) write_message(msg, self.window) try: self.storage.delete(self.storage.weibos_fp, self.storage.weibos_f_name) except: pass return None cost_time = int(time.time() - start_time) msg = ('Crawl user(%s)\'s weibos: total page=%s,' ' cost time=%s sec, connections=%s' % (self.uid, num_pages, cost_time, self.fetcher.n_connections)) logger.info(msg) write_message(msg, self.window) return True def crawl_follows(self): def _crawl(parser, uid, page, num_pages='?'): msg = 'Crawl user(%s)\'s follows-page: %s:%s' % (self.uid, num_pages, page) write_message(msg, self.window) url = 'http://weibo.com/%s/follow?page=%s' % (uid, page) html = self._fetch(url, query=settings.QUERY_FOLLOWS) if html is None: return None try: pq_doc = pq(html) return parser.parse(pq_doc) except: return None msg = 'Checking: whether user(%s) exists or not...' % self.uid write_message(msg, self.window) is_exist = self.fetcher.check_user(self.uid) if is_exist is None: #error occur msg = 'Error' logger.info(msg) write_message(msg, self.window) return None if not is_exist: msg = 'Not exist: %s.' 
            % (self.uid)
            logger.info(msg)
            write_message(msg, self.window)
            return False

        self.storage = FileStorage(self.uid, settings.MASK_FOLLOW, self.store_path)

        start_time = time.time()

        parser = ComFollowsParser(self.storage)
        num_pages = _crawl(parser, self.uid, page=1)
        if num_pages is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            try:
                self.storage.delete(self.storage.follows_fp, self.storage.follows_f_name)
            except:
                pass
            return None

        if settings.PAGE_LIMIT != 0:
            if num_pages > settings.PAGE_LIMIT:
                msg = 'For sina policy, reduce page count from %s to %s' % (
                    num_pages, settings.PAGE_LIMIT)
                write_message(msg, self.window)
                num_pages = settings.PAGE_LIMIT

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5
            worker_manager = WorkerManager(n_threads)
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:  #error occur: _crawl return None
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)
                try:
                    self.storage.delete(self.storage.follows_fp, self.storage.follows_f_name)
                except:
                    pass
                return None

        cost_time = int(time.time() - start_time)
        msg = ('Crawl user(%s)\'s follows: total page=%s,'
               ' cost time=%s sec, connections=%s'
               % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
        return True

    def crawl_fans(self):
        def _crawl(parser, uid, page, num_pages='?'):
            msg = 'Crawl user(%s)\'s fans-page: %s:%s' % (self.uid, num_pages, page)
            write_message(msg, self.window)

            url = 'http://weibo.com/%s/fans?page=%s' % (uid, page)
            html = self._fetch(url, query=settings.QUERY_FANS)
            if html is None:
                return None
            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except:
                return None

        msg = 'Checking: whether user(%s) exists or not...' % self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        if is_exist is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            return None

        if not is_exist:
            msg = 'Not exist: %s.' % (self.uid)
            logger.info(msg)
            write_message(msg, self.window)
            return False

        self.storage = FileStorage(self.uid, settings.MASK_FAN, self.store_path)

        start_time = time.time()

        parser = ComFansParser(self.storage)
        num_pages = _crawl(parser, self.uid, page=1)
        if num_pages is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            try:
                self.storage.delete(self.storage.fans_fp, self.storage.fans_f_name)
            except:
                pass
            return None

        if settings.PAGE_LIMIT != 0:
            if num_pages > settings.PAGE_LIMIT:
                msg = 'For sina policy, reduce page count from %s to %s' % (
                    num_pages, settings.PAGE_LIMIT)
                write_message(msg, self.window)
                num_pages = settings.PAGE_LIMIT

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5
            worker_manager = WorkerManager(n_threads)
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:  #error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)
                try:
                    self.storage.delete(self.storage.fans_fp, self.storage.fans_f_name)
                except:
                    pass
                return None

        cost_time = int(time.time() - start_time)
        msg = ('Crawl user(%s)\'s fans: total page=%s,'
               ' cost time=%s sec, connections=%s'
               % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
        return True

    def crawl_infos(self):
        msg = 'Checking: whether user(%s) exists or not...' % self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        if is_exist is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            return None

        if not is_exist:
            msg = 'Not exist: %s.' % self.uid
            logger.info(msg)
            write_message(msg, self.window)
            return False

        msg = 'Crawl user(%s)\'s profile' % self.uid
        logger.info(msg)
        write_message(msg, self.window)

        self.storage = FileStorage(self.uid, settings.MASK_INFO, self.store_path)

        start_time = time.time()

        url = 'http://weibo.com/%s/info' % self.uid
        parser = ComInfosParser(self.uid, self.storage)
        html = self._fetch(url, query=settings.QUERY_INFO)

        cost_time = int(time.time() - start_time)
        msg = ('Crawl user(%s)\'s infos: cost time=%s sec, connections=%s'
               % (self.uid, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)

        if html is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            try:
                self.storage.delete(self.storage.infos_fp, self.storage.infos_f_name)
            except:
                pass
            return None

        try:
            pq_doc = pq(html)
            parser.parse(pq_doc)
            return True
        except:
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            try:
                self.storage.delete(self.storage.infos_fp, self.storage.infos_f_name)
            except:
                pass
            return None  #error occur

    def crawl_msg_reposts(self):
        def _crawl(parser, msg_id, page, num_pages='?'):
            msg = 'Crawl message(%s)\'s reposts-page:%s:%s' % (self.msg_id, num_pages, page)
            write_message(msg, self.window)

            html, num_pages = self._fetch_msg_repost(msg_id, page)
            if html is None:
                return None
            try:
                pq_doc = pq(html)
                parser.parse(pq_doc)
                return num_pages
            except:
                return None

        msg = 'Checking: whether message exists or not...'
        write_message(msg, self.window)
        msg_id = self.fetcher.check_message(self.msg_url)
        if msg_id is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            return None

        if msg_id is False:
            msg = 'Not exist: %s.' % self.msg_url
            logger.info(msg)
            write_message(msg, self.window)
            return False

        self.msg_id = msg_id
        self.storage = FileStorage(self.msg_id, settings.MASK_REPOST, self.store_path)

        start_time = time.time()

        parser = ComRepostsParser(msg_id, self.storage)
        num_pages = _crawl(parser, self.msg_id, 1)
        if num_pages is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            try:
                self.storage.delete(self.storage.reposts_fp, self.storage.reposts_f_name)
            except:
                pass
            return None

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5
            worker_manager = WorkerManager(n_threads)
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)
            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:  #error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)
                try:
                    self.storage.delete(self.storage.reposts_fp, self.storage.reposts_f_name)
                except:
                    pass
                return None

        cost_time = int(time.time() - start_time)
        msg = ('Crawl message(%s)\'s reposts: total page=%s,'
               ' cost time=%s sec, connections=%s'
               % (self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
        return True

    def crawl_msg_comments(self):
        def _crawl(parser, msg_id, page, num_pages='?'):
            msg = 'Crawl message(%s)\'s comments-page:%s:%s' % (
                msg_id, num_pages, page)
            write_message(msg, self.window)

            html, num_pages = self._fetch_msg_comment(msg_id, page)
            if html is None:
                return None
            try:
                pq_doc = pq(html)
                parser.parse(pq_doc)
                return num_pages
            except:
                return None

        msg = 'Checking: whether message exists or not...'
        write_message(msg, self.window)
        msg_id = self.fetcher.check_message(self.msg_url)
        if msg_id is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            return None

        if msg_id is False:
            msg = 'Not exist: %s.' % self.msg_url
            logger.info(msg)
            write_message(msg, self.window)
            return False

        self.msg_id = msg_id
        self.storage = FileStorage(self.msg_id, settings.MASK_COMMENT, self.store_path)

        start_time = time.time()

        parser = ComCommentsParser(msg_id, self.storage)
        num_pages = _crawl(parser, self.msg_id, 1)
        if num_pages is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            try:
                self.storage.delete(self.storage.comments_fp, self.storage.comments_f_name)
            except:
                pass
            return None

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5
            worker_manager = WorkerManager(n_threads)
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)
            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:  #error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)
                try:
                    self.storage.delete(self.storage.comments_fp, self.storage.comments_f_name)
                except:
                    pass
                return None

        cost_time = int(time.time() - start_time)
        msg = ('Crawl message(%s)\'s comments: total page=%s,'
               ' cost time=%s sec, connections=%s'
               % (self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
        return True
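# The crawl_* methods above share one pattern: verify the target exists, point
# self.storage at a FileStorage sink, crawl page 1 to learn the page count,
# fan pages 2..N out to a 5-thread WorkerManager, and delete the partial output
# file on any error (return values: None = error, False = target does not
# exist, True = success). Below is a hypothetical driver sketch: the class name
# ComWeiboCrawler, the Fetcher object, and the constructor signature (assumed
# to mirror CnWeiboCrawler(fetcher, store_path, uid) further down) are guesses
# based on the surrounding code, not part of the original snippet.
fetcher = Fetcher()   # assumed: logged-in fetcher exposing fetch(), check_user(), check_cookie()
crawler = ComWeiboCrawler(fetcher, store_path='./weibo_data', uid='1234567890')

result = crawler.crawl_follows()
if result is None:
    print('crawl failed; the partial follows file was deleted')
elif result is False:
    print('user does not exist')
else:
    crawler.crawl_fans()
    crawler.crawl_infos()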
def __init__(self, cities=default_cities, link=base_link):
    self.cities = cities
    self.link = link
    self.storage = MongoStorage('adv_links') if storage_type == 'mongo' else FileStorage('adv_links')
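# The constructor above picks its backend from a module-level storage_type
# flag. A small, hypothetical alternative sketch of the same idea with an
# explicit factory, so the backend can be selected (or injected) per call;
# make_storage is not part of the original snippet, while MongoStorage and
# FileStorage are the classes it already uses.
def make_storage(kind, collection='adv_links'):
    backends = {'mongo': MongoStorage, 'file': FileStorage}
    try:
        return backends[kind](collection)
    except KeyError:
        raise ValueError('unknown storage backend: %s' % kind)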
class CnWeiboCrawler(object):
    def __init__(self, fetcher, store_path, uid, window=None):
        self.fetcher = fetcher
        self.store_path = store_path
        self.uid = uid
        self.window = window

    def _check_page_right(self, html):
        if html is None:
            return False
        try:
            pq_doc = pq(html)
            title = pq_doc.find('title').text().strip()
            return title != u'微博广场' and title != u'新浪微博-新浪通行证'
        except AttributeError:
            return False

    def _fetch(self, url):
        html = self.fetcher.fetch(url)
        page_right = self._check_page_right(html)
        if page_right:
            return html

        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()
            sec = (tries + 1) * 10
            write_message('_fetch trying: %s, sleep: %s seconds' % (tries, sec),
                          self.window)
            time.sleep(sec)
            html = self.fetcher.fetch(url)
            page_right = self._check_page_right(html)
            if page_right:
                return html
            tries += 1
        return None

    def crawl_follows(self):
        def _crawl(parser, uid, page, num_pages='?'):
            msg = 'Crawl user(%s)\'s follows-page: %s:%s' % (self.uid, num_pages, page)
            write_message(msg, self.window)

            url = 'http://weibo.cn/%s/follow?page=%s' % (uid, page)
            html = self._fetch(url)
            if html is None:
                return None
            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except:
                return None

        msg = 'Checking: whether user(%s) exists or not...' % self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        if is_exist is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            return None

        if not is_exist:
            msg = 'Not exist: %s.' % (self.uid)
            logger.info(msg)
            write_message(msg, self.window)
            return False

        self.storage = FileStorage(self.uid, settings.MASK_FOLLOW, self.store_path)

        start_time = time.time()

        parser = CnFollowsParser(self.storage)
        num_pages = _crawl(parser, self.uid, page=1)
        if num_pages is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            try:
                self.storage.delete(self.storage.follows_fp, self.storage.follows_f_name)
            except:
                pass
            return None

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5
            worker_manager = WorkerManager(n_threads)
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:  #error occur: _crawl return None
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)
                try:
                    self.storage.delete(self.storage.follows_fp, self.storage.follows_f_name)
                except:
                    pass
                return None

        cost_time = int(time.time() - start_time)
        msg = ('Crawl user(%s)\'s follows: total page=%s,'
               ' cost time=%s sec, connections=%s'
               % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
        return True

    def crawl_fans(self):
        def _crawl(parser, uid, page, num_pages='?'):
            msg = 'Crawl user(%s)\'s fans-page: %s:%s' % (self.uid, num_pages, page)
            write_message(msg, self.window)

            url = 'http://weibo.cn/%s/fans?page=%s' % (uid, page)
            html = self._fetch(url)
            if html is None:
                return None
            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except:
                return None

        msg = 'Checking: whether user(%s) exists or not...' % self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        if is_exist is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            return None

        if not is_exist:
            msg = 'Not exist: %s.' % (self.uid)
            logger.info(msg)
            write_message(msg, self.window)
            return False

        self.storage = FileStorage(self.uid, settings.MASK_FAN, self.store_path)

        start_time = time.time()

        parser = CnFansParser(self.storage)
        num_pages = _crawl(parser, self.uid, page=1)
        if num_pages is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            try:
                self.storage.delete(self.storage.fans_fp, self.storage.fans_f_name)
            except:
                pass
            return None

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5
            worker_manager = WorkerManager(n_threads)
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:  #error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)
                try:
                    self.storage.delete(self.storage.fans_fp, self.storage.fans_f_name)
                except:
                    pass
                return None

        cost_time = int(time.time() - start_time)
        msg = ('Crawl user(%s)\'s fans: total page=%s,'
               ' cost time=%s sec, connections=%s'
               % (self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
        return True
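# Both crawler classes push pages 2..N through a WorkerManager(5) and then read
# a single aggregated flag from get_result(). WorkerManager itself is not part
# of these snippets; the sketch below is an assumption about the contract the
# crawlers appear to rely on (add_job, wait_all_complete, get_result returning
# True when any job came back None, stop), not the project's implementation.
import threading
import Queue   # Python 2 module name, matching the xrange/u'' style above


class WorkerManager(object):
    def __init__(self, n_threads):
        self._jobs = Queue.Queue()
        self._results = []
        for _ in range(n_threads):
            t = threading.Thread(target=self._work)
            t.daemon = True
            t.start()

    def _work(self):
        while True:
            func, args = self._jobs.get()
            try:
                result = func(*args)
            except Exception:
                result = None
            self._results.append(result)   # list.append is atomic in CPython
            self._jobs.task_done()

    def add_job(self, func, *args):
        self._jobs.put((func, args))

    def wait_all_complete(self):
        self._jobs.join()

    def get_result(self):
        # Mirrors the is_None checks above: True means at least one page failed.
        return any(r is None for r in self._results)

    def stop(self):
        pass   # worker threads are daemons and exit with the process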
from typing import Optional
from contextlib import closing
from hashlib import md5

from storage import FileStorage, ExistsError, NotFoundError
from db import Blob, Session
from exceptions import DbCorruptionError

storage_backend = FileStorage()


def withsession(fn):
    """Supply a Session to fn, creating (and closing) one when the caller did not pass one."""
    def inner(sess=None, *args, **kwargs):
        if sess is None:
            with closing(Session()) as sess:
                return fn(sess=sess, *args, **kwargs)
        else:
            return fn(sess=sess, *args, **kwargs)
    return inner


@withsession
def store(data: bytes, sess: Optional[Session] = None):
    if sess is not None:
        data_id = md5(data).hexdigest()
        existing = sess.query(Blob).get(data_id)
        if existing is not None:
            raise ExistsError
        blob = Blob(id=data_id)
        sess.add(blob)
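# Hedged usage sketch for the module above. It assumes Session behaves like a
# SQLAlchemy-style session (query/add/commit), which the sess.query(Blob).get()
# call suggests, and that committing is the caller's concern -- the snippet
# ends at sess.add(), so the explicit commit below is an assumption. Note that
# data is passed by keyword because the withsession wrapper reserves the first
# positional argument for the session.
if __name__ == '__main__':
    payload = b'hello blobs'

    try:
        store(data=payload)            # decorator opens and closes its own Session
    except ExistsError:
        print('blob already recorded:', md5(payload).hexdigest())

    with closing(Session()) as sess:   # or drive an explicit session yourself
        try:
            store(data=payload, sess=sess)
        except ExistsError:
            pass
        sess.commit()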