Example #1
def collect_customer_bg(cusid='', page=0, src='', des='', out_dir=vgvars.dir_path['out_dir'], output_name='customer_erp'):
	###### initialize objects
	src = APIStorage(vgvars.erp) if not src else src
	# des = VarStorage({'data': defaultdict(list)}) if not des else des
	# des = FileStorage({'fpath': '{}{}_{}'.format(out_dir, output_name)})
	apicollector = DataCollector(src, des)	

	customer_stmt_temp = vgvars.erp['customer']

	if cusid:
		customer_stmt_temp = customer_stmt_temp.format('id={}'.format(cusid))
		data = apicollector.fetch_data(customer_stmt_temp)['data']['currentItems']
		des = FileStorage({'fpath': '{}{}_cusid_{}'.format(out_dir, output_name, cusid)})
		apicollector.des = des
		apicollector.insert_data({
			'selected_format': 'json',
			'values': data
		})
	else:
		customer_stmt_temp = customer_stmt_temp.format('page={}')
		if not page:
			page_num = apicollector.fetch_data(customer_stmt_temp.format(1))['data']['totalPage']
		else:
			page_num = page  # a non-zero `page` argument is taken as the number of pages to collect

		# page_num = 2 # Testing

		for i in range(1, page_num + 1):
			data = apicollector.fetch_data(customer_stmt_temp.format(i))['data']['currentItems']
			des = FileStorage({'fpath': '{}{}_{}'.format(out_dir, output_name, i)})			
			apicollector.des = des
			apicollector.insert_data({
				'selected_format': 'json',
				'values': data
			})
Example #2
def create(config: StorageConfiguration) -> Storage:
    storage = Storage(
        JsonFormattedHostStorage(FileStorage(config.hosts_filename)),
        JsonFormattedUserAccountStorage(
            FileStorage(config.user_accounts_filename)),
        JsonFormattedUnixAccountStorage(
            FileStorage(config.unix_accounts_filename)),
        UserAccountActivationInMemoryStorage())
    return storage
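A minimal sketch of wiring this factory, assuming StorageConfiguration is a plain attribute container exposing the three filename fields read above (the dataclass stand-in and the file names are hypothetical):

# Hypothetical stand-in; the project's real StorageConfiguration may differ.
from dataclasses import dataclass

@dataclass
class StorageConfiguration:
    hosts_filename: str
    user_accounts_filename: str
    unix_accounts_filename: str

config = StorageConfiguration(
    hosts_filename='hosts.json',
    user_accounts_filename='user_accounts.json',
    unix_accounts_filename='unix_accounts.json',
)
storage = create(config)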
Example #3
def collect_orders(start_date,
                   end_date,
                   eids=[],
                   src='',
                   des='',
                   out_dir=vgvars.dir_path['out_dir'],
                   output_name='order_data',
                   gen_stmt=get_collect_order_stmt):
    ###### prepare
    has_des = bool(des)  # remember whether a destination storage was supplied

    ###### initialize objects
    src = DBStorage(vgvars.vgdb) if not src else src
    db_collector = VGDBCollector(src)

    ###### collecting
    # make sure that data is collected within a selected period of time like a year, a month, or a week
    date_list = convert_datetime.divide_dates(start_date,
                                              end_date)  # in months

    for i in date_list:
        start_date, end_date = i

        # update destination storage
        # des = GSStorage(des)
        des = des if has_des else FileStorage({
            'fpath':
            '{}{}_{}_{}'.format(out_dir, output_name, start_date, end_date)
        })
        db_collector.des = des

        # collect data
        collect(start_date, end_date, db_collector, eids, gen_stmt=gen_stmt)

    return db_collector.des.data
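A hypothetical invocation, assuming vgvars supplies the defaults shown above and that convert_datetime.divide_dates splits the range into month-sized chunks (the dates and estore ids are illustrative):

# One output file is written per month-sized chunk of the date range.
data = collect_orders('2020-01-01', '2020-03-31', eids=[101, 102])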
Example #4
def setup_function(function):
    global storage, flower_entry

    # truncate the storage file so every test starts from an empty store
    with open(STORAGE_FILENAME, 'w'):
        pass
    storage = FileStorage(STORAGE_FILENAME)
    flower_entry = FlowerEntry(1, "tree", 1, 1)
Example #5
 def open(self):
     self.synclock.acquire()
     try:
         self.storage = FileStorage(self.path, self.name, self.extension)
         self.storage.open()
     finally:
         self.synclock.release()
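Assuming synclock is a standard threading.Lock (or any lock supporting the context-manager protocol), the acquire/try/finally pair above can be written more compactly; a sketch under that assumption:

 def open(self):
     # equivalent form: the with-block releases the lock even if open() raises
     with self.synclock:
         self.storage = FileStorage(self.path, self.name, self.extension)
         self.storage.open()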
Example #6
    def crawl_follows(self):
        def _crawl(parser, uid, page, num_pages=''):
            msg = 'Crawl user(%s)\'s follows-page: %s:%s' %(self.uid, num_pages, page)
            write_message(msg, self.window)
        
            url  = 'http://weibo.com/%s/follow?page=%s' %(uid, page)
            html = self._fetch(url, query=settings.QUERY_FOLLOWS)
            
            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except Exception:
                return 0
        
        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        
        if is_exist is None:
            return
        
        if not is_exist:
            msg = 'Not exist: %s.' %(self.uid)
            logger.info(msg)
            write_message(msg, self.window)
            
            return

        self.storage = FileStorage(self.uid, settings.MASK_FOLLOW, self.store_path)
        
        start_time = time.time()
        
        parser = ComFollowsParser(self.storage, uids_storage=self.uids_storage)
        
        num_pages = _crawl(parser, self.uid, page=1)
        if settings.PAGE_LIMIT != 0:
            if num_pages > settings.PAGE_LIMIT:
                msg = 'For sina policy, reduce page count from %s to %s' %(num_pages, settings.PAGE_LIMIT)
                write_message(msg, self.window)
        
                num_pages = settings.PAGE_LIMIT
        
        pages = [i for i in xrange(2, num_pages+1)]
        if len(pages) > 0:
            n_threads = 5
            
            worker_manager = WorkerManager(n_threads)
            
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
                
            worker_manager.wait_all_complete()

        cost_time = int(time.time() - start_time)
        
        msg = ('Crawl user(%s)\'s follows: total page=%s,'
               ' cost time=%s sec, connections=%s' 
               %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
Example #7
def collect_estore_bg(eids=[],
                      eloginnames=[],
                      get_eids=False,
                      get_eids_args=[],
                      get_eids_function=get_ordered_estore_id,
                      src='',
                      des='',
                      out_dir=vgvars.dir_path['out_dir'],
                      output_name='e_bg',
                      gen_stmt=get_collect_estore_bg_stmt,
                      max_query=vgvars.max_vgdb_query_num):

    ###### prepare
    has_des = False if not des else True

    ###### initialize objects
    src = DBStorage(vgvars.vgdb) if not src else src
    db_collector = VGDBCollector(src)

    # by default, collect estore bg of estore that has order in a given period of time
    if get_eids:
        # copy first so the shared default list in the signature is never mutated
        args = list(get_eids_args) + [db_collector]
        eids = get_eids_function(*args)

    if eloginnames or eids:
        if eloginnames:
            qnum = math.ceil(len(eloginnames) / max_query)
        else:
            qnum = math.ceil(len(eids) / max_query)

        for i in range(qnum):

            start_index = i * max_query
            end_index = start_index + max_query

            selectedloginnames = eloginnames[start_index:end_index]
            selectedeids = eids[start_index:end_index]

            des = des if has_des else FileStorage({
                'fpath':
                '{}{}_{}_{}'.format(out_dir, output_name, start_index,
                                    end_index - 1)
            })
            db_collector.des = des

            if selectedeids or selectedloginnames:
                logging.debug('collect estore bg from {} to {}'.format(
                    start_index, end_index - 1))
                stmt = get_collect_estore_bg_stmt(
                    estore_loginnames=selectedloginnames,
                    estore_ids=selectedeids,
                )
                db_collector.fetch_data(stmt)
                db_collector.insert_data()

    else:  # later. collect bg when no eids and eloginnames provided
        pass
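A hypothetical call under the defaults above (the ids and login names are illustrative); one output file is written per batch of at most max_query estores:

collect_estore_bg(eids=[11, 12, 13])
collect_estore_bg(eloginnames=['storeA', 'storeB'])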
Example #8
 def __init__(self):
     self.parser = AdvertisementParser()
     self.storage = MongoStorage(
         'adv_data') if storage_type == 'mongo' else FileStorage('adv_data')
     if isinstance(self.storage, MongoStorage):
         self.links = self.storage.load('adv_links', {'flag': False})
     else:
         self.links = self.storage.load('lnk')
     self.queue = self.create_queue()
Example #9
File: blockchain.py Project: ivpal/pybl
    def __init__(self):
        self.db = FileStorage('./pybl.db')
        self.current_hash = None

        if self.db.empty():
            genesis = Block.genesis_block()
            self.db.put_block(genesis)
            self.tip = genesis.hash
        else:
            self.tip = self.db.get_last_hash()
Example #10
def test_storage_delete_entry_by_name_false():
    # Given
    another_storage = FileStorage(STORAGE_FILENAME)

    # When
    deleted = storage.delete_entry_by_name(flower_entry)
    
    # Then
    assert not deleted
    assert_is_entry_in_storage(storage, flower_entry, amount=0, exists=False)
    assert_is_entry_in_storage(another_storage, flower_entry, amount=0, exists=False)
Example #11
    def crawl_weibos(self):
        def _crawl(parser, uid, page, num_pages=''):
            msg = 'Crawl user(%s)\'s weibos-page: %s:%s' %(self.uid, num_pages, page)
            write_message(msg, self.window)
        
            html = self._fetch_weibo(uid, page)
            
            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except Exception:
                return 0
            
        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)
        
        is_exist = self.fetcher.check_user(self.uid)
        
        if is_exist is None:
            return
        
        if not is_exist:
            msg = 'Not exist: %s.' %self.uid
            logger.info(msg)
            write_message(msg, self.window)
            
            return
        
        self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)
        
        start_time = time.time()
        
        parser = ComWeibosParser(self.uid, self.storage, weibos_storage=self.weibos_storage)
        
        num_pages = _crawl(parser, self.uid, page=1)

        pages = [i for i in xrange(2, num_pages+1)]
        """
        if len(pages) > 0:
            n_threads = 5
            
            worker_manager = WorkerManager(n_threads)
            
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
            
            worker_manager.wait_all_complete()
        """
        cost_time = int(time.time() - start_time)
        msg = ('Crawl user(%s)\'s weibos: total page=%s,'
               ' cost time=%s sec, connections=%s' 
               %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
Example #12
def test_storage_add_entry():
    # Given
    

    # When
    storage.add_entry(flower_entry)
    
    another_storage = FileStorage(STORAGE_FILENAME)

    # Then
    assert_is_entry_in_storage(storage, flower_entry, amount=1, exists=True)
    assert_is_entry_in_storage(another_storage, flower_entry, amount=1, exists=True)
Example #13
 def crawl_msg_reposts(self):
     def _crawl(parser, msg_id, page, num_pages=''):
         msg = 'Crawl message(%s)\'s reposts-page:%s:%s' %(self.msg_id, num_pages, page)
         write_message(msg, self.window)
     
         html, num_pages = self._fetch_msg_repost(msg_id, page)
         
         try:
             pq_doc = pq(html)
             parser.parse(pq_doc)
         except Exception:
             pass
         
         return num_pages
     
     msg = 'Checking: whether message exists or not...'
     write_message(msg, self.window)
     msg_id = self.fetcher.check_message(self.msg_url)
     
     if msg_id is None:
         msg = 'Not exist: %s.' %self.msg_url            
         logger.info(msg)
         write_message(msg, self.window)
         
         return
       
     self.msg_id = msg_id
     self.storage = FileStorage(self.msg_id, settings.MASK_REPOST, self.store_path)
     
     start_time = time.time()
     
     parser = ComRepostsParser(msg_id, self.storage)
     num_pages = _crawl(parser, self.msg_id, 1)
     pages = [i for i in xrange(2, num_pages+1)]
     if len(pages) > 0:
         n_threads = 5
         
         worker_manager = WorkerManager(n_threads)
         
         for pg in pages:
             worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)
         
         worker_manager.wait_all_complete()
         
     cost_time = int(time.time() - start_time)
     
     msg = ('Crawl message(%s)\'s reposts: total page=%s,'
            ' cost time=%s sec, connections=%s' 
            %(self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
     logger.info(msg)
     write_message(msg, self.window) 
Example #14
def collect_estore_contracts_erp(src='',
                                 des='',
                                 contract_ids=[],
                                 from_date='',
                                 serviceName='',
                                 page=0,
                                 out_dir=vgvars.dir_path['out_dir'],
                                 output_name='e_erp_contract'):

    ###### initialize objects
    src = APIStorage(vgvars.erp) if not src else src
    des = VarStorage({'data': defaultdict(list)}) if not des else des
    apicollector = DataCollector(src, des)

    ####### perform
    contract_stmt_temp = vgvars.erp['contract']
    contract_dict = defaultdict(list)

    if not page:
        page_num = apicollector.fetch_data(contract_stmt_temp.format(
            1, '', ''))['data']['totalPage']
    else:
        page_num = page  # a non-zero `page` argument is taken as the number of pages to fetch

    # page_num = 2 # TESTING

    for n in range(1, page_num + 1):
        data = apicollector.fetch_data(contract_stmt_temp.format(
            n, '', ''))['data']['currentItems']
        if from_date:
            df = DataFrame(data)
            origin_dates = df['createdDateTime']
            df['createdDateTime'] = pd.to_datetime(df['createdDateTime'])
            selectedDf = df[df['createdDateTime'] >= from_date].copy()  # copy to avoid SettingWithCopyWarning
            selectedDf['createdDateTime'] = selectedDf['createdDateTime'].map(
                lambda x: x.strftime('%Y-%m-%d'))
            selectedDf = selectedDf.T
            selected_data = selectedDf.to_dict()

            group_contract_by_start_date(contract_dict, selected_data)
            if len(selected_data) < len(data): break

        else:
            group_contract_by_start_date(contract_dict, data)

    for m in contract_dict:
        apicollector.des = FileStorage(
            {'fpath': '{}{}_{}'.format(out_dir, output_name, m)})
        apicollector.insert_data({
            'selected_format': 'json',
            'values': contract_dict[m]
        })
Example #15
def process_file(file_url: str) -> Tuple[str, Tuple[str, ...]]:
    """Process file with download, cache and upgrade."""

    _, file_ext = os.path.splitext(file_url)
    folder_hash = md5(file_url.encode('utf-8')).hexdigest()

    path = f"/notebooks/{folder_hash}"
    original = f"original{file_ext}"
    converted = f"converted{file_ext}"

    # TODO: delete the folder completely if `force`
    if not os.path.exists(path):
        file_content = _download_file(file_url)

        os.mkdir(path)
        with open(f"{path}/{original}", "w") as original_file:
            original_file.write(file_content)

        try:
            output = _convert_file(f"{path}/{original}", f"{path}/{converted}")
        except ConvertionException as error:
            shutil.rmtree(path)
            raise error

        with open(f"{path}/output", "w") as summary_output:
            summary_output.write('\n'.join(output))

        shutil.copy('report.txt', f"{path}/report")

        # persist `report.txt` to GCS
        storage = FileStorage()
        storage.save_file('report.txt', folder_hash)

        # found a python file, need to encode separately
        if original.endswith('.py'):
            result_filenames = []
            for py_file in [original, converted]:
                result_filenames.append(_save_ipynb_from_py(path, py_file))

            assert len(result_filenames) == 2
            return path, tuple(result_filenames[:2])

    if original.endswith('.py'):
        return path, (original.replace('.py', '.ipynb'),
                      converted.replace('.py', '.ipynb'))

    return path, (original, converted)
Example #16
def collect_dept_bg(company_id=vgvars.erp_default['vghn'], src='', des='', out_dir=vgvars.dir_path['out_dir'], output_name='dept_erp'):

	###### initialize objects
	src = APIStorage(vgvars.erp) if not src else src
	# des = VarStorage({'data': defaultdict(list)}) if not des else des
	des = FileStorage({'fpath': '{}{}_{}'.format(out_dir, output_name, company_id)})
	apicollector = DataCollector(src, des)

	####### perform
	dept_stmt_temp = vgvars.erp['dept']

	data = apicollector.fetch_data(dept_stmt_temp.format(company_id))['data']

	apicollector.insert_data({
		'selected_format': 'json',
		'values': data
	})
Example #17
def main():
    args = docopt(__doc__)

    storage = FileStorage("entries.txt")
    bailer.init_storage(storage)

    if args.get('getall'):
        print(bailer.get_all_flowers())
    elif args.get('add'):
        print(
            bailer.add_flower(args.get('<flower-name>'),
                              args.get('<watering-interval>')))
    elif args.get('remove'):
        print(bailer.remove_flower(args.get('<flower-name>')))
    elif args.get('water'):
        if args.get('--force'):
            print(bailer.water_flower_force(args.get('<flower-name>')))
        else:
            print(bailer.water_flower(args.get('<flower-name>')))
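docopt builds the CLI from the module docstring; a hypothetical usage block consistent with the branches above (the program name and layout are assumptions):

"""Bailer.

Usage:
  bailer.py getall
  bailer.py add <flower-name> <watering-interval>
  bailer.py remove <flower-name>
  bailer.py water [--force] <flower-name>
"""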
Example #18
def collect_estore_spending(start_date,
                            end_date,
                            eids=[],
                            get_eids=False,
                            src='',
                            des='',
                            out_dir=vgvars.dir_path['out_dir'],
                            output_name='e_spending_data',
                            gen_stmt=get_collect_estore_spending_stmt):
    ###### prepare
    has_des = bool(des)  # remember whether a destination storage was supplied

    ###### initialize objects
    src = DBStorage(vgvars.vgdb) if not src else src
    db_collector = VGDBCollector(src)

    ###### collecting
    # make sure that data is collected within a selected period of time like a year, a month, or a week
    date_list = convert_datetime.divide_dates(start_date,
                                              end_date)  # in months

    for d in date_list:
        start_date, end_date = d

        # update destination storage
        # des = GSStorage(des)
        des = des if has_des else FileStorage({
            'fpath':
            '{}{}_{}_{}'.format(out_dir, output_name, start_date, end_date)
        })
        db_collector.des = des

        # not good enough, since data is checked against all estores having orders in a month
        if get_eids:
            min_ord_id, max_ord_id = db_collector.get_max_min_id_by_time(
                start_date, end_date)
            eids = collect_estore_ids(min_ord_id, max_ord_id, db_collector)

        # collect data
        collect(start_date, end_date, db_collector, eids, gen_stmt=gen_stmt)

    return db_collector.des.data
Example #19
    def check_new_weibos(self):
        def _crawl(parser, uid, page, num_pages=''):
            msg = 'check new weibo in user(%s)\'s weibos-page: %s:%s' %(self.uid, num_pages, page)
            write_message(msg, self.window)

            html = self._fetch_weibo(uid, page)

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except Exception:
                return 0

        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)

        is_exist = self.fetcher.check_user(self.uid)

        if is_exist is None:
            return

        if not is_exist:
            msg = 'Not exist: %s.' %self.uid
            logger.info(msg)
            write_message(msg, self.window)

            return

        self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)

        start_time = time.time()

        parser = ComWeibosParser(self.uid, self.storage)

        num_pages = _crawl(parser, self.uid, page=1)

        cost_time = int(time.time() - start_time)
        msg = ('Crawl user(%s)\'s weibos: total page=%s,'
               ' cost time=%s sec, connections=%s'
               %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
Example #20
    def crawl_infos(self):
        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        
        if is_exist is None:
            return
        
        if not is_exist:
            msg = 'Not exist: %s.' %self.uid
            logger.info(msg)
            write_message(msg, self.window)
            return
        
        msg = 'Crawl user(%s)\'s profile' %self.uid
        logger.info(msg)
        write_message(msg, self.window)
        
        self.storage = FileStorage(self.uid, settings.MASK_INFO, self.store_path)
        
        start_time = time.time()

        url    = 'http://weibo.com/%s/info' % self.uid
        parser = ComInfosParser(self.uid, self.storage)
        
        html   = self._fetch(url, query=settings.QUERY_INFO)
        try:
            pq_doc = pq(html)
            parser.parse(pq_doc)
        except Exception:
            pass
    
        cost_time = int(time.time() - start_time)
        
        msg = ('Crawl user(%s)\'s infos: cost time=%s sec, connections=%s' 
               %(self.uid, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
Example #21
def collect_staff_bg_erp(compids=['315', '319', '305', '320'],
                         page=0,
                         src='',
                         des='',
                         out_dir=vgvars.dir_path['out_dir'],
                         output_name='staff_bg_erp'):
    ###### initialize objects
    src = APIStorage(vgvars.erp) if not src else src

    apicollector = DataCollector(src, '')

    ####### perform
    dept_stmt_temp = vgvars.erp['staffbg']

    if compids:
        for compid in compids:

            if not page:
                page_num = apicollector.fetch_data(
                    dept_stmt_temp.format(compid, 1, '',
                                          ''))['data']['totalPage']
            else:
                page_num = page  # a non-zero `page` argument is taken as the number of pages to fetch

            # page_num = 2 # TESTING
            datalist = []
            for n in range(1, page_num + 1):
                data = apicollector.fetch_data(
                    dept_stmt_temp.format(compid, n, '',
                                          ''))['data']['currentItems']
                datalist.extend(data)

            des = FileStorage(
                {'fpath': '{}{}_{}'.format(out_dir, output_name, compid)})
            apicollector.des = des
            apicollector.insert_data({
                'selected_format': 'json',
                'values': datalist
            })
Example #22
 def __init__(self, cities=default_cities, link=base_link):
     self.cities = cities
     self.link = link
     self.storage = MongoStorage(
         'adv_links') if storage_type == 'mongo' else FileStorage(
             'adv_links')
Example #23
    def crawl_msg_comments(self):
        def _crawl(parser, msg_id, page, num_pages='?'):
            msg = 'Crawl message(%s)\'s comments-page:%s:%s' % (
                msg_id, num_pages, page)
            write_message(msg, self.window)

            html, num_pages = self._fetch_msg_comment(msg_id, page)

            if html is None:
                return None

            try:
                pq_doc = pq(html)
                parser.parse(pq_doc)

                return num_pages
            except Exception:
                return None

        msg = 'Checking: whether message exists or not...'
        write_message(msg, self.window)
        msg_id = self.fetcher.check_message(self.msg_url)

        if msg_id is None:  # error occurred
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            return None

        if msg_id is False:
            msg = 'Not exist: %s.' % self.msg_url
            logger.info(msg)
            write_message(msg, self.window)

            return False

        self.msg_id = msg_id
        self.storage = FileStorage(self.msg_id, settings.MASK_COMMENT,
                                   self.store_path)

        start_time = time.time()

        parser = ComCommentsParser(msg_id, self.storage)
        num_pages = _crawl(parser, self.msg_id, 1)

        if num_pages is None:  # error occurred
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            try:
                self.storage.delete(self.storage.comments_fp,
                                    self.storage.comments_f_name)
            except Exception:
                pass

            return None

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5

            worker_manager = WorkerManager(n_threads)

            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.msg_id, pg,
                                       num_pages)

            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:  # error occurred
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)

                try:
                    self.storage.delete(self.storage.comments_fp,
                                        self.storage.comments_f_name)
                except Exception:
                    pass

                return None

        cost_time = int(time.time() - start_time)

        msg = ('Crawl message(%s)\'s comments: total page=%s,'
               ' cost time=%s sec, connections=%s' %
               (self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)

        return True
Example #24
 def mock_storage(self):
     retriever = mock.create_autospec(PickleStorageRetriever)
     saver = mock.create_autospec(PickleStorageSaver)
     return FileStorage(retriever, saver)
Example #25
    def crawl_fans(self):
        def _crawl(parser, uid, page, num_pages='?'):
            msg = 'Crawl user(%s)\'s fans-page: %s:%s' % (self.uid, num_pages,
                                                          page)
            write_message(msg, self.window)

            url = 'http://weibo.cn/%s/fans?page=%s' % (uid, page)
            html = self._fetch(url)

            if html is None:
                return None

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except Exception:
                return None

        msg = 'Checking: whether user(%s) exists or not...' % self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)

        if is_exist is None:  # error occurred
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            return None

        if not is_exist:
            msg = 'Not exist: %s.' % (self.uid)
            logger.info(msg)
            write_message(msg, self.window)

            return False

        self.storage = FileStorage(self.uid, settings.MASK_FAN,
                                   self.store_path)

        start_time = time.time()

        parser = CnFansParser(self.storage)

        num_pages = _crawl(parser, self.uid, page=1)

        if num_pages is None:  # error occurred
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            try:
                self.storage.delete(self.storage.fans_fp,
                                    self.storage.fans_f_name)
            except Exception:
                pass

            return None

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5

            worker_manager = WorkerManager(n_threads)

            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)

            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:  # error occurred
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)

                try:
                    self.storage.delete(self.storage.fans_fp,
                                        self.storage.fans_f_name)
                except Exception:
                    pass

                return None

        cost_time = int(time.time() - start_time)

        msg = ('Crawl user(%s)\'s fans: total page=%s,'
               ' cost time=%s sec, connections=%s' %
               (self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)

        return True
Example #26
 def setUp(self):
     self.storage = FileStorage()
     self.storage._db_path = 'todo_test_database.csv'
Example #27
 def __set_storage():
     if STORAGE_TYPE == 'mongo':
         return MongoStorage()
     return FileStorage()
Example #28
from typing import Optional

from contextlib import closing
from hashlib import md5
from storage import FileStorage
from db import Blob, Session
from storage import ExistsError, NotFoundError
from exceptions import DbCorruptionError

storage_backend = FileStorage()

def withsession(fn):
    def inner(sess=None, *args, **kwargs):
        if sess is None:
            with closing(Session()) as sess:
                return fn(sess=sess, *args, **kwargs)
        # pass the session by keyword (fn's first positional is `data`) and propagate the result
        return fn(sess=sess, *args, **kwargs)
    return inner

@withsession
def store(data: bytes, sess: Optional[Session] = None):
    if sess is not None:
        data_id = md5(data).hexdigest()

        existing = sess.query(Blob).get(data_id)
        if existing is not None:
            raise ExistsError

        blob = Blob(id=data_id)
        sess.add(blob)
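With the decorator fixed as above, a hedged usage sketch (the payloads are illustrative): calling store without a session opens and closes one automatically, while passing sess= reuses an existing session across calls.

store(b'standalone blob')             # a throwaway Session is opened via closing(...)

with closing(Session()) as sess:      # or share one session across several stores
    store(b'first blob', sess=sess)
    store(b'second blob', sess=sess)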
Example #29
 def __init__(self, repoDir):
     if not os.path.exists(repoDir):
         raise InvalidRepo("Invalid Repo path")
     # creating required structure
     self.path = repoDir
     self.storage = FileStorage(os.path.join(repoDir, ".svcs"))
Example #30
    def __init__(self, folder=None, overwrite=False, encoding='utf-8',
                 debug=False, default_webentity_creation_rule=None,
                 webentity_creation_rules=None):

        # Handling encoding
        self.encoding = encoding

        # Debugging mode
        if debug:
            if not default_webentity_creation_rule:
                default_webentity_creation_rule = ''
            if not webentity_creation_rules:
                webentity_creation_rules = {}
        else:

            # Ensuring the creation rules are set
            if not isinstance(default_webentity_creation_rule, basestring):
                raise TraphException('Given default webentity creation rule is not a string!')

            if not isinstance(webentity_creation_rules, dict):
                raise TraphException('Given webentity creation rules is not a dict!')
                # TODO: check if each value is correctly a string

        # Files
        self.folder = folder
        self.lru_trie_file = None
        self.link_store_file = None
        self.lru_trie_path = None
        self.link_store_path = None

        create = overwrite
        self.in_memory = not bool(folder)

        # Solving paths
        if not self.in_memory:
            self.lru_trie_path = os.path.join(folder, 'lru_trie.dat')
            self.link_store_path = os.path.join(folder, 'link_store.dat')

            # Ensuring the given folder exists
            try:
                os.makedirs(folder)
            except OSError as exception:
                if exception.errno == errno.EEXIST and os.path.isdir(folder):
                    pass
                else:
                    raise

            # Testing existence of files
            lru_trie_file_exists = os.path.isfile(self.lru_trie_path)
            link_store_file_exists = os.path.isfile(self.link_store_path)

            # Checking consistency
            if lru_trie_file_exists and not link_store_file_exists:
                raise TraphException(
                    'File inconsistency: `lru_trie.dat` file exists but not `link_store.dat`.'
                )

            if not lru_trie_file_exists and link_store_file_exists:
                raise TraphException(
                    'File inconsistency: `link_store.dat` file exists but not `lru_trie.dat`.'
                )

            # Do we need to create the files for the first time?
            create = overwrite or (not lru_trie_file_exists and not link_store_file_exists)

            flags = 'wb+' if create else 'rb+'

            self.lru_trie_file = open(self.lru_trie_path, flags)
            self.link_store_file = open(self.link_store_path, flags)

            self.lru_trie_storage = FileStorage(
                LRU_TRIE_NODE_BLOCK_SIZE,
                self.lru_trie_file
            )

            self.links_store_storage = FileStorage(
                LINK_STORE_NODE_BLOCK_SIZE,
                self.link_store_file
            )

            # Checking for corruption
            if not create and self.lru_trie_storage.check_for_corruption():
                raise TraphException(
                    'File corrupted: `lru_trie.dat`'
                )

            if not create and self.links_store_storage.check_for_corruption():
                raise TraphException(
                    'File corrupted: `link_store.dat`'
                )

        else:
            self.lru_trie_storage = MemoryStorage(LRU_TRIE_NODE_BLOCK_SIZE)
            self.links_store_storage = MemoryStorage(LINK_STORE_NODE_BLOCK_SIZE)

        # LRU Trie initialization
        self.lru_trie = LRUTrie(self.lru_trie_storage, encoding=encoding)

        # Link Store initialization
        self.link_store = LinkStore(self.links_store_storage)

        # Webentity creation rules are stored in RAM
        if not debug:
            self.default_webentity_creation_rule = re.compile(
                default_webentity_creation_rule,
                re.I
            )

            self.webentity_creation_rules = {}

            for prefix, pattern in webentity_creation_rules.items():
                self.add_webentity_creation_rule(prefix, pattern, create)
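Judging from the branches above, omitting the folder selects the MemoryStorage path; a hypothetical instantiation (the enclosing class name Traph is inferred from TraphException):

# Hypothetical: debug mode skips the creation-rule validation, and with
# folder=None both the LRU trie and the link store live in MemoryStorage.
traph = Traph(folder=None, debug=True)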