import fastavro
from io import BytesIO


def make_blocks(num_records=2000, codec='null'):
    # Write records to an in-memory Avro file, then read them back as blocks
    records = make_records(num_records)
    new_file = BytesIO()
    fastavro.writer(new_file, schema, records, codec=codec)
    new_file.seek(0)
    block_reader = fastavro.block_reader(new_file, schema)
    blocks = list(block_reader)
    new_file.close()
    return blocks, records
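# `make_records` and `schema` are test fixtures not shown in these excerpts;
# a minimal sketch of both (the field names are illustrative, not from the source):
schema = {
    "type": "record",
    "name": "Test",
    "fields": [
        {"name": "id", "type": "long"},
        {"name": "name", "type": "string"},
    ],
}


def make_records(num_records):
    # Simple records matching the sketched schema above
    return [{"id": i, "name": f"record-{i}"} for i in range(num_records)]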
import itertools
from multiprocessing import cpu_count

from fastavro import block_reader
from joblib import Parallel, delayed
from pandas import DataFrame


def avro_to_pandas(fname, reader_schema=None, num_cores=None):
    """
    Convert an Avro file to a pandas DataFrame using parallel processing.

    :param fname: path of the Avro file to be converted
    :param reader_schema: schema of the Avro file, if available (optional)
    :param num_cores: number of processors to use (optional); defaults to
        all available processors
    :return: pandas DataFrame
    """
    if num_cores is None:
        num_cores = cpu_count()
    with open(fname, 'rb') as fo:
        avro_reader = block_reader(fo, reader_schema)
        # Deserialize each block in a separate worker, then flatten the
        # per-block record lists into one list of rows
        results = Parallel(n_jobs=num_cores)(
            delayed(process_block)(i) for i in avro_reader)
    results = list(itertools.chain.from_iterable(results))
    return DataFrame(results)
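# `process_block` is referenced above but not shown; a plausible minimal
# version simply materializes one block's records so each worker returns a
# list (fastavro Block objects are iterable and yield deserialized records):
def process_block(block):
    return [record for record in block]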
from io import BytesIO

import fastavro

MemoryIO = BytesIO  # assumed to be the in-memory file alias used in fastavro's test suite


def make_blocks(num_records=2000, codec='null'):
    # Same round trip as above, written against the MemoryIO alias
    records = make_records(num_records)
    new_file = MemoryIO()
    fastavro.writer(new_file, schema, records, codec=codec)
    new_file.seek(0)
    block_reader = fastavro.block_reader(new_file, schema)
    blocks = list(block_reader)
    new_file.close()
    return blocks, records
# The SearchIndex excerpts below assume module-level imports of `logging`,
# `os`, `pickle`, and `from fastavro import block_reader`.
def __init__(self, datafile, datadir='', n_records=None, n_jobs=1):
    self.datadir = datadir
    self.datafile = datafile
    self.fo = open(os.path.join(self.datadir, self.datafile), "rb")
    self.reader = block_reader(self.fo)
    # Dictionary mapping categories to ordered lists of ids of all items in the category
    self.itemsByCategory = {}
    # Dictionary mapping item ids to their positions in the input file
    self.itemPositions = {}
    self.logger = logging.getLogger(
        'similar_item_service.search.SearchIndex')
    self._build_(n_records=n_records, n_jobs=n_jobs)
    # Track the current block and index in the datafile to avoid
    # unnecessary reseeking/iteration
    self.curBlock = None
    self.curIdx = None
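# `_build_` is not shown in the source. A hedged sketch of how the two indices
# could be populated, assuming records carry `id` and `category` fields
# (hypothetical names): block_reader() consumes the file header on
# construction, so fo.tell() then points at the first data block, and after
# each block is decoded it points at the next one.
def _build_(self, n_records=None, n_jobs=1):
    seen = 0
    block_start = self.fo.tell()  # offset of the first data block
    for block in self.reader:
        next_start = self.fo.tell()  # offset of the block after this one
        for idx, item in enumerate(block):
            self.itemPositions[item['id']] = (block_start, idx)
            self.itemsByCategory.setdefault(item['category'], []).append(item['id'])
            seen += 1
            if n_records is not None and seen >= n_records:
                return
        block_start = next_start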
from io import BytesIO as MemoryIO
from tempfile import NamedTemporaryFile

import fastavro


def make_blocks(num_records=2000, codec='null', write_to_disk=False):
    # Variant of the above that can optionally write to a real temp file
    records = make_records(num_records)
    new_file = NamedTemporaryFile() if write_to_disk else MemoryIO()
    fastavro.writer(new_file, schema, records, codec=codec)
    nbytes = new_file.tell()  # serialized size; renamed from `bytes`, which shadows the builtin
    new_file.seek(0)
    block_reader = fastavro.block_reader(new_file, schema)
    blocks = list(block_reader)
    new_file.close()
    return blocks, records, nbytes
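# Usage note: the codec argument is passed straight through to
# fastavro.writer, so any installed codec name works, e.g.:
blocks, records, nbytes = make_blocks(codec='deflate')
assert sum(block.num_records for block in blocks) == len(records)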
def _get_(self, blockStart, index):
    # Reseek and decode a block only when the cached one cannot serve the
    # request: a different block, or the iterator is already past `index`
    if (self.curBlock is None or self.curBlock[0] != blockStart
            or self.curIdx > index):
        self.fo.seek(blockStart)
        try:
            self.curBlock = (blockStart, next(self.reader))
        except StopIteration:
            # The reader was exhausted; rebuild it from the start of the
            # file, then seek back to the requested block
            self.fo.seek(0)
            self.reader = block_reader(self.fo)
            self.fo.seek(blockStart)
            self.curBlock = (blockStart, next(self.reader))
        self.curIdx = 0
    for item in self.curBlock[1]:
        self.curIdx += 1
        if self.curIdx > index:
            return item
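# Callers are expected to resolve an item id through itemPositions first,
# e.g. (assuming the (blockStart, index) layout sketched in _build_ above):
#     block_start, idx = index.itemPositions[item_id]
#     item = index._get_(block_start, idx)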
import os

import fastavro


def read_avro_blocks(path, logger=None):
    """
    Read the Avro file at `path` and yield its blocks.

    @param path: full path of the Avro file to read
    @param logger: optional logger used to report a missing file
    @return: avro blocks iterator
    """
    if not os.path.exists(path):
        if logger:
            logger.error(f"No file found: {path}")
        else:
            print(f"No file found: {path}")
        return  # without this, open() below would raise FileNotFoundError
    with open(path, "rb") as f:
        reader = fastavro.block_reader(f)
        for block in reader:
            yield block
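# Example usage: stream the blocks to count records without loading the
# whole file into memory ("example.avro" is an illustrative path):
total = 0
for block in read_avro_blocks("example.avro"):
    total += block.num_records
print(f"{total} records")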
@classmethod
def load(cls, fname):
    '''Load a pickled search index from file fname.'''
    # Bypass __init__: the pickled state supplies everything _build_
    # would otherwise have to recompute
    index = cls.__new__(cls)
    super(SearchIndex, index).__init__()
    index.logger = logging.getLogger(
        'similar_item_service.search.SearchIndex')
    with open(fname, 'rb') as f:
        (index.datadir, index.datafile, index.itemsByCategory,
         index.itemPositions) = pickle.load(f)
    index.logger.info(
        f"Loading search index for {os.path.join(index.datadir, index.datafile)} from {fname}"
    )
    # Reopen the data file: block offsets were persisted, not the data itself
    index.fo = open(os.path.join(index.datadir, index.datafile), "rb")
    index.reader = block_reader(index.fo)
    index.curBlock = None
    index.curIdx = None
    return index
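# load() implies a save() counterpart along these lines (not shown in the
# source; a hedged sketch that persists only the lightweight index state):
def save(self, fname):
    with open(fname, 'wb') as f:
        pickle.dump((self.datadir, self.datafile,
                     self.itemsByCategory, self.itemPositions), f)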