import xml.etree.ElementTree as ET


class BaseFactory(object):
    def __init__(self):
        # Mapper is provided by the surrounding project.
        self.mapper = Mapper()

    def add_trait(self, *args):
        # Each trait is a callable that receives the factory and mutates it.
        for flavour in args:
            flavour(self)

    def __call__(self, data):
        if self.__doc__ is None:
            raise RuntimeError(
                "Factories not overriding __call__ must provide a docstring")
        # The docstring doubles as the XML element tag name.
        obj = ET.Element(self.__doc__)
        self.mapper.map(data).into(obj)
        return obj
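# Hypothetical usage sketch (not from the original source): the subclass's
# docstring supplies the XML tag, and traits are plain callables that receive
# the factory instance. Mapper and its map(...).into(...) API are assumed to
# come from the surrounding project, so this will not run standalone.
class PersonFactory(BaseFactory):
    """person"""  # element tag taken from the docstring


def logging_trait(factory):
    # A trait may reconfigure the factory it receives; here it only reports.
    print('trait applied to', factory)


factory = PersonFactory()
factory.add_trait(logging_trait)
element = factory({'name': 'Ada'})  # builds a <person> element via Mapper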
import time
import _thread
from xmlrpc.client import ServerProxy


class Worker:
    def __init__(self, fs, name, addr, opts):
        self.addr = addr
        self.jt_addr = opts["jt_addr"]
        self.jt = ServerProxy(self.jt_addr)
        self.hb_timeout = 0.2  # heartbeat interval in seconds
        self.on = True
        self.mapper = Mapper(opts, fs, "map" + name, addr)
        self.reducer = Reducer(fs, "reduce" + name, addr, opts, RPCMapperClient())

    def start(self):
        print('Init worker')
        print('Start sending heartbeats to', self.jt_addr)
        _thread.start_new_thread(self._heartbeat, ())
        print('Server is ready')

    def _heartbeat(self):
        # Periodically notify the job tracker that this worker is alive.
        while self.on:
            try:
                self.jt.heartbeat(self.addr)
            except Exception as e:
                print(e)
            time.sleep(self.hb_timeout)

    def map(self, task_id, rds_count, chunk_path, map_script, restart_task=False):
        """Map a chunk by applying the map function.

        task_id      - unique task id
        rds_count    - number of reducers for the task
        chunk_path   - DFS path to the chunk file to map
        map_script   - DFS path to the script with the map function
        restart_task - if True, restart the map task even if it has already
                       completed or is currently executing
        """
        return self.mapper.map(task_id, rds_count, chunk_path, map_script, restart_task)

    def get_status(self, task_id, chunk_path):
        """Get the execution status of the given map task."""
        return self.mapper.get_status(task_id, chunk_path)

    def read_mapped_data(self, task_id, region_number):
        """Read mapped data for a specific region.

        task_id       - unique task id
        region_number - integer region assigned to the requesting reducer

        Returns a dict {status: Status.ok, data: list of tuples}.
        If the file does not exist, status is Status.not_found; if the file
        is empty, status is Status.ok and the list is empty.
        """
        return self.mapper.read_mapped_data(task_id, region_number)

    def reduce(self, task_id, region, mappers, script_path):
        """Signal from the job tracker to start reducing.

        task_id     - unique task id
        region      - region this reducer is responsible for
        mappers     - mappers that hold data for the current task
        script_path - DFS path to the script with the reduce function
        """
        return self.reducer.reduce(task_id, region, mappers, script_path)
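# Minimal start-up sketch (assumptions: fs, Mapper, Reducer and
# RPCMapperClient are provided by the surrounding project, and a job tracker
# is reachable at jt_addr; the addresses and worker name are illustrative).
opts = {'jt_addr': 'http://localhost:9000'}
worker = Worker(fs, '0', 'http://localhost:9001', opts)
worker.start()  # spawns the heartbeat thread and reports readiness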
def run(self, config_json, to_file=True):
    self.create_dirs()
    results = None
    try:
        config = Config(config_json, self.database_config)
        source_sparql = SPARQL(config, 'source')
        target_sparql = SPARQL(config, 'target')
        info_logger = InfoLogger('InfoLogger', '{}_{}'.format(
            source_sparql.get_query_hash(), target_sparql.get_query_hash()))

        source_cache = Cache(info_logger, config, source_sparql, 'source')
        source_cache.create_cache()
        target_cache = Cache(info_logger, config, target_sparql, 'target')
        target_cache.create_cache()

        mapper = Mapper(info_logger, config, source_sparql, target_sparql)
        results = mapper.map(to_file)
    except ConfigNotValidError as e:
        results = "Config not valid"
        print(e)
    except HTTPError as e:
        print(e)
    except JSONDecodeError as e:
        print(e)
    return results
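# Hypothetical invocation sketch: the object exposing run() and the config
# file path are assumptions, and whether run() expects raw JSON text or a
# path is project-specific; raw text is assumed here.
with open('configs/example.json') as f:
    config_json = f.read()

results = pipeline.run(config_json, to_file=False)
print(results)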
def job(grouperNum, chunksQueue, listSaveStateNameGrouper, listListLastCallNum):
    print('Starting worker ' + str(grouperNum))
    while True:
        # Get a new chunk to process
        chunk = chunksQueue.get()

        # Map the chunk
        print('Worker ' + str(grouperNum) + ' mapping chunk ' + str(chunk))
        # Iterator over the records of the chunk
        MapIterator = MapChunkIterator(mapChunksNameGenerator(chunk))
        theContext = MapContext(groupChunksNameGenerator(chunk), MapIterator)
        Mapper.map(theContext)

        # Group locally
        print('Worker ' + str(grouperNum) + ' grouping locally chunk ' + str(chunk))
        idx = listListLastCallNum[grouperNum] + 1
        theGrouper = Grouper(grouperNum, idx, idx - 1, directory)
        listSaveStateNameGrouper[grouperNum] = theGrouper.group(theContext)
        listListLastCallNum[grouperNum] = idx

        # "Close" the chunk
        chunksQueue.task_done()
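# Sketch of how job() might be wired up as a pool of queue-driven workers
# (assumptions: worker count and chunk ids are illustrative, and job()'s
# other dependencies such as MapChunkIterator and directory are defined in
# the surrounding module).
import threading
from queue import Queue

numWorkers = 4
chunksQueue = Queue()
listSaveStateNameGrouper = [None] * numWorkers
listListLastCallNum = [0] * numWorkers

for grouperNum in range(numWorkers):
    t = threading.Thread(
        target=job,
        args=(grouperNum, chunksQueue, listSaveStateNameGrouper, listListLastCallNum),
        daemon=True)
    t.start()

for chunk in range(16):
    chunksQueue.put(chunk)
chunksQueue.join()  # blocks until every chunk has been task_done()'d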
import unittest


class TestMapperMethod(unittest.TestCase):
    def setUp(self):
        self.mapper = Mapper("data/refFlatMm10.txt")

    def test_normal_case(self):
        cds_pos, aa_pos = self.mapper.map(101153495, "NM_146145")
        self.assertEqual(cds_pos, 3462)
        self.assertEqual(aa_pos, 1154)

    def test_coord_in_intron(self):
        # A coordinate inside an intron maps to no CDS or amino-acid position.
        cds_pos, aa_pos = self.mapper.map(101153494, "NM_146145")
        self.assertIsNone(cds_pos)
        self.assertIsNone(aa_pos)

    def test_refseq_id_not_in_file(self):
        cds_pos, aa_pos = self.mapper.map(101153495, "NM_899287")
        self.assertIsNone(cds_pos)
        self.assertIsNone(aa_pos)
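# Standard unittest entry point so the tests above can run as a script.
if __name__ == '__main__':
    unittest.main()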
def run(self, Mapper, Reducer, data):
    # Map
    mapper = Mapper()
    tuples = mapper.map(data)

    # Combine: group values by key
    combined = {}
    for k, v in tuples:
        if k not in combined:
            combined[k] = []
        combined[k].append(v)

    # Reduce
    reducer = Reducer()
    output = reducer.reduce(combined)
    for line in output:
        print(line)
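# Hypothetical word-count pair compatible with run() above; the driver object
# exposing run() is assumed to exist in the surrounding project.
class WordCountMapper:
    def map(self, data):
        # Emit (word, 1) for every whitespace-separated token.
        return [(word, 1) for word in data.split()]


class WordCountReducer:
    def reduce(self, combined):
        # Sum the counts per key and render one output line per word.
        return ['{} {}'.format(k, sum(vs)) for k, vs in combined.items()]


# driver.run(WordCountMapper, WordCountReducer, 'the cat sat on the mat')
# would print lines like 'the 2' and 'cat 1'.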