import multiprocessing as mp  # used by mp.Process, mp.Pool and mp.Manager below

# pyrdf, Uri, BaseRdfDataType, SharedManager, DictClass, RdfClassBase,
# get_element and pool_size are assumed to be imported or defined elsewhere
# in this module.


def convert_batch(data, output=None):
    """ converts a batch of query rows to RdfDatatype instances and places
        each converted row on the shared output queue

        args:
            data: a list of result rows (dicts of raw RDF values)
            output: a multiprocessing queue to receive the converted rows
    """
    for row in data:
        output.put([{key: pyrdf(value) for key, value in row.items()}])
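
# A minimal usage sketch, not part of the library API: the sample row below is
# a hypothetical SPARQL JSON-style binding dict, which pyrdf is assumed to
# accept.  convert_batch drains one chunk of rows onto a shared queue; the
# caller flattens the one-row lists back out.
def _example_convert_batch():
    import multiprocessing
    sample_rows = [
        {"s": {"type": "uri",
               "value": "http://example.org/item/1"},
         "p": {"type": "uri",
               "value": "http://www.w3.org/2000/01/rdf-schema#label"},
         "o": {"type": "literal", "value": "Item One"}},
    ]
    queue = multiprocessing.Manager().Queue()
    convert_batch(sample_rows, queue)
    converted = []
    while not queue.empty():
        converted += queue.get()  # each put() holds a single converted row
    return converted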
def _convert_results(self, data, **kwargs):
    """ converts the results of a query to RdfDatatype instances

        args:
            data: a list of triples

        kwargs:
            multiprocessing: if True, convert the values in a process pool
    """
    if kwargs.get("multiprocessing", False):
        manager = mp.Manager()
        output = manager.Queue()
        pool = mp.Pool(processes=pool_size)
        for i, row in enumerate(data):
            for key, val in row.items():
                pool.apply_async(convert_row_main,
                                 args=(val, i, key, output,))
        pool.close()
        pool.join()
        # rebuild each row from the (row index, key, value) tuples that
        # convert_row_main placed on the queue
        results = [{} for _ in data]
        while not output.empty():
            i, key, val = output.get()
            results[i][key] = val
        return results
    else:
        return [{key: pyrdf(value) for key, value in row.items()}
                for row in data]
def add_triple(self, sub, pred=None, obj=None, **kwargs):
    """ Adds a triple to the dataset

        args:
            sub: The subject of the triple or a dictionary containing a triple
            pred: Optional if supplied in sub, predicate of the triple
            obj:  Optional if supplied in sub, object of the triple

        kwargs:
            map: Optional, a dictionary mapping for a supplied dictionary
            strip_orphans: Optional, remove triples that have an orphan
                           blanknode as the object
            obj_method: if "list" then the object will be returned in the
                        form of a list
    """
    self.__set_map__(**kwargs)
    strip_orphans = kwargs.get("strip_orphans", False)
    obj_method = kwargs.get("obj_method")
    if isinstance(sub, (DictClass, dict)):
        pred = sub[self.pmap]
        obj = sub[self.omap]
        sub = sub[self.smap]

    pred = pyrdf(pred)
    obj = pyrdf(obj)
    sub = pyrdf(sub)

    # reference existing attr for bnodes and uris
    if obj.type in self.relate_obj_types:
        if strip_orphans and not self.get(obj):
            return
        obj = self.get(obj, obj)
    try:
        self[sub].add_property(pred, obj)
    except KeyError:
        self[sub] = RdfClassBase(sub, self, **kwargs)
        self[sub].add_property(pred, obj)
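
# A minimal calling sketch, not executed at import time: `ds` stands in for an
# instance of the enclosing dataset class, and the row below is a hypothetical
# SPARQL JSON-style binding dict.  It assumes the default map resolves to the
# keys 's', 'p' and 'o'.
def _example_add_triple(ds):
    row = {"s": {"type": "uri",
                 "value": "http://example.org/item/1"},
           "p": {"type": "uri",
                 "value": "http://www.w3.org/2000/01/rdf-schema#label"},
           "o": {"type": "literal", "value": "Item One"}}
    # dict form: one result row supplies subject, predicate and object
    ds.add_triple(row)
    # strip_orphans skips the triple when the object is an unreferenced
    # blank node
    ds.add_triple(row, strip_orphans=True)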
def convert_row_main(val, i, key, output):
    """ converts a single value to an RdfDatatype instance and places it on
        the shared output queue as an (index, key, value) tuple

        args:
            val: the raw value to convert
            i: the index of the row the value belongs to
            key: the key of the value within its row
            output: a multiprocessing queue to receive the converted tuple
    """
    output.put((i, key, pyrdf(val),))
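
# A small round-trip sketch with hypothetical data: because convert_row_main
# tags each converted value with its row index and key, the caller can rebuild
# the rows no matter which worker finishes first.  A plain queue.Queue stands
# in for the multiprocessing queue used in production.
def _example_convert_row_main():
    import queue
    rows = [{"s": {"type": "uri", "value": "http://example.org/item/1"}},
            {"s": {"type": "uri", "value": "http://example.org/item/2"}}]
    out = queue.Queue()
    for i, row in enumerate(rows):
        for key, val in row.items():
            convert_row_main(val, i, key, out)
    rebuilt = [{} for _ in rows]
    while not out.empty():
        i, key, val = out.get()
        rebuilt[i][key] = val
    return rebuilt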
def convert_results(data, **kwargs):
    """ converts the results of a query to RdfDatatype instances

        args:
            data: a list of triples

        kwargs:
            multiprocessing: if True, split the rows across worker processes
    """
    if kwargs.get("multiprocessing", False):
        manager = SharedManager()
        manager.register("BaseRdfDataType", BaseRdfDataType)
        manager.register("Uri", Uri)
        data_l = len(data)
        group_size = data_l // pool_size
        if data_l % pool_size:
            group_size += 1
        split_data = [data[i:i + group_size]
                      for i in range(0, data_l, group_size)]
        output = manager.Queue()
        workers = [mp.Process(target=convert_batch, args=(item, output,))
                   for item in split_data]
        for worker in workers:
            worker.start()
        # drain the queue while the workers are running so it never fills up
        # and blocks the worker processes
        results = []
        while True:
            running = any(p.is_alive() for p in workers)
            while not output.empty():
                results += output.get()
            if not running:
                break
        for worker in workers:
            worker.join()
        # pick up anything placed on the queue after the final drain
        for _ in range(output.qsize()):
            results += output.get()
        return results
    else:
        return [{key: pyrdf(value) for key, value in row.items()}
                for row in data]
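
# A minimal invocation sketch with hypothetical data: with pool_size = 4
# (an assumed value) and ten rows, group_size works out to 10 // 4 + 1 = 3,
# so the rows are split into chunks of 3, 3, 3 and 1, one per worker process.
def _example_convert_results():
    rows = [{"s": {"type": "uri",
                   "value": "http://example.org/item/%s" % i},
             "o": {"type": "literal", "value": "Item %s" % i}}
            for i in range(10)]
    in_process = convert_results(rows)                       # single process
    parallel = convert_results(rows, multiprocessing=True)   # worker processes
    return in_process, parallel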
def get_json_qry_item(dataset, param, no_key=False):
    """ reads the parameter and returns the selected element

        args:
            dataset: the dataset to search
            param: the parameter to search by
            no_key: whether to use the 'param' 'element' to filter the list.
                    This is passed True after the first run during a
                    recursive call when the key has already been used to
                    select a subset of the dataset
    """

    def get_dataset_vals(ds, key, filter_tup=tuple()):

        def reduce_list(value):
            if isinstance(value, list):
                if len(value) == 1:
                    return value[0]
            return value

        def merge_list(value):
            if isinstance(value, list):
                rtn_list = []
                for item in value:
                    if isinstance(item, list):
                        rtn_list += item
                    else:
                        rtn_list.append(item)
                try:
                    return list(set(rtn_list))
                except TypeError:
                    return rtn_list
            return value

        def test_elem(elem, filter_tup):
            search_lst = elem
            if isinstance(elem, dict):
                search_lst = elem.get(filter_tup[0], [])
            if filter_tup[2] == '=':
                try:
                    if elem.subject == filter_tup[1]:
                        return True
                except AttributeError:
                    pass
                test_lst = [item for item in search_lst
                            if (isinstance(item, dict)
                                and item.subject == filter_tup[1])
                            or item == filter_tup[1]]
                if test_lst:
                    return True
                return False

        def filter_list(ds, key, filter_tup):
            rtn_list = ds
            if key:
                rtn_list = merge_list([reduce_list(reduce_list(elem)[key])
                                       for elem in ds
                                       if isinstance(reduce_list(elem), dict)
                                       and reduce_list(elem).get(key)])
            if filter_tup:
                return [elem for elem in rtn_list
                        if test_elem(elem, filter_tup)]
            return rtn_list

        if isinstance(ds, list):
            return filter_list(ds, key, filter_tup)
        elif isinstance(ds, dict):
            search_dict = ds
            if key:
                search_dict = ds.get(key, [])
            if filter_tup:
                datalist = []
                for elem in search_dict:
                    if filter_tup[2] == "=":
                        if filter_tup[1] in elem.get(filter_tup[0], []):
                            if isinstance(elem, list):
                                datalist += elem
                            else:
                                datalist.append(elem)
                    elif filter_tup[2] == "!=":
                        if filter_tup[1] not in elem.get(filter_tup[0], []):
                            datalist.append(elem)
                return datalist
            return merge_list(search_dict)

    if param == "*":
        return dataset
    try:
        if param.startswith("="):
            # if the dataset length is '0' consider it a false match
            if dataset:
                return [pyrdf(param[1:])]
            return []
    except AttributeError:
        pass
    if hasattr(param, 'parsed_tree'):
        param = param.parsed_tree
    if hasattr(param, 'selector'):
        if no_key:
            key = None
        else:
            key = get_element(param.selector)
        rtn_obj = None
        if hasattr(param, 'ident'):
            if key:
                rtn_obj = get_dataset_vals(dataset,
                                           key,
                                           ('rdf_type', param.ident, "="))
            elif param.ident in dataset.get('rdf_type', []):
                rtn_obj = dataset
            else:
                rtn_obj = [value for value in dataset.values()
                           if param.ident in value.get('rdf_type', [])]
        elif hasattr(param, 'attrib'):
            rtn_obj = get_dataset_vals(dataset,
                                       key,
                                       (param.attrib,
                                        param.value,
                                        param.operator))
        if rtn_obj is not None:
            if hasattr(param, 'selector') \
                    and hasattr(param.selector, 'selector') \
                    and rtn_obj:
                rtn_obj = get_json_qry_item(rtn_obj, param.selector, True)
            return rtn_obj
        if key:
            return dataset[key]
        else:
            return dataset
    elif hasattr(param, 'element'):
        key = param.element
        return get_dataset_vals(dataset, key)
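
# A minimal sketch of the simplest query forms, with a hypothetical dataset;
# the real callers pass parsed selector objects from the json query parser,
# and pyrdf is assumed to accept a bare literal string.
def _example_get_json_qry_item():
    from types import SimpleNamespace
    dataset = {"rdf_type": ["bf_Work"],
               "bf_title": ["Example Title"]}
    everything = get_json_qry_item(dataset, "*")             # '*' returns the dataset
    literal = get_json_qry_item(dataset, "=Example Title")   # '=' wraps a literal
    # an object exposing only an 'element' attribute selects values by key
    titles = get_json_qry_item(dataset, SimpleNamespace(element="bf_title"))
    return everything, literal, titles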