def __init__(self, fs, name, addr, opts):
    self.addr = addr
    self.jt_addr = opts["jt_addr"]
    self.jt = ServerProxy(self.jt_addr)
    self.hb_timeout = 0.2  # heartbeat timeout in seconds
    self.on = True
    self.mapper = Mapper(opts, fs, "map" + name, addr)
    self.reducer = Reducer(fs, "reduce" + name, addr, opts, RPCMapperClient())

def createReducer(self):
    reducer = Reducer(5003 + (self.num - 1) * 10, self.ip)
    reducer.logging(False)
    reducer.log('Starting Up')
    # execution code goes here
    reducer.listen()
    # exiting
    reducer.log('Exiting')

def main():
    initialize_log()
    logging.info("Starting reducer.")
    config_params = parse_config_params()
    reducer = Reducer(config_params['aggregated_data_queue'],
                      config_params['sink_queue'],
                      config_params['aggregators_quantity'],
                      config_params['unflatten_key'],
                      config_params['unflatten_value_key'])
    reducer.start()

def _parse(toklist):
    """ Parse a token list as a query """
    # Parse with the nonterminal 'QueryRoot' as the grammar root
    with Fast_Parser(verbose=False, root=_QUERY_ROOT) as bp:
        sent_begin = 0
        num_sent = 0
        num_parsed_sent = 0
        rdc = Reducer(bp.grammar)
        trees = dict()
        sent = []
        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if not slen:
                    continue
                num_sent += 1
                # Parse the accumulated sentence
                num = 0
                try:
                    # Parse the sentence
                    forest = bp.go(sent)
                    if forest is not None:
                        num = Fast_Parser.num_combinations(forest)
                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                except ParseError as e:
                    forest = None
                if num > 0:
                    num_parsed_sent += 1
                    # Obtain a text representation of the parse tree
                    trees[num_sent] = ParseForestDumper.dump_forest(forest)
                    # ParseForestPrinter.print_forest(forest)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)
    result = dict(num_sent=num_sent, num_parsed_sent=num_parsed_sent)
    return result, trees

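# The same parse-then-reduce step recurs in several of the parser snippets in this
# collection. Below is a distilled sketch of that step, using only calls that appear
# in those snippets (bp.go, Fast_Parser.num_combinations, rdc.go); parse_and_reduce
# is a hypothetical helper name, and Fast_Parser/Reducer/ParseError are assumed to be
# imported from the same modules as in the surrounding code.
def parse_and_reduce(bp, rdc, sent):
    """ Return (forest, num_combinations) for one accumulated sentence """
    try:
        forest = bp.go(sent)                  # parse the sentence
    except ParseError:
        return None, 0                        # unparseable sentence
    if forest is None:
        return None, 0
    num = Fast_Parser.num_combinations(forest)
    if num > 1:
        forest = rdc.go(forest)               # reduce the forest to the best tree
    return forest, num
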
class Worker:
    def __init__(self, fs, name, addr, opts):
        self.addr = addr
        self.jt_addr = opts["jt_addr"]
        self.jt = ServerProxy(self.jt_addr)
        self.hb_timeout = 0.2  # heartbeat timeout in seconds
        self.on = True
        self.mapper = Mapper(opts, fs, "map" + name, addr)
        self.reducer = Reducer(fs, "reduce" + name, addr, opts, RPCMapperClient())

    def start(self):
        print('Init worker')
        print('Start sending heartbeats to', self.jt_addr)
        _thread.start_new_thread(self._heartbeat, ())
        print('Server is ready')

    def _heartbeat(self):
        while self.on:
            try:
                self.jt.heartbeat(self.addr)
            except Exception as e:
                print(e)
            time.sleep(self.hb_timeout)

    # map data by applying some map function
    # task_id - unique task id
    # rds_count - number of reducers for the task
    # chunk_path - DFS path to the chunk file to map
    # map_script - DFS path to the script with the map function
    # restart_task - if True, restart the map task even if it is already completed or currently executing
    def map(self, task_id, rds_count, chunk_path, map_script, restart_task=False):
        return self.mapper.map(task_id, rds_count, chunk_path, map_script, restart_task)

    # get status of task execution for the current task
    def get_status(self, task_id, chunk_path):
        return self.mapper.get_status(task_id, chunk_path)

    # read mapped data for a specific region
    # task_id - unique task id
    # region_number - integer region assigned to the current reducer
    # Returns dict {status: Status.ok, data: list of tuples}
    # if the file does not exist, status = Status.not_found
    # if the file is empty, returns ok and an empty list
    def read_mapped_data(self, task_id, region_number):
        return self.mapper.read_mapped_data(task_id, region_number)

    # signal from the JT to start reducing
    # task_id - unique task id
    # region - region for which the reducer is responsible
    # mappers - mappers which contain data for the current task
    # script_path - DFS path to the reduce script
    def reduce(self, task_id, region, mappers, script_path):
        return self.reducer.reduce(task_id, region, mappers, script_path)

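# The Worker above talks to the job tracker through an xmlrpc ServerProxy and exposes
# plain methods (map, get_status, read_mapped_data, reduce), so it is presumably served
# over XML-RPC as well. A minimal sketch of that server side; the host/port and the
# serve_worker helper name are assumptions for illustration, not from the original code.
from xmlrpc.server import SimpleXMLRPCServer

def serve_worker(worker, host="localhost", port=9001):
    server = SimpleXMLRPCServer((host, port), allow_none=True, logRequests=False)
    # Expose the worker's public methods to the job tracker
    server.register_instance(worker)
    worker.start()          # begin sending heartbeats in a background thread
    server.serve_forever()  # block and answer map/reduce RPCs
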
def main(): """ This main() implements the following process and print out the number of shared tokens from two txt files @ outputs sorted tokenized results from txt1 @ outputs sorted tokenized results from txt2 @ merges these two sorted results and print out the number of shared tokens @ outputs the merged results """ assert not os.path.exists( sys.argv[3]), "ERROR: merged output file already exists!" temp1, temp2 = ".f1_temp", ".f2_temp" t1, t2 = open(temp1, 'w'), open(temp2, 'w') f1, f2 = open(sys.argv[1], 'r'), open(sys.argv[2], 'r') wc1, wc2 = WordCount(f1), WordCount(f2) wc1.output_to(t1) wc2.output_to(t2) t1.close() t2.close() f1.close() f2.close() t1, t2 = open(temp1, 'r'), open(temp2, 'r') ofile = open(sys.argv[3], 'a') r = Reducer(t1, t2) num_of_same_token = r.merge_to(ofile, count_same_token=True) print "# of common tokens: ", num_of_same_token t1.close() t2.close() ofile.close() subprocess.call(["rm", ".f1_temp", ".f2_temp"]) return 0
def run(self, Mapper, Reducer, data):
    # map
    mapper = Mapper()
    tuples = mapper.map(data)
    # combine
    combined = {}
    for k, v in tuples:
        if k not in combined:
            combined[k] = []
        combined[k].append(v)
    # reduce
    reducer = Reducer()
    output = reducer.reduce(combined)
    for line in output:
        print(line)

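# A minimal Mapper/Reducer pair that could be plugged into a run() helper like the one
# above, assuming map() yields (key, value) tuples and reduce() receives the combined
# dict. WordCountMapper/WordCountReducer are illustrative names, not from the original.
class WordCountMapper:
    def map(self, data):
        # Emit a (word, 1) tuple for every word in the input text
        return [(word, 1) for word in data.split()]

class WordCountReducer:
    def reduce(self, combined):
        # combined maps each key to the list of values emitted for it
        return [f"{key}\t{sum(values)}" for key, values in combined.items()]

# Usage with the run() method above (the owning object is assumed):
#   runner.run(WordCountMapper, WordCountReducer, "the quick brown fox the")
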
def __init__(self):
    """ Create singleton instance """
    # Check whether we already have an instance
    if ReductionSingleton.__instance is None:
        # Create and remember instance
        ReductionSingleton.__instance = Reducer()
    # Store instance reference as the only member in the handle
    self.__dict__['_ReductionSingleton__instance'] = ReductionSingleton.__instance

def __init__(self, parser, toklist, verbose=False):
    self._parser = parser
    self._reducer = Reducer(parser.grammar)
    self._num_sent = 0
    self._num_parsed_sent = 0
    self._num_tokens = 0
    self._total_ambig = 0.0
    self._total_tokens = 0
    self._start_time = time.time()
    self._verbose = verbose
    self._toklist = toklist

def main():
    if not os.path.exists('.data/items.json'):
        raise Exception(
            'The items.json file in the .data folder does not exist. Please run the scraping script before executing this script.'
        )
    collection_name = 'rsbuddy'
    client = MongoClient('localhost', 27017)
    database = client[collection_name]
    items = list(read_json('.data/items.json').items())
    indexes = np.array_split(np.arange(len(items)), len(items) // 50)
    threads = []
    thread_count = 2
    for index in range(len(indexes)):
        indexes_ = indexes[index]
        thread = Reducer(database, collection_name, indexes_, items)
        thread.start()
        threads.append(thread)
        if (index % thread_count == 0 and index != 0):
            for thread in threads:
                thread.join()
            threads = []

with open(input_file) as reader:
    all_data = json.loads(reader.read())
data = all_data['records']

# map to country
mapper = Mapper()
if sys.argv[2]:
    query_continent = sys.argv[2]
    country_data = mapper.map_continent(data, query_continent)
else:
    country_data = mapper.map_continent(data, "all")

# get stats of each country
reducer = Reducer()
country_stats = reducer.get_country_stats(country_data)
country_deaths_counts, country_cases_counts, country_mortality_counts, country_infection_counts = reducer.slice_country_stats(country_stats)

# sort the data
final_result = {}
sorted_deaths = reducer.sort(country_deaths_counts, 10)
final_result['Top_Deaths'] = sorted_deaths
sorted_cases = reducer.sort(country_cases_counts, 10)
final_result['Top_Cases'] = sorted_cases
sorted_infection = reducer.sort(country_infection_counts, 10)
final_result['Top_Infection'] = sorted_infection
sorted_mortality = reducer.sort(country_mortality_counts, 10)
final_result['Top_Mortality'] = sorted_mortality

output_json = json.dumps(final_result, indent=4)

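# reducer.sort(counts, 10) above apparently keeps the ten largest entries of a
# per-country counts mapping. A hypothetical stand-in illustrating that behavior;
# this is an assumption for illustration, not the original Reducer.sort implementation.
def sort_top_n(counts, n):
    # Keep the n largest (country, value) pairs, highest value first
    return dict(sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:n])

# e.g. sort_top_n({"A": 3, "B": 10, "C": 7}, 2) -> {"B": 10, "C": 7}
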
def parse_tokens(toklist, mim_tags, fast_p):
    """ Parse the given token list and return a result dict """
    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    tag_ix = 0
    ntags = len(mim_tags)

    rdc = Reducer(fast_p.grammar)

    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if slen:
                # Parse the accumulated sentence
                err_index = None
                num = 0  # Number of tree combinations in forest
                score = 0  # Reducer score of the best parse tree
                try:
                    # Progress indicator: sentence count
                    print("{}".format(num_sent), end="\r")
                    # Parse the sentence
                    forest = fast_p.go(sent)
                    if forest:
                        num = Fast_Parser.num_combinations(forest)
                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                except ParseError as e:
                    forest = None
                    num = 0
                    # Obtain the index of the offending token
                    err_index = e.token_index
                if num > 0:
                    num_parsed_sent += 1
                    # Extract the POS tags for the terminals in the forest
                    pos_tags = find_pos_tags(forest)
                    # Calculate the 'ambiguity factor'
                    ambig_factor = num ** (1 / slen)
                    # Do a weighted average on sentence length
                    total_ambig += ambig_factor * slen
                    total_tokens += slen
                # Mark the sentence beginning with the number of parses
                # and the index of the offending token, if an error occurred
                toklist[sent_begin] = TOK.Begin_Sentence(num_parses=num, err_index=err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)
            # Check whether the token streams are in sync
            if tag_ix < ntags and t[1] != mim_tags[tag_ix][1]:
                # print("Warning: mismatch between MIM token '{0}' and Greynir token '{1}'"
                #       .format(mim_tags[tag_ix][1], t[1]))
                # Attempt to sync again by finding the Greynir token in the MIM tag stream
                gap = 1
                MAX_LOOKAHEAD = 4
                while gap < MAX_LOOKAHEAD and (tag_ix + gap) < ntags and mim_tags[tag_ix + gap][1] != t[1]:
                    gap += 1
                if gap < MAX_LOOKAHEAD:
                    # Found the Greynir token ahead
                    # print("Re-synced by skipping ahead by {0} tokens".format(gap))
                    tag_ix += gap
            if tag_ix < ntags:
                tag_ix += 1

    return dict(
        tokens=toklist,
        tok_num=len(toklist),
        num_sent=num_sent,
        num_parsed_sent=num_parsed_sent,
        avg_ambig_factor=(total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )

def parse_tokens(toklist, mim_tags, fast_p):
    """ Parse the given token list and return a result dict """
    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    tag_ix = 0
    ntags = len(mim_tags)

    rdc = Reducer(fast_p.grammar)

    for ix, t in enumerate(toklist):
        if t[0] == TOK.S_BEGIN:
            num_sent += 1
            sent = []
            sent_begin = ix
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if slen:
                # Parse the accumulated sentence
                err_index = None
                num = 0  # Number of tree combinations in forest
                score = 0  # Reducer score of the best parse tree
                try:
                    # Progress indicator: sentence count
                    print("{}".format(num_sent), end="\r")
                    # Parse the sentence
                    forest = fast_p.go(sent)
                    if forest:
                        num = Fast_Parser.num_combinations(forest)
                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                except ParseError as e:
                    forest = None
                    # Obtain the index of the offending token
                    err_index = e.token_index
                if num > 0:
                    num_parsed_sent += 1
                    # Extract the POS tags for the terminals in the forest
                    pos_tags = find_pos_tags(forest)
                    # Calculate the 'ambiguity factor'
                    ambig_factor = num ** (1 / slen)
                    # Do a weighted average on sentence length
                    total_ambig += ambig_factor * slen
                    total_tokens += slen
                # Mark the sentence beginning with the number of parses
                # and the index of the offending token, if an error occurred
                toklist[sent_begin] = TOK.Begin_Sentence(num_parses=num, err_index=err_index)
        elif t[0] == TOK.P_BEGIN:
            pass
        elif t[0] == TOK.P_END:
            pass
        else:
            sent.append(t)
            # Check whether the token streams are in sync
            if tag_ix < ntags and t[1] != mim_tags[tag_ix][1]:
                print("Warning: mismatch between MIM token '{0}' and Reynir token '{1}'"
                      .format(mim_tags[tag_ix][1], t[1]))
                # Attempt to sync again by finding the Reynir token in the MIM tag stream
                gap = 1
                MAX_LOOKAHEAD = 3
                while gap < MAX_LOOKAHEAD and (tag_ix + gap) < ntags and mim_tags[tag_ix + gap][1] != t[1]:
                    gap += 1
                if gap < MAX_LOOKAHEAD:
                    # Found the Reynir token ahead
                    print("Re-synced by skipping ahead by {0} tokens".format(gap))
                    tag_ix += gap
            if tag_ix < ntags:
                tag_ix += 1

    return dict(
        tokens=toklist,
        tok_num=len(toklist),
        num_sent=num_sent,
        num_parsed_sent=num_parsed_sent,
        avg_ambig_factor=(total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )

def execute(self, map_func, reduce_func, kill_idx=-1):
    '''
    Executes the Master worker to complete the MapReduce task
    Args:
        1. map_func - handle for UDF map function
        2. reduce_func - handle for UDF reduce function
        3. kill_idx - specifies the worker to be killed; used to simulate fault tolerance when >= 0
    '''
    # Logic for coordinating mappers and reducers
    self.mappers = []
    self.reducers = []
    self.active_reducers = []

    # Instantiate mappers
    for idx in range(len(self.input_file_paths)):
        self.mappers.append(
            Mapper(idx, self.R, self.input_file_paths[idx],
                   f'{self.TMP_DIR}/intermediate', map_func))

    # NOTE: Keeping this for future execution time comparison
    # for m in mappers:
    #     m.execute_map()
    #     while (m.status != 'DONE'):
    #         continue
    #     self.active_reducers = self.active_reducers | m.reducer_ids
    #     print('MAPPER {} finished executing'.format(m.id + 1))

    print("Map phase:")
    self.phase_flag = 0

    # Instantiate processes for the map phase
    self.processes = [None] * self.M
    self.reducer_ids = [None] * self.M
    self.ps, self.cs = [None] * self.M, [None] * self.M
    self.mapper_status = [True] * self.M
    self.attempts = [0] * self.M

    for i, m in enumerate(self.mappers):
        # Queue used for message passing
        self.reducer_ids[i] = mp.Queue()
        # ps[i], cs[i] = mp.Pipe()
        self.cs[i] = mp.Queue()
        self.processes[i] = mp.Process(target=m.execute_map,
                                       args=(self.reducer_ids[i], self.cs[i]))
        # Execute mapper
        self.processes[i].start()

        # Simulate a process crash to test fault tolerance
        if (kill_idx == i):
            print(f"Killing process {i}")
            self.processes[i].kill()

    # Code for testing the fault tolerance timeout
    if (kill_idx == -2):
        print(f"Killing process 1")
        self.processes[1].kill()

    # Wait until all mappers have finished
    # mapping_status: checks whether the phase is complete
    mapping_status = False
    while (mapping_status == False):
        mapping_status = True
        for i, m in enumerate(self.mappers):
            curr_status = None
            while True:
                try:
                    # Heartbeat message
                    [curr_status, timestamp] = self.cs[i].get(timeout=self.timeout)
                    break
                except:
                    # No message received; check if max attempts reached
                    if (self.attempts[i] < self.max_attempts):
                        # Restart a replacement worker, increment attempt count
                        self.restart_process(i, self.M, kill_idx)
                        self.attempts[i] += 1
                    else:
                        for i, m in enumerate(self.mappers):
                            self.processes[i].kill()
                        raise ValueError(
                            "RETRY_ERROR: Maximum attempts reached, job failed")
            # Check the status received
            if curr_status == 'DONE' and self.mapper_status[i] == True:
                self.mapper_status[i] = False
                # Get all valid reducer_ids
                self.active_reducers += self.reducer_ids[i].get()
                # Wait until all processes have been completed
                self.processes[i].join()
            else:
                mapping_status = False

    print("\nAll mappers have finished executing")
    print("\nReduce phase:")
    self.phase_flag = 1

    # NOTE: Keeping this for future execution time comparison
    # for r in reducer:
    #     r.execute_reduce()
    #     while (r.status != 'DONE'):
    #         continue
    #     print('REDUCER {} finished executing'.format(r.id + 1))

    # Similar to the map phase, instantiate all reducers and processes
    self.active_reducers = list(set(self.active_reducers))
    self.processes = [None] * self.R
    self.ps, self.cs = [None] * self.R, [None] * self.R
    self.reducer_status = [True] * len(self.active_reducers)

    for idx in (self.active_reducers):
        self.reducers.append(
            Reducer(idx, len(self.input_file_paths),
                    f'{self.TMP_DIR}/intermediate', self.OUT_DIR, reduce_func))

    # Set up processes for the reducers
    for i, r in enumerate(self.reducers):
        self.cs[i] = mp.Queue()
        self.processes[i] = mp.Process(target=r.execute_reduce, args=(self.cs[i],))
        self.processes[i].start()

        # Kill certain workers to test fault tolerance
        if (kill_idx == i):
            print(f"Killing process {i + 1}")
            self.processes[i].kill()

    # Check for heartbeat messages, similar to the map phase
    reducing_status = False
    while reducing_status == False:
        reducing_status = True
        for i, r in enumerate(self.reducers):
            curr_status = None
            while True:
                try:
                    # print(self.reducer_status[i])
                    if (self.reducer_status[i] is True):
                        [curr_status, timestamp] = self.cs[i].get(timeout=self.timeout)
                    break
                except:
                    if (self.attempts[i] < self.max_attempts):
                        self.restart_process(i, self.R, kill_idx)
                        self.attempts[i] += 1
                    else:
                        print("Max attempts reached, task not completed")
                        for i, m in enumerate(self.reducers):
                            self.processes[i].kill()
                        raise ValueError(
                            "TIMEOUT ERROR: Max attempts reached, task not completed")
            if curr_status == 'DONE' and self.reducer_status[i] == True:
                self.reducer_status[i] = False
                self.processes[i].join()
            elif curr_status == 'RUNNING':
                reducing_status = False

    print("\nAll reducing tasks have been completed")

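# The master above polls cs[i].get(timeout=...) for [status, timestamp] pairs. A sketch
# of the worker side of that heartbeat protocol, inferred from the reading code; the
# exact payloads and the run_with_heartbeat helper are assumptions, not the original
# Mapper/Reducer implementation.
import threading
import time
import multiprocessing as mp

def run_with_heartbeat(status_queue, do_work, interval=0.5):
    done = threading.Event()

    def beat():
        while not done.is_set():
            status_queue.put(['RUNNING', time.time()])  # periodic liveness signal
            time.sleep(interval)

    threading.Thread(target=beat, daemon=True).start()
    do_work()                                           # the actual map/reduce task
    done.set()
    status_queue.put(['DONE', time.time()])             # final status for the master
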
def clean(cls, reducer_cls=None):
    if reducer_cls is None:
        ReductionSingleton.__instance = Reducer()
    else:
        ReductionSingleton.__instance = reducer_cls()

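# A minimal, self-contained sketch of the singleton-handle pattern that the two
# ReductionSingleton methods above (the __init__ and clean()) come from. The stand-in
# Reducer class and the __getattr__/__setattr__ delegation are common companions to
# this pattern and are assumptions for illustration, not taken from the original source.
class Reducer:
    """Stand-in reduction state object (assumption for illustration)."""
    def __init__(self):
        self.steps = []

class ReductionSingleton(object):
    """Handle that always exposes the same shared Reducer instance."""
    __instance = None

    def __init__(self):
        if ReductionSingleton.__instance is None:
            ReductionSingleton.__instance = Reducer()
        self.__dict__['_ReductionSingleton__instance'] = ReductionSingleton.__instance

    @classmethod
    def clean(cls, reducer_cls=None):
        # Reset the shared instance, optionally with a custom Reducer class
        ReductionSingleton.__instance = Reducer() if reducer_cls is None else reducer_cls()

    def __getattr__(self, attr):
        # Delegate attribute reads to the shared instance (assumed companion code)
        return getattr(self.__instance, attr)

    def __setattr__(self, attr, value):
        # Delegate attribute writes to the shared instance (assumed companion code)
        return setattr(self.__instance, attr, value)

# Two handles share the same underlying Reducer
a, b = ReductionSingleton(), ReductionSingleton()
a.steps.append("load")
assert b.steps == ["load"]
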
def analyze():
    """ Find word categories in the submitted text """
    txt = request.form.get("txt", "").strip()
    # Tokenize the text entered as-is and return the token list
    toklist = list(tokenize(txt))
    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0

    with Fast_Parser(verbose=False) as bp:  # Don't emit diagnostic messages
        rdc = Reducer(bp.grammar)
        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0  # Number of tree combinations in forest
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)
                            if num > 1:
                                # Reduce the resulting forest
                                forest = rdc.go(forest)
                                assert Fast_Parser.num_combinations(forest) == 1
                            # Mark the token list with the identified word categories
                            mark_categories(forest, toklist, sent_begin + 1)
                    except ParseError as e:
                        # Obtain the index of the offending token
                        err_index = e.token_index
                    print("Parsed sentence of length {0} with {1} combinations{2}"
                          .format(slen, num,
                                  "\n" + (" ".join(s[1] for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor'
                        ambig_factor = num ** (1 / slen)
                        # Do a weighted average on sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(num_parses=num, err_index=err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(
        tokens=toklist,
        tok_num=len(toklist),
        num_sent=num_sent,
        num_parsed_sent=num_parsed_sent,
        avg_ambig_factor=(total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
    # Return the tokens as a JSON structure to the client
    return jsonify(result=result)

def main():
    nodeId = sys.argv[1]
    ip = sys.argv[2]
    # instantiate reducer
    reduce = Reducer(nodeId, ip, 5004)

################

print '------------------'
print('Global grouping...')
print '------------------'

listOfDirectory = []
globalGrouperDirectory = '/Users/lcambier/TempMapReduce/mapper_and_groupper_logs2/'
for i in range(0, totalNumberOfGrouper):
    listOfDirectory.append('/Users/lcambier/TempMapReduce/mapper_and_groupper_logs2/')
globalDict = Grouper.globalGrouper(saveStateNameGrouper, listGrouperNum, listLastCallNum,
                                   listOfDirectory, globalGrouperDirectory)
print('Global grouping done.')

############
# Reducing #
############

print '------------'
print('Reducing ...')
print '------------'

outputDict = dict()
for key, globalNodeFileName in globalDict.iteritems():
    reduceIterator = ReduceFromGroupIterator(globalNodeFileName)
    theReduceContext = ReduceContext(key, reduceIterator)
    outputDict[key] = Reducer.reduce(theReduceContext)
print('Reducing done.')

##########
# OUTPUT #
##########

print '\n------------------------------\nOutput\n------------------------------\n'
for key in outputDict:
    print str(key) + ' - ' + str(outputDict[key])
# print 'apta : ' + str(outputDict['apta']) + ' vs 7'

class Gui:
    def __init__(self, rule_list, random_forest):
        self.random_forest = random_forest
        self.reducer = Reducer(rule_list, self.random_forest)
        # self.red_ruleset = rule_list
        # self.new_ruleset = rule_list

    @staticmethod
    def print_rule(rule, feature_names):
        ret = 'if '
        for i in range(0, len(rule) - 1):
            if rule[i][1] == 'l':
                lower_greater = '<='
            else:
                lower_greater = '>'
            ret += feature_names[rule[i][0]] + " " + lower_greater + " " + str(rule[i][2])
            if i < len(rule) - 2:
                ret += ' and '
            else:
                ret += ' then '
        if rule[len(rule) - 1][0] > rule[len(rule) - 1][1]:
            ret += '\nHEALTHY!'
        else:
            ret += '\nDISEASED!'
        return ret

    @staticmethod
    def print_all_rules(ruleset, feature_names):
        ret = ''
        for rule in ruleset:
            ret += Gui.print_rule(rule, feature_names)
            ret += ' \n '
        return ret

    # implementation of the GUI
    # feature_names: list of strings with all feature names
    # ruleset: array of array of arrays with all rules
    # X_train: dataframe with all data samples of the training set
    # y_train: ground truth of X_train as array
    # X_test: dataframe with all data samples of the test set
    # y_test: ground truth of X_test as array
    def window(self, feature_names, data_set_name, ruleset, X_train, y_train, X_test, y_test):
        global red_ruleset
        global new_ruleset
        red_ruleset = ruleset
        new_ruleset = ruleset

        feature_info = "Please name your favourite features. Rules containing them will be less likely to be deleted " \
                       "from the rule set. You can name as many as you want. The order matters: the first feature is " \
                       "treated as the most preferred one. Please separate the features with a comma. An example would " \
                       "be: \n \t 1,2,3"
        perc_info = "Please name the percentage of the size of the original rule set, you would like the reduced rule set " \
                    "to have. Please only type in the number, without the percent sign. An example would be: \n \t 30"

        # eliminate useless queries within a rule
        def first_reduction():
            global red_ruleset
            global new_ruleset
            red_ruleset = self.reducer.reduce_rules()
            red1_label.config(text="new rule size: " + str(len(red_ruleset)))

        # reduce the rule set based on the given percentage and preferred features
        def reduce_action():
            global new_ruleset
            global red_ruleset
            features = eingabefeld.get()
            percentage = entrytext.get()
            if features == "":
                features = []
            else:
                features = helpers.string_to_int_list(features)
            if percentage == "":
                reduce_label.config(text="no percentage set")
            else:
                numtoelim = int((1 - (int(percentage) / 100)) * len(red_ruleset))
                new_ruleset = self.reducer.eliminate_weakest_rules_2(
                    favourite_features=features,
                    k=4,
                    numtoelim=numtoelim,
                    ruleset=red_ruleset,
                    xtrain=X_train,
                    ytrain=y_train)
                vector_pred = self.random_forest.apply_ruleset_get_vector_new(
                    ruleset=new_ruleset, xtest=X_test)
                if DEBUG:
                    print("gui: vector pred len: %s" % len(vector_pred))
                    print("gui: y_test len: %s" % len(y_test))
                acc = self.random_forest.get_accuracy_of_ruleset_new(
                    ruleset=new_ruleset, xtest=X_test, ytest=y_test)
                spec = helpers.get_specificity(reslist=vector_pred, truevals=y_test)
                if DEBUG:
                    print("gui: spec: %s" % spec)
                sens = helpers.get_sensitivity(reslist=vector_pred, truevals=y_test)
                reduce_label.config(text="New Rule Size: " + str(len(new_ruleset)))
                acc_label.config(text="Accuracy: " + str(acc) + ", Sensitivity: " +
                                 str(sens) + ", Specificity: " + str(spec))

        def predict_action():
            global new_ruleset
            global red_ruleset
            f0_text = e_f0.get()
            f1_text = e_f1.get()
            f2_text = e_f2.get()
            f3_text = e_f3.get()
            f4_text = e_f4.get()
            f5_text = e_f5.get()
            f6_text = e_f6.get()
            f7_text = e_f7.get()
            f8_text = e_f8.get()
            f9_text = e_f9.get()
            f10_text = e_f10.get()
            f11_text = e_f11.get()
            f12_text = e_f12.get()
            f13_text = e_f13.get()
            f14_text = e_f14.get()
            f15_text = e_f15.get()
            f16_text = e_f16.get()
            f17_text = e_f17.get()
            f18_text = e_f18.get()
            f19_text = e_f19.get()
            f20_text = e_f20.get()
            f21_text = e_f21.get()
            if ((f0_text == "") | (f1_text == "") | (f2_text == "") | (f3_text == "")
                    | (f4_text == "") | (f5_text == "") | (f6_text == "") | (f7_text == "")
                    | (f8_text == "") | (f9_text == "") | (f10_text == "") | (f11_text == "")
                    | (f12_text == "") | (f13_text == "") | (f14_text == "") | (f15_text == "")
                    | (f16_text == "") | (f17_text == "") | (f18_text == "") | (f19_text == "")
                    | (f20_text == "") | (f21_text == "")):
                predict_label.config(text="not all features set")
            else:
                vec = [
                    float(f0_text), float(f1_text), float(f2_text), float(f3_text),
                    float(f4_text), float(f5_text), float(f6_text), float(f7_text),
                    float(f8_text), float(f9_text), float(f10_text), float(f11_text),
                    float(f12_text), float(f13_text), float(f14_text), float(f15_text),
                    float(f16_text), float(f17_text), float(f18_text), float(f19_text),
                    float(f20_text), float(f21_text)
                ]
                df = pd.DataFrame([vec], columns=feature_names)
                pred = self.random_forest.apply_ruleset_get_vector_new(
                    ruleset=new_ruleset, xtest=df)
                if pred[0] == 0:
                    string = "HEALTHY!"
                else:
                    string = "ALZHEIMERS DISEASE"
                predict_label.config(text="Prediction: " + string + "!")

        def message_features():
            tkMessageBox.showinfo("Favourite Features", feature_info)

        def message_percentage():
            tkMessageBox.showinfo("Percentage", perc_info)

        def print_rules_():
            win = Toplevel(fenster)
            win.title("All Rules in Reduced Rule Set")
            scroll = Scrollbar(win)
            # scroll.pack(side = RIGHT, fill = Y)
            scroll.grid(row=0, column=1, sticky=N + S)
            txt = Text(win, wrap=WORD, yscrollcommand=scroll.set, xscrollcommand=scroll.set)
            txt.grid(row=0, column=0, sticky=N + S + E + W)
            # txt.insert(INSERT, build_string_ruleset(ruleset=self.new_ruleset, featurenames=feature_names))
            txt.insert(INSERT, Gui.print_all_rules(new_ruleset, feature_names))
            # txt.insert(INSERT, "TEST")
            scroll.config(command=txt.yview)

        def bar_chart_orig_rules():
            global new_ruleset
            global red_ruleset
            wind = Toplevel(fenster)
            wind.title("Number of rules containing respective features in original rule set")
            f = Figure(figsize=(5, 4), dpi=100)
            ax = f.add_subplot(111)
            data = helpers.get_number_feat_in_rules(ruleset=red_ruleset, features=range(0, 22))
            ind = np.arange(22)
            width = .5
            rects1 = ax.bar(ind, data, width)
            canvas = FigureCanvasTkAgg(f, master=wind)
            canvas.draw()
            canvas.get_tk_widget().pack(side=TOP, fill=BOTH, expand=1)

        def bar_chart_red_rules():
            global new_ruleset
            global red_ruleset
            wind = Toplevel(fenster)
            wind.title("Number of rules containing respective features in reduced rule set")
            f = Figure(figsize=(5, 4), dpi=100)
            ax = f.add_subplot(111)
            data = helpers.get_number_feat_in_rules(ruleset=new_ruleset, features=range(0, 22))
            ind = np.arange(22)  # the x locations for the groups
            width = .5
            rects1 = ax.bar(ind, data, width)
            canvas = FigureCanvasTkAgg(f, master=wind)
            canvas.draw()
            canvas.get_tk_widget().pack(side=TOP, fill=BOTH, expand=1)

        # creating the main window
        fenster = Tk()
        fenster.title("Decision Support")

        # information labels
        dataset = Label(fenster, text=data_set_name)
        numrules = Label(fenster, text="Number of Rules: " + str(len(ruleset)))
        feat_label = Label(fenster, text="Favourite Features (optional) ")
        perc_label = Label(fenster, text="Percentage")
        label_f0 = Label(fenster, text=feature_names[0])
        label_f1 = Label(fenster, text=feature_names[1])
        label_f2 = Label(fenster, text=feature_names[2])
        label_f3 = Label(fenster, text=feature_names[3])
        label_f4 = Label(fenster, text=feature_names[4])
        label_f5 = Label(fenster, text=feature_names[5])
        label_f6 = Label(fenster, text=feature_names[6])
        label_f7 = Label(fenster, text=feature_names[7])
        label_f8 = Label(fenster, text=feature_names[8])
        label_f9 = Label(fenster, text=feature_names[9])
        label_f10 = Label(fenster, text=feature_names[10])
        label_f11 = Label(fenster, text=feature_names[11])
        label_f12 = Label(fenster, text=feature_names[12])
        label_f13 = Label(fenster, text=feature_names[13])
        label_f14 = Label(fenster, text=feature_names[14])
        label_f15 = Label(fenster, text=feature_names[15])
        label_f16 = Label(fenster, text=feature_names[16])
        label_f17 = Label(fenster, text=feature_names[17])
        label_f18 = Label(fenster, text=feature_names[18])
        label_f19 = Label(fenster, text=feature_names[19])
        label_f20 = Label(fenster, text=feature_names[20])
        label_f21 = Label(fenster, text=feature_names[21])
        red1_label = Label(fenster)
        reduce_label = Label(fenster)
        predict_label = Label(fenster)
        acc_label = Label(fenster)

        # Here the user can enter something
        eingabefeld = Entry(fenster, bd=5, width=40)
        entrytext = Entry(fenster, bd=5, width=40)
        e_f0 = Entry(fenster, bd=5, width=8)
        e_f1 = Entry(fenster, bd=5, width=8)
        e_f2 = Entry(fenster, bd=5, width=8)
        e_f3 = Entry(fenster, bd=5, width=8)
        e_f4 = Entry(fenster, bd=5, width=8)
        e_f5 = Entry(fenster, bd=5, width=8)
        e_f6 = Entry(fenster, bd=5, width=8)
        e_f7 = Entry(fenster, bd=5, width=8)
        e_f8 = Entry(fenster, bd=5, width=8)
        e_f9 = Entry(fenster, bd=5, width=8)
        e_f10 = Entry(fenster, bd=5, width=8)
        e_f11 = Entry(fenster, bd=5, width=8)
        e_f12 = Entry(fenster, bd=5, width=8)
        e_f13 = Entry(fenster, bd=5, width=8)
        e_f14 = Entry(fenster, bd=5, width=8)
        e_f15 = Entry(fenster, bd=5, width=8)
        e_f16 = Entry(fenster, bd=5, width=8)
        e_f17 = Entry(fenster, bd=5, width=8)
        e_f18 = Entry(fenster, bd=5, width=8)
        e_f19 = Entry(fenster, bd=5, width=8)
        e_f20 = Entry(fenster, bd=5, width=8)
        e_f21 = Entry(fenster, bd=5, width=8)

        reduce_rule_set_button = Button(fenster, text="Reduce Rule Set", command=reduce_action)
        predict_button = Button(fenster, text="Predict", command=predict_action)
        red1_button = Button(fenster, text="First Reduction", command=first_reduction)
        bar_chart_orig_button = Button(fenster,
                                       text="Show Features in Original Rule Set",
                                       command=bar_chart_orig_rules)
        bar_chart_red_button = Button(fenster,
                                      text="Show Features in Reduced Rule Set",
                                      command=bar_chart_red_rules)
        info_feat_button = Button(fenster, text="more info", command=message_features)
        info_perc_button = Button(fenster, text="more info", command=message_percentage)
        info_rules_button = Button(fenster, text="Print Rules", command=print_rules_)

        dataset.grid(row=0, column=0, columnspan=5)
        numrules.grid(row=0, column=6, columnspan=5)
        feat_label.grid(row=4, column=2, columnspan=3)
        perc_label.grid(row=5, column=2, columnspan=3)
        eingabefeld.grid(row=4, column=4, columnspan=5)
        reduce_rule_set_button.grid(row=6, column=1, columnspan=9)
        entrytext.grid(row=5, column=4, columnspan=5)
        predict_button.grid(row=12, column=1, columnspan=9)
        info_rules_button.grid(row=15, column=1, columnspan=9)
        # exit_button.grid(row = 4, column = 1)
        reduce_label.grid(row=7, column=0, columnspan=3)
        predict_label.grid(row=13, column=1, columnspan=9)
        acc_label.grid(row=7, column=3, columnspan=8)
        red1_button.grid(row=2, column=1, columnspan=9)
        red1_label.grid(row=3, column=1, columnspan=9)
        bar_chart_orig_button.grid(row=17, column=0, columnspan=5)
        bar_chart_red_button.grid(row=17, column=6, columnspan=5)
        info_feat_button.grid(row=4, column=9)
        info_perc_button.grid(row=5, column=9)
        label_f0.grid(row=8, column=0)
        label_f1.grid(row=8, column=1)
        label_f2.grid(row=8, column=2)
        label_f3.grid(row=8, column=3)
        label_f4.grid(row=8, column=4)
        label_f5.grid(row=8, column=5)
        label_f6.grid(row=8, column=6)
        label_f7.grid(row=8, column=7)
        label_f8.grid(row=8, column=8)
        label_f9.grid(row=8, column=9)
        label_f10.grid(row=8, column=10)
        label_f11.grid(row=10, column=0)
        label_f12.grid(row=10, column=1)
        label_f13.grid(row=10, column=2)
        label_f14.grid(row=10, column=3)
        label_f15.grid(row=10, column=4)
        label_f16.grid(row=10, column=5)
        label_f17.grid(row=10, column=6)
        label_f18.grid(row=10, column=7)
        label_f19.grid(row=10, column=8)
        label_f20.grid(row=10, column=9)
        label_f21.grid(row=10, column=10)
        e_f0.grid(row=9, column=0)
        e_f1.grid(row=9, column=1)
        e_f2.grid(row=9, column=2)
        e_f3.grid(row=9, column=3)
        e_f4.grid(row=9, column=4)
        e_f5.grid(row=9, column=5)
        e_f6.grid(row=9, column=6)
        e_f7.grid(row=9, column=7)
        e_f8.grid(row=9, column=8)
        e_f9.grid(row=9, column=9)
        e_f10.grid(row=9, column=10)
        e_f11.grid(row=11, column=0)
        e_f12.grid(row=11, column=1)
        e_f13.grid(row=11, column=2)
        e_f14.grid(row=11, column=3)
        e_f15.grid(row=11, column=4)
        e_f16.grid(row=11, column=5)
        e_f17.grid(row=11, column=6)
        e_f18.grid(row=11, column=7)
        e_f19.grid(row=11, column=8)
        e_f20.grid(row=11, column=9)
        e_f21.grid(row=11, column=10)

        fenster.mainloop()

def parse_grid():
    """ Show the parse grid for a particular parse tree of a sentence """
    MAX_LEVEL = 32  # Maximum level of option depth we can handle
    txt = request.form.get('txt', "")
    parse_path = request.form.get('option', "")
    debug_mode = get_json_bool(request, 'debug')
    use_reducer = not ("noreduce" in request.form)

    # Tokenize the text
    tokens = list(tokenize(txt))

    # Parse the text
    with Fast_Parser(verbose=False) as bp:  # Don't emit diagnostic messages
        err = dict()
        grammar = bp.grammar
        try:
            forest = bp.go(tokens)
        except ParseError as e:
            err["msg"] = str(e)
            # Relay information about the parser state at the time of the error
            err["info"] = None  # e.info
            forest = None

    # Find the number of parse combinations
    combinations = 0 if forest is None else Fast_Parser.num_combinations(forest)
    score = 0

    if Settings.DEBUG:
        # Dump the parse tree to parse.txt
        with open("parse.txt", mode="w", encoding="utf-8") as f:
            if forest is not None:
                print("Reynir parse forest for sentence '{0}'".format(txt), file=f)
                print("{0} combinations\n".format(combinations), file=f)
                if combinations < 10000:
                    ParseForestPrinter.print_forest(forest, file=f)
                else:
                    print("Too many combinations to dump", file=f)
            else:
                print("No parse available for sentence '{0}'".format(txt), file=f)

    if forest is not None and use_reducer:
        # Reduce the parse forest
        forest, score = Reducer(grammar).go_with_score(forest)
        if Settings.DEBUG:
            # Dump the reduced tree along with node scores
            with open("reduce.txt", mode="w", encoding="utf-8") as f:
                print("Reynir parse tree for sentence '{0}' after reduction".format(txt), file=f)
                ParseForestPrinter.print_forest(forest, file=f)

    # Make the parse grid with all options
    grid, ncols = make_grid(forest) if forest else ([], 0)
    # The grid is columnar; convert it to row-major
    # form for convenient translation into HTML
    # There will be as many columns as there are tokens
    nrows = len(grid)
    tbl = [[] for _ in range(nrows)]
    # Info about previous row spans
    rs = [[] for _ in range(nrows)]

    # The particular option path we are displaying
    if not parse_path:
        # Not specified: display the all-zero path
        path = [(0,) * i for i in range(1, MAX_LEVEL)]
    else:
        # Disassemble the passed-in path

        def toint(s):
            """ Safe conversion of string to int """
            try:
                n = int(s)
            except ValueError:
                n = 0
            return n if n >= 0 else 0

        p = [toint(s) for s in parse_path.split("_")]
        path = [tuple(p[0:i + 1]) for i in range(len(p))]

    # This set will contain all option path choices
    choices = set()
    NULL_TUPLE = tuple()

    for gix, gcol in enumerate(grid):
        # gcol is a dictionary of options
        # Accumulate the options that we want to display
        # according to the chosen path
        cols = gcol[NULL_TUPLE] if NULL_TUPLE in gcol else []  # Default content
        # Add the options we're displaying
        for p in path:
            if p in gcol:
                cols.extend(gcol[p])
        # Accumulate all possible path choices
        choices |= gcol.keys()
        # Sort the columns that will be displayed
        cols.sort(key=lambda x: x[0])
        col = 0
        for startcol, endcol, info in cols:
            # assert isinstance(info, Nonterminal) or isinstance(info, tuple)
            if col < startcol:
                gap = startcol - col
                gap -= sum(1 for c in rs[gix] if c < startcol)
                if gap > 0:
                    tbl[gix].append((gap, 1, "", ""))
            rowspan = 1
            if isinstance(info, tuple):
                cls = {"terminal"}
                rowspan = nrows - gix
                for i in range(gix + 1, nrows):
                    # Note the rowspan's effect on subsequent rows
                    rs[i].append(startcol)
            else:
                cls = {"nonterminal"}
                # Get the 'pure' name of the nonterminal in question
                # assert isinstance(info, Nonterminal)
                info = info.name
            if endcol - startcol == 1:
                cls |= {"vertical"}
            tbl[gix].append((endcol - startcol, rowspan, info, cls))
            col = endcol
        ncols_adj = ncols - len(rs[gix])
        if col < ncols_adj:
            tbl[gix].append((ncols_adj - col, 1, "", ""))

    # Calculate the unique path choices available for this parse grid
    choices -= {NULL_TUPLE}  # Default choice: don't need it in the set
    unique_choices = choices.copy()
    for c in choices:
        # Remove all shorter prefixes of c from the unique_choices set
        unique_choices -= {c[0:i] for i in range(1, len(c))}

    # Create a nice string representation of the unique path choices
    uc_list = ["_".join(str(c) for c in choice) for choice in unique_choices]

    if not parse_path:
        # We are displaying the longest possible all-zero choice: find it
        i = 0
        while (0,) * (i + 1) in unique_choices:
            i += 1
        parse_path = "_".join(["0"] * i)

    return render_template("parsegrid.html",
                           txt=txt,
                           err=err,
                           tbl=tbl,
                           combinations=combinations,
                           score=score,
                           debug_mode=debug_mode,
                           choice_list=uc_list,
                           parse_path=parse_path)

def __init__(self, rule_list, random_forest):
    self.random_forest = random_forest
    self.reducer = Reducer(rule_list, self.random_forest)

def parse(toklist, single, use_reducer, dump_forest=False, keep_trees=False):
    """ Parse the given token list and return a result dict """

    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0
    # Accumulate parsed sentences in a text dump format
    trees = OrderedDict()

    with Fast_Parser(verbose=False) as bp:  # Don't emit diagnostic messages
        version = bp.version
        rdc = Reducer(bp.grammar)

        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0  # Number of tree combinations in forest
                    score = 0  # Reducer score of the best parse tree
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)
                            if single and dump_forest:
                                # Dump the parse tree to parse.txt
                                with open("parse.txt", mode="w", encoding="utf-8") as f:
                                    print("Reynir parse tree for sentence '{0}'"
                                          .format(" ".join(sent)), file=f)
                                    print("{0} combinations\n".format(num), file=f)
                                    if num < 10000:
                                        ParseForestPrinter.print_forest(forest, file=f)
                                    else:
                                        print("Too many combinations to dump", file=f)
                            if use_reducer and num > 1:
                                # Reduce the resulting forest
                                forest, score = rdc.go_with_score(forest)
                                assert Fast_Parser.num_combinations(forest) == 1
                                if Settings.DEBUG:
                                    print(ParseForestDumper.dump_forest(forest))
                                num = 1
                    except ParseError as e:
                        forest = None
                        # Obtain the index of the offending token
                        err_index = e.token_index

                    if Settings.DEBUG:
                        print("Parsed sentence of length {0} with {1} combinations, score {2}{3}"
                              .format(slen, num, score,
                                      "\n" + (" ".join(s[1] for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor'
                        ambig_factor = num ** (1 / slen)
                        # Do a weighted average on sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                        if keep_trees:
                            # We want to keep the trees for further processing down the line:
                            # reduce and dump the best tree to text
                            if num > 1:
                                # Reduce the resulting forest before dumping it to text format
                                forest = rdc.go(forest)
                            trees[num_sent] = ParseForestDumper.dump_forest(forest)

                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(num_parses=num, err_index=err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(
        version=version,
        tokens=toklist,
        tok_num=len(toklist),
        num_sent=num_sent,
        num_parsed_sent=num_parsed_sent,
        avg_ambig_factor=(total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
    # noinspection PyRedundantParentheses
    return (result, trees)

def analyze():
    """ Find word categories in the submitted text """
    txt = request.form.get("txt", "").strip()
    # Tokenize the text entered as-is and return the token list
    toklist = list(tokenize(txt))
    # Count sentences
    num_sent = 0
    num_parsed_sent = 0
    total_ambig = 0.0
    total_tokens = 0
    sent = []
    sent_begin = 0

    with Fast_Parser(verbose=False) as bp:  # Don't emit diagnostic messages
        rdc = Reducer(bp.grammar)
        for ix, t in enumerate(toklist):
            if t[0] == TOK.S_BEGIN:
                num_sent += 1
                sent = []
                sent_begin = ix
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if slen:
                    # Parse the accumulated sentence
                    err_index = None
                    num = 0  # Number of tree combinations in forest
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest:
                            num = Fast_Parser.num_combinations(forest)
                            if num > 1:
                                # Reduce the resulting forest
                                forest = rdc.go(forest)
                                assert Fast_Parser.num_combinations(forest) == 1
                            # Mark the token list with the identified word categories
                            mark_categories(forest, toklist, sent_begin + 1)
                    except ParseError as e:
                        # Obtain the index of the offending token
                        err_index = e.token_index
                    print("Parsed sentence of length {0} with {1} combinations{2}"
                          .format(slen, num,
                                  "\n" + (" ".join(s[1] for s in sent) if num >= 100 else "")))
                    if num > 0:
                        num_parsed_sent += 1
                        # Calculate the 'ambiguity factor'
                        ambig_factor = num ** (1 / slen)
                        # Do a weighted average on sentence length
                        total_ambig += ambig_factor * slen
                        total_tokens += slen
                    # Mark the sentence beginning with the number of parses
                    # and the index of the offending token, if an error occurred
                    toklist[sent_begin] = TOK.Begin_Sentence(num_parses=num, err_index=err_index)
            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

    result = dict(
        tokens=toklist,
        tok_num=len(toklist),
        num_sent=num_sent,
        num_parsed_sent=num_parsed_sent,
        avg_ambig_factor=(total_ambig / total_tokens) if total_tokens > 0 else 1.0
    )
    # Return the tokens as a JSON structure to the client
    return jsonify(result=result)