def hy_prediction(dataio, patient_info, patient_array, params):
    """Train and evaluate the H&Y predictor over repeated ratio splits.

    Reads the task name from ``params['task'][1]`` and the output location
    from ``params['result_path']``.  For each of ``krun`` runs a split is
    drawn, an ``HYPredictor`` is trained, its weights are stored and the
    model is scored with ``Evaluator``.  The mean accuracy is printed and the
    collected weights are handed to ``feature.get_pred_feature``.

    The function returns nothing; results are printed / delegated.
    """
    # --- Task setting ---
    task = params['task'][1]
    split_method = 'ratio'
    ratio = 0.8  # ratio handed to the splitter
    krun = 10  # number of repeated runs whose accuracy is averaged

    # --- Initialisation ---
    n_feature = dataio.feature.feature_len
    acc = np.zeros(krun, dtype='float32')  # evaluation metric, one per run
    param_w = np.zeros([n_feature, krun], dtype='float32')  # weights, one column per run

    # --- H&Y reading ---
    patient_info = dataio.feature.get_hy_stage(patient_info, patient_array)
    print('-----')

    # An earlier variant additionally passed patient_info to Splitter.
    splitter = Splitter(task, patient_array, ratio, split_method)
    for run in range(krun):
        # Data splitting
        train_data, test_data = splitter.get_splitter(run)

        # Model training (the in-sample predictions are not used here)
        predictor = HYPredictor(run, patient_info, train_data,
                                params['result_path'])
        model, _ = predictor.train_model()
        weights, _ = predictor.get_param()
        param_w[:, run] = weights

        # Evaluating
        evaluator = Evaluator(model, test_data, patient_info, task, predictor)
        acc[run] = evaluator.compute_accuracy()

    print('-----')
    print('Accuracy of the %s task: %f' % (task, np.sum(acc) / krun))

    # --- Displaying features selected by the prediction model ---
    # NOTE(review): the tag is 'yh' while the task is H&Y ('hy'?) -- confirm
    # against get_pred_feature before changing it.
    dataio.feature.get_pred_feature(param_w, krun, 'yh')
def __init__(self, dset_name, net_names, hard_labels, device, exp=None):
    """Set up the directory layout and pipeline stages for one experiment.

    Args:
        dset_name: key into ``cfg.DSETS``; also used as a sub-directory name.
        net_names: forwarded to the extractor, augmenter and trainer.
        hard_labels: truthy selects the ``'hard'`` sub-directories,
            otherwise ``'soft'``.
        device: forwarded to the extractor and trainer.
        exp: experiment number.  When ``None`` the first number whose
            ``exp_<n>`` directory does not yet exist under ``cfg.DATA_DIR``
            is used.
    """
    def exp_path(index):
        return osp.join(cfg.DATA_DIR, 'exp_' + str(index))

    if exp is None:
        # Probe exp_0, exp_1, ... until an unused directory name is found.
        exp = 0
        while osp.exists(exp_path(exp)):
            exp += 1

    self.exp_dir = exp_path(exp)
    self.num_exp = exp

    label_kind = 'hard' if hard_labels else 'soft'
    dset_dir = osp.join(self.exp_dir, dset_name)
    self.splitting_dir = osp.join(dset_dir, cfg.SPLITTING_DIR)
    self.feat_dir = osp.join(dset_dir, cfg.FEATURE_DIR)
    self.label_dir = osp.join(dset_dir, cfg.LABEL_DIR, label_kind)
    self.net_dir = osp.join(dset_dir, cfg.NET_DIR, label_kind)
    self.res_dir = osp.join(dset_dir, cfg.RESULT_DIR, label_kind)

    self.dset = cfg.DSETS[dset_name]

    # Pipeline stages, each wired to the directories it works with.
    self.splitter = Splitter(self.dset, self.splitting_dir)
    self.extractor = Extractor(
        self.dset, self.splitting_dir, self.feat_dir, net_names, device)
    self.augmenter = Augmenter(
        self.dset, self.splitting_dir, self.feat_dir, self.label_dir,
        net_names, hard_labels)
    self.trainer = Trainer(
        self.dset, self.label_dir, self.net_dir, self.res_dir,
        net_names, hard_labels, device)
def main():
    """Interactive entry point: optionally read lines and print their split.

    Changes the working directory, creates the logger and the splitter, then
    asks whether lines should be entered manually.  If the answer contains a
    ``y``/``Y`` the user is prompted repeatedly; every line is split and
    printed until ``quit`` or ``exit`` is entered.
    """
    # Set working directory to project folder
    os.chdir('../.')
    print(os.getcwd())

    # Create Logger to track all changes (kept bound for the whole run)
    logger = Logger(os)

    # Words the parser should ignore, e.g. ['PSY', 'STAT']
    stop_words = []
    ninja = Splitter(stop_words)

    ans = input('Do you want to manually input lines? ')
    if not any(marker in ans for marker in ('y', 'Y')):
        # Nothing to do in non-interactive mode.
        return

    # Messenger asks the prompts
    messenger = Messenger()
    while True:
        line = messenger.collect_input()
        if line in ('quit', 'exit'):
            break
        # Output would be collected here
        print(ninja.split_line(line))
def add_splitter(self, model_name, receptacle_id):
    """Plug a new splitter into ``receptacle_id`` and register its outlets.

    Args:
        model_name: key into ``SPLITTER_MODELS``; the mapped value is the
            number of receptacles the splitter provides.
        receptacle_id: id of the currently empty receptacle the splitter is
            connected to.

    Returns:
        A ``Results`` object carrying the new splitter id and the list of
        ids of the receptacles it provides.

    Raises:
        KeyError: if ``model_name`` is not in ``SPLITTER_MODELS``.
        ValueError: if ``receptacle_id`` is not in ``empty_receptacle_ids``.
    """
    splitter_id = self.global_ids.get_next_splitter_id()

    # Allocate one fresh id per receptacle.  A comprehension leaves no loop
    # variable behind, so nothing has to be deleted afterwards (the former
    # `del(r_id)` crashed for models with zero receptacles).
    receptacle_count = SPLITTER_MODELS[model_name]
    rids = [self.global_ids.get_next_receptacle_id()
            for _ in range(receptacle_count)]
    if DEBUG:
        print('Support.add_splitter() receptacle_count:{0}, rids:{1}'.format(receptacle_count, rids))

    sp = Splitter(model_name, splitter_id, rids)
    receptacle = self.get_receptacle_by_id(receptacle_id)
    receptacle.connect_load('SPLITTER', sp)

    # The host receptacle is now occupied.
    self.full_receptacle_ids.append(receptacle_id)
    self.empty_receptacle_ids.remove(receptacle_id)

    r = Results()
    r.set_object_id(splitter_id)
    r.set_next_receptacle_id_from_list(rids)
    self.splitter_ids.append(splitter_id)

    # Every receptacle of the new splitter starts out empty.
    for r_id in rids:
        if sp.get_receptacle_by_id(r_id) is None:
            # Diagnostic only: the splitter does not know an id it was given.
            print('Support.add_splitter sp.get_receptacle_by_id returned None for r1. r_id:{0}, rids:{1}'.format(r_id, rids))
        self.empty_receptacle_ids.append(r_id)

    if DEBUG:
        print('Support.add_splitter(). Final empty_receptacle_ids:{0}'.format(self.empty_receptacle_ids))
        print('Support.add_splitter(). Final full_receptacle_ids:{0}'.format(self.full_receptacle_ids))
    return r
def moca_prediction(dataio, patient_info, patient_array, params):
    """Train and evaluate the MoCA predictor over repeated ratio splits.

    Reads the task name from ``params['task'][2]`` and the output location
    from ``params['result_path']``.  For each of ``krun`` runs a split is
    drawn, a ``MoCAPredictor`` is trained, its weights are stored and the
    model is scored with ``Evaluator``.  The averaged score is printed as
    RMSE and the collected weights are handed to
    ``feature.get_pred_feature``.

    The function returns nothing; results are printed / delegated.
    """
    # --- Task setting ---
    task = params['task'][2]
    split_method = 'ratio'
    ratio = 0.8  # ratio handed to the splitter
    krun = 5  # number of repeated runs whose score is averaged

    # --- Initialisation ---
    n_feature = dataio.feature.feature_len
    rmse = np.zeros(krun, dtype='float32')  # evaluation metric, one per run
    param_w = np.zeros([n_feature, krun], dtype='float32')  # one column per run

    # --- MoCA reading ---
    patient_info = dataio.feature.get_moca_score(patient_info, patient_array)
    print('-----')

    splitter = Splitter(task, patient_array, ratio, split_method)
    for run in range(krun):
        # Data splitting
        train_data, test_data = splitter.get_splitter(run)

        # Model training (the in-sample predictions are not used here)
        predictor = MoCAPredictor(run, patient_info, train_data,
                                  params['result_path'])
        model, _ = predictor.train_model()
        param_w[:, run] = predictor.get_param()

        # Evaluating
        evaluator = Evaluator(model, test_data, patient_info, task, predictor)
        rmse[run] = evaluator.compute_accuracy()

    print('-----')
    print('RMSE of the %s task: %f' % (task, np.sum(rmse) / krun))

    # --- Displaying features selected by the prediction model ---
    dataio.feature.get_pred_feature(param_w, krun, 'moca')
def __call__(self, _, *, audio_paths=None):
    """Run the splitter over ``audio_paths`` and collect every output.

    Args:
        _: ignored positional argument.
        audio_paths: paths handed to ``Splitter``.  ``None`` (the default)
            and an empty collection both short-circuit to an empty result.

    Returns:
        A flat list with every item of every batch yielded by the loader,
        in loader order; ``[]`` when there is nothing to process.
    """
    # `None` replaces the former mutable `[]` default; both mean "no input".
    if audio_paths is None:
        audio_paths = []
    if len(audio_paths) == 0:
        return []

    dataset = Splitter(
        audio_paths,
        annotations=args["--annotations"],
        labels=args["--labels"],
        overlap=args["--overlap"],
        duration=args["--duration"],
        output_directory=args["--output_directory"],
    )
    dataloader = torch.utils.data.DataLoader(
        dataset,
        # One item per batch regardless of how many paths were passed in.
        batch_size=1,
        shuffle=False,
        num_workers=args["--cores_per_node"],
        collate_fn=dataset.collate_fn,
    )

    start = timer()
    outputs = []
    for data in dataloader:
        # Each batch is itself an iterable of outputs; flatten it.
        outputs.extend(data)
    end = timer()
    print("DEBUG: end - start", end - start)
    return outputs
def pd_prediction(dataio, patient_info, patient_array, params):
    """Train and evaluate the PD predictor with k-fold cross-validation.

    Reads the task name from ``params['task'][0]`` and the output location
    from ``params['result_path']``.  For each of ``kfold`` folds a split is
    drawn, a ``PDPredictor`` is trained, its weights are stored and the model
    is scored with ``Evaluator``, which yields an AUC and an average
    precision.  Both averages are printed and the collected weights are
    handed to ``feature.get_pred_feature``.

    The function returns nothing; results are printed / delegated.
    """
    # --- Task setting ---
    task = params['task'][0]  # disease prediction
    split_method = 'cross-validation'
    kfold = 5  # 5-fold validation

    # --- Initialisation ---
    n_feature = dataio.feature.feature_len
    auc = np.zeros(kfold, dtype='float32')  # evaluation metrics, one per fold
    ap = np.zeros(kfold, dtype='float32')
    param_w = np.zeros([n_feature, kfold], dtype='float32')  # one column per fold
    print('-----')

    splitter = Splitter(task, patient_array, kfold, split_method)
    for fold in range(kfold):
        # Data splitting: `fold` is the index of the test set
        train_data, test_data = splitter.get_splitter(fold)

        # Model training (the in-sample predictions are not used here)
        predictor = PDPredictor(fold, patient_info, train_data,
                                params['result_path'])
        model, _ = predictor.train_model()
        weights, _ = predictor.get_param()
        param_w[:, fold] = weights

        # Evaluating
        evaluator = Evaluator(model, test_data, patient_info, task, predictor)
        auc[fold], ap[fold] = evaluator.compute_accuracy()

    print('-----')
    print('AUC of the %s task: %f' % (task, np.sum(auc) / kfold))
    print('Average Precision of the %s task: %f' % (task, np.sum(ap) / kfold))

    # --- Displaying features selected by the prediction model ---
    dataio.feature.get_pred_feature(param_w, kfold, 'pd')
def test_splitter_does_the_map(self):
    """Four city spots must be assigned to exactly two named areas."""
    areas = read_kml_areas("areas.kml")

    # (name, type, lat, lng) -- values are passed through exactly as given.
    # NOTE(review): the numbers look like lng/lat swapped for Wroclaw;
    # confirm which order split_all_points expects before touching them.
    spot_rows = [
        ("skytower", "shopping mall", 17.019690, 51.094880),
        ("Biedronka close to skytower", "small shop", 17.018921, 51.097994),
        ("Panorama Racławicka", "historical building", 17.044462, 51.110171),
        ("Galeria Dominikańska", "shopping mall", 17.040685, 51.108244),
    ]
    city_spots = [
        {"name": name, "type": kind, "lat": lat, "lng": lng}
        for name, kind, lat, lng in spot_rows
    ]

    assigned = Splitter().split_all_points(city_spots, areas)

    self.assertTrue(len(assigned) == 2)
    for area_name in ('Rynek 1', 'Gajowice 1'):
        self.assertIn(area_name, assigned)
def run_splitter(batch):
    """Split every item of ``batch`` and return the flattened outputs.

    A ``Splitter`` dataset is built from ``batch`` and the command-line
    ``args``, iterated through a ``DataLoader`` one item at a time, and the
    elements of every yielded batch are gathered into a single list.  The
    elapsed time is printed as a debug line.
    """
    dataset = Splitter(
        batch,
        annotations=args["--annotations"],
        labels=args["--labels"],
        overlap=args["--overlap"],
        duration=args["--duration"],
        output_directory=args["--output_directory"],
    )

    # batch_size is pinned to 1 (not the size of `batch`).
    loader_options = dict(
        batch_size=1,
        shuffle=False,
        num_workers=args["--cores_per_node"],
        collate_fn=dataset.collate_fn,
    )
    dataloader = torch.utils.data.DataLoader(dataset, **loader_options)

    start = timer()
    outputs = [out for data in dataloader for out in data]
    end = timer()
    print("DEBUG: end - start", end - start)
    return outputs
def split_patents(self, temp_dir, filename):
    """Split every ``.xml`` file found in ``temp_dir``.

    Each file is passed to ``Splitter.split_file`` together with the target
    path ``<working_directory>/<patentDir>/<filename>``.
    """
    self.logger.info("splitting files")
    xml_paths = get_files(temp_dir, ".xml")
    patent_splitter = Splitter()
    for xml_path in xml_paths:
        target = join(self.working_directory, self.patentDir, filename)
        patent_splitter.split_file(xml_path, target)
def visitTextFile(self, textfile):
    """Partition ``textfile`` across the workers and record the operation.

    The file is split into ``len(self.workers)`` parts, the resulting
    ``FilePartition`` is stored in ``self.operations`` under the text file's
    id, and the collect count is updated for it.
    """
    split_parts = Splitter(textfile.filePath, len(self.workers)).split()
    partition = FilePartition(
        textfile.id, len(self.workers), split_parts, textfile.filePath)
    self.operations[textfile.id] = partition
    self._set_collect_count(textfile)
def __init__(self, modelId, runNo, filter_dics, filename, _type, splitter_type):
    """Initialise a tracker for one run of one model.

    Args:
        modelId: model identifier; part of ``run_dir``.
        runNo: run number; part of ``run_dir``.
        filter_dics: passed to ``self.prepare_filters`` to build the filters.
        filename: stored as ``self.filename``.
        _type: tracking mode, ``lazy`` or ``eager``.
        splitter_type: forwarded to ``Splitter``.
    """
    super(Tracker, self).__init__()

    self.modelId = modelId
    self.runNo = runNo
    # "<modelId>_<runNo>"
    self.run_dir = '_'.join([str(modelId), str(runNo)])
    self.filename = filename

    self.filters = self.prepare_filters(filter_dics)
    self.type = _type  # lazy or eager
    self.splitter = Splitter(splitter_type)
    self.track_data = dict()
def __init__(self):
    """Create the splitter, POS tagger and dictionary tagger.

    The dictionary tagger is given five YAML dictionaries (positive,
    negative, morePositive, moreNegative, invert).
    """
    # NOTE(review): these are absolute paths into one user's home directory;
    # they must exist on the machine running this code.
    dictionary_files = [
        '/home/msinghal/PycharmProjects/basic_sentiment_analysis/dicts/positive.yml',
        '/home/msinghal/PycharmProjects/basic_sentiment_analysis/dicts/negative.yml',
        '/home/msinghal/PycharmProjects/basic_sentiment_analysis/dicts/morePositive.yml',
        '/home/msinghal/PycharmProjects/basic_sentiment_analysis/dicts/moreNegative.yml',
        '/home/msinghal/PycharmProjects/basic_sentiment_analysis/dicts/invert.yml',
    ]

    self.splitter = Splitter()
    self.postagger = POSTagger()
    self.dicttagger = DictionaryTagger(dictionary_files)
def assemble2(self):
    """
    Builder method: build a Chain of linked Components from ``self.chain_str``.

    The chain string is consumed left to right.  Plain components are
    separated by ``|``.  A run of parenthesised sections, e.g.
    ``input_xml_file |(transformer_xslt|output_file) (output_std)``, is
    collected and turned into a single Splitter whose children are the
    components (or the first component of a sub-Chain when the section itself
    contains ``|``).  Every resulting component is appended via ``self.add``.

    :return: None
    """
    log.info('Assembling Chain: %s...' % self.chain_str)

    # Create linked list of input/filter/output (ETL Component) objects
    chain_str = self.chain_str
    # Children gathered for the Splitter that is currently being assembled
    split_comps = []
    while chain_str:
        chain_str = chain_str.strip()

        # Check and handle Splitter construct
        # e.g. input_xml_file |(transformer_xslt|output_file) (output_std) (transformer_xslt|output_std)
        if chain_str.startswith('('):
            # Cut off the first "(...)" section; the rest stays in chain_str.
            # NOTE(review): raises ValueError on unpacking if no ')' follows.
            etl_section_name, chain_str = chain_str.split(')', 1)
            etl_section_name = etl_section_name.strip('(')

            # Check for subchain (split at Filter level)
            if '|' in etl_section_name:
                # Have subchain: use Chain to assemble
                sub_chain = Chain(etl_section_name, self.config_dict)
                sub_chain.assemble2()
                child_comp = sub_chain.first_comp
            else:
                # Single component (Output) to split
                child_comp = factory.create_obj(self.config_dict, etl_section_name.strip())

            # Assemble Components (can be subchains) for Splitter later
            split_comps.append(child_comp)
            if '(' in chain_str:
                # Still components (subchains) to assemble for Splitter
                continue

        if len(split_comps) > 0:
            # Next component is Splitter with children
            etl_comp = Splitter(self.config_dict, split_comps)
            # Start with a fresh list for any later Splitter
            split_comps = []
        else:
            # "Normal" case: regular Components piped in Chain
            if '|' in chain_str:
                # More than one component in remaining Chain
                etl_section_name, chain_str = chain_str.split('|', 1)
            else:
                # Last element, we're done!
                etl_section_name = chain_str
                # A falsy chain_str ends the while loop
                chain_str = None

            # Create the ETL component by name and properties
            etl_comp = factory.create_obj(self.config_dict, etl_section_name.strip())

        # Add component to end of Chain
        self.add(etl_comp)
def __init__(self, dataset: ds.Dataset):
    """Prepare an encoder for ``dataset`` and make sure its output dir exists.

    Args:
        dataset: the dataset to encode.  Its ``str()`` is used as the dataset
            name and its ``pos_class`` attribute is stored.

    Side effects:
        Prints the dataset name and creates ``data/encoded/`` (including the
        parent ``data`` directory) if it does not exist yet.
    """
    # Loader over the whole dataset, in batches of 32.
    self.loader = Splitter(batch_size=32).get_all(dataset)
    self.dataset_name = str(dataset)
    self.features = []
    self.labels = []
    self.pos_class = dataset.pos_class
    print(f'Encoding: {self.dataset_name}')

    self.root_dir = 'data/encoded/'
    # makedirs creates missing parents too; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(self.root_dir, exist_ok=True)
def process_text(text):
    """Return the summed dictionary score of ``text``.

    The text is split into sentences of words, part-of-speech tagged,
    dictionary tagged and finally reduced with ``sum_score``.
    """
    # NOTE(review): `dicttagger` is not created here -- it must already
    # exist in the enclosing (module) scope when this is called.
    sentence_splitter = Splitter()
    pos_tagger = POSTagger()

    # Sentences -> words -> POS tags -> dictionary tags -> score
    words_by_sentence = sentence_splitter.split(text)
    pos_tagged = pos_tagger.pos_tag(words_by_sentence)
    return sum_score(dicttagger.tag(pos_tagged))
def split_by_silence(self):
    """Split the extracted audio by silence and store the segments.

    Runs a ``Splitter`` over ``self.audio_file`` and stores the result of
    ``get_segments()`` in ``self.segments``.  Segments is a list of tuples
    ``(filename, (start_time, end_time))``.

    Raises:
        Exception: if the audio has not been extracted yet
            (``self.audio_extracted`` is falsy).
    """
    # Guard clause: nothing to split before extraction.
    if not self.audio_extracted:
        raise Exception(
            "ERROR: File has not been extracted from video yet.")

    silence_splitter = Splitter(self.audio_file)
    silence_splitter.run()
    self.segments = silence_splitter.get_segments()
def attempt_dd_improvement(main_tour, best_length, other_tour, other_length):
    """Try to improve ``main_tour`` using the differences to ``other_tour``.

    The two tours are split into segments, the segments are turned into
    independent k-moves, and the positive ones are applied to ``main_tour``
    (first all at once, then one by one).  If adopting ``other_tour``
    outright yields a larger gain than the decomposed moves, ``other_tour``
    is returned instead.

    :param main_tour: current tour.
    :param best_length: length of ``main_tour``.
    :param other_tour: alternative tour to mine for improvements.
    :param other_length: length of ``other_tour``.
    :return: tuple ``(tour, length)`` of the tour that was kept.

    NOTE(review): ``xy`` is not a parameter; it is read from the enclosing
    scope.  ``negative_kmoves`` is unpacked but not used below.
    """
    segments = Splitter(main_tour, other_tour).get_segments()
    positive_kmoves, negative_kmoves = segments_to_beneficial_kmoves(xy, segments, main_tour)

    # Now that we have independent kmoves, it can be very expensive to try all possible combinations.
    # So for efficiency's sake we try high-yield (supposedly) simple combinations:
    # 1. Try all beneficial kmoves at once. Quit if succeeds (cannot get better).
    # 2. Try each beneficial kmove sequentially, starting from highest-gain.
    # 3. Exclude negative k-moves sequentially, starting from highest-loss.
    # Gain obtained by simply switching to other_tour.
    naive_gain = best_length - other_length
    if positive_kmoves:
        # Each kmove is indexed as (gain, move): k[0] is the gain, k[1] the move.
        max_positive_gain = sum([x[0] for x in positive_kmoves])
        if max_positive_gain < naive_gain:
            # Even all positive moves together cannot beat other_tour.
            print('MAX POSITIVE GAIN < NAIVE GAIN')
            main_tour = other_tour
            best_length = other_length
            return main_tour, best_length
        all_positive = combine_segment_array([x[1] for x in positive_kmoves])
        test_tour = perform_kmove(main_tour, all_positive)
        # An unchanged tour length (node count) is used as the success test.
        if len(test_tour) == len(main_tour):
            main_tour = test_tour
            print('Trying all {} positive kmoves together worked; gain: {} (naive gain: {})'.format(len(positive_kmoves), max_positive_gain, naive_gain))
            assert(max_positive_gain >= naive_gain)
            assert(max_positive_gain > 0)
            return main_tour, best_length - max_positive_gain

        # There may be cases where naive gain is more than decomposed gains:
        # decomposed gains currently only return moves that can be independently performed.
        # Infeasible moves that are improvements but only can be combined with other moves to become feasible
        # (a potentially computationally expensive search) will be excluded from the decomposed moves.
        dd_gain = 0 # gain due to decomposed kmoves.
        for k in positive_kmoves:
            print(' trying {}-opt move with gain {}'.format(len(k[1]['adds']), k[0]))
            test_tour = perform_kmove(main_tour, k[1])
            if len(test_tour) == len(main_tour):
                # Move accepted: keep the tour and account for its gain.
                main_tour = test_tour
                best_length -= k[0]
                dd_gain += k[0]
        if naive_gain > dd_gain:
            # Switching tours outright is still better than the applied moves.
            print('naive_gain ({}) greater than dd_gain ({})'.format(naive_gain, dd_gain))
            main_tour = other_tour
            best_length = other_length
        if dd_gain > 0 and dd_gain > naive_gain:
            print(' dd gain {} greater than naive gain {}'.format(dd_gain, naive_gain))
    elif naive_gain > 0:
        # No positive kmoves at all, but other_tour is shorter: adopt it.
        print('NAIVE GAIN > 0 WITH NO DD POSITIVE GAIN')
        main_tour = other_tour
        best_length = other_length
    return main_tour, best_length
def processQuestion(gloveModel, question, minLen=1, maxLen=3, useAPI=False, useSynonyms=False):
    """Turn a question into candidate parts and their embedding vectors.

    :param gloveModel: object providing ``getVector(part)`` and a
        ``gloveModel`` attribute with ``most_similar(word)``.
    :param question: the question text; a trailing ``?`` or ``.`` is removed.
    :param minLen: forwarded to ``splitter.split`` as ``min``.
    :param maxLen: forwarded to ``splitter.split`` as ``max``.
    :param useAPI: when true, parts come from ``api.getBinaryRelations``
        instead of the splitter.
    :param useSynonyms: when true, extra parts are added in which the longest
        word of a part is replaced by each of its ``most_similar`` results.
    :return: tuple ``(vectors, parts, pos, gen_question, labels,
        resultsExists)``.

    NOTE(review): ``question[-1]`` raises IndexError for an empty question.
    """
    tagger = POSTagger()
    # POS tags are computed on the question before trailing punctuation is removed
    pos = tagger.parse(question)

    # create splitter and generalizer
    splitter = Splitter()
    if question[-1] == '?' or question[-1] == '.':
        question = question[:-1]
    gen_question = splitter.generalize(question, pos)

    labels = []
    resultsExists = False
    if not useAPI:
        parts = list(splitter.split(gen_question, min=minLen, max=maxLen))
    else:
        resultsExists = True
        apiResult, _ = api.getBinaryRelations(question)
        # keep only predicates that have more than one position
        parts = [
            rel.predicate for rel in apiResult
            if len(rel.predicate_positions_) > 1
        ]

    # Build a camelCase label for every multi-word part:
    # first word unchanged, following words capitalised and concatenated.
    for part in parts:
        if len(part.split()) > 1:
            labels.append(part.split()[0] + ''.join(''.join([w[0].upper(), w[1:].lower()]) for w in part.split()[1:]))

    if useSynonyms:
        # The longest word of each part is treated as its predicate.
        predicates = [max(part.split(), key=len) for part in parts]
        if predicates is not None and len(predicates) > 0:
            for predicate in predicates:
                # Iterate over a copy because parts grows inside the loop.
                for part in list(parts):
                    if predicate in part:
                        for syn in gloveModel.gloveModel.most_similar(
                                predicate.lower()):
                            # syn[0] is used as the replacement word
                            parts.append(part.replace(predicate, syn[0]))

    # Nothing found: fall back to the splitter and report "no results".
    if len(parts) == 0:
        resultsExists = False
        parts = list(splitter.split(gen_question, min=minLen, max=maxLen))

    # create embedder part
    vectors = []
    for part in parts:
        vectors.append(gloveModel.getVector(part))
    return vectors, parts, pos, gen_question, labels, resultsExists
def __init__(self):
    """Load data and abbreviations and set up the text-processing helpers.

    Creates the normalizer, splitter, corrector and lemmatizer, defines the
    two contraction vocabularies and loads the CRF model from
    ``model.crfsuite``.
    """
    # Containers first, then the loaders (same order as always).
    self.sentences = list()
    self.abbreviation = dict()
    self.load_data()
    self.load_abbrv()

    # Processing helpers
    self.normalizer = Normalizer()
    self.splitter = Splitter()
    self.corrector = Filter()
    self.lemmatizer = WordNetLemmatizer()

    # Contractions written without their apostrophe.
    # NOTE(review): ' arent' carries a leading space -- confirm whether that
    # is intentional before changing it.
    apostrophe_less = [
        'isnt', ' arent', 'wasnt', 'werent', 'wont', 'dont', 'didnt',
        'doesnt', 'couldnt', 'shouldnt', 'hasnt', 'havent', 'hadnt',
    ]
    # The same contractions cut off before the "'t".
    truncated = [
        'isn', 'aren', 'wasn', 'weren', 'won', 'don', 'didn',
        'doesn', 'couldn', 'shouldn', 'hasn', 'haven', 'hadn',
    ]
    self.missing_apostrophe_vocab = apostrophe_less
    self.tokenizer_mistake_vocab = truncated

    self._norm = joblib.load('model.crfsuite')
def perturbed_hill_climb(xy, tour):
    """Repeatedly perturb ``tour`` and keep improvements until a target is met.

    Each iteration applies a double-bridge perturbation followed by 2-opt,
    decomposes the difference to the current tour into k-moves, applies the
    k-moves one by one, and falls back to the perturbed tour when that gives
    a larger gain.  The loop ends once the tour length is at most
    ``TARGET_LENGTH``.

    :param xy: coordinates passed through to the tour helpers.
    :param tour: starting tour.

    NOTE(review): there is no return statement, so the improved tour is not
    handed back to the caller.  ``max_gain`` is computed but not read again.
    """
    tries = 0
    success = 0
    best_length = tour_util.length(xy, tour)
    while True:
        new_tour, naive_new_length = two_opt.optimize(xy, tour_util.double_bridge(tour)) # double bridge
        #test_tour = tour[:]
        #random.shuffle(test_tour)
        #new_tour, naive_new_length = two_opt.optimize(xy, test_tour) # random restart
        segments = Splitter(tour, new_tour).get_segments()
        # NOTE(review): used here as a single list of kmoves -- verify against
        # the return shape of segments_to_beneficial_kmoves.
        kmoves = segments_to_beneficial_kmoves(xy, segments, tour)
        max_gain = 0
        if kmoves:
            max_gain = sum([k[0] for k in kmoves])
        # Gain obtained by simply switching to new_tour.
        naive_gain = best_length - naive_new_length

        # There may be cases where naive gain is more than decomposed gains:
        # decomposed gains currently only return moves that can be independently performed.
        # Infeasible moves that are improvements but only can be combined with other moves to become feasible
        # (a potentially computationally expensive search) will be excluded from the decomposed moves.
        dd_gain = 0 # gain due to decomposed kmoves.
        if kmoves:
            for k in kmoves:
                # k[0] is the gain of the move, k[1] the move itself.
                print(' trying {}-opt move with gain {}'.format(len(k[1]['adds']), k[0]))
                test_tour = perform_kmove(tour, k[1])
                # An unchanged tour length (node count) is used as the success test.
                if len(test_tour) == len(tour):
                    tour = test_tour
                    best_length -= k[0]
                    dd_gain += k[0]
        if naive_gain > dd_gain:
            # Switching tours outright beats the applied moves.
            print('naive_gain ({}) greater than dd_gain ({})'.format(naive_gain, dd_gain))
            tour = new_tour
            best_length = naive_new_length
        if naive_gain > 0 or dd_gain > 0:
            success += 1
        if dd_gain > 0 and dd_gain > naive_gain:
            print(' dd gain {} greater than naive gain {}'.format(dd_gain, naive_gain))
        tries += 1
        # Sanity check: the incrementally tracked length must equal a fresh
        # computation (exact equality).
        current_length = basic.tour_length(xy, tour)
        assert(best_length == current_length)
        if current_length <= TARGET_LENGTH:
            break
        print('current best: {} (iteration {}), improvement rate: {}'.format(best_length, tries, success / tries))
def assemble(self):
    """
    Builder method: build a Chain of linked Components.

    ``self.chain_str`` is split on ``|``.  A section containing ``+`` is
    turned into a Splitter whose children are the ``+``-separated
    components; any other section becomes a single component.  Each result
    is appended with ``self.add``.

    :return: None
    """
    log.info('Assembling Chain: %s...' % self.chain_str)

    # Create linked list of input/filter/output (ETL Component) objects
    for etl_section_name in self.chain_str.split('|'):
        if '+' not in etl_section_name:
            # Regular case: create the ETL component by name and properties
            etl_comp = factory.create_obj(self.config_dict,
                                          etl_section_name.strip())
        else:
            # Splitting outputs construct using '+'
            # TODO: may also construct combining Inputs or split to multiple
            # sub-Chains; for now only Outputs supported for splitting
            log.info('Splitting to: %s' % etl_section_name)
            child_comps = []
            for section_name in etl_section_name.split('+'):
                if '(' in section_name and ')' in section_name:
                    # "(a,b)" -> "a|b"
                    section_name = (section_name.replace(',', '|')
                                    .strip('(')
                                    .strip(')'))
                # Create the child ETL component by name and properties
                child_comps.append(
                    factory.create_obj(self.config_dict, section_name.strip()))
            etl_comp = Splitter(self.config_dict, child_comps)

        # Add component to end of Chain
        self.add(etl_comp)
def __init__(self, memoryFile):
    """Create every datapath element and wire them together.

    Args:
        memoryFile: passed to both ``DataMemory`` and ``InstructionMemory``.
    """
    # Number of clock cycles spent executing instructions
    self.nCycles = 0

    # Memories, register file and functional units
    self.dataMemory = DataMemory(memoryFile)
    self.instructionMemory = InstructionMemory(memoryFile)
    self.registerFile = RegisterFile()
    self.alu = ALU()
    self.mainControl = MainControl()
    self.splitter = Splitter()
    self.signExtender = SignExtender()
    self.andGate = AndGate()
    self.breaker = Breaker()
    self.constant4 = Constant(4)
    # self.randomControl = RandomControl()

    # Multiplexers
    self.pcMux1 = Mux()
    self.pcMux2 = Mux()
    self.regMux = Mux()
    self.aluMux = Mux()
    self.resultMux = Mux()
    self.luiMux = Mux()

    # Adders, jump address and shifters
    self.adder = Add()
    self.branchAdder = Add()
    self.jumpAddress = JMPAddress()
    self.shiftBranch = LeftShiftTwo()
    self.shiftJump = LeftShiftTwo()

    # Program counter starts at the hard coded "boot" address
    self.pc = PC(hex(0xbfc00000))

    # Ordered element list; registerFile appears twice.
    self.elements = [
        self.constant4,
        self.adder,
        self.instructionMemory,
        self.breaker,
        self.splitter,
        self.shiftJump,
        self.mainControl,
        self.regMux,
        self.signExtender,
        self.luiMux,
        self.registerFile,
        self.jumpAddress,
        self.shiftBranch,
        self.branchAdder,
        self.aluMux,
        self.alu,
        self.dataMemory,
        self.andGate,
        self.pcMux1,
        self.pcMux2,
        self.resultMux,
        self.registerFile,
        self.pc,
    ]

    self._connectCPUElements()
from splitter import Splitter


def merge_all():
    """Outer-join every CSV in ``output/`` on ``datetime`` into one file.

    The files are read in ``os.listdir`` order; the first one seeds the
    result and every further one is merged with ``how='outer'`` on the
    ``datetime`` column.  The result is written to ``health_care.csv``.

    Raises:
        FileNotFoundError: if ``output/`` contains no files (previously this
            surfaced as an unbound-variable error on ``df``).
    """
    df = None
    for fname in os.listdir('output'):
        frame = pd.read_csv(os.path.join('output', fname))
        if df is None:
            # First file seeds the merged frame.
            df = frame
        else:
            df = pd.merge(df, frame, how='outer', on='datetime')

    if df is None:
        raise FileNotFoundError("no CSV files found in 'output/'")
    df.to_csv('health_care.csv')


if __name__ == "__main__":
    print("Convert apple health care xml to csv.")
    s = Splitter()
    s.get_body_mass()
    s.get_burned_energy()
    s.get_heart_rate()
    s.get_stand_time()
    s.get_step_count()
    s.get_walking_distance()

    print("Merge all csv.")
    merge_all()
    print("Done.")
return item """ def select_item(self, splitter, user): max_item = self.actions[0] max_val = np.random.beta( max_item.successes + self.alpha, max_item.count - max_item.successes + self.beta) for item in self.actions[1:]: if not (user in splitter.train_set.keys() and item.item in splitter.train_set[user])\ or self.follow_back(splitter, item.item, user): val = np.random.beta(item.successes + self.alpha, item.count - item.successes + self.beta) if val > max_val: max_item = item max_val = val self.removed = 1 return max_item if __name__ == "__main__": from splitter import Splitter from plot import plot_results_graph import matplotlib.pyplot as plt spl = Splitter("../data/movieLens_binary_mini.dat", separator=" ") bandit = UCBBandit(spl, "mini", param=0, criteria="mean") print(len(bandit.actions)) plot_results_graph("mini", "eps") plt.show()
embeddings = emb.get_embeddings(data['title']) clustering = Clustering(data, config['Clustering']['directory'], config['Clustering']['cluster_picture_name'], config['Clustering']['result_data_file_name'], config['Clustering']['center_replics_file_name'], config['Clustering']['part_to_plot'], config['Clustering']['bgm_config']) df = clustering.get_clusters_and_final_data(embeddings) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') splitter = Splitter(df, config['Splitter']['path_to_save_data'], config['Splitter']['min_freq'], config['Splitter']['test_size'], config['Splitter']['batch_size'], device) train_iterator, test_iterator, train_data, test_data, SRC, TRG = splitter.get_iterators_and_fields( ) input_dim = len(SRC.vocab) output_dim = len(TRG.vocab) trg_pad_idx = TRG.vocab.stoi[TRG.pad_token] enc = Encoder(input_dim, config['model']['EMB_DIM'], config['model']['HID_DIM'], config['model']['ENC_LAYERS'], config['model']['ENC_KERNEL_SIZE'], config['model']['ENC_DROPOUT'], device) dec = Decoder(output_dim, config['model']['EMB_DIM'],
# -*- coding: utf-8 -*- """ Created on Sat May 25 10:06:24 2019 @author: Gerardo Cervantes """ from splitter import Splitter #For testing split keys split_keys = ['{PGUP}', '{BKSP}', '{F4}'] if __name__ == "__main__": splitter = Splitter() splitter.split('Livesplit', '{pgup}', 0)
def segment_and_pred(source_path, print_path, img_type):
    """Cut a two-column form image into rows, segment each cell and OCR it.

    The image is thresholded, two fixed column strips are cropped, text rows
    are located from the horizontal projection of the inverted image, and
    for every row both cells are written to disk, segmented with
    ``Splitter.process_by_path`` and recognised with ``chinese_ocr.pred``.

    :param source_path: path of the image to read.
    :param print_path: directory prefix under which per-row folders
        (``<i>/``, ``<i>/0/``, ``<i>/1/``) are created; it is concatenated
        as-is, so it should end with a path separator.
    :param img_type: a value containing ``'school'`` selects
        ``school_flag``/``school_attr``; anything else selects
        ``degree_flag``/``degree_attr``.
    :return: dict mapping attribute names to the recognised text.
    """
    print('Start process pic:' + source_path)
    splitter = Splitter()
    if 'school' in img_type:
        segment_flag = school_flag
        attr = school_attr
    else:
        segment_flag = degree_flag
        attr = degree_attr
    result = {}

    image_color = cv2.imread(source_path)
    image = cv2.cvtColor(image_color, cv2.COLOR_BGR2GRAY)
    # Fixed threshold at 127 (despite the name this is not adaptive thresholding).
    ret, adaptive_threshold = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)
    splitter.show_img('adaptive_threshold', adaptive_threshold)
    # Inverted binary image, used for the row projection below.
    ret, at = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY_INV)
    splitter.show_img('at', at)

    # Fixed column strips for the two value columns.
    # NOTE(review): the row slice is bounded by shape[1] and the "_w" values
    # are taken from shape[0]; confirm these axes are the intended ones.
    first_column_img = adaptive_threshold[0:image_color.shape[1], 120:350]
    second_column_img = adaptive_threshold[0:image_color.shape[1], 400:659]
    first_column_img_w = first_column_img.shape[0]
    second_column_img_w = second_column_img.shape[0]

    # Compute the indices of wrapped (continuation) content
    horizontal_sum = np.sum(at, axis=1)
    peek_ranges = splitter.extract_peek_ranges_from_array(horizontal_sum)
    last_pr = None
    # Rows that are a continuation of the previous row (content wraps)
    line_feed_index_dict = {}
    # Rows preceded by an empty row (content is empty)
    line_empty_index_dict = {}
    j = 0
    k = 0
    for i, pr in enumerate(peek_ranges):
        # Gap to the previous range: < 15 is treated as a wrapped line,
        # >= 42 as a skipped (empty) row.
        if last_pr is not None and pr[0] - last_pr[1] < 15:
            line_feed_index_dict[i] = j
            j += 1
        elif last_pr is not None and pr[0] - last_pr[1] >= 42:
            line_empty_index_dict[i] = k
            k += 1
        last_pr = pr

    # Number of wrapped rows seen so far
    line_feed_count = 0
    # Number of empty rows seen so far
    line_empty_count = 0
    i = 0
    while i < (len(segment_flag) + len(line_feed_index_dict)):
        if i in line_feed_index_dict:
            line_feed_count += 1
        elif i in line_empty_index_dict:
            # Empty row: record empty strings for both cells and move on.
            line_empty_count += 1
            result[attr[i - line_feed_count][0]] = ''
            result[attr[i - line_feed_count][1]] = ''
            i += 1
            continue

        # ---- first column cell ----
        tmp1 = first_column_img[
            peek_ranges[i - line_empty_count][0]:peek_ranges[i - line_empty_count][1],
            0:first_column_img_w - 1]
        splitter.show_img('first image', tmp1)
        kv0_path = print_path + str(i) + '/'
        if not os.path.exists(kv0_path):
            os.makedirs(kv0_path)
        cv2.imwrite(kv0_path + 'kv0.png', tmp1)
        # cv2.waitKey(0)
        # The flag selects the minimum range passed to the segmenter.
        if segment_flag[i - line_feed_count][0] == 1:
            min_width = 12
        else:
            min_width = 3
        kv0_print_path = print_path + str(i) + '/0/'
        # Start from an empty output folder.
        if os.path.exists(kv0_print_path):
            shutil.rmtree(kv0_print_path)
        if not os.path.exists(kv0_print_path):
            os.makedirs(kv0_print_path)
        splitter.process_by_path(kv0_path + 'kv0.png', kv0_print_path,
                                 minimun_range=min_width)
        attr_name = attr[i - line_feed_count][0]
        if attr_name != '':
            pred_result, pred_val_list = chinese_ocr.pred(kv0_print_path)
            if resegment(pred_val_list):
                # Segment again, this time passing the prediction values along.
                print('re segment:' + kv0_print_path)
                shutil.rmtree(kv0_print_path)
                os.makedirs(kv0_print_path)
                splitter.process_by_path(kv0_path + 'kv0.png', kv0_print_path,
                                         minimun_range=min_width,
                                         pred_val_list=pred_val_list)
                pred_result, pred_val_list = chinese_ocr.pred(kv0_print_path)
            if attr_name in result:
                # Content wraps: append to what was already recognised
                result[attr_name] = result[attr_name] + pred_result
            else:
                result[attr_name] = pred_result

        # ---- second column cell ----
        tmp2 = second_column_img[
            peek_ranges[i - line_empty_count][0]:peek_ranges[i - line_empty_count][1],
            0:second_column_img_w - 1]
        splitter.show_img('second image', tmp2)
        kv1_path = print_path + str(i) + '/'
        cv2.imwrite(kv1_path + 'kv1.png', tmp2)
        if segment_flag[i - line_feed_count][1] == 1:
            min_width = 12
        else:
            min_width = 3
        kv1_print_path = print_path + str(i) + '/1/'
        # Start from an empty output folder.
        if os.path.exists(kv1_print_path):
            shutil.rmtree(kv1_print_path)
        if not os.path.exists(kv1_print_path):
            os.makedirs(kv1_print_path)
        splitter.process_by_path(kv1_path + 'kv1.png', kv1_print_path,
                                 minimun_range=min_width)
        attr_name = attr[i - line_feed_count][1]
        if attr_name != '':
            pred_result, pred_val_list = chinese_ocr.pred(kv1_print_path)
            if resegment(pred_val_list):
                # Segment again, this time passing the prediction values along.
                print('re segment:' + kv1_print_path)
                shutil.rmtree(kv1_print_path)
                os.makedirs(kv1_print_path)
                splitter.process_by_path(kv1_path + 'kv1.png', kv1_print_path,
                                         minimun_range=min_width,
                                         pred_val_list=pred_val_list)
                pred_result, pred_val_list = chinese_ocr.pred(kv1_print_path)
            if attr_name in result:
                # Content wraps: append to what was already recognised
                result[attr_name] = result[attr_name] + pred_result
            else:
                result[attr_name] = pred_result
        i += 1
    return result
parameters['language'], parameters['path_to_fmridata'], input_path, logger=logs, **kwargs) logs.validate() logs.info("Retrieve arguments for each model...") kwargs_splitter = get_splitter_information(parameters) kwargs_compression = get_compression_information(parameters) kwargs_transformation = get_data_transformation_information(parameters) kwargs_estimator_model = get_estimator_model_information(parameters) logs.validate() logs.info("Instanciations of the classes...") splitter = Splitter(**kwargs_splitter) compressor = Compressor(**kwargs_compression) transformer = Transformer(**kwargs_transformation) estimator_model = EstimatorModel(**kwargs_estimator_model) logs.validate() logs.info("Defining Pipeline flow...") ## Pipeline splitter_cv_external = Task([splitter.split], name='splitter_cv_external') compressor_external = Task([compressor.compress], input_dependencies=[splitter_cv_external], name='compressor_external', flatten_inputs=[True]) transform_data_external = Task( [transformer.make_regressor, transformer.scale], input_dependencies=[splitter_cv_external, compressor_external],
from main import Main
from data_set import Data_Set
from dummy_master import Dummy_Master
from regressor import Regressor
from metrics import Metrics
from back_elimination import Back_Eliminations
from set_reader import Set_Reader
from splitter import Splitter
from plot import Plot
from process_data import Pre_Process_Data
# NOTE(review): the original statement `import visual-python` is a
# SyntaxError (a hyphen is not allowed in a module name) and kept the whole
# script from compiling.  It is disabled here; if that package is really
# needed, load it with importlib.import_module("visual-python").
# import visual-python

# Script: instantiate the pipeline helpers, read the train/test sets and
# plot the survival cut.
m = Main('init')
r = Regressor()
sp = Splitter()
mt = Metrics()
m.print()
be = Back_Eliminations()
pd = Pre_Process_Data()

sr = Set_Reader()
sr.read_files()
# sr.print_files_shapes()
train = sr.get_train()
test = sr.get_test()

ploter = Plot()
ploter.cut_survived(train, test)
# ploter.plot_set_survived(sr.get_train(), "Sex", "Survived")
# ploter.plot_set_survived(sr.get_train(), "Pclass" ,"Survived")