def process_multiple(log, do_fetch=True, do_parse=True, do_merge=True):
    root = config["data-dir"]
    if do_fetch:
        tokens = Tokens()
        api = API(tokens, log)
        util.delete_files(root + '/processing/invoices', '*.json')
        success, invoice_cnt = api.fetch_invoice_details(hours_delta=30, tz_offset=7)
        if success and invoice_cnt > 0:
            log.write(
                "INFO api invoices extraction succeeded {:,} invoices saved to: {}"
                .format(invoice_cnt, '/processing/invoices'))
        elif success and invoice_cnt == 0:
            log.write(
                "INFO api no invoices extracted (no new/updated invoices in refresh period)")
            return True
        else:
            log.write(
                "ERROR api invoices extraction failed {:,} invoices saved to: {}"
                .format(invoice_cnt, '/processing/invoices'))
            return False
    if do_parse:
        util.delete_files(root + '/processing/invoices', '*.csv')
        parser = Parser(log)
        parser.parse('invoices-line-items')
    if do_merge:
        merger = Merger(log)
        merger.merge_invoice_delta()
    return True
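# A minimal sketch of how process_multiple above might be driven. The config
# bootstrap and the use of a plain file object as `log` are assumptions; the
# real module presumably supplies its own config loader and log object.
if __name__ == '__main__':
    config = {"data-dir": "/var/data"}       # assumed shape of the config mapping
    with open('process.log', 'a') as log:    # any object with .write() works here
        ok = process_multiple(log, do_fetch=True, do_parse=True, do_merge=True)
        print('pipeline succeeded' if ok else 'pipeline failed')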
def merge_videos():
    global mer
    path_name_seg = []
    for i in all_filename:
        path_name_seg.append(
            os.path.join(target_path, sel_res.getVideoTitle(), i))
    mer = Merger(
        unicode(
            os.path.join(
                target_path,
                sel_res.getVideoTitle() + '.' + sel_res.getFileFormat())),
        path_name_seg)
    gui.frame_main.initTotal_Merge(len(all_filename))
    mer.start()
    while True:
        gui.frame_main.updateMerge(mer.now)
        time.sleep(0.05)
        if mer.now == mer.sum:
            gui.frame_main.updateMerge(mer.now)
            break
    with open('config.ini', 'wb') as f:
        save_configure()
    dlg = wx.MessageDialog(gui.frame_main,
                           u'Merging is complete. Delete the segment files?',
                           u'Notice', wx.YES_NO | wx.ICON_QUESTION)
    if dlg.ShowModal() == wx.ID_YES:
        del_seg_video()
        dlg = wx.MessageDialog(gui.frame_main, u'Segment files deleted.',
                               u'Notice', wx.OK | wx.ICON_QUESTION)
        dlg.ShowModal()
def test(test_case):
    merger = Merger(test_case)
    # codes = merger.get_codes(3)
    for k in range(10, 9, -1):
        result = merger.merge_result(k)
        value_sum = sum(result.values())
        print 'k =', k, result, ' \tMerged codes:', len(result), ' \tTotal value:', value_sum
def __init__(self, indent=None, to_explore=False):
    self.indent = indent
    self.indent_children = None
    self.content = []
    self.parent = None
    self.to_explore = to_explore
    self.merger = Merger()
    self.padding = None
    self.sf = None
    self.sc = None
def ProcessRequest(file):
    name = str(uuid4())
    base_file_name = "%s-%s" % (name, secure_filename(file.filename))
    file_name = "tmp/%s" % base_file_name
    print(file_name)
    file.save(file_name)
    with ZipFile(file_name, 'r') as zipObj:
        zipObj.extractall("tmp/%s" % name)
    Merger("tmp/%s" % name, os.path.realpath("tmp/combined-%s.ics" % name))
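# ProcessRequest above matches Flask's uploaded-file API (werkzeug's
# secure_filename, FileStorage.save), so a route like the following could feed
# it. The app object, endpoint, and form-field name are assumptions for
# illustration, not part of the original module.
from flask import Flask, request

app = Flask(__name__)

@app.route('/upload', methods=['POST'])
def upload():
    # Hand the uploaded zip of .ics files to the merger.
    ProcessRequest(request.files['archive'])
    return 'merged', 200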
def __init__(self, cldict, sampd):
    self.cldict = cldict
    self.sampd = sampd
    self.mergo = Merger(cldict, sampd)
    self.meto = Metrics(cldict)
    lbwao = None
    lref_acc_str = sampd.ref_acc_str
    if lref_acc_str != "none":
        lbwao = AlignerBwa(cldict, sampd)
    self.bwao = lbwao
    self.samfco = SamFC(cldict, sampd)
def merge(self, corpus_size):
    """Merge all the data in the posting files using the BSBI algorithm."""
    docs_file = self.get_docs_file()
    for key in self.postings_data:
        if os.listdir(self.postings_data[key]['path']):  # directory is not empty
            merger = Merger(self.postings_data[key]['path'], "pkl", docs_file, corpus_size)
            merger.merge(self.postings_data[key]['name'])
    # The merger updates the docs data. After all the letters have been merged,
    # all the document data is up to date and needs to be saved to disk to
    # reduce the memory load.
    utils.save_obj(docs_file, f"{self.posting_dir_path}\\docs\\docs_index")
def get_IMPA_Merger(name):
    imp = iMPA(name)
    terc = imp.terc
    data = imp.getAddresses()
    s = min(map(lambda x: x.center.y, data))
    w = min(map(lambda x: x.center.x, data))
    n = max(map(lambda x: x.center.y, data))
    e = max(map(lambda x: x.center.x, data))
    addr = getAddresses(map(str, (s, w, n, e)))
    m = Merger(data, addr, terc)
    m.post_func.append(m.merge_addresses)
    m.merge()
    return m
def merge_videos():
    _, res = iqiyi.getLastRes()
    path_name_seg = []
    for i in all_filename:
        path_name_seg.append(os.path.join(video_title, i))
    mer = Merger(
        unicode(os.path.join(target_path, video_title + '.' + res[sel_bid]['ff'])),
        path_name_seg)
    gui.frame_main.initTotal_Merge(len(all_filename))
    mer.start()
    while True:
        gui.frame_main.updateMerge(mer.now)
        time.sleep(0.01)
        if mer.now == mer.sum:
            gui.frame_main.updateMerge(mer.now)
            break
    del_seg_video()
def __init__(self, visualizer=None, speaker_recognition=False):
    self.merger_to_main_queue = Queue(maxsize=1000)  # very roughly 30 sec
    self.merger = Merger(self.merger_to_main_queue)
    if visualizer is None:
        self.visualization = False
    else:
        self.visualization = True
        self.main_to_vis_queue = Queue(maxsize=50)
        self.visualizer = visualizer(self.main_to_vis_queue)
    self.speakers = {}
    self.num_speakers = 0
    self.stt = T2t_stt()
    self.speaker_recognition = speaker_recognition
    # if self.speaker_recognition:
    #     self.sr = Speaker_recognition()
    self.text_queue = mult_Queue()
    self.bing_allowed = False
def merge(self):
    # self.text_dst.config(state='normal')
    text = self.text_src.get('1.0', END)
    # print text.encode('utf-8')
    codes2num = decode(text)
    # print codes2num
    self.merger = Merger(codes2num)
    self.text_dst.delete('1.0', END)
    result_text = ''
    for k in range(10, 3, -1):
        result_text += 'Max length ' + str(k) + ' '
        result_text += encode(self.merger.merge_result(k))
    # print result_text
    self.text_dst.insert(END, result_text)
def __init__(self, cldict, sampd):
    self.cldict = cldict
    self.sampd = sampd
    self.mergo = Merger(cldict, sampd)
    self.meto = Metrics(cldict)
    lbwao = None
    lbbmapo = None
    lref_acc_str = sampd.ref_acc_str
    lhost_ref_str = sampd.host_ref_str
    if lref_acc_str != "none":
        lbwao = AlignerBwa(cldict, sampd)
    if lhost_ref_str != "none":
        lbbmapo = AlignerBBMap(cldict, sampd)
    self.bwao = lbwao
    self.bbmapo = lbbmapo
    self.samfco = SamFC(cldict, sampd)
    self.jlco = CounterJL(cldict, sampd)
    print("Created JLCounter object")
def main():
    scrapper = Scrapper()
    merger = Merger()
    parser = Parser()
    client = MongoClient('localhost', 27017)
    db = client['Data']
    collection_socialmedia = db['socialmedia']
    # Begin real-time collecting
    while True:
        scrapper.scrap()
        merger.main()
        parser.main()
        sleep(3600)
        # Store to MongoDB
        with open('/home/sartharion/Bureau/stage/POO/data.json', 'r') as f:
            file_data = json.load(f)
        collection_socialmedia.delete_many({})
        collection_socialmedia.insert_many(file_data)
    client.close()
def __init__(self, args):
    self.config_log_file = args.config_log_file
    self.sample_id = args.sample_id
    self.project_id = args.project_id
    self.prefix_set = args.prefix_set
    self.bc_set = args.bc_set

    cldict_d = yaml.load(open(self.config_log_file))
    cldict = DictMap(cldict_d)
    self.cldict = cldict

    sampd = dict()
    sampd['sample_id'] = self.sample_id
    sampd['project_id'] = self.project_id
    sampd['prefix_set'] = self.prefix_set
    sampd['bc_set'] = self.bc_set
    sampd_map = DictMap(sampd)
    self.sampd = sampd_map

    mergo = Merger(cldict, sampd_map)
    self.mergo = mergo
def get_history(id: ObjectId, num_changes: int = None):
    hist = history.find({'ref': id}).sort('_id', direction=pymongo.DESCENDING)
    curr = data.find_one({'_id': id})
    yield curr
    prev = curr
    count = 0
    merger = Merger()
    for d in hist:
        if num_changes and count == num_changes:
            break
        d['ref_creation_time'] = d['_id'].generation_time
        del d['_id']
        del d['ref']
        l: dict = copy.deepcopy(prev)
        merger.merge_changes(l, d)
        yield l
        prev = l
        count += 1
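# A usage sketch for the get_history generator above, assuming `data` and
# `history` are the pymongo collections the module reads from. Picking an
# arbitrary document id purely for illustration; the generator yields the
# current document first, then progressively older reconstructed states.
doc_id = data.find_one()['_id']
for state in get_history(doc_id, num_changes=3):
    print(state)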
def __call__(self, full_table, bad_tables, good_tables, **kwargs):
    """
    table has been trimmed of extraneous columns.
    """
    self.setup_tables(full_table, bad_tables, good_tables, **kwargs)

    self.SCORE_ID = add_meta_column(
        chain(self.bad_tables, self.good_tables),
        SCORE_VAR)
    self.CLASS_ID = add_meta_column(
        chain(self.bad_tables, self.good_tables),
        "INFCLASS",
        vals=['0', '1'])

    start = time.time()
    self.compute_perrow_influences(self.bad_tables, self.bad_err_funcs)
    self.compute_perrow_influences(self.good_tables, self.good_err_funcs)
    self.cost_compute_inf = time.time() - start

    start = time.time()
    if self.tree_alg == 'c45':
        table, rules = self.c45_rules()
    elif self.tree_alg == 'or':
        table, rules = self.orange_dt_rules()
    elif self.tree_alg == 'dt':
        table, rules = self.sk_dt_rules(max_depth=12)
    elif self.tree_alg == 'rt':
        table, rules = self.sk_rt_rules(max_depth=12)
    else:
        _logger.warn("unknown NDT algorithm %s. Defaulting to regression tree",
                     self.tree_alg)
        table, rules = self.sk_rt_rules(max_depth=12)
    self.cost_learn = time.time() - start

    #
    # ok now convert rules to clusters
    #
    _logger.debug("got %d rules", len(rules))
    fill_in_rules(rules, table, cols=self.cols)
    self.cost_learn = time.time() - start

    clusters = [Cluster.from_rule(rule, self.cols) for rule in rules]
    for cluster in clusters:
        cluster.error = self.influence_cluster(cluster)
    clusters = filter_bad_clusters(clusters)
    clusters.sort(key=lambda c: c.error, reverse=True)
    print '\n'.join(map(str, clusters[:5]))

    self.all_clusters = self.final_clusters = clusters
    return self.final_clusters

    # NOTE: the early return above makes everything below unreachable;
    # the merging step is effectively disabled.
    #
    # merge the clusters
    #
    thresh = compute_clusters_threshold(clusters, nstds=1.5)
    is_mergable = lambda c: c.error >= thresh

    params = dict(kwargs)
    params.update({
        'cols': self.cols,
        'err_func': self.err_func,
        'influence': lambda c: self.influence_cluster(c),
        'influence_components': lambda c: self.influence_cluster_components(c),
        'is_mergable': is_mergable,
        'use_mtuples': False,
        'learner': self
    })
    self.merger = Merger(**params)
    merged_clusters = self.merger(clusters)
    merged_clusters.sort(key=lambda c: c.error, reverse=True)

    clusters.extend(merged_clusters)
    normalize_cluster_errors(clusters)
    clusters = list(set(clusters))
    self.all_clusters = clusters
    self.final_clusters = merged_clusters

    self.costs = {'cost_learn': self.cost_learn}
    return self.final_clusters
def assemble(self):
    """
    Builder method: build a Chain of linked Components
    :return:
    """
    log.info('Assembling Chain: %s...' % self.chain_str)

    # Create linked list of input/filter/output (ETL Component) objects
    chain_str = self.chain_str
    sub_comps = []
    while chain_str:
        chain_str = chain_str.strip()

        # Check and handle Splitter construct
        # e.g. input_xml_file |(transformer_xslt|output_file) (output_std) (transformer_xslt|output_std)
        if chain_str.startswith('('):
            etl_section_name, chain_str = chain_str.split(')', 1)
            etl_section_name = etl_section_name.strip('(')

            # Check for subchain (split at Filter level)
            if '|' in etl_section_name:
                # Have subchain: use Chain to assemble
                sub_chain = Chain(etl_section_name, self.config_dict)
                sub_chain.assemble()
                child_comp = sub_chain.first_comp
            else:
                # Single component (Output) to split
                child_comp = factory.create_obj(self.config_dict, etl_section_name.strip())

            # Assemble Components (can be subchains) for Splitter later
            sub_comps.append(child_comp)

            if '(' in chain_str:
                # Still components (subchains) to assemble for Splitter
                continue

        if len(sub_comps) > 0:
            if chain_str.startswith('|'):
                # Next component is Merger with children
                etl_comp = Merger(self.config_dict, sub_comps)
                dummy, chain_str = chain_str.split('|', 1)
            else:
                # Next component is Splitter with children
                etl_comp = Splitter(self.config_dict, sub_comps)
            sub_comps = []
        else:
            # "Normal" case: regular Components piped in Chain
            if '|' in chain_str:
                # More than one component in remaining Chain
                etl_section_name, chain_str = chain_str.split('|', 1)
            else:
                # Last element, we're done!
                etl_section_name = chain_str
                chain_str = None

            # Create the ETL component by name and properties
            etl_comp = factory.create_obj(self.config_dict, etl_section_name.strip())

        # Add component to end of Chain
        self.add(etl_comp)
TEST_CASES = [
    ['012', '013', '023', '123'],
    ['012', '013', '023', '124', '134', '234'],
    ['012', '013', '023', '123', '124', '134', '234'],
    ['012', '013', '023', '123', '123', '124', '134', '234'],
    ['012', '013', '014', '023', '024', '034', '123', '124', '134', '234'],
    ['012', '023', '013', '123', '123', '234', '134', '124', '125', '127',
     '157', '257', '125', '127', '157'],
    ['012', '023', '013', '123', '123', '234', '134', '124', '125', '125',
     '127', '157', '257', '125', '127', '157'],
]

codes = Merger([]).get_codes(3)

case = TEST_CASES[0]
# case = [to_string(random.choice(codes)) for dummy_i in range(10000)]
# case = TEST_CASES[3] * 10
# case = TEST_CASES[0] + TEST_CASES[4]
# case = TEST_CASES[6]

value = 0
for code, num in case.items():
    print code, num,
    value += num
print
print value

print str(timeit.timeit('test(case)', 'from __main__ import test, case', number=1)) + 's used.'
# print len(" sdsd\n\n \t".strip())
parser = OptionParser(version="%prog " + __VERSION__, usage=usage, description=banner)
parser.add_option("--dir", "-d",
                  action="store", type="string", dest="dir",
                  help="Files match (Default: *.ics)", default="*.ics")
parser.add_option("--ical", "-i",
                  action="store", type="string", dest="icalfile",
                  help="iCalendar file output")
(options, args) = parser.parse_args()

if options.icalfile == "":
    options.icalfile = None
if options.icalfile is not None:
    options.icalfile = os.path.realpath(options.icalfile)
    Merger(options.dir, options.icalfile)
    sys.exit(0)
sys.exit(1)
def test_run(self):
    merger = Merger()
    actual = merger.run(os.path.join('test', 'data', 'cfg.csv'))
    self.assertIsNotNone(actual)
    self.assertEqual(3, actual.shape[1])
from tkinter import END, Tk, Label, Frame, Text, SUNKEN, Button, TRUE, TOP, Listbox, SINGLE, Scrollbar, DISABLED, Checkbutton, BooleanVar
from tkinter.filedialog import askopenfilenames, askopenfilename
from pandastable import Table, TableModel

import detector
from exporter_GUI import ExporterGUI
from importer import Importer
from merger import Merger

importer = Importer()
merger = Merger(importer)


class ImporterGUI:
    """
    This class is the main GUI. It shows the import window and contains the
    main methods, which use the other helper classes.

    Attributes:
        root: the root Tk instance
        previewFrame: the frame that shows the preview of the dataframe
        XMLList: a 2D list that holds the XML filepath at index 0 and the XSL filepath at index 1
    """

    def __init__(self, root: Tk):
        self.root = root
        self.previewFrame = Frame(root)
        self.pt = Table(self.previewFrame)
        self.dialect = detector.Dialect()
        self.XMLList = []
        self.DialectList = []
        self.hasHeaderVar = BooleanVar()
        self.currentPath = ""
def QC_germline(self):
    # Use the sample_status here to not re-run the QC and to not overwrite the run
    # status. The 'sample_status' should be reset to 'pushed' when new runs are pushed.
    #if self.sample_json['sample_status'] != 'pending_merge' and self.sample_json['sample_status'] != 'pending_3x3_review' and self.sample_json['sample_status'] != 'merged':
    # If the user specified the '--pass_fail' option, then still run this part.
    if self.sample_json['sample_status'] == 'pushed' or self.options.pass_fail or self.options.qc_all:
        # QC the normal runs with each other
        self.QC_runs(self.sample_json['runs'])

        # What if there is only one run that passes all of the metrics? It should be
        # marked as the 'final_json' and have the 'pass_fail_merged' flag marked as pass.
        # Make the merger
        merger = Merger(self.sample_json, self.options.recalc_3x3_tables)
        # Check to see if the normal runs are ready to be merged.
        self.sample_json, merge = merger.check_merge(self.sample_json['runs'])
        if merge != True:
            if 'final_json' in self.sample_json:
                # update the final run status
                merger.update_merged_run_status(self.sample_json['final_json'])
        elif merge == True:
            # Merge the normal and/or tumor runs. Will only merge the passing runs with each other.
            self.sample_json = merger.merge_runs('germline')
            # update the merged run status
            merger.update_merged_run_status(self.sample_json['merged_json'])
            if json.load(open(self.sample_json['merged_json']))['pass_fail_merged_status'] == 'pass':
                # Set the sample_status
                self.sample_json['sample_status'] = 'merged_pass'
                # cleanup the individual run bam files
                self.cleanup_sample.cleanup_runs(
                    self.sample_json['runs'],
                    self.sample_json['analysis']['settings']['cleanup'],
                    self.no_errors)
                # Cleanup the merged dir
                self.cleanup_sample.cleanup_runs(
                    [self.sample_json['merged_json']],
                    self.sample_json['analysis']['settings']['cleanup'],
                    self.no_errors)
            else:
                self.sample_json['sample_status'] = 'awaiting_more_sequencing'

        # Copy the final run's VCF file to the final_dir if it passes the "merged" coverage flag.
        if 'final_json' in self.sample_json:
            final_json = json.load(open(self.sample_json['final_json']))
            if final_json['pass_fail_merged_status'] == 'pass':
                final_vcf = glob.glob("%s/*.vcf" % final_json['run_folder'])[0]
                final_project_dir = "/home/ionadmin/jeff/%s_Final_VCFs" % (self.sample_json['project'])
                print "copying %s to %s" % (final_vcf, final_project_dir)
                # Check to make sure the final dir exists.
                if not os.path.isdir(final_project_dir):
                    os.mkdir(final_project_dir)
                shutil.copy(final_vcf, "%s/%s.vcf" % (final_project_dir, self.sample_json['sample_name']))
                # Now push the sample to S3 storage.
                if self.sample_json['project'] == 'Einstein':
                    print "pushing %s to amazon s3 storage" % self.sample_json['sample_name']
                    self.push_sample_to_s3(final_json)
def QC_tumor_normal(self):
    # Separate the runs into tumor and normal lists
    normal_runs, tumor_runs = self.getTumor_Normal()

    if self.sample_json['analysis']['settings']['type'] == 'all_tumor_normal':
        # Use the sample_status here to not re-run the QC and to not overwrite the run
        # status. The 'sample_status' should be reset to 'pushed' when new runs are pushed.
        #if self.sample_json['sample_status'] != 'pending_merge' and self.sample_json['sample_status'] != 'pending_3x3_review' and self.sample_json['sample_status'] != 'merged':
        # If the user specified the '--pass_fail' option, then still run this part.
        if self.sample_json['sample_status'] == 'pushed' or self.options.pass_fail or self.options.qc_all:
            # QC the normal or tumor runs with each other
            self.QC_runs(normal_runs, 'normal_')
            self.QC_runs(tumor_runs, 'tumor_')
            # now QC the tumor and normal runs together
            self.QC_normal_tumor_runs(normal_runs, tumor_runs)

        # make the merger
        merger = Merger(self.sample_json, self.options.recalc_3x3_tables)
        # Check to see if the normal runs are ready to be merged.
        self.sample_json, merge_normal = merger.check_merge(normal_runs, 'Normal/', 'normal_')
        if merge_normal == True:
            # Merge the normal and/or tumor runs. Will only merge the passing runs with each other.
            self.sample_json = merger.merge_runs('normal', 'Normal_', 'normal_')
        # Check to see if the tumor runs are ready to be merged.
        self.sample_json, merge_tumor = merger.check_merge(tumor_runs, 'Tumor/', 'tumor_')
        if merge_tumor == True:
            self.sample_json = merger.merge_runs('tumor', 'Tumor_', 'tumor_')

        # If any runs were merged, QC them. If there are only 1 normal and tumor run,
        # they won't be QCd again.
        #if normal_merge_dir != '' or tumor_merge_dir != '' or (len(normal_passing_bams) == 1 and len(tumor_passing_bams) == 1):
        # Now QC the tumor and normal merged bams together if both normal and tumor runs are ready.
        # To only QC all for the actual merged runs (PNET), change the 'final' part to 'merged'.
        # The 'final_normal_json' and 'final_tumor_json' flags are set by merger.py in the function check_merge, line 157.
        #if (merge_normal or merge_tumor) and ('merged_normal_json' in self.sample_json and 'merged_tumor_json' in self.sample_json):
        if 'final_normal_json' in self.sample_json and 'final_tumor_json' in self.sample_json:
            self.sample_json, qc_json = self.qc_run.QC_2Runs(
                self.sample_json, self.sample_json['final_normal_json'],
                self.sample_json['final_tumor_json'], 'normal_', 'tumor_', '_merged')
            self.sample_json, merged_perc_avail_bases = self.qc_run.update_3x3_runs_status(
                self.sample_json, self.sample_json['final_normal_json'],
                self.sample_json['final_tumor_json'], qc_json)
            # update the merged run status
            merger.update_merged_run_status(self.sample_json['final_normal_json'], merged_perc_avail_bases)
            merger.update_merged_run_status(self.sample_json['final_tumor_json'], merged_perc_avail_bases)

            # cleanup the individual run bam files
            if merged_perc_avail_bases > .9:
                final_qc_dir = "%s/all%svs%s" % (
                    self.sample_json['qc_folder'],
                    json.load(open(self.sample_json['final_normal_json']))['run_name'],
                    json.load(open(self.sample_json['final_tumor_json']))['run_name'])
                # annotate the final somatic variants
                command = "bash %s/Somatic_Variants/somatic_variants.sh %s %s %s" % (
                    self.sample_json['analysis']['software_directory'], final_qc_dir,
                    self.sample_json['sample_name'],
                    self.sample_json['analysis']['software_directory'])
                if runCommandLine(command) != 0:
                    sys.stderr.write("ERROR: somatic annotation failed!\n")

                # Cleanup the PTRIM.bam and chr bam files after all of the QC is done.
                # Are there any other files to clean up?
                self.cleanup_sample.cleanup_runs(
                    self.sample_json['runs'],
                    self.sample_json['analysis']['settings']['cleanup'],
                    self.no_errors)
                #self.cleanup_sample.delete_runs(runs, self.sample_json['analysis']['settings']['cleanup'], self.no_errors)
                # Cleanup after the merging QC is done.
                self.cleanup_sample.cleanup_runs(
                    [self.sample_json['final_normal_json'], self.sample_json['final_tumor_json']],
                    self.sample_json['analysis']['settings']['cleanup'],
                    self.no_errors)
                # Set the sample_status
                self.sample_json['sample_status'] = 'merged_pass'
            else:
                self.sample_json['sample_status'] = 'awaiting_more_sequencing'
def my_fit(self, Xs, y, time_ramain, X_test):
    np.random.seed(CONSTANT.SEED)

    split = CONSTANT.SPLIT
    self.split = split
    log(f'split {split}')

    if split == -1:
        config = Config(time.time(), self.info['time_budget'])
        X_test.index = -X_test.index - 1

        main_shape = Xs[CONSTANT.MAIN_TABLE_NAME].shape[0]
        main_max_shape = 2888888
        main_min_shape = min(main_shape, 100000)
        test_shape = X_test.shape[0]
        max_accept_shape = 3999999
        if main_shape + test_shape > max_accept_shape:
            sample_main_shape = max_accept_shape - test_shape
            if sample_main_shape > main_max_shape:
                sample_main_shape = main_max_shape
            if sample_main_shape < main_min_shape:
                sample_main_shape = main_min_shape
            log(f'start sample main table. origin main shape {main_shape} test shape {test_shape} sample rows num {sample_main_shape}')
            if 'time_col' in self.info:
                key_time_col = self.info['time_col']
                if key_time_col in Xs[CONSTANT.MAIN_TABLE_NAME].columns:
                    Xs[CONSTANT.MAIN_TABLE_NAME].sort_values(by=key_time_col, inplace=True)
            Xs[CONSTANT.MAIN_TABLE_NAME] = Xs[CONSTANT.MAIN_TABLE_NAME].iloc[-sample_main_shape:]
            gc.collect()

        Xs[CONSTANT.MAIN_TABLE_NAME] = pd.concat([Xs[CONSTANT.MAIN_TABLE_NAME], X_test])
        X_test.drop(X_test.columns, axis=1, inplace=True)
        gc.collect()

        graph = Graph(self.info, Xs)
        graph.sort_tables()

        train_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[Xs[CONSTANT.MAIN_TABLE_NAME].index >= 0]
        y = y.loc[train_index]
        test_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[Xs[CONSTANT.MAIN_TABLE_NAME].index < 0]

        graph.preprocess_fit_transform()
        gc.collect()

        merge_feat_pipeline = DeafultMergeFeatPipeline()
        merger = Merger(merge_feat_pipeline)
        merger.merge_table(graph)
        main_table = merger.merge_to_main_fit_transform(graph)
        self.release_tables(Xs, graph)
        del merger
        del graph
        gc.collect()

        feat_pipeline = DefaultFeatPipeline()
        feat_engine = FeatEngine(feat_pipeline, config)
        feat_engine.fit_transform_order1(main_table, y)

        sample_for_combine_features = True
        if sample_for_combine_features:
            main_data = main_table.data
            train_data = main_data.loc[main_data.index >= 0]
            del main_data
            sample_num = CONSTANT.SAMPLE_NUM
            train_shape = train_data.shape
            if train_shape[0] <= sample_num:
                sample_for_combine_features = False
            else:
                data_tail_new = train_data.iloc[-sample_num:]
                gc.collect()
                y_tail_new = y.loc[data_tail_new.index]
                table_tail_new = copy.deepcopy(main_table)
                table_tail_new.data = data_tail_new
                del data_tail_new
                gc.collect()
                feat_engine.fit_transform_all_order2(table_tail_new, y_tail_new, sample=True)
                feat_engine.fit_transform_keys_order2(table_tail_new, y_tail_new, sample=True)
                del table_tail_new, y_tail_new
                gc.collect()
                feat_engine.fit_transform_all_order2(main_table, y, selection=False)
                feat_engine.fit_transform_keys_order2(main_table, y, selection=False)
                feat_engine.fit_transform_post_order1(main_table, y)
        if not sample_for_combine_features:
            gc.collect()
            feat_engine.fit_transform_all_order2(main_table, y)
            feat_engine.fit_transform_keys_order2(main_table, y)
            feat_engine.fit_transform_keys_order3(main_table, y)
            feat_engine.fit_transform_post_order1(main_table, y)
        del feat_engine
        gc.collect()

        X_test = main_table.data.loc[test_index]
        main_table.data = main_table.data.loc[train_index]
        gc.collect()

        test_table = copy.deepcopy(main_table)
        test_table.data = X_test
        self.test_table = test_table
        len_test = X_test.shape[0]
        gc.collect()

        feat_engine = FeatEngine(feat_pipeline, config)
        feat_engine.fit_transform_merge_order1(main_table, y)
        self.feat_engine = feat_engine

        feat_output = FeatOutput()
        self.feat_output = feat_output
        X, y, categories = feat_output.final_fit_transform_output(main_table, y)
        del main_table
        gc.collect()

        lgb = AutoLGB()
        lgb.param_compute(X, y, categories, config)
        X_train, y_train, X_test, y_test = time_train_test_split(X, y, test_rate=0.2)
        lgb.param_opt_new(X_train, y_train, X_test, y_test, categories)
        gc.collect()
        del X_train, y_train, X_test, y_test
        gc.collect()
        X, y = self.shuffle(X, y, 2019)
        gc.collect()
        lgb.ensemble_train(X, y, categories, config, len_test)
        gc.collect()
        importances = lgb.get_ensemble_importances()
        self.model = lgb
        del X, y

    elif split == -2:
        config = Config(time.time(), self.info['time_budget'])

        Xs[CONSTANT.MAIN_TABLE_NAME] = pd.concat([Xs[CONSTANT.MAIN_TABLE_NAME], ])
        gc.collect()

        graph = Graph(self.info, Xs)
        graph.sort_tables()

        train_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[Xs[CONSTANT.MAIN_TABLE_NAME].index >= 0]
        y = y.loc[train_index]

        graph.preprocess_fit_transform()
        gc.collect()

        merge_feat_pipeline = DeafultMergeFeatPipeline()
        merger = Merger(merge_feat_pipeline)
        merger.merge_table(graph)
        main_table = merger.merge_to_main_fit_transform(graph)
        self.release_tables(Xs, graph)
        del merger
        del graph
        gc.collect()

        feat_pipeline = DefaultFeatPipeline()
        feat_engine = FeatEngine(feat_pipeline, config)
        feat_engine.fit_transform_order1(main_table, y)

        sample_for_combine_features = True
        if sample_for_combine_features:
            main_data = main_table.data
            train_data = main_data.loc[main_data.index >= 0]
            del main_data
            sample_num = CONSTANT.SAMPLE_NUM
            train_shape = train_data.shape
            if train_shape[0] <= sample_num:
                sample_for_combine_features = False
            else:
                data_tail_new = train_data.iloc[-sample_num:]
                gc.collect()
                log(f'sample data shape {data_tail_new.shape}')
                y_tail_new = y.loc[data_tail_new.index]
                table_tail_new = copy.deepcopy(main_table)
                table_tail_new.data = data_tail_new
                del data_tail_new
                gc.collect()
                feat_engine.fit_transform_all_order2(table_tail_new, y_tail_new, sample=True)
                feat_engine.fit_transform_keys_order2(table_tail_new, y_tail_new, sample=True)
                del table_tail_new, y_tail_new
                gc.collect()
                feat_engine.fit_transform_all_order2(main_table, y, selection=False)
                feat_engine.fit_transform_keys_order2(main_table, y, selection=False)
                feat_engine.fit_transform_post_order1(main_table, y)
        if not sample_for_combine_features:
            gc.collect()
            feat_engine.fit_transform_all_order2(main_table, y)
            feat_engine.fit_transform_keys_order2(main_table, y)
            feat_engine.fit_transform_keys_order3(main_table, y)
            feat_engine.fit_transform_post_order1(main_table, y)
        del feat_engine
        gc.collect()

        main_table.data = main_table.data.loc[train_index]
        gc.collect()

        def split_table(table, y):
            X = table.data
            X_train, y_train, X_test, y_test = time_train_test_split(X, y, shuffle=False, test_rate=0.2)
            table1 = copy.deepcopy(table)
            table1.data = X_train
            table2 = copy.deepcopy(table)
            table2.data = X_test
            return table1, y_train, table2, y_test

        table1, y_train, table2, y_test = split_table(main_table, y)

        feat_engine = FeatEngine(feat_pipeline, config)
        feat_engine.fit_transform_merge_order1(table1, y_train)
        self.feat_engine = feat_engine

        feat_output = FeatOutput()
        self.feat_output = feat_output
        X_train, y_train, categories = feat_output.fit_transform_output(table1, y_train)
        gc.collect()

        self.feat_engine.transform_merge_order1(table2)
        X_test = self.feat_output.transform_output(table2)

        lgb = AutoLGB()
        lgb.param_compute(X_train, y_train, categories, config)
        lgb.param_opt_new(X_train, y_train, X_test, y_test, categories)
        len_test = X_test.shape[0]
        lgb.ensemble_train(X_train, y_train, categories, config, len_test)
        gc.collect()

        pred, pred0 = lgb.ensemble_predict_test(X_test)
        auc = roc_auc_score(y_test, pred0)
        print('source AUC:', auc)
        auc = roc_auc_score(y_test, pred)
        Model.ensemble_auc.append(auc)
        print('ensemble AUC:', auc)

        importances = lgb.get_ensemble_importances()
        self.model = lgb
        del X_train, y_train, X_test, y_test
        gc.collect()

    paths = os.path.join(feature_importance_path, version)
    if not os.path.exists(paths):
        os.makedirs(paths)
    importances.to_csv(os.path.join(
        paths,
        '{}_importances.csv'.format(datetime.now().strftime('%Y%m%d%H%M%S'))),
        index=False)
import matplotlib.pyplot as plt
import numpy as np
from keras import Sequential
from keras.callbacks import History
from keras.layers import Dense, BatchNormalization
from sklearn.model_selection import train_test_split

from loader import Loader
from merger import Merger

params, scores = Loader.get_flow_data(6767, 100)
qualities = Loader.get_task_qualities()
description = Loader.get_description(6767)

merger = Merger(params, description, scores, qualities)
X, y = merger.merge(100)

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# model
# --> 0.0017802061972600456
model = Sequential()
model.add(Dense(32, input_shape=(X.shape[1],), activation='relu'))
model.add(BatchNormalization())
model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
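# The snippet above is cut off mid model definition. A typical continuation,
# purely as an assumed sketch: the output layer, loss, and epoch count are
# guesses for a regression target, not the original author's code.
model.add(Dense(1))                          # single regression output (assumed)
model.compile(optimizer='adam', loss='mse')
history: History = model.fit(X_train, y_train,
                             validation_data=(X_test, y_test),
                             epochs=50, batch_size=32)
plt.plot(history.history['loss'])            # uses the matplotlib import above
plt.plot(history.history['val_loss'])
plt.show()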
from merger import Merger

if __name__ == '__main__':
    m = Merger('file_one.txt', 'file_two.txt')
    m.run()
import argparse

from merger import Merger
from preprocessor.time_series_preprocessor import TimeSeriesPreprocessor

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Merges multiple CSV files.')
    parser.add_argument('--config_file', action="store", default="cfg.csv")
    parser.add_argument('--result_file', action="store", default="merged.csv")
    parser.add_argument('--num_previous_entries_to_include', action="store", type=int, default=0)
    args = parser.parse_args()

    df = Merger().run(args.config_file)
    df = TimeSeriesPreprocessor().process(
        df, {'num_previous_entries_to_include': args.num_previous_entries_to_include})
    df.to_csv(args.result_file, index=False)
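# Example invocation of the script above (the script filename is assumed; the
# flags are the ones defined by the argparse setup):
#   python merge_csv.py --config_file cfg.csv --result_file merged.csv \
#       --num_previous_entries_to_include 2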