Example #1
def process_multiple(log, do_fetch=True, do_parse=True, do_merge=True):
    root = config["data-dir"]

    if do_fetch:
        tokens = Tokens()
        api = API(tokens, log)
        util.delete_files(root + '/processing/invoices', '*.json')
        success, invoice_cnt = api.fetch_invoice_details(hours_delta=30,
                                                         tz_offset=7)
        if success and invoice_cnt > 0:
            log.write(
                "INFO api invoices extraction succeeded {:,} invoices saved to : {}"
                .format(invoice_cnt, '/processing/invoices'))
        elif success and invoice_cnt == 0:
            log.write(
                "INFO api no invoices extracted (no new/updated invoices in refresh period)"
            )
            return True
        else:
            log.write(
                "ERROR api invoices extraction failed {:,} invoices saved to : {}"
                .format(invoice_cnt, '/processing/invoices'))
            return False

    if do_parse:
        util.delete_files(root + '/processing/invoices', '*.csv')
        parser = Parser(log)
        parser.parse('invoices-line-items')

    if do_merge:
        merger = Merger(log)
        merger.merge_invoice_delta()

    return True
Example #2
def merge_videos():
    global mer

    path_name_seg = []
    for i in all_filename:
        path_name_seg.append(
            os.path.join(target_path, sel_res.getVideoTitle(), i))
    mer = Merger(
        unicode(
            os.path.join(
                target_path,
                sel_res.getVideoTitle() + '.' + sel_res.getFileFormat())),
        path_name_seg)
    gui.frame_main.initTotal_Merge(len(all_filename))
    mer.start()

    while True:
        gui.frame_main.updateMerge(mer.now)
        time.sleep(0.05)
        if mer.now == mer.sum:
            gui.frame_main.updateMerge(mer.now)
            break

    with open('config.ini', 'wb') as f:
        save_configure()

    # Prompt text (Chinese): "Video merge finished. Delete the segment files?", title "Notice"
    dlg = wx.MessageDialog(gui.frame_main, u'视频已经合并完成,是否删除分段文件?', u'提示',
                           wx.YES_NO | wx.ICON_QUESTION)
    if dlg.ShowModal() == wx.ID_YES:
        del_seg_video()
    # Prompt text (Chinese): "Segment files deleted.", title "Notice"
    dlg = wx.MessageDialog(gui.frame_main, u'分段文件删除完成。', u'提示',
                           wx.OK | wx.ICON_QUESTION)
    dlg.ShowModal()
Example #3
File: test.py Project: DarkArmed/Merger
def test(test_case):
    merger = Merger(test_case)
    # codes = merger.get_codes(3)

    for k in range(10, 9, -1):
        result = merger.merge_result(k)
        value_sum = sum(result.values())
        print 'k =', k, result, ' \tMerged codes:', len(result), ' \tTotal value:', value_sum
Example #4
    def __init__(self, indent=None, to_explore=False):
        self.indent = indent
        self.indent_children = None
        self.content = []
        self.parent = None
        self.to_explore = to_explore
        self.merger = Merger()
        self.padding = None
        self.sf = None
        self.sc = None
Example #5
def ProcessRequest(file):

    name = str(uuid4())
    base_file_name = "%s-%s" % (name, secure_filename(file.filename))
    file_name = "tmp/%s" % base_file_name
    print(file_name)
    file.save(file_name)
    with ZipFile(file_name, 'r') as zipObj:
        zipObj.extractall("tmp/%s" % name)
    Merger("tmp/%s" % name, os.path.realpath("tmp/combined-%s.ics" % name))
Example #6
    def __init__(self, cldict, sampd):
        self.cldict = cldict
        self.sampd = sampd
        self.mergo = Merger(cldict, sampd)
        self.meto = Metrics(cldict)

        lbwao = None
        lbbmapo = None
        lref_acc_str = sampd.ref_acc_str
        if lref_acc_str != "none":
            lbwao = AlignerBwa(cldict, sampd)
        self.bwao = lbwao
        self.samfco = SamFC(cldict, sampd)
Example #7
    def merge(self, corpus_size):
        """
        The function will merge all the data in the posting files using the BSBI algorithm
        """
        docs_file = self.get_docs_file()
        for key in self.postings_data:
            if os.listdir(self.postings_data[key]['path']):  # directory is not empty
                merger = Merger(self.postings_data[key]['path'], "pkl", docs_file, corpus_size)
                merger.merge(self.postings_data[key]['name'])

        #  The merger updates the docs data. After all of the letters have been merged,
        #  the document data is up to date and needs to be saved to disk to reduce the memory load.
        utils.save_obj(docs_file, f"{self.posting_dir_path}\\docs\\docs_index")
Example #8
def get_IMPA_Merger(name):
    imp = iMPA(name)
    terc = imp.terc
    data = imp.getAddresses()
    s = min(map(lambda x: x.center.y, data))
    w = min(map(lambda x: x.center.x, data))
    n = max(map(lambda x: x.center.y, data))
    e = max(map(lambda x: x.center.x, data))
    addr = getAddresses(map(str, (s, w, n, e)))

    m = Merger(data, addr, terc)
    m.post_func.append(m.merge_addresses)
    m.merge()
    return m
Example #9
def merge_videos():
    _, res = iqiyi.getLastRes()
    path_name_seg = []
    for i in all_filename:
        path_name_seg.append(os.path.join(video_title, i))
    mer = Merger(
        unicode(
            os.path.join(target_path, video_title + '.' + res[sel_bid]['ff'])),
        path_name_seg)
    gui.frame_main.initTotal_Merge(len(all_filename))
    mer.start()
    while True:
        gui.frame_main.updateMerge(mer.now)
        time.sleep(0.01)
        if mer.now == mer.sum:
            gui.frame_main.updateMerge(mer.now)
            break

    del_seg_video()
Example #10
    def __init__(self, visualizer=None, speaker_recognition=False):

        self.merger_to_main_queue = Queue(maxsize=1000)  # very roughly 30sec
        self.merger = Merger(self.merger_to_main_queue)
        if visualizer is None:
            self.visualization = False
        else:
            self.visualization = True
            self.main_to_vis_queue = Queue(maxsize=50)
            self.visualizer = visualizer(self.main_to_vis_queue)

        self.speakers = {}
        self.num_speakers = 0
        self.stt = T2t_stt()
        self.speaker_recognition = speaker_recognition
        # if self.speaker_recognition:
        #     self.sr = Speaker_recognition()
        self.text_queue = mult_Queue()
        self.bing_allowed = False
Example #11
    def merge(self):
        # self.text_dst.config(state = 'normal')

        text = self.text_src.get('1.0', END)
        # print text.encode('utf-8')
        codes2num = decode(text)
        # print codes2num

        self.merger = Merger(codes2num)

        self.text_dst.delete('1.0', END)

        result_text = ''
        for k in range(10, 3, -1):
            result_text += '最大长度' + str(k) + ' '  # '最大长度' = "max length"
            result_text += encode(self.merger.merge_result(k))
            # print result_text

        self.text_dst.insert(END, result_text)
Example #12
    def __init__(self, cldict, sampd):
        self.cldict = cldict
        self.sampd = sampd
        self.mergo = Merger(cldict, sampd)
        self.meto = Metrics(cldict)

        lbwao = None
        lbbmapo = None
        lref_acc_str = sampd.ref_acc_str
        lhost_ref_str = sampd.host_ref_str

        if lref_acc_str != "none":
            lbwao = AlignerBwa(cldict, sampd)
        if lhost_ref_str != "none":
            lbbmapo = AlignerBBMap(cldict, sampd)
        self.bwao = lbwao
        self.bbmapo = lbbmapo
        self.samfco = SamFC(cldict, sampd)
        self.jlco = CounterJL(cldict, sampd)
        print("Created JLCounter object")
Example #13
def main():
	scrapper = Scrapper()
	merger = Merger()
	parser = Parser()
	client = MongoClient('localhost', 27017)
	db = client['Data']
	collection_socialmedia = db['socialmedia']

	# Begin real-time collecting
	while True: 
		scrapper.scrap()	
		merger.main()
		parser.main()	
		sleep(3600)
		
		# Storing to MongoDB
		with open('/home/sartharion/Bureau/stage/POO/data.json', 'r') as f:
			file_data = json.load(f)
		collection_socialmedia.delete_many({})
		collection_socialmedia.insert_many(file_data)		
	
	# NOTE: unreachable as written, because the loop above never exits
	client.close()
Example #14
    def __init__(self, args):
        self.config_log_file = args.config_log_file
        
        self.sample_id = args.sample_id
        self.project_id = args.project_id
        self.prefix_set = args.prefix_set
        self.bc_set = args.bc_set

        cldict_d = yaml.load(open(self.config_log_file))
        cldict = DictMap(cldict_d)
        self.cldict = cldict

        sampd = dict()
        sampd['sample_id'] = self.sample_id
        sampd['project_id'] = self.project_id
        sampd['prefix_set'] = self.prefix_set
        sampd['bc_set'] = self.bc_set
        sampd_map = DictMap(sampd) 
        self.sampd = sampd_map
 
        mergo = Merger(cldict, sampd_map)
        self.mergo = mergo
Example #15
def get_history(id: ObjectId, num_changes: int = None):
    hist = history.find({'ref': id}).sort('_id', direction=pymongo.DESCENDING)
    curr = data.find_one({'_id': id})

    yield curr

    prev = curr
    count = 0
    merger = Merger()
    for d in hist:
        if num_changes and count == num_changes:
            break

        d['ref_creation_time'] = d['_id'].generation_time
        del d['_id']
        del d['ref']

        l: dict = copy.deepcopy(prev)
        merger.merge_changes(l, d)

        yield l
        prev = l
        count += 1
Example #16
File: ndt.py Project: pdphuong/scorpion
    def __call__(self, full_table, bad_tables, good_tables, **kwargs):
        """
        table has been trimmed of extraneous columns.
        """
        self.setup_tables(full_table, bad_tables, good_tables, **kwargs)

        self.SCORE_ID = add_meta_column(
            chain(self.bad_tables, self.good_tables), SCORE_VAR)
        self.CLASS_ID = add_meta_column(chain(self.bad_tables,
                                              self.good_tables),
                                        "INFCLASS",
                                        vals=['0', '1'])

        start = time.time()
        self.compute_perrow_influences(self.bad_tables, self.bad_err_funcs)
        self.compute_perrow_influences(self.good_tables, self.good_err_funcs)
        self.cost_compute_inf = time.time() - start

        start = time.time()
        if self.tree_alg == 'c45':
            table, rules = self.c45_rules()
        elif self.tree_alg == 'or':
            table, rules = self.orange_dt_rules()
        elif self.tree_alg == 'dt':
            table, rules = self.sk_dt_rules(max_depth=12)
        elif self.tree_alg == 'rt':
            table, rules = self.sk_rt_rules(max_depth=12)
        else:
            _logger.warn(
                "unknown NDT algorithm %s.  Defaulting to regression tree",
                self.tree_alg)
            table, rules = self.sk_rt_rules(max_depth=12)
        self.cost_learn = time.time() - start

        #
        # ok now convert rules to clusters
        #

        _logger.debug("got %d rules", len(rules))
        fill_in_rules(rules, table, cols=self.cols)

        self.cost_learn = time.time() - start

        clusters = [Cluster.from_rule(rule, self.cols) for rule in rules]
        for cluster in clusters:
            cluster.error = self.influence_cluster(cluster)
        clusters = filter_bad_clusters(clusters)
        clusters.sort(key=lambda c: c.error, reverse=True)
        print '\n'.join(map(str, clusters[:5]))

        self.all_clusters = self.final_clusters = clusters
        # NOTE: this early return short-circuits the function; the cluster-merging
        # code below is unreachable as written.
        return self.final_clusters

        #
        # merge the clusters
        #
        thresh = compute_clusters_threshold(clusters, nstds=1.5)
        is_mergable = lambda c: c.error >= thresh

        params = dict(kwargs)
        params.update({
            'cols':
            self.cols,
            'err_func':
            self.err_func,
            'influence':
            lambda c: self.influence_cluster(c),
            'influence_components':
            lambda c: self.influence_cluster_components(c),
            'is_mergable':
            is_mergable,
            'use_mtuples':
            False,
            'learner':
            self
        })
        self.merger = Merger(**params)
        merged_clusters = self.merger(clusters)
        merged_clusters.sort(key=lambda c: c.error, reverse=True)

        clusters.extend(merged_clusters)
        normalize_cluster_errors(clusters)
        clusters = list(set(clusters))
        self.all_clusters = clusters
        self.final_clusters = merged_clusters

        self.costs = {'cost_learn': self.cost_learn}
        return self.final_clusters
Example #17
File: chain.py Project: pombredanne/stetl
    def assemble(self):
        """
        Builder method: build a Chain of linked Components
        :return:
        """
        log.info('Assembling Chain: %s...' % self.chain_str)

        # Create linked list of input/filter/output (ETL Component) objects
        chain_str = self.chain_str
        sub_comps = []
        while chain_str:
            chain_str = chain_str.strip()

            # Check and handle Splitter construct
            # e.g. input_xml_file |(transformer_xslt|output_file) (output_std) (transformer_xslt|output_std)
            if chain_str.startswith('('):
                etl_section_name, chain_str = chain_str.split(')', 1)
                etl_section_name = etl_section_name.strip('(')

                # Check for subchain (split at Filter level)
                if '|' in etl_section_name:
                    # Have subchain: use Chain to assemble
                    sub_chain = Chain(etl_section_name, self.config_dict)
                    sub_chain.assemble()
                    child_comp = sub_chain.first_comp
                else:
                    # Single component (Output) to split
                    child_comp = factory.create_obj(self.config_dict,
                                                    etl_section_name.strip())

                # Assemble Components (can be subchains) for Splitter later
                sub_comps.append(child_comp)
                if '(' in chain_str:
                    # Still components (subchains) to assemble for Splitter
                    continue

            if len(sub_comps) > 0:
                if chain_str.startswith('|'):
                    # Next component is Merger with children
                    etl_comp = Merger(self.config_dict, sub_comps)
                    dummy, chain_str = chain_str.split('|', 1)
                else:
                    # Next component is Splitter with children
                    etl_comp = Splitter(self.config_dict, sub_comps)
                sub_comps = []
            else:

                # "Normal" case: regular Components piped in Chain
                if '|' in chain_str:
                    # More than one component in remaining Chain
                    etl_section_name, chain_str = chain_str.split('|', 1)
                else:
                    # Last element, we're done!
                    etl_section_name = chain_str
                    chain_str = None

                # Create the ETL component by name and properties
                etl_comp = factory.create_obj(self.config_dict,
                                              etl_section_name.strip())

            # Add component to end of Chain
            self.add(etl_comp)
Example #18
File: test.py Project: DarkArmed/Merger
value = 0
for code, num in case.items():
    print code, num, 
    value += num
print
print value

print str(timeit.timeit('test(case)', 'from __main__ import test, case', number=1)) + 's used.'

TEST_CASES = [
                ['012', '013', '023', '123'],
                ['012', '013', '023', '124', '134', '234'],
                ['012', '013', '023', '123', '124', '134', '234'],
                ['012', '013', '023', '123', '123', '124', '134', '234'],
                ['012', '013', '014', '023', '024', '034', '123', '124', '134', '234'],
                ['012', '023', '013', '123', '123', '234', '134', '124', '125', '127', '157', '257', '125', '127', '157'],
                ['012', '023', '013', '123', '123', '234', '134', '124', '125', '125', '127', '157', '257', '125', '127', '157'],
             ]

codes = Merger([]).get_codes(3)
case = TEST_CASES[0]
# case = [to_string(random.choice(codes)) for dummy_i in range(10000)]
# case = TEST_CASES[3] * 10
# case = TEST_CASES[0] + TEST_CASES[4]
# case = TEST_CASES[6]

# print str(timeit.timeit('test(case)', 'from __main__ import test, case', number=1)) + 's used.'

# print len(" sdsd\n\n \t".strip())
Example #19
    parser = OptionParser(version="%prog " + __VERSION__,
                          usage=usage,
                          description=banner)

    parser.add_option("--dir",
                      "-d",
                      action="store",
                      type="string",
                      dest="dir",
                      help="Files match (Default: *.ics)",
                      default="*.ics")
    parser.add_option("--ical",
                      "-i",
                      action="store",
                      type="string",
                      dest="icalfile",
                      help="iCalendar file output")

    (options, args) = parser.parse_args()

    if options.icalfile == "":
        options.icalfile = None

    if options.icalfile is not None:
        options.icalfile = os.path.realpath(options.icalfile)
        Merger(options.dir, options.icalfile)
        sys.exit(0)

    sys.exit(1)
Example #20
    def test_run(self):
        merger = Merger()
        actual = merger.run(os.path.join('test', 'data', 'cfg.csv'))
        self.assertIsNotNone(actual)
        self.assertEqual(3, actual.shape[1])
Example #21
from tkinter import END, Tk, Label, Frame, Text, SUNKEN, Button, TRUE, TOP, Listbox, SINGLE, Scrollbar, DISABLED, Checkbutton, BooleanVar
from tkinter.filedialog import askopenfilenames, askopenfilename
from pandastable import Table, TableModel
import detector
from exporter_GUI import ExporterGUI
from importer import Importer
from merger import Merger

importer = Importer()
merger = Merger(importer)


class ImporterGUI:
    """
    This class is the main GUI and shows the import window; it also contains the main methods, which use the other helper classes.

    Attributes:
        root = the root tk
        previewFrame = the frame that shows the preview of the dataframe
        XMLList = the 2D list which holds the xml filepath at index 0 and the xsl filepath at index 1
    """
    def __init__(self, root: Tk):
        self.root = root
        self.previewFrame = Frame(root)
        self.pt = Table(self.previewFrame)
        self.dialect = detector.Dialect()
        self.XMLList = []
        self.DialectList = []
        self.hasHeaderVar = BooleanVar()
        self.currentPath = ""
Example #22
    def QC_germline(self):
        # Use the sample_status here to avoid re-running the QC and overwriting the run status. The 'sample_status' should be reset to 'pushed' when new runs are pushed.
        #if self.sample_json['sample_status'] != 'pending_merge' and self.sample_json['sample_status'] != 'pending_3x3_review' and self.sample_json['sample_status'] != 'merged':
        # if the user specified the '--pass_fail' option, still run this part
        if self.sample_json[
                'sample_status'] == 'pushed' or self.options.pass_fail or self.options.qc_all:
            # QC the normal runs with each other
            self.QC_runs(self.sample_json['runs'])

        # what if there is only one run that passes all of the metrics? It should be marked as the 'final_json' and have the 'pass_fail_merged' flag marked as pass.
        # make the merger
        merger = Merger(self.sample_json, self.options.recalc_3x3_tables)
        # Check to see if the normal runs are ready to be merged.
        self.sample_json, merge = merger.check_merge(self.sample_json['runs'])
        if merge != True:
            if 'final_json' in self.sample_json:
                # update the final run status
                merger.update_merged_run_status(self.sample_json['final_json'])
        elif merge == True:
            # merge the normal and/or tumor runs. Will only merge the passing runs with each other.
            self.sample_json = merger.merge_runs('germline')

            # update the merged run status
            merger.update_merged_run_status(self.sample_json['merged_json'])

            if json.load(open(self.sample_json['merged_json'])
                         )['pass_fail_merged_status'] == 'pass':
                # Set the sample_status
                self.sample_json['sample_status'] = 'merged_pass'
                # cleanup the individual run bam files
                self.cleanup_sample.cleanup_runs(
                    self.sample_json['runs'],
                    self.sample_json['analysis']['settings']['cleanup'],
                    self.no_errors)
                # Cleanup the merged dir
                self.cleanup_sample.cleanup_runs(
                    [self.sample_json['merged_json']],
                    self.sample_json['analysis']['settings']['cleanup'],
                    self.no_errors)
            else:
                self.sample_json['sample_status'] = 'awaiting_more_sequencing'

        # copy the final run's VCF file to the final_dir if it passes the "merged" coverage flag
        if 'final_json' in self.sample_json:
            final_json = json.load(open(self.sample_json['final_json']))
            if final_json['pass_fail_merged_status'] == 'pass':
                final_vcf = glob.glob("%s/*.vcf" % final_json['run_folder'])[0]
                final_project_dir = "/home/ionadmin/jeff/%s_Final_VCFs" % (
                    self.sample_json['project'])
                print "copying %s to %s" % (final_vcf, final_project_dir)
                # check to make sure the final dir exists.
                if not os.path.isdir(final_project_dir):
                    os.mkdir(final_project_dir)
                shutil.copy(
                    final_vcf, "%s/%s.vcf" %
                    (final_project_dir, self.sample_json['sample_name']))
                # now push the sample to s3 storage
                if self.sample_json['project'] == 'Einstein':
                    print "pushing %s to amazon s3 storage" % self.sample_json[
                        'sample_name']
                    self.push_sample_to_s3(final_json)
Example #23
    def QC_tumor_normal(self):
        # Separate the runs into tumor and normal lists
        normal_runs, tumor_runs = self.getTumor_Normal()

        if self.sample_json['analysis']['settings'][
                'type'] == 'all_tumor_normal':
            # Use the sample_status here to avoid re-running the QC and overwriting the run status. The 'sample_status' should be reset to 'pushed' when new runs are pushed.
            #if self.sample_json['sample_status'] != 'pending_merge' and self.sample_json['sample_status'] != 'pending_3x3_review' and self.sample_json['sample_status'] != 'merged':
            # if the user specified the '--pass_fail' option, still run this part
            if self.sample_json[
                    'sample_status'] == 'pushed' or self.options.pass_fail or self.options.qc_all:
                # QC the normal or tumor runs with each other
                self.QC_runs(normal_runs, 'normal_')
                self.QC_runs(tumor_runs, 'tumor_')
                # now QC the tumor and normal runs together.
                self.QC_normal_tumor_runs(normal_runs, tumor_runs)

            # make the merger
            merger = Merger(self.sample_json, self.options.recalc_3x3_tables)
            # Check to see if the normal runs are ready to be merged.
            self.sample_json, merge_normal = merger.check_merge(
                normal_runs, 'Normal/', 'normal_')
            if merge_normal == True:
                # merge the normal and/or tumor runs. Will only merge the passing runs with each other.
                self.sample_json = merger.merge_runs('normal', 'Normal_',
                                                     'normal_')

            # Check to see if the tumor runs are ready to be merged.
            self.sample_json, merge_tumor = merger.check_merge(
                tumor_runs, 'Tumor/', 'tumor_')
            if merge_tumor == True:
                self.sample_json = merger.merge_runs('tumor', 'Tumor_',
                                                     'tumor_')

            # If any runs were merged, QC them. If there are only 1 normal and tumor run, they won't be QCd again.
            #if normal_merge_dir != '' or tumor_merge_dir != '' or (len(normal_passing_bams) == 1 and len(tumor_passing_bams) == 1):
            # now QC the tumor and normal merged bams together if both normal and tumor runs are ready.
            # To only QC all for the actual merged runs (PNET), change the 'final' part to 'merged'.
            # The 'final_normal_json' and 'final_tumor_json' flags are set by merger.py in the function check_merge, line 157
            #if (merge_normal or merge_tumor) and ('merged_normal_json' in self.sample_json and 'merged_tumor_json' in self.sample_json):
            if 'final_normal_json' in self.sample_json and 'final_tumor_json' in self.sample_json:
                self.sample_json, qc_json = self.qc_run.QC_2Runs(
                    self.sample_json, self.sample_json['final_normal_json'],
                    self.sample_json['final_tumor_json'], 'normal_', 'tumor_',
                    '_merged')
                self.sample_json, merged_perc_avail_bases = self.qc_run.update_3x3_runs_status(
                    self.sample_json, self.sample_json['final_normal_json'],
                    self.sample_json['final_tumor_json'], qc_json)
                # update the merged run status
                merger.update_merged_run_status(
                    self.sample_json['final_normal_json'],
                    merged_perc_avail_bases)
                merger.update_merged_run_status(
                    self.sample_json['final_tumor_json'],
                    merged_perc_avail_bases)

                # cleanup the individual run bam files
                if merged_perc_avail_bases > .9:
                    final_qc_dir = "%s/all%svs%s" % (
                        self.sample_json['qc_folder'],
                        json.load(open(self.sample_json['final_normal_json']))
                        ['run_name'],
                        json.load(open(
                            self.sample_json['final_tumor_json']))['run_name'])
                    # annotate the final somatic variants
                    command = "bash %s/Somatic_Variants/somatic_variants.sh %s %s %s" % (
                        self.sample_json['analysis']['software_directory'],
                        final_qc_dir, self.sample_json['sample_name'],
                        self.sample_json['analysis']['software_directory'])
                    if runCommandLine(command) != 0:
                        sys.stderr.write("ERROR: somatic annotation failed!\n")

                    # Cleanup the PTRIM.bam and chr bam files after all of the QC is done.
                    # are there any other files to clean up?
                    self.cleanup_sample.cleanup_runs(
                        self.sample_json['runs'],
                        self.sample_json['analysis']['settings']['cleanup'],
                        self.no_errors)
                    #self.cleanup_sample.delete_runs(runs, self.sample_json['analysis']['settings']['cleanup'], self.no_errors)

                    # Cleanup after the merging QC is done.
                    self.cleanup_sample.cleanup_runs([
                        self.sample_json['final_normal_json'],
                        self.sample_json['final_tumor_json']
                    ], self.sample_json['analysis']['settings']['cleanup'],
                                                     self.no_errors)

                    # Set the sample_status
                    self.sample_json['sample_status'] = 'merged_pass'
                else:
                    self.sample_json[
                        'sample_status'] = 'awaiting_more_sequencing'
Example #24
    def my_fit(self, Xs, y, time_ramain, X_test):
        np.random.seed(CONSTANT.SEED)

        split = CONSTANT.SPLIT

        self.split = split

        log(f'split {split}')

        if split == -1:
            config = Config(time.time(), self.info['time_budget'])

            X_test.index = -X_test.index - 1

            main_shape = Xs[CONSTANT.MAIN_TABLE_NAME].shape[0]
            main_max_shape = 2888888
            main_min_shape = min(main_shape, 100000)

            test_shape = X_test.shape[0]
            max_accept_shape = 3999999

            if main_shape + test_shape > max_accept_shape:
                sample_main_shape = max_accept_shape - test_shape
                if sample_main_shape > main_max_shape:
                    sample_main_shape = main_max_shape
                if sample_main_shape < main_min_shape:
                    sample_main_shape = main_min_shape
                log(f'start sample main table. origin main shape {main_shape} test shape {test_shape} sample rows num {sample_main_shape}'
                    )
                if 'time_col' in self.info:
                    key_time_col = self.info['time_col']
                    if key_time_col in Xs[CONSTANT.MAIN_TABLE_NAME].columns:
                        Xs[CONSTANT.MAIN_TABLE_NAME].sort_values(
                            by=key_time_col, inplace=True)
                Xs[CONSTANT.MAIN_TABLE_NAME] = Xs[
                    CONSTANT.MAIN_TABLE_NAME].iloc[-sample_main_shape:]
                gc.collect()

            Xs[CONSTANT.MAIN_TABLE_NAME] = pd.concat(
                [Xs[CONSTANT.MAIN_TABLE_NAME], X_test])

            X_test.drop(X_test.columns, axis=1, inplace=True)
            gc.collect()

            graph = Graph(self.info, Xs)
            graph.sort_tables()
            train_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[
                Xs[CONSTANT.MAIN_TABLE_NAME].index >= 0]
            y = y.loc[train_index]
            test_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[
                Xs[CONSTANT.MAIN_TABLE_NAME].index < 0]

            graph.preprocess_fit_transform()
            gc.collect()

            merge_feat_pipeline = DeafultMergeFeatPipeline()
            merger = Merger(merge_feat_pipeline)

            merger.merge_table(graph)
            main_table = merger.merge_to_main_fit_transform(graph)
            self.release_tables(Xs, graph)
            del merger
            del graph
            gc.collect()

            feat_pipeline = DefaultFeatPipeline()
            feat_engine = FeatEngine(feat_pipeline, config)
            feat_engine.fit_transform_order1(main_table, y)

            sample_for_combine_features = True

            if sample_for_combine_features:
                main_data = main_table.data
                train_data = main_data.loc[main_data.index >= 0]

                del main_data

                sample_num = CONSTANT.SAMPLE_NUM
                train_shape = train_data.shape

                if train_shape[0] <= sample_num:
                    sample_for_combine_features = False
                else:
                    data_tail_new = train_data.iloc[-sample_num:]

                    gc.collect()

                    y_tail_new = y.loc[data_tail_new.index]

                    table_tail_new = copy.deepcopy(main_table)
                    table_tail_new.data = data_tail_new

                    del data_tail_new
                    gc.collect()

                    feat_engine.fit_transform_all_order2(table_tail_new,
                                                         y_tail_new,
                                                         sample=True)
                    feat_engine.fit_transform_keys_order2(table_tail_new,
                                                          y_tail_new,
                                                          sample=True)

                    del table_tail_new, y_tail_new
                    gc.collect()

                    feat_engine.fit_transform_all_order2(main_table,
                                                         y,
                                                         selection=False)
                    feat_engine.fit_transform_keys_order2(main_table,
                                                          y,
                                                          selection=False)

                    feat_engine.fit_transform_post_order1(main_table, y)

            if not sample_for_combine_features:
                gc.collect()

                feat_engine.fit_transform_all_order2(main_table, y)
                feat_engine.fit_transform_keys_order2(main_table, y)

                feat_engine.fit_transform_keys_order3(main_table, y)
                feat_engine.fit_transform_post_order1(main_table, y)

            del feat_engine
            gc.collect()

            X_test = main_table.data.loc[test_index]
            main_table.data = main_table.data.loc[train_index]

            gc.collect()

            test_table = copy.deepcopy(main_table)
            test_table.data = X_test
            self.test_table = test_table
            len_test = X_test.shape[0]
            gc.collect()

            feat_engine = FeatEngine(feat_pipeline, config)
            feat_engine.fit_transform_merge_order1(main_table, y)
            self.feat_engine = feat_engine

            feat_output = FeatOutput()
            self.feat_output = feat_output
            X, y, categories = feat_output.final_fit_transform_output(
                main_table, y)

            del main_table
            gc.collect()

            lgb = AutoLGB()

            lgb.param_compute(X, y, categories, config)
            X_train, y_train, X_test, y_test = time_train_test_split(
                X, y, test_rate=0.2)

            lgb.param_opt_new(X_train, y_train, X_test, y_test, categories)

            gc.collect()

            del X_train, y_train, X_test, y_test

            gc.collect()

            X, y = self.shuffle(X, y, 2019)
            gc.collect()

            lgb.ensemble_train(X, y, categories, config, len_test)

            gc.collect()

            importances = lgb.get_ensemble_importances()

            self.model = lgb
            del X, y

        elif split == -2:

            config = Config(time.time(), self.info['time_budget'])

            Xs[CONSTANT.MAIN_TABLE_NAME] = pd.concat([
                Xs[CONSTANT.MAIN_TABLE_NAME],
            ])

            gc.collect()

            graph = Graph(self.info, Xs)
            graph.sort_tables()
            train_index = Xs[CONSTANT.MAIN_TABLE_NAME].index[
                Xs[CONSTANT.MAIN_TABLE_NAME].index >= 0]
            y = y.loc[train_index]

            graph.preprocess_fit_transform()
            gc.collect()

            merge_feat_pipeline = DeafultMergeFeatPipeline()
            merger = Merger(merge_feat_pipeline)

            merger.merge_table(graph)
            main_table = merger.merge_to_main_fit_transform(graph)
            self.release_tables(Xs, graph)
            del merger
            del graph
            gc.collect()

            feat_pipeline = DefaultFeatPipeline()
            feat_engine = FeatEngine(feat_pipeline, config)
            feat_engine.fit_transform_order1(main_table, y)

            sample_for_combine_features = True

            if sample_for_combine_features:
                main_data = main_table.data
                train_data = main_data.loc[main_data.index >= 0]

                del main_data

                sample_num = CONSTANT.SAMPLE_NUM
                train_shape = train_data.shape

                if train_shape[0] <= sample_num:
                    sample_for_combine_features = False
                else:
                    data_tail_new = train_data.iloc[-sample_num:]

                    gc.collect()
                    log(f'sample data shape {data_tail_new.shape}')

                    y_tail_new = y.loc[data_tail_new.index]

                    table_tail_new = copy.deepcopy(main_table)
                    table_tail_new.data = data_tail_new

                    del data_tail_new
                    gc.collect()

                    feat_engine.fit_transform_all_order2(table_tail_new,
                                                         y_tail_new,
                                                         sample=True)
                    feat_engine.fit_transform_keys_order2(table_tail_new,
                                                          y_tail_new,
                                                          sample=True)

                    del table_tail_new, y_tail_new
                    gc.collect()

                    feat_engine.fit_transform_all_order2(main_table,
                                                         y,
                                                         selection=False)
                    feat_engine.fit_transform_keys_order2(main_table,
                                                          y,
                                                          selection=False)
                    feat_engine.fit_transform_post_order1(main_table, y)

            if not sample_for_combine_features:
                gc.collect()

                feat_engine.fit_transform_all_order2(main_table, y)
                feat_engine.fit_transform_keys_order2(main_table, y)
                feat_engine.fit_transform_keys_order3(main_table, y)
                feat_engine.fit_transform_post_order1(main_table, y)

            del feat_engine
            gc.collect()

            main_table.data = main_table.data.loc[train_index]

            gc.collect()

            def split_table(table, y):
                X = table.data
                X_train, y_train, X_test, y_test = time_train_test_split(
                    X, y, shuffle=False, test_rate=0.2)
                table1 = copy.deepcopy(table)
                table1.data = X_train
                table2 = copy.deepcopy(table)
                table2.data = X_test
                return table1, y_train, table2, y_test

            table1, y_train, table2, y_test = split_table(main_table, y)

            feat_engine = FeatEngine(feat_pipeline, config)
            feat_engine.fit_transform_merge_order1(table1, y_train)
            self.feat_engine = feat_engine

            feat_output = FeatOutput()
            self.feat_output = feat_output

            X_train, y_train, categories = feat_output.fit_transform_output(
                table1, y_train)

            gc.collect()
            self.feat_engine.transform_merge_order1(table2)
            X_test = self.feat_output.transform_output(table2)

            lgb = AutoLGB()

            lgb.param_compute(X_train, y_train, categories, config)

            lgb.param_opt_new(X_train, y_train, X_test, y_test, categories)

            len_test = X_test.shape[0]

            lgb.ensemble_train(X_train, y_train, categories, config, len_test)
            gc.collect()

            pred, pred0 = lgb.ensemble_predict_test(X_test)

            auc = roc_auc_score(y_test, pred0)
            print('source AUC:', auc)

            auc = roc_auc_score(y_test, pred)
            Model.ensemble_auc.append(auc)
            print('ensemble AUC:', auc)

            importances = lgb.get_ensemble_importances()

            self.model = lgb

            del X_train, y_train, X_test, y_test
            gc.collect()

        paths = os.path.join(feature_importance_path, version)
        if not os.path.exists(paths):
            os.makedirs(paths)
        importances.to_csv(os.path.join(
            paths, '{}_importances.csv'.format(
                datetime.now().strftime('%Y%m%d%H%M%S'))),
                           index=False)
Example #25
import matplotlib.pyplot as plt
import numpy as np
from keras import Sequential
from keras.callbacks import History
from keras.layers import Dense, BatchNormalization
from sklearn.model_selection import train_test_split

from loader import Loader
from merger import Merger

params, scores = Loader.get_flow_data(6767, 100)
qualities = Loader.get_task_qualities()
description = Loader.get_description(6767)

merger = Merger(params, description, scores, qualities)
X, y = merger.merge(100)

# Split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

# model

# --> 0.0017802061972600456
model = Sequential()
model.add(Dense(32, input_shape=(X.shape[1], ), activation='relu'))
model.add(BatchNormalization())
model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
Example #26
from merger import Merger

if __name__ == '__main__':
    m = Merger('file_one.txt', 'file_two.txt')
    m.run()
Example #27
import argparse

from merger import Merger
from preprocessor.time_series_preprocessor import TimeSeriesPreprocessor

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Merges multiple CSV-files.')
    parser.add_argument('--config_file', action="store", default="cfg.csv")
    parser.add_argument('--result_file', action="store", default="merged.csv")
    parser.add_argument('--num_previous_entries_to_include', action="store", type=int, default=0)
    args = parser.parse_args()
    df = Merger().run(args.config_file)
    df = TimeSeriesPreprocessor().process(df, {'num_previous_entries_to_include': args.num_previous_entries_to_include})
    df.to_csv(args.result_file, index=False)