示例#1
0
def create_html_tables():
    """Build the HTML tables for the results page.

    :return: dict with 'feature_importance_table' and
        'prediction_results_table' (empty string when no prediction
        results file exists, i.e. prediction was disabled)
    """
    output_dir = config.get_option("OUTPUT_DIR")
    tables = {
        'feature_importance_table': __tsv_to_html(
            output_dir + '/global_model/feature_importances.tsv', sep="\t"),
    }
    try:
        prediction_table = __tsv_to_html(output_dir + '/Y_pred.tsv', sep="\t")
    except FileNotFoundError:
        # No prediction was run for this workflow; show an empty table.
        prediction_table = ""
    tables['prediction_results_table'] = prediction_table
    return tables
示例#2
0
def index():
    """Serve the page matching the current workflow progress state."""
    print(f'[WEB] GET /', flush=True)
    progress = logic.progress
    if progress == "getting config frontend":
        # Workflow is waiting for the user to submit the configuration form.
        return bottle.static_file("input.html", root='/app/app/templates')
    if progress == "finishing":
        # Workflow finished: render the results page; hide the prediction and
        # survival sections unless the config enabled them.
        show_prediction = config.get_option('prediction') == 'True'
        show_survival = config.get_option('treetype') == 'survival'
        return bottle.template(
            "result",
            figures=create_html_figures(),
            tables=create_html_tables(),
            prediction_visibility='block' if show_prediction else 'none',
            survival_visibility='block' if show_survival else 'none')
    # Any other state: show the loading screen with a human-readable status.
    return bottle.template('loading', status=status_ui[progress])
示例#3
0
def __analyze_results(model, model_name, interaction_network, expression_data,
                      split):
    """
    Private Method.
    Executes the RScript 'grandforest.analyze_results.R' to create the plots and tables
    of the feature importance and endophenotyping analyses in the split output directory.
    :param model: model as base64 encoded RData file string
    :param model_name: name of the model; used as output sub-directory name
    :param interaction_network: Interaction Network as base64 encoded RData file with "data" object inside
    :param expression_data: Local expression data as base64 encoded RData file with "data" object inside
    :param split: current split as path to the output directory
    :return: None
    """
    temp_path = config.get_option('TEMP_DIR') + '/' + str(uuid4())
    os.makedirs(temp_path)

    # Decode the base64 payloads into RData files for the R script; use
    # context managers so the handles are closed (the original
    # open(...).write() calls leaked them).
    inputs = {
        'interaction_network.RData': interaction_network,
        'expression_data.RData': expression_data,
        'model.RData': model,
    }
    for filename, payload in inputs.items():
        with open(temp_path + '/' + filename, 'wb') as rdata_file:
            rdata_file.write(base64.decodebytes(payload.encode('utf-8')))

    # Create the per-model output directory; tolerate it already existing.
    try:
        os.makedirs(split + '/' + model_name)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Survival trees need the event/time column names as extra arguments;
    # every other tree type passes the literal string 'None' twice.
    if config.get_option('grandforest_treetype') == 'survival':
        survival_args = [
            str(config.get_option('expression_data_survival_event')),
            str(config.get_option('expression_data_survival_time'))
        ]
    else:
        survival_args = ['None', 'None']

    command = [
        "/app/app/R/grandforest.analyze_results.R",
        temp_path + '/' + 'model.RData',
        temp_path + '/' + 'interaction_network.RData',
        temp_path + '/' + 'expression_data.RData'
    ] + survival_args + [split + '/' + model_name + '/']

    print('[IO] Starting RSubprocess to analyze the ' + model_name + '...')
    analyzing_results_subprocess = RSubprocess(command)
    analyzing_results_subprocess.start()
    print('[IO] Started RSubprocess to analyze the ' + model_name)
    analyzing_results_subprocess.join()
    print('[IO] Finished RSubprocess to analyze the ' + model_name)
示例#4
0
def __predict_with_grandforest_model(global_model, expression_data, split):
    """
    Private Method.
    Executes the RScript 'grandforest.predict.supervised.R' or 'grandforest.predict.unsupervised.R'
    to predict the local expression data with the global model and write the results to the split
    output directory.
    :param global_model: global model as base64 encoded RData file
    :param expression_data: Local expression data as base64 encoded RData file
    :param split: current split as path to the output directory
    :return: None
    """
    temp_path = config.get_option('TEMP_DIR') + '/' + str(uuid4())
    os.makedirs(temp_path)

    # Decode the base64 payloads into RData files; context managers close the
    # handles (the original open(...).write() calls leaked them).
    for filename, payload in (('global_model.RData', global_model),
                              ('expression_data.RData', expression_data)):
        with open(temp_path + '/' + filename, 'wb') as rdata_file:
            rdata_file.write(base64.decodebytes(payload.encode('utf-8')))

    # Create the split output directory; tolerate it already existing.
    try:
        os.makedirs(split)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    if config.get_option('grandforest_method') == 'supervised':
        # Supervised prediction additionally needs the dependent variable name.
        command = [
            "/app/app/R/grandforest.predict.supervised.R",
            temp_path + '/' + 'global_model.RData',
            temp_path + '/' + 'expression_data.RData',
            str(config.get_option('expression_data_dependent_variable_name')),
            split + '/'
        ]
    else:
        command = [
            "/app/app/R/grandforest.predict.unsupervised.R",
            temp_path + '/' + 'global_model.RData',
            temp_path + '/' + 'expression_data.RData', split + '/'
        ]

    local_prediction_subprocess = RSubprocess(command)
    print(
        '[ALGO] Starting RSubprocess to predict local expression data with the global GrandForest model...'
    )
    local_prediction_subprocess.start()
    print(
        '[ALGO] Started RSubprocess to predict local expression data with the global GrandForest model'
    )
    local_prediction_subprocess.join()
    print(
        '[ALGO] Finished RSubprocess to predict local expression data with the global GrandForest model'
    )
示例#5
0
def create_html_figures():
    """Build the inline SVG figures for the results page.

    Reads the SVG plots produced by the R analysis from the global_model
    output directory and wraps each in a fixed-size <svg> element. The
    survival plot only exists for survival-type workflows, so a missing
    file yields an empty string instead of an error.
    :return: dict mapping figure name to an HTML/SVG snippet
    """
    svg_start = '<svg width="100" height="100">'
    svg_end = '</svg>'
    base_path = config.get_option("OUTPUT_DIR") + '/global_model/'

    def _read_svg(filename):
        # Context manager closes the handle (the original open(...).read()
        # calls leaked four file handles per page render).
        with open(base_path + filename) as svg_file:
            return svg_start + svg_file.read() + svg_end

    figures = {
        'feature_importance_plot_importances':
            _read_svg('feature_importances.svg'),
        'feature_importance_plot_network':
            _read_svg('interaction_subnetwork.svg'),
        'endophenotypes_plot_heatmap':
            _read_svg('patient_clustering_heatmap.svg'),
    }
    try:
        figures['endophenotypes_plot_survival'] = _read_svg(
            'patient_clustering_survival.svg')
    except FileNotFoundError:
        # Not a survival workflow: no survival plot was produced.
        figures['endophenotypes_plot_survival'] = ""
    return figures
示例#6
0
def get_input_filesizes(splits):
    """
    Count the lines of the expression data file in every split.
    :param splits: dictionary with all splits as path to the input directory as keys
    :return: dictionary mapping each split to the line count of its expression data file
    """
    filename = config.get_option('expression_data_filename')
    sizes = {}
    for split in splits:
        with open(split + '/' + filename) as data_file:
            sizes[split] = sum(1 for _ in data_file)
    return sizes
示例#7
0
    def create_splits(self):
        """Derive the data splits from the input directory layout and create
        the matching output directories."""
        splits = {}

        if config.get_option('split_mode') == 'directory':
            # One split per sub-directory of the configured split directory.
            split_root = f'{config.get_option("INPUT_DIR")}/{config.get_option("split_dir")}'
            splits = dict.fromkeys(
                entry.path for entry in os.scandir(split_root)
                if entry.is_dir())
        else:
            # Single split spanning the whole input directory.
            splits[config.get_option("INPUT_DIR") + '/'] = None

        # Mirror every input split directory on the output side.
        for split in splits:
            os.makedirs(split.replace("/input/", "/output/"), exist_ok=True)

        if check_if_config_file_exists():
            # Keep a copy of the used configuration next to the results.
            shutil.copyfile(
                config.get_option('INPUT_DIR') + '/config.yml',
                config.get_option('OUTPUT_DIR') + '/config.yml')

        self.split_expression_data = splits
示例#8
0
def read_input(input_filepath, input_filename, input_separator):
    """
    Executes the RScript 'grandforest.read_data_frame.R' to read in a CSV/TSV file
    from the input directory to a base64 encoded RData file string with a "data" object inside.
    :param input_filepath: Full path to the CSV/TSV file
    :param input_filename: only the filename of the CSV/TSV file
    :param input_separator: separator used in the CSV/TSV file
    :return: base64 encoded RData file string or None if the file could not be read
    """
    temp_path = config.get_option('TEMP_DIR') + '/' + str(uuid4())
    os.makedirs(temp_path)

    try:
        print('[IO] Parsing data of ' + input_filepath)

        rdata_path = temp_path + "/" + input_filename + ".RData"
        command = [
            "/app/app/R/grandforest.read_data_frame.R", input_filepath,
            str(input_separator), rdata_path
        ]

        input_reader_subprocess = RSubprocess(command)
        print('[IO] Starting RSubprocess to read ' + input_filename + '...')
        input_reader_subprocess.start()
        print('[IO] Started RSubprocess to read ' + input_filename)
        input_reader_subprocess.join()
        print('[IO] Finished RSubprocess to read ' + input_filename)

        # Read the RData file produced by R and base64-encode it; the context
        # manager closes the handle (the original open(...).read() leaked it).
        with open(rdata_path, 'rb') as rdata_file:
            data = base64.b64encode(rdata_file.read()).decode('utf-8')
        print('[IO] Converted RSubprocess Result to a python binary object')

        print('[IO] Read R Dataframe with size ' + str(sys.getsizeof(data)) +
              'Bytes')

        return data
    except Exception as e:
        # Deliberate best-effort: callers treat None as "file not readable".
        print('[IO] could not read file', e)

        return None
示例#9
0
def __aggregate_grandforest_models(global_data):
    """
    Private Method.
    Executes the RScript 'grandforest.sum_models.R' to sum all the local models to a global model.
    :param global_data: Local models as a list of base64 encoded RData files with one "model" object per file inside
    :return: base64 encoded RData file with the aggregated global model as "model" object inside
    """
    temp_path = config.get_option('TEMP_DIR') + '/' + str(uuid4())
    os.makedirs(temp_path)

    accumulator_path = temp_path + '/' + 'forest1.RData'
    addend_path = temp_path + '/' + 'forest2.RData'

    def _write_model(path, encoded_model):
        # Decode a base64 model string into an RData file; 'with' closes the
        # handle (the original open(...).write() calls leaked them).
        with open(path, 'wb') as rdata_file:
            rdata_file.write(base64.decodebytes(encoded_model.encode('utf-8')))

    # Fold the models pairwise: forest1.RData accumulates the running sum,
    # forest2.RData holds the next model to add.
    _write_model(accumulator_path, global_data[0])
    print(
        '[ALGO] Starting RSubprocesses to aggregate the GrandForest models...')
    for encoded_model in global_data[1:]:
        _write_model(addend_path, encoded_model)
        command = [
            "/app/app/R/grandforest.sum_models.R", accumulator_path,
            addend_path, accumulator_path
        ]
        model_aggregation_subprocess = RSubprocess(command)
        model_aggregation_subprocess.start()
        model_aggregation_subprocess.join()

        os.remove(addend_path)
    print('[ALGO] Finished RSubprocesses to aggregate the GrandForest models')

    # save global model as base64 encoded string
    with open(accumulator_path, 'rb') as rdata_file:
        global_model = base64.b64encode(rdata_file.read()).decode('utf-8')
    os.remove(accumulator_path)
    print(
        f'[ALGO] Global Aggregation on client {config.get_option("id")}: {sys.getsizeof(global_model)} Bytes successful'
    )

    return global_model
示例#10
0
def read_config(is_coordinator):
    """
    Read in the config.yml in the input directory. Save the parameters in the global config dictionary.
    :param is_coordinator: True if this participant must also parse the global options
    :return: None
    :raises ValueError: if grandforest_method, grandforest_treetype or prediction
        hold values outside their allowed sets
    """
    # Interaction networks bundled with the app image; any other value is
    # treated as a user-supplied file in the input directory.
    bundled_networks = {'biogrid', 'htridb', 'iid', 'regnetwork'}

    print('[IO] Read config file.')
    with open(INPUT_DIR + '/config.yml') as f:
        config_file = yaml.load(f, Loader=yaml.FullLoader)['fc_grandforest']

    config.add_option('INPUT_DIR', INPUT_DIR)
    config.add_option('TEMP_DIR', TEMP_DIR)
    config.add_option('OUTPUT_DIR', OUTPUT_DIR)

    if is_coordinator:
        global_options = config_file['global_options']
        config.add_option('grandforest_method',
                          global_options['grandforest_method'])
        config.add_option('grandforest_treetype',
                          global_options['grandforest_treetype'])
        config.add_option('number_of_trees',
                          global_options['number_of_trees'])
        config.add_option('minimal_node_size',
                          global_options['minimal_node_size'])
        config.add_option('seed', global_options['seed'])

        # The original four copy-paste elif branches collapse to a single
        # membership test against the bundled network names.
        interaction_network = global_options['interaction_network']
        if interaction_network in bundled_networks:
            config.add_option('interaction_network_filename',
                              interaction_network)
            config.add_option(
                'interaction_network_filepath',
                '/app/interaction_networks/' + interaction_network + '.tsv')
            config.add_option('interaction_network_separator', '\t')
        else:
            # User-supplied network file; the separator must come from config.
            config.add_option('interaction_network_filename',
                              interaction_network)
            config.add_option('interaction_network_filepath',
                              INPUT_DIR + '/' + interaction_network)
            config.add_option('interaction_network_separator',
                              global_options['interaction_network_separator'])

        # Check if coordinator config options are set correctly
        if config.get_option('grandforest_method') not in {
                'supervised', 'unsupervised'
        }:
            print('[IO] Config File Error.')
            raise ValueError(
                "grandforest_method can either be 'supervised' or 'unsupervised'"
            )

        if config.get_option('grandforest_treetype') not in {
                'classification', 'regression', 'survival', 'probability'
        }:
            print('[IO] Config File Error.')
            raise ValueError(
                "grandforest_treetype can be 'classification', 'regression', 'survival' or 'probability'"
            )

    # Client config options
    # local options
    prediction_flag = str(config_file['local_options']['prediction'])
    if prediction_flag == 'True':
        config.add_option('prediction', True)
    elif prediction_flag == 'False':
        config.add_option('prediction', False)
    else:
        print('[IO] Config File Error.')
        raise ValueError("prediction can be 'True' or 'False'")

    # local files; these keys are optional (only relevant for certain
    # tree types), so missing keys are silently skipped
    try:
        config.add_option(
            'expression_data_dependent_variable_name',
            config_file['local_files']
            ['expression_data_dependent_variable_name'])
    except KeyError:
        pass

    try:
        config.add_option(
            'expression_data_survival_event',
            config_file['local_files']['expression_data_survival_event'])
        config.add_option(
            'expression_data_survival_time',
            config_file['local_files']['expression_data_survival_time'])
    except KeyError:
        pass

    config.add_option('expression_data_separator',
                      config_file['local_files']['expression_data_separator'])
    config.add_option('expression_data_filename',
                      config_file['local_files']['expression_data'])

    # split
    config.add_option('split_mode', config_file['split']['mode'])
    config.add_option('split_dir', config_file['split']['dir'])
示例#11
0
def check_config(splits):
    """
    Checks the config from the frontend FormsDict. Is a bit runtime intensive, so this method is not
    executed in a workflow with configuration file.
    :param splits: dictionary with all splits as path to the input directory as keys
    :return: True, if the input is correct, False if there are non valid inputs
    """

    def _is_int(value):
        # Numeric config values arrive as strings from the frontend form.
        try:
            int(value)
            return True
        except ValueError:
            return False

    # Test text variables
    if not _is_int(config.get_option('number_of_trees')):
        print('[IO] Config File Error.')
        print(
            f"Number of Trees variable is not a valid number: {config.get_option('number_of_trees')}"
        )
        return False

    if not _is_int(config.get_option('minimal_node_size')):
        print('[IO] Config File Error.')
        print(
            f"Minimal Node Size variable is not a valid number: {config.get_option('minimal_node_size')}"
        )
        return False

    # Seed is optional: 'None' (string from the form) or None means "unset".
    if config.get_option('seed') != 'None' and config.get_option(
            'seed') is not None:
        if not _is_int(config.get_option('seed')):
            print('[IO] Config File Error.')
            print(
                f"Seed variable is not a valid number: {config.get_option('seed')}"
            )
            return False

    # Test interaction network file; 'with' closes the probe handle
    # (the original open(...) leaked it).
    try:
        with open(config.get_option('interaction_network_filepath'), 'r'):
            pass
    except FileNotFoundError:
        print('[IO] Config File Error.')
        print(
            f"Interaction Network File {config.get_option('interaction_network_filename')} not found."
        )
        return False

    # Test expression data files in each split: one open both checks
    # existence and reads the header line (the original opened twice and
    # leaked the first handle).
    for split in splits.keys():
        try:
            with open(
                    split + '/' +
                    config.get_option('expression_data_filename'),
                    'r') as file:
                firstline = file.readline()
        except FileNotFoundError:
            print('[IO] Config File Error.')
            print(
                f"Expression data File {config.get_option('expression_data_filename')} not found."
            )
            return False

        # The required column names must appear in the header line.
        if config.get_option('grandforest_treetype') == 'survival':
            if (config.get_option("expression_data_survival_event")
                    not in firstline
                    or config.get_option("expression_data_survival_time")
                    not in firstline):
                print('[IO] Config File Error.')
                print(
                    f"expression_data_survival_event {config.get_option('expression_data_survival_event')} or expression_data_survival_time {config.get_option('expression_data_survival_time')} not found in Expression data File {config.get_option('expression_data_filename')}."
                )
                return False
        else:
            if config.get_option(
                    "expression_data_dependent_variable_name"
            ) not in firstline:
                print('[IO] Config File Error.')
                print(
                    f"expression_data_dependent_variable_name {config.get_option('expression_data_dependent_variable_name')} not found in Expression data File {config.get_option('expression_data_filename')}."
                )
                return False
    return True
示例#12
0
    def app_flow(self):
        """State machine driving one participant (client or coordinator)
        through the full federated GrandForest workflow.

        NOTE(review): incoming messages read from self.data_incoming appear to
        be already JSON-decoded by the surrounding framework (elements are
        indexed as lists/dicts), while outgoing data is serialized here with
        json.dumps — confirm against the app-framework API.
        """
        # This method contains a state machine for the client and coordinator instance
        # Coordinator Workflow: 1 -> 2 -> 4 -> 5 -> 6 -> 7 -> 8 -> 9
        # Client Workflow:      1 -> 2 -> 3 -> 4 -> 5 -> 6 -> 8 -> 9

        # === States ===
        state_initialize = 1
        state_read_config = 2
        state_wait_for_config = 3
        state_read_input = 4
        state_local_computation = 5
        state_wait_for_global_aggregation = 6
        state_global_aggregation = 7
        state_write_results = 8
        state_finish = 9

        # Initial state
        state = state_initialize
        self.progress = 'initializing'

        while True:

            # INITIALIZE THE WORKFLOW

            if state == state_initialize:
                if self.id is not None:  # Test if setup has happened already
                    config.init()  # initialize config dictionary
                    config.add_option('id', self.id)
                    config.add_option('is_coordinator', self.master)

                    # If config does not exist, wait for correct input in the frontend
                    if check_if_config_file_exists():
                        self.progress = 'parsing config file'
                        read_config(self.master)
                        self.create_splits()
                    else:
                        print(
                            '[IO] No config file found. Waiting for user input in the FrontEnd...'
                        )
                        config_is_valid = False
                        while config_is_valid is False:
                            config.add_option('input_form', False)
                            self.progress = 'getting config frontend'
                            # Poll until the web frontend stores the submitted
                            # form under the 'input_form' config option.
                            while config.get_option('input_form') is False:
                                time.sleep(10)

                            self.progress = 'parsing config frontend'
                            print(
                                '[IO] Received FrontEnd Input Form. Continuing GrandForest workflow...'
                            )
                            read_config_from_frontend(
                                self.master, config.get_option('input_form'))
                            self.create_splits()
                            # Re-enter the loop (asking the user again) until
                            # the submitted configuration validates.
                            if check_config(self.split_expression_data):
                                config_is_valid = True

                    self.local_models = dict.fromkeys(
                        self.split_expression_data.keys())

                    # create temp directory for python <-> R data exchange
                    # TODO create RAMDISK instead?
                    try:
                        os.makedirs(config.get_option('TEMP_DIR'))
                    except OSError as e:
                        if e.errno != errno.EEXIST:
                            print(
                                f'[CRIT] Could not create temporary directory',
                                flush=True)
                            raise
Annotated
                # Set Expression Data Sample Size for Model Balancing
                # The coordinator appends its own filesizes locally; clients
                # send theirs to the coordinator instead.
                if self.master:
                    self.data_incoming.append([
                        self.id,
                        get_input_filesizes(self.split_expression_data)
                    ])
                    state = state_read_config
                else:
                    self.data_outgoing = json.dumps([
                        self.id,
                        get_input_filesizes(self.split_expression_data)
                    ])
                    self.status_available = True
                    state = state_wait_for_config

            # READ CONFIG AND SEND GLOBAL OPTIONS TO CLIENTS

            if state == state_read_config:
                self.progress = 'sending config'

                # Prepare and Send global options from the configuration to all clients
                #  including balanced amount of trees to be trained
                if self.master:
                    print("[MASTER] Received Data from ",
                          len(self.data_incoming), " of ", str(self.clients),
                          "clients.")
                    if len(self.data_incoming) == len(self.clients):
                        print(
                            f'[CLIENT] Received all client expression data filesizes.',
                            flush=True)
                        # Sum the per-split line counts over all participants.
                        filesizes_combined = dict()
                        for participant in self.data_incoming:
                            for split in self.split_expression_data.keys():
                                try:
                                    filesizes_combined[split]
                                except KeyError:
                                    filesizes_combined[split] = 0
                                filesizes_combined[split] = filesizes_combined[
                                    split] + participant[1][split]

                        # Balance training: each client trains a number of
                        # trees proportional to its share of the split's rows.
                        num_trees_per_client_per_split = dict()
                        for participant in self.data_incoming:
                            for split in self.split_expression_data.keys():
                                try:
                                    num_trees_per_client_per_split[
                                        participant[0]]
                                except KeyError:
                                    num_trees_per_client_per_split[
                                        participant[0]] = dict()

                                try:
                                    num_trees_per_client_per_split[
                                        participant[0]][split]
                                except KeyError:
                                    num_trees_per_client_per_split[
                                        participant[0]][split] = 0
                                num_trees_per_client_per_split[
                                    participant[0]][split] = math.ceil(
                                        participant[1][split] /
                                        filesizes_combined[split] * int(
                                            config.get_option(
                                                'number_of_trees')))

                        self.interaction_network = read_input(
                            config.get_option('interaction_network_filepath'),
                            config.get_option('interaction_network_filename'),
                            config.get_option('interaction_network_separator'))
                        config.add_option(
                            'number_of_trees_per_split',
                            num_trees_per_client_per_split[self.id])
                        self.data_incoming = []

                        print(
                            f'[COORDINATOR] Sending interaction network to clients',
                            flush=True)
                        self.data_outgoing = json.dumps([
                            config.get_option('grandforest_method'),
                            config.get_option('grandforest_treetype'),
                            num_trees_per_client_per_split,
                            config.get_option('minimal_node_size'),
                            config.get_option('seed'), self.interaction_network
                        ])
                        self.status_available = True
                        state = state_read_input
                else:
                    state = state_wait_for_config

            # WAIT FOR CONFIG

            if state == state_wait_for_config:
                self.progress = 'gathering config'
                if len(self.data_incoming) > 0:
                    # Unpack the coordinator's broadcast; positions must match
                    # the json.dumps list built in state_read_config.
                    config.add_option('grandforest_method',
                                      self.data_incoming[0][0])
                    config.add_option('grandforest_treetype',
                                      self.data_incoming[0][1])
                    config.add_option('number_of_trees_per_split',
                                      self.data_incoming[0][2][self.id])
                    config.add_option('minimal_node_size',
                                      self.data_incoming[0][3])
                    config.add_option('seed', self.data_incoming[0][4])
                    self.interaction_network = self.data_incoming[0][5]
                    print(
                        f'[CLIENT] Received config and interaction network with size {sys.getsizeof(self.interaction_network)} Bytes from coordinator',
                        flush=True)
                    self.data_incoming = []
                    state = state_read_input

            # READ INPUT FILES IN R

            if state == state_read_input:
                for split in self.split_expression_data.keys():
                    self.split_expression_data[split] = read_input(
                        split + '/' +
                        config.get_option('expression_data_filename'),
                        config.get_option('expression_data_filename'),
                        config.get_option('expression_data_separator'))
                state = state_local_computation

            # COMPUTE LOCAL MODEL IN R

            if state == state_local_computation:
                self.progress = 'computing'

                # Check if config is valid
                #  this could be outsourced to io.py, since the frontend configuration is already checked there
                if config.get_option('grandforest_method') == "supervised":
                    if config.get_option('grandforest_treetype') == "survival":
                        try:
                            config.get_option('expression_data_survival_event')
                            config.get_option('expression_data_survival_time')
                        except KeyError:
                            print('[LOGIC] Config File Error.')
                            raise ValueError(
                                "The GrandForest Layout is invalid: survival time and/or event missing"
                            )
                        config.add_option(
                            'expression_data_dependent_variable_name', "None")
                    else:
                        try:
                            config.get_option(
                                'expression_data_dependent_variable_name')
                        except KeyError:
                            print('[LOGIC] Config File Error.')
                            raise ValueError(
                                "The GrandForest Layout is invalid: dependent variable name missing"
                            )
                        config.add_option('expression_data_survival_event',
                                          "None")
                        config.add_option('expression_data_survival_time',
                                          "None")

                for split in self.split_expression_data.keys():
                    self.local_models[split] = local_computation(
                        self.split_expression_data[split],
                        self.interaction_network, split)

                if self.master:
                    print(f'[COORDINATOR] Finished computing the local model',
                          flush=True)
                    self.client_models.append(self.local_models)
                else:
                    print(f'[CLIENT] Sending local model to master',
                          flush=True)
                    self.data_outgoing = json.dumps(self.local_models)
                    self.data_incoming = []
                    self.status_available = True

                state = state_wait_for_global_aggregation

            # WAIT FOR GLOBAL AGGREGATION

            if state == state_wait_for_global_aggregation:
                if self.master:
                    self.progress = 'gathering models'
                    # The coordinator appended its own models locally, so only
                    # len(clients) - 1 messages are expected here.
                    if len(self.data_incoming) == len(self.clients) - 1:
                        print(
                            f'[COORDINATOR] Received local models from all clients',
                            flush=True)
                        self.client_models.extend(self.data_incoming)
                        self.data_incoming = []
                        state = state_global_aggregation
                else:
                    self.progress = 'gathering global model'
                    if len(self.data_incoming) > 0:
                        self.global_models = self.data_incoming[0]
                        self.data_incoming = []
                        print(f'[CLIENT] Received result from master',
                              flush=True)
                        state = state_write_results

            # GLOBAL AGGREGATION IN R

            if state == state_global_aggregation:
                self.progress = 'computing'
                # Aggregate per split across the models of all participants.
                for split in self.split_expression_data.keys():
                    self.global_models[split] = global_aggregation([
                        client_splits[split]
                        for client_splits in self.client_models
                    ])
                print(f'[COORDINATOR] Sending global model to clients',
                      flush=True)
                self.data_outgoing = json.dumps(self.global_models)
                self.status_available = True
                state = state_write_results

            # WRITE AND ANALYZE RESULTS IN R

            if state == state_write_results:
                self.progress = 'writing results'
                for split in self.split_expression_data.keys():
                    if config.get_option('prediction'):
                        local_prediction(self.global_models[split],
                                         self.split_expression_data[split],
                                         split.replace("/input/", "/output/"))
                    write_results(self.local_models[split],
                                  self.global_models[split],
                                  split.replace("/input/", "/output/"))
                    result_analysis(self.local_models[split],
                                    self.global_models[split],
                                    self.interaction_network,
                                    self.split_expression_data[split],
                                    split.replace("/input/", "/output/"))

                if self.master:
                    self.data_incoming = ['DONE']
                else:
                    self.data_outgoing = json.dumps('DONE')
                    self.status_available = True
                state = state_finish

            # FINISH THE WORKFLOW

            if state == state_finish:
                self.progress = 'finishing'
                if self.master:
                    # Coordinator waits for a 'DONE' from every participant
                    # (its own was placed into data_incoming above).
                    if len(self.data_incoming) == len(self.clients):
                        # FINISH COORDINATOR
                        print(
                            f'[COORDINATOR] Finished the workflow, exiting...',
                            flush=True)
                        self.status_finished = True
                        break
                else:
                    # FINISH CIENT
                    print(f'[CLIENT] Finished the workflow, exiting...',
                          flush=True)
                    self.status_finished = True
                    break

            time.sleep(0.1)
示例#13
0
def __compute_local_grandforest_model(expression_data, interaction_network,
                                      split):
    """
    Private Method.
    Executes the RScript 'grandforest.train_model.supervised.R' or
    'grandforest.train_model.unsupervised.R' to train a local GrandForest model.
    :param expression_data: Local expression data as base64 encoded RData file with "data" object inside
    :param interaction_network: Interaction Network as base64 encoded RData file with "data" object inside
    :param split: current split as path to the output directory
    :return: base64 encoded RData file with the local model as "model" object inside
    """
    # Each invocation gets its own scratch directory so parallel splits
    # cannot clobber each other's intermediate RData files.
    temp_path = config.get_option('TEMP_DIR') + '/' + str(uuid4())
    os.makedirs(temp_path)

    expression_data_path = temp_path + '/' + 'expression_data.RData'
    interaction_network_path = temp_path + '/' + 'interaction_network.RData'
    local_model_path = temp_path + '/' + 'local_model.RData'

    # Decode the base64 payloads into RData files for the R script.
    # Context managers close (and flush) the handles deterministically;
    # the original code leaked them until garbage collection.
    with open(expression_data_path, 'wb') as expression_data_file:
        expression_data_file.write(
            base64.decodebytes(expression_data.encode('utf-8')))
    with open(interaction_network_path, 'wb') as interaction_network_file:
        interaction_network_file.write(
            base64.decodebytes(interaction_network.encode('utf-8')))

    # Both training scripts take the same leading arguments; the supervised
    # variant additionally needs the tree type and the dependent-variable /
    # survival columns before the output path.
    common_args = [
        expression_data_path,
        interaction_network_path,
        str(config.get_option('number_of_trees_per_split')[split]),
        str(config.get_option('minimal_node_size')),
        str(config.get_option('seed')),
    ]

    if config.get_option('grandforest_method') == 'supervised':
        command = (
            ["/app/app/R/grandforest.train_model.supervised.R"] +
            common_args + [
                str(config.get_option('grandforest_treetype')),
                str(config.get_option(
                    'expression_data_dependent_variable_name')),
                str(config.get_option('expression_data_survival_event')),
                str(config.get_option('expression_data_survival_time')),
                local_model_path,
            ])
    else:
        command = (
            ["/app/app/R/grandforest.train_model.unsupervised.R"] +
            common_args + [local_model_path])

    local_computation_subprocess = RSubprocess(command)
    print(
        '[ALGO] Starting RSubprocess to calculate local GrandForest model...')
    local_computation_subprocess.start()
    print('[ALGO] Started RSubprocess to calculate local GrandForest model')
    local_computation_subprocess.join()
    print('[ALGO] Finished RSubprocess to calculate local GrandForest model')

    # Re-encode the trained model as a base64 string for transport.
    with open(local_model_path, 'rb') as local_model_file:
        local_model = base64.b64encode(
            local_model_file.read()).decode('utf-8')

    print(
        f'[ALGO] Local computation of client {config.get_option("id")}: {sys.getsizeof(local_model)} Bytes successful'
    )

    return local_model