Example #1
    def extract_tool_usage(self, tool_usage_file, cutoff_date, dictionary):
        """
        Extract the usage over time for each tool
        """
        # requires module-level imports: csv, collections;
        # "utils" and "main_path" are assumed to be provided by the surrounding project
        tool_usage_dict = dict()
        all_dates = list()
        all_tool_list = list(dictionary.keys())
        with open(tool_usage_file, 'rt') as usage_file:
            tool_usage = csv.reader(usage_file, delimiter='\t')
            for row in tool_usage:
                if str(row[1]) > cutoff_date:
                    tool_id = utils.format_tool_id(row[0])
                    if tool_id in all_tool_list:
                        curr_date = row[1]
                        all_dates.append(curr_date)
                        if tool_id not in tool_usage_dict:
                            tool_usage_dict[tool_id] = dict()
                        # merge the usage of different versions of a tool into one
                        if curr_date in tool_usage_dict[tool_id]:
                            tool_usage_dict[tool_id][curr_date] += int(row[2])
                        else:
                            tool_usage_dict[tool_id][curr_date] = int(row[2])
        # get the unique dates
        unique_dates = list(set(all_dates))
        for tool in tool_usage_dict:
            usage = tool_usage_dict[tool]
            # find the dates for which the tool's usage is missing from the raw data
            dates_not_present = list(set(unique_dates) ^ set(usage.keys()))
            # impute the missing values with 0
            for dt in dates_not_present:
                usage[dt] = 0
            # sort the usage records by date
            tool_usage_dict[tool] = collections.OrderedDict(sorted(usage.items()))
        utils.write_file(
            main_path + "/data/generated_files/tool_usage_dict.txt",
            tool_usage_dict)
        return tool_usage_dict
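The core technique above is a merge-then-impute pass over a tab-separated usage log: counts of all versions of a tool are summed per date, and dates with no record are filled with zero. Below is a minimal, self-contained sketch of that idea; the inline TSV data and the strip_version helper are illustrative assumptions, not the project's real inputs.

import collections
import csv
import io

# hypothetical TSV rows: tool_id <tab> date <tab> usage_count
RAW = "toolA/1.0\t2023-01\t5\ntoolA/1.1\t2023-01\t3\ntoolB\t2023-02\t7\n"

def strip_version(tool_id):
    # stand-in for utils.format_tool_id: drop the version suffix
    return tool_id.split("/")[0]

usage = {}
dates = set()
for tool_id, date, count in csv.reader(io.StringIO(RAW), delimiter="\t"):
    tid = strip_version(tool_id)
    dates.add(date)
    per_tool = usage.setdefault(tid, {})
    # merge the counts of all versions of the same tool on the same date
    per_tool[date] = per_tool.get(date, 0) + int(count)

for tid, per_tool in usage.items():
    # impute dates missing from the raw data with zero usage
    for dt in dates - per_tool.keys():
        per_tool[dt] = 0
    usage[tid] = collections.OrderedDict(sorted(per_tool.items()))

print(usage)
# {'toolA': OrderedDict([('2023-01', 8), ('2023-02', 0)]),
#  'toolB': OrderedDict([('2023-01', 0), ('2023-02', 7)])}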
Example #2
    def read_tabular_file(self, raw_file_path):
        """
        Read a tabular file and extract workflow connections
        """
        # requires module-level imports: csv, random;
        # "utils" is assumed to be provided by the surrounding project
        print("Reading workflows...")
        workflows = {}
        workflow_paths_dup = ""
        workflow_parents = dict()
        workflow_paths = list()
        standard_connections = dict()
        with open(raw_file_path, "rt") as workflow_connections_file:
            workflow_connections = csv.reader(workflow_connections_file,
                                              delimiter="\t")
            for row in workflow_connections:
                wf_id = str(row[0])
                in_tool = row[3]
                out_tool = row[6]
                if wf_id not in workflows:
                    workflows[wf_id] = list()
                if out_tool and in_tool and out_tool != in_tool:
                    workflows[wf_id].append((out_tool, in_tool))
                    qc = self.collect_standard_connections(row)
                    if qc:
                        i_t = utils.format_tool_id(in_tool)
                        o_t = utils.format_tool_id(out_tool)
                        if i_t not in standard_connections:
                            standard_connections[i_t] = list()
                        if o_t not in standard_connections[i_t]:
                            standard_connections[i_t].append(o_t)
        print("Processing workflows...")
        wf_ctr = 0
        for wf_id in workflows:
            wf_ctr += 1
            workflow_parents[wf_id] = self.read_workflow(
                wf_id, workflows[wf_id])

        for wf_id in workflow_parents:
            flow_paths = list()
            parents_graph = workflow_parents[wf_id]
            roots, leaves = self.get_roots_leaves(parents_graph)
            for root in roots:
                for leaf in leaves:
                    paths = self.find_tool_paths_workflow(
                        parents_graph, root, leaf)
                    # reverse the paths as they are computed from leaves to roots
                    paths = [tool_path[::-1] for tool_path in paths]
                    if len(paths) > 0:
                        flow_paths.extend(paths)
            workflow_paths.extend(flow_paths)
        print("Workflows processed: %d" % wf_ctr)

        # remove slashes from the tool ids
        wf_paths_no_slash = list()
        for path in workflow_paths:
            path_no_slash = [utils.format_tool_id(tool_id) for tool_id in path]
            wf_paths_no_slash.append(path_no_slash)

        # serialize the paths, one comma-joined path per line (duplicates kept)
        for path in wf_paths_no_slash:
            workflow_paths_dup += ",".join(path) + "\n"

        all_paths = list(filter(None, workflow_paths_dup.split("\n")))
        random.shuffle(all_paths)
        # deduplicate the paths for computing the compatible next tools
        no_dup_paths = list(set(all_paths))

        print("Finding compatible next tools...")
        compatible_next_tools = self.set_compatible_next_tools(no_dup_paths)
        return unique_paths, compatible_next_tools, standard_connections
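The path enumeration above is delegated to project helpers (read_workflow, get_roots_leaves, find_tool_paths_workflow) whose bodies are not shown. As a rough sketch of the same idea under those assumptions, the snippet below enumerates every root-to-leaf tool path from a list of (out_tool, in_tool) edges; the edge list and all names are made up for illustration.

# hypothetical (parent_tool, child_tool) edges of a single workflow
EDGES = [("upload", "filter"), ("upload", "sort"),
         ("filter", "plot"), ("sort", "plot")]

children = {}
for parent, child in EDGES:
    children.setdefault(parent, []).append(child)

nodes = {n for edge in EDGES for n in edge}
roots = nodes - {c for _, c in EDGES}    # nodes with no incoming edge
leaves = nodes - {p for p, _ in EDGES}   # nodes with no outgoing edge

def all_paths(node, leaf, prefix):
    # depth-first enumeration of every path from node to leaf
    prefix = prefix + [node]
    if node == leaf:
        yield prefix
    for nxt in children.get(node, []):
        yield from all_paths(nxt, leaf, prefix)

paths = [p for r in roots for lf in leaves for p in all_paths(r, lf, [])]
print([",".join(p) for p in paths])
# ['upload,filter,plot', 'upload,sort,plot']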