import collections
import csv
import random

# `utils` and `main_path` are assumed to be provided by the surrounding
# project (shared helper module and base directory, respectively).


def extract_tool_usage(self, tool_usage_file, cutoff_date, dictionary):
    """
    Extract the tool usage over time for each tool
    """
    tool_usage_dict = dict()
    all_dates = list()
    all_tool_list = list(dictionary.keys())
    with open(tool_usage_file, 'rt') as usage_file:
        tool_usage = csv.reader(usage_file, delimiter='\t')
        for row in tool_usage:
            # keep only records newer than the cutoff date (dates are
            # compared as strings, which works for ISO-formatted dates)
            if str(row[1]) > cutoff_date:
                tool_id = utils.format_tool_id(row[0])
                if tool_id in all_tool_list:
                    all_dates.append(row[1])
                    if tool_id not in tool_usage_dict:
                        tool_usage_dict[tool_id] = dict()
                        tool_usage_dict[tool_id][row[1]] = int(row[2])
                    else:
                        curr_date = row[1]
                        # merge the usage of different versions of a tool
                        # into one count per date
                        if curr_date in tool_usage_dict[tool_id]:
                            tool_usage_dict[tool_id][curr_date] += int(row[2])
                        else:
                            tool_usage_dict[tool_id][curr_date] = int(row[2])
    # get unique dates
    unique_dates = list(set(all_dates))
    for tool in tool_usage_dict:
        usage = tool_usage_dict[tool]
        # extract the dates for which the tool's usage is absent from the
        # raw data (usage keys are always a subset of the unique dates)
        dates_not_present = set(unique_dates) - set(usage.keys())
        # impute the missing values with 0
        for dt in dates_not_present:
            usage[dt] = 0
        # sort the usage records by date
        tool_usage_dict[tool] = collections.OrderedDict(sorted(usage.items()))
    utils.write_file(
        main_path + "/data/generated_files/tool_usage_dict.txt",
        tool_usage_dict)
    return tool_usage_dict
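
# Illustrative usage sketch (not part of the original module): the instance
# name, file path, cutoff date, and `tool_dictionary` below are assumptions
# chosen for the example. The usage file is expected to be tab-separated
# with columns (tool id, date, usage count).
#
#   connections = ExtractWorkflowConnections()  # hypothetical instance
#   usage = connections.extract_tool_usage(
#       "data/tool-popularity.tsv", "2017-12-01", tool_dictionary)
#   # `usage` maps each tool id to an OrderedDict of date -> summed count,
#   # with dates missing from the raw data imputed as 0
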
def read_tabular_file(self, raw_file_path):
    """
    Read tabular file and extract workflow connections
    """
    print("Reading workflows...")
    workflows = {}
    workflow_paths_dup = ""
    workflow_parents = dict()
    workflow_paths = list()
    standard_connections = dict()
    with open(raw_file_path, "rt") as workflow_connections_file:
        workflow_connections = csv.reader(
            workflow_connections_file, delimiter="\t")
        for row in workflow_connections:
            wf_id = str(row[0])
            in_tool = row[3]
            out_tool = row[6]
            if wf_id not in workflows:
                workflows[wf_id] = list()
            # record an edge only when both tools exist and differ
            if out_tool and in_tool and out_tool != in_tool:
                workflows[wf_id].append((out_tool, in_tool))
                qc = self.collect_standard_connections(row)
                if qc:
                    i_t = utils.format_tool_id(in_tool)
                    o_t = utils.format_tool_id(out_tool)
                    if i_t not in standard_connections:
                        standard_connections[i_t] = list()
                    if o_t not in standard_connections[i_t]:
                        standard_connections[i_t].append(o_t)
    print("Processing workflows...")
    wf_ctr = 0
    for wf_id in workflows:
        wf_ctr += 1
        workflow_parents[wf_id] = self.read_workflow(wf_id, workflows[wf_id])
    for wf_id in workflow_parents:
        flow_paths = list()
        parents_graph = workflow_parents[wf_id]
        roots, leaves = self.get_roots_leaves(parents_graph)
        for root in roots:
            for leaf in leaves:
                paths = self.find_tool_paths_workflow(
                    parents_graph, root, leaf)
                # the paths are computed from leaf to root, so reverse
                # them to get root-to-leaf tool sequences
                paths = [tool_path[::-1] for tool_path in paths]
                if len(paths) > 0:
                    flow_paths.extend(paths)
        workflow_paths.extend(flow_paths)
    print("Workflows processed: %d" % wf_ctr)
    # remove slashes from the tool ids
    wf_paths_no_slash = list()
    for path in workflow_paths:
        path_no_slash = [utils.format_tool_id(tool_id) for tool_id in path]
        wf_paths_no_slash.append(path_no_slash)
    # serialize the paths, one comma-joined path per line (duplicates kept)
    for path in wf_paths_no_slash:
        workflow_paths_dup += ",".join(path) + "\n"
    # collect the non-empty paths and shuffle them
    unique_paths = list(workflow_paths_dup.split("\n"))
    unique_paths = list(filter(None, unique_paths))
    random.shuffle(unique_paths)
    # de-duplicated paths are used only for computing compatible next tools
    no_dup_paths = list(set(unique_paths))
    print("Finding compatible next tools...")
    compatible_next_tools = self.set_compatible_next_tools(no_dup_paths)
    return unique_paths, compatible_next_tools, standard_connections
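
# Illustrative usage sketch (not part of the original module): the instance
# name and file path are hypothetical; the raw file is assumed to be a
# tab-separated workflow-connections export whose columns include the
# workflow id (column 0), input tool (column 3), and output tool (column 6).
#
#   connections = ExtractWorkflowConnections()  # hypothetical instance
#   paths, next_tools, std_connections = connections.read_tabular_file(
#       "data/workflow-connections.tsv")
#   # `paths` holds comma-joined tool sequences (duplicates retained),
#   # `next_tools` maps a tool to the tools observed to follow it, and
#   # `std_connections` holds the quality-checked tool-to-tool links
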