def main():
    """Print an approximate greedy complexity line for each selected protein.

    Reads the command line options via ``get_options()``, builds the option
    dictionary handed to ``calc_complex()``, loads the annotation JSON given by
    ``args.input`` and calls ``calc_complex()`` once per protein. The proteins
    are those listed in ``args.id`` or, when that is empty, every protein found
    in the annotation file. A tab separated header line is printed first.

    Raises:
        Exception: if ``--max_overlap_percentage`` is outside of [0.0, 1.0].
    """
    args = get_options()

    # pathconfig.txt lives next to this module. Build the path from the
    # module's directory instead of str.replace() on the full path: replace()
    # silently returns the script itself if the module is ever renamed and
    # would also rewrite a matching directory component.
    module_dir = os.path.dirname(os.path.realpath(__file__))
    pathconfigfile = os.path.join(module_dir, 'pathconfig.txt')
    with open(pathconfigfile) as f:
        # The first line of the config file is the directory holding
        # annoTools.txt.
        toolpath = f.readline().strip()

    if args.featuretypes is not None:
        featuretypes_path = args.featuretypes
    else:
        featuretypes_path = toolpath + '/' + 'annoTools.txt'

    option_dict = {}
    option_dict['input_linearized'], option_dict['input_normal'] = (
        fasInput.featuretypes(featuretypes_path))
    option_dict["max_overlap"] = args.max_overlap

    # Convert once and validate the range before storing the value.
    max_overlap_percentage = float(args.max_overlap_percentage)
    if not 0.0 <= max_overlap_percentage <= 1.0:
        raise Exception(
            "[--max_overlap_percentage] should be between 0.0 and 1.0")
    option_dict["max_overlap_percentage"] = max_overlap_percentage

    option_dict["eFeature"] = args.eFeature
    option_dict["eInstance"] = args.eInstance

    annotation = fasInput.read_json(args.input)
    clan_dict = dict(annotation["clan"])
    proteome = annotation["feature"]

    # Fall back to every annotated protein when no ids were requested.
    proteins = args.id if args.id else list(proteome)

    print('Protein ID\t#Paths\tapproximate greedy complexity')
    for protein in proteins:
        calc_complex(protein, proteome, option_dict, clan_dict, args)
def write_domain_file(path, idlist, outpath, tools, groupname):
    """Write one tab separated line per feature instance to ``outpath``.

    The annotation JSON at ``path`` is read with ``fasInput.read_json`` and its
    ``"feature"`` entry is used. For every protein id in ``idlist`` and every
    tool in ``tools``, each instance of each feature yields one line with the
    columns: ``groupname#pid``, ``pid``, protein length, feature name, the
    first and second element of the instance, ``NA`` and ``N``.

    :param path: path of the annotation JSON file
    :param idlist: protein ids to write
    :param outpath: path of the output file (overwritten)
    :param tools: tool names whose features are written
    :param groupname: prefix placed before ``#pid`` in the first column
    """
    features_by_protein = fasInput.read_json(path)["feature"]
    with open(outpath, 'w') as handle:
        for protein_id in idlist:
            for tool_name in tools:
                tool_features = features_by_protein[protein_id][tool_name]
                for feature_name in tool_features:
                    instances = tool_features[feature_name]["instance"]
                    for hit in instances:
                        # The length is looked up per written line, so a
                        # protein without instances never needs the key.
                        columns = [
                            groupname + "#" + protein_id,
                            protein_id,
                            str(features_by_protein[protein_id]["length"]),
                            feature_name,
                            str(hit[0]),
                            str(hit[1]),
                            "NA",
                            "N",
                        ]
                        handle.write("\t".join(columns) + "\n")
def run_fas(data):
    """Run the comparison of the seed proteins against one taxon.

    ``data`` is the positional job description assembled by
    ``manage_jobpool``: ``[taxon, options, seed_proteome, seed_weight,
    weight_dir, clan_dict, fasta, tmp_path]``.

    The taxon annotation is read from ``tmp_path`` first and from
    ``weight_dir`` as a fallback. Query proteins that are not annotated are
    annotated on the fly via ``doAnnoForMissing`` when ``fasta`` is given.

    :param data: job description list (see above); ``options`` is modified in
                 place when ``options["bidirectional"]`` is set
    :return: tuple ``(outdata, taxon)`` where ``outdata`` maps
             ``(result[0], result[1])`` of each forward result to a pair
             ``(forward value, reverse value)``; the reverse value stays
             ``0.0`` unless the bidirectional run is performed
    :raises Exception: if the taxon annotation is found in neither directory,
                       or a query protein is not annotated and no ``fasta``
                       is available
    """
    spec = data[0]
    options = data[1]
    weight_dir = data[4]
    fasta = data[6]
    tmp_path = data[7]

    def json_path(directory):
        # Annotation file of this taxon inside the given directory.
        return directory + "/" + spec + ".json"

    # Prefer the annotation in the tmp directory, fall back to weight_dir.
    from_tmp = True
    try:
        annotation = read_json(json_path(tmp_path))
    except FileNotFoundError:
        try:
            annotation = read_json(json_path(weight_dir))
        except FileNotFoundError:
            raise Exception('Taxon: "' + spec
                            + '" is missing in the weight_dir')
        from_tmp = False

    query_proteome = {}
    missing = []
    for pair in options['pairwise']:
        try:
            query_proteome[pair[1]] = annotation["feature"][pair[1]]
        except KeyError:
            if not fasta:
                raise Exception(
                    'The protein: "' + pair[1] + '" is missing in taxon: "'
                    + spec + '". The annotations '
                    + 'in weight_dir should contain all proteins from the genome_dir.'
                )
            missing.append(pair[1])

    if missing:
        # Annotate the missing proteins and reload the updated annotation,
        # which doAnnoForMissing is expected to place in tmp_path.
        missing_fasta = ['>' + protein_id + '\n' + fasta[spec][protein_id]
                         for protein_id in missing]
        source_dir = tmp_path if from_tmp else weight_dir
        doAnnoForMissing(spec, missing_fasta, json_path(source_dir),
                         tmp_path + "/", 1, True)
        annotation = read_json(json_path(tmp_path))
        for protein_id in missing:
            query_proteome[protein_id] = annotation["feature"][protein_id]

    clan_dict = data[5]
    clan_dict.update(annotation["clan"])
    seed_proteome = data[2]
    weight = w_weight_correction("loge", annotation["count"])

    forward = greedyFAS.fc_main(weight, seed_proteome, query_proteome,
                                clan_dict, options)
    outdata = {(entry[0], entry[1]): (entry[2][0], 0.0) for entry in forward}

    if options["bidirectional"]:
        # Swap the roles of seed and query for the reverse run.
        options["reverse"] = True
        options['pairwise'] = [(pair[1], pair[0])
                               for pair in options['pairwise']]
        backward = greedyFAS.fc_main(data[3], query_proteome, seed_proteome,
                                     clan_dict, options)
        for entry in backward:
            key = (entry[1], entry[0])
            outdata[key] = (outdata[key][0], entry[2][0])

    return outdata, spec
def manage_jobpool(jobdict, seed_names, seed_spec, weight_dir, tmp_path, cores, features, bidirectional, fasta):
    """Build one job per taxon in ``jobdict`` and run them in a process pool.

    :param jobdict: maps a taxon name to its pairwise list (stored as
                    ``"pairwise"`` in the job options)
    :param seed_names: protein ids that must be annotated for the seed taxon
    :param seed_spec: name of the seed taxon
    :param weight_dir: directory holding one ``<taxon>.json`` per taxon
    :param tmp_path: directory for temporary annotations and job output
    :param cores: number of worker processes
    :param features: indexable; element 0 is used as ``"input_linearized"``
                     and element 1 as ``"input_normal"``
    :param bidirectional: value stored as ``"bidirectional"`` in the options
    :param fasta: falsy, or a mapping ``taxon -> protein id -> sequence`` used
                  to annotate proteins missing from the annotation
    :return: list of ``run_fas`` results, in the order the jobs finished
    :raises Exception: if an annotation file is missing in ``weight_dir`` or a
                       seed protein is not annotated and no ``fasta`` is given
    """
    # Collect every taxon without an annotation file before failing, so the
    # error message lists all of them at once.
    missing = [spec for spec in jobdict
               if not os.path.exists(weight_dir + "/" + spec + ".json")]
    try:
        tmp_data = read_json(weight_dir + "/" + seed_spec + ".json")
    except FileNotFoundError:
        missing.append(seed_spec)
    if missing:
        raise Exception('The following taxa are missing in the weight_dir:\n'
                        + '\n'.join(missing))

    seed_weight = w_weight_correction("loge", tmp_data["count"])
    seed_proteome = tmp_data["feature"]

    missing = []
    for seed_name in seed_names:
        if seed_name in seed_proteome:
            continue
        if not fasta:
            raise Exception(
                'The protein: "' + seed_name + '" is missing in taxon: "'
                + seed_spec
                + '". The annotations in weight_dir should contain all proteins from the genome_dir.'
            )
        missing.append(seed_name)

    if missing:
        # Annotate the missing seed proteins and continue with the updated
        # annotation that is read back from tmp_path.
        missinseq = ['>' + i + '\n' + fasta[seed_spec][i] for i in missing]
        doAnnoForMissing(seed_spec, missinseq,
                         weight_dir + "/" + seed_spec + ".json",
                         tmp_path + "/", cores, True)
        tmp_data = read_json(tmp_path + "/" + seed_spec + ".json")
        seed_weight = w_weight_correction("loge", tmp_data["count"])
        seed_proteome = tmp_data["feature"]
    clan_dict = tmp_data["clan"]

    data = []
    for spec in jobdict:
        # A fresh option dictionary per job: run_fas modifies it in place.
        job_options = {
            "weight_const": False,
            "seed_id": None,
            "query_id": None,
            "priority_mode": True,
            "priority_threshold": 30,
            "max_cardinality": 500,
            "eFeature": 0.001,
            "cores": 1,
            "eInstance": 0.01,
            "e_output": True,
            "feature_info": None,
            "bidirectional": bidirectional,
            "raw": False,
            "silent": False,
            "reverse": False,
            "max_overlap": 0,
            "classicMS": False,
            "timelimit": 0,
            "ref_2": None,
            "phyloprofile": None,
            "score_weights": (0.7, 0.0, 0.3),
            "output": 0,
            "max_overlap_percentage": 0.0,
            "domain": True,
            "pairwise": jobdict[spec],
            "weight_correction": "loge",
            "outpath": tmp_path + "/" + spec,
            "input_linearized": features[0],
            "input_normal": features[1],
            "MS_uni": 0,
            "ref_proteome": [spec + '.json'],
            "progress": False,
        }
        data.append([
            spec, job_options, seed_proteome, seed_weight, weight_dir,
            clan_dict, fasta, tmp_path
        ])

    jobpool = multiprocessing.Pool(processes=cores)
    try:
        results = list(tqdm(jobpool.imap_unordered(run_fas, data),
                            total=len(jobdict)))
    except BaseException:
        # A failing job (or an interrupt) must not leave worker processes
        # behind: stop them right away before re-raising.
        jobpool.terminate()
        raise
    else:
        jobpool.close()
    finally:
        jobpool.join()
    return results
def fc_start(option):
    """Read all input files and run the scoring in one or both directions.

    Loads the reference counts (only when ``option["MS_uni"] == 0``), merges
    the seed (``option["p_path"]``) and query (``option["s_path"]``)
    annotations, validates the requested tools and protein ids, and then calls
    ``fc_main()``. With ``option["bidirectional"]`` set, a second run with
    swapped seed and query follows. Results are handed to
    ``phyloprofile_out()`` when ``option["phyloprofile"]`` is set and to
    ``write_tsv_out()`` unless ``option["tsv"]`` is set.

    Function calls: read_json(), mergeNestedDic(), w_weight_correction(),
    fc_main(), phyloprofile_out(), write_tsv_out()

    :param option: dictionary that contains the main option variables of FAS;
                   it is modified in place ("reverse", and for bidirectional
                   runs "seed_id", "query_id", "pairwise", "ref_proteome")
    """
    clan_dict = {}
    option["reverse"] = False

    def load_proteomes(paths):
        # Merge the "feature" entries of all files; clans are collected in
        # the shared clan_dict along the way.
        collected = []
        for json_path in paths:
            annotation = read_json(json_path)
            collected.append(annotation["feature"])
            clan_dict.update(annotation["clan"])
        return mergeNestedDic(collected)

    # Reference counts are only read when MS_uni is 0; otherwise the count
    # dictionary stays empty.
    domain_count = {}
    if option["MS_uni"] == 0:
        for ref_path in option["ref_proteome"]:
            domain_count.update(read_json(ref_path)["count"])

    seed_proteome = load_proteomes(option["p_path"])
    query_proteome = load_proteomes(option["s_path"])

    if option["weight_correction"]:
        domain_count = w_weight_correction(option["weight_correction"],
                                           domain_count)

    # Every linearized tool has to be present in the first protein of both
    # annotations.
    for tool in option["input_linearized"]:
        if tool not in seed_proteome[list(seed_proteome)[0]]:
            raise Exception(tool + " is missing in the seed annotation")
        if tool not in query_proteome[list(query_proteome)[0]]:
            raise Exception(tool + " is missing in the query annotation")

    for protid in option["seed_id"] or ():
        if protid not in seed_proteome:
            raise Exception(protid + " is not in the seed annotation")
    for protid in option["query_id"] or ():
        if protid not in query_proteome:
            raise Exception(protid + " is not in the query annotation")

    if not option["bidirectional"]:
        print("calculating forward scores...")
        results = fc_main(domain_count, seed_proteome, query_proteome,
                          clan_dict, option)
        if not option["tsv"]:
            write_tsv_out(option["outpath"], False, (results, None))
        if option["phyloprofile"]:
            phyloprofile_out(option["outpath"], False, option["phyloprofile"],
                             [results, None])
        return

    print("calculating forward scores...")
    f_results = fc_main(domain_count, seed_proteome, query_proteome,
                        clan_dict, option)

    # The reverse run uses its own reference counts when a second reference
    # is given, otherwise the forward counts are reused.
    if option["MS_uni"] == 0 and option["ref_2"]:
        domain_count_2 = {}
        for ref_path in option["ref_2"]:
            domain_count_2.update(read_json(ref_path)["count"])
        if option["weight_correction"]:
            domain_count_2 = w_weight_correction(option["weight_correction"],
                                                 domain_count_2)
        option['ref_proteome'] = option['ref_2']
    else:
        domain_count_2 = domain_count

    # Swap the roles of seed and query in the options.
    option["reverse"] = True
    option["seed_id"], option["query_id"] = (option["query_id"],
                                             option["seed_id"])
    if option["pairwise"]:
        option["pairwise"] = [(pair[1], pair[0])
                              for pair in option["pairwise"]]

    print("calculating backward scores...")
    r_results = fc_main(domain_count_2, query_proteome, seed_proteome,
                        clan_dict, option)
    if option["phyloprofile"]:
        phyloprofile_out(option["outpath"], True, option["phyloprofile"],
                         (f_results, r_results))
    if not option['tsv']:
        write_tsv_out(option["outpath"], True, (f_results, r_results))
def main():
    """Extract the architectures of the requested ids into a new JSON file.

    Options come from ``get_options()``. The ids are obtained with
    ``get_ids(options.extract_ids)``, the annotation is read from
    ``options.input``, and the result of ``extract_architectures()`` is
    saved to ``options.output`` via ``savejson()``.
    """
    cli = get_options()
    wanted_ids = get_ids(cli.extract_ids)
    full_annotation = read_json(cli.input)
    savejson(extract_architectures(wanted_ids, full_annotation), cli.output)