예제 #1
0
def main():
    """Print the path count / approximate greedy complexity table.

    Steps, in order:

    1. Parse the command line with ``get_options``.
    2. Read the first line of ``pathconfig.txt`` (its path is derived by
       swapping the script name ``complexityFAS.py`` in this file's real
       path) and use it as the tool path.
    3. Load the feature-type definition from ``args.featuretypes`` when it
       is given, otherwise from ``annoTools.txt`` inside the tool path.
    4. Load the annotation JSON named by ``args.input`` and call
       ``calc_complex`` for every protein in ``args.id`` (or for all
       annotated proteins when no ids were given).

    Raises:
        Exception: if ``args.max_overlap_percentage`` is not within
            0.0 .. 1.0.
    """
    args = get_options()

    config_path = os.path.realpath(__file__).replace(
        'complexityFAS.py', 'pathconfig.txt')
    with open(config_path) as config:
        toolpath = config.readline().strip()

    # An explicitly given feature-type file wins over the default one.
    if args.featuretypes is None:
        featuretypes_file = toolpath + '/' + 'annoTools.txt'
    else:
        featuretypes_file = args.featuretypes
    linearized, normal = fasInput.featuretypes(featuretypes_file)

    overlap_fraction = float(args.max_overlap_percentage)
    if not 0.0 <= overlap_fraction <= 1.0:
        raise Exception(
            "[--max_overlap_percentage] should be between 0.0 and 1.0")

    option_dict = {
        'input_linearized': linearized,
        'input_normal': normal,
        "max_overlap": args.max_overlap,
        "max_overlap_percentage": overlap_fraction,
        "eFeature": args.eFeature,
        "eInstance": args.eInstance,
    }

    annotation = fasInput.read_json(args.input)
    clan_dict = {}
    clan_dict.update(annotation["clan"])
    proteome = annotation["feature"]

    # Fall back to every annotated protein when no ids were requested.
    proteins = args.id or list(proteome.keys())

    print('Protein ID\t#Paths\tapproximate greedy complexity')
    for protein in proteins:
        calc_complex(protein, proteome, option_dict, clan_dict, args)
예제 #2
0
def write_domain_file(path, idlist, outpath, tools, groupname):
    """Write a tab-separated domain file for the selected proteins.

    The ``"feature"`` section of the annotation JSON at ``path`` is read and,
    for every protein id in ``idlist``, every tool in ``tools`` and every
    feature instance, one line is written to ``outpath``::

        <groupname>#<pid> <pid> <length> <feature> <instance[0]> <instance[1]> NA N

    (fields separated by tabs).

    :param path: annotation JSON file to read
    :param idlist: protein ids to export
    :param outpath: file that is (over)written with the domain lines
    :param tools: annotation tool names whose features are exported
    :param groupname: prefix placed before ``#`` in the first column
    """
    proteome = fasInput.read_json(path)["feature"]
    with open(outpath, 'w') as out:
        for pid in idlist:
            for tool in tools:
                for feature, info in proteome[pid][tool].items():
                    for instance in info["instance"]:
                        fields = (
                            groupname + "#" + pid,
                            pid,
                            str(proteome[pid]["length"]),
                            feature,
                            str(instance[0]),
                            str(instance[1]),
                        )
                        out.write("\t".join(fields) + "\tNA\tN\n")
예제 #3
0
파일: fdogFAS.py 프로젝트: BIONF/FAS
def run_fas(data):
    """Run the FAS scoring for one taxon.

    ``data`` is a positional job record (see the list assembled in
    ``manage_jobpool``):

    * ``data[0]`` taxon name
    * ``data[1]`` option dictionary (modified in place for the reverse run)
    * ``data[2]`` seed proteome
    * ``data[3]`` seed weights, used for the reverse run
    * ``data[4]`` weight_dir
    * ``data[5]`` clan dictionary (updated in place with the taxon's clans)
    * ``data[6]`` fasta data, indexed as ``fasta[taxon][protein]``; falsy
      when missing proteins must not be annotated on the fly
    * ``data[7]`` temporary directory

    :return: tuple ``(outdata, taxon)`` where ``outdata`` maps
        ``(result[0], result[1])`` of each forward result to a pair
        ``(forward value, reverse value)``; the reverse value stays ``0.0``
        unless ``option["bidirectional"]`` is set.
    :raises Exception: if no annotation JSON exists for the taxon, or if a
        query protein is not annotated and no fasta data was supplied.
    """
    taxon = data[0]
    option = data[1]
    seed_proteome = data[2]
    seed_weight = data[3]
    weight_dir = data[4]
    clan_dict = data[5]
    fasta = data[6]
    tmp_path = data[7]
    json_name = taxon + ".json"

    # The temporary directory is tried first, weight_dir is the fallback.
    from_tmp = True
    try:
        annotation = read_json(tmp_path + "/" + json_name)
    except FileNotFoundError:
        from_tmp = False
        try:
            annotation = read_json(weight_dir + "/" + json_name)
        except FileNotFoundError:
            raise Exception('Taxon: "' + taxon +
                            '" is missing in the weight_dir')

    query_proteome = {}
    missing = []
    for pair in option['pairwise']:
        try:
            query_proteome[pair[1]] = annotation["feature"][pair[1]]
        except KeyError:
            if not fasta:
                raise Exception(
                    'The protein: "' + pair[1] + '" is missing in taxon: "' +
                    taxon + '". The annotations ' +
                    'in weight_dir should contain all proteins from the genome_dir.'
                )
            missing.append(pair[1])

    if missing:
        # Annotate the proteins that were absent, then re-read the JSON from
        # the temporary directory and pick up the new entries.
        missing_fasta = [
            '>' + pid + '\n' + fasta[taxon][pid] for pid in missing
        ]
        source_dir = tmp_path if from_tmp else weight_dir
        doAnnoForMissing(taxon, missing_fasta,
                         source_dir + "/" + json_name, tmp_path + "/",
                         1, True)
        annotation = read_json(tmp_path + "/" + json_name)
        for pid in missing:
            query_proteome[pid] = annotation["feature"][pid]

    clan_dict.update(annotation["clan"])
    weight = w_weight_correction("loge", annotation["count"])
    forward = greedyFAS.fc_main(weight, seed_proteome, query_proteome,
                                clan_dict, option)
    outdata = {(res[0], res[1]): (res[2][0], 0.0) for res in forward}

    if option["bidirectional"]:
        # Swap the roles of seed and query for the reverse direction.
        option["reverse"] = True
        option['pairwise'] = [(pair[1], pair[0])
                              for pair in option['pairwise']]
        backward = greedyFAS.fc_main(seed_weight, query_proteome,
                                     seed_proteome, clan_dict, option)
        for res in backward:
            key = (res[1], res[0])
            outdata[key] = (outdata[key][0], res[2][0])
    return outdata, taxon
예제 #4
0
파일: fdogFAS.py 프로젝트: BIONF/FAS
def manage_jobpool(jobdict, seed_names, seed_spec, weight_dir, tmp_path, cores,
                   features, bidirectional, fasta):
    """Prepare one FAS job per taxon and run them in a process pool.

    The function first verifies that an annotation JSON exists in
    ``weight_dir`` for every taxon in ``jobdict`` and for ``seed_spec``.
    Seed proteins that are absent from the seed annotation are annotated via
    ``doAnnoForMissing`` when ``fasta`` is given; the updated annotation is
    then read from ``tmp_path``. Finally one job record per taxon is handed
    to ``run_fas`` through ``multiprocessing.Pool.imap_unordered``.

    :param jobdict: maps taxon name to the value stored as the job's
        ``"pairwise"`` option
    :param seed_names: seed protein ids that must be annotated
    :param seed_spec: taxon name of the seed
    :param weight_dir: directory holding ``<taxon>.json`` annotations
    :param tmp_path: directory for temporary annotations and job output
    :param cores: number of worker processes (also passed to
        ``doAnnoForMissing``)
    :param features: pair whose items become ``"input_linearized"`` and
        ``"input_normal"``
    :param bidirectional: value of the ``"bidirectional"`` job option
    :param fasta: fasta data indexed as ``fasta[taxon][protein]``; falsy to
        disable on-the-fly annotation of missing proteins
    :return: list of ``run_fas`` results, in completion order
    :raises Exception: if annotations for taxa are missing in ``weight_dir``
        or a seed protein is not annotated and ``fasta`` is falsy
    """
    missing = []
    for spec in jobdict:
        if not os.path.exists(weight_dir + "/" + spec + ".json"):
            missing.append(spec)
    try:
        tmp_data = read_json(weight_dir + "/" + seed_spec + ".json")
    except FileNotFoundError:
        missing.append(seed_spec)
    if missing:
        raise Exception('The following taxa are missing in the weight_dir:\n' +
                        '\n'.join(missing))
    seed_weight = w_weight_correction("loge", tmp_data["count"])
    seed_proteome = tmp_data["feature"]
    missing = []
    for seed_name in seed_names:
        if seed_name not in seed_proteome:
            if fasta:
                missing.append(seed_name)
            else:
                raise Exception(
                    'The protein: "' + seed_name + '" is missing in taxon: "' +
                    seed_spec +
                    '". The annotations in weight_dir should contain all proteins from the genome_dir.'
                )
    if missing:
        missinseq = []
        for i in missing:
            missinseq.append('>' + i + '\n' + fasta[seed_spec][i])
        doAnnoForMissing(seed_spec, missinseq,
                         weight_dir + "/" + seed_spec + ".json",
                         tmp_path + "/", cores, True)
        # The completed seed annotation is written to tmp_path; reload it.
        tmp_data = read_json(tmp_path + "/" + seed_spec + ".json")
        seed_weight = w_weight_correction("loge", tmp_data["count"])
        seed_proteome = tmp_data["feature"]
    clan_dict = tmp_data["clan"]
    data = []
    for spec in jobdict:
        # Positional layout consumed by run_fas: taxon, options, seed
        # proteome, seed weights, weight_dir, clan dict, fasta, tmp_path.
        data.append([
            spec, {
                "weight_const": False,
                "seed_id": None,
                "query_id": None,
                "priority_mode": True,
                "priority_threshold": 30,
                "max_cardinality": 500,
                "eFeature": 0.001,
                "cores": 1,
                "eInstance": 0.01,
                "e_output": True,
                "feature_info": None,
                "bidirectional": bidirectional,
                "raw": False,
                "silent": False,
                "reverse": False,
                "max_overlap": 0,
                "classicMS": False,
                "timelimit": 0,
                "ref_2": None,
                "phyloprofile": None,
                "score_weights": (0.7, 0.0, 0.3),
                "output": 0,
                "max_overlap_percentage": 0.0,
                "domain": True,
                "pairwise": jobdict[spec],
                "weight_correction": "loge",
                "outpath": tmp_path + "/" + spec,
                "input_linearized": features[0],
                "input_normal": features[1],
                "MS_uni": 0,
                "ref_proteome": [spec + '.json'],
                "progress": False
            }, seed_proteome, seed_weight, weight_dir, clan_dict, fasta,
            tmp_path
        ])
    jobpool = multiprocessing.Pool(processes=cores)
    try:
        results = list(
            tqdm(jobpool.imap_unordered(run_fas, data), total=len(jobdict)))
        jobpool.close()
    except BaseException:
        # A failing job (or an interrupt) must not leave worker processes
        # behind: stop them before the error propagates.
        jobpool.terminate()
        raise
    finally:
        # join() is valid after either close() or terminate().
        jobpool.join()
    return results
예제 #5
0
def fc_start(option):
    """Overhead function: load the inputs, validate them and start scoring.

    Reads the reference counts (only when ``option["MS_uni"]`` is 0), the
    seed annotations (``option["p_path"]``) and the query annotations
    (``option["s_path"]``) with ``read_json``, checks that the requested
    tools and protein ids are present, and then calls ``fc_main`` once
    (forward only) or twice (forward and reverse, when
    ``option["bidirectional"]`` is set). Results are passed to
    ``write_tsv_out`` unless ``option["tsv"]`` is truthy, and to
    ``phyloprofile_out`` when ``option["phyloprofile"]`` is set.

    ``option`` is modified in place: ``"reverse"`` is set, and for the
    reverse run ``"seed_id"``/``"query_id"`` and the members of each
    ``"pairwise"`` entry are swapped; ``"ref_proteome"`` is replaced by
    ``"ref_2"`` when a second reference is used.

    :param option: dictionary that contains the main option variables of FAS
    :raises Exception: if a tool from ``option["input_linearized"]`` is
        absent from the first seed or query protein, or if a requested seed
        or query id is not annotated
    """
    clan_dict = {}

    def load_annotations(paths):
        # Merge the "feature" sections of all files; clan information is
        # collected into the shared clan_dict along the way.
        feature_dicts = []
        for path in paths:
            annotation = read_json(path)
            feature_dicts.append(annotation["feature"])
            clan_dict.update(annotation["clan"])
        return mergeNestedDic(feature_dicts)

    option["reverse"] = False
    domain_count = {}
    # Reference counts are only read when MS_uni is 0.
    if option["MS_uni"] == 0:
        for path in option["ref_proteome"]:
            domain_count.update(read_json(path)["count"])

    seed_proteome = load_annotations(option["p_path"])
    query_proteome = load_annotations(option["s_path"])

    if option["weight_correction"]:
        domain_count = w_weight_correction(option["weight_correction"], domain_count)

    # Tool presence is checked against the first protein of each proteome.
    for tool in option["input_linearized"]:
        first_seed = seed_proteome[list(seed_proteome)[0]]
        if tool not in first_seed:
            raise Exception(tool + " is missing in the seed annotation")
        first_query = query_proteome[list(query_proteome)[0]]
        if tool not in first_query:
            raise Exception(tool + " is missing in the query annotation")

    id_checks = (
        ("seed_id", seed_proteome, " is not in the seed annotation"),
        ("query_id", query_proteome, " is not in the query annotation"),
    )
    for key, proteome, message in id_checks:
        if option[key]:
            for protid in option[key]:
                if protid not in proteome:
                    raise Exception(protid + message)

    if not option["bidirectional"]:
        print("calculating forward scores...")
        results = fc_main(domain_count, seed_proteome, query_proteome, clan_dict, option)
        if not option["tsv"]:
            write_tsv_out(option["outpath"], False, (results, None))
        if option["phyloprofile"]:
            phyloprofile_out(option["outpath"], False, option["phyloprofile"], [results, None])
        return

    print("calculating forward scores...")
    forward = fc_main(domain_count, seed_proteome, query_proteome, clan_dict, option)

    # The reverse run may use its own reference counts (ref_2).
    if option["MS_uni"] == 0 and option["ref_2"]:
        reverse_count = {}
        for path in option["ref_2"]:
            reverse_count.update(read_json(path)["count"])
        if option["weight_correction"]:
            reverse_count = w_weight_correction(option["weight_correction"], reverse_count)
        option['ref_proteome'] = option['ref_2']
    else:
        reverse_count = domain_count

    # Swap seed and query roles for the reverse direction.
    option["reverse"] = True
    option["seed_id"], option["query_id"] = option["query_id"], option["seed_id"]
    if option["pairwise"]:
        option["pairwise"] = [(pair[1], pair[0]) for pair in option["pairwise"]]

    print("calculating backward scores...")
    backward = fc_main(reverse_count, query_proteome, seed_proteome, clan_dict, option)
    if option["phyloprofile"]:
        phyloprofile_out(option["outpath"], True, option["phyloprofile"], (forward, backward))
    if not option['tsv']:
        write_tsv_out(option["outpath"], True, (forward, backward))
예제 #6
0
def main():
    """Extract the architectures of selected ids and save them as JSON.

    Reads the ids via ``get_ids(options.extract_ids)`` and the annotation via
    ``read_json(options.input)``, passes both to ``extract_architectures``
    and stores the result at ``options.output`` with ``savejson``.
    """
    options = get_options()
    wanted_ids = get_ids(options.extract_ids)
    full_annotation = read_json(options.input)
    savejson(extract_architectures(wanted_ids, full_annotation),
             options.output)