예제 #1
0
def main(args):
    """Read a gene list, optionally map it to Entrez IDs, and write it out.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``in_genes_path``, ``from_symbols``, ``net_path`` and
            ``out_genes_path``.

    When ``args.from_symbols`` is truthy the input genes are translated with
    ``utils.gm.symb2entrez`` and each ``symbol -> id`` pair is printed. When
    ``args.net_path`` is not ``None`` only genes contained in that network
    are kept.
    """
    log_handlers = [
        logging.FileHandler("../logs/report.log"),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=log_handlers)
    logging.info(args)

    input_genes = utils.read_gene_list(args.in_genes_path)

    # Without translation the input list is passed through unchanged.
    mapped = input_genes
    if args.from_symbols:
        mapper = utils.gm
        mapper.enable_agreement_check()
        mapped = mapper.symb2entrez(input_genes)
        for symbol, entrez_id in zip(input_genes, mapped):
            print('{} -> {}'.format(symbol, entrez_id))

        if len(mapper.errors):
            logging.info('Num. of original genes: {}'.format(
                len(input_genes)))
            if len(mapper.get_failed_queries()) > 0:
                logging.error(
                    'Num. of gene names that could not be translated: {}'.
                    format(len(mapper.get_failed_queries())))
            print(mapper.errors)

    if args.net_path is not None:
        net = utils.read_network(args.net_path)
        in_network = []
        for gene in mapped:
            # -1 entries are dropped as well; presumably the mapper's marker
            # for an untranslated symbol -- TODO confirm against utils.gm.
            if gene in net and gene != -1:
                in_network.append(gene)
        mapped = in_network
        logging.info('Num. of mapped genes: {}'.format(len(mapped)))

    utils.write_gene_list(args.out_genes_path, mapped)
def main(args):
    """Choose the candidate-list cutoff with the smallest overlap score.

    For every prefix of the candidate list, ``utils.fisher_overlap_set`` is
    evaluated against the external gene set, using all network nodes as the
    third argument. The prefix with the smallest value is appended to the
    seeds and written out as the module.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``in_seed_path``, ``in_candidate_path``, ``net_path``,
            ``in_extgenes_path``, ``out_pvals_path``, ``out_module_path``
            and ``plot``.

    Raises:
        ValueError: If the candidate list is empty, since no cutoff can be
            selected.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[
            logging.FileHandler("../logs/report.log"),
            logging.StreamHandler()
        ])
    logging.info(args)

    seeds = utils.read_gene_list(args.in_seed_path)
    candidates = utils.read_gene_list(args.in_candidate_path)
    net = utils.read_network(args.net_path)

    # Set membership avoids a linear scan of the seed list per external gene.
    seed_set = set(seeds)
    ext = [
        gene for gene in utils.read_gene_list(args.in_extgenes_path)
        if gene not in seed_set and gene in net
    ]

    if len(candidates) == 0:
        # np.argmin would raise the same exception type further down, but
        # with a message that does not point at the actual cause.
        raise ValueError(
            'Candidate gene list is empty; cannot select a cutoff')

    # The node list does not change between iterations, so build it once.
    background = list(net.nodes())

    pvals = np.zeros(len(candidates))
    for cutoff in range(1, len(candidates) + 1):
        pvals[cutoff - 1] = utils.fisher_overlap_set(candidates[:cutoff], ext,
                                                     background)

    # i_min is a zero-based index, so the selected prefix has i_min + 1 genes.
    i_min = np.argmin(pvals)
    module = seeds + candidates[:i_min + 1]

    np.savetxt(args.out_pvals_path, pvals)
    utils.write_gene_list(args.out_module_path, module)

    if args.plot:
        # Imported lazily so matplotlib is only needed when plotting.
        import matplotlib.pyplot as plt
        plt.semilogy(pvals)
        plt.title("Cutoff: {}, size: {}".format(i_min, len(module)))
        plt.show()
def main(args):
    """Filter gene-disease associations and export disease/DOID tables.

    Steps:
        1. Keep only associations whose ``geneId`` is a node of the network.
        2. Build one row per ``diseaseId`` holding the first-seen descriptive
           fields and the number of associated rows (``n_genes``).
        3. Keep diseases with ``min_disease_size < n_genes <
           max_disease_size`` (both bounds exclusive), ``diseaseType ==
           'disease'`` and ``diseaseSemanticType == 'Disease or Syndrome'``.
        4. Attach the 'DO' vocabulary codes of each disease as a
           ``|``-joined ``DOID:<code>`` string.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``in_gda_path``, ``in_dismap_path``, ``in_net_path``,
            ``min_disease_size``, ``max_disease_size``, ``out_gda_file``,
            ``out_disease_file`` and ``out_doid_file``.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[
            logging.FileHandler("../logs/report.log"),
            logging.StreamHandler()
        ])
    logging.info(args)

    gda = pd.read_csv(args.in_gda_path, sep='\t')
    disease_maps = pd.read_csv(args.in_dismap_path, sep='|')
    net = utils.read_network(args.in_net_path)

    gda = gda[gda.geneId.isin(net.nodes())]
    diseases = gda.groupby('diseaseId').apply(lambda x: pd.Series(
        {
            'diseaseName': x.diseaseName.tolist()[0],
            'diseaseType': x.diseaseType.tolist()[0],
            'diseaseClass': x.diseaseClass.tolist()[0],
            'diseaseSemanticType': x.diseaseSemanticType.tolist()[0],
            'n_genes': x.shape[0]
        })).reset_index()
    print(diseases.shape)
    # .copy() makes the filtered frame independent, so adding the DOID column
    # below does not trigger pandas' SettingWithCopyWarning.
    diseases = diseases[(diseases.n_genes > args.min_disease_size)
                        & (diseases.n_genes < args.max_disease_size) &
                        (diseases.diseaseType == 'disease') &
                        (diseases.diseaseSemanticType
                         == 'Disease or Syndrome')].copy()
    print(diseases.shape)

    # Select the 'DO' vocabulary rows once instead of re-evaluating the mask
    # over the whole mapping table for every disease.
    do_maps = disease_maps[disease_maps.vocabulary == 'DO']

    def get_doids(diseaseId):
        """Return the '|'-joined DOID strings mapped to ``diseaseId``."""
        codes = do_maps[do_maps.diseaseId == diseaseId].code.tolist()
        return '|'.join('DOID:{}'.format(doid) for doid in codes)

    diseases['DOID'] = diseases.diseaseId.apply(get_doids)

    # Unique DOIDs across all retained diseases; sorted so the output file is
    # identical from run to run (plain set iteration order is not).
    selected_maps = do_maps[do_maps.diseaseId.isin(diseases.diseaseId)]
    doids = sorted(
        {'DOID:{}'.format(doid) for doid in selected_maps.code.tolist()})

    gda.to_csv(args.out_gda_file, sep='\t', index=False)
    diseases.to_csv(args.out_disease_file, sep='\t', index=False)
    utils.write_text(args.out_doid_file, doids)
예제 #4
0
def main(args):
    """Run DIAMOnD from a seed gene list and write the returned genes.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``in_genes_path``, ``net_path``, ``N``, ``alpha`` and
            ``out_diamond_path``.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[
            logging.FileHandler("../logs/report.log"),
            logging.StreamHandler(),
        ])
    logging.info(args)

    seed_genes = utils.read_gene_list(args.in_genes_path)
    network = utils.read_network(args.net_path)

    ranked = DIAMOnD.DIAMOnD(network, seed_genes, args.N, args.alpha)

    # Only the first element of each returned entry is written out.
    added_genes = []
    for entry in ranked:
        added_genes.append(entry[0])

    utils.write_gene_list(args.out_diamond_path, added_genes)
def main(args):
    """Grow a disease's seed genes with DIAMOnD up to a target module size.

    The seeds are the ``geneId`` values of the association rows whose
    ``diseaseId`` equals ``args.disease_id``. DIAMOnD is asked for
    ``args.module_size - len(seeds)`` additional genes, and the seeds
    followed by those genes are written out.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``in_gda_path``, ``in_net_path``, ``disease_id``,
            ``module_size``, ``alpha`` and ``out_module_file``.
    """
    log_handlers = [
        logging.FileHandler("../logs/report.log"),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=log_handlers)
    logging.info(args)

    gda = pd.read_csv(args.in_gda_path, sep='\t')
    net = utils.read_network(args.in_net_path)

    disease_rows = gda[gda.diseaseId == args.disease_id]
    seeds = disease_rows.geneId.tolist()

    n_iterations = args.module_size - len(seeds)
    logging.info("Seed size: {}, DIAMOND iterations: {}".format(
        len(seeds), n_iterations))

    ranked = DIAMOnD.DIAMOnD(net, seeds, n_iterations, args.alpha)
    # Only the first element of each returned entry is kept.
    diamond_genes = []
    for entry in ranked:
        diamond_genes.append(entry[0])

    utils.write_gene_list(args.out_module_file, seeds + diamond_genes)
예제 #6
0
# Module-level setup: finish argument parsing, configure logging, seed the
# RNG and load every input. The names bound here (args, net, expr, ...) are
# module globals; get_abscorr below reads `args` directly.
# NOTE(review): `parser` is created above this point, outside this excerpt.
parser.add_argument('--seed', type=int, default=100, help='Random seed')
args = parser.parse_args()

# Log to both ../logs/report.log and the console stream.
logging.basicConfig(level=logging.INFO,
                    format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
                    handlers=[
                        logging.FileHandler("../logs/report.log"),
                        logging.StreamHandler()
                    ])
logging.info(args)
# Seed Python's `random` module so random draws are repeatable for a given
# --seed value.
random.seed(args.seed)
srcnodes = utils.read_gene_list(args.srcnodes_file)
gda = pd.read_csv(args.gda_file, sep='\t')
expr = utils.read_expr(args.expr_file)
gsm = utils.read_text(args.gsm_file)
net = utils.read_network(args.net_path)

# Restrict the expression data to the entries named in the gsm file.
# Presumably this selects sample columns of a DataFrame -- TODO confirm
# against utils.read_expr.
expr = expr[gsm]
# Destination nodes: the distinct geneId values associated with the
# requested disease.
destnodes = gda[gda.diseaseId == args.disease_id].geneId.unique().tolist()

logging.info("Number of nodes: {}".format(len(destnodes)))


def get_abscorr(i, j, corrdata):
    """Return the largest absolute correlation between entries ``i`` and ``j``.

    The rows of ``corrdata`` labelled ``i`` and ``j`` are correlated with
    each other (Spearman when the module-level ``args.spearman`` is truthy,
    Pearson otherwise). The absolute correlation matrix is then reduced with
    a maximum per 'ENTREZ_GENE_ID' group along both axes, and the
    off-diagonal element ``[0, 1]`` is returned.

    NOTE(review): the grouping suggests one label may select several rows
    (e.g. several probes of one gene) and that 'ENTREZ_GENE_ID' is the index
    name of ``corrdata`` -- confirm against utils.read_expr.
    """
    if args.spearman:
        method = 'spearman'
    else:
        method = 'pearson'

    pair_rows = corrdata.loc[[i, j]]
    abs_corr = pair_rows.T.corr(method).abs()

    # Collapse the rows of each gene, then the columns, keeping the maximum.
    row_max = abs_corr.groupby('ENTREZ_GENE_ID').apply(
        lambda grp: grp.max())
    gene_max = row_max.T.groupby('ENTREZ_GENE_ID').apply(
        lambda grp: grp.max())
    return gene_max.values[0, 1]


def get_seq_corr(path):
예제 #7
0
def _random_simple_walk(net, nodes, length, max_tries=100):
    """Draw a random walk of ``length`` distinct nodes in ``net``.

    Each attempt starts at a node drawn uniformly from ``nodes`` and extends
    one step at a time to a uniformly drawn neighbour that is not yet on the
    walk. An attempt that reaches a node with no unvisited neighbour is
    discarded and a new one is started.

    Args:
        net: Graph supporting ``net[node]`` neighbour lookup.
        nodes: Sequence of all nodes to draw the start node from.
        length: Number of nodes the walk must contain.
        max_tries: Number of dead-end attempts tolerated before giving up.

    Returns:
        A list of ``length`` distinct nodes.

    Raises:
        RuntimeError: If every one of the ``max_tries`` attempts dead-ends.
    """
    for _ in range(max_tries):
        walk = [random.choice(nodes)]
        while len(walk) < length:
            unvisited = [gene for gene in net[walk[-1]] if gene not in walk]
            if not unvisited:
                break
            walk.append(random.choice(unvisited))
        if len(walk) == length:
            return walk
    # An explicit raise instead of `assert`: asserts are stripped under -O,
    # which would let a truncated walk slip into the output.
    raise RuntimeError(
        'Could not sample a random path of length {} in {} tries'.format(
            length, max_tries))


def main(args):
    """Enumerate shortest paths, keep the FC-node ones, and sample controls.

    Outputs written through ``utils.write_paths``:
        * all shortest paths between every source/destination pair;
        * "FC paths": those with at least one intermediate node whose
          intermediate nodes all pass the ``FCS`` / ``N_paths`` thresholds;
        * random set A: ``N_samples`` random walks over distinct nodes, each
          matching the length of a randomly chosen FC path;
        * random set B: ``N_samples`` paths sampled without replacement from
          all shortest paths.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``seed``, ``net_path``, ``srcnodes_file``, ``destnodes_file``,
            ``flows_file``, ``fc_thresh``, ``npath_thresh``, ``N_samples``,
            ``out_allpaths``, ``out_fcpaths``, ``out_rdmpaths_A`` and
            ``out_rdmpaths_B``.

    Raises:
        RuntimeError: If a random walk for set A cannot be completed.
        IndexError: From ``random.choice`` if no FC path exists while
            ``N_samples`` is positive.
        ValueError: From ``random.sample`` if ``N_samples`` exceeds the
            number of shortest paths.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[
            logging.FileHandler("../logs/report.log"),
            logging.StreamHandler()
        ])
    logging.info(args)
    random.seed(args.seed)
    net = utils.read_network(args.net_path)
    srcnodes = utils.read_gene_list(args.srcnodes_file)
    destnodes = utils.read_gene_list(args.destnodes_file)

    flows = utils.read_flows(args.flows_file)
    fcnodes = flows[(flows.FCS >= args.fc_thresh)
                    & (flows.N_paths >= args.npath_thresh)].index.tolist()
    # Set for O(1) membership tests in the path filter below.
    fc_node_set = set(fcnodes)

    logging.info('Num of FC nodes: {}'.format(len(fcnodes)))

    all_paths = []
    for src_gene in tqdm(srcnodes):
        for dest_gene in tqdm(destnodes, leave=False):
            all_paths.extend(nx.all_shortest_paths(net, src_gene, dest_gene))

    logging.info('Num of all paths: {}'.format(len(all_paths)))

    # Paths of two nodes have no intermediate node and are never FC paths.
    fc_paths = [
        fullpath for fullpath in tqdm(all_paths)
        if len(fullpath) > 2
        and all(node in fc_node_set for node in fullpath[1:-1])
    ]

    logging.info('Num of FC paths: {}'.format(len(fc_paths)))

    # The node list is invariant across samples and retries; build it once.
    all_nodes = list(net.nodes())
    rdm_paths_A = []
    for _ in trange(args.N_samples):
        template = random.choice(fc_paths)
        rdm_paths_A.append(
            _random_simple_walk(net, all_nodes, len(template)))

    rdm_paths_B = random.sample(all_paths, args.N_samples)

    utils.write_paths(args.out_allpaths, all_paths)
    utils.write_paths(args.out_fcpaths, fc_paths)
    utils.write_paths(args.out_rdmpaths_A, rdm_paths_A)
    utils.write_paths(args.out_rdmpaths_B, rdm_paths_B)