def main(args):
    """Optionally translate a gene list, filter it by a network, and save it.

    Reads the genes at ``args.in_genes_path``. When ``args.from_symbols`` is
    set, the list is passed through ``utils.gm.symb2entrez`` and every
    ``input -> translated`` pair is printed; otherwise the list is used as
    read. When ``args.net_path`` is given, only entries that are contained in
    that network and differ from ``-1`` are kept. The final list is written
    to ``args.out_genes_path``.

    Args:
        args: Parsed command-line namespace providing ``in_genes_path``,
            ``from_symbols``, ``net_path`` and ``out_genes_path``.
    """
    log_handlers = [
        logging.FileHandler("../logs/report.log"),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=log_handlers,
    )
    logging.info(args)

    genes = utils.read_gene_list(args.in_genes_path)

    if not args.from_symbols:
        # Input is used as-is, no translation step.
        entrez = genes
    else:
        utils.gm.enable_agreement_check()
        entrez = utils.gm.symb2entrez(genes)
        for symbol, entrez_id in zip(genes, entrez):
            print('{} -> {}'.format(symbol, entrez_id))

        # NOTE(review): the nesting of the statements below was reconstructed
        # from a whitespace-collapsed source -- confirm it matches the
        # intended reporting logic.
        if len(utils.gm.errors):
            n_original = len(genes)
            logging.info('Num. of original genes: {}'.format(n_original))
            if len(utils.gm.get_failed_queries()) > 0:
                n_failed = len(utils.gm.get_failed_queries())
                logging.error(
                    'Num. of gene names that could not be translated: {}'.
                    format(n_failed))
            print(utils.gm.errors)

    if args.net_path is not None:
        net = utils.read_network(args.net_path)
        kept = []
        for gene in entrez:
            # presumably -1 marks an entry that could not be translated --
            # verify against utils.gm.symb2entrez
            if gene in net and gene != -1:
                kept.append(gene)
        entrez = kept

    logging.info('Num. of mapped genes: {}'.format(len(entrez)))
    utils.write_gene_list(args.out_genes_path, entrez)
def main(args):
    """Pick the candidate-list cutoff with the strongest external-gene overlap.

    For every prefix ``candidates[:k]`` (k = 1 .. len(candidates)) the value
    of ``utils.fisher_overlap_set(prefix, ext, network_nodes)`` is stored.
    The module is the seed list plus the candidate prefix with the smallest
    stored value. All values are saved to ``args.out_pvals_path`` and the
    module to ``args.out_module_path``. With ``args.plot`` set, the values
    are also shown on a log-scaled plot.

    ``ext`` is the gene list at ``args.in_extgenes_path`` restricted to genes
    that are in the network and are not seeds.

    Args:
        args: Parsed command-line namespace providing ``in_seed_path``,
            ``in_candidate_path``, ``net_path``, ``in_extgenes_path``,
            ``out_pvals_path``, ``out_module_path`` and ``plot``.

    Raises:
        ValueError: If the candidate list is empty (no cutoff can be chosen).
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[
            logging.FileHandler("../logs/report.log"),
            logging.StreamHandler(),
        ])
    logging.info(args)

    seeds = utils.read_gene_list(args.in_seed_path)
    candidates = utils.read_gene_list(args.in_candidate_path)
    if len(candidates) == 0:
        # np.argmin on an empty array raises ValueError with an opaque
        # message; fail early with the same exception type and a clear cause.
        raise ValueError(
            'Candidate gene list is empty: {}'.format(args.in_candidate_path))

    net = utils.read_network(args.net_path)

    # Set lookup instead of scanning the seed list for every external gene.
    seed_lookup = set(seeds)
    ext = [
        gene for gene in utils.read_gene_list(args.in_extgenes_path)
        if gene not in seed_lookup and gene in net
    ]

    # The background node list does not change between prefixes, so build it
    # once (assumes fisher_overlap_set does not mutate its arguments).
    background = list(net.nodes())

    pvals = np.zeros(len(candidates))
    for i in range(len(candidates)):
        pvals[i] = utils.fisher_overlap_set(candidates[:i + 1], ext,
                                            background)

    i_min = np.argmin(pvals)
    module = seeds + candidates[:i_min + 1]

    np.savetxt(args.out_pvals_path, pvals)
    utils.write_gene_list(args.out_module_path, module)

    if args.plot:
        # Imported lazily so matplotlib is only required when plotting.
        import matplotlib.pyplot as plt
        plt.semilogy(pvals)
        plt.title("Cutoff: {}, size: {}".format(i_min, len(module)))
        plt.show()
def main(args):
    """Build a filtered disease table with DO identifiers from a GDA file.

    Steps:
      1. Read the tab-separated gene-disease associations
         (``args.in_gda_path``), the ``|``-separated disease mapping table
         (``args.in_dismap_path``) and the network (``args.in_net_path``).
      2. Keep only associations whose ``geneId`` is a network node.
      3. Summarise every ``diseaseId`` by the first value of its descriptive
         columns and its number of association rows (``n_genes``).
      4. Keep diseases with ``n_genes`` strictly between
         ``args.min_disease_size`` and ``args.max_disease_size`` whose
         ``diseaseType`` is ``'disease'`` and ``diseaseSemanticType`` is
         ``'Disease or Syndrome'``.
      5. Attach a ``DOID`` column: the ``|``-joined ``DOID:<code>`` entries
         of the mapping rows with vocabulary ``'DO'`` for that disease
         (empty string when there are none).
      6. Write the network-filtered associations, the disease table and the
         sorted unique DOID list to ``args.out_gda_file``,
         ``args.out_disease_file`` and ``args.out_doid_file``.

    Args:
        args: Parsed command-line namespace with the paths and size limits
            named above.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[
            logging.FileHandler("../logs/report.log"),
            logging.StreamHandler(),
        ])
    logging.info(args)

    gda = pd.read_csv(args.in_gda_path, sep='\t')
    disease_maps = pd.read_csv(args.in_dismap_path, sep='|')
    net = utils.read_network(args.in_net_path)

    gda = gda[gda.geneId.isin(net.nodes())]

    described_by = ('diseaseName', 'diseaseType', 'diseaseClass',
                    'diseaseSemanticType')

    def summarize(group):
        # One row per disease: first descriptive values plus the row count.
        summary = {column: group[column].iloc[0] for column in described_by}
        summary['n_genes'] = group.shape[0]
        return pd.Series(summary)

    diseases = gda.groupby('diseaseId').apply(summarize).reset_index()
    print(diseases.shape)

    keep = ((diseases.n_genes > args.min_disease_size) &
            (diseases.n_genes < args.max_disease_size) &
            (diseases.diseaseType == 'disease') &
            (diseases.diseaseSemanticType == 'Disease or Syndrome'))
    # Explicit copy: a column is added below, and assigning into a filtered
    # slice triggers pandas' SettingWithCopy warning.
    diseases = diseases[keep].copy()
    print(diseases.shape)

    # Filter the mapping table to the DO vocabulary once and index it by
    # disease, instead of re-scanning the whole table for every disease.
    do_maps = disease_maps[disease_maps.vocabulary == 'DO']
    doids_by_disease = {
        disease_id: ['DOID:{}'.format(code) for code in codes.tolist()]
        for disease_id, codes in do_maps.groupby('diseaseId',
                                                 sort=False).code
    }

    def get_doids(diseaseId):
        return '|'.join(doids_by_disease.get(diseaseId, []))

    diseases['DOID'] = diseases.diseaseId.apply(get_doids)

    selected_maps = do_maps[do_maps.diseaseId.isin(diseases.diseaseId)]
    # Sorted so the output file is identical between runs (plain set
    # iteration order of strings is not stable across interpreter runs).
    doids = sorted(
        {'DOID:{}'.format(code) for code in selected_maps.code.tolist()})

    gda.to_csv(args.out_gda_file, sep='\t', index=False)
    diseases.to_csv(args.out_disease_file, sep='\t', index=False)
    utils.write_text(args.out_doid_file, doids)
def main(args):
    """Run DIAMOnD on a gene list and save the resulting genes.

    The genes at ``args.in_genes_path`` and the network at ``args.net_path``
    are passed to ``DIAMOnD.DIAMOnD`` together with ``args.N`` and
    ``args.alpha``. The first element of every returned entry is written, in
    the returned order, to ``args.out_diamond_path``.

    Args:
        args: Parsed command-line namespace providing ``in_genes_path``,
            ``net_path``, ``N``, ``alpha`` and ``out_diamond_path``.
    """
    log_handlers = [
        logging.FileHandler("../logs/report.log"),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=log_handlers,
    )
    logging.info(args)

    seed_genes = utils.read_gene_list(args.in_genes_path)
    network = utils.read_network(args.net_path)

    ranked = DIAMOnD.DIAMOnD(network, seed_genes, args.N, args.alpha)

    # Only the first field of each entry is kept.
    predicted = []
    for entry in ranked:
        predicted.append(entry[0])

    utils.write_gene_list(args.out_diamond_path, predicted)
def main(args):
    """Grow a disease module of a requested size with DIAMOnD.

    The seeds are the ``geneId`` values of the rows in the tab-separated file
    ``args.in_gda_path`` whose ``diseaseId`` equals ``args.disease_id``.
    ``DIAMOnD.DIAMOnD`` is asked for ``args.module_size - len(seeds)``
    iterations on the network at ``args.in_net_path``. The seeds followed by
    the first element of each returned entry are written to
    ``args.out_module_file``.

    Args:
        args: Parsed command-line namespace providing ``in_gda_path``,
            ``in_net_path``, ``disease_id``, ``module_size``, ``alpha`` and
            ``out_module_file``.
    """
    log_handlers = [
        logging.FileHandler("../logs/report.log"),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=log_handlers,
    )
    logging.info(args)

    gda = pd.read_csv(args.in_gda_path, sep='\t')
    net = utils.read_network(args.in_net_path)

    disease_rows = gda[gda.diseaseId == args.disease_id]
    seeds = disease_rows.geneId.tolist()
    n_seeds = len(seeds)

    logging.info("Seed size: {}, DIAMOND iterations: {}".format(
        n_seeds, args.module_size - n_seeds))

    ranked = DIAMOnD.DIAMOnD(net, seeds, args.module_size - n_seeds,
                             args.alpha)
    diamond_genes = []
    for entry in ranked:
        # Only the first field of each entry is kept.
        diamond_genes.append(entry[0])

    utils.write_gene_list(args.out_module_file, seeds + diamond_genes)
parser.add_argument('--seed', type=int, default=100, help='Random seed') args = parser.parse_args() logging.basicConfig(level=logging.INFO, format='%(module)s:%(levelname)s:%(asctime)s:%(message)s', handlers=[ logging.FileHandler("../logs/report.log"), logging.StreamHandler() ]) logging.info(args) random.seed(args.seed) srcnodes = utils.read_gene_list(args.srcnodes_file) gda = pd.read_csv(args.gda_file, sep='\t') expr = utils.read_expr(args.expr_file) gsm = utils.read_text(args.gsm_file) net = utils.read_network(args.net_path) expr = expr[gsm] destnodes = gda[gda.diseaseId == args.disease_id].geneId.unique().tolist() logging.info("Number of nodes: {}".format(len(destnodes))) def get_abscorr(i, j, corrdata): method = 'pearson' if not args.spearman else 'spearman' corrmatr = corrdata.loc[[i, j]].T.corr(method) return corrmatr.abs().groupby('ENTREZ_GENE_ID').apply(lambda x: x.max( )).T.groupby('ENTREZ_GENE_ID').apply(lambda x: x.max()).values[0, 1] def get_seq_corr(path):
def main(args):
    """Collect shortest paths, select flow-central ones and sample controls.

    Steps:
      1. Seed ``random`` with ``args.seed`` and read the network, source
         nodes, destination nodes and flow table.
      2. FC nodes are the index entries of the flow table whose ``FCS`` is at
         least ``args.fc_thresh`` and whose ``N_paths`` is at least
         ``args.npath_thresh``.
      3. ``all_paths``: every shortest path (``nx.all_shortest_paths``)
         between each source/destination pair.
      4. ``fc_paths``: paths with more than two nodes whose interior nodes
         are all FC nodes.
      5. ``rdm_paths_A``: ``args.N_samples`` random walks without repeated
         nodes, each with the same length as a randomly drawn FC path. A walk
         that reaches a node with no unvisited neighbour is restarted from a
         new random node, up to 100 times.
      6. ``rdm_paths_B``: ``args.N_samples`` paths sampled without
         replacement from ``all_paths``.
      7. The four path collections are written with ``utils.write_paths``.

    Args:
        args: Parsed command-line namespace providing ``seed``, ``net_path``,
            ``srcnodes_file``, ``destnodes_file``, ``flows_file``,
            ``fc_thresh``, ``npath_thresh``, ``N_samples``, ``out_allpaths``,
            ``out_fcpaths``, ``out_rdmpaths_A`` and ``out_rdmpaths_B``.

    Raises:
        AssertionError: If a random walk of the required length cannot be
            built within the retry limit.
    """
    max_tries = 100

    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[
            logging.FileHandler("../logs/report.log"),
            logging.StreamHandler(),
        ])
    logging.info(args)
    random.seed(args.seed)

    net = utils.read_network(args.net_path)
    srcnodes = utils.read_gene_list(args.srcnodes_file)
    destnodes = utils.read_gene_list(args.destnodes_file)
    flows = utils.read_flows(args.flows_file)

    fcnodes = flows[(flows.FCS >= args.fc_thresh) &
                    (flows.N_paths >= args.npath_thresh)].index.tolist()
    logging.info('Num of FC nodes: {}'.format(len(fcnodes)))
    # Set for O(1) membership tests; the list above is kept for the log count.
    fcnode_lookup = set(fcnodes)

    all_paths = []
    for src_gene in tqdm(srcnodes):
        for dest_gene in tqdm(destnodes, leave=False):
            all_paths.extend(nx.all_shortest_paths(net, src_gene, dest_gene))
    logging.info('Num of all paths: {}'.format(len(all_paths)))

    # A path qualifies when it has interior nodes and all of them are FC nodes.
    fc_paths = [
        fullpath for fullpath in tqdm(all_paths)
        if len(fullpath) > 2 and all(
            node in fcnode_lookup for node in fullpath[1:-1])
    ]
    logging.info('Num of FC paths: {}'.format(len(fc_paths)))

    # The node list is invariant, so build it once instead of once per retry.
    # The sequence of random draws is unchanged.
    all_nodes = list(net.nodes())

    rdm_paths_A = []
    for _ in trange(args.N_samples):
        target_len = len(random.choice(fc_paths))
        tries = 0
        newpath = []
        while tries < max_tries:
            newpath = [random.choice(all_nodes)]
            while len(newpath) < target_len:
                possible_nodes = [
                    gene for gene in net[newpath[-1]] if gene not in newpath
                ]
                if not possible_nodes:
                    # Dead end: count the failure and restart the walk.
                    tries += 1
                    break
                newpath.append(random.choice(possible_nodes))
            if len(newpath) == target_len:
                break
        if tries >= max_tries:
            # Explicit raise instead of `assert`, so the check survives
            # `python -O` and a truncated walk is never written out. The
            # exception type matches the original assert.
            raise AssertionError(
                'Could not build a random path of length {} in {} tries'.
                format(target_len, max_tries))
        rdm_paths_A.append(newpath)

    rdm_paths_B = random.sample(all_paths, args.N_samples)

    utils.write_paths(args.out_allpaths, all_paths)
    utils.write_paths(args.out_fcpaths, fc_paths)
    utils.write_paths(args.out_rdmpaths_A, rdm_paths_A)
    utils.write_paths(args.out_rdmpaths_B, rdm_paths_B)