def main(argv=sys.argv):
    '''Script entry point: copy a VCF from stdin to stdout, optionally
    re-ordering the sample columns.

    The ``reorder-samples`` method takes its order from ``--sort-order``:
    either a comma-separated list of names or the keyword
    ``alphabetical``, which sorts the samples found in the input.

    Raises ValueError if no ``--method`` was given.
    '''
    # NOTE(review): ``argv`` is accepted but not forwarded to E.start()
    # in this function -- confirm whether that is intentional.
    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "--method",
        dest="methods",
        type="choice",
        action="append",
        choices=("reorder-samples", ),
        help="method to apply [default=%default]")

    parser.add_option(
        "--sort-order",
        dest="sort_order",
        help="sort order for sample names. Give column names as "
        "comma-separated list or specify ``alphabetical`` "
        "[default=%default]")

    parser.set_defaults(methods=[], sort_order="alphabetical")

    (options, args) = E.start(parser, add_pipe_options=True)

    if not options.methods:
        raise ValueError("no method specified")

    reader = VCF.VCFFile(options.stdin)

    # ``False`` means "leave the sample columns as they are".
    sample_order = False
    if "reorder-samples" in options.methods and options.sort_order:
        requested = options.sort_order.split(",")
        if "alphabetical" in requested:
            sample_order = sorted(reader.samples)
        else:
            sample_order = requested

    reader.writeHeader(options.stdout, order=sample_order)

    for record in reader:
        if sample_order:
            record.order = sample_order
        options.stdout.write(str(record) + "\n")

    E.stop()
def main(argv=sys.argv):
    '''Script entry point: copy a VCF from stdin to stdout, optionally
    re-ordering its sample columns.

    ``--reorder`` takes a comma-separated list of column names or the
    keyword ``alphabetical``, which sorts the samples found in the
    input. Without ``--reorder`` the records are written unchanged.
    '''
    # NOTE(review): ``argv`` is accepted but not forwarded to E.Start()
    # in this function -- confirm whether that is intentional.
    version_string = (
        "%prog version: $Id: bed2bed.py 2861 2010-02-23 17:36:32Z andreas $")
    parser = E.OptionParser(version=version_string,
                            usage=globals()["__doc__"])

    parser.add_option(
        "--reorder",
        dest="reorder",
        type="string",
        help="reorder columns. Give column names as comma-separated list "
        "or specify ``alphabetical`` [default=%default]")

    parser.set_defaults(reorder=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    reader = VCF.VCFFile(options.stdin)

    # ``False`` means "leave the sample columns as they are".
    column_order = False
    if options.reorder:
        column_order = options.reorder.split(",")
        if "alphabetical" in column_order:
            column_order = sorted(reader.samples)

    reader.writeHeader(options.stdout, order=column_order)

    for record in reader:
        if column_order:
            record.order = column_order
        options.stdout.write(str(record) + "\n")

    E.Stop()
def extractEBioinfo(eBio_ids, vcfs, outfile):
    '''Find the number of mutations identified in previous studies
    (eBio_ids) for the mutated genes in the vcfs.

    Arguments
    ---------
    eBio_ids : string
        Filename of a tab-separated table with three columns per line:
        tissue, study and genetic profile (table) identifier.
    vcfs : list
        Filenames of the VCF files to collect gene names from. Entries
        whose filter is ``REJECT`` are ignored; gene names are taken
        from ``SNPEFF_GENE_NAME`` fields of the info column.
    outfile : string
        Filename of the output table. One row per gene and one
        ``<tissue>_frequency`` column per tissue, holding
        mutations / total rounded to four decimals.

    The cBioPortal web service is queried once per study and per chunk
    of 250 genes, so this function requires network access.
    '''
    # Collect the mutated genes from all VCF files. Each input handle
    # is closed as soon as its file has been consumed.
    genes = set()
    for vcf in vcfs:
        vcf_handle = IOTools.openFile(vcf)
        try:
            for vcf_entry in VCF.VCFFile(vcf_handle):
                # assumes all vcf entries without "REJECT" are "PASS"
                if vcf_entry.filter == "REJECT":
                    continue
                for entry in vcf_entry.info.split(";"):
                    if "SNPEFF_GENE_NAME" in entry:
                        genes.add(entry.split("=")[1])
        finally:
            vcf_handle.close()

    # Materialise the set once; the chunks below are slices of this list.
    all_genes = list(genes)
    E.info("number of genes: %i" % len(all_genes))

    # tissue -> gene -> {"total", "mutations"} -> count
    tissue_counts = collections.defaultdict(
        lambda: collections.defaultdict(lambda: collections.defaultdict(int)))

    # TS sporadic error when querying with a single gene at a time
    # "urllib2.URLError: <urlopen error [Errno 110] Connection timed out>"
    # max URL length appears to be 8200 characters,
    # so query 250 genes at a time.
    chunk_size = 250

    url_template = (
        "http://www.cbioportal.org/webservice.do?cmd=getProfileData&"
        "case_set_id=%(study)s_all&genetic_profile_id=%(table)s&"
        "gene_list=%(gene_list)s")

    studies = IOTools.openFile(eBio_ids, "r")
    try:
        for line in studies:
            tissue, study, table = line.strip().split("\t")

            n = 0
            for start in range(0, len(all_genes), chunk_size):
                genes_chunk = all_genes[start:start + chunk_size]
                n += len(genes_chunk)
                E.info("number of genes processed: %i" % n)

                url = url_template % {
                    "study": study,
                    "table": table,
                    "gene_list": "+".join(genes_chunk)}

                df = pd.read_csv(url, comment="#", sep="\t", index_col=0)

                for gene in genes_chunk:
                    matches = df[df['COMMON'] == gene]

                    # check dataframe contains data!
                    if matches.shape[0] == 0:
                        continue

                    # The selection can contain repeated rows for one
                    # gene, so only the first row is ever counted.
                    row = matches.iloc[0]

                    # NOTE(review): the -2 / -1 offsets presumably
                    # discount non-sample columns (e.g. COMMON) --
                    # TODO confirm against the web service output.
                    counts = tissue_counts[tissue][gene]
                    counts["total"] += len(row) - 2
                    counts["mutations"] += int(row.count()) - 1
    finally:
        studies.close()

    tissues = list(tissue_counts.keys())

    out = IOTools.openFile(outfile, "w")
    try:
        out.write(
            "gene\t%s\n" % "\t".join(
                ["%s_frequency" % x.replace(" ", "_") for x in tissues]))

        for gene in genes:
            freq_values = []
            for tissue in tissues:
                total = tissue_counts[tissue][gene]["total"]
                mutations = tissue_counts[tissue][gene]["mutations"]
                # np.divide yields nan (not an exception) when a gene
                # has no data for a tissue, i.e. total == 0.
                freq_values.append(
                    round(np.divide(float(mutations), total), 4))

            out.write("%s\t%s\n" % (gene, "\t".join(map(str, freq_values))))
    finally:
        out.close()