def collect_data(infile, genes):
    """Collect per-sample gene values and ages for every query in *infile*.

    *infile* is parsed as a tab-delimited table whose first line is a header.
    Blank rows and rows whose first field starts with ``#`` are skipped.  For
    every other row, ``row[1]`` and ``row[2]`` (presumably the case-list id
    and the profile id -- confirm against the input format) are sent with
    *genes* to ``cbioportal.get_multi_gene``, and ``row[1]`` alone to
    ``cbioportal.get_clin_data``.  Both responses are treated as lists of
    whitespace-delimited text lines whose first entry is a header line.

    Args:
        infile: Path of the tab-delimited query file.
        genes: Gene list handed unchanged to ``cbioportal.get_multi_gene``.

    Returns:
        A ``defaultdict(dict)`` keyed by the column names of the profile
        header (third column onwards) and by the first field of each
        clinical line.  Each inner dict maps gene id -> value string, plus
        ``"AGE"`` -> the second field of that sample's clinical line
        (assumed to be the age column -- confirm against the clinical
        response format).

    Raises:
        IndexError: If a query row has fewer than three fields, or a
            non-blank response line does not match the column layout
            indexed here (e.g. more values than header columns).
    """
    sample_data = defaultdict(dict)

    # Python 2 needs "U" for universal newlines; Python 3 text mode already
    # translates newlines and rejects "U" from 3.11 onwards.
    mode = "rU" if sys.version_info[0] < 3 else "r"
    with open(infile, mode) as casefile:
        reader = csv.reader(casefile, dialect="excel-tab")
        # Drop the header line; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)

        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line: nothing to query.
                continue
            if row[0].startswith("#"):
                sys.stderr.write("WARNING: Skipping commented input line\n")
                continue

            # Progress trace of the query being processed (same output as
            # the Python 2 print statement for a single argument).
            print(row)

            profile_data = cbioportal.get_multi_gene(row[1], row[2], genes)
            clin_data = cbioportal.get_clin_data(row[1])

            # Separate header and payload without mutating the responses.  A
            # completely empty response gets an empty header so it reaches
            # the error reports below instead of failing on the header read.
            profile_header = profile_data[0] if profile_data else ""
            profile_rows = profile_data[1:]
            clin_header = clin_data[0] if clin_data else ""
            clin_rows = clin_data[1:]

            if not profile_rows:
                sys.stderr.write(
                    "ERROR: No profile data retrieved for query, "
                    "with response header: {}\n".format(profile_header)
                )
                continue
            if not clin_rows:
                sys.stderr.write(
                    "ERROR: No clinical data retrieved for query, "
                    "with response header: {}\n".format(clin_header)
                )
                continue

            # The first two header columns label the two leading fields of
            # each profile line; the remaining ones name the value columns.
            sample_ids = profile_header.split()[2:]

            for line in profile_rows:
                fields = line.split()
                if not fields:
                    # Skip blank lines, as the clinical loop does.
                    continue
                # fields[0] is dropped; fields[1] is used as the gene id.
                gene_id = fields[1]
                # Index the header explicitly so a line with more values
                # than header columns still fails loudly.
                for index, value in enumerate(fields[2:]):
                    sample_data[sample_ids[index]][gene_id] = value

            for line in clin_rows:
                fields = line.split()
                if not fields:
                    continue
                sample_data[fields[0]]["AGE"] = fields[1]

    return sample_data
def collect_data(infile, genes):
    """Build a per-sample table of gene values and ages from *infile*.

    *infile* is read as a tab-delimited table with one header line.  Blank
    rows and rows whose first field starts with ``#`` are ignored.  Each
    remaining row triggers two lookups: ``cbioportal.get_multi_gene`` with
    ``row[1]``, ``row[2]`` and *genes*, and ``cbioportal.get_clin_data``
    with ``row[1]`` (presumably case-list id and profile id -- confirm
    against the input format).  Each response is handled as a list of
    whitespace-delimited text lines that begins with a header line.

    Args:
        infile: Path of the tab-delimited query file.
        genes: Gene list passed through to ``cbioportal.get_multi_gene``.

    Returns:
        A ``defaultdict(dict)``.  Outer keys are the profile header's column
        names from the third column on, plus the first field of every
        clinical line.  Inner dicts map gene id -> value string and
        ``'AGE'`` -> the second field of the sample's clinical line (assumed
        to be the age column -- confirm against the clinical response).

    Raises:
        IndexError: If a query row has fewer than three fields, or a
            non-blank response line does not fit the column layout indexed
            here (e.g. more values than header columns).
    """
    sample_data = defaultdict(dict)

    # 'U' is required for universal newlines on Python 2 but is rejected by
    # open() from Python 3.11, where text mode translates newlines anyway.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open(infile, mode) as casefile:
        reader = csv.reader(casefile, dialect='excel-tab')
        # Skip the header; the default avoids StopIteration on an empty file.
        next(reader, None)

        for row in reader:
            if not row:
                # A blank line comes back from csv.reader as [].
                continue
            if row[0].startswith("#"):
                sys.stderr.write("WARNING: Skipping commented input line\n")
                continue

            # Progress trace; prints exactly what the Python 2 statement
            # form printed for a single argument.
            print(row)

            profile_data = cbioportal.get_multi_gene(row[1], row[2], genes)
            clin_data = cbioportal.get_clin_data(row[1])

            # Split header from payload without mutating the responses; an
            # entirely empty response falls through to the error reports
            # rather than crashing while the header is read.
            profile_header = profile_data[0] if profile_data else ""
            profile_rows = profile_data[1:]
            clin_header = clin_data[0] if clin_data else ""
            clin_rows = clin_data[1:]

            if not profile_rows:
                sys.stderr.write("ERROR: No profile data retrieved for query, "
                                 "with response header: {}\n".format(profile_header))
                continue
            if not clin_rows:
                sys.stderr.write("ERROR: No clinical data retrieved for query, "
                                 "with response header: {}\n".format(clin_header))
                continue

            # Header columns after the first two name the value columns of
            # every profile line.
            sample_ids = profile_header.split()[2:]

            for line in profile_rows:
                fields = line.split()
                if not fields:
                    # Blank lines are skipped here just as in the clinical
                    # loop below.
                    continue
                # fields[0] is dropped; fields[1] is used as the gene id.
                gene_id = fields[1]
                # Explicit indexing keeps a too-long line a hard error
                # instead of silently truncating it.
                for index, value in enumerate(fields[2:]):
                    sample_data[sample_ids[index]][gene_id] = value

            for line in clin_rows:
                fields = line.split()
                if not fields:
                    continue
                sample_data[fields[0]]['AGE'] = fields[1]

    return sample_data
parser.add_argument('-c', '--cases', help="Input file with study, case, and profile ids [Required]") parser.add_argument('-g', '--genes', help="Text file with list of genes to evaluate for correlations") parser.add_argument('-o', '--output', help='Output file name') args = parser.parse_args() with open(args.genes, 'rU') as genefile: genes = genefile.read().splitlines() with open(args.output, 'w') as outfile: outfile.write("Study\tCase List\tProfile\tGenes\tNumbers\tSpearman's Rho\tP-Value\n") with open(args.cases, 'rU') as casefile: reader = csv.reader(casefile, dialect='excel-tab') reader.next() for row in reader: profile_data = cbioportal.get_multi_gene(row[1], row[2], genes) header = profile_data.pop(0) for pair in itertools.combinations(profile_data, 2): data1 = pair[0].split() data2 = pair[1].split() # Remove Gene ID data1.pop(0) data2.pop(0) gene1 = data1.pop(0) gene2 = data2.pop(0) expression1 = list() expression2 = list()
outfile.write( "Study\tCase List\tProfile\tGenes\tMin Exp\t25th Pctl\tMedian\t75th Pctl\tMax\t" "Bin1 (Low) #\tBin1 R\tBin1 p\t" "Bin2 #\tBin2 R\tBin2 p\t" "Bin3 #\tBin3 R\tBin3 p\t" "Bin4 (High) #\tBin4 R\tBin4 p\n") with open(args.cases, 'rU') as casefile: reader = csv.reader(casefile, dialect='excel-tab') reader.next() for row in reader: if row[0].startswith("#"): sys.stderr.write( "WARNING: Skipping commented input line\n") continue profile_data = cbioportal.get_multi_gene(row[1], row[2], genes) header = profile_data.pop(0) if len(profile_data) <= 0: sys.stderr.write( "ERROR: No data retrieved for query, with response header: {}\n" .format(header)) continue # Because gene expression data is returned in alphabetical order we have to find the line # containing our control gene data line, remove it from the data_lines, and isolated it for comparisons primary_data = list() i = 0 for line in profile_data: # print line data = line.split()