def main(argv):
    if len(argv) != 4:
        raise Exception(
            "Wrong number of arguments. Usage: input output buildID-file")

    inPath = argv[1]
    outPath = argv[2]
    buildIdPath = argv[3]

    build_number = None

    inFile = open(inPath, "rb")

    try:
        table = tsv.TsvReader(inFile, TABLE_SCHEMA)
    except csv.Error:
        print "Warning: ragged table. Assuming excel_tab and correcting"
        inFile.close()
        inFile = open(inPath, "rb")

        outPath = inPath + ".fixed_raggedness"
        outFile = open(outPath, "wb")
        tsv.fixRaggedTable(inFile, outFile, csv.excel_tab)
        outFile.flush()
        outFile.close()
        del outFile

        inFile = open(outPath)
        try:
            table = tsv.TsvReader(inFile, TABLE_SCHEMA)
        except:
            print "Well, that didn't work"
            raise
    # end handler for ragged table

    line = 0
    output = open(outPath, "w")

    for record in table:
        line += 1
        bNum = record["NCBI_Build"]
        if (build_number is not None) and (bNum != build_number):
            raise Exception(
                "Inconsistent NCBI_Build values (prev: %s; current: %s); "
                "cleave table first" % (str(build_number), str(bNum)))
        build_number = bNum

        # TSV rows can't be written into, only read. thus...
        writableRecord = [cell for cell in record]
        writableRecord[LINENUM_INDEX] = "line" + str(line)
        output.write("\t".join(writableRecord))
        output.write("\n")

    output.flush()
    output.close()
    del output

    buildIdOut = open(buildIdPath, "w")
    buildIdOut.write(NCBI_BUILD_LUT[build_number])
    buildIdOut.flush()
    buildIdOut.close()
    del buildIdOut

    return 0

def __init__(self, assembly, contig, start, end):
    """
    Given a range on a contig, get all the repeats overlapping that range.

    Keeps an IntervalTree of element names, and a Counter from element name to
    number of that element in the range.

    No protection against SQL injection.
    """

    # Make the interval tree
    self.tree = IntervalTree()

    # Make a counter for repeats with a certain name
    self.counts = collections.Counter()

    command = [
        "hgsql", "-e",
        "select repName, genoName, genoStart, genoEnd "
        "from {}.rmsk where genoName = '{}' and genoStart > '{}' "
        "and genoEnd < '{}';".format(assembly, contig, start, end)
    ]

    process = subprocess.Popen(command, stdout=subprocess.PIPE)

    for parts in itertools.islice(tsv.TsvReader(process.stdout), 1, None):
        # For each line except the first, broken into fields

        # Add the item to the tree covering its range. Store the repeat type
        # name as the interval's data.
        self.tree.addi(int(parts[2]), int(parts[3]), parts[0])

        # Count it
        self.counts[parts[0]] += 1

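# A minimal usage sketch for the constructor above. The enclosing class name
# (RepeatRange) is hypothetical, since the snippet only shows __init__, and it
# assumes hgsql is on the PATH with access to the UCSC rmsk tables plus the
# intervaltree package that provides IntervalTree.
repeats = RepeatRange("hg38", "chr17", 43044294, 43125482)

# counts maps repeat element names to how many of them fall in the range
print(repeats.counts.most_common(5))

# tree answers interval queries; slicing returns the overlapping intervals
print(sorted(repeats.tree[43050000:43060000]))
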
def get_max_f_score(job, gam_key, condition, options):
    """
    Given the GAM file key for a sample that has already had vcfeval run under
    the given conditions, parse the vcfeval roc and return the biggest F score.
    """

    # Make the IOStore
    cache_store = IOStore.get(options.cache)

    # Find the ROC curve
    roc_key = vcfeval_roc_key(gam_key, condition)

    # Get the file
    roc_compressed = cache_store.get_input_file(job, roc_key)

    # Read it
    reader = tsv.TsvReader(gzip.GzipFile(roc_compressed))

    # What's the max F score we found?
    max_f_score = None

    for parts in reader:
        # Parse all the F scores
        f_score = float(parts[6])

        if max_f_score is None or f_score > max_f_score:
            # And keep the max
            max_f_score = f_score

    # Return the max F score.
    return max_f_score

def read_bootstraps(root_path):
    # Read every bootstrap row from quant_bootstraps.tsv
    quant_bootstraps = tsv.TsvReader(open(root_path + "quant_bootstraps.tsv"))
    count = 0
    quant_boot = []
    for parts in quant_bootstraps:
        quant_boot.append(parts)

    # First row holds the transcript IDs; the rest are the bootstrap counts
    df_quant_boot = pd.DataFrame.from_records(quant_boot[1:],
                                              columns=quant_boot[0])
    id_qb = list(df_quant_boot.columns)
    return df_quant_boot, id_qb

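# A small usage sketch for read_bootstraps(). The directory path is
# illustrative (borrowed from the bootstrap-plotting script later in this
# collection); it only needs to contain a quant_bootstraps.tsv file.
df_quant_boot, transcript_ids = read_bootstraps("../data/poly_mo/")

print(df_quant_boot.shape)   # (bootstrap rounds, transcripts)
print(transcript_ids[:5])    # first few transcript names from the header row
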
def generateTxtFromMetas(pathToEmbeddings, pathToMeta, outputPath):
    reader = tsv.TsvReader(open(pathToEmbeddings))
    meta = tsv.TsvReader(open(pathToMeta))
    chara = []
    label = []
    for zi, line in meta:
        label.append(zi)
    label = label[1:]
    # print(len(label))

    with open(outputPath, 'w') as f:
        for count, embedding in enumerate(reader):
            em = list(embedding)
            if count == 0:
                size = len(em)
                f.write(str(len(label)) + " " + str(size) + "\n")
            data = " ".join(em)
            if (count < len(label)):
                f.write(label[count] + " " + data + "\n")

def getRegions(metadata_url):
    """
    Download the assembly metadata file at the given URL, and return a dict
    from upper-case region names to 0-based end-exclusive (contig, start, end)
    tuples. Contig names start with "chr".
    """

    # Holds the chromosome number for each region?
    region_chromosomes = {}
    # Holds the minimum start position for each region on its chromosome
    region_starts = collections.defaultdict(lambda: float("inf"))
    # Holds the maximum stop position for each region on its chromosome
    region_stops = collections.defaultdict(lambda: float("-inf"))

    # Holds the (contig, start, end) tuple for each alt in a given region.
    ranges_by_region = collections.defaultdict(list)

    # Hard-code some regions that aren't real alt regions
    ranges_by_region["BRCA1"] = ("chr17", 43044294, 43125482)
    ranges_by_region["BRCA2"] = ("chr13", 32314861, 32399849)
    ranges_by_region["CENX"] = ("chrX", 58605580, 62412542)

    # Read the reference database
    database = tsv.TsvReader(urllib2.urlopen(metadata_url))

    for parts in database:
        # Parse out all the info for this alt and its parent chromosome
        region_name = parts[7]
        # Grab the chromosome ("1" or "X") that's the parent
        parent_chromosome = parts[5]
        parent_start = int(parts[11])
        parent_stop = int(parts[12])
        alt_contig = parts[3]
        alt_start = int(parts[9])
        alt_stop = int(parts[10])

        # Note the region start, stop, and parent chromosome number
        region_chromosomes[region_name] = parent_chromosome
        region_starts[region_name] = min(region_starts[region_name],
                                         parent_start)
        region_stops[region_name] = max(region_stops[region_name],
                                        parent_stop)

    for region_name in region_chromosomes.iterkeys():
        # Add in the reference ranges that all the alts are alternatives to.
        # Make sure to add the chr prefix.
        ranges_by_region[region_name] = (
            "chr" + region_chromosomes[region_name],
            region_starts[region_name], region_stops[region_name])

    # Give back our region info dict
    return ranges_by_region

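# A usage sketch for getRegions(). The metadata URL is a placeholder; any TSV
# laid out the way the parser above expects (alt-locus placement columns) will
# do. Note this snippet is Python 2, since it relies on urllib2 and iterkeys.
regions = getRegions("ftp://example.org/assembly/alt_placement_metadata.tsv")

# The hard-coded entries are present even if the metadata lists no alts there.
contig, start, end = regions["BRCA1"]
print("BRCA1 is {}:{}-{}".format(contig, start, end))
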
def main():
    # Where do we put them?
    DATABASE_PATH = 'Database'

    # This holds total price by keyword, for all observed keywords
    total_by_keyword = collections.defaultdict(float)
    occurrences = collections.Counter()
    # This holds an example for each keyword
    examples = dict()

    for root, dirs, files in os.walk(DATABASE_PATH):
        for filename in files:
            if filename.endswith('.tsv'):
                # We found a TSV
                with open(os.path.join(root, filename)) as tsv_in:
                    # Read the TSV
                    reader = tsv.TsvReader(tsv_in)

                    for item, price in reader:
                        # For each recorded item

                        # Parse the price
                        price = float(price)

                        if math.isnan(price):
                            # Skip unpriceable items
                            continue

                        # Compute all unique keywords in the item
                        keywords = set(item.upper().split())

                        for keyword in keywords:
                            # The price contributes to every keyword
                            total_by_keyword[keyword] += price
                            # We count the occurrences
                            occurrences[keyword] += 1

                            if keyword not in examples or random.random() < 0.5:
                                # This ought to be our example for this keyword
                                examples[keyword] = item.upper()

    # Make a big table
    keywords_with_totals = list(total_by_keyword.items())

    # Sort by total cost, descending
    keywords_with_totals.sort(key=operator.itemgetter(1), reverse=True)

    print("=== Top 10 Expensive Grocery Keywords ===")

    for i, (keyword, cost) in enumerate(keywords_with_totals[:10]):
        print("#{}:\t${}\t{} (x{}, e.g. \"{}\")".format(
            i + 1, cost, keyword, occurrences[keyword], examples[keyword]))

def url_open_tsv(url):
    """
    Open a TSV URL and loop through the lines as lists.
    """

    try:
        reader = tsv.TsvReader(urllib2.urlopen(url))
    except urllib2.URLError as err:
        print("Could not open " + url)
        raise err

    return reader

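# A usage sketch for url_open_tsv(); the URL is a placeholder pointing at any
# tab-separated file reachable over HTTP or FTP.
reader = url_open_tsv("http://example.org/data/samples.tsv")

for parts in reader:
    # Each row comes back as the fields of one line
    print(list(parts))
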
def parse_name_stream(self, stream):
    """
    Parse GRC chr2acc format (TSV of name and accession.version) on the given
    input stream, and make the appropriate primary scaffold name assignments.
    """

    # Make a TSV reader
    reader = tsv.TsvReader(stream)

    for name, accession in reader:
        # Apply each name/accession mapping
        self.set_chromosome_name(accession, name)

def parse_placement_stream(self, stream):
    """
    Parse GRC alt_scaffold_placement.txt format (TSV with alt
    accession.version in column 4 and parent accession.version in column 7)
    on the given input stream, and make the appropriate parent assignments.
    """

    # Make a TSV reader
    reader = tsv.TsvReader(stream)

    for parts in reader:
        # Look at each non-comment line

        if len(parts) < 7:
            # We can't pull out the parent
            raise RuntimeError(
                "Insufficient columns in alt scaffold location data")

        # Make the alt (1-based column 4) a child of the parent (1-based
        # column 7)
        self.set_alt_parent(parts[3], parts[6])

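# A sketch of driving the two parsers above. The enclosing class name
# (ScaffoldNames) is hypothetical; the snippets only require that it also
# defines set_chromosome_name() and set_alt_parent(). The file names follow
# the GRC conventions the docstrings mention.
names = ScaffoldNames()

with open("chr2acc") as name_stream:
    names.parse_name_stream(name_stream)

with open("alt_scaffold_placement.txt") as placement_stream:
    names.parse_placement_stream(placement_stream)
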
def main(args):
    """
    Parses command line arguments and does the work of the program.

    "args" specifies the program arguments, with args[0] being the executable
    name. The return value should be used as the program's exit code.
    """

    if len(args) == 2 and args[1] == "--test":
        # Run the tests
        return doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)

    options = parse_args(args)  # This holds the nicely-parsed options object

    # Load the sample whitelist, if applicable. Holds a set if we have a
    # whitelist, or None otherwise.
    sample_whitelist = None

    if options.samples is not None:
        # Read all the samples from the file
        sample_whitelist = set(
            [line[0] for line in tsv.TsvReader(options.samples)])

    RealTimeLogger.start_master()

    # Make a root job
    root_job = Job.wrapJobFn(scan_all,
                             options,
                             sample_whitelist,
                             cores=1,
                             memory="1G",
                             disk="1G")

    # Run it and see how many jobs fail
    failed_jobs = Job.Runner.startToil(root_job, options)

    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))

    print("All jobs completed successfully")

    RealTimeLogger.stop_master()

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: celdel

Created to produce a nice and clean encods_1.csv with the encodings of all the
images.
"""

import tsv
import pandas as pd
import csv

# Read every row of the TSV into a list of lists
reader = tsv.TsvReader(open("encods.tsv"))
new_list = []
for parts in reader:
    new_list.append(list(parts))

# Convert to a DataFrame and write it back out as CSV
df = pd.DataFrame(new_list)
df.to_csv("encods_1.csv")

def process_raw_data(raw_data, old_html_dir, options):
    """
    This function receives the file containing raw genomic data that the user
    wants to map to the pre-existing visualization, and the location of the
    pre-existing visualization files. We parse this new data file, placing the
    rows in the order defined by the genes tab from the pre-existing
    visualization. This way we generate a mutable numpy matrix of raw patient
    data and have the genes in the order required by the transform matrix,
    U^T, and S matrices.
    """
    # Create the file paths for the required files
    genes_file_loc = os.path.join(old_html_dir, "genes.tab")
    s_matrix_file_loc = os.path.join(old_html_dir, "S.tab")
    u_t_matrix_file_loc = os.path.join(old_html_dir, "U_T.tab")
    beta_matrix_file_loc = os.path.join(old_html_dir, "beta.tab")
    assignments_file_loc = os.path.join(old_html_dir, "assignments0.tab")

    # First open the genes file.
    genes_reader = tsv.TsvReader(open(genes_file_loc, 'r'))

    # This holds an iterator over lines in that file
    genes_iterator = genes_reader.__iter__()

    # Extract data type of the pre-existing visualization & the list of genes
    old_data_type = genes_iterator.next()
    print("Previous Data Type", old_data_type)

    # First see if the new data and the old data are of compatible data types
    new_data_type = options.type

    old_genes_list = []

    # If they are the same data type add the genes to a python list
    if old_data_type[0] == new_data_type:
        print("Same Data Types")
        old_genes_list = genes_iterator.next()
        genes_reader.close()

        # First open the raw data file.
        raw_data_reader = tsv.TsvReader(open(raw_data, 'r'))
        # This holds an iterator over lines in that file
        raw_data_iterator = raw_data_reader.__iter__()

        sample_names = raw_data_iterator.next()
        sample_names = sample_names[1:]
        num_samples = len(sample_names)

        new_genes_list = []
        for row in raw_data_iterator:
            new_gene = row[0]
            new_genes_list.append(new_gene)
        raw_data_reader.close()

        # Get the number of new samples & number of old genes to create
        # a new numpy data matrix
        print("Number of New Samples:", num_samples)
        num_new_genes = len(new_genes_list)
        print("Number of New genes:", num_new_genes)

        # Re-initialize the data iterator
        # This holds an iterator over lines in that file
        raw_data_reader = tsv.TsvReader(open(raw_data, 'r'))
        raw_data_iterator = raw_data_reader.__iter__()
        # Skip the first line, which is simply a row of headers
        raw_data_iterator.next()

        # Next we have to dump all the values from the file into a numpy
        # matrix. The values will be unsorted. We will then have to sort the
        # rows of the numpy matrix according to the order prescribed by
        # old_genes_list.
        raw_data_matrix_unsorted = numpy.zeros(shape=(num_new_genes,
                                                      num_samples))
        for rindex, row in enumerate(raw_data_iterator):
            # Cut off the first value of each row. It is simply the gene name.
            only_values = row[1:]
            # Place the data from only_values into the appropriate row in
            # raw_data_matrix.
            for cindex, col in enumerate(only_values):
                raw_data_matrix_unsorted[rindex][cindex] = only_values[cindex]

        # For every gene in old_genes_list search the new_genes_list for the
        # appropriate index. Then use this index to find the values in the
        # unsorted data matrix and copy them into a new sorted matrix. This
        # new matrix will be used to compute the (x,y) coordinates needed to
        # map the new samples.
        num_old_genes = len(old_genes_list)

        # Debugging
        num_no_data = 0

        raw_data_matrix_sorted = numpy.zeros(shape=(num_old_genes,
                                                    num_samples))
        for rindex, gene in enumerate(old_genes_list):
            # Find the index of the desired gene in the new_genes_list.
            # This index will correspond to the row in
            # raw_data_matrix_unsorted that we want to extract and place in
            # raw_data_matrix_sorted.
            try:
                gene_index = new_genes_list.index(gene)
                extracted_data_row = raw_data_matrix_unsorted[gene_index]
                # Iterate over the extracted row to place the values in the
                # appropriate row of the sorted data matrix.
                for cindex, col in enumerate(extracted_data_row):
                    raw_data_matrix_sorted[rindex][cindex] = \
                        extracted_data_row[cindex]
            except ValueError:
                num_no_data += 1

        print("Number of genes with no data", num_no_data)

        # Open up S matrix, U^T, and Betas for x,y coordinate computation

        # First open the matrix files.
        s_reader = tsv.TsvReader(open(s_matrix_file_loc, 'r'))
        u_t_reader = tsv.TsvReader(open(u_t_matrix_file_loc, 'r'))
        beta_reader = tsv.TsvReader(open(beta_matrix_file_loc, 'r'))

        # Next create iterators to traverse the files
        s_iterator = s_reader.__iter__()
        u_t_iterator = u_t_reader.__iter__()
        beta_iterator = beta_reader.__iter__()

        # Create an array for s_values & create a diagonal matrix from it
        s_values = s_iterator.next()
        float_s_values = []
        for value in s_values:
            v = float(value)
            float_s_values.append(v)
        s_values = float_s_values
        print("S_values", s_values)

        s_diag = numpy.diag(s_values)
        print(s_diag)

        # Create a numpy matrix for u_t (number of principal components
        # * number of genes)
        u_t = numpy.zeros(shape=(len(s_values), num_old_genes))
        for rindex, row in enumerate(u_t_iterator):
            for cindex, col in enumerate(row):
                u_t[rindex][cindex] = float(row[cindex])

        # Create a numpy matrix for the betas (number of principal components
        # * 2)
        betas = numpy.zeros(shape=(len(s_values), 2))
        for rindex, row in enumerate(beta_iterator):
            for cindex, col in enumerate(row):
                betas[rindex][cindex] = float(row[cindex])
        betas = numpy.transpose(betas)

        # Compute new coordinates
        coords = betas * (numpy.asmatrix(s_diag) * numpy.asmatrix(u_t) *
                          numpy.asmatrix(raw_data_matrix_sorted))
        print("Coordinates")
        print(coords)
        coords = numpy.transpose(coords)

        # Add to existing "assignments.tab" file
        assignments_writer = tsv.TsvWriter(open(assignments_file_loc, 'a'))
        for rindex, sample in enumerate(sample_names):
            print("Cindex", cindex)
            x = str(coords[rindex, 0])
            y = str(coords[rindex, 1])
            print(sample, x, y)
            assignments_writer.line(sample, x, y)
        assignments_writer.close()
    else:
        raise Exception("Pre-existing Visualization employs ", old_data_type,
                        " data. Data to be mapped is of ", new_data_type,
                        ". Data Types must be the same.")
    return True

#!/usr/bin/env python
"""
A simple example of how to read a TSV file using the 'tsv' module.
"""

import tsv

reader = tsv.TsvReader(open("data_samples/file.tsv"))

for parts in reader:
    parts = list(parts)
    # Here parts is a list of strings, one per tab-separated column.
    # Make sure you handle not having enough fields, or not being able to
    # parse numbers where you expect them.
    print("Record with fields: {}".format(parts))

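# The examples in this collection are read-heavy; for completeness, here is a
# minimal writing sketch using tsv.TsvWriter, whose line() and close() methods
# also appear in the visualization-mapping example above. The output path is
# arbitrary.
import tsv

writer = tsv.TsvWriter(open("data_samples/out.tsv", "w"))
writer.line("sample1", "1.5", "2.5")
writer.line("sample2", "3.0", "4.0")
writer.close()
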
def downloadAllReads(job, options):
    """
    Download all the reads for the regions.
    """

    # Move to the appropriate working directory from wherever Toil dropped us
    os.chdir(options.cwd)

    # Initialize logging
    RealTimeLogger.set_master(options)
    RealTimeLogger.get().info("Starting download")

    # First make the output directory
    if not os.path.exists(options.out_dir):
        try:
            # Make it if it doesn't exist
            os.makedirs(options.out_dir)
        except OSError:
            # If you can't make it, maybe someone else did?
            pass

    # Whatever happens, it needs to exist here
    assert (os.path.exists(options.out_dir) and os.path.isdir(options.out_dir))

    # Holds the chromosome number for each region?
    region_chromosomes = {}
    # Holds the minimum start position for each region on its chromosome
    region_starts = collections.defaultdict(lambda: float("inf"))
    # Holds the maximum stop position for each region on its chromosome
    region_stops = collections.defaultdict(lambda: float("-inf"))

    # Holds the contig:start-end string for each alt in a given region.
    # The reference range gets added in last.
    ranges_by_region = collections.defaultdict(list)

    # Hard-code some regions that aren't real
    ranges_by_region["BRCA1"] = ["chr17:43044294-43125482"]
    ranges_by_region["BRCA2"] = ["chr13:32314861-32399849"]
    ranges_by_region["CENX"] = ["chrX:58605580-62412542"]

    # Read the reference database
    database = tsv.TsvReader(urllib2.urlopen(options.reference_metadata))

    for parts in database:
        # Parse out all the info for this alt and its parent chromosome
        region_name = parts[7]
        # Grab the chromosome ("1" or "X") that's the parent
        parent_chromosome = parts[5]
        parent_start = int(parts[11])
        parent_stop = int(parts[12])
        alt_contig = parts[3]
        alt_start = int(parts[9])
        alt_stop = int(parts[10])

        # Note the region start, stop, and parent chromosome number
        region_chromosomes[region_name] = parent_chromosome
        region_starts[region_name] = min(region_starts[region_name],
                                         parent_start)
        region_stops[region_name] = max(region_stops[region_name],
                                        parent_stop)

        # Turn the alt name into the proper format (GL000251.2 to
        # chr6_GL000251v2_alt)
        name_parts = alt_contig.split(".")
        fixed_alt_contig = "chr{}_{}v{}_alt".format(parent_chromosome,
                                                    name_parts[0],
                                                    name_parts[1])

        # Add it to the list for its region
        ranges_by_region[region_name].append("{}:{}-{}".format(
            fixed_alt_contig, alt_start, alt_stop))

    for region_name in region_chromosomes.iterkeys():
        # Add in the reference ranges that all the alts are alternatives to
        ranges_by_region[region_name].append("chr{}:{}-{}".format(
            region_chromosomes[region_name], region_starts[region_name],
            region_stops[region_name]))

    # Are we using a real FTP URL, or a file URL?
    if urlparse.urlparse(options.sample_ftp_root).scheme == "ftp":
        # It's really FTP
        ftp, root_path = ftp_connect(options.sample_ftp_root)
    else:
        # Assume it's a bare file path
        ftp = FakeFTP(options.sample_ftp_root)
        root_path = ""

    if len(root_path) > 0:
        # Calculate the FTP base URL (without directory). We need it later for
        # turning found index files into URLs.
        base_url = options.sample_ftp_root[:-len(root_path)]
    else:
        # If root_path is empty, we should do nothing, because there's no way
        # to say [:-0].
        base_url = options.sample_ftp_root

    RealTimeLogger.get().info("Sample root: {} Base URL: {}".format(
        options.sample_ftp_root, base_url))

    # Dump the good data files for samples
    good_samples = open("{}/good.txt".format(options.out_dir), "w")

    # Grab all the population names that match the population pattern
    population_names = [
        n for n in ftp.nlst()
        if fnmatch.fnmatchcase(n, options.population_pattern)
    ]

    # TODO: We'll go through them in this order, so if you want a
    # representative subsampling, add some shuffle here or something.

    # This holds URLs to data files (BAM/CRAM) with indexes that are on a
    # sufficient number of contigs, by sample name. We take the first
    # sufficiently good file for any sample.
    sample_file_urls = {}

    for population_name in population_names:
        # For each of those, we need to get samples

        # Go to the population root
        ftp.cwd("{}/{}".format(root_path, population_name))

        # Grab all the sample names that match the sample name pattern.
        # Hopefully there aren't too many.
        sample_names = [
            n for n in ftp.nlst()
            if fnmatch.fnmatchcase(n, options.sample_pattern)
        ]

        # TODO: handle failures during explore_path?

        for sample_name in sample_names:
            # For every sample
            print("Try {}".format(sample_name))

            for data_name in explore_path(
                    ftp, "{}/{}/{}".format(root_path, population_name,
                                           sample_name),
                    options.file_pattern):
                # Find its data files (there may be several)

                # Get the index for each
                index_name = data_name + options.index_suffix
                print(index_name)

                if options.min_indexed_contigs > 0:
                    # We need to run the check on the index before downloading
                    # the sample reads.

                    # Count up the contigs it indexes over
                    indexed_contigs = count_indexed_contigs(
                        "{}/{}".format(base_url, index_name),
                        options.ftp_retry)

                    if indexed_contigs >= options.min_indexed_contigs:
                        # This file for this sample is good enough
                        sample_file_urls[sample_name] = "{}/{}".format(
                            base_url, data_name)
                        RealTimeLogger.get().info(
                            "Sample {} has index of {} contigs".format(
                                sample_name, indexed_contigs))

                        # Add the sample to the file we spit out
                        good_samples.write("{}\n".format(sample_name))

                        # Don't finish exploring the path
                        break
                    else:
                        # Complain
                        RealTimeLogger.get().warning(
                            "Sample {} has index on too few contigs ({}). "
                            "Skipping!".format(sample_name, indexed_contigs))
                else:
                    # We don't need to check the number of indexed contigs.
                    RealTimeLogger.get().info(
                        "Sample {} doesn't need an "
                        "indexed contigs check".format(sample_name))

                    # Still use this one. TODO: unify code with above.
                    sample_file_urls[sample_name] = "{}/{}".format(
                        base_url, data_name)

                    # Add the sample to the file we spit out
                    good_samples.write("{}\n".format(sample_name))

            if len(sample_file_urls) >= options.sample_limit:
                # We got enough. Don't finish this population
                break

        if len(sample_file_urls) >= options.sample_limit:
            # We got enough. Don't check more populations
            break

    good_samples.close()

    RealTimeLogger.get().info("Got {} sample URLs".format(
        len(sample_file_urls)))

    if (options.sample_limit < float("inf")):
        # Make sure we got as many as we wanted.
        assert (len(sample_file_urls) == options.sample_limit)

    for region_name in options.regions:
        for sample_name, sample_url in sample_file_urls.iteritems():
            # Make sure the sample directory exists
            sample_dir = "{}/{}/{}".format(options.out_dir, region_name,
                                           sample_name)

            if not os.path.exists(sample_dir):
                try:
                    # Make it if it doesn't exist
                    os.makedirs(sample_dir)
                except OSError:
                    # If you can't make it, maybe someone else did?
                    pass

            assert (os.path.exists(sample_dir) and os.path.isdir(sample_dir))

            # Where will this sample's BAM for this region go?
            bam_filename = "{}/{}.bam".format(sample_dir, sample_name)

            if os.path.exists(bam_filename) and not options.overwrite:
                # Don't re-download stuff we already have.
                RealTimeLogger.get().info(
                    "Skipping {} x {} which has already "
                    "been downloaded".format(region_name, sample_name))
                continue

            RealTimeLogger.get().info("Making child for {} x {}: {}".format(
                region_name, sample_name, sample_url))

            # Now kick off a job to download all the ranges for the region in
            # parallel for this sample, and then concatenate them together.
            # Tell it to save the results to a file on a shared filesystem.
            job.addChildJobFn(downloadRegion,
                              options,
                              region_name,
                              sample_url,
                              ranges_by_region[region_name],
                              bam_filename,
                              cores=1,
                              memory="1G",
                              disk="4G")

    RealTimeLogger.get().info("Done making children")

import tsv
import re

reader = tsv.TsvReader(open('karint_corpus.tsv', encoding='utf-8'))

for i in reader:
    # print(' '.join(i))
    msg = list(i)[1]
    if re.search(r'.sozluk.', msg) is not None:
        print(msg)

def scan_region(job, options, region, pop_by_sample, sample_whitelist):
    """
    Scan all the graphs in a region for bias.

    If sample_whitelist is not None, ignores samples not in that set.
    """

    # Set up the IO stores.
    in_store = IOStore.get(options.in_store)
    out_store = IOStore.get(options.out_store)

    # This holds a dict from graph name, then sample name, then stat name to
    # actual stat value.
    stats_cache = collections.defaultdict(
        lambda: collections.defaultdict(dict))

    # This is the cache file for this region, in
    # <graph>\t<sample>\t<stat>\t<value> format
    cache_tsv_key = "plots/cache/{}.tsv".format(region)

    # What name will it have locally for us?
    local_filename = os.path.join(job.fileStore.getLocalTempDir(), "temp.tsv")

    if out_store.exists(cache_tsv_key):
        # Just read in from that TSV
        RealTimeLogger.get().info("Loading cached region {}".format(region))

        # Grab the cached results
        out_store.read_input_file(cache_tsv_key, local_filename)

        # Read all the pop, value pairs from the TSV
        reader = tsv.TsvReader(open(local_filename))

        # Which samples are going to be skipped?
        skipped_samples = set()

        for graph, sample, stat, value in reader:
            # Read every line from the cache and pull out what value for what
            # stat it gives for what sample.

            if sample_whitelist is not None and \
                    sample not in sample_whitelist:
                # Skip this sample that's not on the list
                skipped_samples.add(sample)
                continue

            # Populate our cache dict
            stats_cache[graph][sample][stat] = float(value)

        RealTimeLogger.get().info("Skipped {} samples".format(
            len(skipped_samples)))

    else:
        # Stats haven't been collated
        raise RuntimeError(
            "No graph stats for {}; run collateStatistics.py".format(region))

    # We want normalized and un-normalized versions of the stats cache
    stats_by_mode = {"absolute": stats_cache}

    if stats_cache.has_key("refonly"):
        # Deep copy and normalize the stats cache
        normed_stats_cache = copy.deepcopy(stats_cache)

        # We want to normalize and the reference exists (i.e. not CENX)

        # Normalize every stat against the reference, by subtraction
        for graph, stats_by_sample in normed_stats_cache.iteritems():
            # For each graph and all the stats for that graph
            for sample, stats_by_name in stats_by_sample.iteritems():
                # For each sample and all the stats for that sample
                for stat_name in stats_by_name.keys():
                    if stats_cache["refonly"].has_key(sample):
                        # Get the reference value
                        ref_value = stats_cache["refonly"][sample][stat_name]

                        # Normalize by subtraction
                        stats_by_name[stat_name] -= ref_value
                    else:
                        # Nothing to norm against. TODO: maybe complain when
                        # sample sets aren't all the same?
                        stats_by_name[stat_name] = None

        # Register this as a condition
        stats_by_mode["normalized"] = normed_stats_cache

    # Now save stats, parceling out by region and graph
    for mode, mode_stats_cache in stats_by_mode.iteritems():
        for graph, stats_by_sample in mode_stats_cache.iteritems():

            # We need some config
            # Where should we route each stat to?
            stat_file_keys = {
                "substitution_rate":
                "bias/{}/{}/substrate.{}.tsv".format(mode, region, graph),
                "indel_rate":
                "bias/{}/{}/indelrate.{}.tsv".format(mode, region, graph),
                "portion_perfect":
                "bias/{}/{}/perfect.{}.tsv".format(mode, region, graph)
            }

            # Make a local temp file for each (dict from stat name to file
            # object with a .name).
            stats_file_temps = {
                name: tempfile.NamedTemporaryFile(
                    dir=job.fileStore.getLocalTempDir(), delete=False)
                for name in stat_file_keys.iterkeys()
            }

            for sample, stats_by_name in stats_by_sample.iteritems():
                # For each sample and all the stats for that sample
                for stat_name, stat_value in stats_by_name.iteritems():
                    # For each stat
                    if not stats_file_temps.has_key(stat_name):
                        # Skip stats that have nowhere to go
                        continue

                    # Write graph and value to the file for the stat, for
                    # plotting, naming it after the pop that the sample is in
                    stats_file_temps[stat_name].write("{}\t{}\n".format(
                        pop_by_sample[sample], stat_value))

            for stat_name, stat_file in stats_file_temps.iteritems():
                # Flush and close the temp file
                stat_file.flush()
                os.fsync(stat_file.fileno())
                stat_file.close()

                # Upload the file
                out_store.write_output_file(stat_file.name,
                                            stat_file_keys[stat_name])

def scan_all(job, options, sample_whitelist):
    """
    Scan all the regions and graphs for bias.

    Only looks at samples in the whitelist set, if the whitelist is not None.
    """

    # Set up the IO stores.
    in_store = IOStore.get(options.in_store)
    out_store = IOStore.get(options.out_store)

    # Download the superpopulation assignments
    # This holds superpop by pop
    superpopulation_by_population = {}

    for parts in tsv.TsvReader(
            urllib2.urlopen(urllib2.Request(options.superpopulation_url))):
        # For each population code (column 1), assign it to the right
        # superpopulation (column 2).
        superpopulation_by_population[parts[1]] = parts[2]

    RealTimeLogger.get().info("Downloading sample population assignments")

    # Load the 1000 Genomes population assignments.
    # Make a reader that goes through split out lines in the TSV.
    reader = tsv_reader_with_comments(
        urllib2.urlopen(urllib2.Request(options.index_url)))

    # Get an iterator over the lines
    lines = iter(reader)

    # Grab the headings
    headings = lines.next()

    while headings[0].startswith("##"):
        # Skip leading lines that aren't the real header (which starts with #)
        headings = lines.next()

    # Which column holds sample names?
    sample_name_column = headings.index("SAMPLE_NAME")

    # Which column holds sample populations?
    sample_population_column = headings.index("POPULATION")

    # What dict do we fill in? Holds population string by sample name.
    # We now use the superpopulation names for our populations.
    pop_by_sample = {}

    # We also want to count samples in each population for debugging
    samples_per_pop = collections.Counter()

    for parts in lines:
        # Save superpopulation under sample
        pop_by_sample[parts[sample_name_column]] = \
            superpopulation_by_population[parts[sample_population_column]]

        # Count the sample for its population
        samples_per_pop[parts[sample_population_column]] += 1

    RealTimeLogger.get().info("Found {} populations:".format(
        len(samples_per_pop)))

    for (pop, count) in samples_per_pop.iteritems():
        RealTimeLogger.get().info("{}: {}".format(pop, count))

    for region in in_store.list_input_directory("stats"):
        # Collate everything in the region
        job.addChildJobFn(scan_region,
                          options,
                          region,
                          pop_by_sample,
                          sample_whitelist,
                          cores=1,
                          memory="1G",
                          disk="10G")

import math

import tsv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

root_path = "../data/poly_mo/"

# Preprocess data files.
# quant_bootstraps.tsv contains the matrix of bootstrap experiments: the final
# count for each transcript in each round of bootstrapping, with a row being a
# bootstrap output and the columns being the list of transcripts.
quant_bootstraps = tsv.TsvReader(open(root_path + "quant_bootstraps.tsv"))
count = 0
quant_boot = []
for parts in quant_bootstraps:
    quant_boot.append(parts)

df_quant_boot = pd.DataFrame.from_records(quant_boot[1:],
                                          columns=quant_boot[0])
df_quant_boot = df_quant_boot.astype('float')
df_quant_boot_mean = df_quant_boot.mean()
df_quant_boot_std = df_quant_boot.std()

id_in_quant_boot = list(df_quant_boot.columns)

# given serial distance
for tid in id_in_quant_boot:
    if tid in distance:
        for i in range(len(df_quant_boot[tid])):

def doTheThing(inputStream, regionsToCheck=["Jita", "Hek", "Rens"]):
    initializeItems()

    # Read the inventory dump: one row per (item name, quantity) pair
    inventory = {}
    invReader = tsv.TsvReader(inputStream)
    for rawRow in invReader:
        row = list(rawRow)
        item = nameToItem[row[0]]
        quantity = int(row[1].replace(",", ""))
        if item.typeId not in inventory:
            inventory[item.typeId] = {"quantity": quantity, "item": item}
        else:
            inventory[item.typeId]["quantity"] += quantity

    reprocessOutputsToConsider = set()
    for invItem in inventory.values():
        for materialTypeId in invItem["item"].reprocessingOutputs.keys():
            reprocessOutputsToConsider.add(materialTypeId)

    # Prefetch everything
    requestMap = {}
    for materialTypeId in reprocessOutputsToConsider:
        jitaRegion = regionMap["Jita"]["regionId"]
        try:
            requestMap[jitaRegion].add(materialTypeId)
        except KeyError:
            requestMap[jitaRegion] = {materialTypeId}

    # for regionId in map(lambda region: region["regionId"],
    #                     regionMap.values()):
    for regionId in map(lambda region: regionMap[region]["regionId"],
                        regionsToCheck):
        for itemId in inventory.keys():
            try:
                requestMap[regionId].add(itemId)
            except KeyError:
                requestMap[regionId] = {itemId}

    start = time.time()
    orderHistory = runBatch(requestMap)
    print("Offer retrieval time: {}".format(time.time() - start))

    offers = {}
    jitaOffers = {}
    for region in regionsToCheck:
        regionId = regionMap[region]["regionId"]
        for typeId in inventory.keys():
            item = typeIdToItem[typeId]
            response = orderHistory[regionId][typeId]
            try:
                fiveDayAverage = getFiveDayAverage(response)
            except statistics.StatisticsError:
                eprint("Failed to process '{typeId}' in {region}. Continuing."
                       .format_map({
                           "typeId": item.name,
                           "region": region
                       }))
                continue
            if typeId not in offers:
                offers[typeId] = {}
            offers[typeId][region] = {
                "item": item,
                "price": fiveDayAverage,
            }
            if "Jita" == region:
                jitaOffers[typeId] = fiveDayAverage

    reprocessPrices = {}
    for typeId in reprocessOutputsToConsider:
        response = orderHistory[regionMap["Jita"]["regionId"]][typeId]
        fiveDayAverage = getFiveDayAverage(response)
        reprocessPrices[typeId] = fiveDayAverage

    reprocessOffers = {}
    for typeId in inventory.keys():
        reprocessValue = 0
        item = typeIdToItem[typeId]
        if 0 == len(item.reprocessingOutputs):
            continue
        for materialTypeId in item.reprocessingOutputs.keys():
            try:
                unitPrice = reprocessPrices[materialTypeId]
            except KeyError:
                continue
            reprocessValue += (
                1 - REPROCESSING_TAX_RATE
            ) * REPROCESSING_EFFICIENCY * unitPrice * item.reprocessingOutputs[
                materialTypeId]
        reprocessOffers[typeId] = reprocessValue

    bestOffers = {k: [] for k in regionsToCheck}
    for typeId in offers:
        bestRegion = ""
        bestPrice = -1.00
        for region in offers[typeId]:
            currentPrice = offers[typeId][region]["price"]
            if currentPrice > bestPrice:
                bestRegion = region
                bestPrice = currentPrice
        bestOffers[bestRegion].append(offers[typeId][bestRegion])

    results = {}
    for region in bestOffers:
        regionOffers = bestOffers[region]
        if 0 == len(regionOffers):
            continue

        doJitaComparison = "Jita" != region

        headers = ["Item Name", "Unit Price"]
        columnFormatString = ["{}", "{:,.2f}"]
        if doJitaComparison:
            headers.append("Jita")
            columnFormatString.append("{:,.2f} ISK")
        headers.append("Reprocess Value")
        columnFormatString.append("{:,.2f} ISK")
        headers.append("Qty")
        columnFormatString.append("{}")
        headers.append("Estimated Price")
        columnFormatString.append("{:,.2f} ISK")

        tableData = []
        for offer in regionOffers:
            item = offer["item"]
            typeId = item.typeId
            qty = inventory[typeId]["quantity"]
            unitPrice = offer["price"]
            row = [item.name, unitPrice]
            if doJitaComparison:
                row.append(jitaOffers[typeId])
            try:
                row.append(reprocessOffers[typeId])
            except KeyError:
                row.append(None)
            row.append(qty)
            row.append(qty * unitPrice)
            tableData.append(row)

        tableData.sort(key=lambda row: row[-1], reverse=True)

        results[region] = {
            'headers': headers,
            'data': tableData,
            'format': columnFormatString
        }

    return results