def test_add_track(self):
    new_track_name, new_track_file = self.new_track

    # Open new track
    genome = Genome(self.gdfilepath, mode="r+")
    with genome:
        self.assertEqual(genome.num_tracks_continuous, 0)
        genome.add_track_continuous(new_track_name)

    # Load data for new track
    load_data(self.gdfilepath, new_track_name,
              test_filename(new_track_file), verbose=self.verbose)

    # Close data with new track
    close_data(self.gdfilepath, verbose=self.verbose)

    # Make sure addition was successful
    genome = Genome(self.gdfilepath)
    with genome:
        # Track ordering should now end with dnase
        self.assertEqual(genome.tracknames_continuous, [new_track_name])
        # Given track ordering, check single track data retrieval
        self.assertArraysEqual(
            genome["chr1"][305:310, new_track_name],
            [-2.65300012, 0.37200001, 0.37200001, 0.37200001, 0.37099999])
def test_replace_track(self):
    # Test ability to delete and replace a track
    old_trackname = "primate"
    old_entry = (290, -2.327)
    new_trackname = "placental"
    new_entry = (290, -2.297)

    # Test value before deleting track
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", GenomedataDirtyWarning)
        with Genome(self.gdfilepath) as genome:
            chromosome = genome["chr1"]
            self.assertArraysEqual(chromosome[old_entry[0], old_trackname],
                                   old_entry[1])

    # Remove track
    erase_data(self.gdfilepath, old_trackname, verbose=self.verbose)

    # Now replace it with the data from a different track
    track_index = self.tracknames.index(new_trackname)
    datafile = self.trackfiles[track_index]
    load_data(self.gdfilepath, new_trackname, datafile,
              verbose=self.verbose)

    # Re-close data
    close_data(self.gdfilepath, verbose=self.verbose)

    # Now test that the data matches the new track data
    with Genome(self.gdfilepath) as genome:
        chromosome = genome["chr1"]
        self.assertArraysEqual(chromosome[new_entry[0], new_trackname],
                               new_entry[1])
def test_no_context(self): genome = Genome(self.gdfilepath) chr1 = genome["chr1"] tracknames = genome.tracknames_continuous data = chr1[100:1000] # Used to segfault chr2 = genome["chrY"] chr2.close() # Make sure manual close doesn't break it self.assertTrue(chr1.isopen) self.assertFalse(chr2.isopen) genome.close() self.assertFalse(chr1.isopen) self.assertRaises(Exception, iter(chr1).next)
def test_no_context(self): genome = Genome(self.gdfilepath) chr1 = genome["chr1"] genome.tracknames_continuous # test access chr1[100:1000] # test access: at one point segfaulted chr2 = genome["chrY"] chr2.close() # Make sure manual close doesn't break it self.assertTrue(chr1.isopen) self.assertFalse(chr2.isopen) genome.close() self.assertFalse(chr1.isopen) self.assertRaises(Exception, next, iter(chr1))
def test_filter_track(self):
    # Add filter track
    open_data(self.gdfilepath, [UNFILTERED_TRACKNAME],
              verbose=self.verbose)
    load_data(self.gdfilepath, UNFILTERED_TRACKNAME,
              test_filename(UNFILTERED_TRACK_FILENAME),
              verbose=self.verbose)
    close_data(self.gdfilepath, verbose=self.verbose)

    # Perform filtering on data
    hardmask_data(self.gdfilepath, test_filename(self.filter),
                  [UNFILTERED_TRACKNAME],
                  lambda x: x < self.filter_threshold,
                  verbose=self.verbose)

    # Make sure filtering was successful
    genome = Genome(self.gdfilepath)
    with genome:
        self.assertArraysEqual(genome["chr1"][0:4, UNFILTERED_TRACKNAME],
                               [nan, nan, nan, nan])
        self.assertArraysEqual(
            genome["chr1"][128:132, UNFILTERED_TRACKNAME],
            [nan, nan, 0.5, 0.5])
        self.assertArraysEqual(
            genome["chr1"][168:172, UNFILTERED_TRACKNAME],
            [0.9, 0.9, nan, nan])
        self.assertArraysEqual(
            genome["chr1"][206:210, UNFILTERED_TRACKNAME],
            [nan, nan, nan, nan])
def test_repr_str(self): genome = Genome(self.gdfilepath, mode="r") self.assertEqual(repr(genome), "Genome('%s', **{'mode': 'r'})" % self.gdfilepath) chr = genome["chr1"] if self.mode == "dir": self.assertEqual( repr(chr), "<Chromosome 'chr1', file='%s/chr1.genomedata'>" % self.gdfilepath) self.assertEqual(str(chr), "chr1") elif self.mode == "file": self.assertEqual( repr(chr), "<Chromosome 'chr1', file='%s'>" % self.gdfilepath) self.assertEqual(str(chr), "chr1") genome.close()
def genomedata_fill_random(gdfilename):
    with Genome(gdfilename, mode="r+") as genome:
        print >> sys.stderr, "Opening %s..." % gdfilename
        for chromosome in genome:
            print >> sys.stderr, "Overwriting %s with random data" % chromosome
            for supercontig, continuous in chromosome.itercontinuous():
                continuous[...] = rand(*continuous.shape)
def make_continuous_cells(track_indexes, genomedata_names, chromosome_name,
                          start, end):
    """Return a 2-dimensional numpy.ndarray of continuous observation data
    for the specified interval. This data is untransformed.

    dim 0: position
    dim 1: track
    """
    continuous_cells = None

    # For every track in each genomedata archive
    zipper = zip(track_indexes, genomedata_names)
    for track_index, genomedata_name in zipper:
        with Genome(genomedata_name) as genome:
            chromosome = genome[chromosome_name]
            # If we haven't started creating the continuous cells
            if continuous_cells is None:
                # Copy the first track into our continuous cells
                continuous_cells = copy(chromosome[start:end, [track_index]])
            else:
                # Otherwise append the track to our continuous cells
                continuous_cells = append(continuous_cells,
                                          chromosome[start:end, [track_index]],
                                          DIM_TRACK)

    return continuous_cells
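# Usage sketch (not from the original source): how make_continuous_cells might
# be called. The archive names and track indexes below are hypothetical, and
# the excerpt above relies on numpy's copy/append plus a DIM_TRACK constant;
# minimal stand-ins for those assumptions are shown here.
from numpy import append, copy  # used by make_continuous_cells above

DIM_TRACK = 1  # assumption: tracks stack along axis 1, per the docstring

cells = make_continuous_cells(track_indexes=[0, 1],
                              genomedata_names=["signal_a.genomedata",
                                                "signal_b.genomedata"],
                              chromosome_name="chr1", start=1000, end=2000)
print(cells.shape)  # expected (1000, 2): positions x tracks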
def process_signals(gdarchive, trackname, score_type, verbose=False):
    '''
    Process signals from a genomedata archive.

    Args:
        gdarchive (str): path to a genomedata archive
        trackname (str): name of track in genomedata archive
        score_type (str): scoring function
        verbose (Optional[bool]): maximum verbosity

    Returns:
        None. Writes signals to stdout or specified file.
    '''
    score_func = _score_func(score_type)
    with Genome(gdarchive) as genome:
        for chrom in genome:
            for pos in range(chrom.start, chrom.end):
                if isnan(chrom[pos, trackname]):
                    continue
                score = score_func(chrom, pos, trackname, verbose)
                if not score:
                    continue
                fields = (chrom, pos, pos + 1, score)
                print(*map(str, fields), sep='\t')
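# Illustrative only (not from the original source): _score_func is referenced
# above but not shown. A minimal stand-in with the call signature that
# process_signals() expects might look like this, returning the raw track
# value at a position. The "raw" score_type name is hypothetical.
def _score_func(score_type):
    def raw_score(chrom, pos, trackname, verbose=False):
        # chrom[pos, trackname] yields the single-track value at one position
        return float(chrom[pos, trackname])

    if score_type == "raw":
        return raw_score
    raise ValueError("unknown score_type: %s" % score_type)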
def test_repr_str(self): genome = Genome(self.gdfilepath, mode="r") self.assertEqual(repr(genome), "Genome('%s', **{'mode': 'r'})" % self.gdfilepath) chr = genome["chr1"] if self.mode == "dir": self.assertEqual(repr(chr), "<Chromosome 'chr1', file='%s/chr1.genomedata'>" % self.gdfilepath) self.assertEqual(str(chr), "chr1") elif self.mode == "file": self.assertEqual(repr(chr), "<Chromosome 'chr1', file='%s'>" % self.gdfilepath) self.assertEqual(str(chr), "chr1") genome.close()
def test_open_chromosomes(self):
    genome = Genome(self.gdfilepath)
    with genome:
        chr1 = genome["chr1"]
        chr2 = genome["chr1"]  # Memoized
        self.assertEqual(chr1, chr2)
        genome["chrY"]
        self.assertEqual(len(genome.open_chromosomes), 2)

    self.assertEqual(genome.open_chromosomes, {})
def test_add_track(self): new_track_name, new_track_file = self.new_track # Open new track genome = Genome(self.gdfilepath, mode="r+") with warnings.catch_warnings(): warnings.simplefilter("ignore", GenomedataDirtyWarning) with genome: self.assertEqual(genome.num_tracks_continuous, 0) genome.add_track_continuous(new_track_name) # Load data for new track load_data(self.gdfilepath, new_track_name, test_filename(new_track_file), verbose=self.verbose) # Close data with new track close_data(self.gdfilepath, verbose=self.verbose) # Make sure addition was successful genome = Genome(self.gdfilepath) with genome: # Track ordering should now end with dnase self.assertEqual(genome.tracknames_continuous, [new_track_name]) # Given track ordering, check single track data retrieval self.assertArraysEqual( genome["chr1"][305:310, new_track_name], [-2.65300012, 0.37200001, 0.37200001, 0.37200001, 0.37099999])
def test_delete_tracks(self):
    # Test ability to delete a track
    trackname = "primate"
    old_entry = (290, -2.327)

    # Test value before deleting track
    warnings.simplefilter("ignore")
    with Genome(self.gdfilepath, "r+") as genome:
        chromosome = genome["chr1"]
        self.assertArraysEqual(chromosome[old_entry[0], trackname],
                               old_entry[1])
        chromosome._erase_data(trackname)
    warnings.resetwarnings()

    # Re-close data
    close_data(self.gdfilepath, verbose=self.verbose)

    # Test value after deleting track: the erased track should now read NaN
    with Genome(self.gdfilepath) as genome:
        chromosome = genome["chr1"]
        self.assertArraysEqual(chromosome[old_entry[0], trackname], nan)
def validate(filename, genomedatadir, dirpath, clobber=False, quick=False,
             replot=False, noplot=False, mnemonic_file=None, verbose=True,
             ropts=None):
    setup_directory(dirpath)

    if not replot:
        annotation = Annotation(filename, verbose=verbose)
        labels = annotation.labels

        with Genome(genomedatadir) as genome:
            nuc_counts, dinuc_counts = \
                calc_nucleotide_frequencies(annotation, genome,
                                            quick=quick, verbose=verbose)

        save_tab(labels, nuc_counts, dinuc_counts, dirpath,
                 clobber=clobber, verbose=verbose)

    if not noplot:
        with open_transcript(dirpath, MODULE) as transcriptfile:
            save_plot(dirpath, clobber=clobber, verbose=verbose,
                      mnemonic_file=mnemonic_file, ropts=ropts,
                      transcriptfile=transcriptfile)

    save_html(dirpath, clobber=clobber, mnemonicfile=mnemonic_file,
              verbose=verbose)
def print_random_coordinates(genomedatadir, n=DEFAULT_N, chrom=None,
                             one_chrom=False, in_supercontigs=False):
    with Genome(genomedatadir) as genome:
        # Collect names and total lengths of chromosomes
        # If in_supercontigs, this is the total length of all supercontigs
        chrom_weights = {}
        for chromosome in genome:
            chrom_weight = 0
            if in_supercontigs:
                for supercontig in chromosome:
                    chrom_weight += supercontig.end - supercontig.start
            else:
                chrom_weight = chromosome.end - chromosome.start

            chrom_weights[chromosome.name] = chrom_weight

        total_weight = sum(chrom_weights.values())

        if chrom:
            assert chrom in chrom_weights
            one_chrom = True  # Use specified chrom
            print >>sys.stderr, "set one_chrom"
        elif one_chrom:
            chrom = rand_chrom(chrom_weights.keys())
            print >>sys.stderr, "using only %s" % chrom

        for i in range(0, n):
            if not one_chrom:
                chrom = rand_chrom_weighted(chrom_weights, total_weight)

            if in_supercontigs:
                index = rand_supercontig_position(genome[chrom],
                                                  chrom_weights[chrom])
            else:
                index = rand_chromosome_position(genome[chrom])

            print "%s\t%d" % (chrom, index)
from __future__ import with_statement, division

import sys
import warnings
from collections import defaultdict

from genomedata import Genome

indices = defaultdict(list)
for line in sys.stdin:
    tokens = line.strip().split()
    if tokens:
        chrom, index = line.strip().split()
        indices[chrom].append(int(index))

indices = dict(indices)  # remove defaultdict behavior

with Genome(sys.argv[1]) as genome:
    tracknames = sys.argv[2:]
    for trackname in tracknames:
        assert trackname in genome.tracknames_continuous

    warnings.simplefilter("ignore")

    count = 0
    for chrom in indices:
        indices[chrom].sort()  # Sort by index ascending
        index_iter = iter(indices[chrom])
        index = index_iter.next()

        chromosome = genome[chrom]
        chrom_done = False
        for supercontig, continuous in chromosome.itercontinuous():
            start = supercontig.start
            end = supercontig.end
def close_data(gdfilename, verbose=False):
    with Genome(gdfilename, mode="r+") as genome:
        write_genome_metadata(genome, verbose)
### DenseCPT ###
input_master.append(
    InlineSection(
        ("start_seg", DenseCPT([1 / 3, 1 / 3, 1 / 3])),
        ("seg_subseg", DenseCPT([[1.0], [1.0], [1.0]])),
        ("seg_seg", DenseCPT([[0, 0.5, 0.5],
                              [0.5, 0, 0.5],
                              [0.5, 0.5, 0]])),
        ("seg_subseg_subseg", DenseCPT([[[1.0]], [[1.0]], [[1.0]]])),
        ("segCountDown_seg_segTransition",
         DenseCPT([[[0.99, 0.00999, 0.00001],
                    [0.99, 0.00999, 0.00001],
                    [0.99, 0.00999, 0.00001]],
                   [[0.99, 0.01, 0.0],
                    [0.99, 0.01, 0.0],
                    [0.99, 0.01, 0.0]]]))))

### Mean and Covar Sections ###
# These sections are derived from the genomedata archive and are what we aim
# to change
with Genome(genomedata) as genome:
    # Load info from GD archive to get mean and variance
    sums = genome.sums
    sums_squares = genome.sums_squares
    num_datapoints = genome.num_datapoints

mean = sums / num_datapoints
var = (sums_squares / num_datapoints) - mean**2
sd = sqrt(var)

# Set group means to be 2 SD from the actual mean
means = [mean - 2 * sd, mean, mean + 2 * sd]

# Transform for arcsinh dist
var_transformed = arcsinh(var)
means_transformed = arcsinh(means)
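# Note (not from the original excerpt): genomedata's Genome object also
# exposes aggregate per-track statistics directly, which should agree with
# the manual sums / num_datapoints computation above. A minimal sketch,
# assuming the installed genomedata version provides the means/vars
# properties and that `genomedata` holds a readable archive path:
from genomedata import Genome
from numpy import sqrt

with Genome(genomedata) as genome:
    mean = genome.means       # per-track means
    sd = sqrt(genome.vars)    # per-track standard deviations
means = [mean - 2 * sd, mean, mean + 2 * sd]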
# data is a numpy array
def get_nonzero_min(data, zero):
    # test1 = numpy.zeros(10)
    # test1[5] = 5
    # print("test1:", test1)
    # test1_result = get_nonzero_min(test1, 0)
    # print("Should be 5:", test1_result)
    if type(data) == list:
        temp = numpy.concatenate(([data[0]], [data[1]]))
        ma = numpy.ma.masked_equal(temp, 0.0, copy=False)
    else:
        ma = numpy.ma.masked_equal(data, 0.0, copy=False)
    return ma.min()

with Genome(str(sys.argv[1])) as genome:
    with open(str(sys.argv[2])) as bedfile:
        for line in bedfile:
            print(line)
            bed_items = line.strip().split()
            chr_name = bed_items[0]
            start = int(bed_items[1])
            end = int(bed_items[2])

            # HERE IS YOUR GENOMEDATA DATA, a numpy array
            print("data grabbing")
            data = genome[chr_name][start:end]
            print("min&max")
            min = get_nonzero_min(data, 0)
            max = data.max()
            print("concatenation")
(options, args) = parser.parse_args()

if options.inVCF is None:
    parser.error('input VCF not given')
if options.outVCF is None:
    parser.error('output VCF not given')
if options.genomedata is None:
    parser.error('genomedata archive not given')

###############################################################################

# try to open up the genomedata archive
try:
    genome = Genome(options.genomedata)
except:
    print "ERROR!! Couldn't open the genomedata archive: " + options.genomedata + "\n"
    sys.exit(1)

# setup file open/close with or without gzip
# if options.isGzip is True:
#     try:
#         gc = 'gunzip -c ' + options.inVCF
#         inFile = os.popen(gc, 'r')
#     except:
#         print "ERROR!! Couldn't open the file" + options.inVCF + " (with gzip)\n"
#         sys.exit(1)
#
#     try:
#         gc = 'gzip > ' + options.outVCF
stageFileMatches = stageFileMatches.set_index("Stage")

# Get the list of features
featureList = list(stageFileMatches.columns.values)
# Since "Stage" is the index, don't need to drop it specifically.

stageFileDict = {}
for eachStage in stageList:
    stageFileDict[eachStage] = {}
    # Now each stage is a key to an inner dictionary. Fill this inner dictionary
    for eachFeature in featureList:
        stageFileDict[eachStage][eachFeature] = stageFileMatches.get_value(
            eachStage, eachFeature)

# Next, I need to read in a genomedata file to access tracks for defining our features
genomeDataDir = "/net/noble/vol2/home/katecook/proj/2016predictExpression/data/pfal3D7.genomedata"
with Genome(genomeDataDir) as myGenome:
    # Add a functor for applying operations to each row
    for eachFeature in featureList:
        print("Assigning " + str(eachFeature))
        # fileLookupFunctor = featureAssinger(stageFileMatches, eachFeature,
        #                                     windowBack, windowFor)
        # # Add the row
        # groSeqData[eachFeature] = groSeqData.apply(fileLookupFunctor, axis = 1)

        # Loop for each section where averages should be taken
        for eachSection in windowDict:
            fileLookupFunctorAverage = featureAssinger(
                stageFileMatches, eachFeature,
                windowDict[eachSection][0], windowDict[eachSection][1],
                valueToUse="Average")
            fileLookupFunctorMax = featureAssinger(stageFileMatches,
def validate(bedfilename, genomedatadir, dirpath, clobber=False, quick=False,
             replot=False, noplot=False, verbose=True, mnemonic_file=None,
             create_mnemonics=False, inputdirs=None, chroms=None, ropts=None,
             label_order_file=None, track_order_file=None,
             transformation=None):
    if not replot:
        setup_directory(dirpath)

    genome = Genome(genomedatadir)
    segmentation = Segmentation(bedfilename, verbose=verbose)

    if inputdirs:
        # Merge stats from many input directories
        stats = SignalStats()
        for inputdir in inputdirs:
            try:
                sub_stats = SignalStats.from_file(inputdir, verbose=verbose)
            except IOError as e:
                log("Problem reading data from %s: %s" % (inputdir, e))
            else:
                stats.add(sub_stats)
    elif replot:
        stats = SignalStats.from_file(dirpath, verbose=verbose)
    else:
        # Calculate stats over segmentation
        stats = SignalStats.from_segmentation(genome, segmentation,
                                              transformation=transformation,
                                              quick=quick, chroms=chroms,
                                              verbose=verbose)

    if not replot:
        stats.save_tab(dirpath, clobber=clobber, verbose=verbose)

    if mnemonic_file is None and create_mnemonics:
        statsfilename = make_tabfilename(dirpath, NAMEBASE)
        mnemonic_file = create_mnemonic_file(statsfilename, dirpath,
                                             clobber=clobber, verbose=verbose)

    if not noplot:
        if label_order_file is not None:
            log("Reading label ordering from: %s" % label_order_file)
        label_order = read_order_file(label_order_file)

        if track_order_file is not None:
            log("Reading track ordering from: %s" % track_order_file)
        track_order = read_order_file(track_order_file)

        with open_transcript(dirpath, MODULE) as transcriptfile:
            stats.save_plot(dirpath, namebase=NAMEBASE, clobber=clobber,
                            mnemonic_file=mnemonic_file, verbose=verbose,
                            label_order=label_order, track_order=track_order,
                            ropts=ropts, transcriptfile=transcriptfile)

    save_html(dirpath, genomedatadir, clobber=clobber, verbose=verbose)
def test_interface(self):
    original_num_datapoints = 0
    if self.write:
        mode = "r+"
    else:
        mode = "r"

    # catch_warnings acts as a context manager storing the original warning
    # filter and resetting it at the end. All non-user warnings should
    # still be displayed
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", GenomedataDirtyWarning)
        warnings.simplefilter("ignore", OverlapWarning)
        with Genome(self.gdfilepath, mode=mode) as genome:
            original_num_datapoints = genome.num_datapoints

            self.assertTrue("chr1" in genome)
            self.assertFalse("chrZ" in genome)

            chromosome = genome["chr1"]

            # Test tracknames are as expected
            self.assertEqual(sorted(chromosome.tracknames_continuous),
                             sorted(self.tracknames))
            # Test tracknames are consistent
            self.assertEqual(sorted(genome.tracknames_continuous),
                             sorted(chromosome.tracknames_continuous))

            # Test chromosome attributes
            self.assertEqual(chromosome.start, 0)
            self.assertEqual(chromosome.end, 24950)

            # Test sequence inside of data range
            self.assertEqual(seq2str(chromosome.seq[0:20]),
                             "taaccctaaccctaacccta")
            # Test sequence outside of data range
            self.assertEqual(seq2str(chromosome.seq[30000]), "n")

            # Track ordering should be: placental, primate, vertebrate
            self.assertEqual(chromosome.tracknames_continuous,
                             self.tracknames)

            # Given track ordering, check multi-track data retrieval
            self.assertArraysEqual(chromosome[290, 0:3],
                                   [-2.297, -2.327, -2.320])

            # test multi-track data retrieval by list
            self.assertArraysEqual(
                chromosome[290, ["placental", "primate", "vertebrate"]],
                chromosome[290, 0:3])
            self.assertArraysEqual(
                chromosome[290, ["placental", "vertebrate"]],
                [-2.297, -2.320])
            self.assertArraysEqual(chromosome[290, [0, 2]],
                                   [-2.297, -2.320])
            self.assertArraysEqual(chromosome[290, [2, 0]],
                                   [-2.320, -2.297])
            self.assertArraysEqual(chromosome[290, array([1, 0])],
                                   [-2.327, -2.297])

            # Test filling of unassigned continuous segments
            chromosome = genome["chrY"]
            # Get first supercontig
            for supercontig in chromosome:
                break
            self.assertArraysEqual(supercontig.continuous[0, 2], nan)

            # If we are testing writing to archives
            if self.write:
                # Test writing scalars to various indexing methods
                chromosome = genome["chr1"]
                # Test writing scalar to multiple tracks
                chromosome[290] = 100.0
                # Test writing scalar to tracks by named list
                chromosome[291, ["placental", "primate",
                                 "vertebrate"]] = 101.0
                # Test writing scalar to select tracks by named list
                chromosome[292, ["placental", "vertebrate"]] = 102.0
                # Test writing scalar to tracks by index
                chromosome[293, [0, 2]] = 103.0
                chromosome[294, [2, 0]] = 104.0

                # Test writing arrays to various indexing methods
                # Test writing an array to a single index
                chromosome[295] = [105.0, 106.0, 107.0]
                # Test writing a subarray to an index subset
                chromosome[296, ["placental", "vertebrate"]] = [108.0, 109.0]

                # Test removing datapoints by writing NaN
                chromosome[297, ["primate"]] = nan

                # Test writing around supercontig boundaries
                # <Supercontig 'supercontig_0', [0:24950]>
                # Test writing outside a supercontig
                try:
                    chromosome[300000] = 110.0
                except ValueError:
                    pass  # we expect a value error here
                # Test writing overlap across supercontig to no supercontig
                try:
                    chromosome[24900:30000] = 111.0
                except ValueError:
                    pass  # we expect a value error here

    # Check write output after closing if testing writes
    if self.write:
        # Close with newly written data
        close_data(self.gdfilepath, verbose=self.verbose)

        # Read data and verify new data and parameters
        with Genome(self.gdfilepath) as genome:
            chromosome = genome["chr1"]
            self.assertArraysEqual(chromosome[290], [100.0, 100.0, 100.0])
            self.assertArraysEqual(chromosome[291], [101.0, 101.0, 101.0])
            # L14 in primate wigFix
            self.assertArraysEqual(chromosome[292], [102.0, 0.371, 102.0])
            self.assertArraysEqual(chromosome[293], [103.0, 0.372, 103.0])
            self.assertArraysEqual(chromosome[294], [104.0, 0.372, 104.0])
            self.assertArraysEqual(chromosome[295], [105.0, 106.0, 107.0])
            self.assertArraysEqual(chromosome[296], [108.0, -2.327, 109.0])

            # Check if one datapoint was successfully removed
            self.assertArraysEqual(original_num_datapoints,
                                   [genome.num_datapoints[0],
                                    genome.num_datapoints[1] + 1,
                                    genome.num_datapoints[2]])