def test_constructors_errors(self):
    """Check that malformed constructor input raises the expected error."""
    # Each entry pairs the expected exception type with a zero-argument
    # callable, so the offending expression is only evaluated inside the
    # ``pytest.raises`` block below.
    failing_calls = [
        # scalar
        (TypeError, lambda: IntervalIndex(5)),
        # not an interval
        (TypeError, lambda: IntervalIndex([0, 1])),
        (TypeError, lambda: IntervalIndex.from_intervals([0, 1])),
        # invalid closed
        (ValueError, lambda: IntervalIndex.from_arrays(
            [0, 1], [1, 2], closed='invalid')),
        # mismatched closed
        (ValueError, lambda: IntervalIndex.from_intervals(
            [Interval(0, 1), Interval(1, 2, closed='left')])),
        (ValueError, lambda: IntervalIndex.from_arrays([0, 10], [3, 5])),
        (ValueError, lambda: Index(
            [Interval(0, 1), Interval(2, 3, closed='left')])),
        # no point in nesting periods in an IntervalIndex
        (ValueError, lambda: IntervalIndex.from_breaks(
            pd.period_range('2000-01-01', periods=3))),
    ]

    for expected_error, call in failing_calls:
        with pytest.raises(expected_error):
            call()
def test_constructors_errors_string(self, data):
    """Check that every constructor rejects ``data`` with the same TypeError."""
    # GH 19016
    lower, upper = data[:-1], data[1:]
    pairs = lzip(lower, upper)
    # fall back to ``data`` itself when no intervals could be built
    intervals = [Interval(lo, hi) for lo, hi in pairs] or data

    msg = ('category, object, and string subtypes are not supported '
           'for IntervalIndex')

    constructor_calls = [
        lambda: IntervalIndex(intervals),
        lambda: Index(intervals),
        lambda: IntervalIndex.from_intervals(intervals),
        lambda: IntervalIndex.from_breaks(data),
        lambda: IntervalIndex.from_arrays(lower, upper),
        lambda: IntervalIndex.from_tuples(pairs),
    ]
    for call in constructor_calls:
        with tm.assert_raises_regex(TypeError, msg):
            call()
def test_constructors(self, data, closed, name):
    """Compare every constructor's output against a ``_simple_new`` index."""
    lower, upper = data[:-1], data[1:]
    intervals = [
        Interval(lo, hi, closed=closed) for lo, hi in lzip(lower, upper)
    ]
    expected = IntervalIndex._simple_new(
        left=lower, right=upper, closed=closed, name=name)

    # validate expected
    assert expected.closed == closed
    assert expected.name == name
    assert expected.dtype.subtype == data.dtype
    tm.assert_index_equal(expected.left, data[:-1])
    tm.assert_index_equal(expected.right, data[1:])

    # validated constructors
    tm.assert_index_equal(IntervalIndex(intervals, name=name), expected)
    tm.assert_index_equal(
        IntervalIndex.from_intervals(intervals, name=name), expected)
    tm.assert_index_equal(
        IntervalIndex.from_breaks(data, closed=closed, name=name), expected)
    tm.assert_index_equal(
        IntervalIndex.from_arrays(lower, upper, closed=closed, name=name),
        expected)
    tm.assert_index_equal(
        IntervalIndex.from_tuples(
            lzip(lower, upper), closed=closed, name=name),
        expected)

    via_index = Index(intervals, name=name)
    assert isinstance(via_index, IntervalIndex)
    tm.assert_index_equal(via_index, expected)

    # idempotent
    tm.assert_index_equal(Index(expected), expected)
    tm.assert_index_equal(IntervalIndex(expected), expected)
    tm.assert_index_equal(IntervalIndex.from_intervals(expected), expected)
    tm.assert_index_equal(
        IntervalIndex.from_intervals(expected.values, name=expected.name),
        expected)

    same_meta = dict(closed=expected.closed, name=expected.name)
    tm.assert_index_equal(
        IntervalIndex.from_arrays(expected.left, expected.right, **same_meta),
        expected)
    tm.assert_index_equal(
        IntervalIndex.from_tuples(expected.to_tuples(), **same_meta),
        expected)

    breaks = expected.left.tolist() + [expected.right[-1]]
    tm.assert_index_equal(
        IntervalIndex.from_breaks(breaks, **same_meta), expected)
def test_missing_values(self):
    """Check NaN handling when building and null-testing an IntervalIndex."""
    from_objects = pd.Index([np.nan, pd.Interval(0, 1), pd.Interval(1, 2)])
    from_bounds = pd.IntervalIndex.from_arrays(
        [np.nan, 0, 1], [np.nan, 1, 2])
    assert from_objects.equals(from_bounds)

    # NaN on the left side only must be rejected
    with pytest.raises(ValueError):
        IntervalIndex.from_arrays([np.nan, 0, 1], np.array([0, 1, 2]))

    null_mask = np.array([True, False, False])
    tm.assert_numpy_array_equal(isnull(from_objects), null_mask)
def test_union(self):
    """Exercise ``union`` in both operand orders and against itself."""
    extra = IntervalIndex.from_arrays([2], [3])
    combined = IntervalIndex.from_arrays(range(3), range(1, 4))

    self.assertTrue(combined.equals(self.index.union(extra)))
    self.assertTrue(combined.equals(extra.union(self.index)))

    # union with itself, or with a subset of itself, changes nothing
    tm.assert_index_equal(self.index.union(self.index), self.index)
    tm.assert_index_equal(self.index.union(self.index[:1]), self.index)
def test_missing_values(self, closed):
    """Check NaN handling for an IntervalIndex with the given ``closed``."""
    from_objects = Index([
        np.nan,
        Interval(0, 1, closed=closed),
        Interval(1, 2, closed=closed),
    ])
    from_bounds = IntervalIndex.from_arrays(
        [np.nan, 0, 1], [np.nan, 1, 2], closed=closed)
    assert from_objects.equals(from_bounds)

    # NaN on the left side only must be rejected
    with pytest.raises(ValueError):
        IntervalIndex.from_arrays(
            [np.nan, 0, 1], np.array([0, 1, 2]), closed=closed)

    null_mask = np.array([True, False, False])
    tm.assert_numpy_array_equal(isna(from_objects), null_mask)
def test_union(self, closed):
    """Exercise ``union`` in both operand orders and against itself."""
    base = self.create_index(closed=closed)
    extra = IntervalIndex.from_arrays([2], [3], closed=closed)
    combined = IntervalIndex.from_arrays(
        range(3), range(1, 4), closed=closed)

    assert combined.equals(base.union(extra))
    assert combined.equals(extra.union(base))

    # union with itself, or with a subset of itself, changes nothing
    tm.assert_index_equal(base.union(base), base)
    tm.assert_index_equal(base.union(base[:1]), base)
def test_sort_index_intervals(self): s = Series([np.nan, 1, 2, 3], IntervalIndex.from_arrays( [0, 1, 2, 3], [1, 2, 3, 4])) result = s.sort_index() expected = s assert_series_equal(result, expected) result = s.sort_index(ascending=False) expected = Series([3, 2, 1, np.nan], IntervalIndex.from_arrays( [3, 2, 1, 0], [4, 3, 2, 1])) assert_series_equal(result, expected)
def test_constructor_errors(self):
    """Check ``from_arrays`` errors for categorical and ragged input."""
    unsupported_msg = ('category, object, and string subtypes are not '
                       'supported for IntervalIndex')
    length_msg = 'left and right must have the same length'

    # GH 19016: categorical data
    categorical = Categorical(list('01234abcde'), ordered=True)
    with tm.assert_raises_regex(TypeError, unsupported_msg):
        IntervalIndex.from_arrays(categorical[:-1], categorical[1:])

    # unequal length
    three_lefts, two_rights = [0, 1, 2], [2, 3]
    with tm.assert_raises_regex(ValueError, length_msg):
        IntervalIndex.from_arrays(three_lefts, two_rights)
def test_constructors(self, closed, name):
    """Compare every constructor's output against a ``_simple_new`` index."""
    lefts, rights = Index([0, 1, 2, 3]), Index([1, 2, 3, 4])
    intervals = [
        Interval(lo, hi, closed=closed) for lo, hi in zip(lefts, rights)
    ]
    expected = IntervalIndex._simple_new(
        left=lefts, right=rights, closed=closed, name=name)

    tm.assert_index_equal(IntervalIndex(intervals, name=name), expected)
    tm.assert_index_equal(
        IntervalIndex.from_intervals(intervals, name=name), expected)
    tm.assert_index_equal(
        IntervalIndex.from_breaks(np.arange(5), closed=closed, name=name),
        expected)
    tm.assert_index_equal(
        IntervalIndex.from_arrays(
            lefts.values, rights.values, closed=closed, name=name),
        expected)
    tm.assert_index_equal(
        IntervalIndex.from_tuples(
            zip(lefts, rights), closed=closed, name=name),
        expected)

    via_index = Index(intervals, name=name)
    assert isinstance(via_index, IntervalIndex)
    tm.assert_index_equal(via_index, expected)

    # idempotent
    tm.assert_index_equal(Index(expected), expected)
    tm.assert_index_equal(IntervalIndex(expected), expected)
    tm.assert_index_equal(
        IntervalIndex.from_intervals(expected.values, name=expected.name),
        expected)

    same_meta = dict(closed=expected.closed, name=expected.name)
    tm.assert_index_equal(
        IntervalIndex.from_arrays(expected.left, expected.right, **same_meta),
        expected)
    tm.assert_index_equal(
        IntervalIndex.from_tuples(expected.to_tuples(), **same_meta),
        expected)

    breaks = expected.left.tolist() + [expected.right[-1]]
    tm.assert_index_equal(
        IntervalIndex.from_breaks(breaks, **same_meta), expected)
def test_constructors_errors_tz(self, tz_left, tz_right):
    """Check that mixed-timezone endpoints are rejected by the constructors.

    ``tz_left`` and ``tz_right`` are the time zones applied to the left and
    right endpoints respectively. ``from_arrays``, ``from_tuples`` and
    ``from_breaks`` are each expected to raise ``ValueError``.
    """
    # GH 18537
    left = date_range('2017-01-01', periods=4, tz=tz_left)
    right = date_range('2017-01-02', periods=4, tz=tz_right)

    # don't need to check IntervalIndex(...) or from_intervals, since
    # mixed tz are disallowed at the Interval level

    # Inputs are prepared up front so each ``pytest.raises`` block wraps only
    # the constructor call; a ValueError raised while building the inputs
    # must fail the test rather than satisfy the assertion.
    tuples = lzip(left, right)
    breaks = left.tolist() + [right[-1]]

    with pytest.raises(ValueError):
        IntervalIndex.from_arrays(left, right)

    with pytest.raises(ValueError):
        IntervalIndex.from_tuples(tuples)

    with pytest.raises(ValueError):
        IntervalIndex.from_breaks(breaks)
def test_astype(self):
    """Check ``astype`` to category, object and interval."""
    cat_index = self.create_index()

    tm.assert_index_equal(cat_index.astype('category'), cat_index, exact=True)

    as_object = cat_index.astype(object)
    tm.assert_index_equal(as_object, Index(np.array(cat_index)))

    # this IS equal, but not the same class
    assert as_object.equals(cat_index)
    assert isinstance(as_object, Index)
    assert not isinstance(as_object, CategoricalIndex)

    # interval
    intervals = IntervalIndex.from_arrays(
        left=[-0.001, 2.0], right=[2, 4], closed='right')
    coded = Categorical.from_codes(
        [0, 1, -1], categories=intervals, ordered=True)
    cat_index = CategoricalIndex(coded)

    as_interval = cat_index.astype('interval')
    expected = intervals.take([0, 1, -1])
    tm.assert_index_equal(as_interval, expected)

    rebuilt = IntervalIndex.from_intervals(as_interval.values)
    tm.assert_index_equal(rebuilt, expected)
def test_constructors_empty(self, data, closed):
    """Check that each constructor produces the same empty index."""
    # GH 18421
    expected_dtype = getattr(data, 'dtype', np.int64)
    expected_values = np.array([], dtype=object)
    expected_index = IntervalIndex(data, closed=closed)

    # validate the expected index
    assert expected_index.empty
    assert expected_index.closed == closed
    assert expected_index.dtype.subtype == expected_dtype
    tm.assert_numpy_array_equal(expected_index.values, expected_values)

    builders = [
        lambda: IntervalIndex.from_tuples(data, closed=closed),
        lambda: IntervalIndex.from_breaks(data, closed=closed),
        lambda: IntervalIndex.from_arrays(data, data, closed=closed),
    ]
    if closed == 'right':
        # Can't specify closed for IntervalIndex.from_intervals
        builders.append(lambda: IntervalIndex.from_intervals(data))

    for build in builders:
        result = build()
        tm.assert_index_equal(result, expected_index)
        tm.assert_numpy_array_equal(result.values, expected_values)
def test_constructors_nan(self, closed, data):
    """Check that each constructor produces the same index from ``data``."""
    # GH 18421
    expected_values = np.array(data, dtype=object)
    expected_idx = IntervalIndex(data, closed=closed)

    # validate the expected index
    assert expected_idx.closed == closed
    tm.assert_numpy_array_equal(expected_idx.values, expected_values)

    builders = [
        lambda: IntervalIndex.from_tuples(data, closed=closed),
        # breaks need one more entry than there are intervals
        lambda: IntervalIndex.from_breaks([np.nan] + data, closed=closed),
        lambda: IntervalIndex.from_arrays(data, data, closed=closed),
    ]
    if closed == 'right':
        # Can't specify closed for IntervalIndex.from_intervals
        builders.append(lambda: IntervalIndex.from_intervals(data))

    for build in builders:
        result = build()
        tm.assert_index_equal(result, expected_idx)
        tm.assert_numpy_array_equal(result.values, expected_values)
def test_subtype_conversion(self, index, subtype): dtype = IntervalDtype(subtype) result = index.astype(dtype) expected = IntervalIndex.from_arrays(index.left.astype(subtype), index.right.astype(subtype), closed=index.closed) tm.assert_index_equal(result, expected)
def test_constructors(self):
    """Check that the various constructors reproduce ``self.index``."""
    expected = self.index

    def unit_intervals():
        # fresh list on every call, as in the inline literals it replaces
        return [Interval(0, 1), Interval(1, 2)]

    right_closed = IntervalIndex.from_breaks(np.arange(3), closed='right')
    self.assertTrue(expected.equals(right_closed))

    left_closed = IntervalIndex.from_breaks(np.arange(3), closed='left')
    self.assertFalse(expected.equals(left_closed))

    self.assertTrue(
        expected.equals(IntervalIndex.from_intervals(unit_intervals())))
    self.assertTrue(expected.equals(IntervalIndex(unit_intervals())))
    self.assertTrue(expected.equals(
        IntervalIndex.from_arrays(
            np.arange(2), np.arange(2) + 1, closed='right')))

    # the generic Index constructor must infer an IntervalIndex
    inferred = Index(unit_intervals())
    assert isinstance(inferred, IntervalIndex)
    self.assertTrue(expected.equals(inferred))

    rewrapped = Index(expected)
    assert isinstance(rewrapped, IntervalIndex)
    self.assertTrue(expected.equals(rewrapped))
def test_take(self):
    """Check ``take`` with all positions and with a repeated position."""
    taken_all = self.index.take([0, 1])
    self.assertTrue(self.index.equals(taken_all))

    with_repeat = IntervalIndex.from_arrays([0, 0, 1], [1, 1, 2])
    taken_repeat = self.index.take([0, 0, 1])
    self.assertTrue(with_repeat.equals(taken_repeat))
def test_itemsize(self):
    """Check ``itemsize`` for an index built from two int64 arrays."""
    # GH 19209
    lower = np.arange(0, 4, dtype='i8')
    upper = np.arange(1, 5, dtype='i8')
    index = IntervalIndex.from_arrays(lower, upper)

    # 8 * 2
    assert index.itemsize == 16
def setup(self, N):
    """Build the interval indexes stored on ``self`` for a given size ``N``."""
    # N consecutive unit intervals plus one extra (0, 1) interval at the end
    lower = np.append(np.arange(N), np.array(0))
    upper = np.append(np.arange(1, N + 1), np.array(1))
    self.intv = IntervalIndex.from_arrays(lower, upper)
    # attribute access only; the value is discarded
    self.intv._engine

    left_breaks = np.arange(N)
    right_breaks = np.arange(N - 3, 2 * N - 3)
    self.left = IntervalIndex.from_breaks(left_breaks)
    self.right = IntervalIndex.from_breaks(right_breaks)
def test_subtype_integer(self, subtype_start, subtype_end): index = IntervalIndex.from_breaks(np.arange(100, dtype=subtype_start)) dtype = IntervalDtype(subtype_end) result = index.astype(dtype) expected = IntervalIndex.from_arrays(index.left.astype(subtype_end), index.right.astype(subtype_end), closed=index.closed) tm.assert_index_equal(result, expected)
def test_nbytes(self): # GH 19209 left = np.arange(0, 4, dtype='i8') right = np.arange(1, 5, dtype='i8') result = IntervalIndex.from_arrays(left, right).nbytes expected = 64 # 4 * 8 * 2 assert result == expected
def test_dropna_intervals(self): s = Series([np.nan, 1, 2, 3], IntervalIndex.from_arrays( [np.nan, 0, 1, 2], [np.nan, 1, 2, 3])) result = s.dropna() expected = s.iloc[1:] assert_series_equal(result, expected)
def test_take(self, closed):
    """Check ``take`` with all positions and with a repeated position."""
    index = self.create_index(closed=closed)

    tm.assert_index_equal(index.take([0, 1]), index)

    with_repeat = IntervalIndex.from_arrays(
        [0, 0, 1], [1, 1, 2], closed=closed)
    tm.assert_index_equal(index.take([0, 0, 1]), with_repeat)
def test_append(self):
    """Check ``append`` with one index, a list of indexes, and a mismatch."""
    first = IntervalIndex.from_arrays([0, 1], [1, 2])
    second = IntervalIndex.from_arrays([1, 2], [2, 3])

    tm.assert_index_equal(
        first.append(second),
        IntervalIndex.from_arrays([0, 1, 1, 2], [1, 2, 2, 3]))

    tm.assert_index_equal(
        first.append([first, second]),
        IntervalIndex.from_arrays([0, 1, 0, 1, 1, 2], [1, 2, 1, 2, 2, 3]))

    # appending an index with a different ``closed`` must fail
    self.assertRaises(
        ValueError,
        lambda: first.append(
            IntervalIndex.from_arrays([0, 1], [1, 2], closed='both')))
def test_take(self, closed):
    """Check ``take`` with ``range(10)`` and with a repeated position."""
    index = self.create_index(closed=closed)

    tm.assert_index_equal(index.take(range(10)), index)

    taken_repeat = index.take([0, 0, 1])
    with_repeat = IntervalIndex.from_arrays(
        [0, 0, 1], [1, 1, 2], closed=closed)
    tm.assert_index_equal(taken_repeat, with_repeat)
def test_get_reindexer_datetimelike(self, arrays):
    """Check ``_get_reindexer`` against sub-intervals at both ends."""
    # GH 20636
    index = IntervalIndex.from_arrays(*arrays)

    # one 12-hour interval at the start of the first element, and one at
    # the end of the last element
    head = (index[0].left, index[0].left + pd.Timedelta('12H'))
    tail = (index[-1].right - pd.Timedelta('12H'), index[-1].right)
    target = IntervalIndex.from_tuples([head, tail])

    tm.assert_numpy_array_equal(
        index._get_reindexer(target), np.array([0, 3], dtype='intp'))
def test_itemsize(self):
    """Check ``itemsize`` and that reading it emits a FutureWarning."""
    # GH 19209
    lower = np.arange(0, 4, dtype='i8')
    upper = np.arange(1, 5, dtype='i8')

    # construction stays inside the warning check, as only the combined
    # expression is asserted to warn
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        observed = IntervalIndex.from_arrays(lower, upper).itemsize

    # 8 * 2
    assert observed == 16
def test_get_item(self, closed): i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) assert i[0] == Interval(0.0, 1.0, closed=closed) assert i[1] == Interval(1.0, 2.0, closed=closed) assert isna(i[2]) result = i[0:1] expected = IntervalIndex.from_arrays((0.,), (1.,), closed=closed) tm.assert_index_equal(result, expected) result = i[0:2] expected = IntervalIndex.from_arrays((0., 1), (1., 2.), closed=closed) tm.assert_index_equal(result, expected) result = i[1:3] expected = IntervalIndex.from_arrays((1., np.nan), (2., np.nan), closed=closed) tm.assert_index_equal(result, expected)
def test_difference(self, closed, sort):
    """Check ``difference`` for non-empty and empty results."""
    index = IntervalIndex.from_arrays(
        [1, 0, 3, 2], [1, 2, 3, 4], closed=closed)

    remaining = index.difference(index[:1], sort)
    expected_remaining = index[1:]
    if sort:
        expected_remaining = expected_remaining.sort_values()
    tm.assert_index_equal(remaining, expected_remaining)

    # GH 19101: empty result, same dtype
    emptied = index.difference(index, sort)
    expected_empty = IntervalIndex(
        np.array([], dtype='int64'), closed=closed)
    tm.assert_index_equal(emptied, expected_empty)

    # GH 19101: empty result, different dtypes
    float_left = IntervalIndex.from_arrays(
        index.left.astype('float64'), index.right, closed=closed)
    emptied = index.difference(float_left, sort)
    tm.assert_index_equal(emptied, expected_empty)
def test_dropna(self): expected = IntervalIndex.from_tuples([(0.0, 1.0), (1.0, 2.0)]) ii = IntervalIndex.from_tuples([(0, 1), (1, 2), np.nan]) result = ii.dropna() tm.assert_index_equal(result, expected) ii = IntervalIndex.from_arrays([0, 1, np.nan], [1, 2, np.nan]) result = ii.dropna() tm.assert_index_equal(result, expected)
def test_astype(self):
    """Check ``astype`` to object and to interval."""
    cat_index = self.create_index()

    as_object = cat_index.astype(object)
    tm.assert_index_equal(as_object, Index(np.array(cat_index)))

    # this IS equal, but not the same class
    assert as_object.equals(cat_index)
    assert isinstance(as_object, Index)
    assert not isinstance(as_object, CategoricalIndex)

    # interval
    intervals = IntervalIndex.from_arrays(
        left=[-0.001, 2.0], right=[2, 4], closed="right")
    coded = Categorical.from_codes(
        [0, 1, -1], categories=intervals, ordered=True)
    cat_index = CategoricalIndex(coded)

    as_interval = cat_index.astype("interval")
    expected = intervals.take([0, 1, -1])
    tm.assert_index_equal(as_interval, expected)

    rebuilt = IntervalIndex(as_interval.values)
    tm.assert_index_equal(rebuilt, expected)
def chromosome_coverage_read_counts(self, gene_overlap_dat, chrom_gene_df, chrom_exon_df, chrom): """ Determine per-chromosome reads coverage and per-gene read counts from an RNA-seq experiment in a way that properly considers ambiguous reads - if a (paired) read falls entirely within the exonic regions of a *single* gene, only then does read contribute to read count and coverage. The cigar scores from single and paired reads are parsed according to cigar_segment_bounds. 1. Saves compressed coverage array to self.save_dir with file name 'sample_[sample_id]_[chrom].npz' for genes with no overlap with any other gene (a.k.a. "isolated genes") with filename 'chrom_coverage_[sample_id]_[chrom].npz' 2. Saves a dictionary of {gene_name: 1-d numpy gene coverage arrays (concatenated exonic regions)} to a serialized pickle file for all genes that exonic have overlap with other genes (a.k.a. "overlap genes") with filename 'overlap_coverage_[sample_id]_[chrom].pkl' 3. Saves read counts to self.save_dir with filename 'read_counts_[sample_id]_[chrom].csv' NOTE: if the required chromosome coverage files and read count file *already* exist prior to any coverage/read count calculations, Degnorm will default to using those files. This will only happen if a user either moves coverage and read count files from a prior Degnorm pipeline run to the appropriate chromosome directories of the target output directory, or if they re-use a Degnorm pipeline run's output directory. This is *NOT* the same as using a warm-start directory. A warm-start skips coverage/read count calculations entirely, assuming a prior Degnorm run successfully parse all coverage/read counts. :param chrom_gene_df: pandas.DataFrame with `chr`, `gene`, `gene_start`, and `gene_end` columns that delineate the start and end position of a gene's transcript on a chromosome, must be subset to the chromosome in study. 
:param gene_overlap_dat: dictionary with keys 'isolated_genes' and 'overlap_genes' detailing groups of genes that do not overlap with others and then groups of genes that share any overlap. See gene_processing.get_gene_overlap_structure function. :param chrom_exon_df: pandas.DataFrame with `chr`, `gene`, `start`, `end` columns that delineate the start and end positions of exons on a gene. :param chrom: str chromosome name :return: None. Coverage and read count files are written to self.save_dir. """ # First, load this chromosome's reads. if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- begin loading reads from {2}'.format( self.sample_id, chrom, self.filename)) # assess how many genes we have. n_genes = chrom_gene_df.shape[0] # gene_overlap_dat data check: ensure that number isolated genes + number overlapping genes # equals number of genes in genes DataFrame. n_isolated_genes, n_overlap_genes = 0, 0 if gene_overlap_dat['isolated_genes']: n_isolated_genes = len(gene_overlap_dat['isolated_genes']) if gene_overlap_dat['overlap_genes']: n_overlap_genes = np.sum( [len(x) for x in gene_overlap_dat['overlap_genes']]) if n_isolated_genes + n_overlap_genes != n_genes: raise ValueError( 'number of genes contained in gene_overlap_dat does not match that of chrom_gene_df.' ) # create filepaths to non-overlapping read coverage, overlapping read coverage, read count files. chrom_cov_file = os.path.join( self.save_dir, 'chrom_coverage_' + self.sample_id + '_' + str(chrom) + '.npz') ol_cov_file = os.path.join( self.save_dir, 'overlap_coverage_' + self.sample_id + '_' + str(chrom) + '.pkl') count_file = os.path.join( self.save_dir, 'read_counts_' + self.sample_id + '_' + str(chrom) + '.csv') # if all required coverage, read count files are present, e.g. created from a previous run attempt, # then skip all calculations and default to the existing files. Addresses issue #30. 
if ((n_isolated_genes > 0 and os.path.isfile(chrom_cov_file)) or n_isolated_genes == 0) \ and ((n_overlap_genes > 0 and os.path.isfile(ol_cov_file)) or n_overlap_genes == 0) \ and (os.path.isfile(count_file)): if self.verbose: logging.info("""SAMPLE {0}, CHR {1} -- WARNING... All coverage and read count files already present: {0} {1} {2} Defaulting to these files; skipping coverage and read count calculations."""\ .format(chrom_cov_file, ol_cov_file, count_file)) return None # initialize read counts. read_count_dict = {gene: 0 for gene in chrom_gene_df.gene} # set pandas.options.mode.chained_assignment = None to avoid SettingWithCopyWarnings set_option('mode.chained_assignment', None) # ---------------------------------------------------------------------- # # Step 1. Load chromosome's reads and index them. # ---------------------------------------------------------------------- # reads_df = self.load_chromosome_reads(chrom) if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- reads successfully loaded. shape = {2}' .format(self.sample_id, chrom, reads_df.shape)) # append end position to reads based on cigar score. reads_df['end_pos'] = reads_df['pos'] + reads_df['cigar'].apply( lambda x: sum([int(k) for k, v in re.findall(r'(\d+)([A-Z]?)', x)])) # assign row number to read ID column. reads_df['read_id'] = range(reads_df.shape[0]) # easy win: drop reads whose start position is < minimum start position of a gene, # and drop reads whose end position is > maximum start position of a gene min_gene_start, max_gene_end = chrom_gene_df.gene_start.min( ) - 1, chrom_gene_df.gene_end.max() - 1 reads_df = reads_df[(reads_df.pos >= (min_gene_start)) & (reads_df.end_pos <= (max_gene_end))] # If working with paired reads, # ensure that we've sequestered paired reads (eliminate any query names only occurring once). 
if self.paired: qname_counts = reads_df.qname_unpaired.value_counts() paired_occ_reads = qname_counts[qname_counts == 2].index.values.tolist() reads_df = reads_df[reads_df.qname_unpaired.isin(paired_occ_reads)] # ---------------------------------------------------------------------- # # Step 2. Drop reads that don't fully fall within union of all exons. # ---------------------------------------------------------------------- # chrom_len = self.header[self.header.chr == chrom].length.iloc[0] tscript_vec = np.ones( [chrom_len], dtype=int) # large vector, will delete after using. # build binary 0/1 exon/intron indicator vector. # Need to account for exon data being 1-indexed, tscript_vec is 0-indexed, but # exon end positions are inclusive. exon_starts = chrom_exon_df.start.values - 1 exon_ends = chrom_exon_df.end.values for i in range(len(exon_starts)): tscript_vec[exon_starts[i]:exon_ends[i]] = 0 del exon_starts, exon_ends gc.collect() # store read_ids of reads to drop, and initialize dropped read count. drop_reads = list() # store read match region bounds, so that we only parse CIGAR strings once. read_bounds = list() # use values array, faster access. dat = reads_df[['cigar', 'pos', 'read_id']].values # for paired reads, perform special parsing of CIGAR strings to avoid double-counting of overlap regions. if self.paired: for ii in np.arange(1, dat.shape[0], 2): # obtain read region bounds. bounds_1 = cigar_segment_bounds(dat[ii - 1, 0], start=dat[ii - 1, 1]) bounds_2 = cigar_segment_bounds(dat[ii, 0], start=dat[ii, 1]) # leverage nature of alignments of paired reads to find disjoint coverage ranges. min_bounds_1, max_bounds_1 = min(bounds_1), max(bounds_1) min_bounds_2, max_bounds_2 = min(bounds_2), max(bounds_2) if max_bounds_2 >= max_bounds_1: bounds_2 = [ max_bounds_1 + 1 if j <= max_bounds_1 else j for j in bounds_2 ] else: bounds_2 = [ min_bounds_1 - 1 if j >= min_bounds_1 else j for j in bounds_2 ] bounds_2.sort() # aggregate read pair's bounds. 
bounds = bounds_1 + bounds_2 # iterate over match regions. If a single region is not fully contained # within exon regions, drop the pair. drop_read = False for j in np.arange(1, len(bounds), step=2): # check whether matching regions on tscript_vec are fully contained within exonic regions. # note that right-bounds are inclusive. if np.sum( tscript_vec[(bounds[j - 1]):(bounds[j] + 1)]) > 0: drop_read = True # append read id to set of read indices to drop (if appropriate). if drop_read: drop_reads.extend([dat[ii - 1, 2], dat[ii, 2]]) # otherwise, append match region bounds list. Note: endpoints of regions are inclusive. else: read_bounds.append(bounds) # for single-read RNA-Seq experiments, we do not need such special consideration. else: for ii in np.arange(dat.shape[0]): # obtain read regions bounds. bounds = cigar_segment_bounds(dat[ii, 0], start=dat[ii, 1]) # iterate over match regions. If a single region is not fully contained # within exon regions, drop the read. drop_read = False for j in np.arange(1, len(bounds), step=2): if np.sum( tscript_vec[(bounds[j - 1]):(bounds[j] + 1)]) > 0: drop_read = True # append read id to set of read indices to drop (if appropriate). if drop_read: drop_reads.append(dat[ii, 2]) # otherwise, append match region bounds list. Note: endpoints of regions are inclusive. else: read_bounds.append(bounds) # drop reads that don't fully intersect exonic regions. if drop_reads: reads_df = reads_df[~reads_df.read_id.isin(drop_reads)] if self.paired: # if paired reads, don't actually need .1 and .2 constituent reads anymore. # So to save time + memory, take every other read. reads_df = reads_df.iloc[np.arange(1, reads_df.shape[0], step=2)] # add parsed match region bounds to reads! reads_df['bounds'] = read_bounds # delete objs, attempt to save on memory. del tscript_vec, drop_reads, dat, read_bounds gc.collect() # ---------------------------------------------------------------------- # # Step 3. 
Compute coverage, reads across groups of mutually overlapping genes. # (This is costly from a time perspective. Should constitute # coverage, read count calculations for ~ 10-20% of genes.) # ---------------------------------------------------------------------- # # display summary statistics around rate of gene intersection. if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- overlap genes = {2} / {3}.'.format( self.sample_id, chrom, n_overlap_genes, n_genes)) logging.info( 'SAMPLE {0}, CHR {1} -- begin overlap gene group reads processing.' .format(self.sample_id, chrom)) # for genes in a group of overlapping genes, compute read coverage + count. if n_overlap_genes > 0: ol_cov_dict = dict() # iterate over groups of overlapping genes. for ol_genes in gene_overlap_dat['overlap_genes']: ol_gene_df = chrom_gene_df[chrom_gene_df.gene.isin(ol_genes)] ol_gene_group_start = ol_gene_df.gene_start.min() - 1 ol_gene_group_end = ol_gene_df.gene_end.max() - 1 ol_gene_starts = list() gene_exon_bounds = list() transcript_idx = list( ) # list of 1-d np.arrays, each holding one overlapping gene's exon positioning. # obtain exon regions for each gene in overlap group. # Exon starts/ends are 1-indexed, change them to be 0-indexed. for ol_gene in ol_genes: ol_gene_exon_df = chrom_exon_df[chrom_exon_df.gene == ol_gene] # store gene starts for constructing per-gene coverage vectors. # 0-index gene starts/ends. ol_gene_start = ol_gene_exon_df.gene_start.iloc[0] - 1 ol_gene_end = ol_gene_exon_df.gene_end.iloc[0] - 1 ol_gene_starts.append(ol_gene_start) # initialize gene coverage vector for each gene in overlap group. ol_cov_dict[ol_gene] = np.zeros( [ol_gene_end - ol_gene_start + 1], dtype=int) # save gene exon positioning, for determining which reads captured by which genes. # 0-index exon positions, and include gene end positioning. 
e_starts, e_ends = np.sort( ol_gene_exon_df.start.values) - 1, np.sort( ol_gene_exon_df.end.values) gene_exon_bounds += [[ [e_starts[j], e_ends[j]] for j in range(len(e_starts)) ]] # list of list of lists, includes exon end pos. transcript_idx.append( np.unique( fill_in_bounds(flatten_2d(gene_exon_bounds[-1]))) ) # transcript vector is 0-indexed, includes exon end pos. # drop things we don't need any more. del ol_gene_df, ol_gene_exon_df, e_starts, e_ends # storage for reads to drop. drop_reads = list() # subset reads to those that start and end within scope of this bloc of overlapping genes. ol_reads_dat = reads_df[(reads_df.pos >= (ol_gene_group_start)) & (reads_df.end_pos <= (ol_gene_group_end))][[ 'bounds', 'read_id' ]].values # for single-read RNA-Seq experiments, we do not need such special consideration. for i in range(ol_reads_dat.shape[0]): # obtain read regions bounds. read_bounds, read_id = ol_reads_dat[i, :] # find genes that fully include this read. Everything is 0-indexed. caught_genes = self.determine_full_inclusion( read_bounds, gene_exon_bounds=gene_exon_bounds) # Ambiguous read determination logic: # - if paired reads lie fully within 0 or 2+ genes, do not use the reads pair and drop them. # - if read lies fully within a single gene: # - do not drop it. # - if the caught gene is the current gene being analyzed, use the read. O/w do not. n_caught_genes = len(caught_genes) # if only one gene captures read, use the read and identify capturing gene for # incrementing count, but drop it from consideration later (it's been accounted for). # if only full intersection is with with a single gene, increment coverage and read count # for that gene, and drop read. # Note: need to restart coverage calculations relative to gene's start position. 
if n_caught_genes == 1: drop_read = True read_gene = ol_genes[caught_genes[0]] read_gene_start = ol_gene_starts[caught_genes[0]] read_idx = fill_in_bounds( read_bounds, endpoint=True) - read_gene_start - 1 ol_cov_dict[read_gene][read_idx] += 1 read_count_dict[read_gene] += 1 # if no gene fully captures the read, do not use read *but do not drop it*, # for the possibility that some isolated gene captures the read later on. elif n_caught_genes == 0: drop_read = False # if > 1 gene fully captures the read, # do not use read and drop it from consideration. else: drop_read = True # if need be, add read to list of reads to be dropped. if drop_read: drop_reads.append(read_id) # drop ambiguous reads from larger set of chromosome reads, # should speed up gene-read searches in the future. if drop_reads: reads_df = reads_df[~reads_df.read_id.isin(drop_reads)] del drop_reads # pare down coverage vectors for genes in overlap group to their concatenated exon regions. for i in range(len(ol_genes)): ol_gene = ol_genes[i] ol_cov_dict[ol_gene] = ol_cov_dict[ol_gene][ transcript_idx[i] - ol_gene_starts[i]] # ---------------------------------------------------------------------- # # Step 3.5: save overlapping genes' coverage vectors. # overlapping gene coverage vector dict ->> pkl file. # ---------------------------------------------------------------------- # if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- saving overlapping gene coverage vectors.' .format(self.sample_id, chrom)) # dump overlapping genes' coverage matrices. with open(ol_cov_file, 'wb') as f: pkl.dump(ol_cov_dict, f) # free up some memory -- delete groups of intersecting genes, etc. del ol_reads_dat, ol_cov_dict, transcript_idx, gene_exon_bounds gc.collect() if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- overlapping gene reads processing successful.' .format(self.sample_id, chrom)) # ---------------------------------------------------------------------- # # Step 4. 
Compute coverage, reads for individual isolated genes. # ---------------------------------------------------------------------- # if n_isolated_genes > 0: if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- begin isolated gene reads processing.' .format(self.sample_id, chrom)) # reduce chrom_gene_df to remaining genes chrom_gene_df = chrom_gene_df[chrom_gene_df.gene.isin( gene_overlap_dat['isolated_genes'])] # run same inclusion/exclusion transcript test but on the isolated genes. tscript_vec = np.ones([chrom_len], dtype=int) # identify regions of chromosome covered by isolated genes. # change gene starts/ends to 0-indexed to match 0-indexed tscript_vec array, but # gene ends are inclusive. gene_starts = chrom_gene_df.gene_start.values - 1 gene_ends = chrom_gene_df.gene_end.values for i in range(len(gene_starts)): tscript_vec[gene_starts[i]:gene_ends[i]] = 0 # identify reads that do not fall within an isolated gene's (start, end). drop_reads = list() dat = reads_df[['pos', 'end_pos', 'read_id']].values for i in range(dat.shape[0]): read_start, read_end, read_id = dat[i, :] # remember to include read end position. reads are 0-indexed. if np.sum(tscript_vec[read_start:(read_end + 1)]) > 0: drop_reads.append(read_id) # drop memory hogs. del dat, gene_starts, gene_ends, tscript_vec # drop reads that do not lie completely within area covered by isolated genes. if drop_reads: reads_df = reads_df[~reads_df.read_id.isin(drop_reads)] del drop_reads gc.collect() # (a precaution) only continue if we have any reads intersecting isolated genes. if not reads_df.empty: # initialize chromosome coverage array. cov_vec = np.zeros([chrom_len], dtype=int) # ---------------------------------------------------------------------- # # Step 4.5.1: join genes on reads data # so that each read is tied to a gene, for read counting purposes. # ---------------------------------------------------------------------- # # 0-index gene_starts, gene_ends because reads are 0-indexed. 
chrom_gene_df.loc[:, ['gene_start', 'gene_end']] -= 1 # add IntervalIndex index to chromosome gene data. chrom_gene_df.index = IntervalIndex.from_arrays( chrom_gene_df.gene_start, right=chrom_gene_df.gene_end, closed='both') try: reads_df['gene'] = chrom_gene_df.loc[ reads_df.pos].gene.values # if there remains at least one read that doesn't land within a gene span, # try another sweep to remove reads not within gene regions. except KeyError: # outline valid read start positions along transcript. tscript_vec = np.ones([chrom_len], dtype=int) for i in range(chrom_gene_df.shape[0]): left = chrom_gene_df.index[i].left right = chrom_gene_df.index[i].right + 1 tscript_vec[left:right] = 0 # iterate over reads, checking whether read start position falls within # a [gene_start, gene_end] region. drop_reads = list() for i in range(reads_df.shape[0]): if tscript_vec[reads_df.pos.iloc[i]] != 0: drop_reads.append(reads_df.read_id.iloc[i]) # drop reads that do not start within valid [gene_start, gene_end] regions. if drop_reads: reads_df = reads_df[~reads_df.read_id.isin(drop_reads)] del tscript_vec, drop_reads gc.collect() # subset reads to reads w/ valid read ID, then join with interval index again. reads_df['gene'] = chrom_gene_df.loc[ reads_df.pos].gene.values # loop over reads for isolated genes, incrementing read count and coverage. dat = reads_df[['bounds', 'gene']].values for i in range(dat.shape[0]): bounds, gene = dat[i, :] # reads are already 0-indexed. read_idx = fill_in_bounds(bounds, endpoint=True) # increment coverage and read count. cov_vec[read_idx] += 1 read_count_dict[gene] += 1 # ---------------------------------------------------------------------- # # Step 4.5.2: save chromosome coverage vector. # chromosome overage vector ->> compressed csr numpy array # ---------------------------------------------------------------------- # if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- saving csr-compressed chrom coverage array.' 
.format(self.sample_id, chrom)) # save coverage vector as a compressed-sparse row matrix. sparse.save_npz(chrom_cov_file, matrix=sparse.csr_matrix(cov_vec)) # drop large data objects. del cov_vec, dat, reads_df # drop remaining large data data objects. del chrom_gene_df, chrom_exon_df gc.collect() if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- isolated gene reads processing successful.' .format(self.sample_id, chrom)) # ---------------------------------------------------------------------- # # Step 5. Save read counts. # chromosome read counts ->> .csv file # ---------------------------------------------------------------------- # # construct read count DataFrame from read count dictionary. read_count_df = DataFrame({ 'gene': list(read_count_dict.keys()), self.sample_id: list(read_count_dict.values()) }) del read_count_dict gc.collect() if self.verbose: logging.info( 'SAMPLE {0}, CHR {1} -- mean per-gene read count: {2:.4}'. format(self.sample_id, chrom, read_count_df[self.sample_id].mean())) logging.info('SAMPLE {0}, CHR {1} -- saving read counts.'.format( self.sample_id, chrom)) # save sample's chromosome read counts to .csv for joining later. read_count_df.to_csv(count_file, index=False)
def create_series_categorical_intervals(left, right, closed="right"):
    """Wrap interval bounds in a categorical ``Series``.

    An ``IntervalIndex`` is built from the ``left``/``right`` bounds with the
    requested ``closed`` side, converted to a ``Categorical``, and returned as
    a ``Series``.
    """
    intervals = IntervalIndex.from_arrays(left, right, closed)
    categorical_intervals = Categorical(intervals)
    return Series(categorical_intervals)
def test_dir():
    """``dir()`` on an IntervalIndex must not raise and must not list ``str``."""
    # GH#27571 dir(interval_index) should not raise
    attribute_names = dir(IntervalIndex.from_arrays([0, 1], [1, 2]))
    assert "str" not in attribute_names
class TestIntervalIndex:
    """Behavioural tests for ``IntervalIndex``.

    Many tests take ``closed`` (and one takes ``name``) as an argument without
    parametrizing it here; presumably these are pytest fixtures supplied by a
    conftest outside this block -- TODO confirm.
    """

    # Shared two-interval index; read through ``self.index`` in test_comparison.
    index = IntervalIndex.from_arrays([0, 1], [1, 2])

    def create_index(self, closed="right"):
        """Return the ten-interval index built from the breaks 0..10."""
        return IntervalIndex.from_breaks(range(11), closed=closed)

    def create_index_with_nan(self, closed="right"):
        """Return a ten-element index whose second entry has NaN bounds."""
        # mask is False only at position 1, so only that entry becomes NaN.
        mask = [True, False] + [True] * 8
        return IntervalIndex.from_arrays(
            np.where(mask, np.arange(10), np.nan),
            np.where(mask, np.arange(1, 11), np.nan),
            closed=closed,
        )

    def test_properties(self, closed):
        """Check len/size/shape, left/right/mid, closed and array conversion."""
        index = self.create_index(closed=closed)
        assert len(index) == 10
        assert index.size == 10
        assert index.shape == (10,)

        tm.assert_index_equal(index.left, Index(np.arange(10)))
        tm.assert_index_equal(index.right, Index(np.arange(1, 11)))
        tm.assert_index_equal(index.mid, Index(np.arange(0.5, 10.5)))

        assert index.closed == closed

        ivs = [Interval(l, r, closed) for l, r in zip(range(10), range(1, 11))]
        expected = np.array(ivs, dtype=object)
        tm.assert_numpy_array_equal(np.asarray(index), expected)

        # with nans
        index = self.create_index_with_nan(closed=closed)
        assert len(index) == 10
        assert index.size == 10
        assert index.shape == (10,)

        expected_left = Index([0, np.nan, 2, 3, 4, 5, 6, 7, 8, 9])
        expected_right = expected_left + 1
        expected_mid = expected_left + 0.5
        tm.assert_index_equal(index.left, expected_left)
        tm.assert_index_equal(index.right, expected_right)
        tm.assert_index_equal(index.mid, expected_mid)

        assert index.closed == closed

        # a NaN left bound is expected to surface as a plain NaN element
        ivs = [
            Interval(l, r, closed) if notna(l) else np.nan
            for l, r in zip(expected_left, expected_right)
        ]
        expected = np.array(ivs, dtype=object)
        tm.assert_numpy_array_equal(np.asarray(index), expected)

    @pytest.mark.parametrize(
        "breaks",
        [
            [1, 1, 2, 5, 15, 53, 217, 1014, 5335, 31240, 201608],
            [-np.inf, -100, -10, 0.5, 1, 1.5, 3.8, 101, 202, np.inf],
            pd.to_datetime(["20170101", "20170202", "20170303", "20170404"]),
            pd.to_timedelta(["1ns", "2ms", "3s", "4M", "5H", "6D"]),
        ],
    )
    def test_length(self, closed, breaks):
        """Check ``.length`` against the per-interval lengths, with and without NA."""
        # GH 18789
        index = IntervalIndex.from_breaks(breaks, closed=closed)
        result = index.length
        expected = Index(iv.length for iv in index)
        tm.assert_index_equal(result, expected)

        # with NA
        index = index.insert(1, np.nan)
        result = index.length
        expected = Index(iv.length if notna(iv) else iv for iv in index)
        tm.assert_index_equal(result, expected)

    def test_with_nans(self, closed):
        """Check ``hasnans``, ``isna`` and ``notna`` with and without a NaN entry."""
        index = self.create_index(closed=closed)
        assert index.hasnans is False

        result = index.isna()
        expected = np.zeros(len(index), dtype=bool)
        tm.assert_numpy_array_equal(result, expected)

        result = index.notna()
        expected = np.ones(len(index), dtype=bool)
        tm.assert_numpy_array_equal(result, expected)

        index = self.create_index_with_nan(closed=closed)
        assert index.hasnans is True

        result = index.isna()
        expected = np.array([False, True] + [False] * (len(index) - 2))
        tm.assert_numpy_array_equal(result, expected)

        result = index.notna()
        expected = np.array([True, False] + [True] * (len(index) - 2))
        tm.assert_numpy_array_equal(result, expected)

    def test_copy(self, closed):
        """Check that ``copy`` yields an equal index and a deep copy gets a new ``left``."""
        expected = self.create_index(closed=closed)

        result = expected.copy()
        assert result.equals(expected)

        result = expected.copy(deep=True)
        assert result.equals(expected)
        assert result.left is not expected.left

    def test_ensure_copied_data(self, closed):
        """Check when the constructor shares versus copies the underlying data."""
        # exercise the copy flag in the constructor

        # not copying
        index = self.create_index(closed=closed)
        result = IntervalIndex(index, copy=False)
        tm.assert_numpy_array_equal(
            index.left.values, result.left.values, check_same="same"
        )
        tm.assert_numpy_array_equal(
            index.right.values, result.right.values, check_same="same"
        )

        # by-definition make a copy
        result = IntervalIndex(np.array(index), copy=False)
        tm.assert_numpy_array_equal(
            index.left.values, result.left.values, check_same="copy"
        )
        tm.assert_numpy_array_equal(
            index.right.values, result.right.values, check_same="copy"
        )

    def test_delete(self, closed):
        """Check that deleting position 0 leaves the intervals built from breaks 1..10."""
        expected = IntervalIndex.from_breaks(np.arange(1, 11), closed=closed)
        result = self.create_index(closed=closed).delete(0)
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize(
        "data",
        [
            interval_range(0, periods=10, closed="neither"),
            interval_range(1.7, periods=8, freq=2.5, closed="both"),
            interval_range(Timestamp("20170101"), periods=12, closed="left"),
            interval_range(Timedelta("1 day"), periods=6, closed="right"),
        ],
    )
    def test_insert(self, data):
        """Check ``insert`` at start/end/middle, its error cases, and NA insertion."""
        item = data[0]
        idx_item = IntervalIndex([item])

        # start
        expected = idx_item.append(data)
        result = data.insert(0, item)
        tm.assert_index_equal(result, expected)

        # end
        expected = data.append(idx_item)
        result = data.insert(len(data), item)
        tm.assert_index_equal(result, expected)

        # mid
        expected = data[:3].append(idx_item).append(data[3:])
        result = data.insert(3, item)
        tm.assert_index_equal(result, expected)

        # invalid type
        msg = "can only insert Interval objects and NA into an IntervalIndex"
        with pytest.raises(ValueError, match=msg):
            data.insert(1, "foo")

        # invalid closed
        msg = "inserted item must be closed on the same side as the index"
        for closed in {"left", "right", "both", "neither"} - {item.closed}:
            with pytest.raises(ValueError, match=msg):
                bad_item = Interval(item.left, item.right, closed=closed)
                data.insert(1, bad_item)

        # GH 18295 (test missing)
        na_idx = IntervalIndex([np.nan], closed=data.closed)
        for na in [np.nan, None, pd.NA]:
            expected = data[:1].append(na_idx).append(data[1:])
            result = data.insert(1, na)
            tm.assert_index_equal(result, expected)

        if data.left.dtype.kind not in ["m", "M"]:
            # trying to insert pd.NaT into a numeric-dtyped Index should cast/raise
            msg = "can only insert Interval objects and NA into an IntervalIndex"
            with pytest.raises(ValueError, match=msg):
                result = data.insert(1, pd.NaT)
        else:
            # datetime-like bounds: NaT is compared against the ``expected``
            # left over from the last iteration of the NA loop above.
            result = data.insert(1, pd.NaT)
            tm.assert_index_equal(result, expected)

    def test_is_unique_interval(self, closed):
        """
        Interval specific tests for is_unique in addition to base class tests
        """
        # unique overlapping - distinct endpoints
        idx = IntervalIndex.from_tuples([(0, 1), (0.5, 1.5)], closed=closed)
        assert idx.is_unique is True

        # unique overlapping - shared endpoints
        idx = pd.IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)], closed=closed)
        assert idx.is_unique is True

        # unique nested
        idx = IntervalIndex.from_tuples([(-1, 1), (-2, 2)], closed=closed)
        assert idx.is_unique is True

    def test_monotonic(self, closed):
        """Check the four monotonicity flags across ordered, unordered and edge cases."""
        # increasing non-overlapping
        idx = IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)], closed=closed)
        assert idx.is_monotonic is True
        assert idx._is_strictly_monotonic_increasing is True
        assert idx.is_monotonic_decreasing is False
        assert idx._is_strictly_monotonic_decreasing is False

        # decreasing non-overlapping
        idx = IntervalIndex.from_tuples([(4, 5), (2, 3), (1, 2)], closed=closed)
        assert idx.is_monotonic is False
        assert idx._is_strictly_monotonic_increasing is False
        assert idx.is_monotonic_decreasing is True
        assert idx._is_strictly_monotonic_decreasing is True

        # unordered non-overlapping
        idx = IntervalIndex.from_tuples([(0, 1), (4, 5), (2, 3)], closed=closed)
        assert idx.is_monotonic is False
        assert idx._is_strictly_monotonic_increasing is False
        assert idx.is_monotonic_decreasing is False
        assert idx._is_strictly_monotonic_decreasing is False

        # increasing overlapping
        idx = IntervalIndex.from_tuples([(0, 2), (0.5, 2.5), (1, 3)], closed=closed)
        assert idx.is_monotonic is True
        assert idx._is_strictly_monotonic_increasing is True
        assert idx.is_monotonic_decreasing is False
        assert idx._is_strictly_monotonic_decreasing is False

        # decreasing overlapping
        idx = IntervalIndex.from_tuples([(1, 3), (0.5, 2.5), (0, 2)], closed=closed)
        assert idx.is_monotonic is False
        assert idx._is_strictly_monotonic_increasing is False
        assert idx.is_monotonic_decreasing is True
        assert idx._is_strictly_monotonic_decreasing is True

        # unordered overlapping
        idx = IntervalIndex.from_tuples([(0.5, 2.5), (0, 2), (1, 3)], closed=closed)
        assert idx.is_monotonic is False
        assert idx._is_strictly_monotonic_increasing is False
        assert idx.is_monotonic_decreasing is False
        assert idx._is_strictly_monotonic_decreasing is False

        # increasing overlapping shared endpoints
        idx = pd.IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)], closed=closed)
        assert idx.is_monotonic is True
        assert idx._is_strictly_monotonic_increasing is True
        assert idx.is_monotonic_decreasing is False
        assert idx._is_strictly_monotonic_decreasing is False

        # decreasing overlapping shared endpoints
        idx = pd.IntervalIndex.from_tuples([(2, 3), (1, 3), (1, 2)], closed=closed)
        assert idx.is_monotonic is False
        assert idx._is_strictly_monotonic_increasing is False
        assert idx.is_monotonic_decreasing is True
        assert idx._is_strictly_monotonic_decreasing is True

        # stationary
        idx = IntervalIndex.from_tuples([(0, 1), (0, 1)], closed=closed)
        assert idx.is_monotonic is True
        assert idx._is_strictly_monotonic_increasing is False
        assert idx.is_monotonic_decreasing is True
        assert idx._is_strictly_monotonic_decreasing is False

        # empty
        idx = IntervalIndex([], closed=closed)
        assert idx.is_monotonic is True
        assert idx._is_strictly_monotonic_increasing is True
        assert idx.is_monotonic_decreasing is True
        assert idx._is_strictly_monotonic_decreasing is True

    def test_get_item(self, closed):
        """Check scalar and slice ``__getitem__`` on an index ending in a NaN entry."""
        i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed)
        assert i[0] == Interval(0.0, 1.0, closed=closed)
        assert i[1] == Interval(1.0, 2.0, closed=closed)
        assert isna(i[2])

        result = i[0:1]
        expected = IntervalIndex.from_arrays((0.0,), (1.0,), closed=closed)
        tm.assert_index_equal(result, expected)

        result = i[0:2]
        expected = IntervalIndex.from_arrays((0.0, 1), (1.0, 2.0), closed=closed)
        tm.assert_index_equal(result, expected)

        result = i[1:3]
        expected = IntervalIndex.from_arrays(
            (1.0, np.nan), (2.0, np.nan), closed=closed
        )
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize(
        "breaks",
        [
            date_range("20180101", periods=4),
            date_range("20180101", periods=4, tz="US/Eastern"),
            timedelta_range("0 days", periods=4),
        ],
        ids=lambda x: str(x.dtype),
    )
    def test_maybe_convert_i8(self, breaks):
        """Check ``_maybe_convert_i8`` maps datetime-like keys to their i8 values."""
        # GH 20636
        index = IntervalIndex.from_breaks(breaks)

        # intervalindex
        result = index._maybe_convert_i8(index)
        expected = IntervalIndex.from_breaks(breaks.asi8)
        tm.assert_index_equal(result, expected)

        # interval
        interval = Interval(breaks[0], breaks[1])
        result = index._maybe_convert_i8(interval)
        expected = Interval(breaks[0].value, breaks[1].value)
        assert result == expected

        # datetimelike index
        result = index._maybe_convert_i8(breaks)
        expected = Index(breaks.asi8)
        tm.assert_index_equal(result, expected)

        # datetimelike scalar
        result = index._maybe_convert_i8(breaks[0])
        expected = breaks[0].value
        assert result == expected

        # list-like of datetimelike scalars
        result = index._maybe_convert_i8(list(breaks))
        expected = Index(breaks.asi8)
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize(
        "breaks",
        [date_range("2018-01-01", periods=5), timedelta_range("0 days", periods=5)],
    )
    def test_maybe_convert_i8_nat(self, breaks):
        """Check ``_maybe_convert_i8`` returns a float index with NaN for NaT keys."""
        # GH 20636
        index = IntervalIndex.from_breaks(breaks)

        to_convert = breaks._constructor([pd.NaT] * 3)
        expected = pd.Float64Index([np.nan] * 3)
        result = index._maybe_convert_i8(to_convert)
        tm.assert_index_equal(result, expected)

        to_convert = to_convert.insert(0, breaks[0])
        expected = expected.insert(0, float(breaks[0].value))
        result = index._maybe_convert_i8(to_convert)
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize(
        "breaks",
        [np.arange(5, dtype="int64"), np.arange(5, dtype="float64")],
        ids=lambda x: str(x.dtype),
    )
    @pytest.mark.parametrize(
        "make_key",
        [
            IntervalIndex.from_breaks,
            lambda breaks: Interval(breaks[0], breaks[1]),
            lambda breaks: breaks,
            lambda breaks: breaks[0],
            list,
        ],
        ids=["IntervalIndex", "Interval", "Index", "scalar", "list"],
    )
    def test_maybe_convert_i8_numeric(self, breaks, make_key):
        """Check ``_maybe_convert_i8`` returns numeric keys unchanged (same object)."""
        # GH 20636
        index = IntervalIndex.from_breaks(breaks)
        key = make_key(breaks)

        # no conversion occurs for numeric
        result = index._maybe_convert_i8(key)
        assert result is key

    @pytest.mark.parametrize(
        "breaks1, breaks2",
        permutations(
            [
                date_range("20180101", periods=4),
                date_range("20180101", periods=4, tz="US/Eastern"),
                timedelta_range("0 days", periods=4),
            ],
            2,
        ),
        ids=lambda x: str(x.dtype),
    )
    @pytest.mark.parametrize(
        "make_key",
        [
            IntervalIndex.from_breaks,
            lambda breaks: Interval(breaks[0], breaks[1]),
            lambda breaks: breaks,
            lambda breaks: breaks[0],
            list,
        ],
        ids=["IntervalIndex", "Interval", "Index", "scalar", "list"],
    )
    def test_maybe_convert_i8_errors(self, breaks1, breaks2, make_key):
        """Check ``_maybe_convert_i8`` raises ValueError for mismatched dtypes."""
        # GH 20636
        index = IntervalIndex.from_breaks(breaks1)
        key = make_key(breaks2)

        msg = (
            f"Cannot index an IntervalIndex of subtype {breaks1.dtype} with "
            f"values of dtype {breaks2.dtype}"
        )
        # dtype reprs can contain regex metacharacters, so escape before matching
        msg = re.escape(msg)
        with pytest.raises(ValueError, match=msg):
            index._maybe_convert_i8(key)

    def test_contains_method(self):
        """Check the element-wise ``contains`` method for scalar points."""
        # can select values that are IN the range of a value
        i = IntervalIndex.from_arrays([0, 1], [1, 2])

        expected = np.array([False, False], dtype="bool")
        actual = i.contains(0)
        tm.assert_numpy_array_equal(actual, expected)
        actual = i.contains(3)
        tm.assert_numpy_array_equal(actual, expected)

        expected = np.array([True, False], dtype="bool")
        actual = i.contains(0.5)
        tm.assert_numpy_array_equal(actual, expected)
        actual = i.contains(1)
        tm.assert_numpy_array_equal(actual, expected)

        # __contains__ not implemented for "interval in interval", follow
        # that for the contains method for now
        with pytest.raises(
            NotImplementedError, match="contains not implemented for two"
        ):
            i.contains(Interval(0, 1))

    def test_contains_dunder(self):
        """Check that ``in`` matches only an identical Interval, never a point."""
        index = IntervalIndex.from_arrays([0, 1], [1, 2], closed="right")

        # __contains__ requires perfect matches to intervals.
        assert 0 not in index
        assert 1 not in index
        assert 2 not in index

        assert Interval(0, 1, closed="right") in index
        assert Interval(0, 2, closed="right") not in index
        assert Interval(0, 0.5, closed="right") not in index
        assert Interval(3, 5, closed="right") not in index
        assert Interval(-1, 0, closed="left") not in index
        assert Interval(0, 1, closed="left") not in index
        assert Interval(0, 1, closed="both") not in index

    def test_dropna(self, closed):
        """Check that ``dropna`` removes the NaN entry for both construction routes."""
        expected = IntervalIndex.from_tuples([(0.0, 1.0), (1.0, 2.0)], closed=closed)

        ii = IntervalIndex.from_tuples([(0, 1), (1, 2), np.nan], closed=closed)
        result = ii.dropna()
        tm.assert_index_equal(result, expected)

        ii = IntervalIndex.from_arrays([0, 1, np.nan], [1, 2, np.nan], closed=closed)
        result = ii.dropna()
        tm.assert_index_equal(result, expected)

    def test_non_contiguous(self, closed):
        """Check ``get_indexer`` returns -1 for a point in the gap between intervals."""
        index = IntervalIndex.from_tuples([(0, 1), (2, 3)], closed=closed)
        target = [0.5, 1.5, 2.5]
        actual = index.get_indexer(target)
        expected = np.array([0, -1, 1], dtype="intp")
        tm.assert_numpy_array_equal(actual, expected)

        assert 1.5 not in index

    def test_isin(self, closed):
        """Check ``isin`` against index and list inputs, including other ``closed`` sides."""
        index = self.create_index(closed=closed)

        expected = np.array([True] + [False] * (len(index) - 1))
        result = index.isin(index[:1])
        tm.assert_numpy_array_equal(result, expected)

        result = index.isin([index[0]])
        tm.assert_numpy_array_equal(result, expected)

        other = IntervalIndex.from_breaks(np.arange(-2, 10), closed=closed)
        expected = np.array([True] * (len(index) - 1) + [False])
        result = index.isin(other)
        tm.assert_numpy_array_equal(result, expected)

        result = index.isin(other.tolist())
        tm.assert_numpy_array_equal(result, expected)

        for other_closed in {"right", "left", "both", "neither"}:
            other = self.create_index(closed=other_closed)
            # all True when the closed sides agree, all False otherwise
            expected = np.repeat(closed == other_closed, len(index))
            result = index.isin(other)
            tm.assert_numpy_array_equal(result, expected)

            result = index.isin(other.tolist())
            tm.assert_numpy_array_equal(result, expected)

    def test_comparison(self):
        """Check element-wise comparisons of ``self.index`` and the invalid cases."""
        actual = Interval(0, 1) < self.index
        expected = np.array([False, True])
        tm.assert_numpy_array_equal(actual, expected)

        actual = Interval(0.5, 1.5) < self.index
        expected = np.array([False, True])
        tm.assert_numpy_array_equal(actual, expected)
        actual = self.index > Interval(0.5, 1.5)
        tm.assert_numpy_array_equal(actual, expected)

        actual = self.index == self.index
        expected = np.array([True, True])
        tm.assert_numpy_array_equal(actual, expected)
        actual = self.index <= self.index
        tm.assert_numpy_array_equal(actual, expected)
        actual = self.index >= self.index
        tm.assert_numpy_array_equal(actual, expected)

        actual = self.index < self.index
        expected = np.array([False, False])
        tm.assert_numpy_array_equal(actual, expected)
        actual = self.index > self.index
        tm.assert_numpy_array_equal(actual, expected)

        actual = self.index == IntervalIndex.from_breaks([0, 1, 2], "left")
        tm.assert_numpy_array_equal(actual, expected)

        actual = self.index == self.index.values
        tm.assert_numpy_array_equal(actual, np.array([True, True]))
        actual = self.index.values == self.index
        tm.assert_numpy_array_equal(actual, np.array([True, True]))
        actual = self.index <= self.index.values
        tm.assert_numpy_array_equal(actual, np.array([True, True]))
        actual = self.index != self.index.values
        tm.assert_numpy_array_equal(actual, np.array([False, False]))
        actual = self.index > self.index.values
        tm.assert_numpy_array_equal(actual, np.array([False, False]))
        actual = self.index.values > self.index
        tm.assert_numpy_array_equal(actual, np.array([False, False]))

        # invalid comparisons
        actual = self.index == 0
        tm.assert_numpy_array_equal(actual, np.array([False, False]))
        actual = self.index == self.index.left
        tm.assert_numpy_array_equal(actual, np.array([False, False]))

        msg = (
            "not supported between instances of 'int' and "
            "'pandas._libs.interval.Interval'"
        )
        with pytest.raises(TypeError, match=msg):
            self.index > 0
        with pytest.raises(TypeError, match=msg):
            self.index <= 0
        with pytest.raises(TypeError, match=msg):
            self.index > np.arange(2)

        msg = "Lengths must match to compare"
        with pytest.raises(ValueError, match=msg):
            self.index > np.arange(3)

    def test_missing_values(self, closed):
        """Check NaN handling in construction and that one-sided NaNs raise."""
        idx = Index(
            [np.nan, Interval(0, 1, closed=closed), Interval(1, 2, closed=closed)]
        )
        idx2 = IntervalIndex.from_arrays([np.nan, 0, 1], [np.nan, 1, 2], closed=closed)
        assert idx.equals(idx2)

        msg = (
            "missing values must be missing in the same location both left "
            "and right sides"
        )
        with pytest.raises(ValueError, match=msg):
            IntervalIndex.from_arrays(
                [np.nan, 0, 1], np.array([0, 1, 2]), closed=closed
            )

        tm.assert_numpy_array_equal(isna(idx), np.array([True, False, False]))

    def test_sort_values(self, closed):
        """Check ``sort_values`` ordering, including placement of a NaN entry."""
        index = self.create_index(closed=closed)

        result = index.sort_values()
        tm.assert_index_equal(result, index)

        result = index.sort_values(ascending=False)
        tm.assert_index_equal(result, index[::-1])

        # with nan
        index = IntervalIndex([Interval(1, 2), np.nan, Interval(0, 1)])

        result = index.sort_values()
        expected = IntervalIndex([Interval(0, 1), Interval(1, 2), np.nan])
        tm.assert_index_equal(result, expected)

        result = index.sort_values(ascending=False, na_position="first")
        expected = IntervalIndex([np.nan, Interval(1, 2), Interval(0, 1)])
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize("tz", [None, "US/Eastern"])
    def test_datetime(self, tz):
        """Check mid, membership, ``contains`` and ``get_indexer`` on datetime breaks."""
        start = Timestamp("2000-01-01", tz=tz)
        dates = date_range(start=start, periods=10)
        index = IntervalIndex.from_breaks(dates)

        # test mid
        start = Timestamp("2000-01-01T12:00", tz=tz)
        expected = date_range(start=start, periods=9)
        tm.assert_index_equal(index.mid, expected)

        # __contains__ doesn't check individual points
        assert Timestamp("2000-01-01", tz=tz) not in index
        assert Timestamp("2000-01-01T12", tz=tz) not in index
        assert Timestamp("2000-01-02", tz=tz) not in index
        iv_true = Interval(
            Timestamp("2000-01-02", tz=tz), Timestamp("2000-01-03", tz=tz)
        )
        iv_false = Interval(
            Timestamp("1999-12-31", tz=tz), Timestamp("2000-01-01", tz=tz)
        )
        assert iv_true in index
        assert iv_false not in index

        # .contains does check individual points
        assert not index.contains(Timestamp("2000-01-01", tz=tz)).any()
        assert index.contains(Timestamp("2000-01-01T12", tz=tz)).any()
        assert index.contains(Timestamp("2000-01-02", tz=tz)).any()

        # test get_indexer
        start = Timestamp("1999-12-31T12:00", tz=tz)
        target = date_range(start=start, periods=7, freq="12H")
        actual = index.get_indexer(target)
        expected = np.array([-1, -1, 0, 0, 1, 1, 2], dtype="intp")
        tm.assert_numpy_array_equal(actual, expected)

        start = Timestamp("2000-01-08T18:00", tz=tz)
        target = date_range(start=start, periods=7, freq="6H")
        actual = index.get_indexer(target)
        expected = np.array([7, 7, 8, 8, 8, 8, -1], dtype="intp")
        tm.assert_numpy_array_equal(actual, expected)

    def test_append(self, closed):
        """Check ``append`` with one index, a list of indexes, and mismatched ``closed``."""
        index1 = IntervalIndex.from_arrays([0, 1], [1, 2], closed=closed)
        index2 = IntervalIndex.from_arrays([1, 2], [2, 3], closed=closed)

        result = index1.append(index2)
        expected = IntervalIndex.from_arrays([0, 1, 1, 2], [1, 2, 2, 3], closed=closed)
        tm.assert_index_equal(result, expected)

        result = index1.append([index1, index2])
        expected = IntervalIndex.from_arrays(
            [0, 1, 0, 1, 1, 2], [1, 2, 1, 2, 2, 3], closed=closed
        )
        tm.assert_index_equal(result, expected)

        msg = "Intervals must all be closed on the same side"
        for other_closed in {"left", "right", "both", "neither"} - {closed}:
            index_other_closed = IntervalIndex.from_arrays(
                [0, 1], [1, 2], closed=other_closed
            )
            with pytest.raises(ValueError, match=msg):
                index1.append(index_other_closed)

    def test_is_non_overlapping_monotonic(self, closed):
        """Check ``is_non_overlapping_monotonic`` in both orders and at shared endpoints."""
        # Should be True in all cases
        tpls = [(0, 1), (2, 3), (4, 5), (6, 7)]
        idx = IntervalIndex.from_tuples(tpls, closed=closed)
        assert idx.is_non_overlapping_monotonic is True

        idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed)
        assert idx.is_non_overlapping_monotonic is True

        # Should be False in all cases (overlapping)
        tpls = [(0, 2), (1, 3), (4, 5), (6, 7)]
        idx = IntervalIndex.from_tuples(tpls, closed=closed)
        assert idx.is_non_overlapping_monotonic is False

        idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed)
        assert idx.is_non_overlapping_monotonic is False

        # Should be False in all cases (non-monotonic)
        tpls = [(0, 1), (2, 3), (6, 7), (4, 5)]
        idx = IntervalIndex.from_tuples(tpls, closed=closed)
        assert idx.is_non_overlapping_monotonic is False

        idx = IntervalIndex.from_tuples(tpls[::-1], closed=closed)
        assert idx.is_non_overlapping_monotonic is False

        # Should be False for closed='both', otherwise True (GH16560)
        if closed == "both":
            idx = IntervalIndex.from_breaks(range(4), closed=closed)
            assert idx.is_non_overlapping_monotonic is False
        else:
            idx = IntervalIndex.from_breaks(range(4), closed=closed)
            assert idx.is_non_overlapping_monotonic is True

    @pytest.mark.parametrize(
        "start, shift, na_value",
        [
            (0, 1, np.nan),
            (Timestamp("2018-01-01"), Timedelta("1 day"), pd.NaT),
            (Timedelta("0 days"), Timedelta("1 day"), pd.NaT),
        ],
    )
    def test_is_overlapping(self, start, shift, na_value, closed):
        """Check ``is_overlapping`` for disjoint, overlapping and touching intervals."""
        # GH 23309
        # see test_interval_tree.py for extensive tests; interface tests here

        # non-overlapping
        tuples = [(start + n * shift, start + (n + 1) * shift) for n in (0, 2, 4)]
        index = IntervalIndex.from_tuples(tuples, closed=closed)
        assert index.is_overlapping is False

        # non-overlapping with NA
        tuples = [(na_value, na_value)] + tuples + [(na_value, na_value)]
        index = IntervalIndex.from_tuples(tuples, closed=closed)
        assert index.is_overlapping is False

        # overlapping
        tuples = [(start + n * shift, start + (n + 2) * shift) for n in range(3)]
        index = IntervalIndex.from_tuples(tuples, closed=closed)
        assert index.is_overlapping is True

        # overlapping with NA
        tuples = [(na_value, na_value)] + tuples + [(na_value, na_value)]
        index = IntervalIndex.from_tuples(tuples, closed=closed)
        assert index.is_overlapping is True

        # common endpoints
        tuples = [(start + n * shift, start + (n + 1) * shift) for n in range(3)]
        index = IntervalIndex.from_tuples(tuples, closed=closed)
        result = index.is_overlapping
        # touching intervals are expected to overlap only when closed="both"
        expected = closed == "both"
        assert result is expected

        # common endpoints with NA
        tuples = [(na_value, na_value)] + tuples + [(na_value, na_value)]
        index = IntervalIndex.from_tuples(tuples, closed=closed)
        result = index.is_overlapping
        assert result is expected

    @pytest.mark.parametrize(
        "tuples",
        [
            list(zip(range(10), range(1, 11))),
            list(
                zip(
                    date_range("20170101", periods=10),
                    date_range("20170101", periods=10),
                )
            ),
            list(
                zip(
                    timedelta_range("0 days", periods=10),
                    timedelta_range("1 day", periods=10),
                )
            ),
        ],
    )
    def test_to_tuples(self, tuples):
        """Check that ``to_tuples`` round-trips the tuples used for construction."""
        # GH 18756
        idx = IntervalIndex.from_tuples(tuples)
        result = idx.to_tuples()
        expected = Index(com.asarray_tuplesafe(tuples))
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize(
        "tuples",
        [
            list(zip(range(10), range(1, 11))) + [np.nan],
            list(
                zip(
                    date_range("20170101", periods=10),
                    date_range("20170101", periods=10),
                )
            )
            + [np.nan],
            list(
                zip(
                    timedelta_range("0 days", periods=10),
                    timedelta_range("1 day", periods=10),
                )
            )
            + [np.nan],
        ],
    )
    @pytest.mark.parametrize("na_tuple", [True, False])
    def test_to_tuples_na(self, tuples, na_tuple):
        """Check how ``to_tuples`` renders a trailing NA for each ``na_tuple`` value."""
        # GH 18756
        idx = IntervalIndex.from_tuples(tuples)
        result = idx.to_tuples(na_tuple=na_tuple)

        # check the non-NA portion
        expected_notna = Index(com.asarray_tuplesafe(tuples[:-1]))
        result_notna = result[:-1]
        tm.assert_index_equal(result_notna, expected_notna)

        # check the NA portion
        result_na = result[-1]
        if na_tuple:
            assert isinstance(result_na, tuple)
            assert len(result_na) == 2
            assert all(isna(x) for x in result_na)
        else:
            assert isna(result_na)

    def test_nbytes(self):
        """Check ``nbytes`` for four int64 intervals."""
        # GH 19209
        left = np.arange(0, 4, dtype="i8")
        right = np.arange(1, 5, dtype="i8")

        result = IntervalIndex.from_arrays(left, right).nbytes
        expected = 64  # 4 * 8 * 2
        assert result == expected

    @pytest.mark.parametrize("new_closed", ["left", "right", "both", "neither"])
    def test_set_closed(self, name, closed, new_closed):
        """Check ``set_closed`` changes the closed side and keeps the name."""
        # GH 21670
        index = interval_range(0, 5, closed=closed, name=name)
        result = index.set_closed(new_closed)
        expected = interval_range(0, 5, closed=new_closed, name=name)
        tm.assert_index_equal(result, expected)

    @pytest.mark.parametrize("bad_closed", ["foo", 10, "LEFT", True, False])
    def test_set_closed_errors(self, bad_closed):
        """Check ``set_closed`` raises ValueError for an invalid ``closed`` value."""
        # GH 21670
        index = interval_range(0, 5)
        msg = f"invalid option for 'closed': {bad_closed}"
        with pytest.raises(ValueError, match=msg):
            index.set_closed(bad_closed)

    def test_is_all_dates(self):
        """Check ``is_all_dates`` is falsy for an index of Timestamp intervals."""
        # GH 23576
        year_2017 = pd.Interval(
            pd.Timestamp("2017-01-01 00:00:00"), pd.Timestamp("2018-01-01 00:00:00")
        )
        year_2017_index = pd.IntervalIndex([year_2017])
        assert not year_2017_index.is_all_dates

    @pytest.mark.parametrize("key", [[5], (2, 3)])
    def test_get_value_non_scalar_errors(self, key):
        """Check ``get_value`` raises InvalidIndexError (and warns) for non-scalar keys."""
        # GH 31117
        idx = IntervalIndex.from_tuples([(1, 3), (2, 4), (3, 5), (7, 10), (3, 10)])
        s = pd.Series(range(len(idx)), index=idx)

        msg = str(key)
        with pytest.raises(InvalidIndexError, match=msg):
            with tm.assert_produces_warning(FutureWarning):
                idx.get_value(s, key)

    @pytest.mark.parametrize("closed", ["left", "right", "both"])
    def test_pickle_round_trip_closed(self, closed):
        """Check that an index survives a pickle round trip for each ``closed`` value."""
        # https://github.com/pandas-dev/pandas/issues/35658
        idx = IntervalIndex.from_tuples([(1, 2), (2, 3)], closed=closed)
        result = tm.round_trip_pickle(idx)
        tm.assert_index_equal(result, idx)
class TestFloatSubtype(AstypeTests):
    """Astype checks for an IntervalIndex whose subtype is float."""

    # Parameters of the ``index`` fixture below; the second entry
    # contains NaN endpoints.
    indexes = [
        interval_range(-10.0, 10.0, inclusive="neither"),
        IntervalIndex.from_arrays(
            [-1.5, np.nan, 0.0, 0.0, 1.5],
            [-0.5, np.nan, 1.0, 1.0, 3.0],
            inclusive="both",
        ),
    ]

    @pytest.fixture(params=indexes)
    def index(self, request):
        """Hand out each element of ``indexes`` in turn."""
        return request.param

    @pytest.mark.parametrize("subtype", ["int64", "uint64"])
    def test_subtype_integer(self, subtype):
        """Casting to an integer subtype matches casting each side."""
        float_index = interval_range(0.0, 10.0)
        target = IntervalDtype(subtype, "right")

        converted = float_index.astype(target)
        expected = IntervalIndex.from_arrays(
            float_index.left.astype(subtype),
            float_index.right.astype(subtype),
            inclusive=float_index.inclusive,
        )
        tm.assert_index_equal(converted, expected)

        # raises with NA
        msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
        with pytest.raises(ValueError, match=msg):
            float_index.insert(0, np.nan).astype(target)

    @pytest.mark.parametrize("subtype", ["int64", "uint64"])
    def test_subtype_integer_with_non_integer_borders(self, subtype):
        """Same comparison as above, but with endpoints spaced 0.25 apart."""
        fractional_index = interval_range(0.0, 3.0, freq=0.25)
        target = IntervalDtype(subtype, "right")

        converted = fractional_index.astype(target)
        expected = IntervalIndex.from_arrays(
            fractional_index.left.astype(subtype),
            fractional_index.right.astype(subtype),
            inclusive=fractional_index.inclusive,
        )
        tm.assert_index_equal(converted, expected)

    def test_subtype_integer_errors(self):
        """Casting an index with negative endpoints to uint64 must raise."""
        # float64 -> uint64 fails with negative values
        negative_index = interval_range(-10.0, 10.0, inclusive="right")
        target = IntervalDtype("uint64", "right")
        msg = re.escape(
            "Cannot convert interval[float64, right] to interval[uint64, right]; "
            "subtypes are incompatible"
        )
        with pytest.raises(TypeError, match=msg):
            negative_index.astype(target)

    @pytest.mark.parametrize("subtype", ["datetime64[ns]", "timedelta64[ns]"])
    def test_subtype_datetimelike(self, index, subtype):
        """Casting a float index to a datetime-like subtype must raise."""
        target = IntervalDtype(subtype, "right")
        msg = "Cannot convert .* to .*; subtypes are incompatible"
        with pytest.raises(TypeError, match=msg):
            index.astype(target)
def test_get_loc_decreasing(self, values): # GH 25860 index = IntervalIndex.from_arrays(values[1:], values[:-1]) result = index.get_loc(index[0]) expected = 0 assert result == expected
def test_constructors_errors(self):
    """Each invalid constructor input raises with the expected message."""

    # scalar
    scalar_msg = (r'IntervalIndex\(...\) must be called with a collection of '
                  'some kind, 5 was passed')
    with tm.assert_raises_regex(TypeError, scalar_msg):
        IntervalIndex(5)

    # not an interval
    not_interval_msg = ("type <(class|type) 'numpy.int64'> with value 0 "
                        "is not an interval")
    for constructor in (IntervalIndex, IntervalIndex.from_intervals):
        with tm.assert_raises_regex(TypeError, not_interval_msg):
            constructor([0, 1])

    # invalid closed
    bad_closed_msg = "invalid options for 'closed': invalid"
    with tm.assert_raises_regex(ValueError, bad_closed_msg):
        IntervalIndex.from_arrays([0, 1], [1, 2], closed='invalid')

    # mismatched closed within intervals
    mixed_closed_msg = 'intervals must all be closed on the same side'
    with tm.assert_raises_regex(ValueError, mixed_closed_msg):
        IntervalIndex.from_intervals(
            [Interval(0, 1), Interval(1, 2, closed='left')])

    for constructor in (IntervalIndex, Index):
        with tm.assert_raises_regex(ValueError, mixed_closed_msg):
            constructor([Interval(0, 1), Interval(2, 3, closed='left')])

    # mismatched closed inferred from intervals vs constructor.
    conflict_msg = 'conflicting values for closed'
    with tm.assert_raises_regex(ValueError, conflict_msg):
        both_closed = [Interval(0, 1, closed='both'),
                       Interval(1, 2, closed='both')]
        IntervalIndex(both_closed, closed='neither')

    # no point in nesting periods in an IntervalIndex
    period_msg = 'Period dtypes are not supported, use a PeriodIndex instead'
    with tm.assert_raises_regex(ValueError, period_msg):
        IntervalIndex.from_breaks(
            pd.period_range('2000-01-01', periods=3))

    # decreasing breaks/arrays
    decreasing_msg = 'left side of interval must be <= right side'
    with tm.assert_raises_regex(ValueError, decreasing_msg):
        IntervalIndex.from_breaks(range(10, -1, -1))

    with tm.assert_raises_regex(ValueError, decreasing_msg):
        IntervalIndex.from_arrays(range(10, -1, -1), range(9, -2, -1))

    # GH 19016: categorical data
    categorical = Categorical(list('01234abcde'), ordered=True)
    subtype_msg = ('category, object, and string subtypes are not supported '
                   'for IntervalIndex')

    with tm.assert_raises_regex(TypeError, subtype_msg):
        IntervalIndex.from_breaks(categorical)

    with tm.assert_raises_regex(TypeError, subtype_msg):
        IntervalIndex.from_arrays(categorical[:-1], categorical[1:])
def f():
    # ``index1`` is not assigned here, so it is looked up in an outer scope.
    both_closed = IntervalIndex.from_arrays([0, 1], [1, 2], closed='both')
    index1.append(both_closed)
def setup(self, N): left = np.append(np.arange(N), np.array(0)) right = np.append(np.arange(1, N + 1), np.array(1)) self.intv = IntervalIndex.from_arrays(left, right) self.intv._engine
def setup_method(self, method):
    """Populate ``self.index``, ``self.index_with_nan`` and ``self.indices``.

    ``method`` is accepted but not used.
    """
    self.index = IntervalIndex.from_arrays([0, 1], [1, 2])

    # the middle entry is NaN rather than a (left, right) tuple
    tuples_with_na = [(0, 1), np.nan, (1, 2)]
    self.index_with_nan = IntervalIndex.from_tuples(tuples_with_na)

    self.indices = {"intervalIndex": tm.makeIntervalIndex(10)}
def create_index_with_nan(self, closed='right'):
    """Return a 10-interval IntervalIndex whose second entry is NaN.

    Entry ``i`` has endpoints ``(i, i + 1)``, except position 1, where
    both endpoints are NaN. ``closed`` is passed through to
    ``IntervalIndex.from_arrays``.
    """
    present = np.ones(10, dtype=bool)
    present[1] = False  # position 1 gets NaN on both sides

    left = np.where(present, np.arange(10), np.nan)
    right = np.where(present, np.arange(1, 11), np.nan)
    return IntervalIndex.from_arrays(left, right, closed=closed)
def test_constructors(self, data, closed, name):
    """Every constructor must build the same index as ``_simple_new``."""
    lower, upper = data[:-1], data[1:]
    pairs = lzip(lower, upper)
    intervals = [Interval(lo, hi, closed=closed) for lo, hi in pairs]
    expected = IntervalIndex._simple_new(
        left=lower, right=upper, closed=closed, name=name)

    def check(result):
        # every construction route is compared against the same reference
        tm.assert_index_equal(result, expected)

    # validate expected
    assert expected.closed == closed
    assert expected.name == name
    assert expected.dtype.subtype == data.dtype
    tm.assert_index_equal(expected.left, data[:-1])
    tm.assert_index_equal(expected.right, data[1:])

    # validated constructors
    check(IntervalIndex(intervals, name=name))
    check(IntervalIndex.from_intervals(intervals, name=name))
    check(IntervalIndex.from_breaks(data, closed=closed, name=name))
    check(IntervalIndex.from_arrays(
        lower, upper, closed=closed, name=name))
    check(IntervalIndex.from_tuples(pairs, closed=closed, name=name))

    via_index = Index(intervals, name=name)
    assert isinstance(via_index, IntervalIndex)
    check(via_index)

    # idempotent
    check(Index(expected))
    check(IntervalIndex(expected))
    check(IntervalIndex.from_intervals(expected))
    check(IntervalIndex.from_intervals(
        expected.values, name=expected.name))

    exp_left, exp_right = expected.left, expected.right
    check(IntervalIndex.from_arrays(
        exp_left, exp_right, closed=expected.closed, name=expected.name))

    check(IntervalIndex.from_tuples(
        expected.to_tuples(), closed=expected.closed, name=expected.name))

    # rebuild the break points: all left edges plus the final right edge
    breaks = expected.left.tolist() + [expected.right[-1]]
    check(IntervalIndex.from_breaks(
        breaks, closed=expected.closed, name=expected.name))