def test_sanitize_records_triu_action():
    """Check 'reflect', 'drop' and 'raise' handling of lower-triangle pairs."""
    # reflect: tril records come back with their two sides swapped and
    # the reflected strand normalized to '+'.
    frame = pd.read_csv(StringIO(valid_data), sep="\t", names=columns)
    result = sanitize_records(
        bins, schema="pairs", validate=True, tril_action="reflect"
    )(frame.copy())
    lower_ix = result.index[~np.asarray(result["triu"], dtype=bool)]
    assert (result.loc[lower_ix, "chrom1"] == frame.loc[lower_ix, "chrom2"]).all()
    assert (result.loc[lower_ix, "chrom2"] == frame.loc[lower_ix, "chrom1"]).all()
    assert (result.loc[lower_ix, "strand1"] == "+").all()

    # drop: tril records are removed, so only the triu rows survive.
    frame = pd.read_csv(StringIO(valid_data), sep="\t", names=columns)
    result = sanitize_records(
        bins, schema="pairs", validate=True, tril_action="drop"
    )(frame.copy())
    lower_ix = result.index[~np.asarray(result["triu"], dtype=bool)]
    assert (result.loc[lower_ix, "chrom1"] == frame.loc[lower_ix, "chrom2"]).all()
    assert (result.loc[lower_ix, "chrom2"] == frame.loc[lower_ix, "chrom1"]).all()
    assert (result.loc[lower_ix, "strand1"] == "+").all()
    assert len(result) == frame["triu"].sum()

    # raise: any tril record aborts the sanitization.
    sanitizer = sanitize_records(
        bins, schema="pairs", validate=True, tril_action="raise"
    )
    frame = pd.read_csv(StringIO(valid_data), sep="\t", names=columns)
    with pytest.raises(BadInputError):
        sanitizer(frame)
def test_aggregate_records():
    """Smoke test: sanitize the toy pairs file, then run the aggregator on it."""
    chromsizes = cooler.util.read_chromsizes(op.join(datadir, "toy.chrom.sizes"))
    toy_bins = cooler.binnify(chromsizes, 1)
    pair_columns = [
        "read_id",
        "chrom1",
        "pos1",
        "chrom2",
        "pos2",
        "strand1",
        "strand2",
        "value",
    ]
    records = pd.read_csv(
        op.join(datadir, "toy.pairs"), sep="\t", names=pair_columns
    )
    sanitizer = sanitize_records(
        toy_bins,
        schema="pairs",
        validate=False,
        tril_action="reflect",
        is_one_based=False,
        sort=False,
    )
    # The aggregator should consume a sanitized chunk without raising.
    aggregate_records()(sanitizer(records))
def test_sanitize_records_with_nuisance_records():
    """Records on chromosomes absent from the bin table ('chr9') are dropped."""
    text = _insert_lines(valid_data, nuisance_chroms)
    chunk = pd.read_csv(StringIO(text), sep="\t", names=columns)
    out = sanitize_records(bins, schema="pairs", validate=True, tril_action="reflect")(
        chunk.copy()
    )
    # BUGFIX: `"chr9" in series` tests membership in the *index*, not the
    # values, so the old assertion was vacuously true. Compare the column
    # values themselves instead.
    assert not (out["chrom1"] == "chr9").any()
    assert not (out["chrom2"] == "chr9").any()
def test_sanitize_triu_action():
    """Verify the three tril_action modes: reflect, drop and raise."""
    for action in ('reflect', 'drop'):
        frame = pd.read_csv(StringIO(valid_data), sep='\t', names=columns)
        sanitized = sanitize_records(
            bins,
            schema='pairs',
            validate=True,
            tril_action=action,
        )(frame.copy())
        # Rows flagged as lower-triangle should have been mirrored.
        mirrored = sanitized.index[~np.asarray(sanitized['triu'], dtype=bool)]
        assert (
            sanitized.loc[mirrored, 'chrom1'] == frame.loc[mirrored, 'chrom2']
        ).all()
        assert (
            sanitized.loc[mirrored, 'chrom2'] == frame.loc[mirrored, 'chrom1']
        ).all()
        assert (sanitized.loc[mirrored, 'strand1'] == '+').all()
        if action == 'drop':
            # Dropping leaves exactly the upper-triangle rows behind.
            assert len(sanitized) == frame['triu'].sum()

    # 'raise' mode aborts on the first lower-triangle record.
    func = sanitize_records(
        bins,
        schema='pairs',
        validate=True,
        tril_action='raise',
    )
    frame = pd.read_csv(StringIO(valid_data), sep='\t', names=columns)
    with pytest.raises(BadInputError):
        func(frame)
def test_sanitize_with_nuisance_records():
    """Records on chromosomes missing from the bin table ('chr9') are dropped."""
    text = _insert_lines(valid_data, nuisance_chroms)
    chunk = pd.read_csv(StringIO(text), sep='\t', names=columns)
    out = sanitize_records(
        bins,
        schema='pairs',
        validate=True,
        tril_action='reflect',
    )(chunk.copy())
    # BUGFIX: `'chr9' in series` tests the Series *index*, not its values,
    # so the old assertion always passed. Check the actual column values.
    assert not (out['chrom1'] == 'chr9').any()
    assert not (out['chrom2'] == 'chr9').any()
def test_sanitize_records_with_bad_records():
    """Out-of-bounds positions (below zero or past the chromosome end) raise."""
    sanitizer = sanitize_records(
        bins, schema="pairs", validate=True, tril_action="reflect"
    )
    # Both kinds of out-of-bounds coordinates are rejected identically.
    for bad_lines in (oob_lower, oob_upper):
        frame = pd.read_csv(
            StringIO(_insert_lines(valid_data, bad_lines)),
            sep="\t",
            names=columns,
        )
        with pytest.raises(BadInputError):
            sanitizer(frame)
def test_sanitize_with_strand_column():
    """With 'strand' listed as a sided field, reflection also flips strand."""
    frame = pd.read_csv(StringIO(valid_data), sep='\t', names=columns)
    sanitizer = sanitize_records(
        bins,
        schema='pairs',
        validate=True,
        tril_action='reflect',
        sided_fields=('chrom', 'pos', 'strand'),
    )
    out = sanitizer(frame.copy())
    # Reflected (formerly lower-triangle) rows: sides swapped, strand flipped.
    mask = ~np.asarray(out['triu'], dtype=bool)
    assert (out.loc[mask, 'chrom1'] == frame.loc[mask, 'chrom2']).all()
    assert (out.loc[mask, 'chrom2'] == frame.loc[mask, 'chrom1']).all()
    assert (out.loc[mask, 'strand1'] == '-').all()
def test_sanitize_records_with_strand_column():
    """Reflection swaps strand columns too when 'strand' is a sided field."""
    frame = pd.read_csv(StringIO(valid_data), sep="\t", names=columns)
    result = sanitize_records(
        bins,
        schema="pairs",
        validate=True,
        tril_action="reflect",
        sided_fields=("chrom", "pos", "strand"),
    )(frame.copy())
    reflected = ~np.asarray(result["triu"], dtype=bool)
    # Sides were swapped on the reflected rows...
    assert (result.loc[reflected, "chrom1"] == frame.loc[reflected, "chrom2"]).all()
    assert (result.loc[reflected, "chrom2"] == frame.loc[reflected, "chrom1"]).all()
    # ...and strand1 picked up the mate's '-' orientation.
    assert (result.loc[reflected, "strand1"] == "-").all()
def test_sanitize_with_bad_records():
    """Coordinates outside the valid range trigger BadInputError."""
    func = sanitize_records(
        bins,
        schema='pairs',
        validate=True,
        tril_action='reflect',
    )
    # Too-low and too-high coordinates are both rejected.
    for oob in (oob_lower, oob_upper):
        text = _insert_lines(valid_data, oob)
        chunk = pd.read_csv(StringIO(text), sep='\t', names=columns)
        with pytest.raises(BadInputError):
            func(chunk)
def test_sanitize_records():
    """Exercise sanitize_records across schemas, bin tables and chrom encodings."""
    # An unknown schema name is rejected up front.
    frame = pd.read_csv(StringIO(valid_data), sep="\t", names=columns)
    with pytest.raises(ValueError):
        sanitize_records(
            bins,
            schema="doesnotexist",
            validate=True,
            tril_action="reflect",
            is_one_based=True,
            sort=True,
        )(frame.copy())

    # The standard 'pairs' schema with fixed-width bins works.
    frame = pd.read_csv(StringIO(valid_data), sep="\t", names=columns)
    sanitize_records(
        bins,
        schema="pairs",
        validate=True,
        tril_action="reflect",
        is_one_based=True,
        sort=True,
    )(frame.copy())

    # Variable-length bins are also accepted.
    var_bins = pd.DataFrame({
        'chrom': ['chr1', 'chr1', 'chr2', 'chr2', 'chr3'],
        'start': [0, 150, 0, 100, 0],
        'end': [150, 300, 100, 300, 300],
    })
    frame = pd.read_csv(StringIO(valid_data), sep="\t", names=columns)
    sanitize_records(
        var_bins,
        schema="pairs",
        validate=True,
        tril_action="reflect",
        is_one_based=True,
        sort=True,
    )(frame.copy())

    # Input whose chromosomes are already enum-encoded (decode_chroms=False).
    text = """0\t1\t+\t1\t100\t-\t.\tLL\t1
1\t99\t+\t0\t13\t-\t.\tLL\t0
1\t13\t+\t1\t60\t-\t.\tLL\t1
0\t200\t+\t1\t50\t-\t.\tLL\t1
2\t11\t+\t2\t40\t-\t.\tLL\t1
0\t234\t+\t2\t30\t-\t.\tLL\t1
2\t3\t+\t1\t20\t-\t.\tLL\t0
1\t23\t+\t2\t11\t-\t.\tLL\t1
0\t123\t+\t-1\t200\t-\t.\tLL\t1
"""
    frame = pd.read_csv(StringIO(text), sep="\t", names=columns)
    sanitize_records(
        bins, schema="pairs", decode_chroms=False, validate=True, tril_action="reflect"
    )(frame.copy())

    # String chromosome names are invalid when decoding is turned off.
    frame = pd.read_csv(StringIO(valid_data), sep="\t", names=columns)
    with pytest.raises(BadInputError):
        sanitize_records(
            bins, schema="pairs", decode_chroms=False, validate=True, tril_action="reflect"
        )(frame.copy())

    # An empty chunk sanitizes to an empty frame.
    out = sanitize_records(
        bins, schema="pairs", validate=True, tril_action="reflect"
    )(frame.iloc[0:0])
    assert len(out) == 0