def load():
    """Read the BED file ``filename`` into a pandas DataFrame.

    Every entry returned by ``read_bed`` becomes one row with the columns
    ``chr``, ``start``, ``stop`` (``position + length``), ``score``,
    ``strand`` and ``name``.

    Post-processing:

    * if ``filter_invalid_chromosomes`` is set, rows whose chromosome is not
      among ``genome.get_chromosome_lengths()`` are dropped,
    * the ``score`` column is dropped when every score is NaN,
    * the ``name`` column is dropped when there is more than one row and all
      rows carry the same name.

    Returns:
        pandas.DataFrame: the loaded intervals.

    Raises:
        ValueError: if no rows remain after loading/filtering.
    """
    valid_chromosomes = set(genome.get_chromosome_lengths())
    entries = read_bed(filename)
    # NOTE: the builtin ``object``/``float`` replace the ``np.object`` /
    # ``np.float`` aliases, which were removed in NumPy 1.24. The aliases
    # referred to exactly these builtins, so the dtypes are unchanged.
    data = pd.DataFrame(
        {
            "chr": np.array(
                [chromosome_mangler(to_string(e.refseq)) for e in entries],
                dtype=object,
            ),
            "start": np.array([e.position for e in entries], dtype=np.int32),
            "stop": np.array(
                [e.position + e.length for e in entries], dtype=np.int32
            ),
            "score": np.array([e.score for e in entries], dtype=float),
            "strand": np.array([e.strand for e in entries], dtype=np.int8),
            "name": np.array(
                [to_string(e.name) for e in entries], dtype=object
            ),
        }
    )
    if filter_invalid_chromosomes:  # pragma: no cover
        data = data[data["chr"].isin(valid_chromosomes)]
    res = data
    if len(res) == 0:
        raise ValueError("Emtpty Bed file - %s" % filename)
    # A score column consisting only of NaN carries no information.
    if (np.isnan(res["score"])).all():
        res = res.drop(["score"], axis=1)
    # A single name shared by all (more than one) rows is not a useful label.
    if (len(res["name"]) > 1) and (len(res["name"].unique()) == 1):
        res = res.drop(["name"], axis=1)
    return res
def load():
    """Parse the GFF file ``filename`` into a DataFrame of intervals.

    Entries rejected by ``filter_function`` (when one is given) are skipped.
    The sequence name is passed through ``chromosome_mangler`` when one is
    given, and negative start coordinates are clamped to 0 if
    ``fix_negative_coordinates`` is set.

    The resulting frame has the columns ``chr``, ``start``, ``stop``,
    ``score`` and ``strand``; a ``name`` column (first value of the ``Name``
    attribute, ``""`` when absent) is only included if at least one entry
    has a non-empty name.
    """
    from mbf_fileformats.gff import gffToDict

    chromosomes = []
    starts = []
    stops = []
    scores = []
    strands = []
    names = []
    for record in gffToDict(filename, comment_char=comment_char):
        if filter_function and not filter_function(record):
            continue
        seqname = record["seqname"]
        if chromosome_mangler:
            seqname = chromosome_mangler(seqname)
        chromosomes.append(to_string(seqname))
        begin = record["start"]
        if fix_negative_coordinates and begin < 0:
            begin = 0
        starts.append(begin)
        stops.append(record["end"])
        scores.append(record["score"])
        strands.append(record["strand"])
        names.append(record["attributes"].get("Name", [""])[0])

    columns = {
        "chr": chromosomes,
        "start": starts,
        "stop": stops,
        "score": scores,
        "strand": strands,
    }
    # Only keep the name column when at least one entry actually had a name.
    if any(names):
        columns["name"] = names
    return pd.DataFrame(columns)
def do_liftover(self, listOfChromosomeIntervals, chain_file):
    """Lift intervals over to another assembly with the ``liftOver`` binary.

    Each row of ``listOfChromosomeIntervals`` is written as one
    space-separated line to a temporary input file, and
    ``self.algo.path / "liftOver"`` is run on it with ``chain_file``.

    Intervals the tool writes to its fourth file argument (unmapped records,
    per the UCSC liftOver usage) are silently discarded - that file is never
    inspected.

    Args:
        listOfChromosomeIntervals: iterable of rows; every element of a row
            is converted with ``str`` before being written.
        chain_file: path of the chain file handed to ``liftOver``.

    Returns:
        list of tuples, one per line of the tool's output file. The first
        field is converted with ``to_string``, the second and third with
        ``int``; any further tab-separated fields are left as ``bytes``.

    Raises:
        ValueError: if ``liftOver`` exits with a non-zero return code.
    """
    # The context managers guarantee that all temporary files are closed
    # (and thereby removed) even when liftOver fails or parsing raises.
    with tempfile.NamedTemporaryFile(mode="wb") as tmp_input, \
            tempfile.NamedTemporaryFile(mode="wb") as tmp_output, \
            tempfile.NamedTemporaryFile(mode="wb") as tmp_error:
        for row in listOfChromosomeIntervals:
            tmp_input.write(b" ".join(to_bytes(str(x)) for x in row))
            tmp_input.write(b"\n")
        # Trailing empty line kept from the original implementation.
        tmp_input.write(b"\n")
        # Flush so the child process sees the complete input on disk.
        tmp_input.flush()
        cmd = [
            self.algo.path / "liftOver",
            tmp_input.name,
            chain_file,
            tmp_output.name,
            tmp_error.name,
        ]
        p = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        _, stderr = p.communicate()
        if p.returncode != 0:  # pragma: no cover
            raise ValueError(
                "do_liftover failed. Returncode: %s, stderr: %s"
                % (p.returncode, stderr)
            )
        res = []
        # liftOver writes to the file by name, so re-open it for reading.
        with open(tmp_output.name, "rb") as tmp_output_in:
            for line in tmp_output_in:
                fields = line.strip().split(b"\t")
                fields[0] = to_string(fields[0])
                fields[1] = int(fields[1])
                fields[2] = int(fields[2])
                res.append(tuple(fields))
        return res
def test_to_string():
    """to_string returns a str unchanged (same object) and decodes UTF-8 bytes."""
    text = "für"
    encoded = text.encode("utf-8")
    # A str input must come back as the very same object, not a copy.
    assert to_string(text) is text
    # UTF-8 encoded bytes must be decoded to the equivalent str.
    assert to_string(encoded) == text
def load():
    """Load the wiggle file ``filename`` as intervals and widen them.

    The frame produced by ``wiggle_to_intervals`` gets its ``chr`` values
    converted with ``to_string``; ``start`` is then decreased by
    ``enlarge_5prime`` and ``stop`` increased by ``enlarge_3prime``.
    """
    intervals = wiggle_to_intervals(filename, comment_char=comment_char)
    intervals["chr"] = list(map(to_string, intervals["chr"]))
    # Widen every interval on both sides.
    intervals["start"] -= enlarge_5prime
    intervals["stop"] += enlarge_3prime
    return intervals