Пример #1
0
 def load():
     """Read the BED file at ``filename`` into a pandas DataFrame.

     One row is produced per entry returned by ``read_bed``, with the
     columns ``chr`` (mangled reference name), ``start``, ``stop``
     (``position + length``), ``score``, ``strand`` and ``name``.

     Post-processing applied to the frame:

     * if ``filter_invalid_chromosomes`` is set, rows whose ``chr`` is not
       among the genome's chromosomes are dropped;
     * the ``score`` column is dropped when every score is NaN;
     * the ``name`` column is dropped when there is more than one row and
       all names are identical.

     Returns:
         pandas.DataFrame: the loaded intervals.

     Raises:
         ValueError: if no rows are left after (optional) filtering.
     """
     valid_chromosomes = set(genome.get_chromosome_lengths())
     entries = read_bed(filename)
     # The builtin ``object`` / ``float`` are used as dtypes: the
     # ``np.object`` / ``np.float`` aliases were removed from NumPy and
     # were plain aliases of the builtins anyway.
     data = {
         "chr": np.array(
             [chromosome_mangler(to_string(e.refseq)) for e in entries],
             dtype=object,
         ),
         "start": np.array([e.position for e in entries], dtype=np.int32),
         "stop": np.array(
             [e.position + e.length for e in entries], dtype=np.int32
         ),
         "score": np.array([e.score for e in entries], dtype=float),
         "strand": np.array([e.strand for e in entries], dtype=np.int8),
         "name": np.array([to_string(e.name) for e in entries], dtype=object),
     }
     res = pd.DataFrame(data)
     if filter_invalid_chromosomes:  # pragma: no cover
         keep = [x in valid_chromosomes for x in res["chr"]]
         res = res[keep]
     if len(res) == 0:
         raise ValueError("Emtpty Bed file - %s" % filename)
     # A score column that carries no information at all is not kept.
     if (np.isnan(res["score"])).all():
         res = res.drop(["score"], axis=1)
     # A single name shared by every row is not kept either.
     if (len(res["name"]) > 1) and (len(res["name"].unique()) == 1):
         res = res.drop(["name"], axis=1)
     return res
Пример #2
0
    def load():
        """Read the GFF file at ``filename`` into a pandas DataFrame.

        Entries rejected by ``filter_function`` (when one is given) are
        skipped. The sequence name is passed through ``chromosome_mangler``
        when one is given, and negative start coordinates are clamped to 0
        when ``fix_negative_coordinates`` is set.

        Returns:
            pandas.DataFrame: columns ``chr``, ``start``, ``stop``, ``score``,
            ``strand`` and, only if at least one entry has a non-empty
            ``Name`` attribute, ``name``.
        """
        from mbf_fileformats.gff import gffToDict

        columns = {
            key: []
            for key in ("chr", "start", "stop", "score", "strand", "name")
        }
        for record in gffToDict(filename, comment_char=comment_char):
            if filter_function and not filter_function(record):
                continue
            seqname = record["seqname"]
            if chromosome_mangler:
                seqname = chromosome_mangler(seqname)
            columns["chr"].append(to_string(seqname))
            begin = record["start"]
            if fix_negative_coordinates and begin < 0:
                begin = 0
            columns["start"].append(begin)
            columns["stop"].append(record["end"])
            columns["score"].append(record["score"])
            columns["strand"].append(record["strand"])
            # First value of the "Name" attribute, or "" when it is absent.
            columns["name"].append(record["attributes"].get("Name", [""])[0])
        # Drop the name column entirely when no entry carried a usable name.
        if not any(columns["name"]):
            del columns["name"]
        return pd.DataFrame(columns)
Пример #3
0
 def do_liftover(self, listOfChromosomeIntervals, chain_file):
     """Run the external ``liftOver`` tool on a list of intervals.

     Each row of ``listOfChromosomeIntervals`` is written as one
     space-separated line to a temporary input file, which is handed to
     ``liftOver`` together with ``chain_file``.

     The fourth file passed to ``liftOver`` (presumably where it records
     intervals it could not convert) is ignored, so such intervals are
     silently missing from the result.

     Args:
         listOfChromosomeIntervals: iterable of rows; every element of a
             row is converted with ``str``.
         chain_file: path of the chain file passed to ``liftOver``.

     Returns:
         list of tuples, one per output line: the first field decoded to
         ``str``, the second and third converted to ``int``, any further
         fields left as ``bytes``.

     Raises:
         ValueError: if ``liftOver`` exits with a non-zero return code.
     """
     # Context managers guarantee the temporary files are closed (and
     # thereby removed) even when liftOver fails or parsing raises.
     with tempfile.NamedTemporaryFile(mode="wb") as tmp_input, \
             tempfile.NamedTemporaryFile(mode="wb") as tmp_output, \
             tempfile.NamedTemporaryFile(mode="wb") as tmp_error:
         for row in listOfChromosomeIntervals:
             tmp_input.write(b" ".join(to_bytes(str(x)) for x in row))
             tmp_input.write(b"\n")
         tmp_input.write(b"\n")
         # liftOver opens the file by name, so buffered data must be
         # flushed to disk before the subprocess starts.
         tmp_input.flush()
         cmd = [
             self.algo.path / "liftOver",
             tmp_input.name,
             chain_file,
             tmp_output.name,
             tmp_error.name,
         ]
         p = subprocess.Popen(cmd,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
         dummy_stdout, stderr = p.communicate()
         if p.returncode != 0:  # pragma: no cover
             raise ValueError(
                 "do_liftover failed. Returncode: %s, stderr: %s" %
                 (p.returncode, stderr))
         res = []
         # Re-open by name: the result was written by the subprocess, not
         # through our own (write-mode) handle.
         with open(tmp_output.name, "rb") as tmp_output_in:
             for line in tmp_output_in:
                 fields = line.strip().split(b"\t")
                 fields[0] = to_string(fields[0])
                 fields[1] = int(fields[1])
                 fields[2] = int(fields[2])
                 res.append(tuple(fields))
     return res
Пример #4
0
def test_to_string():
    """to_string hands back a str unchanged and decodes UTF-8 bytes."""
    text = "für"
    encoded = text.encode("utf-8")
    # A str input must come back as the very same object, not a copy.
    assert to_string(text) is text
    assert to_string(encoded) == text
Пример #5
0
 def load():
     """Read the wiggle file at ``filename`` as an interval DataFrame.

     The ``chr`` column is converted with ``to_string``; every interval is
     then widened by subtracting ``enlarge_5prime`` from ``start`` and
     adding ``enlarge_3prime`` to ``stop``.

     Returns:
         The DataFrame produced by ``wiggle_to_intervals``, adjusted as
         described above.
     """
     intervals = wiggle_to_intervals(filename, comment_char=comment_char)
     intervals["chr"] = list(map(to_string, intervals["chr"]))
     intervals["start"] -= enlarge_5prime
     intervals["stop"] += enlarge_3prime
     return intervals