Exemplo n.º 1
0
def parse_busco_full_summary(busco_file, chunk=100000):
    """Parse a BUSCO full summary file into per-sequence count windows.

    Lines starting with "#" are skipped. Every remaining line is split on
    tabs; rows whose second field is "Complete" or "Duplicated" are kept and
    grouped by their third field, with the fourth and fifth fields parsed as
    integer start and end coordinates.

    For each kept record a window ``[start, start + chunk)`` is opened. A
    window counts the record that opened it and every later record (in
    start order) whose end is below the window end.

    Args:
        busco_file: File reference passed to ``tofile.open_file_handle``.
        chunk: Width added to a record start to obtain its window end.

    Returns:
        dict mapping each sequence name to a list of
        ``[window_start, window_end, count]`` lists, sorted by count with the
        highest count first.
    """
    logger.info("Parsing BUSCO full summary file")
    locations = defaultdict(list)
    with tofile.open_file_handle(busco_file) as fh:
        for line in fh:
            if line.startswith("#"):
                continue
            parts = line.split("\t")
            if parts[1] in ("Complete", "Duplicated"):
                locations[parts[2]].append((int(parts[3]), int(parts[4])))
    windows = {}
    for title, tuples in locations.items():
        tuples.sort(key=lambda tup: tup[0])
        seq_windows = []
        first_open = 0
        for start, end in tuples:
            seq_windows.append([start, start + chunk, 0])
            # Records are visited in ascending start order, so a window whose
            # end is at or before this start can never hold a later record.
            # Retiring on the record END instead (as was done previously)
            # dropped windows too early: a long record closed them and a
            # following shorter, overlapping record was then not counted.
            # NOTE(review): assumes start <= end on every row - confirm.
            while (first_open < len(seq_windows)
                   and seq_windows[first_open][1] <= start):
                first_open += 1
            for window in seq_windows[first_open:]:
                if end < window[1]:
                    window[2] += 1
        seq_windows.sort(key=lambda window: window[2], reverse=True)
        windows[title] = seq_windows
    return windows
Exemplo n.º 2
0
def parse_full_tsv(filename):
    """Read a headed TSV holding a single row of values per sequence.

    The first line is the header; its fields from the fourth onward name the
    value columns. For every other line the first field is the sequence ID
    and the second and third fields are integers whose difference is stored
    as ``length``; the third field itself is stored as ``position``.

    Value columns are routed by the suffix of their name:

    * ``*_sd``    -> ``sd`` under the name without the suffix, as float
    * ``*_n``     -> ``n`` under the name without the suffix, as float
    * ``*_cpm``   -> ``values``, divided by length, multiplied by 1000000
      and rounded to three significant figures
    * ``*_count`` -> ``values`` as int
    * otherwise   -> ``values`` as float

    Returns:
        Tuple ``(values, sd, n)`` of dicts keyed by column name, each mapping
        sequence ID to the parsed value.
    """
    values = defaultdict(dict)
    sd = defaultdict(dict)
    n = defaultdict(dict)
    columns = None
    with tofile.open_file_handle(filename) as fh:
        for record in fh:
            fields = record.strip().split("\t")
            if columns is None:
                # Header line: remember where each named column lives.
                columns = {
                    name: pos for pos, name in enumerate(fields[3:], start=3)
                }
                continue
            seqid = fields[0]
            end = int(fields[2])
            span = end - int(fields[1])
            values["length"][seqid] = span
            values["position"][seqid] = end
            for name, pos in columns.items():
                if name.endswith("_sd"):
                    sd[name[:-3]][seqid] = float(fields[pos])
                    continue
                if name.endswith("_n"):
                    n[name[:-2]][seqid] = float(fields[pos])
                    continue
                if name.endswith("_cpm"):
                    scaled = float(fields[pos]) / span * 1000000
                    values[name][seqid] = float("%.3g" % scaled)
                    continue
                convert = int if name.endswith("_count") else float
                values[name][seqid] = convert(fields[pos])
    return values, sd, n
Exemplo n.º 3
0
def parse_windowed_tsv(filename, window):
    """Read a headed TSV holding several rows (windows) per sequence.

    The first line is the header; its fields from the fourth onward name the
    value columns. For every other line the first field is the sequence ID
    and the second and third fields are integer coordinates. Each row appends
    its length (third minus second field) and its rounded midpoint to the
    ``length`` and ``position`` lists of that sequence.

    When ``float(window)`` is greater than 1, a row shorter than the most
    recently recorded length for its sequence is skipped entirely.

    Value columns are routed by name suffix: ``_sd`` and ``_n`` go to the
    ``sd`` and ``n`` results (suffix removed, float); ``_cpm`` is divided by
    the row length, multiplied by 1000000 and rounded to three significant
    figures; ``_count`` is stored as int; everything else as float.

    Returns:
        Tuple ``(values, sd, n)`` of nested dicts: column name -> sequence
        ID -> list of values in file order.
    """

    def nested_lists():
        return defaultdict(lambda: defaultdict(list))

    values, sd, n = nested_lists(), nested_lists(), nested_lists()
    last_length = {}
    columns = None
    with tofile.open_file_handle(filename) as fh:
        for record in fh:
            fields = record.strip().split("\t")
            if columns is None:
                # Header line: remember where each named column lives.
                columns = {
                    name: pos for pos, name in enumerate(fields[3:], start=3)
                }
                continue
            seqid = fields[0]
            end = int(fields[2])
            begin = int(fields[1])
            span = end - begin
            if float(window) > 1:
                if span < last_length.get(seqid, 0):
                    continue
                last_length[seqid] = span
            values["length"][seqid].append(span)
            values["position"][seqid].append(round(begin + span / 2))
            for name, pos in columns.items():
                if name.endswith("_sd"):
                    sd[name[:-3]][seqid].append(float(fields[pos]))
                    continue
                if name.endswith("_n"):
                    n[name[:-2]][seqid].append(float(fields[pos]))
                    continue
                target = values[name][seqid]
                if name.endswith("_cpm"):
                    scaled = float(fields[pos]) / span * 1000000
                    target.append(float("%.3g" % scaled))
                elif name.endswith("_count"):
                    target.append(int(fields[pos]))
                else:
                    target.append(float(fields[pos]))
    return values, sd, n
Exemplo n.º 4
0
def parse_busco_summary(filename, mask, header):
    """Count BUSCO hits per mask window and append the counts as a column.

    Comment lines (starting with "#") are skipped, except that a line
    starting with "# The lineage dataset is:" supplies the lineage name (its
    sixth whitespace-separated token) and appends "<lineage>_count" to
    ``header``. Remaining rows are tab-separated; rows with status
    "Fragmented" or "Missing" are ignored and, for the others, the integer
    start coordinate (fourth field) is collected per sequence ID (third
    field).

    If a lineage line was seen, every window in ``mask`` gets one integer
    appended to its "cols" list: the number of collected starts with
    ``window_start <= start <= window["end"]``.

    Args:
        filename: File reference passed to ``tofile.open_file_handle``.
        mask: Mapping of sequence ID to ``{window_start: {"end": int,
            "cols": list}}``. Modified in place.
        header: List of column names. Modified in place.

    Returns:
        The same ``(mask, header)`` objects that were passed in.
    """
    lineage = None
    buscos = defaultdict(list)
    with tofile.open_file_handle(filename) as fh:
        for line in fh:
            if line.startswith("#"):
                if line.startswith("# The lineage dataset is:"):
                    lineage = line.split()[5]
                    header.append("%s_count" % lineage)
                continue
            if not line.strip():
                # A blank line carries no record and cannot be unpacked.
                continue
            _busco, status, *rest = line.rstrip().split("\t")
            if status in {"Fragmented", "Missing"}:
                continue
            seqid, start, *_ = rest
            buscos[seqid].append(int(start))
    if lineage is not None:
        for seqid, windows in mask.items():
            starts = sorted(buscos.get(seqid, ()))
            i = 0
            # The pointer only moves forward, so windows are expected in
            # ascending start order - presumably the order they were loaded
            # in; verify against the caller.
            for start, obj in windows.items():
                ctr = 0
                while i < len(starts) and starts[i] <= obj["end"]:
                    if starts[i] >= start:
                        ctr += 1
                    # Always advance. Previously the pointer only moved when
                    # the hit was at or after the window start, so a hit in
                    # a gap before the window made this loop spin forever.
                    i += 1
                obj["cols"].append(ctr)
    return mask, header
Exemplo n.º 5
0
def parse_full_bed(filename):
    """Read a BED-like file holding a single value per sequence.

    Each line is stripped and split on tabs. The first field is used as the
    key and the fifth field, converted to float, as the value. Reading stops
    at the first line that has fewer than five fields, and whatever was
    collected up to that point is returned.

    Returns:
        dict mapping the first field of each row to its fifth field as float.
    """
    scores = {}
    with tofile.open_file_handle(filename) as fh:
        for record in fh:
            fields = record.strip().split("\t")
            if len(fields) >= 5:
                scores[fields[0]] = float(fields[4])
            else:
                # Short row: give up on the rest of the file.
                break
    return scores
Exemplo n.º 6
0
def parse_window_bed(filename):
    """Read a BED-like file holding several values per sequence.

    Each line is stripped and split on tabs; the first field is the sequence
    ID, the second an integer start and the fifth a float value.

    Returns:
        dict mapping each sequence ID to the list of its values ordered by
        ascending start (rows with equal starts keep their file order).
    """
    by_seq = defaultdict(list)
    with tofile.open_file_handle(filename) as fh:
        for record in fh:
            fields = record.strip().split("\t")
            by_seq[fields[0]].append((int(fields[1]), float(fields[4])))
    return {
        seq_id: [
            value for _, value in sorted(pairs, key=lambda pair: pair[0])
        ]
        for seq_id, pairs in by_seq.items()
    }
Exemplo n.º 7
0
def load_mask(filename):
    """Load a tab-separated BED-like file as a window mask.

    Each non-blank line is split on tabs into sequence ID, start, end and any
    number of further columns. A line whose first two fields are "sequence"
    and "start" is treated as the header: its extra columns become the
    returned ``header`` and the line adds no window.

    Args:
        filename: File reference passed to ``tofile.open_file_handle``.

    Returns:
        Tuple ``(mask, header)`` where ``mask`` maps sequence ID to
        ``{int(start): {"end": int(end), "cols": [extra fields as str]}}``
        and ``header`` is the list of extra column names (empty when the
        file has no header line).
    """
    mask = defaultdict(dict)
    header = []
    with tofile.open_file_handle(filename) as fh:
        for line in fh:
            line = line.rstrip()
            if not line:
                # Blank lines carry no record and cannot be unpacked.
                continue
            # A starred target always binds a list (possibly empty), so no
            # None check is needed for ``cols``.
            seqid, start, end, *cols = line.split("\t")
            if seqid == "sequence" and start == "start":
                header = cols
                continue
            mask[seqid][int(start)] = {"end": int(end), "cols": cols}
    return mask, header
Exemplo n.º 8
0
def parse_busco_summary(filename, mask, header):
    """Count BUSCO hits per mask window and append the counts as a column.

    Comment lines (starting with "#") are skipped, except that a line
    starting with "# The lineage dataset is:" supplies the lineage name (its
    sixth whitespace-separated token) and appends "<lineage>_count" to
    ``header``. Remaining rows are tab-separated; rows with status
    "Fragmented" or "Missing" are ignored and, for the others, the integer
    start coordinate (fourth field) is collected per sequence ID (third
    field).

    If a lineage line was seen, every window in ``mask`` gets one integer
    appended to its "cols" list: the number of collected starts with
    ``window_start <= start <= window["end"]``.

    Args:
        filename: File reference passed to ``tofile.open_file_handle``.
        mask: Mapping of sequence ID to ``{window_start: {"end": int,
            "cols": list}}``. Modified in place.
        header: List of column names. Modified in place.

    Returns:
        The same ``(mask, header)`` objects that were passed in.
    """
    lineage = None
    buscos = defaultdict(list)
    with tofile.open_file_handle(filename) as fh:
        for line in fh:
            if line.startswith("#"):
                if line.startswith("# The lineage dataset is:"):
                    lineage = line.split()[5]
                    header.append("%s_count" % lineage)
                continue
            if not line.strip():
                # A blank line carries no record and cannot be unpacked.
                continue
            _busco, status, *rest = line.rstrip().split("\t")
            if status in {"Fragmented", "Missing"}:
                continue
            seqid, start, *_ = rest
            buscos[seqid].append(int(start))
    if lineage is not None:
        for seqid, windows in mask.items():
            starts = sorted(buscos.get(seqid, ()))
            i = 0
            # The pointer only moves forward, so windows are expected in
            # ascending start order - presumably the order they were loaded
            # in; verify against the caller.
            for start, obj in windows.items():
                ctr = 0
                while i < len(starts) and starts[i] <= obj["end"]:
                    if starts[i] >= start:
                        ctr += 1
                    # Always advance. Previously the pointer only moved when
                    # the hit was at or after the window start, so a hit in
                    # a gap before the window made this loop spin forever.
                    i += 1
                obj["cols"].append(ctr)
    return mask, header
Exemplo n.º 9
0
def parse_chunked_values(filename):
    """Read a headed TSV of per-chunk values into per-sequence lists.

    The first line is the header; its fields from the fourth onward name the
    value columns. For every other line the first field is the sequence ID
    and the second and third fields are integer coordinates whose difference
    is the chunk length. Every value column is parsed as float.

    Returns:
        Tuple ``(lengths, values, interval)``:
        ``lengths`` maps sequence ID to the sum of its chunk lengths,
        ``values`` maps sequence ID -> column name -> list of floats in file
        order, and ``interval`` is the largest chunk length seen (0 when the
        file has no data rows).
    """
    interval = 0
    columns = None
    values = defaultdict(lambda: defaultdict(list))
    lengths = defaultdict(int)
    with tofile.open_file_handle(filename) as fh:
        for record in fh.readlines():
            fields = record.rstrip().split("\t")
            if columns is None:
                # Header line: remember where each named column lives.
                columns = {
                    name: pos for pos, name in enumerate(fields[3:], start=3)
                }
                continue
            seqid = fields[0]
            span = int(fields[2]) - int(fields[1])
            interval = max(interval, span)
            lengths[seqid] += span
            per_seq = values[seqid]
            for name, pos in columns.items():
                per_seq[name].append(float(fields[pos]))
    return lengths, values, interval
Exemplo n.º 10
0
def parse_assembly_report(filename, cat_filename, syn_filename):
    """Split an assembly report into a category table and a synonym table.

    Lines starting with "#" and blank lines are skipped. Every other line is
    split on tabs and selected fields are collected by zero-based index:

    * categories: identifier (4), assembly_role (1), assembly_level (3),
      assembly_unit (7)
    * synonyms: identifier (4), name (0), assigned_name (2),
      refseq_accession (6)

    For each table only the columns holding more than one distinct value are
    kept. The table (header row followed by one row per report line) is
    written with ``tofile.write_file``. When no column varies, a table
    consisting of just the empty header row is written.

    Args:
        filename: Report file reference passed to ``tofile.open_file_handle``.
        cat_filename: Destination passed to ``tofile.write_file`` for the
            category table.
        syn_filename: Destination passed to ``tofile.write_file`` for the
            synonym table.
    """
    cat_columns = {
        "identifier": 4,
        "assembly_role": 1,
        "assembly_level": 3,
        "assembly_unit": 7,
    }
    name_columns = {
        "identifier": 4,
        "name": 0,
        "assigned_name": 2,
        "refseq_accession": 6,
    }
    cats = {key: [] for key in cat_columns}
    names = {key: [] for key in name_columns}
    with tofile.open_file_handle(filename) as fh:
        for line in fh:
            if line.startswith("#") or not line.strip():
                continue
            row = line.rstrip().split("\t")
            for columns, collected in ((cat_columns, cats),
                                       (name_columns, names)):
                for key, index in columns.items():
                    collected[key].append(row[index])
    tofile.write_file(cat_filename, _varying_columns_table(cats))
    tofile.write_file(syn_filename, _varying_columns_table(names))


def _varying_columns_table(columns):
    """Return ``[header, *rows]`` for the columns with more than one value.

    ``columns`` maps a column name to its list of cell values; all lists are
    expected to have the same length. Column order follows the mapping.
    """
    header = [key for key, cells in columns.items() if len(set(cells)) > 1]
    table = [header]
    if header:
        # Guarded so that a report in which nothing varies yields a
        # header-only table instead of failing on an empty header.
        kept = (columns[key] for key in header)
        table.extend(list(cells) for cells in zip(*kept))
    return table