Пример #1
0
    def _get_adj_list_directional(self, umis, counts):
        ''' identify all umis within the hamming distance threshold
        and where the counts of the first umi is > (2 * second umi counts)-1'''

        adj_list = {umi: [] for umi in umis}

        if self.fuzzy_match:
            for umi1 in umis:
                # we need a second regex for some insertions,
                # e.g UMI1 = "ATCG", UMI2 = "ATTC"
                comp_regex_err = regex.compile("(%s){e<=1}" % str(umi1))
                comp_regex_del = regex.compile("(%s){i<=1}" % str(umi1)[::-1])
                for umi2 in umis:
                    if umi1 == umi2:
                        continue
                    if counts[umi1] >= (counts[umi2] * self.dir_threshold):
                        if (max(len(umi1), len(umi2)) -
                                min(len(umi1), len(umi2))) > 1:
                            continue
                        if (comp_regex_err.match(str(umi2))
                                or comp_regex_del.match(str(umi2))):
                            adj_list[umi1].append(umi2)
        else:
            for umi1, umi2 in itertools.combinations(umis, 2):
                if edit_distance(umi1, umi2) <= 1:
                    if counts[umi1] >= (counts[umi2] * 2) - 1:
                        adj_list[umi1].append(umi2)
                    if counts[umi2] >= (counts[umi1] * 2) - 1:
                        adj_list[umi2].append(umi1)

        return adj_list
Пример #2
0
def get_average_umi_distance(umis):
    ''' Return the mean edit_distance over all unordered pairs of umis.

    Returns -1 as a sentinel when fewer than two umis are given, because
    there is then no pair to measure (and no valid divisor).'''

    # zero or one umi yields no pairs; bail out before dividing by zero
    if len(umis) < 2:
        return -1

    dists = [edit_distance(x, y) for x, y in itertools.combinations(umis, 2)]
    return float(sum(dists)) / len(dists)
Пример #3
0
    def _get_adj_list_adjacency(self, umis, counts, threshold):
        ''' identify all umis within hamming distance threshold'''

        return {umi: [umi2 for umi2 in umis if
                      edit_distance(umi.encode('utf-8'),
                                    umi2.encode('utf-8')) <= threshold]
                for umi in umis}
Пример #4
0
def getErrorCorrectMapping(cell_barcodes, whitelist, threshold=1):
    ''' Find the mappings between true and false cell barcodes based
    on an edit distance threshold.

    cell_barcodes: iterable of observed barcodes.
    whitelist: iterable of accepted ("true") barcodes.
    threshold: maximum edit_distance for a barcode to be assigned to a
        whitelist barcode.

    Barcodes that are themselves whitelisted are skipped. Any cell barcode
    within the threshold to more than one whitelist barcode will be
    excluded.

    Returns a defaultdict(set) mapping each whitelist barcode (as text) to
    the set of cell barcodes assigned to it.'''

    true_to_false = collections.defaultdict(set)

    # compare as utf-8 bytes; both sides are normalised through str() first
    whitelist = {str(x).encode("utf-8") for x in whitelist}

    for cell_barcode in cell_barcodes:
        barcode_in_bytes = str(cell_barcode).encode("utf-8")

        # don't check if whitelisted -- tested once per barcode rather
        # than once per whitelist entry
        if barcode_in_bytes in whitelist:
            continue

        match = None
        for white_cell in whitelist:
            if edit_distance(barcode_in_bytes, white_cell) <= threshold:
                if match is not None:  # already matched one barcode
                    match = None  # ambiguous: discard the earlier match
                    break  # break and don't add to maps
                match = white_cell.decode("utf-8")

        if match is not None:
            true_to_false[match].add(cell_barcode)

    return true_to_false
Пример #5
0
    def _get_adj_list_adjacency(self, umis, counts, threshold):
        ''' identify all umis within hamming distance threshold'''

        return {umi: [umi2 for umi2 in umis if
                      edit_distance(umi.encode('utf-8'),
                                    umi2.encode('utf-8')) <= threshold]
                for umi in umis}
Пример #6
0
    def _get_adj_list_directional(self, umis, counts):
        ''' identify all umis within the hamming distance threshold
        and where the counts of the first umi is > (2 * second umi counts)-1'''

        adj_list = {umi: [] for umi in umis}

        if self.fuzzy_match:
            for umi1 in umis:
                # we need a second regex for some insertions,
                # e.g UMI1 = "ATCG", UMI2 = "ATTC"
                comp_regex_err = regex.compile("(%s){e<=1}" % str(umi1))
                comp_regex_del = regex.compile("(%s){i<=1}" % str(umi1)[::-1])
                for umi2 in umis:
                    if umi1 == umi2:
                        continue
                    if counts[umi1] >= (counts[umi2]*self.dir_threshold):
                        if (max(len(umi1), len(umi2)) -
                            min(len(umi1), len(umi2))) > 1:
                            continue
                        if (comp_regex_err.match(str(umi2)) or
                            comp_regex_del.match(str(umi2))):
                            adj_list[umi1].append(umi2)
        else:
            for umi1, umi2 in itertools.combinations(umis, 2):
                if edit_distance(umi1, umi2) <= 1:
                    if counts[umi1] >= (counts[umi2]*2)-1:
                        adj_list[umi1].append(umi2)
                    if counts[umi2] >= (counts[umi1]*2)-1:
                        adj_list[umi2].append(umi1)

        return adj_list
Пример #7
0
    def _get_adj_list_directional_adjacency(self, umis, counts, threshold):
        ''' identify all umis within the hamming distance threshold
        and where the counts of the first umi is > (2 * second umi counts)-1'''

        return {umi: [umi2 for umi2 in umis if
                      edit_distance(umi.encode('utf-8'),
                                    umi2.encode('utf-8')) == 1 and
                      counts[umi] >= (counts[umi2]*2)-1] for umi in umis}
Пример #8
0
def get_average_umi_distance(umis):
    ''' Return the mean edit_distance over all unordered pairs of umis,
    comparing the UTF-8 encoded form of each umi.

    Returns -1 as a sentinel when fewer than two umis are given, because
    there is then no pair to measure (and no valid divisor).'''

    # zero or one umi yields no pairs; bail out before dividing by zero
    if len(umis) < 2:
        return -1

    # encode each umi once rather than once per pair it takes part in
    encoded = [umi.encode('utf-8') for umi in umis]
    dists = [edit_distance(x, y) for
             x, y in itertools.combinations(encoded, 2)]
    return float(sum(dists))/len(dists)
Пример #9
0
    def _get_adj_list_directional(self, umis, counts, threshold=1):
        ''' identify all umis within the hamming distance threshold
        and where the counts of the first umi is > (2 * second umi counts)-1'''

        return {umi: [umi2 for umi2 in umis if
                      edit_distance(umi.encode('utf-8'),
                                    umi2.encode('utf-8')) == threshold and
                      counts[umi] >= (counts[umi2]*2)-1] for umi in umis}
Пример #10
0
    def _get_adj_list_adjacency(self, umis, counts, threshold):
        ''' identify all umis within hamming distance threshold'''

        adj_list = {umi: [] for umi in umis}
        for umi1, umi2 in itertools.combinations(umis, 2):
            if edit_distance(umi1, umi2) <= threshold:
                adj_list[umi1].append(umi2)
                adj_list[umi2].append(umi1)

        return adj_list
Пример #11
0
    def _get_adj_list_directional(self, umis, counts, threshold=1):
        ''' identify all umis within the hamming distance threshold
        and where the counts of the first umi is > (2 * second umi counts)-1'''

        adj_list = {umi: [] for umi in umis}
        for umi1, umi2 in itertools.combinations(umis, 2):
            if edit_distance(umi1, umi2) <= threshold:
                if counts[umi1] >= (counts[umi2] * 2) - 1:
                    adj_list[umi1].append(umi2)
                if counts[umi2] >= (counts[umi1] * 2) - 1:
                    adj_list[umi2].append(umi1)

        return adj_list
Пример #12
0
    def _get_adj_list_adjacency(self, umis, counts, threshold):
        ''' identify all umis within hamming distance threshold'''

        adj_list = {umi: [] for umi in umis}
        if len(umis) > 25:
            umi_length = len(umis[0])
            substr_idx = build_substr_idx(umis, umi_length, threshold)
            iter_umi_pairs = iter_nearest_neighbours(umis, substr_idx)
        else:
            iter_umi_pairs = itertools.combinations(umis, 2)
        for umi1, umi2 in iter_umi_pairs:
            if edit_distance(umi1, umi2) <= threshold:
                adj_list[umi1].append(umi2)
                adj_list[umi2].append(umi1)

        return adj_list
Пример #13
0
    def _get_adj_list_adjacency(self, umis, counts, threshold):
        ''' identify all umis within hamming distance threshold'''

        adj_list = {umi: [] for umi in umis}
        if len(umis) > 25:
            umi_length = len(umis[0])
            substr_idx = build_substr_idx(umis, umi_length, threshold)
            iter_umi_pairs = iter_nearest_neighbours(umis, substr_idx)
        else:
            iter_umi_pairs = itertools.combinations(umis, 2)
        for umi1, umi2 in iter_umi_pairs:
            if edit_distance(umi1, umi2) <= threshold:
                adj_list[umi1].append(umi2)
                adj_list[umi2].append(umi1)

        return adj_list
Пример #14
0
    def _get_adj_list_directional(self, umis, counts, threshold=1):
        ''' identify all umis within the hamming distance threshold
        and where the counts of the first umi is > (2 * second umi counts)-1'''

        adj_list = {umi: [] for umi in umis}
        if len(umis) > 25:
            umi_length = len(umis[0])
            substr_idx = build_substr_idx(umis, umi_length, threshold)
            iter_umi_pairs = iter_nearest_neighbours(umis, substr_idx)
        else:
            iter_umi_pairs = itertools.combinations(umis, 2)
        for umi1, umi2 in iter_umi_pairs:
            if edit_distance(umi1, umi2) <= threshold:
                if counts[umi1] >= (counts[umi2]*2)-1:
                    adj_list[umi1].append(umi2)
                if counts[umi2] >= (counts[umi1]*2)-1:
                    adj_list[umi2].append(umi1)

        return adj_list
Пример #15
0
    def _get_adj_list_directional(self, umis, counts, threshold=1):
        ''' identify all umis within the hamming distance threshold
        and where the counts of the first umi is > (2 * second umi counts)-1'''

        adj_list = {umi: [] for umi in umis}
        if len(umis) > 25:
            umi_length = len(umis[0])
            substr_idx = build_substr_idx(umis, umi_length, threshold)
            iter_umi_pairs = iter_nearest_neighbours(umis, substr_idx)
        else:
            iter_umi_pairs = itertools.combinations(umis, 2)
        for umi1, umi2 in iter_umi_pairs:
            if edit_distance(umi1, umi2) <= threshold:
                if counts[umi1] >= (counts[umi2]*2)-1:
                    adj_list[umi1].append(umi2)
                if counts[umi2] >= (counts[umi1]*2)-1:
                    adj_list[umi2].append(umi1)

        return adj_list