Example 1
def string_disambiguation(value, candidates, name=False):
    def get_label(x):
        return x.split('resource')[1][1:].replace('_', ' ')

    if name:
        best_dist, best_match = (-1, -float('inf')), None
        for entity in candidates:
            label = get_label(entity)
            name_sim = who.ratio(label, value)
            neg_lev_dist = -levenshtein(value.lower(), label.lower())
            dist = (name_sim, neg_lev_dist)
            if dist[1] <= 0 and dist > best_dist:
                best_dist = dist
                best_match = entity
    else:
        best_dist, best_match = 999999, None
        for entity in candidates:
            label = get_label(entity)
            dist = levenshtein(value.lower(), label.lower(), best_dist)
            if 0 <= dist < best_dist:
                best_dist = dist
                best_match = entity

    return best_match
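A minimal usage sketch for the default (name=False) branch, assuming the definition above and that levenshtein comes from polyleven; the candidate URIs are hypothetical DBpedia-style resource identifiers.

from polyleven import levenshtein

candidates = [
    'http://dbpedia.org/resource/New_York_City',
    'http://dbpedia.org/resource/New_York_(state)',
]
# picks the candidate whose label has the smallest edit distance to the value
print(string_disambiguation('New York City', candidates))
# -> http://dbpedia.org/resource/New_York_City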
Example 2
def get_variations(text: str) -> set:
    """Returns all synonyms of a text having an edit-distance of 2 or less."""
    text = text.replace(' ', '_')
    return {
        s.replace('_', ' ')
        for s in words_util.get_synonyms(text) if levenshtein(s, text, 2) <= 2
    }
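A rough sketch of the same bounded-distance filter with the synonym lookup injected, standing in for words_util.get_synonyms (the stub data below is hypothetical); passing 2 as the third argument lets polyleven stop early once the distance exceeds the cutoff.

from polyleven import levenshtein

def get_variations_stub(text: str, synonyms: set) -> set:
    """Same filter as above, but with the synonym source passed in for illustration."""
    text = text.replace(' ', '_')
    return {s.replace('_', ' ') for s in synonyms if levenshtein(s, text, 2) <= 2}

print(get_variations_stub('organisation', {'organization', 'organisations', 'institution'}))
# -> {'organization', 'organisations'}  ('institution' is more than 2 edits away)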
Example 3
def levenshtein_similarity(string_a, string_b):
    '''
    Function returns Levenshtein similarity between two strings.

    Levenshtein distance is defined as the smallest number of edit operations (insertion, deletion, and substitution)
    required to transform one string into another.
    string_a = 'the fast brown fox'
    string_b = 'the slow brown fox'
    Levenshtein distance = 4

    This is converted into a similarity value by using the following formula:
    1 - levenshtein_distance / max(len(string_a), len(string_b))
    levenshtein_similarity = 1 - 4/18 ≈ 0.778 (returned as 77.8 after scaling by 100)

    Parameters:
    string_a : first string (text)
    string_b : second string (text)

    Returns:
    float: levenshtein_similarity * 100
    '''
    # Calculate the Levenshtein distance using polyleven (Myers algorithm)
    levenshtein_distance = levenshtein(string_a, string_b)
    # Calculate the Levenshtein similarity
    levenshtein_sim = (
        1 - levenshtein_distance / max(len(string_a), len(string_b))) * 100
    return levenshtein_sim
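A quick check with the strings from the docstring, assuming the definition above and that levenshtein comes from polyleven:

from polyleven import levenshtein

print(levenshtein_similarity('the fast brown fox', 'the slow brown fox'))
# -> ~77.8 (distance 4 over a maximum length of 18)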
Example 4
    def _add_list_to_graph(self, lst: str, lst_name: str,
                           list_graph: ListGraph) -> str:
        """Add a list as new node to the graph."""
        node_id = self._convert_to_clg_type(lst_name)
        if not self.has_node(node_id):
            # If node_id is not in the graph, then we try synonyms with max. edit-distance of 2
            # e.g. to cover cases where the type is named 'Organisation' and the category 'Organization'
            for name_variation in hypernymy_util.get_variations(lst_name):
                node_id_alternative = clg_util.name2clg_type(name_variation)
                if self.has_node(node_id_alternative):
                    node_id = node_id_alternative
                    break
        node_parts = list_graph.get_parts(lst)

        # check for equivalent mappings and an existing node_id (if they map to more than one node, log it and prefer node_id)
        equivalent_nodes = {
            node
            for eq_cat in list_mapping.get_equivalent_categories(lst)
            for node in self.get_nodes_for_part(eq_cat)
        }
        if self.has_node(node_id):
            equivalent_nodes.add(node_id)
        if len(equivalent_nodes) > 1:
            utils.get_logger().debug(
                f'CaLiGraph: ListMerge - For "{lst}" multiple equivalent nodes have been found: {equivalent_nodes}.'
            )
            if node_id in equivalent_nodes:
                equivalent_nodes = {node_id}
        if equivalent_nodes:
            main_node_id = sorted(equivalent_nodes,
                                  key=lambda x: levenshtein(x, node_id))[0]
            self._set_parts(main_node_id,
                            self.get_parts(main_node_id) | node_parts)
            return main_node_id

        # check for parents to initialise under (parent mapping)
        self._add_nodes({node_id})
        self._set_name(node_id, lst_name)
        self._set_parts(node_id, node_parts)
        parent_nodes = {
            node
            for parent_cat in list_mapping.get_parent_categories(lst)
            for node in self.get_nodes_for_part(parent_cat)
        }
        self._add_edges({(pn, node_id) for pn in parent_nodes})

        return node_id
Example 5
def remove_below_edit_dist(barcode_list, n=3, randomize=True):
    """Greedily filter barcodes so that every kept barcode is at least
    Levenshtein edit-distance n away from all other kept barcodes."""
    too_close = range(n)  # distances 0..n-1 are considered too close
    if randomize:
        random.shuffle(barcode_list)  # shuffle barcodes (to avoid always starting with the same barcode)
    filtered = {barcode_list[0]}  # use the first barcode as 'seed'
    barcode_list = set(barcode_list)  # convert barcodes to a set (for performance reasons)
    for i in barcode_list:
        too_similar = False
        for j in filtered:
            # bounded distance: anything above n - 1 is capped, which is all that matters here
            current_dist = levenshtein(i, j, n - 1)
            if current_dist in too_close:  # i is too close to a barcode accepted so far
                too_similar = True
                break
        if not too_similar:
            filtered.add(i)
    return list(filtered)
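A small usage sketch with made-up barcodes, assuming the definition above plus import random and the polyleven levenshtein; with randomize=False the first barcode always seeds the filter.

import random
from polyleven import levenshtein

barcodes = ['AAAAAA', 'AAATAA', 'TTTTTT', 'TTTGGG']
kept = remove_below_edit_dist(barcodes, n=3, randomize=False)
# 'AAATAA' is only 1 edit away from 'AAAAAA', so it is dropped; the rest are >= 3 apart
print(sorted(kept))  # -> ['AAAAAA', 'TTTGGG', 'TTTTTT']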
Example 6
def query_index(seq, barcode_index, num_mismatches=1):
    """Performs a fuzzy barcode search, exhaustive.

    Parameters
    ----------
    seq : str
        A DNA string.
    barcode_index : dict
        A FastSS index of barcodes.
    num_mismatches : int, optional
        Maximum Levenshtein distance allowed.

    Returns
    -------
    dict
        A dictionary of fuzzy search results. Keys are Levenshtein distances;
        values are lists of matched barcodes.
    """

    res = {d: [] for d in range(num_mismatches + 1)}

    cands = {barcode_index.get(key) for key in indexkeys(seq, num_mismatches)}
    # cands.discard(None)
    # cands = {i for i in cands if i}

    if seq in cands:
        res[0].append(seq)

    else:
        for cand in cands:
            if cand:
                dist = levenshtein(seq, cand, num_mismatches)

                if dist <= num_mismatches:
                    res[dist].append(cand)

    return res
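A self-contained sketch of how this could be exercised; indexkeys is assumed to generate FastSS-style deletion-neighbourhood keys, so a one-deletion stub and a toy index (both hypothetical) stand in for the real implementation here.

from polyleven import levenshtein

def indexkeys(seq, num_mismatches=1):
    # FastSS-style keys for num_mismatches=1: the string itself plus every single-deletion variant
    keys = {seq}
    if num_mismatches >= 1:
        keys.update(seq[:i] + seq[i + 1:] for i in range(len(seq)))
    return keys

barcodes = ['AAAA', 'CCCC']
barcode_index = {key: bc for bc in barcodes for key in indexkeys(bc)}

print(query_index('AAAC', barcode_index))  # -> {0: [], 1: ['AAAA']}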
Example 7
def generate_network(self: Union[Dandelion, pd.DataFrame, str],
                     key: Union[None, str] = None,
                     clone_key: Union[None, str] = None,
                     min_size: int = 2,
                     downsample: Union[None, int] = None,
                     verbose: bool = True,
                     locus: Union[None, Literal['ig', 'tr-ab',
                                                'tr-gd']] = None,
                     **kwargs) -> Dandelion:
    """
    Generates a Levenshtein distance network based on full length VDJ sequence alignments for heavy and light chain(s).
    The distance matrices are then combined into a single matrix.

    Parameters
    ----------
    data : Dandelion, DataFrame, str
        `Dandelion` object, pandas `DataFrame` in changeo/airr format, or file path to changeo/airr file after clones have been determined.
    key : str, optional
        column name for distance calculations. None defaults to 'sequence_alignment_aa'.
    clone_key: str, optional
        column name to build network on.
    min_size : int
        For visualization purposes, two graphs are created: one containing all cells and a trimmed secondary graph. This value specifies the minimum number of edges a node needs in order to be retained in the secondary graph.
    downsample : int, optional
        number of cells to downsample to prior to network construction. If provided, cells are randomly sampled down to this number and a new Dandelion object is returned.
    verbose : bool
        whether or not to print the progress bars.
    locus : str, optional
        Mode of data. Accepts one of 'ig', 'tr-ab' or 'tr-gd'. None defaults to 'ig'.
    **kwargs
        additional kwargs passed to options specified in `networkx.drawing.layout.spring_layout`.

    Returns
    -------
    `Dandelion` object with `.distance`, `.edges`, `.layout`, `.graph` initialized.
    """
    if verbose:
        start = logg.info('Generating network')
    if self.__class__ == Dandelion:
        dat = load_data(self.data)
    else:
        dat = load_data(self)

    if key is None:
        key_ = 'sequence_alignment_aa'  # default
    else:
        key_ = key

    if key_ not in dat:
        raise ValueError("key {} not found in input table.".format(key_))

    if clone_key is None:
        clonekey = 'clone_id'
    else:
        clonekey = clone_key
    if clonekey not in dat:
        raise ValueError(
            'Data does not contain clone information. Please run find_clones.')

    if locus is None:
        locus = 'ig'

    dat = sanitize_data(dat, ignore=clonekey)

    # calculate distance

    if downsample is not None:
        # if downsample >= dat_h.shape[0]:
        if downsample >= self.metadata.shape[0]:
            if verbose:
                print('Cannot downsample to {} cells. Using all {} cells.'.
                      format(str(downsample), self.metadata.shape[0]))
        else:
            if verbose:
                print('Downsampling to {} cells.'.format(str(downsample)))
            dat_h = dat[dat['locus'].isin(['IGH', 'TRB', 'TRD'])].copy()
            dat_l = dat[dat['locus'].isin(['IGK', 'IGL', 'TRA', 'TRG'])].copy()
            dat_h = dat_h.sample(downsample)
            dat_l = dat_l[dat_l['cell_id'].isin(list(dat_h['cell_id']))].copy()
            dat_ = dat_h.append(dat_l)
            dat_ = sanitize_data(dat_, ignore=clonekey)
    else:
        dat_ = dat.copy()

    # So first, create a data frame to hold all possible (full) sequences split by
    # heavy (only 1 possible for now) and light (multiple possible)
    try:
        dat_seq = retrieve_metadata(dat_,
                                    query=key_,
                                    split=True,
                                    collapse=False,
                                    locus=locus,
                                    ignore=clonekey)
    except:
        dat_seq = retrieve_metadata(dat_,
                                    query=key_,
                                    split=True,
                                    collapse=False,
                                    locus=locus,
                                    ignore=clonekey,
                                    **kwargs)
    dat_seq.columns = [re.sub(key_ + '_', '', i) for i in dat_seq.columns]

    # calculate a distance matrix for all vs all and this can be referenced later on to
    # extract the distance between the right pairs
    dmat = Tree()
    sleep(0.5)
    if verbose:
        for x in tqdm(dat_seq.columns, desc='Calculating distances... '):
            tdarray = np.array(np.array(dat_seq[x])).reshape(-1, 1)
            # d_mat = squareform([levenshtein(x[0],y[0]) for x,y in combinations(tdarray, 2)
            # if (x[0] == x[0]) and (y[0] == y[0]) else 0])
            d_mat = squareform(
                pdist(
                    tdarray, lambda x, y: levenshtein(x[0], y[0])
                    if (x[0] == x[0]) and (y[0] == y[0]) else 0))
            dmat[x] = d_mat
    else:
        for x in dat_seq.columns:
            tdarray = np.array(np.array(dat_seq[x])).reshape(-1, 1)
            # d_mat = squareform([levenshtein(x[0],y[0]) for x,y in combinations(tdarray, 2)
            #                      if (x[0] == x[0]) and (y[0] == y[0]) else 0])
            d_mat = squareform(
                pdist(
                    tdarray, lambda x, y: levenshtein(x[0], y[0])
                    if (x[0] == x[0]) and (y[0] == y[0]) else 0))
            dmat[x] = d_mat

    dist_mat_list = [dmat[x] for x in dmat if type(dmat[x]) is np.ndarray]

    total_dist = np.sum(dist_mat_list, axis=0)

    # generate edge list
    if self.__class__ == Dandelion:
        out = self.copy()
        if downsample is not None:
            out = Dandelion(dat_, locus=locus)
    else:  # re-initiate a Dandelion class object
        out = Dandelion(dat_, locus=locus)

    tmp_totaldist = pd.DataFrame(total_dist,
                                 index=dat_seq.index,
                                 columns=dat_seq.index)
    tmp_clusterdist = Tree()
    overlap = []
    for i in out.metadata.index:
        if len(out.metadata.loc[i, str(clonekey)].split('|')) > 1:
            overlap.append(
                [c for c in out.metadata.loc[i, str(clonekey)].split('|')])
            for c in out.metadata.loc[i, str(clonekey)].split('|'):
                tmp_clusterdist[c][i].value = 1
        else:
            cx = out.metadata.loc[i, str(clonekey)]
            tmp_clusterdist[cx][i].value = 1
    tmp_clusterdist2 = {}
    for x in tmp_clusterdist:
        tmp_clusterdist2[x] = list(tmp_clusterdist[x])
    cluster_dist = {}
    for c_ in tmp_clusterdist2:
        if c_ in list(flatten(overlap)):
            for ol in overlap:
                if c_ in ol:
                    idx = list(
                        set(flatten([tmp_clusterdist2[c_x] for c_x in ol])))
                    if len(list(set(idx))) > 1:
                        dist_mat_ = tmp_totaldist.loc[idx, idx]
                        s1, s2 = dist_mat_.shape
                        if s1 > 1 and s2 > 1:
                            cluster_dist['|'.join(ol)] = dist_mat_
        else:
            dist_mat_ = tmp_totaldist.loc[tmp_clusterdist2[c_],
                                          tmp_clusterdist2[c_]]
            s1, s2 = dist_mat_.shape
            if s1 > 1 and s2 > 1:
                cluster_dist[c_] = dist_mat_

    # to improve visualisation and plotting efficiency, build a minimum spanning tree for each group/clone to connect the shortest paths
    mst_tree = mst(cluster_dist)
    sleep(0.5)

    edge_list = Tree()
    if verbose:
        for c in tqdm(mst_tree, desc='Generating edge list '):
            G = nx.from_pandas_adjacency(mst_tree[c])
            edge_list[c] = nx.to_pandas_edgelist(G)
    else:
        for c in mst_tree:
            G = nx.from_pandas_adjacency(mst_tree[c])
            edge_list[c] = nx.to_pandas_edgelist(G)

    sleep(0.5)

    clone_ref = dict(out.metadata[clonekey])
    tmp_clone_tree = Tree()
    for x in out.metadata.index:
        if '|' in clone_ref[x]:
            for x_ in clone_ref[x].split('|'):
                tmp_clone_tree[x_][x].value = 1
        else:
            tmp_clone_tree[clone_ref[x]][x].value = 1
    tmp_clone_tree2 = Tree()
    for x in tmp_clone_tree:
        tmp_clone_tree2[x] = list(tmp_clone_tree[x])

    tmp_clone_tree3 = Tree()
    tmp_clone_tree3_overlap = Tree()
    for x in tmp_clone_tree2:
        # catch all cells that may potentially match up with this merged clone
        if x in list(flatten(overlap)):
            for ol in overlap:
                if x in ol:
                    if len(tmp_clone_tree2[x]) > 1:
                        for x_ in tmp_clone_tree2[x]:
                            tmp_clone_tree3_overlap['|'.join(ol)][''.join(
                                x_)].value = 1
                    else:
                        tmp_clone_tree3_overlap['|'.join(ol)][''.join(
                            tmp_clone_tree2[x])].value = 1
        else:
            tmp_ = pd.DataFrame(index=tmp_clone_tree2[x],
                                columns=tmp_clone_tree2[x])
            tmp_ = pd.DataFrame(np.tril(tmp_) + 1,
                                index=tmp_clone_tree2[x],
                                columns=tmp_clone_tree2[x])
            tmp_.fillna(0, inplace=True)
            tmp_clone_tree3[x] = tmp_

    for x in tmp_clone_tree3_overlap:  # repeat for the overlap clones
        tmp_ = pd.DataFrame(index=tmp_clone_tree3_overlap[x],
                            columns=tmp_clone_tree3_overlap[x])
        tmp_ = pd.DataFrame(np.tril(tmp_) + 1,
                            index=tmp_clone_tree3_overlap[x],
                            columns=tmp_clone_tree3_overlap[x])
        tmp_.fillna(0, inplace=True)
        tmp_clone_tree3[x] = tmp_

    # use a temporary edge list to catch cells that were identified as clones and forcefully link them up if they are identical but were clipped off during the MST step

    # create a dataframe to recall the actual distance quickly
    tmp_totaldiststack = pd.DataFrame(tmp_totaldist.unstack())
    tmp_totaldiststack.index.names = [None, None]
    tmp_totaldiststack = tmp_totaldiststack.reset_index(drop=False)
    tmp_totaldiststack.columns = ['source', 'target', 'weight']
    tmp_totaldiststack.index = [
        str(s) + '|' + str(t) for s, t in zip(tmp_totaldiststack['source'],
                                              tmp_totaldiststack['target'])
    ]
    tmp_totaldiststack['keep'] = [
        False if len(list(set(i.split('|')))) == 1 else True
        for i in tmp_totaldiststack.index
    ]
    tmp_totaldiststack = tmp_totaldiststack[tmp_totaldiststack.keep].drop(
        'keep', axis=1)

    tmp_edge_list = Tree()
    if verbose:
        for c in tqdm(tmp_clone_tree3, desc='Linking edges '):
            if len(tmp_clone_tree3[c]) > 1:
                G = nx.from_pandas_adjacency(tmp_clone_tree3[c])
                tmp_edge_list[c] = nx.to_pandas_edgelist(G)
                tmp_edge_list[c].index = [
                    str(s) + '|' + str(t) for s, t in zip(
                        tmp_edge_list[c]['source'], tmp_edge_list[c]['target'])
                ]
                tmp_edge_list[c]['weight'].update(tmp_totaldiststack['weight'])
                # keep only edges when there is 100% identity, to minimise crowding
                tmp_edge_list[c] = tmp_edge_list[c][tmp_edge_list[c]['weight']
                                                    == 0]
                tmp_edge_list[c].reset_index(inplace=True)
    else:
        for c in tmp_clone_tree3:
            if len(tmp_clone_tree3[c]) > 1:
                G = nx.from_pandas_adjacency(tmp_clone_tree3[c])
                tmp_edge_list[c] = nx.to_pandas_edgelist(G)
                tmp_edge_list[c].index = [
                    str(s) + '|' + str(t) for s, t in zip(
                        tmp_edge_list[c]['source'], tmp_edge_list[c]['target'])
                ]
                tmp_edge_list[c]['weight'].update(tmp_totaldiststack['weight'])
                # keep only edges when there is 100% identity, to minimise crowding
                tmp_edge_list[c] = tmp_edge_list[c][tmp_edge_list[c]['weight']
                                                    == 0]
                tmp_edge_list[c].reset_index(inplace=True)

    # try to catch situations where there's no edge (only singletons)
    try:
        edge_listx = pd.concat([edge_list[x] for x in edge_list])
        edge_listx.index = [
            str(s) + '|' + str(t)
            for s, t in zip(edge_listx['source'], edge_listx['target'])
        ]

        tmp_edge_listx = pd.concat([tmp_edge_list[x] for x in tmp_edge_list])
        tmp_edge_listx.drop('index', axis=1, inplace=True)
        tmp_edge_listx.index = [
            str(s) + '|' + str(t)
            for s, t in zip(tmp_edge_listx['source'], tmp_edge_listx['target'])
        ]

        edge_list_final = edge_listx.combine_first(tmp_edge_listx)
        edge_list_final['weight'].update(tmp_totaldiststack['weight'])
        # return the edge list
        edge_list_final.reset_index(drop=True, inplace=True)
    except:
        edge_list_final = None

    # and finally the vertex list which is super easy
    vertice_list = list(out.metadata.index)
    sleep(0.5)
    # and now to actually generate the network
    g, g_, lyt, lyt_ = generate_layout(vertice_list,
                                       edge_list_final,
                                       min_size=min_size,
                                       weight=None,
                                       verbose=verbose,
                                       **kwargs)

    # convert distance matrices to sparse
    for x in dmat:
        if type(dmat[x]) is np.ndarray:
            dmat[x] = csr_matrix(dmat[x])

    if verbose:
        logg.info(
            ' finished',
            time=start,
            deep=('Updated Dandelion object: \n'
                  '   \'data\', contig-indexed clone table\n'
                  '   \'metadata\', cell-indexed clone table\n'
                  '   \'distance\', heavy and light chain distance matrices\n'
                  '   \'edges\', network edges\n'
                  '   \'layout\', network layout\n'
                  '   \'graph\', network'))
    if self.__class__ == Dandelion:
        if self.germline is not None:
            germline_ = self.germline
        else:
            germline_ = None
        if self.threshold is not None:
            threshold_ = self.threshold
        else:
            threshold_ = None
        if downsample is not None:
            # out = Dandelion(data = dat_downsample, metadata = downsample_meta, distance = dmat, edges = edge_list_final, layout = (lyt, lyt_), graph = (g, g_), germline = germline_)
            out = Dandelion(data=dat_,
                            distance=dmat,
                            edges=edge_list_final,
                            layout=(lyt, lyt_),
                            graph=(g, g_),
                            germline=germline_,
                            locus=locus)
            out.threshold = threshold_
            return (out)
        else:
            self.__init__(data=self.data,
                          metadata=self.metadata,
                          distance=dmat,
                          edges=edge_list_final,
                          layout=(lyt, lyt_),
                          graph=(g, g_),
                          germline=germline_,
                          locus=locus,
                          initialize=False)
            self.threshold = threshold_
    else:
        # out = Dandelion(data = dat, distance = dmat, edges = edge_list_final, layout = (lyt, lyt_), graph = (g, g_), clone_key = clone_key)
        out = Dandelion(data=dat_,
                        distance=dmat,
                        edges=edge_list_final,
                        layout=(lyt, lyt_),
                        graph=(g, g_),
                        clone_key=clone_key,
                        locus=locus)
        return (out)
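A rough usage sketch through the dandelion package, assuming the public entry points read_10x_airr, tl.find_clones and tl.generate_network (the file path is a placeholder):

import dandelion as ddl

vdj = ddl.read_10x_airr('filtered_contig_airr.tsv')  # placeholder path to an AIRR rearrangement table
ddl.tl.find_clones(vdj)  # populates the 'clone_id' column required by generate_network
ddl.tl.generate_network(vdj, min_size=3)  # updates vdj in place with .distance, .edges, .layout, .graph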
Example 8
def resolve_spelling_redirect(dbp_resource: str) -> str:
    redirect = resolve_redirect(dbp_resource)
    if levenshtein(dbp_resource, redirect, 2) > 2:
        # return original resource if the redirect links to a completely different resource
        return dbp_resource
    return redirect
Example 9
 def test_special(self):
     s = (chr(127) + chr(255)) * 33
     self.assertEqual(0, levenshtein(s, s))
Example 10
 def test_unicode_with_k(self):
     for k in (0, 1, 2, 3):
         for (dist, s1, s2) in TEST_UNICODE:
             with self.subTest(k=k, s1=s1, s2=s2):
                 self.assertEqual(min(dist, k + 1), levenshtein(s1, s2, k))
Example 11
 def test_unicode(self):
     for (dist, s1, s2) in TEST_UNICODE:
         with self.subTest(s1=s1, s2=s2):
             self.assertEqual(dist, levenshtein(s1, s2))
Example 12
 def test_long(self):
     for (dist, s1, s2) in TEST_LONG:
         with self.subTest(s1=s1, s2=s2):
             self.assertEqual(dist, levenshtein(s1, s2))
Example 13
 def test_ascii_with_k(self):
     for k in (0, 1, 2, 3):
         for (dist, s1, s2) in TEST_ASCII:
             with self.subTest(k=k, s1=s1, s2=s2):
                 self.assertEqual(min(dist, k + 1), levenshtein(s1, s2, k))
Example 14
 def test_ascii(self):
     for (dist, s1, s2) in TEST_ASCII:
         with self.subTest(s1=s1, s2=s2):
             self.assertEqual(dist, levenshtein(s1, s2))
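The tests above encode polyleven's bounded behaviour: with a third argument k, the return value is clamped to k + 1 whenever the true distance exceeds k. A minimal illustration:

from polyleven import levenshtein

print(levenshtein('kitten', 'sitting'))     # 3, the exact distance
print(levenshtein('kitten', 'sitting', 1))  # 2, i.e. k + 1, because the distance exceeds k=1
print(levenshtein('kitten', 'sitting', 3))  # 3, within the bound, so the exact value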