Exemplo n.º 1
0
def clustering(matrix, threshold, cluster_method='infomap'):
    """
    Cluster the items of a pairwise matrix with the selected backend.

    Parameters
    ----------
    matrix : sequence of sequences
        Square pairwise matrix handed to the clustering backend.
    threshold : float
        Cut-off value, passed through unchanged to the backend.
    cluster_method : str or callable (default='infomap')
        Either a callable, invoked as ``cluster_method(threshold, matrix)``,
        or one of the names 'infomap', 'labelprop', 'ebet', 'multilevel',
        'spinglass' (dispatched to ``igraph_clustering``), 'mcl'
        (dispatched to ``mcl``), or 'upgma', 'single', 'complete', 'ward'
        (dispatched to ``flat_cluster``).

    Returns
    -------
    object
        Whatever the selected backend returns; it is not post-processed.

    Raises
    ------
    ValueError
        If `cluster_method` is neither callable nor a known method name.
    """
    if callable(cluster_method):
        # The callable *is* the clustering routine. The previous code called
        # the undefined name ``external_function`` here and raised NameError.
        return cluster_method(threshold, matrix)

    if cluster_method in ('infomap', 'labelprop', 'ebet', 'multilevel',
                          'spinglass'):
        return igraph_clustering(matrix, threshold, method=cluster_method)

    if cluster_method == 'mcl':
        return mcl(
            threshold,
            matrix,
            taxa=list(range(len(matrix))),
            revert=True,
            # Fixed MCL parameters used by this helper.
            max_steps=1000,
            inflation=2,
            expansion=2,
            add_self_loops=True,
            logs=lambda x: -np.log2((1 - x)**2))

    if cluster_method in ('upgma', 'single', 'complete', 'ward'):
        # Hand over a plain list-of-lists copy of the matrix.
        return flat_cluster(cluster_method,
                            threshold, [list(row) for row in matrix],
                            revert=True)

    raise ValueError(
        "No clustering method named {:}".format(cluster_method))
Exemplo n.º 2
0
    def partial_cluster(
            self,
            method='sca',
            threshold=0.45,
            scale=0.5,
            factor=0.3,
            restricted_chars='_T',
            mode='overlap',
            cluster_method='infomap',
            gop=-1,
            restriction='',
            ref='',
            external_function=None,
            split_on_tones=True,
            **keywords):
        """
        Cluster the words into partial cognate sets.

        For every ``(concept, trace, matrix)`` triple produced by
        ``self._get_partial_matrices`` the matrix is clustered, and the
        resulting cluster ids are collected per word index and written with
        ``self.add_entries``.

        Parameters
        ----------
        method : {'sca','lexstat','edit-dist','turchin'} (default='sca')
            Select the method that shall be used for the calculation.
        threshold : float (default=0.45)
            Select the threshold for the cluster approach. It is passed
            unchanged to the clustering routine.
        scale : float (default=0.5)
            Select the scale for the gap extension penalty.
        factor : float (default=0.3)
            Select the factor for extra scores for identical prosodic segments.
        restricted_chars : str (default="_T")
            Select the restricted chars (boundary markers) in the prosodic
            strings in order to enable secondary alignment.
        mode : {'global','local','overlap','dialign'} (default='overlap')
            Select the mode for the alignment analysis.
        cluster_method : {'infomap','mcl','upgma','single','complete','ward'} (default='infomap')
            Select the cluster method. 'upgma' (:evobib:`Sokal1958`) refers to
            average linkage clustering, 'mcl' refers to the "Markov Clustering
            Algorithm" (:evobib:`Dongen2000`). Ignored when
            `external_function` is given.
        gop : int (default=-1)
            If 'sca' is selected as a method, define the gap opening penalty.
        restriction : str (default="")
            Accepted for interface compatibility; this method does not read
            it.
        ref : str (default="")
            Name of the column the cluster ids are written to. When empty,
            ``self._partials`` is used.
        external_function : callable (default=None)
            If given, it is called instead of the built-in methods as
            ``external_function(threshold, matrix, taxa=..., revert=True)``.
        split_on_tones : bool (default=True)
            Forwarded to ``self._get_partial_matrices``.
        **keywords
            Overrides for the internal defaults: ``imap_mode`` (True),
            ``post_processing`` (False), ``inflation`` (2), ``expansion`` (2),
            ``max_steps`` (1000), ``add_self_loops`` (True) and ``mcl_logs``.
            ``inflation``, ``expansion``, ``max_steps``, ``add_self_loops``
            and ``mcl_logs`` are only passed on when `cluster_method` is
            'mcl'.

        Raises
        ------
        ValueError
            If no `external_function` is given and `cluster_method` is not
            one of the supported names.

        Notes
        -----
        Besides writing the cluster ids, the method sets
        ``self.params['partial_cluster']``, appends to ``self._stamp`` and
        stores the per-concept graphs (only built when ``post_processing`` is
        enabled) in ``self.graphs``.
        """
        kw = dict(
            imap_mode=True,
            post_processing=False,
            inflation=2,
            expansion=2,
            max_steps=1000,
            add_self_loops=True,
            sep=lingpy.settings.rcParams['morpheme_separator'],
            word_sep=lingpy.settings.rcParams['word_separator'],
            word_seps=lingpy.settings.rcParams['word_separators'],
            seps=lingpy.settings.rcParams['morpheme_separators'],
            mcl_logs=lambda x: -np.log2((1 - x) ** 2)
        )
        kw.update(keywords)

        # check for parameters and add clustering, in order to make sure that
        # analyses are not repeated
        if not hasattr(self, 'params'):
            self.params = {}
        self.params['partial_cluster'] = "{0}_{1}_{2:.2f}".format(
            method, cluster_method, threshold)
        self._stamp += '# Partial Cluster: ' + self.params['partial_cluster']

        matrices = self._get_partial_matrices(
            method=method, scale=scale, factor=factor,
            restricted_chars=restricted_chars, mode=mode, gop=gop,
            imap_mode=kw['imap_mode'], split_on_tones=split_on_tones)
        k = 0  # id offset: keeps the ids of different concepts disjoint
        C = defaultdict(list)  # stores the pcogids
        G = {}  # stores the graphs
        with pb(desc='PARTIAL SEQUENCE CLUSTERING', total=len(self.rows)) as progress:
            for concept, trace, matrix in matrices:
                progress.update(1)
                lingpy.log.info('Analyzing concept {0}...'.format(concept))
                if external_function:
                    c = external_function(
                        threshold, matrix,
                        taxa=list(range(len(matrix))), revert=True)
                elif cluster_method == 'infomap':
                    c = extra.infomap_clustering(
                        threshold, matrix,
                        taxa=list(range(len(matrix))), revert=True)
                elif cluster_method == 'mcl':
                    c = clustering.mcl(
                        threshold, matrix,
                        taxa=list(range(len(matrix))),
                        max_steps=kw['max_steps'],
                        inflation=kw['inflation'],
                        expansion=kw['expansion'],
                        add_self_loops=kw['add_self_loops'],
                        logs=kw['mcl_logs'],
                        revert=True)
                elif cluster_method in ['upgma', 'single', 'complete', 'ward']:
                    c = clustering.flat_cluster(
                        cluster_method, threshold, matrix, revert=True)
                else:
                    raise ValueError("No suitable cluster method specified.")

                # Record the (offset) cluster id of every traced item under
                # its word index, in trace order.
                for i, (idx, pos, slc) in enumerate(trace):
                    C[idx].append(c[i] + k)

                # Number of ids handed out by the post-processing step below;
                # stays 0 when post-processing is disabled.
                n_components = 0
                if kw['post_processing']:
                    _g = nx.Graph()
                    for i, (idx, pos, slc) in enumerate(trace):
                        _g.add_node((i, idx, pos))
                    remove_edges = []
                    for (i, n1), (j, n2) in combinations2(enumerate(_g.nodes())):
                        if C[n1[1]][n1[2]] == C[n2[1]][n2[2]]:
                            _g.add_edge(n1, n2)
                            if n1[1] == n2[1]:
                                # get scores for n1 and n2 with all the rest in
                                # the matrix to decide for one
                                sn1 = sum(row[n1[0]] for row in matrix) / len(matrix)
                                sn2 = sum(row[n2[0]] for row in matrix) / len(matrix)
                                if sn1 <= sn2:
                                    remove_edges += [n2]
                                else:
                                    remove_edges += [n1]
                    # Isolate the losing node of each same-word pair.
                    for node in remove_edges:
                        for edge in sorted(_g[node]):
                            _g.remove_edge(node, edge)

                    # Renumber: every connected component becomes one id.
                    for i, coms in enumerate(nx.connected_components(_g)):
                        cogid = i + 1 + k
                        for j, idx, pos in coms:
                            C[idx][pos] = cogid
                        n_components = i + 1

                    G[concept] = _g

                # Advance the offset past every id used for this concept.
                # Post-processing can isolate nodes and so create more
                # components than the clustering produced ids; advancing by
                # max(c.values()) alone would then let the next concept reuse
                # ids that are already taken.
                k += max(max(c.values()), n_components)
        self.add_entries(ref or self._partials, C, lambda x: x)
        self.graphs = G
Exemplo n.º 3
0
    def partial_cluster(self,
                        method='sca',
                        threshold=0.45,
                        scale=0.5,
                        factor=0.3,
                        restricted_chars='_T',
                        mode='overlap',
                        cluster_method='infomap',
                        gop=-1,
                        restriction='',
                        ref='',
                        external_function=None,
                        split_on_tones=False,
                        **keywords):
        """
        Cluster the words into partial cognate sets.

        For every ``(concept, trace, matrix)`` triple produced by
        ``self._get_partial_matrices`` the matrix is clustered, and the
        resulting cluster ids are collected per word index and written with
        ``self.add_entries``.

        Parameters
        ----------
        method : {'sca','lexstat','edit-dist','turchin'} (default='sca')
            Select the method that shall be used for the calculation.
        threshold : float (default=0.45)
            Select the threshold for the cluster approach. It is passed
            unchanged to the clustering routine.
        scale : float (default=0.5)
            Select the scale for the gap extension penalty.
        factor : float (default=0.3)
            Select the factor for extra scores for identical prosodic segments.
        restricted_chars : str (default="_T")
            Select the restricted chars (boundary markers) in the prosodic
            strings in order to enable secondary alignment.
        mode : {'global','local','overlap','dialign'} (default='overlap')
            Select the mode for the alignment analysis.
        cluster_method : {'infomap','mcl','upgma','single','complete','ward'} (default='infomap')
            Select the cluster method. 'upgma' (:evobib:`Sokal1958`) refers to
            average linkage clustering, 'mcl' refers to the "Markov Clustering
            Algorithm" (:evobib:`Dongen2000`). Ignored when
            `external_function` is given.
        gop : int (default=-1)
            If 'sca' is selected as a method, define the gap opening penalty.
        restriction : str (default="")
            Accepted for interface compatibility; this method does not read
            it.
        ref : str (default="")
            Name of the column the cluster ids are written to. When empty,
            ``self._partials`` is used.
        external_function : callable (default=None)
            If given, it is called instead of the built-in methods as
            ``external_function(threshold, matrix, taxa=..., revert=True)``.
        split_on_tones : bool (default=False)
            Forwarded to ``self._get_partial_matrices``.
        **keywords
            Overrides for the internal defaults: ``imap_mode`` (True),
            ``post_processing`` (True), ``inflation`` (2), ``expansion`` (2),
            ``max_steps`` (1000), ``add_self_loops`` (True) and ``mcl_logs``.
            ``inflation``, ``expansion``, ``max_steps``, ``add_self_loops``
            and ``mcl_logs`` are only passed on when `cluster_method` is
            'mcl'.

        Raises
        ------
        ValueError
            If no `external_function` is given and `cluster_method` is not
            one of the supported names.

        Notes
        -----
        Besides writing the cluster ids, the method sets
        ``self.params['partial_cluster']``, appends to ``self._stamp`` and
        stores the per-concept graphs (only built when ``post_processing`` is
        enabled) in ``self.graphs``.
        """
        kw = dict(imap_mode=True,
                  post_processing=True,
                  inflation=2,
                  expansion=2,
                  max_steps=1000,
                  add_self_loops=True,
                  sep=lingpy.settings.rcParams['morpheme_separator'],
                  word_sep=lingpy.settings.rcParams['word_separator'],
                  word_seps=lingpy.settings.rcParams['word_separators'],
                  seps=lingpy.settings.rcParams['morpheme_separators'],
                  mcl_logs=lambda x: -np.log2((1 - x)**2))
        # Caller-supplied keywords override the defaults above.
        kw.update(keywords)

        # check for parameters and add clustering, in order to make sure that
        # analyses are not repeated
        if not hasattr(self, 'params'):
            self.params = {}
        self.params['partial_cluster'] = "{0}_{1}_{2:.2f}".format(
            method, cluster_method, threshold)
        self._stamp += '# Partial Cluster: ' + self.params['partial_cluster']

        matrices = self._get_partial_matrices(
            method=method,
            scale=scale,
            factor=factor,
            restricted_chars=restricted_chars,
            mode=mode,
            gop=gop,
            imap_mode=kw['imap_mode'],
            split_on_tones=split_on_tones)
        # k is the running id offset added to every cluster id so that ids
        # from different concepts do not overlap.
        k = 0
        C = defaultdict(list)  # stores the pcogids
        G = {}  # stores the graphs
        with util.pb(desc='PARTIAL SEQUENCE CLUSTERING',
                     total=len(self.rows)) as progress:
            for concept, trace, matrix in matrices:
                progress.update(1)
                lingpy.log.info('Analyzing concept {0}...'.format(concept))
                # A truthy external_function takes precedence over
                # cluster_method.
                if external_function:
                    c = external_function(threshold,
                                          matrix,
                                          taxa=list(range(len(matrix))),
                                          revert=True)
                elif cluster_method == 'infomap':
                    c = extra.infomap_clustering(threshold,
                                                 matrix,
                                                 taxa=list(range(len(matrix))),
                                                 revert=True)
                elif cluster_method == 'mcl':
                    c = clustering.mcl(threshold,
                                       matrix,
                                       taxa=list(range(len(matrix))),
                                       max_steps=kw['max_steps'],
                                       inflation=kw['inflation'],
                                       expansion=kw['expansion'],
                                       add_self_loops=kw['add_self_loops'],
                                       logs=kw['mcl_logs'],
                                       revert=True)
                elif cluster_method in ['upgma', 'single', 'complete', 'ward']:
                    c = clustering.flat_cluster(cluster_method,
                                                threshold,
                                                matrix,
                                                revert=True)
                else:
                    raise ValueError("No suitable cluster method specified.")

                # Record the (offset) cluster id of every traced item under
                # its word index, in trace order.
                for i, (idx, pos, slc) in enumerate(trace):
                    C[idx] += [c[i] + k]
                if kw['post_processing']:
                    # One node per traced item: (matrix index, word index,
                    # position); edges connect items sharing a cluster id.
                    _g = nx.Graph()
                    for i, (idx, pos, slc) in enumerate(trace):
                        _g.add_node((i, idx, pos))
                    # Despite its name, this list collects *nodes* whose
                    # edges are all removed below.
                    remove_edges = []
                    for (i, n1), (j, n2) in util.combinations2(
                            enumerate(_g.nodes())):
                        if C[n1[1]][n1[2]] == C[n2[1]][n2[2]]:
                            _g.add_edge(n1, n2)
                            if n1[1] == n2[1]:
                                # get scores for n1 and n2 with all the rest in
                                # the matrix to decide for one
                                sn1, sn2 = 0, 0
                                # NOTE: this inner loop rebinds ``i``; the
                                # outer ``i``/``j`` are not used afterwards.
                                for i, row in enumerate(matrix):
                                    sn1 += matrix[i][n1[0]]
                                    sn2 += matrix[i][n2[0]]
                                sn1 = sn1 / len(matrix)
                                sn2 = sn2 / len(matrix)
                                # The node with the larger mean matrix value
                                # is isolated (ties isolate n2).
                                if sn1 <= sn2:
                                    remove_edges += [n2]
                                else:
                                    remove_edges += [n1]
                    for node in remove_edges:
                        for edge in sorted(_g[node]):
                            _g.remove_edge(node, edge)

                    # Renumber: every connected component becomes one id,
                    # overwriting the ids stored above.
                    for i, coms in enumerate(nx.connected_components(_g)):
                        cogid = i + 1 + k
                        for j, idx, pos in coms:
                            C[idx][pos] = cogid

                    G[concept] = _g

                # Advance the offset for the next concept. Presumably
                # len(matrix) + 1 bounds the ids used here (at most one
                # component per matrix row) -- TODO confirm that trace and
                # matrix always have the same length.
                k += len(matrix) + 1
        self.add_entries(ref or self._partials, C, lambda x: x)
        self.graphs = G
# Pairwise Euclidean distances between all embeddings.
dists = np.zeros((len(embeddings), len(embeddings)))
for u, emb_u in enumerate(embeddings):
    print(u, "/", len(embeddings))
    for v, emb_v in enumerate(embeddings):
        print(u, "/", len(embeddings), " - ", v, "/", len(embeddings))

        # Compare the embedding vectors themselves. The previous code passed
        # the loop indices (u, v), which made every entry just |u - v|.
        dists[u, v] = euclidean(emb_u, emb_v)

# Loop-invariant setup, hoisted out of the threshold sweep.
from pairwise_evaluation import PairwiseEvaluation
n_cognate_classes = len(set(cognate_classes))
n_concepts = len(set(global_ids))
y_true = cognate_classes

# Sweep the UPGMA threshold and report the adjusted Rand score for each value.
for threshold in [
        0.9, 0.8500000000000001, 0.8, 0.75, 0.7000000000000001, 0.65,
        0.6000000000000001, 0.55, 0.5, 0.45, 0.4, 0.35000000000000003,
        0.30000000000000004, 0.25, 0.2, 0.15000000000000002, 0.1, 0.05
]:

    cluster2ids = flat_cluster("upgma", threshold=threshold, matrix=dists)
    # NOTE(review): this indexes y_pred with the labels returned by
    # flat_cluster; confirm they are 0-based positions when no taxa are given.
    y_pred = np.zeros(len(X))
    for cluster in cluster2ids:
        y_pred[cluster2ids[cluster]] = cluster
    #y_pred = ap.fit_predict(embeddings)
    # Random baseline and pairwise scores are computed but not reported here.
    y_random = np.random.randint(0, int(n_cognate_classes / n_concepts),
                                 y_pred.shape)
    pe = PairwiseEvaluation(X, y_true, y_pred)
    precision, recall, f1 = pe.getPrecisionRecallF1()
    print("--------------------------")
    print(threshold)
    print(metrics.adjusted_rand_score(y_true, y_pred))
Exemplo n.º 5
0
 def test_flat_cluster(self):
     """Smoke-test ``flat_cluster`` for every linkage method.

     Each method is run three times on ``self.matrix``: with taxa and
     ``revert=True``, with taxa and ``revert=False``, and with ``False``
     in place of the taxa and ``revert=False``. Only the absence of
     exceptions is checked.
     """
     # (taxa argument, revert flag), in the order the calls are made.
     call_variants = (
         (self.taxa, True),
         (self.taxa, False),
         (False, False),
     )
     for linkage in ('upgma', 'single', 'complete', 'ward'):
         for taxa_arg, revert_flag in call_variants:
             flat_cluster(linkage, 0.5, self.matrix, taxa_arg,
                          revert=revert_flag)
# Group embeddings and cognate classes by the concept id at the same position.
# (set() already de-duplicates; sorting before building the set had no effect.)
concepts2embeddings = {
    concept: [emb for i, emb in enumerate(embeddings)
              if global_ids[i] == concept]
    for concept in set(global_ids)
}
concepts2cognate_classes = {
    concept: [cog for i, cog in enumerate(cognate_classes)
              if global_ids[i] == concept]
    for concept in set(global_ids)
}
#for damping_factor in np.arange(0.5,1,0.05):
from lingpy.algorithm.clustering import flat_cluster

# Pairwise Euclidean distances between all embeddings.
dists = np.zeros((len(embeddings), len(embeddings)))
for u, emb_u in enumerate(embeddings):
    print(u, "/", len(embeddings))
    for v, emb_v in enumerate(embeddings):
        print(u, "/", len(embeddings), " - ", v, "/", len(embeddings))

        # Compare the embedding vectors themselves. The previous code passed
        # the loop indices (u, v), which made every entry just |u - v|.
        dists[u, v] = euclidean(emb_u, emb_v)

# Loop-invariant setup, hoisted out of the threshold sweep.
from pairwise_evaluation import PairwiseEvaluation
n_cognate_classes = len(set(cognate_classes))
n_concepts = len(set(global_ids))
y_true = cognate_classes

# Sweep the UPGMA threshold and report clustering agreement for each value.
for threshold in [0.9, 0.8500000000000001, 0.8, 0.75, 0.7000000000000001,
                  0.65, 0.6000000000000001, 0.55, 0.5, 0.45, 0.4,
                  0.35000000000000003, 0.30000000000000004, 0.25, 0.2,
                  0.15000000000000002, 0.1, 0.05]:

    cluster2ids = flat_cluster("upgma", threshold=threshold, matrix=dists)
    # NOTE(review): this indexes y_pred with the labels returned by
    # flat_cluster; confirm they are 0-based positions when no taxa are given.
    y_pred = np.zeros(len(X))
    for cluster in cluster2ids:
        y_pred[cluster2ids[cluster]] = cluster
    #y_pred = ap.fit_predict(embeddings)
    # Random baseline and pairwise scores are computed but not reported here.
    y_random = np.random.randint(0, int(n_cognate_classes / n_concepts),
                                 y_pred.shape)
    pe = PairwiseEvaluation(X, y_true, y_pred)
    precision, recall, f1 = pe.getPrecisionRecallF1()
    print("--------------------------")
    print(threshold)
    print(metrics.adjusted_rand_score(y_true, y_pred))
    print(metrics.adjusted_mutual_info_score(y_true, y_pred))
Exemplo n.º 7
0
 def test_flat_cluster(self):
     """Run ``flat_cluster`` once per linkage method and argument variant.

     For each of the four methods the function is called with taxa and
     ``revert=True``, with taxa and ``revert=False``, and with ``False``
     as taxa and ``revert=False``. The test passes if no call raises.
     """
     from lingpy.algorithm.clustering import flat_cluster

     linkages = ('upgma', 'single', 'complete', 'ward')
     for linkage in linkages:
         # Same three calls, in the same order, for every linkage.
         for taxa_arg, revert_flag in ((self.taxa, True),
                                       (self.taxa, False),
                                       (False, False)):
             flat_cluster(linkage, 0.5, self.matrix, taxa_arg,
                          revert=revert_flag)
Exemplo n.º 8
0
 def test_flat_cluster(self):
     """Check that ``flat_cluster`` runs for all methods without raising.

     Every linkage method is exercised with taxa (reverted and not
     reverted) and with ``False`` passed instead of the taxa.
     """
     from lingpy.algorithm.clustering import flat_cluster

     def exercise(linkage, taxa_arg, revert_flag):
         # Single call with the fixed threshold used throughout the test.
         flat_cluster(linkage, 0.5, self.matrix, taxa_arg,
                      revert=revert_flag)

     for linkage in ['upgma', 'single', 'complete', 'ward']:
         exercise(linkage, self.taxa, True)
         exercise(linkage, self.taxa, False)
         exercise(linkage, False, False)