Пример #1
0
 def test_dotjoin(self):
     """Exercise util.dotjoin with varargs, a list, a tuple and generators."""
     expected = '1.2'

     # positional arguments and plain sequences are expected to join alike
     self.assertEqual(util.dotjoin(1, 2), expected)
     self.assertEqual(util.dotjoin([1, 2]), expected)
     self.assertEqual(util.dotjoin((1, 2)), expected)

     # with a condition, only the value passing it is expected in the result
     def above_one(j):
         return j > 1

     filtered = util.dotjoin((i for i in range(1, 3)), condition=above_one)
     self.assertEqual(filtered, '2')

     # a bare generator expression as the sole argument
     self.assertEqual(util.dotjoin(i for i in range(1, 3)), expected)
Пример #2
0
 def test_dotjoin(self):
     """Check util.dotjoin for varargs, sequence and generator input."""
     # each tuple holds the positional arguments of one call
     plain_calls = ((1, 2), ([1, 2],), ((1, 2),))
     for call_args in plain_calls:
         self.assertEqual(util.dotjoin(*call_args), '1.2')

     # generator input combined with a filtering condition
     numbers = (i for i in range(1, 3))
     only_above_one = util.dotjoin(numbers, condition=lambda j: j > 1)
     self.assertEqual(only_above_one, '2')

     # generator expression passed as the only argument
     unfiltered = util.dotjoin(i for i in range(1, 3))
     self.assertEqual(unfiltered, '1.2')
Пример #3
0
def _make_graph(colexifications, bipartite=False):
    """
    Return a graph-object from colexification data.

    Parameters
    ----------
    colexifications : iterable
        Records of five items ``(c1, c2, t, f, entry)``. ``c1`` and ``c2``
        become concept nodes; ``t``, ``f`` and ``entry`` are stored as
        doculect, family and word information respectively.
    bipartite : bool (default=False)
        If False, build a concept graph in which every record contributes
        to the edge between ``c1`` and ``c2``. If True, every record gets
        its own word node which is linked to both of its concepts.

    Returns
    -------
    G : graph created with ``nx.Graph()``
        Nodes carry an ``ntype`` attribute ('concept' or 'word'). In the
        non-bipartite case each edge stores the raw lists ``families`` and
        ``words``, the sorted unique ``doculects`` and ``family`` lists, and
        the counts ``familyWeight``, ``wordWeight`` and ``doculectWeight``.
        In the bipartite case each edge stores a ``weight``.
    """
    G = nx.Graph()

    if not bipartite:
        for c1, c2, t, f, entry in colexifications:
            try:
                # G[c1][c2] is the attribute dict of the edge and raises
                # KeyError as long as the edge is missing. The ``G.edge``
                # attribute used before no longer exists in networkx >= 2.0;
                # combined with a bare ``except`` that silently reset the
                # edge for every record instead of accumulating.
                data = G[c1][c2]
                data['families'].append(f)
                data['doculects'].append(t)
                data['words'].append(entry)
            except KeyError:
                G.add_node(c1, ntype='concept')
                G.add_node(c2, ntype='concept')
                G.add_edge(c1, c2, families=[f], doculects=[t], words=[entry])

        # derive the weights once all records have been collected
        for _, _, d in G.edges(data=True):
            unique_families = sorted(set(d['families']))
            unique_doculects = sorted(set(d['doculects']))
            d['familyWeight'] = len(unique_families)
            d['wordWeight'] = len(d['words'])
            d['doculectWeight'] = len(unique_doculects)
            # 'families' keeps the raw list; 'family' holds the unique values
            d['family'] = unique_families
            d['doculects'] = unique_doculects
    else:
        for idx, (c1, c2, t, f, entry) in enumerate(colexifications):
            # node name of the word: doculect joined with a 1-based counter
            nindex = dotjoin(t, idx + 1)
            try:
                G[nindex][c1]['weight'] += 1
                G[nindex][c2]['weight'] += 1
            except KeyError:
                G.add_node(nindex, ntype='word', entry=entry, doculect=t, family=f)
                G.add_node(c1, ntype='concept')
                G.add_node(c2, ntype='concept')
                G.add_edge(nindex, c1, weight=1)
                G.add_edge(nindex, c2, weight=1)
    return G
Пример #4
0
def _make_graph(colexifications, bipartite=False):
    """
    Return a graph-object from colexification data.

    Parameters
    ----------
    colexifications : iterable
        Records of five items ``(c1, c2, t, f, entry)``. ``c1`` and ``c2``
        become concept nodes; ``t``, ``f`` and ``entry`` are stored as
        doculect, family and word information respectively.
    bipartite : bool (default=False)
        If False, build a concept graph in which every record contributes
        to the edge between ``c1`` and ``c2``. If True, every record gets
        its own word node which is linked to both of its concepts.

    Returns
    -------
    G : graph created with ``nx.Graph()``
        Nodes carry an ``ntype`` attribute ('concept' or 'word'). In the
        non-bipartite case each edge stores the raw lists ``families`` and
        ``words``, the sorted unique ``doculects`` and ``family`` lists, and
        the counts ``familyWeight``, ``wordWeight`` and ``doculectWeight``.
        In the bipartite case each edge stores a ``weight``.
    """
    G = nx.Graph()

    if not bipartite:
        for c1, c2, t, f, entry in colexifications:
            try:
                # G[c1][c2] is the attribute dict of the edge and raises
                # KeyError as long as the edge is missing. The ``G.edge``
                # attribute used before no longer exists in networkx >= 2.0;
                # combined with a bare ``except`` that silently reset the
                # edge for every record instead of accumulating.
                data = G[c1][c2]
                data['families'].append(f)
                data['doculects'].append(t)
                data['words'].append(entry)
            except KeyError:
                G.add_node(c1, ntype='concept')
                G.add_node(c2, ntype='concept')
                G.add_edge(c1, c2, families=[f], doculects=[t], words=[entry])

        # derive the weights once all records have been collected
        for _, _, d in G.edges(data=True):
            unique_families = sorted(set(d['families']))
            unique_doculects = sorted(set(d['doculects']))
            d['familyWeight'] = len(unique_families)
            d['wordWeight'] = len(d['words'])
            d['doculectWeight'] = len(unique_doculects)
            # 'families' keeps the raw list; 'family' holds the unique values
            d['family'] = unique_families
            d['doculects'] = unique_doculects
    else:
        for idx, (c1, c2, t, f, entry) in enumerate(colexifications):
            # node name of the word: doculect joined with a 1-based counter
            nindex = dotjoin(t, idx + 1)
            try:
                G[nindex][c1]['weight'] += 1
                G[nindex][c2]['weight'] += 1
            except KeyError:
                G.add_node(nindex,
                           ntype='word',
                           entry=entry,
                           doculect=t,
                           family=f)
                G.add_node(c1, ntype='concept')
                G.add_node(c2, ntype='concept')
                G.add_edge(nindex, c1, weight=1)
                G.add_edge(nindex, c2, weight=1)
    return G
Пример #5
0
def get_correspondences(alms, ref='lexstatid'):
    """
    Compute sound correspondences for a given set of aligned cognates.

    Parameters
    ----------
    alms : object
        Container of aligned cognate sets. It is read through
        ``alms.msa[ref]``, ``alms.header`` and ``alms[idx, column]``.
    ref : str (default="lexstatid")
        Key into ``alms.msa`` selecting the cognate sets to be analysed.

    Returns
    -------
    corrs : dict
        ``corrs[charA][charB]`` counts how often ``charA`` shares an
        alignment column with ``charB`` of another row.
    occs : dict
        ``occs[char]`` lists the HTML-escaped concept once for every
        occurrence of ``char``.

    Notes
    -----
    As a side effect the per-row character labels of every alignment are
    stored under ``alms.msa[ref][key]['_charmat']``.
    """
    # local import: keeps the block self-contained without touching the
    # module's import section
    import html

    # store all correspondences
    corrs = {}

    # store occurrences
    occs = {}

    for key, msa in alms.msa[ref].items():
        # get basic stuff
        idxs = msa['ID']
        taxa = msa['taxa']

        # cgi.escape() was removed in Python 3.8. This reproduces the output
        # of cgi.escape(text, True): '&', '<', '>' and '"' are escaped, the
        # single quote is left alone.
        concept = html.escape(
            alms[idxs[0], 'concept'], quote=False).replace('"', '&quot;')

        # get numerical representation of alignments
        if 'numbers' in alms.header:
            alignment = [class2tokens(
                alms[idxs[i], 'numbers'],
                msa['alignment'][i]) for i in range(len(idxs))]
        else:
            alignment = msa['alignment']

        # one list of character labels per row of the alignment
        character_matrix = []

        # iterate over each taxon
        for i, taxon in enumerate(taxa):
            # store chars per line
            chars = []

            # iterate over the sequence
            for j, num in enumerate(alignment[i]):
                # a gap has no label and takes part in no correspondence
                if num == '-':
                    chars.append('-')
                    continue

                # label: taxon, aligned segment, third field of the number
                charA = dotjoin(taxon, msa['alignment'][i][j], num.split('.')[2])
                chars.append(charA)
                occs.setdefault(charA, []).append(concept)

                # count every non-gap segment of the other rows in column j
                for k, other in enumerate(alignment):
                    numB = other[j]
                    if k == i or numB == '-':
                        continue
                    charB = dotjoin(
                        taxa[k],
                        msa['alignment'][k][j],
                        numB.split('.')[2])
                    counts = corrs.setdefault(charA, {})
                    counts[charB] = counts.get(charB, 0) + 1

            character_matrix.append(chars)

        # append character matrix to alignments
        alms.msa[ref][key]['_charmat'] = character_matrix

    return corrs, occs
Пример #6
0
def get_confidence(alms, scorer, ref='lexstatid', gap_weight=1):
    """
    Function creates confidence scores for a given set of alignments.

    Parameters
    ----------
    alms : :py:class:`~lingpy.align.sca.Alignments`
        An *Alignments* object containing already aligned strings.
    scorer : :py:class:`~lingpy.algorithm._misc.ScoreDict`
        A *ScoreDict* object which gives similarity scores for all segments in
        the alignment.
    ref : str (default="lexstatid")
        The reference entry-type, referring to the cognate-set to be used for
        the analysis.
    gap_weight : number (default=1)
        Weight of a comparison in which exactly one of the two segments is a
        gap when the scores of a column are averaged.

    Returns
    -------
    jsond : dict
        For every character label that has correspondences, a two-item list:
        a string of HTML ``<tr>`` rows (at most three per corresponding
        language, restricted to correspondences counted more than once) and
        the list of escaped concepts in which the character occurs.

    Notes
    -----
    As a side effect ``alms.msa[ref][key]['confidence']`` (rescaled scores)
    and ``alms.msa[ref][key]['_charmat']`` (character labels) are set for
    every alignment.
    """
    # local import: keeps the block self-contained without touching the
    # module's import section
    import html

    # store all values for average scores
    values = []

    # store all correspondences
    corrs = {}

    # store occurrences
    occs = {}

    for key, msa in alms.msa[ref].items():
        # get basic stuff
        idxs = msa['ID']
        taxa = msa['taxa']

        # cgi.escape() was removed in Python 3.8. This reproduces the output
        # of cgi.escape(text, True): '&', '<', '>' and '"' are escaped, the
        # single quote is left alone.
        concept = html.escape(
            alms[idxs[0], 'concept'], quote=False).replace('"', '&quot;')

        # get numerical representation of alignments
        if scorer:
            alignment = [class2tokens(
                alms[idxs[i], 'numbers'],
                msa['alignment'][i]) for i in range(len(idxs))]
        else:
            alignment = msa['alignment']

        # create new arrays for confidence scores and character labels
        confidence_matrix = []
        character_matrix = []

        # iterate over each taxon
        for i, taxon in enumerate(taxa):
            idx = alms.taxa.index(taxon) + 1

            # store confidences per line
            confidences = []

            # store chars per line
            chars = []

            # iterate over the sequence
            for j, num in enumerate(alignment[i]):
                col = [alm[j] for alm in alignment]
                score = 0
                count = 0

                # get the char
                if num != '-':
                    charA = dotjoin(taxon, msa['alignment'][i][j], num.split('.')[2])
                    chars.append(charA)
                    occs.setdefault(charA, []).append(concept)
                else:
                    chars.append('-')

                for k, numB in enumerate(col):
                    # skip the row itself and gap-gap pairs
                    if k == i or (num == '-' and numB == '-'):
                        continue

                    gaps = False
                    if num == '-':
                        # gap in the current row only
                        numA = charstring(idx)
                        gaps = True
                    elif numB == '-':
                        # gap in the other row only
                        # NOTE(review): the index passed here is 0-based
                        # while ``idx`` above is 1-based - confirm that this
                        # asymmetry is intended.
                        numB = charstring(alms.taxa.index(taxa[k]))
                        numA = num
                        gaps = True
                    else:
                        # two segments: count the correspondence
                        charB = dotjoin(
                            taxa[k], msa['alignment'][k][j], numB.split('.')[2])
                        counts = corrs.setdefault(charA, {})
                        counts[charB] = counts.get(charB, 0) + 1
                        numA = num

                    # take the better of the two directions
                    this_score = max(scorer[numA, numB], scorer[numB, numA])

                    if not gaps:
                        score += this_score
                        count += 1
                    else:
                        score += this_score * gap_weight
                        count += gap_weight

                if count:
                    score = score / count
                else:
                    # nothing to compare with in this column
                    score = -25

                # int() truncates towards zero after the 0.5 shift
                rounded = int(score + 0.5)
                confidences.append(rounded)
                values.append(rounded)
            confidence_matrix.append(confidences)
            character_matrix.append(chars)

        # append confidence matrix to alignments
        alms.msa[ref][key]['confidence'] = confidence_matrix
        alms.msa[ref][key]['_charmat'] = character_matrix

    # sort the values; 1 is always present because it is the split point
    values = sorted(set(values + [1]))
    pivot = values.index(1)

    # rescale: scores below 1 and scores from 1 upwards get separate steps
    converter = {}
    valsA = values[:pivot]
    valsB = values[pivot:]
    stepA = 50 / (len(valsA) + 1)
    stepB = 75 / (len(valsB) + 1)
    for i, score in enumerate(valsA):
        converter[score] = int((stepA * i) / 4 + 0.5)
    for i, score in enumerate(valsB):
        converter[score] = int(stepB * i + 0.5) + 50

    # iterate over keys again to replace raw scores by rescaled ones
    for key, msa in alms.msa[ref].items():
        for i, line in enumerate(msa['confidence']):
            for j, cell in enumerate(line):
                alms.msa[ref][key]['confidence'][i][j] = converter[cell]

    def _dolgo_css(segment):
        # CSS class name built from token2class(segment, rcParams['dolgo'])
        t = 'dolgo_' + token2class(segment, rcParams['dolgo'])

        # bad check for three classes named differently
        if t == 'dolgo__':
            t = 'dolgo_X'
        elif t == 'dolgo_1':
            t = 'dolgo_TONE'
        elif t == 'dolgo_0':
            t = 'dolgo_ERROR'
        return t

    def _table_row(lang, segment, cls, freq, lead_cell=''):
        # One <tr> for a correspondence; ``lead_cell`` is the optional
        # language cell that opens a group of rows. The pieces are
        # concatenated directly, so braces inside the data cannot be taken
        # for format fields.
        label = dotjoin(lang, segment, cls)
        return ''.join([
            '<tr class="display">',
            lead_cell,
            '<td class="display" onclick="show(\'' + label + '\');">',
            '<span class="char ' + _dolgo_css(segment) + '">',
            segment + '</span></td>',
            '<td class="display">' + cls + '</td>',
            '<td class="display">' + str(freq) + '</td>',
            '<td class="display">' + str(len(occs[label])) + '</td>',
            '</tr>'])

    jsond = {}
    for key, corr in corrs.items():
        # [language, segment, class, frequency], by language and then by
        # descending frequency
        splits = [c.split('.') + [o] for c, o in corr.items()]
        sorts = sorted(splits, key=lambda x: (x[0], -x[3]))

        # keep at most three rows per language, each counted more than once;
        # ``spans`` ends up holding the rowspan of every language
        new_sorts = []
        spans = {}
        for a, b, c, d in sorts:
            if d > 1 and spans.get(a, 0) < 3:
                spans[a] = spans.get(a, 0) + 1
                new_sorts.append([a, b, c, d])

        bestis = []
        old_lang = ''
        counter = 0
        for a, b, c, d in new_sorts:
            if a != old_lang:
                # first row of a language carries the rowspan cell
                old_lang = a
                lead = '<td class="display" rowspan={0}>'.format(spans[a])
                lead += a + '</td>'
                bestis.append(_table_row(a, b, c, d, lead))
                counter += 1
            elif counter > 0:
                bestis.append(_table_row(a, b, c, d))
                counter += 1
            # otherwise (empty language name before any row was written)
            # the entry is skipped, as before

        jsond[key] = [''.join(bestis), occs[key]]

    return jsond
Пример #7
0
def get_correspondences(alms, ref='lexstatid'):
    """
    Compute sound correspondences for a given set of aligned cognates.

    Parameters
    ----------
    alms : object
        Container of aligned cognate sets. It is read through
        ``alms.msa[ref]``, ``alms.header`` and ``alms[idx, column]``.
    ref : str (default="lexstatid")
        Key into ``alms.msa`` selecting the cognate sets to be analysed.

    Returns
    -------
    corrs : dict
        ``corrs[charA][charB]`` counts how often ``charA`` shares an
        alignment column with ``charB`` of another row.
    occs : dict
        ``occs[char]`` lists the HTML-escaped concept once for every
        occurrence of ``char``.

    Notes
    -----
    As a side effect the per-row character labels of every alignment are
    stored under ``alms.msa[ref][key]['_charmat']``.
    """
    # local import: keeps the block self-contained without touching the
    # module's import section
    import html

    # store all correspondences
    corrs = {}

    # store occurrences
    occs = {}

    for key, msa in alms.msa[ref].items():
        # get basic stuff
        idxs = msa['ID']
        taxa = msa['taxa']

        # cgi.escape() was removed in Python 3.8. This reproduces the output
        # of cgi.escape(text, True): '&', '<', '>' and '"' are escaped, the
        # single quote is left alone.
        concept = html.escape(
            alms[idxs[0], 'concept'], quote=False).replace('"', '&quot;')

        # get numerical representation of alignments
        if 'numbers' in alms.header:
            alignment = [class2tokens(
                alms[idxs[i], 'numbers'],
                msa['alignment'][i]) for i in range(len(idxs))]
        else:
            alignment = msa['alignment']

        # one list of character labels per row of the alignment
        character_matrix = []

        # iterate over each taxon
        for i, taxon in enumerate(taxa):
            # store chars per line
            chars = []

            # iterate over the sequence
            for j, num in enumerate(alignment[i]):
                # a gap has no label and takes part in no correspondence
                if num == '-':
                    chars.append('-')
                    continue

                # label: taxon, aligned segment, third field of the number
                charA = dotjoin(taxon, msa['alignment'][i][j], num.split('.')[2])
                chars.append(charA)
                occs.setdefault(charA, []).append(concept)

                # count every non-gap segment of the other rows in column j
                for k, other in enumerate(alignment):
                    numB = other[j]
                    if k == i or numB == '-':
                        continue
                    charB = dotjoin(
                        taxa[k],
                        msa['alignment'][k][j],
                        numB.split('.')[2])
                    counts = corrs.setdefault(charA, {})
                    counts[charB] = counts.get(charB, 0) + 1

            character_matrix.append(chars)

        # append character matrix to alignments
        alms.msa[ref][key]['_charmat'] = character_matrix

    return corrs, occs
Пример #8
0
def get_confidence(alms, scorer, ref='lexstatid', gap_weight=1):
    """
    Function creates confidence scores for a given set of alignments.

    Parameters
    ----------
    alms : :py:class:`~lingpy.align.sca.Alignments`
        An *Alignments* object containing already aligned strings.
    scorer : :py:class:`~lingpy.algorithm._misc.ScoreDict`
        A *ScoreDict* object which gives similarity scores for all segments in
        the alignment.
    ref : str (default="lexstatid")
        The reference entry-type, referring to the cognate-set to be used for
        the analysis.
    gap_weight : number (default=1)
        Weight of a comparison in which exactly one of the two segments is a
        gap when the scores of a column are averaged.

    Returns
    -------
    jsond : dict
        For every character label that has correspondences, a two-item list:
        a string of HTML ``<tr>`` rows (at most three per corresponding
        language, restricted to correspondences counted more than once) and
        the list of escaped concepts in which the character occurs.

    Notes
    -----
    As a side effect ``alms.msa[ref][key]['confidence']`` (rescaled scores)
    and ``alms.msa[ref][key]['_charmat']`` (character labels) are set for
    every alignment.
    """
    # local import: keeps the block self-contained without touching the
    # module's import section
    import html

    # store all values for average scores
    values = []

    # store all correspondences
    corrs = {}

    # store occurrences
    occs = {}

    for key, msa in alms.msa[ref].items():
        # get basic stuff
        idxs = msa['ID']
        taxa = msa['taxa']

        # cgi.escape() was removed in Python 3.8. This reproduces the output
        # of cgi.escape(text, True): '&', '<', '>' and '"' are escaped, the
        # single quote is left alone.
        concept = html.escape(
            alms[idxs[0], 'concept'], quote=False).replace('"', '&quot;')

        # get numerical representation of alignments
        if scorer:
            alignment = [class2tokens(
                alms[idxs[i], 'numbers'],
                msa['alignment'][i]) for i in range(len(idxs))]
        else:
            alignment = msa['alignment']

        # create new arrays for confidence scores and character labels
        confidence_matrix = []
        character_matrix = []

        # iterate over each taxon
        for i, taxon in enumerate(taxa):
            idx = alms.taxa.index(taxon) + 1

            # store confidences per line
            confidences = []

            # store chars per line
            chars = []

            # iterate over the sequence
            for j, num in enumerate(alignment[i]):
                col = [alm[j] for alm in alignment]
                score = 0
                count = 0

                # get the char
                if num != '-':
                    charA = dotjoin(taxon, msa['alignment'][i][j], num.split('.')[2])
                    chars.append(charA)
                    occs.setdefault(charA, []).append(concept)
                else:
                    chars.append('-')

                for k, numB in enumerate(col):
                    # skip the row itself and gap-gap pairs
                    if k == i or (num == '-' and numB == '-'):
                        continue

                    gaps = False
                    if num == '-':
                        # gap in the current row only
                        numA = charstring(idx)
                        gaps = True
                    elif numB == '-':
                        # gap in the other row only
                        # NOTE(review): the index passed here is 0-based
                        # while ``idx`` above is 1-based - confirm that this
                        # asymmetry is intended.
                        numB = charstring(alms.taxa.index(taxa[k]))
                        numA = num
                        gaps = True
                    else:
                        # two segments: count the correspondence
                        charB = dotjoin(
                            taxa[k], msa['alignment'][k][j], numB.split('.')[2])
                        counts = corrs.setdefault(charA, {})
                        counts[charB] = counts.get(charB, 0) + 1
                        numA = num

                    # take the better of the two directions
                    this_score = max(scorer[numA, numB], scorer[numB, numA])

                    if not gaps:
                        score += this_score
                        count += 1
                    else:
                        score += this_score * gap_weight
                        count += gap_weight

                if count:
                    score = score / count
                else:
                    # nothing to compare with in this column
                    score = -25

                # int() truncates towards zero after the 0.5 shift
                rounded = int(score + 0.5)
                confidences.append(rounded)
                values.append(rounded)
            confidence_matrix.append(confidences)
            character_matrix.append(chars)

        # append confidence matrix to alignments
        alms.msa[ref][key]['confidence'] = confidence_matrix
        alms.msa[ref][key]['_charmat'] = character_matrix

    # sort the values; 1 is always present because it is the split point
    values = sorted(set(values + [1]))
    pivot = values.index(1)

    # rescale: scores below 1 and scores from 1 upwards get separate steps
    converter = {}
    valsA = values[:pivot]
    valsB = values[pivot:]
    stepA = 50 / (len(valsA) + 1)
    stepB = 75 / (len(valsB) + 1)
    for i, score in enumerate(valsA):
        converter[score] = int((stepA * i) / 4 + 0.5)
    for i, score in enumerate(valsB):
        converter[score] = int(stepB * i + 0.5) + 50

    # iterate over keys again to replace raw scores by rescaled ones
    for key, msa in alms.msa[ref].items():
        for i, line in enumerate(msa['confidence']):
            for j, cell in enumerate(line):
                alms.msa[ref][key]['confidence'][i][j] = converter[cell]

    def _dolgo_css(segment):
        # CSS class name built from token2class(segment, rcParams['dolgo'])
        t = 'dolgo_' + token2class(segment, rcParams['dolgo'])

        # bad check for three classes named differently
        if t == 'dolgo__':
            t = 'dolgo_X'
        elif t == 'dolgo_1':
            t = 'dolgo_TONE'
        elif t == 'dolgo_0':
            t = 'dolgo_ERROR'
        return t

    def _table_row(lang, segment, cls, freq, lead_cell=''):
        # One <tr> for a correspondence; ``lead_cell`` is the optional
        # language cell that opens a group of rows. The pieces are
        # concatenated directly, so braces inside the data cannot be taken
        # for format fields.
        label = dotjoin(lang, segment, cls)
        return ''.join([
            '<tr class="display">',
            lead_cell,
            '<td class="display" onclick="show(\'' + label + '\');">',
            '<span class="char ' + _dolgo_css(segment) + '">',
            segment + '</span></td>',
            '<td class="display">' + cls + '</td>',
            '<td class="display">' + str(freq) + '</td>',
            '<td class="display">' + str(len(occs[label])) + '</td>',
            '</tr>'])

    jsond = {}
    for key, corr in corrs.items():
        # [language, segment, class, frequency], by language and then by
        # descending frequency
        splits = [c.split('.') + [o] for c, o in corr.items()]
        sorts = sorted(splits, key=lambda x: (x[0], -x[3]))

        # keep at most three rows per language, each counted more than once;
        # ``spans`` ends up holding the rowspan of every language
        new_sorts = []
        spans = {}
        for a, b, c, d in sorts:
            if d > 1 and spans.get(a, 0) < 3:
                spans[a] = spans.get(a, 0) + 1
                new_sorts.append([a, b, c, d])

        bestis = []
        old_lang = ''
        counter = 0
        for a, b, c, d in new_sorts:
            if a != old_lang:
                # first row of a language carries the rowspan cell
                old_lang = a
                lead = '<td class="display" rowspan={0}>'.format(spans[a])
                lead += a + '</td>'
                bestis.append(_table_row(a, b, c, d, lead))
                counter += 1
            elif counter > 0:
                bestis.append(_table_row(a, b, c, d))
                counter += 1
            # otherwise (empty language name before any row was written)
            # the entry is skipped, as before

        jsond[key] = [''.join(bestis), occs[key]]

    return jsond
Пример #9
0
def test_dotjoin():
    """Check util.dotjoin for varargs, sequences and generator input."""
    expected = '1.2'

    # positional arguments, list and tuple
    assert util.dotjoin(1, 2) == expected
    assert util.dotjoin([1, 2]) == expected
    assert util.dotjoin((1, 2)) == expected

    # generator input together with a filtering condition
    numbers = (i for i in range(1, 3))
    filtered = util.dotjoin(numbers, condition=lambda j: j > 1)
    assert filtered == '2'

    # generator expression as the only argument
    unfiltered = util.dotjoin(i for i in range(1, 3))
    assert unfiltered == expected