Exemplo n.º 1
0
    def test_msa2str(self):
        """Check that msa2str renders equivalently loaded MSAs identically."""
        layout = '{body}{meta}'

        # the MSA object constructed directly from the file
        msa_obj = lingpy.MSA(test_data('harry.msa'))

        # the same file, loaded through the qlc reader
        msa_plain = lingpy.read.qlc.read_msa(test_data('harry.msa'))

        # the ID-annotated file, read with ids=True and header=False
        msa_ids = lingpy.read.qlc.read_msa(
            test_data('harry_with_ids.msa'), ids=True, header=False)

        # seq_id and dataset are overwritten so that the rendered output can
        # match the other two variants
        for field, replacement in (('seq_id', 'test'), ('dataset', 'file')):
            msa_ids[field] = replacement

        # rendered with the same layout, all three must yield the same string
        rendered_obj = msa2str(msa_obj, _arange=layout)
        rendered_plain = msa2str(msa_plain, _arange=layout)
        rendered_ids = msa2str(msa_ids, _arange=layout, wordlist=False)

        assert rendered_obj == rendered_plain
        assert rendered_plain == rendered_ids

        # wordlist output, once with and once without merging
        merged, unmerged = (
            msa2str(msa_ids, _arange=layout, wordlist=True, merge=flag)
            for flag in (True, False))

        # drop tabstops before comparing the two renderings
        merged_flat = merged.replace('\t', '')
        unmerged_flat = unmerged.replace('\t', '')

        # everything before 'COLUMNID' must agree although the full strings
        # differ
        cut = merged_flat.index('COLUMNID')
        assert merged != unmerged
        assert merged_flat[:cut] == unmerged_flat[:cut]

        # attach a consensus to both reader results and compare the default
        # rendering
        sca = lingpy.align.sca
        cons_plain = sca.get_consensus(sca.MSA(msa_plain), gaps=True)
        cons_ids = sca.get_consensus(sca.MSA(msa_ids), gaps=True)

        msa_plain['consensus'] = cons_plain
        msa_ids['consensus'] = cons_ids

        assert msa2str(msa_plain) == msa2str(msa_ids, wordlist=False)
Exemplo n.º 2
0
    def test_msa2str(self):
        """Verify msa2str gives matching output for the differently read MSAs."""
        template = '{body}{meta}'

        def render(msa, **options):
            # every layout-controlled conversion in this test shares the template
            return msa2str(msa, _arange=template, **options)

        # MSA object built straight from the file
        from_object = MSA(test_data('harry.msa'))

        # same file, read through the qlc reader
        from_reader = qlc.read_msa(test_data('harry.msa'))

        # ID-annotated file, read with ids=True and header=False
        from_ids = qlc.read_msa(
            test_data('harry_with_ids.msa'), ids=True, header=False)

        # dataset and seq_id are adjusted, otherwise the output would not be
        # comparable
        from_ids['seq_id'] = 'test'
        from_ids['dataset'] = 'file'

        # with the same template, the three renderings have to coincide
        text_object = render(from_object)
        text_reader = render(from_reader)
        text_ids = render(from_ids, wordlist=False)

        assert text_object == text_reader
        assert text_reader == text_ids

        # next, wordlist mode with and without the merge option
        text_merged = render(from_ids, wordlist=True, merge=True)
        text_split = render(from_ids, wordlist=True, merge=False)

        # tabstops are stripped to compare the common part of both strings
        merged_no_tabs = text_merged.replace('\t', '')
        split_no_tabs = text_split.replace('\t', '')

        # both strings must share the part preceding 'COLUMNID' while
        # differing as a whole
        stop = merged_no_tabs.index('COLUMNID')
        assert text_merged != text_split
        assert merged_no_tabs[:stop] == split_no_tabs[:stop]

        # add a consensus to both reader results
        reader_consensus = get_consensus(MSA(from_reader), gaps=True)
        ids_consensus = get_consensus(MSA(from_ids), gaps=True)

        from_reader['consensus'] = reader_consensus
        from_ids['consensus'] = ids_consensus

        assert msa2str(from_reader) == msa2str(from_ids, wordlist=False)
Exemplo n.º 3
0
    def test_msa2str(self):
        """msa2str output must agree across the three ways of loading an MSA."""
        fmt = '{body}{meta}'

        # 1) MSA object created directly from the file
        direct = lingpy.MSA(test_data('harry.msa'))

        # 2) the same file as returned by the qlc reader
        parsed = lingpy.read.qlc.read_msa(test_data('harry.msa'))

        # 3) the ID-annotated file, read with ids=True and header=False
        keyed = lingpy.read.qlc.read_msa(
            test_data('harry_with_ids.msa'),
            ids=True,
            header=False,
        )

        # without these adjustments the output of the third variant would
        # differ from the other two
        keyed['seq_id'], keyed['dataset'] = 'test', 'file'

        # identical layout => identical strings for all three inputs
        outputs = [
            msa2str(direct, _arange=fmt),
            msa2str(parsed, _arange=fmt),
            msa2str(keyed, _arange=fmt, wordlist=False),
        ]
        assert outputs[0] == outputs[1] == outputs[2]

        # wordlist conversion with the merge option switched on and off
        with_merge = msa2str(keyed, _arange=fmt, wordlist=True, merge=True)
        without_merge = msa2str(keyed, _arange=fmt, wordlist=True, merge=False)

        # compare the tab-free versions up to the 'COLUMNID' marker
        with_merge_flat = with_merge.replace('\t', '')
        without_merge_flat = without_merge.replace('\t', '')
        marker = with_merge_flat.index('COLUMNID')

        assert with_merge != without_merge
        assert with_merge_flat[:marker] == without_merge_flat[:marker]

        # compute a consensus for both reader results and store it on them
        consensus_parsed = lingpy.align.sca.get_consensus(
            lingpy.align.sca.MSA(parsed), gaps=True)
        consensus_keyed = lingpy.align.sca.get_consensus(
            lingpy.align.sca.MSA(keyed), gaps=True)

        parsed['consensus'] = consensus_parsed
        keyed['consensus'] = consensus_keyed

        assert msa2str(parsed) == msa2str(keyed, wordlist=False)
Exemplo n.º 4
0
def wl2qlc(header, data, filename='', formatter='concept', **keywords):
    """
    Write the basic data of a wordlist to file.

    The file starts with optional metadata sections built from
    ``keywords['meta']`` (key-value pairs, JSON, MSAs, distances, trees,
    scorers), followed by a tab-separated table with one row per entry of
    `data`.

    Parameters
    ----------
    header : list
        Column names, written after a leading ``ID`` column.
    data : dict
        Maps each row key to the list of cell values of that row.
    filename : str (default='')
        Output path without extension; ``rcParams['filename']`` is used if
        it is empty.
    formatter : str (default='concept')
        Name of the column (compared after upper-casing) by which rows are
        sorted and grouped, or two comma-separated column names, in which
        case rows are sorted by both and grouped by the first. If neither
        form applies, rows are sorted by key and written without group
        separators.
    **keywords
        ``ignore`` (list of section names to skip, or ``'all'``),
        ``fileformat`` (file extension), ``prettify`` (whether ``#`` comment
        lines are written), ``meta`` (dictionary with metadata) and ``stamp``
        (text appended to the end of the file). Missing ``ignore``,
        ``fileformat`` and ``prettify`` are filled in via
        ``util.setdefaults``.

    Raises
    ------
    ValueError
        If `formatter` consists of two comma-separated names and one of them
        is not contained in `header`.
    """
    util.setdefaults(keywords,
                     ignore=['taxa', 'doculects', 'msa'],
                     fileformat='qlc',
                     prettify=True)
    if keywords['ignore'] == 'all':
        keywords['ignore'] = [
            'taxa', 'scorer', 'meta', 'distances', 'doculects', 'msa', 'json'
        ]
    ignore = keywords['ignore']
    prettify = keywords['prettify']

    formatter = formatter.upper()
    if not filename:
        filename = rcParams['filename']

    # the output is collected in pieces and joined once before writing
    out = ['# Wordlist\n'] if prettify else []

    # sort the metadata into the different output sections
    meta = keywords.get("meta", {})
    kvpairs = {}
    jsonpairs = {}
    msapairs = {}
    trees = ''
    distances = ''
    scorer = ''

    for k, v in meta.items():
        # simple key-value-pairs
        if isinstance(v, (str, int)) or k == "tree":
            kvpairs[k] = v
        elif k == 'msa' and k not in ignore:
            # go a level deeper, checking for keys
            for ref in v:
                msapairs.setdefault(ref, {}).update(v[ref].items())
        elif k == 'distances':
            # only convert the matrix if it is going to be written, so that
            # ignored distances do not require meta['taxa'] to be present
            if k not in ignore:
                distances = matrix2dst(v, meta['taxa'])
        elif k in ['taxa', 'doculect', 'taxon', 'doculects']:
            # we need to find a better solution here, since it is not nice to
            # have taxa written to json again and again
            pass
        elif k == 'trees' and k not in ignore:
            for key, value in v.items():
                trees += '<tre id="{0}">\n{1}\n</tre>\n'.format(key, value)
        elif k == 'scorer' and k not in ignore:
            for key, value in v.items():
                scorer += '<{2} id="{0}">\n{1}</{2}>\n\n'.format(
                    key, scorer2str(value), k)
        else:
            # keep the value only if it can be serialized to JSON
            try:
                json.dumps(v)
            except TypeError:
                pass
            else:
                jsonpairs[k] = v

    if kvpairs and 'meta' not in ignore:
        if prettify:
            out.append('\n# META\n')
        for k, v in sorted(kvpairs.items(), key=lambda x: x[0]):
            out.append('@{0}:{1}\n'.format(k, v))
    if jsonpairs and 'json' not in ignore:
        out.append("@json: " + json.dumps(jsonpairs) + '\n')
    if msapairs and 'msa' not in ignore:
        for ref in msapairs:
            out.append("\n# MSA reference: {0}\n".format(ref))
            for k, v in msapairs[ref].items():
                if 'consensus' in v:
                    out.append(
                        '#\n<msa '
                        'id="{0}" ref="{1}" consensus="{2}">\n'.format(
                            k, ref, ' '.join(v['consensus'])))
                else:
                    out.append('#\n<msa id="{0}" ref="{1}">\n'.format(k, ref))
                out.append(msa2str(v, wordlist=True))
                out.append("</msa>\n")

    if distances and 'distances' not in ignore:
        out.append('\n# DISTANCES\n<dst>\n')
        out.append(distances + '</dst>\n')

    if trees:
        out.append('\n# TREES\n' + trees)

    if scorer and 'scorer' not in ignore:
        out.append('\n# SCORER\n' + scorer)

    if prettify:
        out.append('\n# DATA\n')
    out.append('ID\t' + '\t'.join(header) + '\n')

    # determine the column used for sorting and grouping the rows
    if formatter in header:
        idx = header.index(formatter)
        sorted_data = sorted(data.keys(), key=lambda x: data[x][idx])
    elif len(formatter.split(',')) == 2:
        idxA, idxB = formatter.split(',')
        idxA = header.index(idxA)
        idxB = header.index(idxB)
        idx = idxA
        sorted_data = sorted(data.keys(),
                             key=lambda x: (data[x][idxA], data[x][idxB]))
    else:
        # no grouping column: None is used as the marker, since False equals
        # 0 and would be taken for the index of the first column
        idx = None
        sorted_data = sorted(data.keys())

    # value of the grouping column in the previously written row
    current_group = None

    for key in sorted_data:
        # get the line
        line = data[key]

        # start a new group whenever the value of the grouping column changes
        if idx is not None and idx < len(line):
            if line[idx] != current_group:
                if prettify:
                    out.append('#\n')
                current_group = line[idx]

        # the key comes first, followed by the formatted values
        cells = [str(key)]
        for value in line:
            # exact type checks are kept on purpose, so that subclasses of
            # list, int and float are passed to the generic formatter
            if type(value) is list:
                try:
                    cell = ' '.join(value)
                except TypeError:
                    # the list contains items which are not strings
                    cell = ' '.join([str(v) for v in value])
            elif type(value) is int:
                cell = str(value)
            elif type(value) is float:
                cell = '{0:.4f}'.format(value)
            elif value is None:
                cell = ''
            else:
                cell = '{:}'.format(value)
            cells.append(cell)
        out.append('\t'.join(cells) + '\n')

    util.write_text_file(filename + '.' + keywords['fileformat'],
                         ''.join(out) + keywords.get('stamp', ''),
                         normalize="NFC")
Exemplo n.º 5
0
def wl2qlc(
        header,
        data,
        filename='',
        formatter='concept',
        **keywords):
    """
    Write the basic data of a wordlist to file.

    The file starts with optional metadata sections built from
    ``keywords['meta']`` (key-value pairs, JSON, MSAs, distances, trees,
    scorers), followed by a tab-separated table with one row per entry of
    `data`.

    Parameters
    ----------
    header : list
        Column names, written after a leading ``ID`` column.
    data : dict
        Maps each row key to the list of cell values of that row.
    filename : str (default='')
        Output path without extension; ``rcParams['filename']`` is used if
        it is empty.
    formatter : str (default='concept')
        Name of the column (compared after upper-casing) by which rows are
        sorted and grouped, or two comma-separated column names, in which
        case rows are sorted by both and grouped by the first. If neither
        form applies, rows are sorted by key and written without group
        separators.
    **keywords
        ``ignore`` (list of section names to skip, or ``'all'``),
        ``fileformat`` (file extension), ``prettify`` (whether ``#`` comment
        lines are written), ``meta`` (dictionary with metadata) and ``stamp``
        (text appended to the end of the file). Missing ``ignore``,
        ``fileformat`` and ``prettify`` are filled in via
        ``util.setdefaults``.

    Raises
    ------
    ValueError
        If `formatter` consists of two comma-separated names and one of them
        is not contained in `header`.
    """
    util.setdefaults(
        keywords,
        ignore=['taxa', 'doculects', 'msa'],
        fileformat='qlc',
        prettify=True)
    if keywords['ignore'] == 'all':
        keywords['ignore'] = [
            'taxa', 'scorer', 'meta', 'distances', 'doculects', 'msa', 'json']
    ignore = keywords['ignore']
    prettify = keywords['prettify']

    formatter = formatter.upper()
    if not filename:
        filename = rcParams['filename']

    # the output is collected in pieces and joined once before writing
    out = ['# Wordlist\n'] if prettify else []

    # sort the metadata into the different output sections
    meta = keywords.get("meta", {})
    kvpairs = {}
    jsonpairs = {}
    msapairs = {}
    trees = ''
    distances = ''
    scorer = ''

    for k, v in meta.items():
        # simple key-value-pairs
        if isinstance(v, (text_type, int)) or k == "tree":
            kvpairs[k] = v
        elif k == 'msa' and k not in ignore:
            # go a level deeper, checking for keys
            for ref in v:
                msapairs.setdefault(ref, {}).update(v[ref].items())
        elif k == 'distances':
            # only convert the matrix if it is going to be written, so that
            # ignored distances do not require meta['taxa'] to be present
            if k not in ignore:
                distances = matrix2dst(v, meta['taxa'])
        elif k in ['taxa', 'doculect', 'taxon', 'doculects']:
            # we need to find a better solution here, since it is not nice to
            # have taxa written to json again and again
            pass
        elif k == 'trees' and k not in ignore:
            for key, value in v.items():
                trees += '<tre id="{0}">\n{1}\n</tre>\n'.format(key, value)
        elif k == 'scorer' and k not in ignore:
            for key, value in v.items():
                scorer += '<{2} id="{0}">\n{1}</{2}>\n\n'.format(
                    key, scorer2str(value), k)
        else:
            # keep the value only if it can be serialized to JSON
            try:
                json.dumps(v)
            except TypeError:
                pass
            else:
                jsonpairs[k] = v

    if kvpairs and 'meta' not in ignore:
        if prettify:
            out.append('\n# META\n')
        for k, v in sorted(kvpairs.items(), key=lambda x: x[0]):
            out.append('@{0}:{1}\n'.format(k, v))
    if jsonpairs and 'json' not in ignore:
        out.append("@json: " + json.dumps(jsonpairs) + '\n')
    if msapairs and 'msa' not in ignore:
        for ref in msapairs:
            out.append("\n# MSA reference: {0}\n".format(ref))
            for k, v in msapairs[ref].items():
                if 'consensus' in v:
                    out.append(
                        '#\n<msa '
                        'id="{0}" ref="{1}" consensus="{2}">\n'.format(
                            k, ref, ' '.join(v['consensus'])))
                else:
                    out.append('#\n<msa id="{0}" ref="{1}">\n'.format(k, ref))
                out.append(msa2str(v, wordlist=True))
                out.append("</msa>\n")

    if distances and 'distances' not in ignore:
        out.append('\n# DISTANCES\n<dst>\n')
        out.append(distances + '</dst>\n')

    if trees:
        out.append('\n# TREES\n' + trees)

    if scorer and 'scorer' not in ignore:
        out.append('\n# SCORER\n' + scorer)

    if prettify:
        out.append('\n# DATA\n')
    out.append('ID\t' + '\t'.join(header) + '\n')

    # determine the column used for sorting and grouping the rows
    if formatter in header:
        idx = header.index(formatter)
        sorted_data = sorted(data.keys(), key=lambda x: data[x][idx])
    elif len(formatter.split(',')) == 2:
        idxA, idxB = formatter.split(',')
        idxA = header.index(idxA)
        idxB = header.index(idxB)
        idx = idxA
        sorted_data = sorted(data.keys(), key=lambda x: (
            data[x][idxA], data[x][idxB]))
    else:
        # no grouping column: None is used as the marker, since False equals
        # 0 and would be taken for the index of the first column
        idx = None
        sorted_data = sorted(data.keys())

    # value of the grouping column in the previously written row
    current_group = None

    for key in sorted_data:
        # get the line
        line = data[key]

        # start a new group whenever the value of the grouping column changes
        if idx is not None and idx < len(line):
            if line[idx] != current_group:
                if prettify:
                    out.append('#\n')
                current_group = line[idx]

        # the key comes first, followed by the formatted values
        cells = [text_type(key)]
        for value in line:
            # exact type checks are kept on purpose, so that subclasses of
            # list, int and float are passed to the generic formatter
            if type(value) is list:
                try:
                    cell = ' '.join(value)
                except TypeError:
                    # the list contains items which are not strings
                    cell = ' '.join([text_type(v) for v in value])
            elif type(value) is int:
                cell = text_type(value)
            elif type(value) is float:
                cell = '{0:.4f}'.format(value)
            elif value is None:
                cell = ''
            else:
                cell = '{:}'.format(value)
            cells.append(cell)
        out.append('\t'.join(cells) + '\n')

    util.write_text_file(
        filename + '.' + keywords['fileformat'],
        ''.join(out) + keywords.get('stamp', ''),
        normalize="NFC")