def __init__(self, infile=None, col='list', row='key', conf=None):
    QLCParserWithRowsAndCols.__init__(
        self,
        infile or util.data_path('swadesh', 'swadesh.qlc'),
        row,
        col,
        conf or util.data_path('conf', 'swadesh.rc'))

    # get row and key index
    if not hasattr(self, '_rowidx'):
        # add indices to alias dictionary for swadesh lists
        for i, column in enumerate(self.cols):
            self._meta[column] = self._array[
                np.nonzero(self._array[:, i]), i][0]
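# Usage sketch -- the enclosing class is assumed to be LingPy's basic
# vocabulary parser (BasVoc in lingpy.meaning.basvoc), whose __init__ this
# matches; the class name is an assumption, not confirmed by the snippet:
# from lingpy.meaning.basvoc import BasVoc
# swadesh = BasVoc()          # falls back to the bundled swadesh.qlc
# swadesh.cols                # the Swadesh-list variants in the data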
def compile_dvt(path=''):
    """
    Function compiles diacritics, vowels, and tones.

    Notes
    -----
    Diacritics, vowels, and tones are defined in the :file:`data/models/dv/`
    directory of the LingPy package and automatically loaded when loading
    the LingPy library. The values are defined as the constants
    :py:obj:`rcParams['vowels']`, :py:obj:`rcParams['diacritics']`, and
    :py:obj:`rcParams['tones']`. Their core purpose is to guide the
    tokenization of IPA strings (cf.
    :py:func:`~lingpy.sequence.sound_classes.ipa2tokens`).

    In order to change the variables, one simply has to change the text
    files :file:`diacritics`, :file:`tones`, and :file:`vowels` in the
    :file:`data/models/dv` directory. The structure of these files is fairly
    simple: each line contains a vowel or a diacritic character, and
    diacritics are preceded by a dash.

    See also
    --------
    lingpy.data.model.Model
    lingpy.data.derive.compile_model
    """
    log.info("Compiling diacritics and vowels...")

    # get the path to the models
    if not path:
        file_path = util.data_path('models', 'dvt')
    elif path in ['evolaemp', 'el']:
        file_path = util.data_path('models', 'dvt_el')
    else:
        file_path = path

    def _read_string(name):
        # normalize stuff
        # TODO: this is potentially dangerous and it is important to decide
        # TODO: whether switching to NFD might not be a better choice
        return util.read_text_file(
            os.path.join(file_path, name), normalize='NFC').replace('\n', '')

    diacritics = _read_string('diacritics').replace('-', '')
    vowels = ''.join(
        [v for v in _read_string('vowels') if v not in diacritics])
    tones = _read_string('tones')

    dvt = (diacritics, vowels, tones)

    if path in ['evolaemp', 'el']:
        cache.dump(dvt, 'dvt_el')
    else:
        cache.dump(dvt, 'dvt')

    log.info("Diacritics, vowels, and tones were successfully compiled.")
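# Usage sketch: recompile the bundled diacritics/vowels/tones data and
# reload the cached triple (cache.load mirroring cache.dump is inferred
# from its use elsewhere in this module):
# compile_dvt()                            # compiles data/models/dvt
# diacritics, vowels, tones = cache.load('dvt')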
def test_init(test_data):
    p = QLCParser({0: ['a']})
    QLCParser(p)

    with pytest.raises(IOError):
        QLCParser('not-existing-file')
    with pytest.raises(TypeError):
        QLCParser(None)
    with pytest.raises(ValueError):
        QLCParserWithRowsAndCols({0: ['a']}, 'x', 'y', {})
    with pytest.raises(ValueError):
        QLCParserWithRowsAndCols(
            {0: ['concept', 'language', 'bla'], 1: ['bla', 'blu']},
            'concept', 'language', '')

    p2 = QLCParserWithRowsAndCols(
        str(test_data / 'bad_file2.tsv'),
        'concept', 'language',
        data_path('conf', 'wordlist.rc'))
    assert p2.get_entries('cogid')[0][-1] == 'ff'

    with pytest.raises(KeyError):
        p2.__getitem__(tuple([2000, 'bla']))
    assert p2[3, 'language'] == 'l3'
    assert p2[3, 'nothing'] is None
def __init__(self, filename, row='concept', col='doculect', conf=None):
    QLCParserWithRowsAndCols.__init__(
        self, filename, row, col,
        conf or util.data_path('conf', 'wordlist.rc'))

    # set up other local temporary storage
    self._etym_dict = {}

    # check for taxa in meta
    if 'taxa' in self._alias:
        if self._alias['taxa'] not in self._meta:
            self._meta[self._alias['taxa']] = self.cols
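# Usage sketch for the Wordlist constructor above (file name illustrative):
# from lingpy import Wordlist
# wl = Wordlist('polynesian.tsv')                # rows: concepts, cols: doculects
# wl = Wordlist('polynesian.tsv', row='gloss')   # key rows by another column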
def __init__(self, model, path=None):
    new_path = lambda *cmps: os.path.join(
        path or util.data_path('models'), model, *cmps)

    self.name = model

    # try to load the converter; compile the model on a cache miss
    try:
        self.converter = cache.load(model + '.converter')
    except Exception:
        compile_model(model, path)
        self.converter = cache.load(model + '.converter')

    # always give preference to scorer matrix files
    if os.path.isfile(new_path('matrix')):
        self.scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer.bin')):
        try:
            self.scorer = cache.load(model + '.scorer')
        except compat.FileNotFoundError:
            pass
    # if none of the above fits, leave the scorer unset

    # read information from the info-file
    self.info = {}

    info = util.read_text_file(new_path('INFO'))
    data = ['description', 'compiler', 'source', 'date', 'vowels', 'tones']

    for line in data:
        try:
            self.info[line] = re.findall('@' + line + ': (.*)', info)[0]
        except IndexError:
            self.info[line] = 'unknown'

    # check for vowels and tones
    if "vowels" in self.info:
        self.vowels = self.info['vowels']
    if "tones" in self.info:
        self.tones = self.info['tones']
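# Usage sketch -- 'sca' names a sound-class model bundled with LingPy:
# from lingpy.data.model import Model
# sca = Model('sca')
# sca.converter['p']      # sound-class symbol assigned to IPA /p/
# sca.info['compiler']    # metadata parsed from the INFO file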
def read_conf(conf=''):
    # load the default configuration file if none is given
    if not conf:
        conf = util.data_path('conf', 'qlc.rc')

    # read the file defined by its path in conf
    tmp = [line.split('\t') for line in util.read_config_file(conf)]

    aliasD, classD, class_stringD, alias2D = {}, {}, {}, {}
    for name, cls, alias in tmp:
        # make sure the name itself is there
        aliasD[name.lower()] = aliasD[name.upper()] = name
        classD[name.lower()] = classD[name.upper()] = eval(cls)
        class_stringD[name.lower()] = class_stringD[name.upper()] = cls

        # add the aliases
        for a in alias.split(','):
            aliasD[a.lower()] = aliasD[a.upper()] = name
            classD[a.lower()] = classD[a.upper()] = eval(cls)
            class_stringD[a.lower()] = class_stringD[a.upper()] = cls

        alias2D[name] = sorted(set(alias.split(','))) + [name]
    return aliasD, classD, class_stringD, alias2D
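# Sketch of the line format read_conf expects: three tab-separated fields
# per line -- name, Python type name, comma-separated aliases. (These two
# rows are illustrative, not quoted from the shipped qlc.rc.)
#
#     cogid	int	cognateid
#     ipa	str	phonetic
#
# aliases, classes, class_strings, alias_groups = read_conf()
# classes['COGID'] is int    # both cases and all aliases are registered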
@date: 2007/07/19
"""
from __future__ import print_function, division, unicode_literals

import re
import sys
import codecs

from lingpy.util import data_path

# data for sampa2ipa (Peter Kleiweg's implementation)
xsdata = []
_xsKeys = [' ']
xs = {' ': ' '}

for line in codecs.open(data_path('ipa', 'sampa.csv'), 'r', 'utf-8'):
    line = line.strip('\n').strip('\r')
    if line and not line.startswith('#'):
        key, val = line.split('\t')
        if key in xs and xs[key] != val:
            raise ValueError("Keys encode too many values.")
        _xsKeys.append(key)
        xs[key] = eval('"""' + val + '"""')

_kk = []
for _k in _xsKeys:
    _kk.append(re.escape(_k))
_kk.sort(reverse=True)  # long before short
_xsPat = '|'.join(_kk)
reXS = re.compile('(' + _xsPat + ')|(.)')
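# Hedged sketch of how the compiled pattern can drive X-SAMPA -> IPA
# conversion (the function name and example input are illustrative; the
# module's own converter is defined elsewhere):
def _sampa_to_ipa(text):
    out = []
    for match in reXS.finditer(text):
        key, other = match.groups()
        # group 1 matched a known X-SAMPA key; group 2 is a literal char
        out.append(xs[key] if key is not None else other)
    return ''.join(out)

# _sampa_to_ipa('tS') would yield the IPA value stored for 'tS' in sampa.csv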
def from_cldf(
        cls,
        path,
        columns=(
            'parameter_id', 'concept_name', 'language_id', 'language_name',
            'value', 'form', 'segments', 'language_glottocode',
            'concept_concepticon_id', 'language_latitude',
            'language_longitude', 'cognacy'),
        namespace=(
            ('concept_name', 'concept'),
            ('language_id', 'doculect'),
            ('segments', 'tokens'),
            ('language_glottocode', 'glottolog'),
            ('concept_concepticon_id', 'concepticon'),
            ('language_latitude', 'latitude'),
            ('language_longitude', 'longitude'),
            ('cognacy', 'cognacy'),
            ('cogid_cognateset_id', 'cogid')),
        filter=lambda row: row["form"],
        **kwargs):
    """Load a CLDF dataset.

    Open a CLDF dataset, with or without metadata, and transform it into an
    instance of this class. Only Wordlist datasets are supported for now,
    because the other CLDF modules have no obvious LingPy counterpart.

    Columns from the FormTable are imported in lowercase; columns from the
    LanguageTable, ParameterTable, and CognateTable are prefixed with
    `language_`, `concept_`, and `cogid_`, respectively, and converted to
    lowercase.

    Notes
    -----
    CLDF's default column names for wordlists differ from LingPy's, so you
    probably have to use::

        >>> lingpy.Wordlist.from_cldf("Wordlist-metadata.json")

    in order to avoid errors from LingPy not finding required columns.

    Parameters
    ----------
    columns : list or tuple
        The list of columns to import. (default: all columns)
    filter : function: rowdict -> bool
        A condition function for importing only some rows.
        (default: lambda row: row["form"])

    All other keyword arguments are passed on to `cls`.

    Returns
    -------
    A `cls` object representing the CLDF dataset.
    """
    kw = {
        'row': 'concept',
        'col': 'doculect',
        'conf': util.data_path('conf', 'wordlist.rc'),
    }
    kwargs.update(kw)

    if isinstance(namespace, tuple):
        namespace = dict(namespace)

    # get the datatypes from the configuration, keyed by namespace
    datatypes = read_conf(kwargs['conf'])[1]

    # load the dataset
    fname = Path(path)
    if not fname.exists():
        raise FileNotFoundError('{:} does not exist'.format(fname))
    if fname.suffix == '.json':
        dataset = pycldf.dataset.Dataset.from_metadata(fname)
    else:
        dataset = pycldf.dataset.Dataset.from_data(fname)

    if dataset.module == "Wordlist":
        # First, make a list of cognate codes if they are in a separate
        # table.
        cognateset_assignments = {}
        try:
            form_reference = dataset["CognateTable", "formReference"].name
            for row in dataset["CognateTable"].iterdicts():
                cognateset_assignments[row[form_reference]] = row
        except KeyError:
            # Either there are no cognate codes, or they are in the form
            # table. Both options are fine.
            pass

        f_id = dataset["FormTable", "id"].name

        # Access columns by type, not by name.
        language_column = dataset["FormTable", "languageReference"].name
        parameter_column = dataset["FormTable", "parameterReference"].name

        # NOTE: `bounce_as_id` was referenced but never defined in this
        # snippet; the stand-in below is an assumption about its intended
        # behaviour (a mapping that bounces unknown keys back as {ID: key}
        # when no LanguageTable/ParameterTable exists).
        class _BounceAsId(dict):
            def __missing__(self, key):
                return {"ID": key}
        bounce_as_id = _BounceAsId()

        try:
            l_id = dataset["LanguageTable", "id"].name
            languages = {
                l[l_id]: l for l in dataset["LanguageTable"].iterdicts()}
        except KeyError:
            l_id = "ID"
            languages = bounce_as_id

        try:
            c_id = dataset["ParameterTable", "id"].name
            concepts = {
                c[c_id]: c for c in dataset["ParameterTable"].iterdicts()}
        except KeyError:
            c_id = "ID"
            concepts = bounce_as_id

        # create the dictionary, reserving key 0 for the header
        D = {0: columns}

        for row in dataset["FormTable"].iterdicts():
            # TODO: Improve prefixing behaviour
            s = {"cogid_{:}".format(key).lower(): value
                 for key, value in
                 cognateset_assignments.get(row[f_id], {}).items()}
            s.update(
                {"language_{:}".format(key).lower(): value
                 for key, value in languages[row[language_column]].items()})
            s.update(
                {"concept_{:}".format(key).lower(): value
                 for key, value in concepts[row[parameter_column]].items()})
            s.update({k.lower(): v for k, v in row.items()})

            if not filter(s):
                continue

            # check for a numeric ID
            try:
                idx = int(row[f_id])
            except ValueError:
                idx = len(D)
            while idx in D:
                idx += 1

            if not D[0]:
                columns = list(s.keys())
                D[0] = [c.lower() for c in columns]

            D[idx] = [
                datatypes.get(namespace.get(column, ''), lambda x: x)(
                    s.get(column, ''))
                for column in columns]

        D[0] = [namespace.get(c, c) for c in columns]
        if len(D[0]) != len(set(D[0])):
            log.warning('|'.join(columns))
            log.warning('|'.join(D[0]))
            raise ValueError('name space clashes, cannot parse data')

        # convert to wordlist and return
        return cls(D, **kwargs)
    else:
        # For most LingPy applications, only the Wordlist module makes
        # sense.
        raise ValueError(
            "LingPy has no procedures for CLDF {:} data.".format(
                dataset.module))
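# Usage sketch (file name illustrative; the filter keeps only rows that
# carry segmented forms):
# wl = Wordlist.from_cldf(
#     'Wordlist-metadata.json',
#     filter=lambda row: row.get('segments'))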
def __init__(self, filename, conf=''):
    """
    Parse data regularly if the data has not been loaded from a pickled
    version.
    """
    self.log = log.get_logger()

    # try to load the data
    internal_import = False

    # check whether it's a dictionary from which we load
    if isinstance(filename, dict):
        input_data = filename
        if 'filename' not in input_data:
            self.filename = rcParams['filename']
        internal_import = True

        # check for correct input: header and rows must have the same
        # length (there was a bug with a wrong evaluation here which is
        # hopefully fixed by now)
        tmp_keys = [k for k in input_data if isinstance(k, int)]
        if len(input_data[0]) != len(input_data[tmp_keys[0]]):
            print(input_data[0], input_data[tmp_keys[0]])
            raise ValueError("[!] Wrong input format!")  # pragma: no cover
    # check whether it's another wordlist-object
    elif hasattr(filename, '_data') and hasattr(filename, '_meta'):
        input_data = dict(
            (key, [v for v in value])
            for key, value in filename._data.items())
        input_data.update(filename._meta.items())
        input_data[0] = [
            a for a, b in sorted(
                filename.header.items(), key=lambda x: x[1], reverse=False)]
        internal_import = True
        self.filename = rcParams['filename']
    # or whether the data is an actual file
    elif isinstance(filename, string_types) and os.path.isfile(filename):
        input_data = read_qlc(filename)
        self.filename = filename
    # raise an error otherwise
    elif isinstance(filename, string_types):
        raise IOError("Input file '{0}' does not exist.".format(filename))
    else:
        raise TypeError(
            "Unrecognized type for 'filename' argument: {0}".format(
                type(filename).__name__))

    # load the default configuration file if none is given
    if not conf:
        conf = util.data_path('conf', 'qlc.rc')

    # read the file defined by its path in conf
    tmp = [line.split('\t') for line in util.read_config_file(conf)]

    # define the attributes _alias, _class, _class_string, and _alias2,
    # which store the aliases and the datatypes (classes) of the entries
    self._alias, self._class, self._class_string, self._alias2 = \
        {}, {}, {}, {}
    for name, cls, alias in tmp:
        # make sure the name itself is there
        self._alias[name.lower()] = self._alias[name.upper()] = name
        self._class[name.lower()] = self._class[name.upper()] = eval(cls)
        self._class_string[name.lower()] = \
            self._class_string[name.upper()] = cls

        # add the aliases
        for a in alias.split(','):
            self._alias[a.lower()] = self._alias[a.upper()] = name
            self._class[a.lower()] = self._class[a.upper()] = eval(cls)
            self._class_string[a.lower()] = \
                self._class_string[a.upper()] = cls

        self._alias2[name] = sorted(set(alias.split(','))) + [name]

    # append the names in data[0] to self.conf to make sure that all data
    # is covered, even the types which are not specifically defined in the
    # conf file; the datatype defaults to "str" here
    for name in input_data[0]:
        if name.lower() not in self._alias:
            self._alias[name.lower()] = name.lower()
            self._class[name.lower()] = str
        if name.upper() not in self._alias:
            self._alias[name.upper()] = name.lower()
            self._class[name.upper()] = str

    # add an empty alias for empty strings (XXX the original purpose of
    # this is unclear)
    self._alias[''] = ''

    # the header stores the indices of the data in the original dictionary
    self.header = dict(zip(
        [self._alias[x] for x in input_data[0]],
        range(len(input_data[0]))))

    # now create a specific header which has all aliases
    self._header = {k: v for k, v in self.header.items()}

    # add a sorted header for reference
    self.columns = sorted(self.header, key=lambda x: self.header[x])

    # assign all aliases to the header
    for alias in self._alias:
        try:
            self._header[alias] = self._header[self._alias[alias]]
        except KeyError:
            pass

    # assign the data as an attribute of the word-list class. Note that we
    # need to check the type of the keys here; since numpy also offers
    # integer types, we don't check type(x) == int but use str.isnumeric,
    # which accepts a key only if it is an integer
    self._data = {
        int(k): v for k, v in input_data.items()
        if k != 0 and str(k).isnumeric()}

    # check that all rows have the same length as the header
    check_errors = ''
    for k, v in self._data.items():
        if len(v) != len(self.header):
            check_errors += (
                'Row {0} in your data contains {1} fields '
                '(expected {2})\n'.format(k, len(v), len(self.header)))
    if check_errors:
        raise ValueError(
            check_errors + '\n' + ', '.join(sorted(self.header)))

    # iterate over self._data and convert the values with the datatype
    # functions (only needed when reading from file)
    if not internal_import:
        heads = sorted(self._header.items(), key=lambda x: x[1])
        for key in self._data:
            check = []
            for head, i in heads:
                if i not in check:
                    logstring = (
                        'Problem with row {0} in col {1}, expected'
                        ' «{4}» as datatype but received «{3}» '
                        ' (ROW: {2}, entry {5}).')
                    try:
                        self._data[key][i] = self._class[head](
                            self._data[key][i])
                        check.append(i)
                    except (KeyError, ValueError):
                        log.warning(logstring.format(
                            key, i,
                            '|'.join([str(x) for x in self._data[key]]),
                            self._data[key][i],
                            self._class[head],
                            head))

    # create the entries attribute of the wordlist
    self.entries = sorted(
        set([b.lower() for a, b in self._alias.items() if b]))

    # assign meta-data
    self._meta = {}
    for key in [k for k in input_data if type(k) != int]:
        self._meta[key] = input_data[key]
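# Usage sketch, matching the dict input handled above: key 0 holds the
# header row, integer keys hold data rows (header names are illustrative):
# p = QLCParser({
#     0: ['doculect', 'concept', 'ipa'],
#     1: ['German', 'hand', 'hant'],
#     2: ['English', 'hand', 'hænd']})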
def compile_model(model, path=None):
    """
    Function compiles customized sound-class models.

    Parameters
    ----------
    model : str
        A string indicating the name of the model which shall be created.
    path : str
        A string indicating the path where the model-folder is stored.

    Notes
    -----
    A model is defined by a folder placed in the :file:`data/models`
    directory of the LingPy package. The name of the folder reflects the
    name of the model. It contains the files :file:`converter` and
    :file:`INFO`, and optionally the files :file:`matrix` and
    :file:`scorer`. The format requirements for these files are as follows:

    :file:`INFO`
        The ``INFO``-file serves as a reference for a given sound-class
        model. It can contain arbitrary information (and also be empty). If
        one wants to define specific characteristics, like the ``source``,
        the ``compiler``, the ``date``, or a ``description`` of a given
        model, this can be done by employing a key-value structure in which
        the key is preceded by an ``@`` and followed by a colon, with the
        value written right next to the key in the same line, e.g.::

            @source: Dolgopolsky (1986)

        This information will then be read from the ``INFO`` file and
        rendered when printing the model to screen with help of the
        :py:func:`print` function.

    :file:`converter`
        The ``converter`` file contains all sound classes which are matched
        with their respective sound values. Each line is reserved for one
        class, preceded by the key (preferably an ASCII-letter)
        representing the class::

            B : ɸ, β, f, p͡f, p͜f, ƀ
            E : ɛ, æ, ɜ, ɐ, ʌ, e, ᴇ, ə, ɘ, ɤ, è, é, ē, ě, ê, ɚ
            D : θ, ð, ŧ, þ, đ
            G : x, ɣ, χ
            ...

    :file:`matrix`
        A scoring matrix indicating the alignment scores of all sound-class
        characters defined by the model. The scoring is structured as a
        simple tab-delimited text file. The first cell contains the
        character names, the following cells contain the scores in
        redundant form (with both triangles being filled)::

            B  10.0 -10.0   5.0 ...
            E -10.0   5.0 -10.0 ...
            F   5.0 -10.0  10.0 ...
            ...

    :file:`scorer`
        The ``scorer`` file (which is optional) contains the graph of
        class-transitions which is used for the calculation of the scoring
        dictionary. Each class is listed in a separate line, followed by
        the symbols ``v``, ``c``, or ``t`` (indicating whether the class
        represents vowels, consonants, or tones), and by the classes it is
        directly connected to. The strength of this connection is indicated
        by digits (the smaller the value, the shorter the path between the
        classes)::

            A : v, E:1, O:1
            C : c, S:2
            B : c, W:2
            E : v, A:1, I:1
            D : c, S:2
            ...

        The information in such a file is automatically converted into a
        scoring dictionary (see :evobib:`List2012b` for details).

    Based on the information provided by the files, a dictionary for the
    conversion of IPA-characters to sound classes and a scoring dictionary
    are created and stored as a binary. The model can be loaded with help
    of the :py:class:`~lingpy.data.model.Model` class and used in the
    various classes and functions provided by the library.

    See also
    --------
    lingpy.data.model.Model
    compile_dvt
    """
    log.info("Compiling model <" + model + ">...")

    # get the path to the models
    new_path = lambda *cmps: os.path.join(
        path or util.data_path('models'), model, *cmps)
    log.debug("Model-Path: %s" % new_path())

    # load the sound classes
    sound_classes = _import_sound_classes(new_path('converter'))

    # dump the data
    cache.dump(sound_classes, model + '.converter')
    log.info("... successfully created the converter.")

    # try to load the scoring function or the score tree
    scorer = False
    if os.path.isfile(new_path('matrix')):
        scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer')):
        score_tree = _import_score_tree(new_path('scorer'))

        # calculate the scoring dictionary
        score_dict = _make_scoring_dictionary(score_tree)

        # turn score_dict into a ScoreDict instance
        chars = sorted(set([s[0] for s in score_dict.keys()]))
        matrix = [[0 for i in range(len(chars))] for j in range(len(chars))]
        for (i, charA), (j, charB) in util.multicombinations2(
                enumerate(chars)):
            if i < j:
                matrix[i][j] = score_dict.get((charA, charB), -100)
                matrix[j][i] = score_dict.get((charB, charA), -100)
            elif i == j:
                matrix[i][j] = score_dict[charA, charB]

        scorer = misc.ScoreDict(chars, matrix)
        util.write_text_file(new_path('matrix'), scorer2str(scorer))

    if scorer:
        cache.dump(scorer, model + '.scorer')
        log.info("... successfully created the scorer.")
    else:
        log.info("... no scoring dictionary defined.")

    log.info("Model <" + model + "> was compiled successfully.")
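# Usage sketch (the model name is illustrative; the folder
# data/models/mymodel must contain at least a `converter` file):
# compile_model('mymodel')
# mymodel = Model('mymodel')   # now loadable via the cached binary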