def tsv2triple(wordlist, outfile=None):
    """
    Function converts a wordlist to a triple data structure.

    Notes
    -----
    The basic values of which the triples consist are:

    * ID (the ID in the TSV file)
    * COLUMN (the column in the TSV file)
    * VALUE (the entry in the TSV file)
    """
    tstore = []
    for head in wordlist.header:
        log.debug('tsv2triple: ' + head)
        for key in wordlist:
            tstore.append((key, head.upper(), wordlist[key, head]))

    if outfile:
        out = ''
        for a, b, c in tstore:
            if isinstance(c, list):
                c = ' '.join([str(x) for x in c])
            if c != '-':
                out += '{0}\t{1}\t{2}\n'.format(a, b, c)
        util.write_text_file(outfile, out, normalize='NFC')
    return tstore

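# Hedged usage sketch for tsv2triple: 'KSL.qlc' and 'triples.tsv' are
# illustrative placeholder file names, not files shipped with this module.
# Each triple is (row ID, upper-cased column name, cell value), e.g.
# (1, 'CONCEPT', 'hand'); passing `outfile` additionally writes the store
# as a three-column TSV file.
def _example_tsv2triple():  # pragma: no cover
    from lingpy import Wordlist
    wl = Wordlist('KSL.qlc')
    return tsv2triple(wl, outfile='triples.tsv')
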
def test_convenience():
    from lingpy.log import (
        info, warning, debug, error, deprecated, missing_module, file_written)

    info('m')
    warning('m')
    debug('m')
    error('m')
    deprecated('o', 'n')
    missing_module('m')
    file_written('f')

def normalize_alignment(alignment):
    """
    Function normalizes an alignment.

    Normalization here means that columns consisting only of gaps will be
    deleted, and all sequences will be stretched to equal length by adding
    gap characters at the end of shorter sequences.
    """
    # clone the alignment
    alm_clone = [[x for x in y] for y in alignment]

    # first check for alms of different length
    alm_lens = [len(alm) for alm in alm_clone]
    if alm_lens.count(1) == len(alm_lens):
        for i, alm in enumerate(alm_clone):
            alm_clone[i] = alm[0].split(' ')
            alm_lens[i] = len(alm_clone[i])

    if len(set(alm_lens)) > 1:
        max_len = max(alm_lens)
        for i, alm in enumerate(alm_clone):
            new_alm = alm + ['-' for x in range(max_len)]
            alm_clone[i] = new_alm[:max_len]

    # then check for alms consisting only of gaps
    cols = misc.transpose(alm_clone)
    idxs = []
    for i, col in enumerate(cols):
        if set(col) == set('-'):
            idxs += [i]
    for idx in idxs[::-1]:
        for i, alm in enumerate(alm_clone):
            del alm_clone[i][idx]

    if alignment != alm_clone:
        lgtxt = 'Modified the alignment:\n'
        for i in range(len(alignment)):
            lgtxt += '[!] ' + ' '.join(alignment[i]) + '->'
            lgtxt += ' '.join(alm_clone[i]) + '\n'
        log.debug(lgtxt)
        return alm_clone
    else:
        return alignment

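# Hedged illustration of normalize_alignment: rows of unequal length are
# padded with trailing gaps, and the column consisting only of gaps is
# removed. The toy alignment is made up for illustration.
def _example_normalize_alignment():  # pragma: no cover
    ragged = [
        ['h', 'a', '-', 'n', 'd'],
        ['h', 'a', '-', 'n']]
    # expected result: [['h', 'a', 'n', 'd'], ['h', 'a', 'n', '-']]
    return normalize_alignment(ragged)
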
def _output(self, fileformat, **keywords):
    """
    Internal function that eases its modification by daughter classes.
    """
    # check for stamp attribute
    keywords["stamp"] = getattr(self, '_stamp', '')

    # add the default parameters, they will be checked against the keywords
    util.setdefaults(
        keywords,
        cols=False,
        distances=False,
        entries=("concept", "counterpart"),
        entry='concept',
        fileformat=fileformat,
        filename=rcParams['filename'],
        formatter='concept',
        modify_ref=False,
        meta=self._meta,
        missing=0,
        prettify='false',
        ignore='all',
        ref='cogid',
        rows=False,
        subset=False,  # setup a subset of the data
        taxa='taxa',
        threshold=0.6,  # threshold for flat clustering
        tree_calc='neighbor')

    if fileformat in ['triple', 'triples', 'triples.tsv']:
        return tsv2triple(self, keywords['filename'] + '.' + fileformat)

    if fileformat in ['paps.nex', 'paps.csv']:
        paps = self.get_paps(
            ref=keywords['ref'],
            entry=keywords['entry'],
            missing=keywords['missing'])
        kw = dict(filename=keywords['filename'] + '.paps')
        if fileformat == 'paps.nex':
            kw['missing'] = keywords['missing']
            return pap2nex(self.cols, paps, **kw)
        return pap2csv(self.cols, paps, **kw)

    # simple printing of taxa
    if fileformat == 'taxa':
        assert hasattr(self, 'taxa')
        return util.write_text_file(keywords['filename'] + '.taxa', self.cols)

    # csv-output
    if fileformat in ['csv', 'qlc', 'tsv']:
        # get the header line
        header = sorted(
            [s for s in set(self._alias.values()) if s in self._header],
            key=lambda x: self._header[x])
        header = [h.upper() for h in header]
        self._meta.setdefault('taxa', self.cols)

        # get the data, in case a subset is chosen
        if not keywords['subset']:
            # write stuff to file
            return wl2qlc(header, self._data, **keywords)

        cols, rows = keywords['cols'], keywords['rows']
        if not isinstance(cols, (list, tuple, bool)):
            raise ValueError("[i] Argument 'cols' should be list or tuple.")
        if not isinstance(rows, (dict, bool)):
            raise ValueError("[i] Argument 'rows' should be a dictionary.")

        # check for chosen header
        if cols:
            # get indices for header
            indices = [self._header[x] for x in cols]
            header = [c.upper() for c in cols]
        else:
            indices = [r for r in range(len(self.header))]

        if rows:
            stmts = []
            for key, value in rows.items():
                if key == 'ID':
                    stmts += ["key " + value]
                else:
                    idx = self._header[key]
                    stmts += ["line[{0}] ".format(idx) + value]

        log.debug("calculated what should be excluded")

        # get the data
        out = {}
        for key, line in self._data.items():
            log.debug(key)
            if rows:
                if eval(" and ".join(stmts)):
                    out[key] = [line[i] for i in indices]
            else:
                out[key] = [line[i] for i in indices]

        log.debug("passing data to wl2qlc")
        return wl2qlc(header, out, **keywords)

    # output dst-format (phylip)
    if fileformat == 'dst':
        # check for distances as keyword
        if 'distances' not in self._meta:
            self._meta['distances'] = wl2dst(self, **keywords)
        out = matrix2dst(
            self._meta['distances'], self.taxa,
            stamp=keywords['stamp'], taxlen=keywords.get('taxlen', 0))
        return _write_file(keywords['filename'], out, fileformat)

    # output tre-format (newick)
    if fileformat in ['tre', 'nwk']:  # ,'cluster','groups']:
        if 'tree' not in self._meta:
            # check for distances
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self)
            # we look up a function to calculate a tree in the cluster module:
            tree = getattr(cluster, keywords['tree_calc'])(
                self._meta['distances'], self.cols,
                distances=keywords['distances'])
        else:
            tree = self._meta['tree']
        return _write_file(keywords['filename'], '{0}'.format(tree), fileformat)

    if fileformat in ['cluster', 'groups']:
        if 'distances' not in self._meta:
            self._meta['distances'] = wl2dst(self)
        # check for keywords
        if 'groups' not in self._meta:
            self._meta['groups'] = cluster.matrix2groups(
                keywords['threshold'], self._meta['distances'], self.taxa)
        lines = []
        for taxon, group in sorted(
                self._meta['groups'].items(), key=lambda x: x[0]):
            lines.append('{0}\t{1}'.format(taxon, group))
        return _write_file(keywords['filename'], lines, fileformat)

    if fileformat in ['starling', 'star.csv']:
        # make lambda inline for data-check
        l = lambda x: '-' if x == 0 else x
        lines = []
        if 'cognates' not in keywords:
            lines.append('ID\tConcept\t' + '\t'.join(self.taxa))
            for i, concept in enumerate(self.concepts):
                for line in self.get_list(row=concept, entry=keywords['entry']):
                    lines.append(
                        str(i + 1) + '\t' + concept + '\t' + '\t'.join(
                            [l(t) for t in line]))
        else:
            lines.append(
                'ID\tConcept\t' + '\t'.join(
                    ['{0}\t COG'.format(t) for t in self.taxa]))
            for i, concept in enumerate(self.concepts):
                cogs = self.get_list(row=concept, entry=keywords['cognates'])
                for j, line in enumerate(
                        self.get_list(row=concept, entry=keywords['entry'])):
                    part = '\t'.join(
                        '{0}\t{1}'.format(l(a), b)
                        for a, b in zip(line, cogs[j]))
                    lines.append(util.tabjoin(i + 1, concept, part))

        return _write_file(
            keywords['filename'], lines,
            'starling_' + keywords['entry'] + '.csv')

    if fileformat == 'multistate.nex':
        if not keywords['filename'].endswith('.multistate.nex'):
            keywords['filename'] += '.multistate.nex'
        matrix = wl2multistate(self, keywords['ref'], keywords['missing'])
        return multistate2nex(self.taxa, matrix, keywords['filename'])

    if fileformat == 'separated':
        if not os.path.isdir(keywords['filename']):
            os.mkdir(keywords['filename'])
        for l in self.cols:
            lines = [''] if 'ignore_keys' in keywords else ['ID\t']
            lines[0] += '\t'.join(x.upper() for x in keywords['entries'])
            for key in self.get_list(col=l, flat=True):
                line = [] if 'ignore_keys' in keywords else [key]
                for entry in keywords['entries']:
                    tmp = self[key, entry]
                    if isinstance(tmp, list):
                        tmp = ' '.join([str(x) for x in tmp])
                    line += [tmp]
                lines.append('\t'.join('{0}'.format(x) for x in line))
            _write_file('{0}/{1}'.format(keywords['filename'], l), lines, 'tsv')

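# Hedged usage sketch for the _output dispatcher: LingPy classes expose it
# through a public `output` method, so typical calls look like the ones
# below. 'KSL.qlc' and the output file names are illustrative placeholders.
def _example_output():  # pragma: no cover
    from lingpy import Wordlist
    wl = Wordlist('KSL.qlc')
    # plain TSV dump of the whole wordlist
    wl.output('tsv', filename='ksl-export')
    # phylip-style distance matrix, computed on the fly if not yet cached
    wl.output('dst', filename='ksl-export')
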
def compile_model(model, path=None):
    """
    Function compiles customized sound-class models.

    Parameters
    ----------
    model : str
        A string indicating the name of the model which shall be created.
    path : str
        A string indicating the path where the model-folder is stored.

    Notes
    -----
    A model is defined by a folder placed in the :file:`data/models` directory
    of the LingPy package. The name of the folder reflects the name of the
    model. It contains the files :file:`converter` and :file:`INFO`, and the
    optional files :file:`matrix` and :file:`scorer`. The format requirements
    for these files are as follows:

    :file:`INFO`
        The ``INFO``-file serves as a reference for a given sound-class model.
        It can contain arbitrary information (and also be empty). If one wants
        to define specific characteristics, like the ``source``, the
        ``compiler``, the ``date``, or a ``description`` of a given model,
        this can be done by employing a key-value structure in which the key
        is preceded by an ``@`` and followed by a colon, and the value is
        written right next to the key in the same line, e.g.::

            @source: Dolgopolsky (1986)

        This information will then be read from the ``INFO`` file and rendered
        when printing the model to screen with help of the :py:func:`print`
        function.

    :file:`converter`
        The ``converter`` file contains all sound classes which are matched
        with their respective sound values. Each line is reserved for one
        class, preceded by the key (preferably an ASCII-letter) representing
        the class::

            B : ɸ, β, f, p͡f, p͜f, ƀ
            E : ɛ, æ, ɜ, ɐ, ʌ, e, ᴇ, ə, ɘ, ɤ, è, é, ē, ě, ê, ɚ
            D : θ, ð, ŧ, þ, đ
            G : x, ɣ, χ
            ...

    :file:`matrix`
        A scoring matrix indicating the alignment scores of all sound-class
        characters defined by the model. The scoring is structured as a simple
        tab-delimited text file. The first cell contains the character names,
        the following cells contain the scores in redundant form (with both
        triangles being filled)::

            B   10.0    -10.0   5.0     ...
            E   -10.0   5.0     -10.0   ...
            F   5.0     -10.0   10.0    ...
            ...

    :file:`scorer`
        The ``scorer`` file (which is optional) contains the graph of
        class-transitions which is used for the calculation of the scoring
        dictionary. Each class is listed in a separate line, followed by the
        symbols ``v``, ``c``, or ``t`` (indicating whether the class
        represents vowels, consonants, or tones), and by the classes it is
        directly connected to. The strength of this connection is indicated
        by digits (the smaller the value, the shorter the path between the
        classes)::

            A : v, E:1, O:1
            C : c, S:2
            B : c, W:2
            E : v, A:1, I:1
            D : c, S:2
            ...

        The information in such a file is automatically converted into a
        scoring dictionary (see :evobib:`List2012b` for details).

    Based on the information provided by the files, a dictionary for the
    conversion of IPA-characters to sound classes and a scoring dictionary
    are created and stored as a binary. The model can be loaded with help of
    the :py:class:`~lingpy.data.model.Model` class and used in the various
    classes and functions provided by the library.

    See also
    --------
    lingpy.data.model.Model
    compile_dvt
    """
    log.info("Compiling model <" + model + ">...")

    # get the path to the models
    new_path = lambda *cmps: os.path.join(
        path or util.data_path('models'), model, *cmps)
    log.debug("Model-Path: %s" % new_path())

    # load the sound classes
    sound_classes = _import_sound_classes(new_path('converter'))

    # dump the data
    cache.dump(sound_classes, model + '.converter')
    log.info("... successfully created the converter.")

    # try to load the scoring function or the score tree
    scorer = False
    if os.path.isfile(new_path('matrix')):
        scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer')):
        score_tree = _import_score_tree(new_path('scorer'))

        # calculate the scoring dictionary
        score_dict = _make_scoring_dictionary(score_tree)

        # make score_dict a ScoreDict instance
        chars = sorted(set([s[0] for s in score_dict.keys()]))
        matrix = [[0 for i in range(len(chars))] for j in range(len(chars))]
        for (i, charA), (j, charB) in util.multicombinations2(enumerate(chars)):
            if i < j:
                matrix[i][j] = score_dict.get((charA, charB), -100)
                matrix[j][i] = score_dict.get((charB, charA), -100)
            elif i == j:
                matrix[i][j] = score_dict[charA, charB]
        scorer = misc.ScoreDict(chars, matrix)
        util.write_text_file(new_path('matrix'), scorer2str(scorer))

    if scorer:
        cache.dump(scorer, model + '.scorer')
        log.info("... successfully created the scorer.")
    else:
        log.info("... no scoring dictionary defined.")

    log.info("Model <" + model + "> was compiled successfully.")

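# Hedged usage sketch for compile_model: 'custom' is a hypothetical model
# folder under /tmp/models containing at least a `converter` file (plus the
# optional `matrix` or `scorer` files described above). Loading via Model
# assumes it accepts the same `path` argument as compile_model.
def _example_compile_model():  # pragma: no cover
    from lingpy.data import Model
    compile_model('custom', path='/tmp/models')
    return Model('custom', path='/tmp/models')
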
def star2qlc(filename, clean_taxnames=False, debug=False):
    """
    Converts a file directly output from starling to LingPy-QLC format.
    """
    cleant = clean_taxnames or identity
    data = csv2list(filename)

    # check for strange chars in data due to notepad errors
    data[0][0] = data[0][0].replace('\ufeff', '')

    # get the header
    header = data[0]

    # debugging
    if debug:
        error = False
        log.info("Header line has length {0}.".format(len(header)))
        for line in data[1:]:
            if len(line) != len(header):  # pragma: no cover
                log.error(
                    "Error for item {0} with length {1}, expected {2}.".format(
                        '/'.join(line[0:2]), len(line), len(header)))
                error = True
        if error:  # pragma: no cover
            log.error("Errors were found, aborting function call.")
            return
        else:
            log.info("Everything went fine, carrying on with function call.")

    # determine language names in header
    taxa = []
    for i in range(len(header) - 1):
        prev = header[i]
        post = header[i + 1]
        if prev in post and '#' in post:
            taxa += [prev]
            if len(taxa) == 1:
                lngIdx = i
        if prev == 'Number':
            numIdx = i
        if prev == 'Word':
            wrdIdx = i

    log.info('starling, indices (%s, %s, %s)' % (lngIdx, numIdx, wrdIdx))
    log.info('starling, taxa: %s' % taxa)

    # start filling in the dictionary
    D = {0: [
        'DOCULECT', 'CONCEPT', 'GLOSSID', 'WORDINSOURCE', 'ORTHOGRAPHY',
        'IPA', 'COGID']}

    idx = 1
    cognate_counter = 0
    current_concept = ''
    cognate_sets = []
    for line in data[2:]:
        gloss = line[wrdIdx]
        gnum = line[numIdx]

        # switch to next cognate set if there is a switch in concepts
        if current_concept != gloss and len(cognate_sets) != 0:
            max_cog = max(cognate_sets)
            cognate_counter = max_cog
            cognate_sets = []
            current_concept = gloss
        else:
            log.debug('starling, concept state (%s, %s, %s)' % (
                gloss, current_concept, cognate_counter))

        for i in range(lngIdx, len(header), 2):
            word = line[i]
            if '{' in word:
                ipa = word[:word.index('{')].strip()
                ortho = word[word.index('{') + 1:word.index('}')].strip()
            else:
                ipa = word
                ortho = word
            cogid = int(line[i + 1])
            if cogid != 0 and word:
                if cogid > 0:
                    cogid = cogid + cognate_counter
                # append cognate sets, essential for raising the counter
                cognate_sets += [int(cogid)]
                taxon = cleant(header[i])
                D[idx] = [taxon, gloss, gnum, word, ortho, ipa, cogid]
                idx += 1

    # re-iterate through data and reassign cognate sets with negative ids
    for k in D:
        if k:
            cogid = D[k][-1]
            if cogid < 0:
                cogid = -cognate_counter
                cognate_counter += 1
                D[k][-1] = cogid

    return D

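# Hedged usage sketch for star2qlc: 'starling-export.csv' is a hypothetical
# Starling export file. Row 0 of the returned dictionary holds the header,
# so the result can be passed straight to a Wordlist constructor.
def _example_star2qlc():  # pragma: no cover
    from lingpy import Wordlist
    D = star2qlc('starling-export.csv', debug=True)
    return Wordlist(D)
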
def mcl(
        threshold,
        matrix,
        taxa,
        max_steps=1000,
        inflation=2,
        expansion=2,
        add_self_loops=True,
        revert=False,
        logs=True,
        matrix_type="distances"):
    """
    Carry out a clustering using the MCL algorithm (:evobib:`Dongen2000`).

    Parameters
    ----------
    threshold : {float, bool}
        The threshold that shall be used for the initial selection of links
        assigned to the data. If set to c{False}, the weights from the matrix
        will be used directly.
    matrix : list
        A two-dimensional list containing the distances.
    taxa : list
        A list containing the names of all taxa corresponding to the
        distances in the matrix.
    max_steps : int (default=1000)
        Maximal number of iterations.
    inflation : int (default=2)
        Inflation parameter for the MCL algorithm.
    expansion : int (default=2)
        Expansion parameter of the MCL algorithm.
    add_self_loops : {True, False, builtins.function} (default=True)
        Determine whether self-loops should be added, and if so, how they
        should be weighted. If a function for the calculation of self-loops
        is given, it will take the whole column of the matrix for each taxon
        as input.
    revert : bool (default=False)
        If set to c{True}, return a dictionary mapping taxon indices to
        cluster IDs instead of a dictionary mapping cluster IDs to taxa.
    logs : {bool, function} (default=True)
        If set to c{True}, the logarithm of the score beyond the threshold
        will be assigned as weight to the graph. If set to c{False} all
        weights will be set to 1. Use a custom function to define individual
        ways to calculate the weights.
    matrix_type : {"distances", "similarities"}
        Specify the type of the matrix. If the matrix contains distance data,
        it will be adapted to similarity data. If it contains "similarities",
        no adaptation is needed.

    Examples
    --------
    The function is automatically imported along with LingPy.

    >>> from lingpy import *
    >>> from lingpy.algorithm import squareform

    Create a list of arbitrary taxa.

    >>> taxa = ['German', 'Swedish', 'Icelandic', 'English', 'Dutch']

    Create an arbitrary distance matrix.

    >>> matrix = squareform([0.5, 0.67, 0.8, 0.2, 0.4, 0.7, 0.6, 0.8, 0.8, 0.3])
    >>> matrix
    [[0.0, 0.5, 0.67, 0.8, 0.2], [0.5, 0.0, 0.4, 0.7, 0.6], [0.67, 0.4, 0.0, 0.8, 0.8], [0.8, 0.7, 0.8, 0.0, 0.3], [0.2, 0.6, 0.8, 0.3, 0.0]]

    Carry out the clustering analysis.

    >>> mcl(0.5, matrix, taxa)
    {1: ['German', 'English', 'Dutch'], 2: ['Swedish', 'Icelandic']}
    """
    # check for type of matrix
    if not isinstance(matrix, np.ndarray):
        imatrix = np.array(matrix)
    else:
        imatrix = matrix.copy()

    # check for matrix type and decide how to handle logs
    if matrix_type == 'distances':
        evaluate = lambda x: x < threshold
        if logs is True:
            logs = lambda x: -np.log2((1 - x) ** 2)
        elif logs is False:
            logs = lambda x: x
    elif matrix_type == 'similarities':
        evaluate = lambda x: x > threshold
        if logs is True:
            logs = lambda x: -np.log(x ** 2)
        else:
            logs = lambda x: x
    else:
        raise ValueError(matrix_type)

    # check for threshold
    if threshold:
        for i, j in util.combinations2(range(len(imatrix))):
            score = imatrix[i][j]
            evaluation = logs(score) if evaluate(score) else 0
            imatrix[i][j] = evaluation
            imatrix[j][i] = evaluation

    # check for self_loops
    if add_self_loops is True:
        for i in range(len(imatrix)):
            imatrix[i][i] = 1
    elif add_self_loops is False:
        pass
    else:
        for i in range(len(imatrix)):
            imatrix[i][i] = add_self_loops(imatrix[:, i])

    # normalize the matrix
    imatrix = _normalize_matrix(imatrix)

    # iterate until convergence or the step limit is reached
    steps = 0
    while True:
        # expansion
        imatrix = np.linalg.matrix_power(imatrix, expansion)
        # inflation
        imatrix = imatrix ** inflation
        # normalization
        imatrix = _normalize_matrix(imatrix)
        steps += 1

        # check for matrix convergence
        if steps >= max_steps or _is_idempotent(imatrix):
            log.debug("Number of steps {0}.".format(steps))
            break

    # retrieve the clusters
    clusters = _interprete_matrix(imatrix)

    # modify clusters
    if revert:
        return dict(zip(range(len(taxa)), clusters))

    clr = defaultdict(list)
    for i, t in enumerate(taxa):
        clr[clusters[i]].append(t)
    return clr

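# Hedged sketch complementing the doctest above: mcl on similarity data with
# a custom self-loop weight (the maximum of each matrix column). The toy
# similarity values are made up for illustration.
def _example_mcl_custom_loops():  # pragma: no cover
    taxa = ['A', 'B', 'C']
    sims = [
        [1.0, 0.8, 0.1],
        [0.8, 1.0, 0.2],
        [0.1, 0.2, 1.0]]
    # links below the 0.5 threshold are zeroed; each diagonal cell is then
    # set to the maximum of its (already reweighted) column
    return mcl(
        0.5, sims, taxa,
        matrix_type='similarities',
        add_self_loops=lambda col: max(col))
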