def report(args):
    """
    clpa report <FILE> [rules=FILE] [format=md|csv|cldf] [outfile=FILENAME]

    Note
    ----

    * Rules point to a tab-separated value file in which source and target are
      given to convert a segment to another segment to be applied on a
      data-set-specific basis which may vary from dataset to dataset and can
      thus not be included as standard clpa behaviour.
    * Input file needs to be in csv-format, with tabstop as separator, and it
      needs to contain one column named "TOKENS".
    * format now allows for md (MarkDown), csv (CSV, tab as separator), or cldf
      (no pure cldf but rather current lingpy-csv-format). CLDF format means
      that the original file will be given another two columns, one called
      CLPA_TOKENS, one called CLPA_IDS.
    * if you specify an outfile from the input, the data will be written to
      file instead of showing it on the screen.
    * keyword arguments are split at the first "=" only, so a value (for
      example a file name) may itself contain "=".

    Raises ParserError if no arguments are given or none of them is a file
    name (i.e. every argument contains "=").
    """
    if not args.args:
        raise ParserError('not enough arguments')

    # Every "key=value" argument becomes a setting; any other argument is taken
    # as the input file name (the last one wins).  defaultdict(str) makes
    # unspecified settings read as '' instead of raising KeyError.
    settings = defaultdict(str)
    settings['format'] = 'md'
    fname = None
    for arg in args.args:
        if '=' in arg:
            # partition() splits at the first "=" only; a bare split('=')
            # fails to unpack when the value contains another "=".
            key, _, val = arg.partition('=')
            settings[key] = val
        else:
            fname = arg

    if not fname:
        raise ParserError('no filename passed as argument')

    outfile = settings['outfile'] or None

    wordlist = Wordlist.from_file(fname)
    # Only the per-sound summary is used here; the second return value is not.
    sounds, _errors = wordlist.check(rules=settings['rules'])

    # Any format other than md/csv (e.g. cldf) writes the wordlist itself.
    if settings['format'] not in ('md', 'csv'):
        text = wordlist.write(outfile)
        if not outfile:
            print(text)
        return

    # Bucket the sounds, most frequent first (ties broken by id, descending).
    segments = OrderedDict([('existing', []), ('missing', []), ('convertible', [])])
    ordered = sorted(
        sounds,
        key=lambda x: (sounds[x]['frequency'], sounds[x]['id']),
        reverse=True)
    for sound in ordered:
        entry = sounds[sound]
        clpa = entry['clpa']
        if sound == clpa:
            type_, symbol = 'existing', sound
        elif clpa == '?':
            type_, symbol = 'missing', sound
        else:
            # Neither identical nor unknown: the sound maps to another symbol.
            type_, symbol = 'convertible', sound + ' >> ' + clpa
        # An empty symbol (empty-string sound) is skipped, as before.
        if symbol:
            segments[type_].append([symbol, entry['id'], entry['frequency']])

    if settings['format'] == 'csv':
        with UnicodeWriter(outfile, delimiter='\t') as writer:
            for key, items in segments.items():
                for number, item in enumerate(items, start=1):
                    writer.writerow([number] + item + [key])
        # Without an outfile the writer was given None; show what it collected.
        if not outfile:
            print(writer.read())
        return

    # Markdown: one heading plus table per category.
    header_template = """
# {0} sounds

| number | sound | clpa | frequency |
| ------:| ----- | ---- | ---------:|"""

    lines = []
    for key, items in segments.items():
        lines.append(header_template.format(key.capitalize()))
        for number, item in enumerate(items, start=1):
            lines.append("| {0} | {1[0]} | {1[1]} | {1[2]} |".format(number, item))
    text = '\n'.join(lines)

    if outfile:
        with Path(outfile).open('w', encoding='utf8') as fp:
            fp.write(text)
    else:
        print(text)
def report(args):
    """
    clpa report <FILE> [rules=FILE] [format=md|csv|cldf] [outfile=FILENAME]

    Note
    ----

    * Rules point to a tab-separated value file in which source and target are
      given to convert a segment to another segment to be applied on a
      data-set-specific basis which may vary from dataset to dataset and can
      thus not be included as standard clpa behaviour.
    * Input file needs to be in csv-format, with tabstop as separator, and it
      needs to contain one column named "TOKENS".
    * format now allows for md (MarkDown), csv (CSV, tab as separator), or cldf
      (no pure cldf but rather current lingpy-csv-format). CLDF format means
      that the original file will be given another two columns, one called
      CLPA_TOKENS, one called CLPA_IDS.
    * if you specify an outfile from the input, the data will be written to
      file instead of showing it on the screen.
    * keyword arguments are split at the first "=" only, so a value (for
      example a file name) may itself contain "=".

    Raises ParserError if no arguments are given or none of them is a file
    name (i.e. every argument contains "=").
    """
    if not args.args:
        raise ParserError('not enough arguments')

    # Every "key=value" argument becomes a setting; any other argument is taken
    # as the input file name (the last one wins).  defaultdict(str) makes
    # unspecified settings read as '' instead of raising KeyError.
    settings = defaultdict(str)
    settings['format'] = 'md'
    fname = None
    for arg in args.args:
        if '=' in arg:
            # partition() splits at the first "=" only; a bare split('=')
            # fails to unpack when the value contains another "=".
            key, _, val = arg.partition('=')
            settings[key] = val
        else:
            fname = arg

    if not fname:
        raise ParserError('no filename passed as argument')

    outfile = settings['outfile'] or None

    wordlist = Wordlist.from_file(fname)
    # Only the per-sound summary is used here; the second return value is not.
    sounds, _errors = wordlist.check(rules=settings['rules'])

    # Any format other than md/csv (e.g. cldf) writes the wordlist itself.
    if settings['format'] not in ('md', 'csv'):
        text = wordlist.write(outfile)
        if not outfile:
            print(text)
        return

    # Bucket the sounds, most frequent first (ties broken by id, descending).
    segments = OrderedDict([('existing', []), ('missing', []), ('convertible', [])])
    ordered = sorted(
        sounds,
        key=lambda x: (sounds[x]['frequency'], sounds[x]['id']),
        reverse=True)
    for sound in ordered:
        entry = sounds[sound]
        clpa = entry['clpa']
        if sound == clpa:
            type_, symbol = 'existing', sound
        elif clpa == '?':
            type_, symbol = 'missing', sound
        else:
            # Neither identical nor unknown: the sound maps to another symbol.
            type_, symbol = 'convertible', sound + ' >> ' + clpa
        # An empty symbol (empty-string sound) is skipped, as before.
        if symbol:
            segments[type_].append([symbol, entry['id'], entry['frequency']])

    if settings['format'] == 'csv':
        with UnicodeWriter(outfile, delimiter='\t') as writer:
            for key, items in segments.items():
                for number, item in enumerate(items, start=1):
                    writer.writerow([number] + item + [key])
        # Without an outfile the writer was given None; show what it collected.
        if not outfile:
            print(writer.read())
        return

    # Markdown: one heading plus table per category.
    header_template = """
# {0} sounds

| number | sound | clpa | frequency |
| ------:| ----- | ---- | ---------:|"""

    lines = []
    for key, items in segments.items():
        lines.append(header_template.format(key.capitalize()))
        for number, item in enumerate(items, start=1):
            lines.append("| {0} | {1[0]} | {1[1]} | {1[2]} |".format(number, item))
    text = '\n'.join(lines)

    if outfile:
        with Path(outfile).open('w', encoding='utf8') as fp:
            fp.write(text)
    else:
        print(text)
def _make_one(self, path=None):
    """Build a ``Wordlist`` from *path*.

    When *path* is falsy (the default), the file is looked up via
    ``self.data_path('KSL.tsv')`` instead.
    """
    if not path:
        path = self.data_path('KSL.tsv')
    return Wordlist.from_file(path)