def cmd_download(self, **kw):
    # download source
    self.raw.write('sources.bib', getEvoBibAsBibtex(SOURCE, **kw))

    # download data
    all_records = []
    for i in pb(list(range(1, 20 * self.pages + 1, 20))):
        with self.raw.temp_download(self._url(i), 'file-{0}'.format(i), log=self.log) as fname:
            soup = BeautifulSoup(
                fname.open(encoding='utf8').read(), 'html.parser')
            for record in soup.findAll(name='div', attrs={"class": "results_record"}):
                if isinstance(record, bs4.element.Tag):
                    children = list(record.children)
                    number = children[0].findAll('span')[1].text.strip()
                    concept = children[1].findAll('span')[1].text
                    for child in children[2:]:
                        if isinstance(child, bs4.element.Tag):
                            dpoints = child.findAll('span')
                            if len(dpoints) >= 3:
                                lname = dpoints[1].text
                                glottolog = re.findall(
                                    'Glottolog: (........)', str(dpoints[1]))[0]
                                entry = dpoints[2].text
                                cogid = list(child.children)[4].text.strip()
                                all_records.append(
                                    (number, concept, lname, glottolog, entry, cogid))
    with UnicodeWriter(self.raw.posix('output.csv')) as f:
        f.writerows(all_records)
def geo(args):
    with_session(args)
    fname = args.pkg_dir.joinpath('static', 'download', 'languages-and-dialects-geo.csv')
    with transaction.manager, UnicodeWriter(fname) as writer:
        writer.writerow([
            'glottocode', 'name', 'isocodes', 'level', 'macroarea', 'latitude', 'longitude'])
        for l in DBSession.query(models.Languoid)\
                .filter(or_(
                    models.Languoid.level == models.LanguoidLevel.dialect,
                    models.Languoid.level == models.LanguoidLevel.language))\
                .options(
                    joinedload(models.Languoid.macroareas),
                    joinedload_all(
                        common.Language.languageidentifier,
                        common.LanguageIdentifier.identifier))\
                .order_by(common.Language.name):
            writer.writerow([
                l.id,
                l.name,
                ' '.join(i.name for i in l.get_identifier_objs(common.IdentifierType.iso)),
                l.level,
                l.macroareas[0].name if l.macroareas else '',
                l.latitude if l.latitude is not None else '',
                l.longitude if l.longitude is not None else ''])
    args.log.info('{0} written'.format(fname))
def iso2codes(args):
    from clldutils.dsv import UnicodeWriter

    nodes = list(args.repos.languoids())

    res = {}
    for node in nodes:
        if node.iso:
            res[node.id] = (node.iso, set())

    for node in nodes:
        if node.level == Level.family or node.id in res:
            continue
        for nid in res:
            matched = False
            for l in node.lineage:
                if l[1] == nid:
                    res[nid][1].add(node.id)
                    matched = True
                    break
            if matched:
                break

    with UnicodeWriter('iso2glottocodes.csv') as writer:
        writer.writerow(['iso', 'glottocodes'])
        for gc, (iso, gcs) in res.items():
            writer.writerow([iso, ';'.join([gc] + list(gcs))])
def markconservative(m, trigs, ref, hht, outfn, verbose=True, rank=None):
    blamefield = "hhtype"
    mafter = markall(m, trigs, verbose=verbose, rank=rank)
    ls = lstat(ref, hht)
    lsafter = lstat_witness(mafter, hht)
    log = []
    for (lg, (stat, wits)) in lsafter.items():
        if not ls.get(lg):
            if verbose:
                print(lg, "lacks status", [mafter[k][1]['srctrickle'] for k in wits])
            continue
        if hht[stat] > hht[ls[lg]]:
            log = log + [
                (lg,
                 [(mafter[k][1].get(blamefield, "No %s" % blamefield),
                   k,
                   mafter[k][1].get('title', 'no title'),
                   mafter[k][1].get('srctrickle', 'no srctrickle')) for k in wits],
                 ls[lg])]
            for k in wits:
                (t, f) = mafter[k]
                if blamefield in f:
                    del f[blamefield]
                mafter[k] = (t, f)
    with UnicodeWriter(outfn, dialect='excel-tab') as writer:
        writer.writerows(((lg, was) + mis for (lg, miss, was) in log for mis in miss))
    return mafter
def get_values(self, p, language_url_pattern):
    q = DBSession.query(Value).join(Value.valueset)\
        .filter(ValueSet.parameter_pk == p.pk)\
        .options(
            joinedload(Value.valueset, ValueSet.language),
            joinedload(Value.valueset, ValueSet.contribution),
            joinedload(Value.domainelement),
            joinedload_all(Value.valueset, ValueSet.references, ValueSetReference.source)
        ).order_by(ValueSet.parameter_pk, ValueSet.language_pk, Value.pk)

    with UnicodeWriter() as writer:
        writer.writerow([
            'ID',
            'Language_ID',
            'Parameter_ID',
            'Contribution_ID',
            'Value',
            'Source',
            'Comment',
        ])
        for v in page_query(q):
            writer.writerow([
                v.id,
                language_url_pattern.format(v.valueset.language.id),
                p.id,
                v.valueset.contribution.id,
                v.domainelement.name if v.domainelement else v.name,
                ';'.join(self.format_sources(v)),
                getattr(v, 'comment', v.valueset.source) or '',
            ])

    return writer.read()
def locations(glottolog, fid, outpath):
    with UnicodeWriter(outpath) as writer:
        writer.writerow(['name', 'glottocode', 'latitude', 'longitude'])
        for lang in glottolog.languoids():
            if lang.level == Level.language and lang.latitude is not None:
                if fid in [l[1] for l in lang.lineage]:
                    writer.writerow([lang.name, lang.id, lang.latitude, lang.longitude])
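# Usage sketch for locations() above (assumptions: pyglottolog is installed,
# 'path/to/glottolog' stands in for a local clone of the Glottolog data
# repository, and 'aust1307' is only an example family id):
from pathlib import Path
from pyglottolog import Glottolog

locations(Glottolog('path/to/glottolog'), 'aust1307', Path('locations.csv'))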
def languoids(langs, outdir):
    with UnicodeWriter(outdir.joinpath('csv', 'glottolog.csv')) as writer:
        writer.writerow(['id', 'name', 'family_id', 'family_name', 'iso_code'])
        for lang in sorted(langs):
            writer.writerow([
                lang.id,
                lang.name,
                lang.lineage[0][1] if lang.lineage else '',
                lang.lineage[0][0] if lang.lineage else '',
                lang.iso or ''])
def render(self, ctx, req):
    with UnicodeWriter() as writer:
        writer.writerow(self.header(ctx, req))
        for item in ctx.get_query(limit=csv.QUERY_LIMIT):
            writer.writerow(self.row(ctx, req, item))
    return writer.read()
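# Illustrative sketch of the in-memory UnicodeWriter pattern used by the
# renderers in this collection (assumption: clldutils is installed; depending
# on the clldutils version, read() returns str or UTF-8 encoded bytes):
from clldutils.dsv import UnicodeWriter

with UnicodeWriter() as writer:       # no target file: rows are buffered in memory
    writer.writerow(['ID', 'Name'])
    writer.writerow(['1', 'Ainu'])
serialized = writer.read()            # retrieve the buffered CSV after the block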
def write(self, path, sep="\t"):
    with UnicodeWriter(path, delimiter=sep) as writer:
        for i, item in enumerate(self):
            if i == 0:
                writer.writerow(list(item.keys()))
            writer.writerow(list(item.values()))
    if path is None:
        return writer.read()
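# Usage sketch for the write() method above (assumption: `table` is an
# instance of the defining class, i.e. an iterable of dict-like rows sharing
# the same keys; the first row supplies the header):
table.write('rows.tsv')       # write tab-separated rows to a file
text = table.write(None)      # path=None: return the serialized table instead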
def render(self, ctx, req):
    with UnicodeWriter() as writer:
        rows = iter(ctx.get_query(limit=QUERY_LIMIT))
        first = next(rows, None)
        if first is not None:
            cols = first.csv_head()
            writer.writerow(cols)
            for item in chain([first], rows):
                writer.writerow(item.to_csv(ctx=ctx, req=req, cols=cols))
    return writer.read()
def to_csvfile(self, filename):
    """Write a CSV file with one row for each entry in each bibfile."""
    with self.connect() as conn:
        cursor = conn.execute(
            'SELECT filename, bibkey, hash, cast(id AS text) AS id '
            'FROM entry ORDER BY lower(filename), lower(bibkey), hash, id')
        with UnicodeWriter(filename) as writer:
            writer.writerow([col[0] for col in cursor.description])
            for row in cursor:
                writer.writerow(row)
def render(self, data, accepted_media_type=None, renderer_context=None):
    "Renders a list of SocietyResultSets to CSV"
    if data is None:
        return ''

    results = DPLACECSVResults(data)
    with UnicodeWriter() as writer:
        writer.writerow([CSV_PREAMBLE])
        writer.writerow(results.field_names)
        for row in results:
            writer.writerow(row)
    return writer.read()
def xls2csv(fname, outdir=None):
    res = {}
    outdir = outdir or fname.parent
    wb = xlrd.open_workbook(as_posix(fname))
    for sname in wb.sheet_names():
        sheet = wb.sheet_by_name(sname)
        if sheet.nrows:
            path = outdir.joinpath(
                fname.stem + '.' + slug(sname, lowercase=False) + '.csv')
            with UnicodeWriter(path) as writer:
                for i in range(sheet.nrows):
                    writer.writerow([col.value for col in sheet.row(i)])
            res[sname] = path
    return res
def render(self, ctx, req):
    fid = req.route_url('parameter', id='xxx').replace('xxx', '{0}')
    lid = req.route_url('language', id='xxx').replace('xxx', '{0}')

    with UnicodeWriter() as writer:
        writer.writerow(['Language_ID', 'Feature_ID', 'Value'])
        for _lid, _fid, v in DBSession.query(
                Language.id, Parameter.id, Value.name)\
                .filter(Language.pk == ValueSet.language_pk)\
                .filter(Parameter.pk == ValueSet.parameter_pk)\
                .filter(Value.valueset_pk == ValueSet.pk)\
                .order_by(Parameter.pk, Language.id):
            if v:
                writer.writerow([lid.format(_lid), fid.format(_fid), v])
    return writer.read()
def download(dataset, **kw):
    def rp(*names):
        return dataset.raw.joinpath(*names).as_posix()

    download_and_unpack_zipfiles(URL, dataset, FNAME)
    check_call(
        'libreoffice --headless --convert-to docx %s --outdir %s' % (rp(FNAME), rp()),
        shell=True)
    doc = Document(rp(Path(FNAME).stem + '.docx'))
    for i, table in enumerate(doc.tables):
        with UnicodeWriter(rp('%s.csv' % (i + 1,))) as writer:
            for row in table.rows:
                writer.writerow(map(text_and_color, row.cells))
def lookup(args):
    """
    Looks up a single gloss from the commandline.

    concepticon lookup <gloss1 gloss2 ... glossN>
    """
    api = Concepticon()
    found = api.lookup(
        args.args,
        language=args.language,
        full_search=args.full_search,
        similarity_level=args.similarity)
    with UnicodeWriter(None, delimiter='\t') as writer:
        writer.writerow(['GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'SIMILARITY'])
        for f in found:
            writer.writerow(f)
        print(writer.read().decode('utf-8'))
def write_tree(tree, fname, taxa_in_dplace, societies_by_glottocode):
    if not fname.exists():
        fname.mkdir()
    tree.prune([n.encode('ascii') for n in taxa_in_dplace])
    with fname.joinpath('summary.trees').open('w', encoding="utf-8") as handle:
        handle.write(NEXUS_TEMPLATE.format(
            tree.name if tree.name else 'UNTITLED', tree.write(format=9)))
    with UnicodeWriter(fname.joinpath('taxa.csv')) as writer:
        writer.writerow(['taxon', 'glottocode', 'xd_ids', 'soc_ids'])
        for gc in sorted(taxa_in_dplace):
            socs = societies_by_glottocode[gc]
            writer.writerow([
                gc,
                gc,
                ', '.join(set(s.xd_id for s in socs)),
                ', '.join(s.id for s in socs)
            ])
    return tree
def orthography(args):  # pragma: no cover
    ds = get_dataset(args)
    out = ds.dir.joinpath('orthography.tsv')
    if out.exists():
        if not confirm(
                'There already is an orthography profile for this dataset. Overwrite?',
                default=False):
            return

    graphemes = Counter()
    for line in ds.iter_raw_lexemes():
        graphemes.update(grapheme_pattern.findall(line))

    with UnicodeWriter(out, delimiter='\t') as writer:
        writer.writerow(['graphemes', 'frequency', 'IPA'])
        for grapheme, frequency in graphemes.most_common():
            writer.writerow([grapheme, '{0}'.format(frequency), grapheme])

    log_dump(out, log=args.log)
def create(self, req, filename=None, verbose=True):  # pragma: no cover
    meanings = [
        (p.name, p.id) for p in DBSession.query(Parameter).order_by(Parameter.pk)]

    tmp = mkdtemp()
    path = os.path.join(tmp, 'asjp.tab')
    with UnicodeWriter(f=path, delimiter=binary_type("\t")) as writer:
        writer.writerow([f[0] for f in self.fields] + [m[0] for m in meanings])
        for lang in DBSession.query(Doculect).order_by(Doculect.pk).options(
                joinedload_all(Language.valuesets, ValueSet.values),
                joinedload_all(Language.valuesets, ValueSet.parameter)).limit(10000):
            row = [f[1](lang) for f in self.fields]
            vss = {vs.parameter.id: vs for vs in lang.valuesets}
            row.extend([Doculect.format_words(vss.get(m[1])) for m in meanings])
            writer.writerow(row)

    Download.create(self, req, filename=path)
    rmtree(tmp)
def _freeze(table, fpath):
    def conv(v, col):
        if v is None:
            return ''
        if isinstance(col.type, DeclEnumType):  # pragma: no cover
            return v.value
        if isinstance(col.type, JSONEncodedDict):
            return json.dumps(v)
        if isinstance(v, (datetime, date)):
            return v.isoformat()
        return v

    keys = [col.name for col in table.columns]
    cols = {col.name: col for col in table.columns}
    rows = [keys]

    for row in DBSession.execute(select([table])):
        rows.append([conv(row[key], cols[key]) for key in keys])

    if len(rows) > 1:
        with UnicodeWriter(fpath) as writer:
            writer.writerows(rows)
def write_conceptlist(clist, filename, header=False):
    """
    Write conceptlist to file.
    """
    def natural_sort(l):
        """
        Code-piece from
        http://stackoverflow.com/questions/4836710/does-python-have-a-built-in-function-for-string-natural-sort
        """
        convert = lambda text: int(text) if text.isdigit() else text.lower()
        alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
        return sorted(l, key=alphanum_key)

    header = header or clist['header']
    keys = natural_sort(list(clist.keys()))

    with UnicodeWriter(filename, delimiter='\t') as writer:
        writer.writerow(header)
        for k in keys:
            v = clist[k]
            if k not in ['splits', 'mergers', 'header']:
                writer.writerow([v[h] for h in header])
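# Minimal, self-contained demonstration of the natural_sort helper used above
# (re-implemented here for illustration only), showing why numeric-aware
# sorting matters for conceptlist keys:
import re

def _natural_sort(items):
    convert = lambda text: int(text) if text.isdigit() else text.lower()
    return sorted(items, key=lambda key: [convert(c) for c in re.split('([0-9]+)', key)])

assert _natural_sort(['10', '2', '1']) == ['1', '2', '10']   # plain sorted() gives ['1', '10', '2']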
def to_csv(args):
    """
    Parses the data file 'org_data/records.tsv' into single csv files in 'raw'.
    In addition it outputs warnings encountered while parsing.
    If you only want to check the data integrity of 'org_data/records.tsv',
    pass the argument 'check' -> amsd to_csv check.
    """
    raw_path = Path(__file__).resolve().parent.parent.parent / 'raw'
    if not raw_path.exists():
        raw_path.mkdir()

    csv_dataframe = {
        'sticks': [],
        'keywords': {},
        'sem_domain': {},
        'linked_filenames': {},
        'item_type': {},
        'material': {},
        'technique': {},
        'ling_area': {},
        'source_citation': {},
        'source_type': {},
        'holder_file': {},
        'data_entry': {}
    }

    datafile = Path(__file__).resolve().parent.parent.parent / 'org_data' / 'records.tsv'
    with UnicodeReader(datafile, delimiter='\t') as reader:
        for i, row in enumerate(reader):
            data = []
            if i == 0:  # header
                data.append('pk')  # add pk
                for j, col in enumerate(row):
                    data.append(fields[j][2].strip())
            else:
                data.append(i)  # add id
                for j, col_ in enumerate(row):
                    if j > 41 and len(col_):
                        print('Error: too many filled columns for line %i' % (i + 1))
                        continue
                    if re.sub(r'[ ]+', '', col_) == '':
                        data.append('')
                    else:
                        col = col_.strip()
                        if fields[j][2] in fields_not_in_sticks \
                                and fields[j][2] not in ['linked_filenames', 'source_citation']:
                            col = col.lower()
                        if fields[j][0] == 0:
                            if fields[j][2] in ['lat', 'long']:
                                try:
                                    data.append(dms2dec(col))
                                except:
                                    print('Error: check lat/long notation in line %i for "%s"'
                                          % (i + 1, col))
                                    data.append(col)
                            else:
                                data.append(col)
                        elif fields[j][0] == 1 and len(fields[j][3]) == 0:
                            if col not in csv_dataframe[fields[j][2]]:
                                csv_dataframe[fields[j][2]][col] = len(
                                    csv_dataframe[fields[j][2]]) + 1
                            data.append(csv_dataframe[fields[j][2]][col])
                        elif fields[j][0] == 1 and len(fields[j][3]) > 1:
                            ref_data = []
                            if re.match(r'^ling_area_\d+$', fields[j][2]):
                                try:
                                    data_array = ["|".join(
                                        [i.strip() for i in list(re.findall(fields[j][3], col)[0])])]
                                except:
                                    print('Error: %s in line %i has wrong structure: %s'
                                          % (fields[j][2], i + 1, col))
                                    data_array = []
                            else:
                                data_array = re.split(fields[j][3], col)
                            for item_ in data_array:
                                item = item_.strip()
                                col_name = fields[j][2]
                                if re.match(r'^ling_area_\d+$', col_name):
                                    col_name = 'ling_area'
                                    if item not in csv_dataframe[col_name]:
                                        csv_dataframe[col_name][item] = len(
                                            csv_dataframe[col_name]) + 1
                                    ref_data.append(csv_dataframe[col_name][item])
                                elif col_name in ['holder_file']:
                                    if item not in csv_dataframe[col_name]:
                                        csv_dataframe[col_name][item] = len(
                                            csv_dataframe[col_name]) + 1
                                    ref_data.append(csv_dataframe[col_name][item])
                                else:
                                    dfkey = 'x_sticks_' + col_name
                                    if item not in csv_dataframe[col_name]:
                                        csv_dataframe[col_name][item] = len(
                                            csv_dataframe[col_name]) + 1
                                    if not csv_dataframe[col_name][item] in ref_data:
                                        ref_data.append(csv_dataframe[col_name][item])
                                    if dfkey not in csv_dataframe:
                                        # header
                                        csv_dataframe[dfkey] = []
                                        csv_dataframe[dfkey].append(
                                            ['stick_pk', col_name + '_pk'])
                                    csv_dataframe[dfkey].append(
                                        [i, csv_dataframe[col_name][item]])
                            # save ids to related table as semicolon separated lists of ids
                            data.append(';'.join(map(str, ref_data)))
            csv_dataframe['sticks'].append(data)

    with get_catalog() as cat:
        images_objs = {obj.metadata['name']: obj for obj in cat}

    # look for similar entries
    for t, k in [('source_citation', 5), ('holder_file', 4), ('ling_area', 10), ('material', 1)]:
        check_sim = list(csv_dataframe[t].keys())
        for i in range(len(check_sim)):
            for j in range(i + 1, len(check_sim)):
                if sim(check_sim[i], check_sim[j]) < k:
                    print('sim check: %s\n%s\n%s\n' % (t, check_sim[i], check_sim[j]))
    if not args.args or args.args[0].lower() != 'check':
        for filename, data in csv_dataframe.items():
            with UnicodeWriter(raw_path.joinpath(filename + '.csv')) as writer:
                if type(data) is list:
                    for item in data:
                        writer.writerow(item)
                else:
                    d = []
                    if filename == 'ling_area':
                        d.append([
                            'pk', 'chirila_name', 'austlang_code', 'austlang_name',
                            'glottolog_code'])
                        for k, v in data.items():
                            c, ac, an, g = re.split(r'\|', k)
                            if g == 'no code':
                                g = ''
                            d.append([v, c, ac, an, g])
                    elif filename == 'linked_filenames':
                        d.append(['pk', 'name', 'oid', 'path'])
                        for k, v in data.items():
                            k_ = os.path.splitext(k)[0]
                            if k_ in images_objs:
                                url_path = ''
                                for o in images_objs[k_].bitstreams:
                                    if o.id not in ['thumbnail.jpg', 'web.jpg']:
                                        url_path = o.id
                                        break
                                if url_path == '':
                                    print("no path found for %s" % (k_))
                                d.append([v, k, images_objs[k_].id, url_path])
                            else:
                                print("no image match for '%s'" % (k))
                                d.append([v, k, ''])
                    else:
                        d.append(['pk', 'name'])
                        for k, v in data.items():
                            d.append([v, k])
                    for item in d:
                        writer.writerow(item)
def extract(args):
    import argparse
    usage = """
    dplace %(prog)s - extracts subsets of data for further processing.

    To filter societies:
    > dplace %(prog)s --society Cj4,Cj5,Cj6 output.csv

    To filter societies on a given tree:
    > dplace %(prog)s --tree gray_et_al2009 output.csv

    To filter societies only from a given dataset:
    > dplace %(prog)s --dataset EA output.csv
    """
    parser = argparse.ArgumentParser(prog='extract', usage=usage)
    parser.add_argument('filename', help='filename', default=None)
    parser.add_argument('--society', help='restrict to these society ids (x,y,z)', default=None)
    parser.add_argument('--tree', help='restrict to this tree', default=None)
    parser.add_argument('--dataset', help='restrict to these datasets (x,y,z)', default=None)
    parser.add_argument('--variable', help='restrict to these variables (x,y,z)', default=None)
    xargs = parser.parse_args(args.args)

    datasets = xargs.dataset.split(",") if xargs.dataset else None
    variables = xargs.variable.split(",") if xargs.variable else None
    societies = xargs.society.split(",") if xargs.society else None

    # get tree if given
    if xargs.tree:
        # get trees
        trees = {t.id: t for t in args.repos.phylogenies}
        # dict.get returns None for unknown keys rather than raising, so check explicitly
        tree = trees.get(xargs.tree)
        if tree is None:
            raise SystemExit("Failed to find Tree %s" % xargs.tree)
        societies = [s for sublist in [t.soc_ids for t in tree.taxa] for s in sublist]

    with UnicodeWriter(f=xargs.filename) as out:
        header = [
            'ID', 'XD_ID', 'Glottocode', 'Name', 'OriginalName', 'FocalYear',
            'Latitude', 'Longitude', 'Variable', 'Value'
        ]
        out.writerow(header)
        for record in args.repos.iter_data(
                datasets=datasets, variables=variables, societies=societies):
            s = args.repos.societies.get(record.soc_id, None)
            if s is None:
                # we get these warnings as we are currently missing the SCCS
                # and WNAI data
                args.log.warn("Missing society definition for %s" % record.soc_id)
                continue
            row = [
                s.id, s.xd_id, s.glottocode, s.pref_name_for_society,
                s.ORIG_name_and_ID_in_this_dataset, s.main_focal_year, s.Lat, s.Long,
                record.var_id, record.code
            ]
            out.writerow(row)
def map(self, clist, otherlist=None, out=None, full_search=False,
        similarity_level=5, language='en'):
    assert clist.exists(), "File %s does not exist" % clist
    from_ = []
    for item in read_dicts(clist):
        from_.append((
            item.get('ID', item.get('NUMBER')),
            item.get('GLOSS', item.get('ENGLISH'))))

    to = self._get_map_for_language(language, otherlist)
    if not full_search:
        cmap = concept_map2(
            [i[1] for i in from_],
            [i[1] for i in to],
            similarity_level=similarity_level,
            freqs=self.frequencies,
            language=language)
        good_matches = 0
        with UnicodeWriter(out, delimiter='\t') as writer:
            writer.writerow(
                ['ID', 'GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'SIMILARITY'])
            for i, (fid, fgloss) in enumerate(from_):
                row = [fid, fgloss]
                matches, sim = cmap.get(i, ([], 10))
                if sim <= 5:
                    good_matches += 1
                if not matches:
                    writer.writerow(row + ['', '???', ''])
                elif len(matches) == 1:
                    row.extend([
                        to[matches[0]][0],
                        to[matches[0]][1].split('///')[0],
                        sim])
                    writer.writerow(row)
                else:
                    # we need a list to retain the order by frequency
                    visited = []
                    for j in matches:
                        gls, cid = to[j][0], to[j][1].split('///')[0]
                        if (gls, cid) not in visited:
                            visited += [(gls, cid)]
                    if len(visited) > 1:
                        writer.writerow(['<<<', '', '', ''])
                        for gls, cid in visited:
                            writer.writerow(row + [gls, cid, sim])
                        writer.writerow(['>>>', '', '', ''])
                    else:
                        row.extend([visited[0][0], visited[0][1], sim])
                        writer.writerow(row)
            writer.writerow([
                '#',
                good_matches,
                len(from_),
                '{0:.2f}'.format(good_matches / len(from_))])
    else:
        cmap = concept_map(
            [i[1] for i in from_],
            [i[1] for i in self._get_map_for_language(language, otherlist)],
            similarity_level=similarity_level)
        with UnicodeWriter(out, delimiter='\t') as writer:
            writer.writerow(['ID', 'GLOSS', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS'])
            for i, (fid, fgloss) in enumerate(from_):
                row = [fid, fgloss]
                match = cmap.get(i)
                row.extend(list(to[match[0]]) if match else ['', ''])
                writer.writerow(row)

    if out is None:
        print(writer.read().decode('utf-8'))
def csv(wl, delimiter='\t', **kw):
    with UnicodeWriter(f=None, delimiter=delimiter, **kw) as writer:
        writer.writerow(wl.header)
        for row in wl:
            writer.writerow(row)
    return writer.read().decode('utf8')
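# Usage sketch for csv() above (assumptions: clldutils is installed and its
# UnicodeWriter.read() returns bytes, as the .decode('utf8') above implies;
# _Wordlist is a hypothetical stand-in providing the .header attribute and
# the row iteration that csv() relies on):
class _Wordlist(list):
    header = ['ID', 'FORM']

print(csv(_Wordlist([['1', 'hand'], ['2', 'foot']])))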
def csv_writer(self, comp, name, delimiter=',', suffix='csv'):
    p = self.existing_dir(comp).joinpath('{0}.{1}'.format(name, suffix))
    self.file_written(p)
    return UnicodeWriter(p, delimiter=delimiter)
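# Usage sketch for csv_writer() above (assumption: `obj` is an instance of the
# class providing it, with existing_dir() and file_written() available); the
# returned UnicodeWriter is used as a context manager that opens the file:
with obj.csv_writer('tables', 'values', delimiter='\t', suffix='tsv') as writer:
    writer.writerow(['ID', 'Value'])
    writer.writerow(['1', 'foo'])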
def report(args):
    """
    clpa report <FILE> [rules=FILE] [format=md|csv|cldf] [outfile=FILENAME]

    Note
    ----
    * Rules point to a tab-separated value file in which source and target are
      given to convert a segment to another segment. They are applied on a
      dataset-specific basis, since conversions vary from dataset to dataset
      and can thus not be included as standard clpa behaviour.
    * The input file needs to be in csv format, with tabstop as separator, and
      it needs to contain one column named "TOKENS".
    * format allows for md (Markdown), csv (CSV, tab as separator), or cldf
      (not pure CLDF but rather the current lingpy csv format). CLDF format
      means that the original file will be given another two columns, one
      called CLPA_TOKENS, one called CLPA_IDS.
    * If you specify an outfile, the data will be written to that file instead
      of being shown on the screen.
    """
    if len(args.args) < 1:
        raise ParserError('not enough arguments')

    # get keywords from arguments @xrotwang: is there any better way to do so?
    settings = defaultdict(str)
    settings['format'] = 'md'
    fname = None
    for arg in args.args:
        if '=' in arg:
            key, val = arg.split('=')
            settings[key] = val
        else:
            fname = arg
    if not fname:
        raise ParserError('no filename passed as argument')

    wordlist = Wordlist.from_file(fname)
    sounds, errors = wordlist.check(rules=settings['rules'])

    if settings['format'] not in ['md', 'csv']:
        text = wordlist.write(settings['outfile'] or None)
        if not settings['outfile']:
            print(text)
        return

    segments = OrderedDict([('existing', []), ('missing', []), ('convertible', [])])
    for k in sorted(
            sounds, key=lambda x: (sounds[x]['frequency'], sounds[x]['id']), reverse=True):
        type_, symbol = None, None
        if k == sounds[k]['clpa']:
            type_, symbol = 'existing', k
        elif sounds[k]['clpa'] == '?':
            type_, symbol = 'missing', k
        else:
            check = sounds[k]['clpa']
            if k != check != '?':
                type_, symbol = 'convertible', k + ' >> ' + sounds[k]['clpa']
        if type_ and symbol:
            segments[type_].append([symbol, sounds[k]['id'], sounds[k]['frequency']])

    if settings['format'] == 'csv':
        with UnicodeWriter(settings['outfile'] or None, delimiter='\t') as writer:
            for key, items in segments.items():
                for i, item in enumerate(items):
                    writer.writerow([i + 1] + item + [key])
        if not settings['outfile']:
            print(writer.read())
        return

    text = []
    header_template = """
# {0} sounds

| number | sound | clpa | frequency |
| ------:| ----- | ---- | ---------:|"""
    for key, items in segments.items():
        text.append(header_template.format(key.capitalize()))
        for i, item in enumerate(items):
            text.append("| {0} | {1[0]} | {1[1]} | {1[2]} |".format(i + 1, item))
    text = '\n'.join(text)

    if settings['outfile']:
        with Path(settings['outfile']).open('w', encoding='utf8') as fp:
            fp.write(text)
    else:
        print(text)
def main():
    socs = read_win1252('ALL_soc_ids_to_lang_wAltNames_sources_5Sept2017_win1252.csv')
    links = {
        r['soc_id']: r for r in read_win1252(
            'ALL_soc_links_to_other_databases_30Aug2017_win1252.csv')}
    locations = {
        'SCCS' + r['soc_id']: r
        for r in reader('../../legacy/LatLong_data.csv', dicts=True)}
    for row in reader(
            '../WNAI/DPLACE_RevisedLatLong_27April2017_inclWNAI_SCCS.csv', dicts=True):
        if row['Dataset'] == 'SCCS':
            locations[row['soc_id']]['Lat'] = row['soc.latitude']
            locations[row['soc_id']]['Long'] = row['soc.longitude']

    with UnicodeWriter('societies.csv') as w:
        w.writerow([f.name for f in attr.fields(Society)])
        for soc in socs:
            kw = {
                'id': soc['soc_id'],
                'glottocode': soc['glottolog_id'],
                'glottocode_comment': 'Lang_assignment_change_notes'}
            for col in [
                'xd_id',
                'pref_name_for_society',
                'ORIG_name_and_ID_in_this_dataset',
                'alt_names_by_society',
                'main_focal_year',
            ]:
                kw[col] = soc[col]
            for col in ['Lat', 'Long', 'origLat', 'origLong', 'Comment']:
                kw[col] = locations[soc['soc_id']][col]
            kw['HRAF_name_ID'] = links[soc['soc_id']]['HRAF_name_ID']
            kw['HRAF_link'] = links[soc['soc_id']]['HRAF_link']
            w.writerow(attr.astuple(Society(**kw)))

    with UnicodeWriter('societies_mapping.csv') as w:
        w.writerow(['id', 'related'])
        for sid, l in links.items():
            rels = []
            for dsid, suffix in [
                ('EA', '1'), ('EA', '2'),
                ('Binford', '1'), ('Binford', '2'), ('Binford', '3'),
                ('SCCS', ''),
                ('WNAI', '1'), ('WNAI', '2'), ('WNAI', '3'), ('WNAI', '4'), ('WNAI', '5'),
            ]:
                if dsid == 'SCCS':
                    label = l['{0}_society_equivalent{1}'.format(dsid, suffix)]
                else:
                    label = l['{0}_label_society_equivalent{1}'.format(dsid, suffix)]
                id = l['{0}_id_society_equivalent{1}'.format(dsid, suffix)]
                if label and id:
                    rels.append('{0}: {1} [{2}]'.format(dsid, label, id))
            w.writerow([sid, '; '.join(rels)])

    var_info = {
        r['source']: r['APA_reference'] for r in read_win1252(
            'SCCS_variable_sources_bibtex_to_APA.csv', ignore_dataset=True)}

    with UnicodeWriter('variables.csv') as w:
        fm = OrderedDict([
            ('VarID', 'id'),
            ('Category', 'category'),
            ('VarTitle', 'title'),
            ('VarDefinition', 'definition'),
            ('VarType', 'type'),
            ('UserNotes', 'notes'),
            ('source', 'source'),
            ('VarTitleShort', 'changes'),
            ('Unit', 'units'),
        ])
        w.writerow(fm.values())
        for row in read_win1252('SCCS_Full_VariableList_12Sept2017_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            row['VarType'] = row['VarType'].capitalize()
            if row['VarDefinition']:
                row['VarDefinition'] += '\n\n'
            row['VarDefinition'] += var_info.get(row['source'], row['source'])
            w.writerow([row[f] for f in fm.keys()])

    with UnicodeWriter('codes.csv') as w:
        fm = OrderedDict([
            ('VarID', 'var_id'),
            ('Code', 'code'),
            ('CodeDescription', 'description'),
            ('ShortName', 'name'),
        ])
        w.writerow(fm.values())
        for row in read_win1252('SCCS_CodeDescriptions_12Sept2017_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            w.writerow([row[f] for f in fm.keys()])

    with UnicodeWriter('data.csv') as w:
        fm = OrderedDict([
            ('soc_id', 'soc_id'),
            ('SubCase', 'sub_case'),
            ('Year', 'year'),
            ('VarID', 'var_id'),
            ('Code', 'code'),
            ('EthnoReferences', 'references'),
            ('AdminComment', 'admin_comment'),
            ('UserComment', 'comment'),
            ('SourceCodedData', 'source_coded_data'),
        ])
        w.writerow(fm.values())
        for row in read_win1252('Full_SCCS_data_12Sept2017_FINAL_329451rows_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            w.writerow([row[f] for f in fm.keys()])
def dump(args, test=False):
    sounds = defaultdict(dict)
    data = []
    bipa = TranscriptionSystem('bipa')

    # start from assembling bipa-sounds
    for grapheme, sound in sorted(
            bipa.sounds.items(), key=lambda p: p[1].alias if p[1].alias else False):
        if sound.type not in ['marker']:
            if sound.alias:
                assert sound.name in sounds
                sounds[sound.name]['aliases'].add(grapheme)
            else:
                assert sound.name not in sounds
                sounds[sound.name] = {
                    'grapheme': grapheme,
                    'unicode': sound.uname or '',
                    'generated': '',
                    'note': sound.note or '',
                    'type': sound.type,
                    'aliases': set(),
                    'normalized': '+' if sound.normalized else ''
                }
            data.append(Grapheme(
                grapheme, sound.name, '+', '', 'bipa', '0', '', '', '', '',
                sound.note or ''))

    # add sounds systematically by their alias
    for td in args.repos.iter_transcriptiondata():
        for name in td.names:
            bipa_sound = bipa[name]
            # check for consistency of mapping here
            if not is_valid_sound(bipa_sound, bipa):
                continue
            sound = sounds.get(name)
            if not sound:
                sound = sounds[name] = {
                    'grapheme': bipa_sound.s,
                    'aliases': {bipa_sound.s},
                    'generated': '+',
                    'unicode': bipa_sound.uname or '',
                    'note': '',
                    'type': bipa_sound.type,
                    'alias': '+' if bipa_sound.alias else '',
                    'normalized': '+' if bipa_sound.normalized else ''
                }
            for item in td.data[name]:
                sound['aliases'].add(item['grapheme'])
                # add the values here
                data.append(Grapheme(
                    item['grapheme'],
                    name,
                    item['explicit'],
                    '',  # sounds[name]['alias'],
                    td.id,
                    item.get('frequency', ''),
                    item.get('url', ''),
                    item.get('features', ''),
                    item.get('image', ''),
                    item.get('sound', ''),
                ))
        if test:
            break

    # sound classes have a generative component, so we need to treat them
    # separately
    for sc in args.repos.iter_soundclass():
        for name in sounds:
            try:
                grapheme = sc[name]
                data.append(Grapheme(
                    grapheme,
                    name,
                    '+' if name in sc.data else '',
                    '',
                    sc.id,
                ))
            except KeyError:  # pragma: no cover
                args.log.debug(name, sounds[name]['grapheme'])
        if test:
            break

    # last run, check again for each of the remaining transcription systems,
    # whether we can translate the sound
    for ts in args.repos.iter_transcriptionsystem(exclude=['bipa']):
        for name in sounds:
            try:
                ts_sound = ts[name]
                if is_valid_sound(ts_sound, ts):
                    sounds[name]['aliases'].add(ts_sound.s)
                    data.append(Grapheme(
                        ts_sound.s,
                        name,
                        '' if sounds[name]['generated'] else '+',
                        '',  # sounds[name]['alias'],
                        ts.id,
                    ))
            except ValueError:
                pass
            except TypeError:
                args.log.debug('{0}: {1}'.format(ts.id, name))
        if test:
            break

    with UnicodeWriter(args.repos.data_path('sounds.tsv'), delimiter='\t') as writer:
        writer.writerow(['NAME', 'TYPE', 'GRAPHEME', 'UNICODE', 'GENERATED', 'NOTE'])
        for k, v in sorted(sounds.items(), reverse=True):
            writer.writerow([
                k, v['type'], v['grapheme'], v['unicode'], v['generated'], v['note']])

    with UnicodeWriter(args.repos.data_path('graphemes.tsv'), delimiter='\t') as writer:
        writer.writerow([f.name for f in attr.fields(Grapheme)])
        for row in data:
            writer.writerow(attr.astuple(row))
def trees(societies_by_glottocode, langs, outdir, year, title):
    label_pattern = re.compile("'[^\[]+\[([a-z0-9]{4}[0-9]{4})[^']*'")

    def rename(n):
        n.name = label_pattern.match(n.name).groups()[0]
        n.length = 1

    glottocodes = set(societies_by_glottocode.keys())
    glottocodes_in_global_tree = set()
    index = {}
    outdir = outdir.joinpath('phylogenies')

    languoids = {}
    families = []
    for lang in langs:
        if not lang.lineage:  # a top-level node
            if not lang.category.startswith('Pseudo '):
                families.append(lang)
        languoids[lang.id] = lang

    glob = Tree()
    glob.name = 'glottolog_global'

    for family in sorted(families):
        node = family.newick_node(nodes=languoids)
        node.visit(rename)
        taxa_in_tree = set(n.name for n in node.walk())
        taxa_in_dplace = glottocodes.intersection(taxa_in_tree)
        if not taxa_in_dplace:
            continue

        tree = Tree("({0});".format(node.newick), format=3)
        tree.name = 'glottolog_{0}'.format(family.id)
        if family.level.name == 'family':
            tree = write_tree(
                tree, outdir.joinpath(tree.name), taxa_in_dplace, societies_by_glottocode)
            glottocodes_in_global_tree = glottocodes_in_global_tree.union(
                set(n.name for n in tree.traverse()))
            index[tree.name] = dict(
                id=tree.name,
                name='{0} ({1})'.format(family.name, title),
                author='{0} ({1})'.format(title, family.name),
                year=year,
                scaling='',
                reference=reference(title, year),
                url='http://glottolog.org/resource/languoid/id/{}'.format(family.id))
        else:
            glottocodes_in_global_tree = glottocodes_in_global_tree.union(taxa_in_tree)
            glob.add_child(tree)

    # global
    write_tree(
        glob,
        outdir.joinpath(glob.name),
        glottocodes_in_global_tree.intersection(glottocodes),
        societies_by_glottocode)
    index[glob.name] = dict(
        id=glob.name,
        name='Global Classification ({0})'.format(title),
        author=title,
        year=year,
        scaling='',
        reference=reference(title, year),
        url='http://glottolog.org/')

    index_path = outdir.joinpath('index.csv')
    phylos = list(reader(index_path, dicts=True))
    with UnicodeWriter(index_path) as writer:
        header = list(phylos[0].keys())
        writer.writerow(header)
        for phylo in sorted(phylos):
            if phylo['id'] in index:
                writer.writerow([index[phylo['id']][k] for k in header])
                del index[phylo['id']]
            else:
                writer.writerow(phylo.values())
        for id_, spec in sorted(index.items()):
            writer.writerow([spec[k] for k in header])