def test_add_delete_rows(self):
    from pytsammalex.util import add_rows, filter_rows

    csv_path = self.tmp_path('test.csv')
    add_rows(csv_path, ['a', 'b'], [1, 2], [3, 4])
    self.assertEqual(len(list(reader(csv_path, dicts=True))), 2)
    filter_rows(csv_path, lambda item: item['a'] == '1')
    self.assertEqual(len(list(reader(csv_path, dicts=True))), 1)
    add_rows(csv_path, [1, 2], [3, 4])
    self.assertEqual(len(list(reader(csv_path, dicts=True))), 3)
def metadata(write_stats=True):
    """Writes statistics on metadata to readme."""
    txt = '# Basic Statistics on Metadata\n\n'
    cnc = list(reader(data_path('concepticon.tsv'), namedtuples=True, delimiter="\t"))
    for i, cl in enumerate(PKG_PATH.joinpath('concept_set_meta').glob('*.tsv')):
        data = list(reader(cl, namedtuples=True, delimiter="\t"))
        txt += '* {0} covers {1} concept sets ({2:.2f} %)\n'.format(
            cl.name[:-4], len(data), len(data) / len(cnc))
    if write_stats:
        with PKG_PATH.joinpath('concept_set_meta', 'README.md').open('w', encoding='utf8') as fp:
            fp.write(txt)
def test_add_delete_rows(tmpdir):
    csv_path = Path(tmpdir.join('test.csv'))
    add_rows(csv_path, ['a', 'b'], [1, 2], [3, 4])
    assert len(list(reader(csv_path, dicts=True))) == 2
    filter_rows(csv_path, lambda item: item['a'] == '1')
    assert len(list(reader(csv_path, dicts=True))) == 1
    add_rows(csv_path, [1, 2], [3, 4])
    assert len(list(reader(csv_path, dicts=True))) == 3
def test_rewrite(self):
    from clldutils.dsv import reader, rewrite

    tmp = self.tmp_path('test')
    shutil.copy(FIXTURES.joinpath('tsv.txt').as_posix(), tmp.as_posix())
    rewrite(tmp.as_posix(), lambda i, row: [len(row)], delimiter='\t')
    self.assertEqual(list(reader(tmp))[0], ['2'])
    shutil.copy(FIXTURES.joinpath('csv.txt').as_posix(), tmp.as_posix())
    rewrite(tmp, lambda i, row: row)
    self.assertEqual(list(reader(tmp)), list(reader(FIXTURES.joinpath('csv.txt'))))
def __init__(self, dir_):
    self.dir = dir_
    self.datasets = [
        Dataset(base_dir=self.dir.joinpath('datasets'), **r)
        for r in reader(self.dir.joinpath('datasets', 'index.csv'), dicts=True)]
    self.phylogenies = [
        Phylogeny(base_dir=self.dir.joinpath('phylogenies'), **r)
        for r in reader(self.dir.joinpath('phylogenies', 'index.csv'), dicts=True)]
def test_add_delete_rows(self):
    from clldutils.dsv import add_rows, filter_rows_as_dict, reader

    csv_path = self.tmp_path('test.csv')
    add_rows(csv_path, ['a', 'b'], [1, 2], [3, 4])
    self.assertEqual(len(list(reader(csv_path, dicts=True))), 2)
    filter_rows_as_dict(csv_path, lambda item: item['a'] == '1')
    self.assertEqual(len(list(reader(csv_path, dicts=True))), 1)
    add_rows(csv_path, [2, 2], [2, 4])
    self.assertEqual(len(list(reader(csv_path, dicts=True))), 3)
    res = filter_rows_as_dict(csv_path, lambda item: item['a'] == '1')
    self.assertEqual(res, 2)
def ff_images(args):
    tsammalex = {
        i.id: i.taxa__id for i in
        reader(args.data_file('repos', 'tsammalex_images.csv'), namedtuples=True)}
    ref_pattern = re.compile('(?P<ref>[0-9]{5})')
    uploaded = load(args.data_file('repos', 'cdstar.json'))
    files = load(args.data_file('repos', 'Heath_flora_fauna_images.json'))
    files.update(load(args.data_file('repos', 'ffmissing.json')))
    path_to_md5 = {}
    for md5, paths in files.items():
        for path in paths:
            path_to_md5[Path(path.encode('utf8')).stem] = md5
    missed, found, uploaded_ = 0, 0, 0
    for i, img in enumerate(reader(
            args.data_file('repos', 'dogon_flora-fauna.csv'),
            delimiter=',',
            namedtuples=True)):
        stem = Path(img.filenames.encode('utf8')).stem
        assert stem in path_to_md5
        found += 1
        if path_to_md5[stem] in uploaded:
            m = ref_pattern.search(stem)
            uploaded_ += 1
            yield FFImage(
                path_to_md5[stem],
                Path(files[path_to_md5[stem]][0].encode('utf8')).name,
                None,
                m.group('ref') if m else None,
                None,
                [],
                uploaded[path_to_md5[stem]],
                tsammalex.get(path_to_md5[stem]))
    videos = load(args.data_file('repos', 'videos_from_website.json'))
    videos.update(load(args.data_file('repos', 'videos.json')))
    for md5, paths in videos.items():
        if md5 in uploaded:
            path = Path(paths[0].encode('utf8'))
            m = ref_pattern.search(path.stem)
            uploaded_ += 1
            yield FFImage(
                md5,
                path.name,
                None,
                m.group('ref') if m else None,
                None,
                [],
                uploaded[md5],
                tsammalex.get(md5))
        else:
            missed += 1
    print('ff_images', missed, uploaded_)
def __init__(self, dir_):
    self.dir = Path(dir_)
    self.datasets = [
        Dataset(base_dir=self.dir.joinpath('datasets'), **r)
        for r in reader(self.dir.joinpath('datasets', 'index.csv'), dicts=True)]
    self.phylogenies = [
        Phylogeny(base_dir=self.dir.joinpath('phylogenies'), **r)
        for r in reader(self.dir.joinpath('phylogenies', 'index.csv'), dicts=True)]
    self.societies = {
        s.id: s for s in chain.from_iterable(d.societies for d in self.datasets)}
    self.variables = {
        v.id: v for v in chain.from_iterable(d.variables for d in self.datasets)}
    self.sources = BibFile(self.dir.joinpath('datasets', 'sources.bib'))
def fill(dataset, data, socids):
    lines_old = set(open(data, encoding="utf8").readlines())
    res = defaultdict(list)
    for item in reader(data, dicts=True):
        res[(item["Dataset"], item["VarID"], item["soc_id"])].append(item)
        keys = list(item.keys())
    print(dataset, len(socids), "societies")
    for var_id, socs in groupby(sorted(res.keys(), key=lambda t: t[1]), key=lambda t: t[1]):
        for ds, soc_id in socids.difference(set((s[0], s[2]) for s in socs)):
            rec = OrderedDict()
            for key in keys:
                rec[key] = ""
            rec.update(soc_id=soc_id, Dataset=ds, Code="NA", VarID=var_id)
            res[(ds, var_id, soc_id)].append(rec)
        assert sum(len(v) for k, v in res.items() if k[1] == var_id) >= len(socids)
    with UnicodeWriter(data) as fp:
        fp.writerow(keys)
        for key in sorted(res.keys()):
            fp.writerows(row.values() for row in res[key])
    # fix line endings:
    with open(data, encoding="utf8") as fp:
        c = fp.read()
    with open(data, "w", encoding="utf8") as fp:
        fp.write(c.replace("\r\n", "\n"))
    lines_new = set(open(data, encoding="utf8").readlines())
    assert lines_old.issubset(lines_new)
    print(len(lines_new.difference(lines_old)), "NA values added")
def read_scorer(path):
    """
    Read a scoring function in a file into a ScoreDict object.

    Parameters
    ----------
    path : Path
        The path to the input file that shall be read as a scoring dictionary.
        The matrix format is a simple csv-file in which the scoring matrix is
        displayed, with negative values indicating high differences between
        sound segments (or sound classes) and positive values indicating high
        similarity. The matrix should be symmetric, columns should be
        separated by tabstops, and the first column should provide the
        alphabet for which the scoring function is defined.

    Returns
    -------
    scoredict : ~lingpy.algorithm.ScoreDict
        A ScoreDict instance which can be directly passed to LingPy's
        alignment functions.
    """
    chars, matrix = [], []
    for row in reader(path, delimiter='\t'):
        if row:
            chars.append(row[0])
            # materialize the scores as a list (a lazy map object would be
            # consumed after a single pass)
            matrix.append([float(cell) for cell in row[1:]])
    return ScoreDict(chars, matrix)
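A minimal usage sketch for read_scorer, assuming LingPy is installed; the file name, the two-segment alphabet, and the scores below are invented for illustration:

from pathlib import Path

# Write a tiny symmetric scoring matrix: tab-separated, first column = alphabet.
Path('scorer.tsv').write_text('a\t5\t-2\nb\t-2\t5\n', encoding='utf8')
scorer = read_scorer(Path('scorer.tsv'))
print(scorer['a', 'b'])  # pair lookup on the ScoreDict (assumed API); expected -2.0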
def stats():
    lines = [
        '## Concept Lists',
        '',
        ' name | mapped | mergers ',
        ' ---- | ------ | ------- ',
    ]
    for cl in sorted(
            PKG_PATH.joinpath('conceptlists').glob('*.tsv'), key=lambda _cl: _cl.name):
        concepts = list(reader(cl, namedtuples=True, delimiter='\t'))
        mapped = len([c for c in concepts if c.CONCEPTICON_ID])
        mapped_ratio = int((mapped / len(concepts)) * 100)
        concepticon_ids = Counter(
            [c.CONCEPTICON_ID for c in concepts if c.CONCEPTICON_ID])
        mergers = len([k for k, v in concepticon_ids.items() if v > 1])
        line = [
            '[%s](%s) ' % (cl.stem, cl.name),
            badge('mapped', '%s%%' % mapped_ratio,
                  Colors.red if mapped_ratio < 99 else Colors.brightgreen),
            badge('mergers', '%s' % mergers,
                  Colors.red if mergers else Colors.brightgreen),
        ]
        lines.append(' | '.join(line))
    with PKG_PATH.joinpath('conceptlists', 'README.md').open('w', encoding='utf8') as fp:
        fp.write('\n'.join(lines))
def import_gb20_features(datadir, data):
    for feature in reader(
            os.path.join(datadir, 'gb20features.tsv'), delimiter='\t', dicts=True):
        feature = FeatureSpec(feature)
        f = data.add(
            Feature, feature.id,
            id=feature.id,
            name=feature.name,
            doc=feature.doc,
            patron=feature.patron,
            std_comments=feature.std_comments,
            name_french=feature.name_french,
            jl_relevant_unit=feature.jl_relevant_unit,
            jl_function=feature.jl_function,
            jl_form=feature.jl_form,
            hard_to_deny=feature.hard_to_deny,
            prone_misunderstanding=feature.prone_misunderstanding,
            requires_extensive_data=feature.requires_extensive_data,
            last_edited=feature.last_edited,
            other_survey=feature.other_survey)
        for i, (deid, desc) in enumerate(feature.domain.items()):
            DomainElement(
                id='%s-%s' % (f.id, deid),
                parameter=f,
                abbr=deid,
                name='%s - %s' % (deid, desc),
                number=int(deid) if deid != '?' else 999,
                description=desc,
                jsondata=dict(icon=ORDERED_ICONS[i].name))
def _from(cls, data, container=None, skip_on_error=False):
    container = container or data.parent
    dataset = cls(data.stem)
    dataset.metadata.read(Dataset.filename(data, 'metadata'), container)
    dataset._table = dataset.metadata.get_table()
    dataset.sources.read(Dataset.filename(data, 'sources'), container)
    delimiter = ','
    if dataset.table:
        delimiter = dataset.table.dialect.delimiter
    if data.suffix in TAB_SUFFIXES:
        delimiter = '\t'
    if isinstance(container, Archive):
        rows = container.read_text(data.name).split('\n')
    else:
        rows = data
    for i, row in enumerate(reader(rows, delimiter=delimiter)):
        if i == 0:
            dataset.fields = tuple(row)
        else:
            try:
                dataset.add_row(row)
            except ValueError as e:
                if skip_on_error:
                    log.warn('skipping row in line %s: %s' % (i + 1, e))
                else:
                    raise e
    dataset.table.dialect.delimiter = delimiter
    dataset.table.url = data.name
    return dataset
def _dtab(dir_, fn):
    lpd = []
    for d in reader(dir_.joinpath(fn), dicts=True, delimiter='\t', quoting=csv.QUOTE_NONE):
        lpd.append({
            k.replace('\ufeff', ''): (v or '').strip()
            for k, v in list(d.items()) + [("fromfile", fn)]})
    return lpd
def load(table, csv, engine):
    schema = jsonlib.load(
        csv.parent.joinpath(csv.stem + '.' + CsvmJsonAdapter.extension))
    converter = get_converter(schema['tableSchema'], table)
    engine.execute(
        table.insert(), [converted(d, converter) for d in reader(csv, dicts=True)])
    return schema.get("dc:identifier")
def upgrade():
    conn = Connection(op.get_bind())
    example_map = {}
    sid = 204
    for example in jsonload(data_file('lingala_examples.json')):
        sid += 1
        kw = {
            'id': '60-%s' % sid,
            'language_pk': conn.pk(Language, '60'),
            'name': example['Text'],
            'description': example['Translation'],
            'gloss': '\t'.join(example['Gloss'].split()),
            'analyzed': '\t'.join(example['Text'].split()),
            'type': example['Type'].strip().lower(),
            'jsondata': {'sort': int(example['Order_number']), 'alt_translation': None},
        }
        example_map[example['Example_number']] = conn.insert(Sentence, **kw)

    for ve in jsonload(data_file('lingala_value_examples.json')):
        vspk = conn.pk(ValueSet, '60-%s' % ve['Features::Feature_number'])
        vpk = conn.pk(Value, vspk, attr='valueset_pk')
        conn.insert(
            ValueSentence, value_pk=vpk, sentence_pk=example_map[ve['Example_number']])

    for i, comment in enumerate(reader(
            data_file('lingala_valueset_comments.tab'), delimiter='\t', dicts=True)):
        vspk = conn.pk(ValueSet, '60-%s' % comment['Features::Feature_number'])
        comment['Comments_on_value_assignment'] = comment[
            'Comments_on_value_assignment'].replace('\x0b', '\n')
        conn.update(
            ValueSet,
            {
                'description': comment['Comments_on_value_assignment'],
                'markup_description': None,
            },
            pk=vspk)
def iter_lexicon(args):
    for fname in ['dogon_lexicon', 'flora_Dogon_Unicode', 'fauna_Dogon_Unicode']:
        for concept in reader(args.data_file('repos', fname + '.csv'), dicts=True):
            entry = Entry(**{v: concept.get(k) for k, v in FIELD_MAP.items()})
            for name, gc in LEX_LANGS[fname].items():
                entry.forms[gc] = concept[name].strip()
            if entry.ref and entry.ref != 'zzz':
                yield entry
def read(self, path, sep="\t", comment="#"):
    with Path(path).open(encoding='utf-8') as handle:
        lines = [
            unicodedata.normalize('NFC', hline)
            for hline in handle.readlines()
            if hline and not hline.startswith(comment)]
    self.extend(list(reader(lines, dicts=True, delimiter=sep)))
def _iter_etc(self, what):
    delimiter = '\t'
    path = self.dir / 'etc' / (what + '.tsv')
    if not path.exists():
        delimiter = ','
        path = path.parent / (what + '.csv')
    return reader(path, dicts=True, delimiter=delimiter) if path.exists() else []
def __init__(self, name_and_date, fp):
    parts = name_and_date.split('_')
    self.date = date(int(parts[-1][:4]), int(parts[-1][4:6]), int(parts[-1][6:8]))
    name = '_'.join(parts[:-1])
    if name.startswith('_') or name.startswith('-'):
        name = name[1:]
    if not name:
        name = 'Codes'
    self.name = name
    list.__init__(self, reader(fp.splitlines(), dicts=True, delimiter='\t'))
def read_win1252(fname, ignore_dataset=False):
    with open(fname, 'rb') as fp:
        c = fp.read()
    with open(fname, 'wb') as fp:
        fp.write(c.replace(b'\x9d', b''))

    for r in reader(fname, dicts=True, encoding='cp1252'):
        if ignore_dataset or (r.get('dataset') == 'SCCS') or (
                r.get('Dataset') == 'SCCS') or (r.get('Datset') == 'SCCS'):
            yield r
def from_file(cls, fname):
    """
    Orthography profiles must be
    - tab-separated CSV files
    - encoded in UTF-8
    - with a header containing a column "Grapheme"
    """
    return cls(*list(reader(
        readlines(fname, normalize='NFD'),
        dicts=True,
        delimiter='\t',
        quotechar=None)))
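For illustration, a hypothetical orthography profile matching the requirements in the docstring above, and how the classmethod might be called; the class name Profile, the file name, and the grapheme rows are assumptions:

from pathlib import Path

# Tab-separated, UTF-8, with a "Grapheme" column (the second column is illustrative).
Path('profile.tsv').write_text('Grapheme\tIPA\nth\tθ\na\ta\n', encoding='utf8')
profile = Profile.from_file('profile.tsv')  # "Profile" stands in for the enclosing class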
def get_sources(args):
    res = {}
    for d in reader(args.data_file('sources_CSD.csv'), delimiter=',', dicts=True):
        res[normalize_sid(d['Abbreviation'])] = d
    for sid in list(SOURCES.keys()):
        _sid = normalize_sid(sid)
        if _sid not in res:
            print('missing sid: %s' % sid)
            res[_sid] = dict(citation=SOURCES[sid], Name=sid, title=SOURCES[sid])
    return res
def cmd_install(self, **kw):
    with self.cldf as ds:
        ds.add_concepts(id_factory=lambda d: d.number.replace('.', '-'))
        lmap = ds.add_languages()
        for p in self.raw.glob('*.csv'):
            lid = p.stem.split('-')[1]
            if lid in lmap:
                for item in reader(p, dicts=True):
                    if item['Phonetic']:
                        ds.add_lexemes(
                            Language_ID=lid,
                            Parameter_ID=item['BNC ID'].replace('.', '-'),
                            Value=unicodedata.normalize('NFC', item['Phonetic']))
def __init__(self, path):
    """
    A dataset is initialized by passing its directory path.
    """
    path = Path(path)
    self.id = path.name
    self.log = logging.getLogger(pylexibank.__name__)
    self.dir = path
    self.raw = self.dir.joinpath('raw', 'data')
    if not self.raw.exists():
        self.raw.mkdir()
    self.cldf_dir = self.dir.joinpath('cldf')
    if not self.cldf_dir.exists():
        self.cldf_dir.mkdir()
    self.commands = import_module(self.dir)
    self.md = jsonlib.load(self.dir.joinpath('metadata.json'))

    self.languages = []
    lpath = self.dir.joinpath('languages.csv')
    if lpath.exists():
        for item in reader(lpath, dicts=True):
            if item['GLOTTOCODE'] and not GC_PATTERN.match(item['GLOTTOCODE']):
                raise ValueError(
                    "Wrong glottocode for item {0}".format(item['GLOTTOCODE']))
            self.languages.append(item)

    self.conceptlist = None
    url = self.md.get('dc:conformsTo')
    if url and url.startswith('http://concepticon.clld.org/contributions/'):
        self.conceptlist = url.split('/')[-1]

    self.concepts = []
    cpath = self.dir.joinpath('concepts.csv')
    if cpath.exists():
        self.concepts = list(reader(cpath, dicts=True))
    self.cognates = Cognates()

    # the following attributes are only set when a dataset's cldf method is run:
    self.glottolog_languoids = {}
    self.glottolog_version, self.concepticon_version = None, None
def pytest_generate_tests(metafunc):
    if 'test_sounds' == metafunc.function.__name__:
        fixturenames = None
        tests = []
        for i, test in enumerate(reader(
                Path(__file__).parent / 'data' / 'test_data.tsv',
                delimiter='\t',
                dicts=True)):
            if i == 0:
                fixturenames = list(test.keys())
                fixturenames.pop(fixturenames.index('bipa'))
            del test['bipa']
            if None in test:
                del test[None]
            if len(fixturenames) != len(test.keys()):
                raise ValueError(set(test.keys()) - set(fixturenames))
            tests.append(test)

        attrs = ['nfd-normalized', 'clts-normalized', 'aliased', 'generated', 'stressed']
        tests = sorted(tests, key=lambda t: tuple([t[a] for a in attrs]))
        batches = []
        for _, ts in groupby(tests, lambda t: tuple([t[a] for a in attrs])):
            for test in ts:
                batches.append(tuple(test.values()))
                break
        metafunc.parametrize(
            ','.join(n.replace('-', '_') for n in fixturenames), batches)
    elif 'test_clicks' == metafunc.function.__name__:
        tests = []
        for test in reader(
                Path(__file__).parent / 'data' / 'clicks.tsv', delimiter='\t', dicts=True):
            tests.append((test['GRAPHEME'], test['MANNER']))
        metafunc.parametrize('grapheme,gtype', tests)
def get_wordlist(path, delimiter=",", quotechar='"', normalization_form="NFC", **keywords):
    """
    Load a wordlist from a normal CSV file.

    Parameters
    ----------
    path : str
        The path to your CSV file.
    delimiter : str
        The delimiter in the CSV file.
    quotechar : str
        The quote character in your data.
    row : str (default = "concept")
        A string indicating the name of the row that shall be taken as the
        basis for the tabular representation of the word list.
    col : str (default = "doculect")
        A string indicating the name of the column that shall be taken as the
        basis for the tabular representation of the word list.
    conf : string (default='')
        A string defining the path to the configuration file.

    Notes
    -----
    This function returns a :py:class:`~lingpy.basic.wordlist.Wordlist` object.
    In contrast to the normal way to load a wordlist from a tab-separated file,
    this allows to directly load a wordlist from any "normal" csv-file, with
    your own specified delimiters and quote characters. If the first cell in
    the first row of your CSV file is not named "ID", the integer identifiers
    required by LingPy will be created automatically.
    """
    kw = dict(conf="", col="doculect", row="concept")
    kw.update(keywords)
    data = list(dsv.reader(path, delimiter=delimiter, quotechar=quotechar))
    header = [h.lower() for h in data[0]]
    data = data[1:]
    D = {}
    if header[0] == 'id':  # the header has already been lowercased
        D[0] = header[1:]
        for row in data:
            D[row[0]] = [normalize(normalization_form, n) for n in row[1:]]
    else:
        D[0] = header
        for idx, row in enumerate(data):
            D[idx + 1] = row
    return Wordlist(D, row=kw['row'].lower(), col=kw['col'].lower())
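A short usage sketch for get_wordlist, assuming LingPy is installed; the CSV content is invented. Since the first header cell is not "ID", integer row identifiers are created automatically:

from pathlib import Path

Path('words.csv').write_text(
    'doculect,concept,ipa\nGerman,hand,hant\nEnglish,hand,hand\n', encoding='utf8')
wl = get_wordlist('words.csv')
print(wl.width, wl.height)  # number of doculects, number of concepts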
def read_csv(fname, data):
    concepts = None
    for i, row in enumerate(reader(fname)):
        if i == 0:
            concepts = {j: c for j, c in enumerate(row[1:])}
        else:
            for j, c in enumerate(row[1:]):
                if j % 2 == 0:  # even number
                    loan, form = get_loan_and_form(c)
                else:
                    if form.strip():
                        data[row[0]][concepts[j]] = (form, loan, c)
    return data
def __init__(self, clid):
    self.clid = clid
    self.concepts = {
        'CONCEPTICON_ID': {},  # maps ID to GLOSS
        'CONCEPTICON_GLOSS': {},  # maps GLOSS to ID
    }
    for cs in reader(data_path('concepticon.tsv'), dicts=True, delimiter='\t'):
        self.concepts['CONCEPTICON_ID'][cs['ID']] = cs['GLOSS']
        self.concepts['CONCEPTICON_GLOSS'][cs['GLOSS']] = cs['ID']
    self._cid_index = None
    self._cgloss_index = None
    self._link_col = (None, None)
    self._number_index = None
def stats(args):
    sounds = {}
    for row in reader(args.repos.data_path('sounds.tsv'), delimiter='\t', dicts=True):
        sounds[row['NAME']] = row
    graphs = {}
    for row in reader(args.repos.data_path('graphemes.tsv'), delimiter='\t', dicts=True):
        graphs['{GRAPHEME}-{NAME}-{DATASET}'.format(**row)] = row

    graphdict = defaultdict(list)
    for id_, row in graphs.items():
        graphdict[row['GRAPHEME']] += [row['DATASET']]

    text = [['DATA', 'STATS', 'PERC']]
    text.append([
        'Unique graphemes',
        len(set(row['GRAPHEME'] for row in graphs.values())),
        ''])
    text.append(['different sounds', len(sounds), ''])
    text.append([
        'singletons',
        len([g for g in graphdict if len(set(graphdict[g])) == 1]),
        ''])
    text.append([
        'multiples',
        len([g for g in graphdict if len(set(graphdict[g])) > 1]),
        ''])

    total = len(sounds)
    for type_, count in Counter([s['TYPE'] for s in sounds.values()]).most_common():
        text.append([type_ + 's', count, '{0:.2f}'.format(count / total)])

    print(tabulate.tabulate(text, headers='firstrow'))
def list_attributes(write_stats=True):
    """Calculate the additional attributes in the lists."""
    D = {}
    for i, cl in enumerate(PKG_PATH.joinpath('conceptlists').glob('*.tsv')):
        header = list(reader(cl, delimiter="\t"))[0]
        header = [h for h in header if h not in [
            'ID', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'ENGLISH', 'GLOSS', 'NUMBER']]
        for h in header:
            try:
                D[h] += [cl.name]
            except KeyError:
                D[h] = [cl.name]
    txt = '# Common Additional Columns of Concept Lists\n'
    for k, v in sorted(D.items(), key=lambda x: len(x[1]), reverse=True):
        txt += '* {2} occurrences: {0}, {1}\n'.format(k, ', '.join(v), len(v))
    print(txt)
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    concepticon = {
        c.GLOSS: c.CONCEPTICON_ID for c in reader(
            args.data_file('repos', 'conceptlist.tsv'), delimiter='\t', namedtuples=True)
        if c.CONCEPTICON_ID}
    sdata = jsonlib.load(args.data_file('repos', 'classification.json'))
    for concept in DBSession.query(models.Concept).options(
            joinedload(common.Parameter._files)):
        for t_ in ['image', 'video']:
            setattr(concept, 'count_{0}s'.format(t_), len(getattr(concept, t_ + 's')))
        if concept.jsondata['ref'] in sdata:
            util.update_species_data(concept, sdata[concept.jsondata['ref']])
        if concept.name in concepticon:
            concept.concepticon_id = int(concepticon[concept.name])
def from_csv(data_file, model, data, name=None, visitor=None, filter_=None):
    if filter_ is None:
        filter_ = lambda r: True
    kw = {'delimiter': ',', 'lineterminator': str('\r\n'), 'quotechar': '"'}
    for fname in data_files(data_file, (name or model.__csv_name__) + '.csv'):
        for row in list(reader(fname, **kw))[1:]:
            if row and filter_(row):
                try:
                    obj = model.from_csv(row, data)
                except (KeyError, IndexError):
                    obj = None
                    print(fname)
                    print(row)
                    raise
                if obj:
                    obj = data.add(model, row[0], _obj=obj)
                    if visitor:
                        visitor(obj, row, data)
def viewCsvImport(request):
    '''
    report stores how the import went.
    Its structure must be an iterable of dicts with str data
    for the keys 'heading' and 'body'.
    '''
    report = []
    if request.method == 'POST' and 'CsvImportForm' in request.POST:
        importMethod = request.POST['tableType']
        fileDicts = list(dsv.reader(
            request.FILES['csvFile'].read().decode('utf8').splitlines(),
            dicts=True))
        handlerFunctions = {'ms*l': handleMeaningsLanguageImport}
        if importMethod in handlerFunctions:
            report = handlerFunctions[importMethod](fileDicts, request)
    return render_template(request, "admin/viewCsvImport.html", {'report': report})
def import_cldf(srcdir, md, languoids, conceptsets):
    with transaction.manager:
        contrib = Provider(
            id=srcdir.name,
            name=md['dc:title'],
            description=md.get('dc:bibliographicCitation'),
            url=md.get('dc:identifier'),
            license=md.get('dc:license'),
            aboutUrl=md.get('aboutUrl'),
        )
        DBSession.add(contrib)
        sources = {}
        cldfdir = srcdir.joinpath('cldf')
        values = Data()
        for fname in tqdm(list(cldfdir.glob('*' + MD_SUFFIX)), leave=False):
            ds = Dataset.from_metadata(fname)
            for src in ds.sources.items():
                if src.id not in sources:
                    sources[src.id] = cldf2clld(src, contrib, len(sources) + 1)
            import_dataset(ds, contrib, languoids, conceptsets, sources, values)
            DBSession.flush()
        # import cognates:
        if cldfdir.joinpath('cognates.csv').exists():
            for csid, cognates in groupby(
                    reader(cldfdir.joinpath('cognates.csv'), dicts=True),
                    lambda i: i['Cognate_set_ID']):
                cs = Cognateset(id=unique_id(contrib, csid), contribution=contrib)
                for cognate in cognates:
                    cp = values['Counterpart'].get(cognate['Word_ID'])
                    if cp:
                        DBSession.add(CognatesetCounterpart(
                            cognateset=cs,
                            counterpart=cp,
                            cognate_detection_method=cognate['Cognate_detection_method'],
                            alignment=cognate['Alignment'],
                            alignment_method=cognate['Alignment_method'],
                            doubt=cognate['Doubt'] == 'True'))
def _read_tsv(self, path):
    return set(tuple(row[1:]) for row in reader(path, delimiter="\t"))
def read(table):
    fname = args.data_file(table + '.all.csv')
    if not fname.exists():
        fname = args.data_file(table + '.csv')
    return list(dsv.reader(fname, namedtuples=True))
def get_tab(name):
    """Generator for entries in a tab file specified by name."""
    return dsv.reader(
        get(get_taburls()[name]).split('\n'), namedtuples=True, delimiter='\t')
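A hypothetical call of get_tab; the table name 'languoids' and the field name id are assumptions for illustration, since get() and get_taburls() are defined elsewhere:

for row in get_tab('languoids'):  # assumed table name
    print(row.id)  # namedtuple fields come from the tab file's header
    break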
def main():
    socs = read_win1252('ALL_soc_ids_to_lang_wAltNames_sources_5Sept2017_win1252.csv')
    links = {
        r['soc_id']: r for r in
        read_win1252('ALL_soc_links_to_other_databases_30Aug2017_win1252.csv')}
    locations = {
        'SCCS' + r['soc_id']: r for r in
        reader('../../legacy/LatLong_data.csv', dicts=True)}
    for row in reader(
            '../WNAI/DPLACE_RevisedLatLong_27April2017_inclWNAI_SCCS.csv', dicts=True):
        if row['Dataset'] == 'SCCS':
            locations[row['soc_id']]['Lat'] = row['soc.latitude']
            locations[row['soc_id']]['Long'] = row['soc.longitude']

    with UnicodeWriter('societies.csv') as w:
        w.writerow([f.name for f in attr.fields(Society)])
        for soc in socs:
            kw = {
                'id': soc['soc_id'],
                'glottocode': soc['glottolog_id'],
                'glottocode_comment': 'Lang_assignment_change_notes'}
            for col in [
                'xd_id',
                'pref_name_for_society',
                'ORIG_name_and_ID_in_this_dataset',
                'alt_names_by_society',
                'main_focal_year',
            ]:
                kw[col] = soc[col]
            for col in ['Lat', 'Long', 'origLat', 'origLong', 'Comment']:
                kw[col] = locations[soc['soc_id']][col]
            kw['HRAF_name_ID'] = links[soc['soc_id']]['HRAF_name_ID']
            kw['HRAF_link'] = links[soc['soc_id']]['HRAF_link']
            w.writerow(attr.astuple(Society(**kw)))

    with UnicodeWriter('societies_mapping.csv') as w:
        w.writerow(['id', 'related'])
        for sid, l in links.items():
            rels = []
            for dsid, suffix in [
                ('EA', '1'), ('EA', '2'),
                ('Binford', '1'), ('Binford', '2'), ('Binford', '3'),
                ('SCCS', ''),
                ('WNAI', '1'), ('WNAI', '2'), ('WNAI', '3'), ('WNAI', '4'), ('WNAI', '5'),
            ]:
                if dsid == 'SCCS':
                    label = l['{0}_society_equivalent{1}'.format(dsid, suffix)]
                else:
                    label = l['{0}_label_society_equivalent{1}'.format(dsid, suffix)]
                id = l['{0}_id_society_equivalent{1}'.format(dsid, suffix)]
                if label and id:
                    rels.append('{0}: {1} [{2}]'.format(dsid, label, id))
            w.writerow([sid, '; '.join(rels)])

    var_info = {
        r['source']: r['APA_reference'] for r in
        read_win1252('SCCS_variable_sources_bibtex_to_APA.csv', ignore_dataset=True)}

    with UnicodeWriter('variables.csv') as w:
        fm = OrderedDict([
            ('VarID', 'id'),
            ('Category', 'category'),
            ('VarTitle', 'title'),
            ('VarDefinition', 'definition'),
            ('VarType', 'type'),
            ('UserNotes', 'notes'),
            ('source', 'source'),
            ('VarTitleShort', 'changes'),
            ('Unit', 'units'),
        ])
        w.writerow(fm.values())
        for row in read_win1252('SCCS_Full_VariableList_12Sept2017_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            row['VarType'] = row['VarType'].capitalize()
            if row['VarDefinition']:
                row['VarDefinition'] += '\n\n'
            row['VarDefinition'] += var_info.get(row['source'], row['source'])
            w.writerow([row[f] for f in fm.keys()])

    with UnicodeWriter('codes.csv') as w:
        fm = OrderedDict([
            ('VarID', 'var_id'),
            ('Code', 'code'),
            ('CodeDescription', 'description'),
            ('ShortName', 'name'),
        ])
        w.writerow(fm.values())
        for row in read_win1252('SCCS_CodeDescriptions_12Sept2017_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            w.writerow([row[f] for f in fm.keys()])

    with UnicodeWriter('data.csv') as w:
        fm = OrderedDict([
            ('soc_id', 'soc_id'),
            ('SubCase', 'sub_case'),
            ('Year', 'year'),
            ('VarID', 'var_id'),
            ('Code', 'code'),
            ('EthnoReferences', 'references'),
            ('AdminComment', 'admin_comment'),
            ('UserComment', 'comment'),
            ('SourceCodedData', 'source_coded_data'),
        ])
        w.writerow(fm.values())
        for row in read_win1252('Full_SCCS_data_12Sept2017_FINAL_329451rows_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            w.writerow([row[f] for f in fm.keys()])
def load_glottolog_data(self):
    """
    Loads the Glottolog classification information from the appropriate
    newick file, parses it and stores the required data structure in
    self.classifications.
    """
    # Don't load if the analysis doesn't use it
    if not self.check_glottolog_required():
        return
    # Don't load if we already have - can this really happen?
    if self.glottolog_loaded:
        return
    self.glottolog_loaded = True

    label2name = {}
    glottocode2node = {}

    def parse_label(label):
        match = GLOTTOLOG_NODE_LABEL.match(label)
        label2name[label] = (
            match.group('name').strip().replace("\\'", "'"),
            match.group('glottocode'))
        return (
            match.group('name').strip(),
            match.group('glottocode'),
            match.group('isocode'))

    def get_classification(node):
        res = []
        ancestor = node.ancestor
        while ancestor:
            res.append(label2name[ancestor.name])
            ancestor = ancestor.ancestor
        return list(reversed(res))

    # Walk the tree and build the classifications dictionary
    glottolog_trees = newick.read(get_glottolog_data('newick', self.glottolog_release))
    for tree in glottolog_trees:
        for node in tree.walk():
            name, glottocode, isocode = parse_label(node.name)
            classification = get_classification(node)
            self.classifications[glottocode] = classification
            if isocode:
                self.classifications[isocode] = classification
            glottocode2node[glottocode] = node

    # Load geographic metadata
    for t in reader(
            get_glottolog_data('geo', self.glottolog_release), namedtuples=True):
        if t.macroarea:
            self.glotto_macroareas[t.glottocode] = t.macroarea
            for isocode in t.isocodes.split():
                self.glotto_macroareas[isocode] = t.macroarea
        if self.location_data:
            continue  # Use user-supplied data instead
        if t.latitude and t.longitude:
            latlon = (float(t.latitude), float(t.longitude))
            self.locations[t.glottocode] = latlon
            for isocode in t.isocodes.split():
                self.locations[isocode] = latlon
    if self.location_data:
        return

    # Second pass of geographic data to handle dialects, which inherit
    # their parent language's location
    for t in reader(
            get_glottolog_data('geo', self.glottolog_release), namedtuples=True):
        if t.level == "dialect":
            failed = False
            node = glottocode2node[t.glottocode]
            ancestor = node.ancestor
            while label2name[ancestor.name][1] not in self.locations:
                if not ancestor.ancestor:
                    # We've hit the root without finding an ancestral node
                    # with location data!
                    failed = True
                    break
                else:
                    ancestor = ancestor.ancestor
            if failed:
                continue
            latlon = self.locations[label2name[ancestor.name][1]]
            self.locations[t.glottocode] = latlon
            for isocode in t.isocodes.split():
                self.locations[isocode] = latlon
def read_csv(path):
    return list(reader(path, dicts=True))
def get_rows(args, name):  # pragma: no cover
    for i, row in enumerate(
            reader(args.data_file('InventoryID-%s.csv' % name), delimiter='\t')):
        if i and row[1] != 'NA':
            yield row
def read_all(fname, **kw):
    kw.setdefault('delimiter', '\t')
    if not kw.get('dicts'):
        kw.setdefault('namedtuples', True)
    return list(dsv.reader(fname, **kw))
def tsv_items(path, ordered=False):
    return list(reader(path, delimiter='\t', dicts=True))
def read_csv(self, fname, **kw):
    return list(reader(self.joinpath(fname), **kw))
def test_reader(self):
    from clldutils.dsv import reader

    lines = ['first\tline', 'sücond\tläneß']
    encoded_lines = [l.encode('utf8') for l in lines]
    csv_lines = [l.replace('\t', ',') for l in lines]

    def check(r):
        res = list(r)
        assert len(res) == 2
        assert res[1][1] == 'läneß'

    check(reader(lines, delimiter='\t'))
    for lt in ['\n', '\r\n', '\r']:
        if PY3:  # pragma: no cover
            # Simulate file opened in text mode:
            fp = StringIO(lt.join(lines), newline='')
        else:
            # Simulate file opened in binary mode:
            fp = BytesIO(to_binary(lt).join(encoded_lines))
        check(reader(fp, delimiter='\t'))
    check(reader(FIXTURES.joinpath('csv.txt')))

    res = list(reader(FIXTURES.joinpath('tsv.txt'), namedtuples=True, delimiter='\t'))
    assert res[0].a_name == 'b'
    # Missing column values should be set to None:
    assert res[2].a_name is None

    r = list(reader(lines, dicts=True, delimiter='\t'))
    assert len(r) == 1 and r[0]['first'] == 'sücond'
    r = list(reader(lines, namedtuples=True, delimiter='\t'))
    assert len(r) == 1 and r[0].first == 'sücond'
    r = list(reader(csv_lines, namedtuples=True))
    assert len(r) == 1 and r[0].first == 'sücond'

    self.assertEqual(list(reader([], dicts=True, delimiter='\t')), [])
    self.assertEqual(
        list(reader([''], dicts=True, fieldnames=['a', 'b'], delimiter='\t')), [])
    self.assertEqual(list(reader(['a,b', ''], dicts=True, delimiter='\t')), [])

    r = reader(
        ['a,b', '1,2,3,4', '1'], dicts=True, restkey='x', restval='y', delimiter=',')
    self.assertEqual(list(r), [dict(a='1', b='2', x=['3', '4']), dict(a='1', b='y')])
def reflexes(write_stats=True, path='concepticondata'):
    """
    Returns a dictionary with concept set labels as keys and lists of
    (concept list identifier, concept label) tuples as values.
    """
    D, G = {}, {}
    cpl = 0
    cln = 0
    clb = set([])
    dpath = Path(path) if path else PKG_PATH
    for i, cl in enumerate(dpath.joinpath('conceptlists').glob('*.tsv')):
        concepts = list(reader(cl, namedtuples=True, delimiter="\t"))
        for j, concept in enumerate([c for c in concepts if c.CONCEPTICON_ID]):
            label = concept.GLOSS if hasattr(concept, 'GLOSS') else concept.ENGLISH
            name = cl.name
            try:
                D[concept.CONCEPTICON_GLOSS] += [(name, label)]
            except KeyError:
                D[concept.CONCEPTICON_GLOSS] = [(name, label)]
            try:
                G[label] += [(concept.CONCEPTICON_ID, concept.CONCEPTICON_GLOSS, name)]
            except KeyError:
                G[label] = [(concept.CONCEPTICON_ID, concept.CONCEPTICON_GLOSS, name)]
            clb.add(label)
            cpl += 1
        cln += 1
    # write basic statistics and most frequent glosses
    if write_stats:
        txt = """# Concepticon Statistics
* concept sets (used): {0}
* concept lists: {1}
* concept labels: {2}
* concept labels (unique): {3}
* Ø concepts per list: {4:.2f}
* Ø concepts per concept set: {5:.2f}
* Ø unique concept labels per concept set: {6:.2f}
"""
        txt = txt.format(
            len(D),
            cln,
            cpl,
            len(clb),
            cpl / cln,
            sum([len(v) for k, v in D.items()]) / len(D),
            sum([len(set([label for _, label in v])) for k, v in D.items()]) / len(D))

        txt += '# Twenty Most Diverse Concept Sets\n\n'
        txt += '| No. | concept set | distinct labels | concept lists | examples |\n'
        txt += '| --- | --- | --- | --- | --- |\n'
        for i, (k, v) in enumerate(sorted(
                D.items(),
                key=lambda x: len(set([label for _, label in x[1]])),
                reverse=True)[:20]):
            txt += '| {0} | {1} | {2} | {3} | {4} |\n'.format(
                i + 1,
                k,
                len(set([label for _, label in v])),
                len(set([clist for clist, _ in v])),
                ', '.join(sorted(set(
                    ['«{0}»'.format(label.replace('*', '`*`')) for _, label in v]))))

        txt += '# Twenty Most Frequent Concept Sets\n\n'
        txt += '| No. | concept set | distinct labels | concept lists | examples |\n'
        txt += '| --- | --- | --- | --- | --- |\n'
        for i, (k, v) in enumerate(sorted(
                D.items(),
                key=lambda x: len(set([clist for clist, _ in x[1]])),
                reverse=True)[:20]):
            txt += '| {0} | {1} | {2} | {3} | {4} |\n'.format(
                i + 1,
                k,
                len(set([label for _, label in v])),
                len(set([clist for clist, _ in v])),
                ', '.join(sorted(set(
                    ['«{0}»'.format(label.replace('*', '`*`')) for _, label in v]))))

        with dpath.joinpath('README.md').open('w', encoding='utf8') as fp:
            fp.write(txt)

    return D, G
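A usage sketch for reflexes, assuming a checkout of the Concepticon data repository in the working directory; the concept set 'HAND' and the printed values are illustrative only:

D, G = reflexes(write_stats=False, path='concepticondata')
print(len(D), 'linked concept sets')
print(D.get('HAND', [])[:3])  # e.g. [('Swadesh-1955-100.tsv', 'hand'), ...]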
def gps(args):
    """
    Multilingual,
    lg family,  -> Dogon, Atlantic, Mande, Berber, ...
    family code,
    Language (group),  -> Names of Dogon languages
    alternate lg (group),  -> comma separated list of language names
    language code,  -> ISO 639-3
    Language (based on native name),
    dialect code,
    ISO 3 Letter country code,
    OfficialVillageName,  -> name!
    MajorCity,
    PopulationNumber,
    village (RB),
    village (DNAFLA),
    village (SIL),
    village (map),  -> alternative name
    Transcribed Village Name,  -> keep
    N Lat,
    W Lon,
    NFr,
    WFr,
    Nmn60,
    Wmn60,
    NMinFr,
    WMinFr,
    Ndg,
    Wdg,
    N Lat_2,  -> 12 12.123
    W Lon_2,  -> 12 12.123
    N SIL,
    W SIL,
    N Lat source,
    WLon source,
    N Lat map,
    WLon map,
    N Lat us,
    W Long us,
    sourceOfCoordinates,  -> keep
    name of map,  -> keep
    lg comment,  -> keep
    industries,  -> keep
    weekly market,  -> keep
    surnames,  -> keep
    social info,  -> keep
    Image,
    ImageDescription,
    Audio,
    AudioTranscription,
    Video,
    VideoTranscription
    """
    full_name_map = {
        'Oualo (upper)': 'walo_upper',
        'Oualo (lower)': 'walo_lower',
        'Kenntaba-Leye': 'kentabaley',
        'Djimerou-Doungo': 'djimeroudungo',
        'Sassourou': 'sassouru',
        'Sege-Bougie': 'seguebougie',
        'Fiko': 'ficko',
        'Iribanga (Fulbe)': 'iribanga_fulbe',
        'Madina (near Banggel-Toupe)': 'madina_near_bangueltoupe)',
        'Dourou Tanga (1)': 'douroutanga_1',
        'Dourou Tanga (2)': 'douroutanga_2',
        'Dourou Tanga (3)': 'douroutanga_3',
        'Tena (Tere)': 'tena_aka_tere',
        'Anakaga (Amamounou)': 'anakaga_in_amamounou',
        'Dari (near Hombori)': 'dari_near_hombori',
        'Bamba Tene': 'bambatende',
        'Kenntaba-Do': 'kentabado',
        'Tialegel': 'tialeggel',
        'Bani-Banggou': 'banibangou',
        'Ourobangourdi': 'ourobaangourdi',
        'Ourodjougal': 'ourodiouggal',
        'Yadianga (Fulbe)': 'yadiangapoulogoro',
        'Gueourou (Fulbe)': 'gueouroupulogoro',
        'Tongoro-Legu': 'tongorolegou',
        'Koundougou-Mossi': 'koundougoumouniougoro',
        'Billanto-Bella': 'bella',
        'Dianggassagou (Diemessogou)': 'diangassagou_aka_diemessogou)',
    }
    name_map = {
        'kelmita': 'kelmitaa',
        'yrebann': 'yreban',
        'aouguine': 'aougine',
        'bendielysigen': 'bendielisigen',
        'bendielydana': 'bendielidana',
        'ourongeou': 'ourongueou',
        'oukoulourou': 'oukolourou',
        'bendielygirikombo': 'bendieligirikombo',
        'dianggassagou': 'diangassagou',
        'komokanina': 'komokaninaa',
        'dourouna': 'dourounaa',
        'idielina': 'idielinaa',
        'woltigueri': 'woltiguere',
        'irelikanaw': 'ireli_kanaw',
        'korimaounde': 'kori_maounde',
        'yandaguinedia': 'yandaginedia',
        'boudoufolii': 'boudoufoli_section1',
        'boudoufoliii': 'boudoufoli_section2',
    }

    def location(d):
        if d['OfficialVillageName'] == 'Balaguina (Balaguina-Baboye)':
            d['N Lat'] = d['N Lat'].replace(' 115.3', ' 15.3')
        if d['OfficialVillageName'] == 'Daidourou':
            return None, None
        #if d['W Lon us'] and d['N Lat us']:
        #    return parse_deg(d['N Lat us']), parse_deg(d['W Lon us'])
        lat, lon = parse_deg(d['N Lat']), parse_deg(d['W Lon'])
        if lon:
            lon = -lon
        if lon and lon < -10:
            lon += 10
        return lat, lon

    for d in reader(args.data_file('repos', 'GPS_Dogon.csv'), dicts=True):
        for k in d:
            d[k] = d[k].strip()
        if not d['OfficialVillageName']:
            continue
        normname = full_name_map.get(d['OfficialVillageName'].strip())
        if normname is None:
            normname = d['OfficialVillageName'].replace('-', '').replace(' ', '')\
                .replace('(', '_aka_').replace(')', '').split(',')[0].strip().lower()
            normname = name_map.get(normname, normname)
        v = Village(
            d['OfficialVillageName'],
            normname,
            GPS_LANGS.get(d['Language (group)']),
            data=d)
        v.lat, v.lon = location(d)
        yield v
def main(args):
    if DBSession.bind.dialect.name == 'postgresql':
        Index('ducet', collkey(common.Value.name)).create(DBSession.bind)

    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(
            data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub('\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):
        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue
            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue
            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))
            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)