Example #1
def test_from_entry():
    e = Entry('book', fields={'title': 'Title'})
    assert Source.from_entry('abc', e)['title'] == 'Title'

    with pytest.raises(ValueError):
        Source.from_entry('a.b', e)

    assert Source.from_entry('a.b', e, _check_id=False).id == 'a.b'
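
The test above also documents the escape hatch for non-standard BibTeX keys. A minimal sketch, assuming pybtex and pycldf, of the behavior it pins down ('1995.pdfb' is exactly the kind of ID Example #2 below has to work around):

    from pybtex.database import Entry
    from pycldf.sources import Source

    entry = Entry('book', fields={'title': 'Title'})
    Source.from_entry('ok-key', entry)         # well-formed key: accepted
    try:
        Source.from_entry('1995.pdfb', entry)  # '.' in the key -> ValueError
    except ValueError:
        src = Source.from_entry('1995.pdfb', entry, _check_id=False)
        assert src.id == '1995.pdfb'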
Example #2
    def cmd_makecldf(self, args):
        data = self.raw_dir.read_csv('raw.tsv', delimiter="\t", dicts=True)

        # Quite a hack to allow things like "1995.pdfb" as Source IDs:
        bib = pybtex.database.parse_string(self.raw_dir.read('sources.bib'), bib_format='bibtex')
        sources = []
        for k, e in bib.entries.items():
            # Unfortunately, Source.from_entry does not allow any keyword arguments to be passed
            # to the constructor, see https://github.com/cldf/pycldf/issues/99
            e.fields['_check_id'] = False
            sources.append(Source.from_entry(k, e))
        args.writer.add_sources(*sources)

        language_lookup = args.writer.add_languages(lookup_factory='NameInSource')
        concept_lookup = args.writer.add_concepts(
            id_factory=lambda x: x.id.split('-')[-1] + '_' + slug(x.english),
            lookup_factory='Name'
        )
        lang_sources = {l['NameInSource']: l['Source'].split(",") for l in self.languages}

        # remap concepts for personal pronouns
        remap_concepts = {
            '1SG pronoun': '1sg pronoun',
            '2SG pronoun': '2sg pronoun',
            '3SG pronoun': '3sg pronoun',
        }

        for line_dict in progressbar(data, desc='cldfify'):
            concept = line_dict['Meaning']
            concept_id = concept_lookup.get(remap_concepts.get(concept, concept))
            for language, language_id in language_lookup.items():
                value = line_dict[language].strip()
                if value:
                    args.writer.add_form(
                        Value=value,
                        Form=value,
                        Parameter_ID=concept_id,
                        Language_ID=language_id,
                        Source=lang_sources[language]
                    )
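
Note the comment above: the `e.fields['_check_id'] = False` pseudo-field works around `Source.from_entry` not forwarding keyword arguments (pycldf issue #99). Example #1's test passes `_check_id=False` directly, so on a pycldf version with that fix the loop shrinks to a sketch like:

    # Sketch only; assumes Source.from_entry forwards keyword arguments
    # to the constructor, as exercised by the test in Example #1.
    sources = [
        Source.from_entry(k, e, _check_id=False)
        for k, e in bib.entries.items()
    ]
    args.writer.add_sources(*sources)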
Example #3
    def to_cldf(self, ds, concepts):
        """
        :param ds: the dataset object
        :param concepts: a dictionary mapping concept labels to concept ids

        :return: A dataset object, ds.
        """
        source = []
        if self.language.source:
            bib = parse_string(self.language.source, "bibtex")
            try:
                ds.add_sources(
                    *[Source.from_entry(k, e) for k, e in bib.entries.items()])
                source = list(bib.entries.keys())
            except:  # noqa: E722
                self.log.warning("Invalid citekey for %s" % self.language.id)

        ds.add_language(ID=self.language.id,
                        Glottocode=self.language.glottocode,
                        ISO639P3code=self.language.iso,
                        Name=self.language.name,
                        author=self.language.author,
                        url=self.url('language.php?id=%s' % self.language.id),
                        typedby=self.language.typedby,
                        checkedby=self.language.checkedby,
                        notes=self.language.notes,
                        source=";".join(source))

        for entry in self.entries:
            if not entry.name:  # skip empty entries
                continue  # pragma: no cover

            # skip entries marked as incorrect word form due to semantics
            # (x = probably, s = definitely)
            if entry.cognacy and entry.cognacy.lower() in ('s', 'x'):
                continue  # pragma: no cover

            # handle concepts
            cid = concepts.get(entry.word_id)
            if not cid:
                self.dataset.unmapped.add_concept(ID=entry.word_id,
                                                  Name=entry.word)
                # add it if we don't have it.
                ds.add_concept(ID=entry.word_id, Name=entry.word)
                cid = entry.word_id

            # handle lexemes
            try:
                lex = ds.add_forms_from_value(
                    Local_ID=entry.id,
                    Language_ID=self.language.id,
                    Parameter_ID=cid,
                    Value=entry.name,
                    # set source to entry-level sources if they exist, otherwise use
                    # the language level source.
                    Source=[entry.source] if entry.source else source,
                    Cognacy=entry.cognacy,
                    Comment=entry.comment or '',
                    Loan=bool(entry.loan),
                )
            except:  # NOQA: E722; pragma: no cover
                print("ERROR with %r -- %r" % (entry.id, entry.name))
                raise

            if lex:
                for cognate_set_id in entry.cognates:
                    match = self.dataset.cognate_pattern.match(cognate_set_id)
                    if not match:  # pragma: no cover
                        self.log.warning(
                            'Invalid cognateset ID for entry {0}: {1}'.format(
                                entry.id, cognate_set_id))
                    else:
                        # make global cognate set id
                        cs_id = "%s-%s" % (slug(entry.word), match.group('id'))

                        ds.add_cognate(
                            lexeme=lex[0],
                            Cognateset_ID=cs_id,
                            Doubt=bool(match.group('doubt')),
                            Source=['Greenhilletal2008'] if self.section == 'austronesian' else [])

        return ds
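
`self.dataset.cognate_pattern` above is defined elsewhere; all this code relies on is that a match exposes the named groups `id` and `doubt`. A hedged, purely illustrative stand-in:

    import re

    # Hypothetical pattern, not the real one: a numeric cognate set ID with
    # an optional trailing '?' marking doubt, e.g. '12' or '12?'.
    cognate_pattern = re.compile(r'^(?P<id>\d+)(?P<doubt>\??)$')
    m = cognate_pattern.match('12?')
    assert m.group('id') == '12' and bool(m.group('doubt'))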
Example #4
    def read_bib(self, fname='sources.bib'):
        bib = database.parse_string(self.read(fname), bib_format='bibtex')
        return [Source.from_entry(k, e) for k, e in bib.entries.items()]
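
A hypothetical call site for this helper, following the pattern of the other examples (`raw_dir` and `writer` are illustrative names):

    sources = raw_dir.read_bib()      # parses sources.bib into Source objects
    writer.add_sources(*sources)      # register them with the CLDF dataset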
Example #5
    def cmd_makecldf(self, args):
        args.writer.add_sources(self.raw_dir.read("Citations.bib"))
        bib = parse_string(self.raw_dir.read('Borrowing_references.bib'),
                           'bibtex')
        for k, v in bib.entries.items():
            args.writer.add_sources(
                Source.from_entry(slug(k, lowercase=False), v))

        args.writer.cldf.add_component(
            'BorrowingTable', {
                'name': 'Likelihood',
                'dc:description':
                'Likelihood of borrowing (*possible*, *probable* or *clear*).',
                'datatype': {
                    'base': 'string',
                    'format': 'possible|clear|probable'
                }
            }, {
                'name': 'SourceLanguoid',
                'dc:description': 'Borrowing source of lexeme.',
            })
        args.writer.cldf['FormTable', 'form'].required = False
        args.writer.cldf['FormTable', 'value'].null = NULL_ITEMS
        args.writer.cldf['FormTable', 'value'].required = False
        args.writer.cldf['FormTable', 'value'].common_props['dc:description'] = \
            "Lexeme data. Contains a lexeme, or '[No equivalent]': no suitable equivalent for a meaning exists, " \
            "'[Form not found]': no suitable equivalent was found, or '[Not reconstructable]': non-reconstructable " \
            "meanings in Proto-Uralic."

        for src in self._read("Citation_codes"):
            if src["type"] == "E":
                args.writer.add_sources(
                    Source("misc",
                           src["ref_abbr"],
                           author=src["original_reference"]))

        glottocodes = {
            language["ID"]: language["Glottocode"]
            for language in self.languages
        }
        for language in self._read("Languages"):
            glottocode = glottocodes.get(language["lgid3"])
            if not glottocode:
                glottocode = self.glottolog.glottocode_by_iso.get(
                    language["ISO-639-3"])
            args.writer.add_language(
                ID=language["lgid3"],
                Name=language["language"],
                Glottocode=glottocode,
                Description=language["Description"],
                Subgroup=language["Subgroup"],
                ISO639P3code=language["ISO-639-3"],
            )

        inlists = {r['mng_item']: r for r in self._read('Meaning_lists')}
        attrs = [
            k for k in attr.fields_dict(UralexConcept).keys() if k != 'LJ_rank'
        ]
        for concept in self.concepts:
            if concept['ID'] in inlists:
                memberships = {
                    k.replace('-', '_'): v == '1'
                    for k, v in inlists[concept['ID']].items()
                    if k.replace('-', '_') in attrs
                }
                concept.update(memberships)
            args.writer.add_concept(**concept)

        for (cid, cogid), ll in itertools.groupby(
                sorted(self._read("Data"),
                       key=lambda i: (i["mng_item"], i["cogn_set"])),
                lambda i: (i["mng_item"], i["cogn_set"]),
        ):
            for language in ll:
                if language['item'] in NULL_ITEMS:
                    language['etym_notes'] += language['item']
                kw = dict(
                    Value=language["item"],
                    Language_ID=language["lgid3"],
                    Parameter_ID=cid,
                    Comment=language["general_notes"],
                    Source=[
                        slug(rid, lowercase=False) for rid in split_text(
                            language["ref_abbr"], ",", strip=True)
                    ],
                )
                kw.update({
                    k: language[k]
                    for k in [
                        "item_UPA",
                        "item_IPA",
                        "form_set",
                        "etym_notes",
                        "glossing_notes",
                    ]
                })

                for lex in args.writer.add_lexemes(**kw):
                    lex['Form'] = None if lex['Form'] in NULL_ITEMS else lex['Form']
                    if cogid not in ["?", "0"]:
                        args.writer.add_cognate(
                            lexeme=lex,
                            Cognateset_ID="{0}-{1}".format(cid, cogid))
                    if language['borr_qual']:
                        c = ': borrowed to Pre-Permic'
                        ref = language['ref_borr']
                        if c in ref:
                            comment = c[1:].strip()
                            ref = ref.replace(c, '')
                        else:
                            comment = None
                        args.writer.objects['BorrowingTable'].append(
                            dict(
                                ID=lex['ID'],
                                Target_Form_ID=lex['ID'],
                                SourceLanguoid=language['borr_source'],
                                Likelihood=language['borr_qual'],
                                Source=bibkeys(ref),
                                Comment=comment,
                            ))
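
In the CSVW `datatype` given to the BorrowingTable's `Likelihood` column above, `format` on a string base acts as a regular expression that valid values must match; a quick illustration:

    import re

    LIKELIHOOD_FORMAT = 'possible|clear|probable'
    assert re.fullmatch(LIKELIHOOD_FORMAT, 'probable')
    assert re.fullmatch(LIKELIHOOD_FORMAT, 'unlikely') is None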
Example #6
    def cmd_makecldf(self, args):
        self.create_schema(args.writer.cldf)

        pk2id = collections.defaultdict(dict)

        skip_source = [
            'Lous-1969',  # -> Loos-1969
            'Payne-1990',  # -> Payne-1990a
        ]
        updated_source_keys = {
            'Anonymous-nd': 'North-East-Frontier-Agency-1963',
        }
        updated_source_names = {
            'North-East-Frontier-Agency-1963': 'North East Frontier Agency 1963',
        }
        sources = parse_string(
            self.raw_dir.joinpath('source.bib').read_text(encoding='utf8'),
            'bibtex')
        gbs_lg_refs = collections.defaultdict(set)
        src_names = {}
        for s in self.read('source', pkmap=pk2id).values():
            if s['id'] in skip_source:
                continue
            s['id'] = updated_source_keys.get(s['id'], s['id'])
            src_names[s['id']] = updated_source_names.get(s['id'], s['name'])
            try:
                jsd = json.loads(s['jsondata'])
                if 'wals_code' in jsd:
                    for c in jsd['wals_code']:
                        gbs_lg_refs[c].add(s['id'])
                gbs = jsd['gbs']
                if gbs['id'].strip():
                    sef = sources.entries[s['id']].fields
                    sef['google_book_search_id'] = gbs['id'].strip()
                    sef['google_book_viewability'] = gbs['accessInfo']['viewability'].strip()
            except (json.decoder.JSONDecodeError, KeyError):
                continue

        chapters = self.read('contribution', extended='chapter', pkmap=pk2id)

        refs = []
        crefs = collections.defaultdict(list)
        for row in self.raw_dir.read_csv('valuesetreference.csv', dicts=True):
            if row['source_pk']:
                sid = pk2id['source'][row['source_pk']]
                if sid not in skip_source:
                    refs.append(
                        (row['valueset_pk'], updated_source_keys.get(sid, sid),
                         row['description']))
        srcids = set(r[1] for r in refs)
        for row in self.raw_dir.read_csv('contributionreference.csv',
                                         dicts=True):
            sid = pk2id['source'][row['source_pk']]
            if sid not in crefs[pk2id['contribution'][row['contribution_pk']]]:
                crefs[pk2id['contribution'][row['contribution_pk']]].append(
                    sid)
                srcids.add(sid)
        unused_srcids = []
        for id_, e in sources.entries.items():
            if id_ in skip_source:
                continue
            if id_ in srcids:
                if id_ in src_names:
                    e.fields['wals_ref_name'] = src_names[id_]
                args.writer.cldf.add_sources(Source.from_entry(id_, e))
            else:
                unused_srcids.append(id_)
            # add language references out of bibtex tag 'wals_code'
            # to ensure that nothing was missed in raw/languagesource.csv (37 cases)
            if 'wals_code' in e.fields:
                for c in e.fields['wals_code'].split('; '):
                    gbs_lg_refs[c].add(id_)

        for id_, e in sources.entries.items():
            if id_ in skip_source:
                continue
            if id_ in unused_srcids:
                if id_ in src_names:
                    e.fields['wals_ref_name'] = src_names[id_]
                args.writer.cldf.add_sources(Source.from_entry(id_, e))

        editors = {
            e['contributor_pk']: int(e['ord'])
            for e in self.read('editor', key=lambda r: int(r['ord'])).values()
        }

        contributors = self.read('contributor',
                                 pkmap=pk2id,
                                 key=lambda r: r['id'])
        for row in contributors.values():
            args.writer.objects['contributors.csv'].append({
                'ID': row['id'],
                'Name': row['name'],
                'Url': row['url'],
                'Editor_Ord': editors.get(row['pk'], 0),
            })

        cc = {
            chapters[fid]['id']: [
                (r['primary'], pk2id['contributor'][r['contributor_pk']])
                for r in rows]
            for fid, rows in itertools.groupby(
                self.read(
                    'contributioncontributor',
                    key=lambda d: (d['contribution_pk'], d['primary'] == 'f', int(d['ord']))
                ).values(),
                lambda r: r['contribution_pk'])
        }

        areas = self.read('area')
        for row in areas.values():
            args.writer.objects['areas.csv'].append({
                'ID': row['id'],
                'Name': row['name'],
                'dbpedia_url': row['dbpedia_url'],
            })

        for row in self.read('parameter',
                             extended='feature',
                             pkmap=pk2id,
                             key=lambda d: fid_key(d['id'])).values():
            args.writer.objects['ParameterTable'].append({
                'ID': row['id'],
                'Name': row['name'],
                'Chapter_ID': chapters[row['contribution_pk']]['id'],
            })

        for row in self.read(
                'domainelement',
                pkmap=pk2id,
                key=lambda d: (fid_key(d['id'].split('-')[0]), int(d['number']))).values():
            args.writer.objects['CodeTable'].append({
                'ID': row['id'],
                'Parameter_ID': pk2id['parameter'][row['parameter_pk']],
                'Name': row['name'],
                'Description': row['description'],
                'Number': int(row['number']),
                'icon': json.loads(row['jsondata'])['icon'],
            })

        identifier = self.read('identifier')
        lang2id = collections.defaultdict(lambda: collections.defaultdict(list))
        for row in self.read('languageidentifier').values():
            id_ = identifier[row['identifier_pk']]
            lang2id[row['language_pk']][id_['type']].append(
                (id_['name'], id_['description']))

        families = self.read('family', pkmap=pk2id)
        genera = self.read('genus', pkmap=pk2id)
        countries = self.read('country', pkmap=pk2id)
        lang2country = collections.defaultdict(list)
        for c in self.read('countrylanguage').values():
            lang2country[c['language_pk']].append(
                pk2id['country'][c['country_pk']])
        lrefs = collections.defaultdict(list)
        for c in self.read('languagesource').values():
            sid = pk2id['source'][c['source_pk']]
            sid = updated_source_keys.get(sid, sid)
            if sid not in lrefs[c['language_pk']]:
                lrefs[c['language_pk']].append(sid)

        for row in self.read('language', extended='walslanguage',
                             pkmap=pk2id).values():
            lid = row['id']
            genus = genera[row['genus_pk']]
            genus_icon = genus['icon'] if genus else ''
            family = families[genus['family_pk']]
            if row['name'] == genus['name'] == family['name']:
                # an isolate!
                genus = family = None
            iso_codes = set(i[0] for i in lang2id[row['pk']].get('iso639-3', []))
            glottocodes = [i[0] for i in lang2id[row['pk']].get('glottolog', [])]
            srcs = lrefs[row['pk']]
            if lid in gbs_lg_refs:
                for s in gbs_lg_refs[lid]:
                    if s not in srcs:
                        srcs.append(s)
            args.writer.objects['LanguageTable'].append({
                'ID': lid,
                'Name': row['name'].strip(),
                'ISO639P3code': list(iso_codes)[0] if len(iso_codes) == 1 else None,
                'Glottocode': glottocodes[0] if len(glottocodes) == 1 else None,
                'ISO_codes': sorted(iso_codes),
                'Latitude': row['latitude'],
                'Longitude': row['longitude'],
                'Macroarea': row['macroarea'],
                'Genus': genus['name'] if genus else None,
                'GenusIcon': genus_icon,
                'Subfamily': genus['subfamily'] if genus else None,
                'Family': family['name'] if family else None,
                'Samples_100': row['samples_100'] == 't',
                'Samples_200': row['samples_200'] == 't',
                'Country_ID': lang2country[row['pk']],
                'Source': sorted(srcs),
            })
        args.writer.objects['LanguageTable'].sort(key=lambda d: d['ID'])

        refs = {
            dpid: [
                str(Reference(
                    source=str(r[1]),
                    desc=r[2].replace('[', ')').replace(']', ')').replace(';', '.').strip()
                    if r[2] else None))
                for r in refs_
            ]
            for dpid, refs_ in itertools.groupby(refs, lambda r: r[0])
        }

        vsdict = self.read('valueset', pkmap=pk2id)

        examples = self.read('sentence', pkmap=pk2id)
        igts = {}
        for ex in examples.values():
            if all(ex[k] for k in ['description', 'analyzed', 'gloss']):
                a, g = ex['analyzed'].split(), ex['gloss'].split()
                if len(a) != len(g):
                    a, g = [ex['analyzed']], [ex['gloss']]
                igts[ex['pk']] = ex['id']
                args.writer.objects['ExampleTable'].append({
                    'ID': ex['id'],
                    'Language_ID': pk2id['language'][ex['language_pk']],
                    'Primary_Text': ex['name'],
                    'Translated_Text': ex['description'],
                    'Analyzed_Word': a,
                    'Gloss': g,
                })
        example_by_value = {
            vpk: [r['sentence_pk'] for r in rows]
            for vpk, rows in itertools.groupby(
                self.read('valuesentence', key=lambda d: d['value_pk']).values(),
                lambda d: d['value_pk'])
        }

        for row in self.read('value').values():
            vs = vsdict[row['valueset_pk']]
            comment = None
            ex = [examples[spk] for spk in example_by_value.get(row['pk'], [])]
            if len(ex) == 1 and not any(
                    ex[0][k] for k in ['description', 'analyzed', 'gloss']):
                comment = re.sub(r'[\r\n]', '', ex[0]['xhtml'])
                del example_by_value[row['pk']]
            args.writer.objects['ValueTable'].append({
                'ID': vs['id'],
                'Language_ID': pk2id['language'][vs['language_pk']],
                'Parameter_ID': pk2id['parameter'][vs['parameter_pk']],
                'Value': pk2id['domainelement'][row['domainelement_pk']].split('-')[1],
                'Code_ID': pk2id['domainelement'][row['domainelement_pk']],
                'Comment': comment,
                'Source': refs.get(vs['pk'], []),
                'Example_ID': sorted(
                    igts[epk]
                    for epk in example_by_value.get(row['pk'], [])
                    if epk in igts),
            })

        args.writer.objects['ValueTable'].sort(
            key=lambda d: (d['Language_ID'], fid_key(d['Parameter_ID'])))

        altnames = []
        for lpk in lang2id:
            for typ in lang2id[lpk]:
                if typ == 'name':
                    for name, prov in lang2id[lpk][typ]:
                        altnames.append((prov, name, pk2id['language'][lpk]))

        lnid = 0
        for (typ, name), rows in itertools.groupby(
                sorted(altnames), lambda t: (t[0], t[1])):
            lnid += 1
            args.writer.objects['language_names.csv'].append({
                'ID': str(lnid),
                'Language_ID': [r[2] for r in rows],
                'Name': name.strip(),
                'Provider': typ,
            })

        for c in sorted(countries.values(), key=lambda x: x['id']):
            args.writer.objects['countries.csv'].append({
                'ID': c['id'],
                'Name': c['name'],
            })

        desc_dir = self.raw_dir / 'descriptions'
        src_pattern = re.compile(
            'src="https?://wals.info/static/descriptions/(?P<sid>s?[0-9]+)/images/(?P<fname>[^"]+)"'
        )

        def repl(m):
            p = desc_dir.joinpath(m.group('sid'), 'images', m.group('fname'))
            if p.exists():
                return 'src="{0}"'.format(data_url(p))
            return m.string[m.start():m.end()]

        descs = {}
        docs_dir = self.cldf_dir / 'docs'
        docs_dir.mkdir(exist_ok=True)
        for d in desc_dir.iterdir():
            if d.is_dir():
                descs[d.stem] = src_pattern.sub(
                    repl,
                    d.joinpath('body.xhtml').read_text(encoding='utf8'))

        for c in sorted(chapters.values(), key=lambda x: int(x['sortkey'])):
            if c['id'] in descs:
                fname = docs_dir / 'chapter_{}.html'.format(c['id'])
                with io.open(fname, 'w', encoding='utf-8') as f:
                    f.write(descs[c['id']])
            cid, wcid = [], []
            if c['id'] in cc:
                cid = [co[1] for co in cc[c['id']] if co[0] == 't']
                wcid = [co[1] for co in cc[c['id']] if co[0] == 'f']
            args.writer.objects['chapters.csv'].append({
                'ID': c['id'],
                'Name': c['name'],
                'wp_slug': c['wp_slug'],
                'Number': c['sortkey'],
                'Area_ID': areas[c['area_pk']]['id'] if c['area_pk'] in areas else '',
                'Source': crefs.get(c['id'], []),
                'Contributor_ID': cid,
                'With_Contributor_ID': wcid,
            })
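
`fid_key` above (used when reading 'parameter' and 'domainelement' and when sorting the ValueTable) is a helper shipped with the dataset; a hedged guess at its shape, assuming WALS feature IDs like '1A' or '10B' should sort numerically first:

    import re

    def fid_key(fid):
        # hypothetical re-implementation: split '81A' into (81, 'A')
        m = re.match(r'([0-9]+)([A-Z]*)', fid)
        return int(m.group(1)), m.group(2)

    assert sorted(['10A', '2A', '1A'], key=fid_key) == ['1A', '2A', '10A']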
Example #7
    def cmd_makecldf(self, args):
        self.create_schema(args.writer.cldf)

        pk2id = collections.defaultdict(dict)
        sources = parse_string(
            self.raw_dir.joinpath('source.bib').read_text(encoding='utf8'),
            'bibtex')
        self.read('source', pkmap=pk2id)

        refs = []
        for row in self.raw_dir.read_csv('valuesetreference.csv', dicts=True):
            if row['source_pk']:
                refs.append(
                    (row['valueset_pk'], pk2id['source'][row['source_pk']],
                     row['description']))
        srcids = set(r[1] for r in refs)
        args.writer.cldf.add_sources(*[
            Source.from_entry(id_, e) for id_, e in sources.entries.items()
            if id_ in srcids
        ])

        contributors = self.read('contributor',
                                 pkmap=pk2id,
                                 key=lambda r: r['id'])
        for row in contributors.values():
            args.writer.objects['contributors.csv'].append({
                'ID': row['id'],
                'Name': row['name']
            })

        cc = {
            fid: [pk2id['contributor'][r['contributor_pk']] for r in rows]
            for fid, rows in itertools.groupby(
                self.read(
                    'contributioncontributor',
                    key=lambda d: (d['contribution_pk'], d['primary'] == 'f', int(d['ord']))
                ).values(),
                lambda r: r['contribution_pk'])
        }

        areas = self.read('area')
        chapters = self.read('contribution', extended='chapter')

        for row in self.read('parameter',
                             extended='feature',
                             pkmap=pk2id,
                             key=lambda d: fid_key(d['id'])).values():
            args.writer.objects['ParameterTable'].append({
                'ID': row['id'],
                'Name': row['name'],
                'Area': areas[chapters[row['contribution_pk']]['area_pk']]['name'],
                'Chapter': chapters[row['contribution_pk']]['name'],
                'Contributor_ID': cc[row['contribution_pk']],
            })

        for row in self.read(
                'domainelement',
                pkmap=pk2id,
                key=lambda d: (fid_key(d['id'].split('-')[0]), int(d['number']))).values():
            args.writer.objects['CodeTable'].append({
                'ID': row['id'],
                'Parameter_ID': pk2id['parameter'][row['parameter_pk']],
                'Name': row['name'],
                'Description': row['description'],
                'Number': int(row['number']),
                'icon': json.loads(row['jsondata'])['icon'],
            })

        identifier = self.read('identifier')
        lang2id = collections.defaultdict(lambda: collections.defaultdict(list))
        for row in self.read('languageidentifier').values():
            id_ = identifier[row['identifier_pk']]
            lang2id[row['language_pk']][id_['type']].append(
                (id_['name'], id_['description']))

        families = self.read('family', pkmap=pk2id)
        genera = self.read('genus', pkmap=pk2id)

        for row in self.read('language', extended='walslanguage',
                             pkmap=pk2id).values():
            lid = row['id']
            genus = genera[row['genus_pk']]
            family = families[genus['family_pk']]
            if row['name'] == genus['name'] == family['name']:
                # an isolate!
                genus = family = None
            iso_codes = set(i[0] for i in lang2id[row['pk']].get('iso639-3', []))
            glottocodes = [i[0] for i in lang2id[row['pk']].get('glottolog', [])]
            args.writer.objects['LanguageTable'].append({
                'ID': lid,
                'Name': row['name'],
                'ISO639P3code': list(iso_codes)[0] if len(iso_codes) == 1 else None,
                'Glottocode': glottocodes[0] if len(glottocodes) == 1 else None,
                'ISO_codes': sorted(iso_codes),
                'Latitude': row['latitude'],
                'Longitude': row['longitude'],
                'Genus': genus['name'] if genus else None,
                'Subfamily': genus['subfamily'] if genus else None,
                'Family': family['name'] if family else None,
                'Samples_100': row['samples_100'] == 't',
                'Samples_200': row['samples_200'] == 't',
            })
        args.writer.objects['LanguageTable'].sort(key=lambda d: d['ID'])

        refs = {
            dpid: [
                str(Reference(
                    source=str(r[1]),
                    desc=r[2].replace('[', ')').replace(']', ')').replace(';', '.').strip()
                    if r[2] else None))
                for r in refs_
            ]
            for dpid, refs_ in itertools.groupby(refs, lambda r: r[0])
        }

        vsdict = self.read('valueset', pkmap=pk2id)

        examples = self.read('sentence', pkmap=pk2id)
        igts = {}
        for ex in examples.values():
            if all(ex[k] for k in ['description', 'analyzed', 'gloss']):
                a, g = ex['analyzed'].split(), ex['gloss'].split()
                if len(a) != len(g):
                    a, g = [ex['analyzed']], [ex['gloss']]
                igts[ex['pk']] = ex['id']
                args.writer.objects['ExampleTable'].append({
                    'ID': ex['id'],
                    'Language_ID': pk2id['language'][ex['language_pk']],
                    'Primary_Text': ex['name'],
                    'Translated_Text': ex['description'],
                    'Analyzed_Word': a,
                    'Gloss': g,
                })
        example_by_value = {
            vpk: [r['sentence_pk'] for r in rows]
            for vpk, rows in itertools.groupby(
                self.read('valuesentence', key=lambda d: d['value_pk']).values(),
                lambda d: d['value_pk'])
        }

        for row in self.read('value').values():
            vs = vsdict[row['valueset_pk']]
            comment = None
            ex = [examples[spk] for spk in example_by_value.get(row['pk'], [])]
            if len(ex) == 1 and not any(
                    ex[0][k] for k in ['description', 'analyzed', 'gloss']):
                comment = ex[0]['name']
                del example_by_value[row['pk']]
            args.writer.objects['ValueTable'].append({
                'ID': vs['id'],
                'Language_ID': pk2id['language'][vs['language_pk']],
                'Parameter_ID': pk2id['parameter'][vs['parameter_pk']],
                'Value': pk2id['domainelement'][row['domainelement_pk']].split('-')[1],
                'Code_ID': pk2id['domainelement'][row['domainelement_pk']],
                'Comment': comment,
                'Source': refs.get(vs['pk'], []),
                'Example_ID': sorted(
                    igts[epk]
                    for epk in example_by_value.get(row['pk'], [])
                    if epk in igts),
            })

        args.writer.objects['ValueTable'].sort(
            key=lambda d: (d['Language_ID'], fid_key(d['Parameter_ID'])))

        altnames = []
        for lpk in lang2id:
            for typ in lang2id[lpk]:
                if typ == 'name':
                    for name, prov in lang2id[lpk][typ]:
                        altnames.append((prov, name, pk2id['language'][lpk]))

        lnid = 0
        for (typ, name), rows in itertools.groupby(
                sorted(altnames), lambda t: (t[0], t[1])):
            lnid += 1
            args.writer.objects['language_names.csv'].append({
                'ID': str(lnid),
                'Language_ID': [r[2] for r in rows],
                'Name': name,
                'Provider': typ,
            })