# Example #1: the cmd_makecldf method of a cldfbench dataset class. Hedged
# sketch of the imports the enclosing module is assumed to provide
# (normalize_grapheme and compute_id are taken to be dataset-local helpers):
#
#   import json
#   from collections import defaultdict
#   from cldfcatalog import Config
#   from clldutils.misc import slug
#   from pycldf.sources import Sources
#   from pyclts import CLTS
#   from pyglottolog import Glottolog
#   from tqdm import tqdm as progressbar  # any progressbar() wrapper works
    def cmd_makecldf(self, args):
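        # Load the reference catalogs: Glottolog for language metadata and
        # CLTS (with its "eurasian" transcription data) for BIPA sound lookup.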

        glottolog = Glottolog(args.glottolog.dir)
        clts = CLTS(Config.from_file().get_clone('clts'))
        bipa = clts.bipa
        clts_eurasian = clts.transcriptiondata_dict['eurasian']

        args.writer.cldf.add_columns("ValueTable", {
            "name": "Marginal",
            "datatype": "boolean"
        }, {
            "name": "Value_in_Source",
            "datatype": "string"
        })

        args.writer.cldf.add_columns('ParameterTable', {
            'name': 'CLTS_BIPA',
            'datatype': 'string'
        }, {
            'name': 'CLTS_Name',
            'datatype': 'string'
        })
        args.writer.cldf.add_component("LanguageTable", "Family",
                                       "Glottolog_Name")

        # load language mapping and build inventory info
        languages = []
        lang_map = {}
        all_glottolog = {lng.id: lng for lng in glottolog.languoids()}
        unknowns = defaultdict(list)
        for row in progressbar(
                self.etc_dir.read_csv("languages.csv", dicts=True)):
            lang_map[row["name"]] = slug(row["name"])
            lang_dict = {"ID": slug(row["name"]), "Name": row["name"]}
            if row["glottocode"] in all_glottolog:
                lang = all_glottolog[row["glottocode"]]
                lang_dict.update({
                    # use the family name, not the Languoid object
                    "Family": lang.family.name if lang.family else None,
                    "Glottocode": lang.id,
                    "ISO639P3code": lang.iso_code,
                    "Latitude": lang.latitude,
                    "Longitude": lang.longitude,
                    "Macroarea": lang.macroareas[0].name if lang.macroareas else None,
                    "Glottolog_Name": lang.name,
                })
            languages.append(lang_dict)

        # Read raw data
        with open(self.raw_dir / 'phono_dbase.json') as handler:
            raw_data = json.load(handler)

        # Iterate over raw data
        values = []
        parameters = []
        inventories = []
        counter = 1
        segment_set = set()
        with open(self.raw_dir / 'sources.txt') as f:
            sources = [line.strip() for line in f][1:]  # skip the header line
        sources_ = Sources.from_file(self.raw_dir / "sources.bib")
        args.writer.cldf.add_sources(*sources_)
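        # sources.txt lines are assumed to parallel the iteration order of
        # phono_dbase.json (see the sources[idx] lookup below).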
        for idx, (language, langdata) in enumerate(raw_data.items()):
            cons = langdata["cons"]
            vows = langdata["vows"]
            tones = [tone for tone in langdata["tones"] if tone]
            source = sources[idx]
            # Prepare language key
            lang_key = language.split("#")[0].replace(",", "")

            # Add consonants and vowels to values, also collecting parameters
            for segment in cons + vows:
                marginal = segment.startswith("(")

                # Obtain the corresponding BIPA grapheme, if possible
                normalized = normalize_grapheme(segment)
                par_id = compute_id(normalized)
                if normalized in clts_eurasian.grapheme_map:
                    sound = bipa[clts_eurasian.grapheme_map[normalized]]
                else:
                    sound = bipa['<NA>']
                    unknowns[normalized].append((segment, lang_key))
                if sound.type == 'unknownsound':
                    bipa_grapheme = ''
                    desc = ''
                else:
                    bipa_grapheme = str(sound)
                    desc = sound.name
                parameters.append((par_id, normalized, bipa_grapheme, desc))

                values.append({
                    "ID": str(counter),
                    "Language_ID": lang_map[lang_key],
                    "Marginal": marginal,
                    "Parameter_ID": par_id,
                    "Value": normalized,
                    "Value_in_Source": segment,
                    "Source": [source],
                })
                counter += 1

        # Build the ParameterTable rows; keys match the CLTS_BIPA and
        # CLTS_Name columns declared above
        segments = [{
            "ID": pid,
            "Name": normalized,
            "CLTS_BIPA": bipa_grapheme,
            "CLTS_Name": desc,
        } for pid, normalized, bipa_grapheme, desc in set(parameters)]

        # Write data and validate
        args.writer.write(
            **{
                "ValueTable": values,
                "LanguageTable": languages,
                "ParameterTable": segments,
            })
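        # Report graphemes that could not be mapped to BIPA, with their
        # occurrence counts, for manual inspection.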
        for g, rest in unknowns.items():
            print('\t'.join([repr(g), str(len(rest)), g]))
Example #2
    # Hedged: like Example #1, this cmd_makecldf assumes module-level imports,
    # here collections, clldutils.misc.slug, clldutils.text.split_text, and
    # pycldf.sources.Sources.
    def cmd_makecldf(self, args):
        args.writer.cldf.add_component('ParameterTable')
        args.writer.cldf.add_component(
            'LanguageTable',
            'Continent',
            'Genus',
            'WALSCode',  # we add more language metadata
        )
        args.writer.cldf.add_component('CodeTable')
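        # The parameters and their yes/no codes are fixed for this dataset,
        # so they are written out verbatim below.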

        args.writer.objects['ParameterTable'] = [{
            'ID': 'sortalclassifier',
            'Name': 'sortal classifier',
            'Description': 'Does the language have sortal classifiers, '
                           'regardless of whether they are optional or obligatory?',
        }, {
            'ID': 'morphosyntacticplural',
            'Name': 'morphosyntactic plural',
            'Description': 'Does the language have morphosyntactic plural markers?',
        }]
        args.writer.objects['CodeTable'] = [
            {
                'ID': 'sortalclassifier-1',
                'Parameter_ID': 'sortalclassifier',
                'Name': 'yes'
            },
            {
                'ID': 'sortalclassifier-0',
                'Parameter_ID': 'sortalclassifier',
                'Name': 'no'
            },
            {
                'ID': 'morphosyntacticplural-1',
                'Parameter_ID': 'morphosyntacticplural',
                'Name': 'yes'
            },
            {
                'ID': 'morphosyntacticplural-0',
                'Parameter_ID': 'morphosyntacticplural',
                'Name': 'no'
            },
        ]

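        # Map each WALS code to the bibliography entries that cite it, so each
        # value row can carry its sources.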
        l2s = collections.defaultdict(list)
        sources = []
        for src in sorted(
                Sources.from_file(self.raw_dir / 'sources.bib').items(),
                key=lambda i: i.id):
            if src.get('Wals_code'):
                for code in split_text(src['Wals_code'], ';', strip=True):
                    l2s[code].append(src.id)
                sources.append(src)

        args.writer.cldf.add_sources(*sources)

        for row in self.raw_dir.read_csv('GSSG_ListOfLanguages.csv',
                                         delimiter=';',
                                         dicts=True):
            lidx = slug(row['language_name'], lowercase=False)
            args.writer.objects['LanguageTable'].append({
                'ID': lidx,
                'Name': row['language_name'],
                'Latitude': row['latitude'],
                'Longitude': row['longitude'],
                'Glottocode': row['glottocode'],
                'ISO639P3code': row['iso_code'],
                'Continent': row['continent'],
                'Genus': row['genus'],
                'WALSCode': row['wals_code'],
            })
            for param in ['sortal_classifier', 'morphosyntactic_plural']:
                pid = param.replace('_', '')
                args.writer.objects['ValueTable'].append({
                    'ID': '{}-{}'.format(lidx, pid),
                    'Value': row[param],
                    'Language_ID': lidx,
                    'Parameter_ID': pid,
                    'Code_ID': '{}-{}'.format(pid, '1' if row[param] == 'yes' else '0'),
                    'Source': l2s.get(row['wals_code'], []),
                })
Example #3
# Module-level imports needed by this helper:
import collections

from pycldf.sources import Sources


def get_data(cldf, args):
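    # Returns a dict mapping CLDF table/component names to lists of row dicts.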
    relscount = 0
    cldf.sources = Sources.from_file(args.repos.path('sources.bib'))
    categorical_variables = set()
    data = collections.defaultdict(list)
    dsids = [ds.id for ds in args.repos.datasets]
    for ds in args.repos.datasets:
        data['datasets.csv'].append({
            'ID': ds.id,
            'Name': ds.name,
            'Description': ds.description,
            'Type': ds.type,
            'Year': ds.year,
            'Author': ds.author,
            'Reference': ds.reference,
            'URL': ds.url,
        })
        for soc in ds.societies:
            data['LanguageTable'].append({
                'ID': soc.id,
                'Dataset_ID': ds.id,
                'Name': soc.pref_name_for_society,
                'Glottocode': soc.glottocode,
                'Latitude': soc.Lat,
                'Longitude': soc.Long,
                'Comment': soc.Comment,
                'Glottocode_Comment': soc.glottocode_comment,
                'xd_id': soc.xd_id,
                'ORIG_name_and_ID_in_this_dataset': soc.ORIG_name_and_ID_in_this_dataset,
                'alt_names_by_society': soc.alt_names_by_society,
                'main_focal_year': soc.main_focal_year,
                'HRAF_ID': soc.HRAF_name_ID.id if soc.HRAF_name_ID else None,
                'HRAF_Name': soc.HRAF_name_ID.name if soc.HRAF_name_ID else None,
                'HRAF_Link': soc.HRAF_link,
                'origLat': soc.origLat,
                'origLong': soc.origLong,
            })
        for soc in ds.society_relations:
            for rel in soc.related:
                relscount += 1
                data['society_relations.csv'].append({
                    'ID': str(relscount),
                    'Society_ID': soc.id,
                    'Related_Society_ID': rel.id if rel.dataset in dsids else None,
                    'Related_Society_External_ID': rel.id if rel.dataset not in dsids else None,
                    'Related_Society_Name': rel.name,
                    'Related_Society_Dataset': rel.dataset,
                })
        for param in ds.variables:
            data['ParameterTable'].append({
                'ID': param.id.replace('.', '_'),
                'Dataset_ID': ds.id,
                'Name': param.title,
                'Description': param.definition,
                "Category": param.category,
                "Type": param.type,
                "Units": param.units,
                "Source": param.source,
                "Changes": param.changes,
                "Notes": param.notes,
            })
            for code in param.codes:
                if code.code == 'NA':
                    continue
                categorical_variables.add(code.var_id)
                data['CodeTable'].append({
                    'ID': '{}-{}'.format(code.var_id, code.code).replace('.', '_'),
                    'Parameter_ID': code.var_id.replace('.', '_'),
                    'Name': code.name,
                    'Description': code.description,
                })

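        # IDs of all defined codes; used below to null out Code_ID references
        # to undefined codes.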
        codes = {c['ID'] for c in data['CodeTable']}
        for i, d in enumerate(ds.data, start=1):
            code_id = None \
                if (d.var_id not in categorical_variables) or d.code == 'NA' \
                else '{}-{}'.format(d.var_id, d.code).replace('.', '_')
            if code_id and (code_id not in codes) and args.fix_code_id:
                # This is a backwards compatibility fix. New releases should not have references
                # to undefined codes!
                code_id = None  # pragma: no cover

            data['ValueTable'].append({
                'ID': '{}-{}'.format(ds.id, i),
                'Language_ID': d.soc_id,
                'Parameter_ID': d.var_id.replace('.', '_'),
                'Dataset_ID': ds.id,
                'Code_ID': code_id,
                'Value': d.code,
                'Comment': d.comment,
                'Sub_Case': d.sub_case,
                'Year': d.year,
                'Source': [ref.format_cldf() for ref in d.references],
                'Source_Coded_Data': d.source_coded_data,
                'Admin_Comment': d.admin_comment,
            })
    return data