Example #1
def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'),
                       bib_format='bibtex')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))

    glottolog = Glottolog(glr)
    ds = StructureDataset.in_dir(cldf_dir)
    ds.tablegroup.notes.append(
        OrderedDict([('dc:title', 'environment'),
                     ('properties',
                      OrderedDict([
                          ('glottolog_version', git_describe(glottolog.repos)),
                      ]))]))
    ds.add_columns('ValueTable', {
        'name': 'Marginal',
        'datatype': 'boolean'
    }, {
        'name': 'Allophones',
        'separator': ' '
    }, 'Contribution_ID')
    features = [
        "tone", "stress", "syllabic", "short", "long", "consonantal",
        "sonorant", "continuant", "delayedRelease", "approximant", "tap",
        "trill", "nasal", "lateral", "labial", "round", "labiodental",
        "coronal", "anterior", "distributed", "strident", "dorsal", "high",
        "low", "front", "back", "tense", "retractedTongueRoot",
        "advancedTongueRoot", "periodicGlottalSource", "epilaryngealSource",
        "spreadGlottis", "constrictedGlottis", "fortis",
        "raisedLarynxEjective", "loweredLarynxImplosive", "click"
    ]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable')
    ds.add_table(
        'contributions.csv', 'ID', 'Name', 'Contributor_ID', {
            'name': 'Source',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
            'separator': ';'
        }, 'URL')
    ds.add_table(
        'contributors.csv',
        'ID',
        'Name',
        'Description',
        'Readme',
        'Contents',
        {
            'name': 'Source',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
            'separator': ';'
        },
        'URL',
    )

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = [], [], {}, {}, []
    for contrib in read('contributors.csv'):
        sources.append(
            dict(
                ID=contrib.Name,
                Name=contrib.Contributor,
                Description=contrib.Description,
                Readme=desc(dev, contrib.Name),
                Contents=contrib.Contents,
                Source=[
                    c.strip().lower() for c in contrib.Citation.split(';')
                ],
                URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
            ))

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = pid
        segments.append(
            dict(ID=pid,
                 Name=row.Name,
                 Description=row.Description,
                 SegmentClass=row.SegmentClass,
                 **{f: getattr(row, f)
                    for f in features}))

    src = {}
    for row in read('contributions.csv'):
        refs = row.References.split(';') if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in refs]
        inventories[row.ID] = dict(ID=row.ID,
                                   Name=row.Name,
                                   Contributor_ID=row.Contributor_ID,
                                   URL=row.URI if row.URI != 'NA' else '',
                                   Source=src[row.ID])

    uniq = set()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids else slug(
            inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code
                if row.ISO639P3code != 'NA' else None,
            )
        values.append(
            dict(
                ID=row.ID,
                Language_ID=lid,
                Parameter_ID=pid_map[row.Parameter_ID],
                Contribution_ID=row.Contribution_ID,
                Value=row.Name,
                # parse FALSE|TRUE|NA without eval()
                Marginal=None if row.Marginal == 'NA'
                else row.Marginal.lower() == 'true',
                Allophones=row.Allophones.split()
                if row.Allophones != 'NA' else [],
                Source=src[row.Contribution_ID],
            ))

    ds.write(
        **{
            'ValueTable': values,
            'LanguageTable': languages.values(),
            'ParameterTable': segments,
            'contributions.csv': inventories.values(),
            'contributors.csv': sources
        })
    ds.validate(logging.getLogger(__name__))
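
(The snippet above is shown without its import block. A plausible reconstruction, inferred from the calls it makes against the clldutils/csvw/pyglottolog/pycldf stack; the `desc` helper is project-specific and sketched after Example #3 below.)

# Best-guess imports for Example #1 -- inferred from usage,
# not copied from the original source.
import logging
from collections import OrderedDict
from hashlib import md5
from pathlib import Path

from clldutils.misc import slug
from clldutils.path import git_describe, read_text, write_text
from csvw.dsv import reader                  # reader(..., namedtuples=True)
from pybtex.database import parse_string     # BibTeX parsing
from pycldf import StructureDataset
from pyglottolog import Glottolog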
Example #2
    def cmd_makecldf(self, args):

        glottolog = Glottolog(args.glottolog.dir)
        clts = CLTS(Config.from_file().get_clone('clts'))
        bipa = clts.bipa
        clts_eurasian = clts.transcriptiondata_dict['eurasian']

        args.writer.cldf.add_columns("ValueTable", {
            "name": "Marginal",
            "datatype": "boolean"
        }, {
            "name": "Value_in_Source",
            "datatype": "string"
        })

        args.writer.cldf.add_columns('ParameterTable', {
            'name': 'CLTS_BIPA',
            'datatype': 'string'
        }, {
            'name': 'CLTS_Name',
            'datatype': 'string'
        })
        args.writer.cldf.add_component("LanguageTable", "Family",
                                       "Glottolog_Name")

        # load language mapping and build inventory info
        languages = []
        lang_map = {}
        all_glottolog = {lng.id: lng for lng in glottolog.languoids()}
        unknowns = defaultdict(list)
        for row in progressbar(
                self.etc_dir.read_csv("languages.csv", dicts=True)):
            lang_map[row["name"]] = slug(row["name"])
            lang_dict = {"ID": slug(row["name"]), "Name": row["name"]}
            if row["glottocode"] in all_glottolog:
                lang = all_glottolog[row["glottocode"]]
                lang_dict.update({
                    "Family": lang.family.name if lang.lineage else None,
                    "Glottocode": lang.id,
                    "ISO639P3code": lang.iso_code,
                    "Latitude": lang.latitude,
                    "Longitude": lang.longitude,
                    "Macroarea": lang.macroareas[0].name if lang.macroareas else None,
                    "Glottolog_Name": lang.name,
                })
            languages.append(lang_dict)

        # Read raw data
        with open(self.raw_dir.joinpath('phono_dbase.json')) as handler:
            raw_data = json.load(handler)

        # Iterate over raw data
        values = []
        parameters = []
        inventories = []
        counter = 1
        segment_set = set()
        with open(self.raw_dir.joinpath('sources.txt')) as f:
            sources = [source.strip() for source in f.readlines()][1:]
        sources_ = Sources.from_file(self.raw_dir / "sources.bib")
        args.writer.cldf.add_sources(*sources_)
        for idx, (language, langdata) in enumerate(raw_data.items()):
            cons = langdata["cons"]
            vows = langdata["vows"]
            tones = [tone for tone in langdata["tones"] if tone]
            source = sources[idx]
            # Prepare language key
            lang_key = language.split("#")[0].replace(",", "")

            # Add consonants and vowels to values, also collecting parameters
            for segment in cons + vows:
                marginal = segment.startswith("(")

                # Obtain the corresponding BIPA grapheme, if possible
                normalized = normalize_grapheme(segment)
                par_id = compute_id(normalized)
                if normalized in clts_eurasian.grapheme_map:
                    sound = bipa[clts_eurasian.grapheme_map[normalized]]
                else:
                    sound = bipa['<NA>']
                    unknowns[normalized].append((segment, lang_key))
                if sound.type == 'unknownsound':
                    bipa_grapheme = ''
                    desc = ''
                else:
                    bipa_grapheme = str(sound)
                    desc = sound.name
                parameters.append((par_id, normalized, bipa_grapheme, desc))

                values.append({
                    "ID": str(counter),
                    "Language_ID": lang_map[lang_key],
                    "Marginal": marginal,
                    "Parameter_ID": par_id,
                    "Value": normalized,
                    "Value_in_Source": segment,
                    "Source": [source],
                })
                counter += 1

        # Build segment data ("CLTS_BIPA" matches the column added to
        # ParameterTable above)
        segments = [{
            "ID": id_,
            "Name": normalized,
            "CLTS_BIPA": bipa_grapheme,
            "Description": desc
        } for id_, normalized, bipa_grapheme, desc in set(parameters)]

        # Write data and validate
        args.writer.write(
            **{
                "ValueTable": values,
                "LanguageTable": languages,
                "ParameterTable": segments,
            })
        for g, rest in unknowns.items():
            print('\t'.join([repr(g), str(len(rest)), g]))
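
(Example #2 calls two helpers, normalize_grapheme and compute_id, whose definitions are not shown. A minimal sketch of what they might look like, assuming md5-based IDs as in Examples #1 and #3 and Unicode normalization with the marginality parentheses stripped; the real implementations may differ.)

import unicodedata
from hashlib import md5

def normalize_grapheme(segment):
    """Hypothetical normalization: drop the '(...)' marking marginal
    segments and apply Unicode NFC normalization."""
    segment = segment.strip()
    if segment.startswith('(') and segment.endswith(')'):
        segment = segment[1:-1]
    return unicodedata.normalize('NFC', segment)

def compute_id(grapheme):
    """Hypothetical ID scheme: an uppercased md5 digest, mirroring the
    parameter IDs built in Examples #1 and #3."""
    return md5(grapheme.encode('utf8')).hexdigest().upper()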
Example #3
def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'), bib_format='bibtex')
    for _, e in bib.entries.items():
        for field in e.fields:
            e.fields[field] = e.fields[field].replace('\\', '')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))

    glottolog = Glottolog(glr)

    ds = StructureDataset.in_dir(cldf_dir)

    def describe_repos(r, org, name=None):
        return OrderedDict([
            ('dc:title', '{0}/{1}'.format(org, name or r.name)),
            ('dc:description', git_describe(r))])

    ds.tablegroup.common_props['prov:wasDerivedFrom'] = [
        describe_repos(dev, 'phoible'),
        describe_repos(scripts, 'bambooforest'),
        describe_repos(glottolog.repos, 'clld'),
    ]
    ds.tablegroup.common_props['prov:wasGeneratedBy'] = describe_repos(
        Path(__file__).parent, 'cldf-datasets', name='phoible')

    ds.add_columns(
        'ValueTable',
        {'name': 'Marginal', 'datatype': 'boolean'},
        {'name': 'Allophones', 'separator': ' '},
        'Contribution_ID')
    features = [
        "tone", "stress", "syllabic", "short", "long", "consonantal",
        "sonorant", "continuant", "delayedRelease", "approximant", "tap",
        "trill", "nasal", "lateral", "labial", "round", "labiodental",
        "coronal", "anterior", "distributed", "strident", "dorsal", "high",
        "low", "front", "back", "tense", "retractedTongueRoot",
        "advancedTongueRoot", "periodicGlottalSource", "epilaryngealSource",
        "spreadGlottis", "constrictedGlottis", "fortis",
        "raisedLarynxEjective", "loweredLarynxImplosive", "click"
    ]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable', 'Family_Glottocode', 'Family_Name')
    table = ds.add_table(
        'contributions.csv', 
        'ID', 
        'Name', 
        'Contributor_ID', 
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
        {'name': 'count_phonemes', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_consonants', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_vowels', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_tones', 'datatype': {'base': 'integer', 'minimum': 0}, 'null': 'NA'},
    )
    table.tableSchema.primaryKey = ['ID']
    table.tableSchema.foreignKeys.append(ForeignKey.fromdict(dict(
        columnReference='Contributor_ID',
        reference=dict(resource='contributors.csv', columnReference='ID'))))
    table.common_props['dc:conformsTo'] = None
    table = ds.add_table(
        'contributors.csv',
        'ID',
        'Name',
        'Description',
        'Readme',
        'Contents',
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
        {'name': 'with_tones', 'datatype': {'base': 'boolean', 'format': '1|0'}},
    )
    table.tableSchema.primaryKey = ['ID']
    table.common_props['dc:conformsTo'] = None

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = [], [], OrderedDict(), OrderedDict(), []
    with_tones = {}
    for contrib in read('contributors.csv'):
        sources.append(dict(
            ID=contrib.Name,
            Name=contrib.Contributor,
            Description=contrib.Description,
            Readme=desc(dev, contrib.Name),
            Contents=contrib.Contents,
            Source=[c.strip().lower() for c in contrib.Citation.split(';')],
            URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
            with_tones=contrib.with_tones == '1',
        ))
        with_tones[contrib.Name] = contrib.with_tones == '1'

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = (pid, row.SegmentClass)
        segments.append(dict(
            ID=pid,
            Name=row.Name,
            Description=row.Description,
            SegmentClass=row.SegmentClass,
            **{f: getattr(row, f) for f in features}
        ))

    src = {}
    for row in read('contributions.csv'):
        refs = row.References.split(';') if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in refs]
        inventories[row.ID] = dict(
            ID=row.ID, 
            Name=row.Name, 
            Contributor_ID=row.Contributor_ID.upper(), 
            URL=row.URI if row.URI != 'NA' else '',
            Source=src[row.ID],
            count_phonemes=0,
            count_consonants=0,
            count_vowels=0,
            count_tones=0,
        )

    uniq, counts = set(), Counter()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids else slug(inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            fam = lang.lineage[0] if lang and lang.lineage else None
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code if row.ISO639P3code != 'NA' else None,
                Macroarea=lang.macroareas[0].value if lang and lang.macroareas else None,
                Latitude=lang.latitude if lang else None,
                Longitude=lang.longitude if lang else None,
                Family_Glottocode=fam[1] if fam else None,
                Family_Name=fam[0] if fam else None,
            )
        pid, sc = pid_map[row.Parameter_ID]
        counts.update([(row.Contribution_ID, sc)])
        values.append(dict(
            ID=row.ID,
            Language_ID=lid,
            Parameter_ID=pid,
            Contribution_ID=row.Contribution_ID,
            Value=row.Name,
            Marginal=None if row.Marginal == 'NA' else row.Marginal.lower() == 'true',  # FALSE|TRUE|NA, no eval()
            Allophones=row.Allophones.split() if row.Allophones != 'NA' else [],
            Source=src[row.Contribution_ID],
        ))
    for key, count in counts.items():
        inventories[key[0]]['count_{0}s'.format(key[1])] = count
        inventories[key[0]]['count_phonemes'] += count

    for inv in inventories.values():
        if not with_tones[inv['Contributor_ID']]:
            assert inv['count_tones'] == 0
            inv['count_tones'] = 'NA'

    ds.write(**{
        'ValueTable': values,
        'LanguageTable': languages.values(),
        'ParameterTable': segments,
        'contributions.csv': inventories.values(),
        'contributors.csv': sources
    })
    ds.validate(logging.getLogger(__name__))
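
(Examples #1 and #3 both fill the Readme column via a desc helper that is not part of the listing. A guarded sketch under the assumption that each contributor has a README.md in a directory named after it inside the phoible dev checkout; the actual layout may differ.)

from pathlib import Path

def desc(dev, name):
    """Hypothetical helper: return the contributor's README text from the
    dev repository, or '' if no such file exists."""
    readme = Path(dev) / name / 'README.md'
    return readme.read_text(encoding='utf8') if readme.exists() else ''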
Example #4
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """

    print('Parsing markdown intros...')
    for contrib in DBSession.query(models.Contribution):
        if contrib.description:
            contrib.markup_description = markdown(contrib.description)
        else:
            contrib.markup_description = None
    print('... done')

    print('Retrieving language data from glottolog...')

    catconf = cldfcatalog.Config.from_file()
    glottolog_path = catconf.get_clone('glottolog')
    glottolog = Glottolog(glottolog_path)

    lang_ids = [lang.id for lang in DBSession.query(common.Language)]
    languoids = {l.id: l for l in glottolog.languoids(lang_ids)}

    glottocodes = [(l.id,
                    common.Identifier(id=l.id, name=l.id, type='glottolog'))
                   for l in languoids.values()]
    glottocodes = OrderedDict(sorted(glottocodes, key=lambda t: t[0]))

    isocodes = [(l.iso, common.Identifier(id=l.iso,
                                          name=l.iso,
                                          type='iso639-3'))
                for l in languoids.values() if l.iso]
    isocodes = OrderedDict(sorted(isocodes, key=lambda t: t[0]))

    DBSession.add_all(glottocodes.values())
    DBSession.add_all(isocodes.values())
    DBSession.flush()

    for lang in DBSession.query(common.Language):
        if lang.id not in languoids:
            continue
        languoid = languoids[lang.id]
        lang.name = languoid.name
        lang.latitude = languoid.latitude
        lang.longitude = languoid.longitude
        lang.macroarea = (
            languoid.macroareas[0].name if languoid.macroareas else '')

        DBSession.add(
            common.LanguageIdentifier(
                language=lang, identifier_pk=glottocodes[languoid.id].pk))

        if languoid.iso in isocodes:
            DBSession.add(
                common.LanguageIdentifier(
                    language=lang, identifier_pk=isocodes[languoid.iso].pk))

    DBSession.flush()
    print('... done')

    print('Making pretty colourful dots for parameter values...')
    all_icons = [icon.name for icon in ORDERED_ICONS]

    code_query = DBSession.query(common.DomainElement)\
        .order_by(common.DomainElement.parameter_pk, common.DomainElement.id)
    for _, param_codes in groupby(code_query, lambda c: c.parameter_pk):
        icons = cycle(all_icons)
        for code in param_codes:
            code.update_jsondata(icon=next(icons))

    DBSession.flush()
    print('... done')
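
(A note on the icon loop above: itertools.groupby only groups adjacent items, so the order_by on parameter_pk is what makes the per-parameter grouping correct. A self-contained illustration of the same cycle/groupby pattern, with made-up icon names:)

from itertools import cycle, groupby

icons = ['c0000dd', 's00dd00', 'tdd0000']          # placeholder icon names
codes = [(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd')]   # (parameter_pk, code id), pre-sorted

for _, group in groupby(codes, key=lambda c: c[0]):
    assigned = cycle(icons)   # restart the cycle for each parameter
    for code in group:
        print(code, next(assigned))
# -> each parameter's codes are coloured starting from the first icon again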