Example #1
File: initializedb.py  Project: clld/cdk
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=cdk.__name__,
        name="CDK",
        description="Comprehensive Dictionary of Ket",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='cdk.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    DBSession.add(dataset)

    contrib = common.Contribution(id='ket', name=dataset.name)
    DBSession.add(contrib)
    for i, (id, name) in enumerate([
        ('kotorov', 'E.G. Kotorova'), ('nefedov', 'A.V. Nefedov'),
    ]):
        dataset.editors.append(
            common.Editor(contributor=common.Contributor(id=id, name=name), ord=i))

    ket = data.add(
        common.Language, 'ket',
        id='ket',
        name='Ket',
        latitude=63.76,
        longitude=87.55)
    add_language_codes(data, ket, 'ket', glottocode='kett1243')
    for abbr, name in DIALECTS.items():
        data.add(common.Language, abbr, id=abbr, name=name)

    with args.data_file('sources.txt').open(encoding='utf8') as fp:
        for i, chunk in enumerate(fp.read().split('\n\n\n')):
            try:
                id_, year, author, desc = chunk.split('\n')
            except ValueError:  # the chunk does not have exactly four lines
                print(chunk)
                raise
            data.add(
                common.Source,
                id_,
                id=str(i + 1),
                name=id_,
                author=author,
                year=year,
                description=desc)

    with UnicodeReader(args.data_file('Ket_nouns_and_other_pos_table.docx.csv')) as reader:
        load(data, reader, ket, contrib, verbs=False)

    with UnicodeReader(args.data_file('Ket_verbs_table.docx.csv')) as reader:
        load(data, reader, ket, contrib)

    print('parsing of examples was problematic in %s cases' % len(PROBLEMS))
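
The parsing loop above implies a simple layout for sources.txt: records are separated by two blank lines, and each record consists of exactly four lines (citation key, year, author, description). A hypothetical record matching what the loop unpacks (the field values are made up for illustration; only the structure is implied by the code):

kotorova2015
2015
Kotorova, E.G. and Nefedov, A.V.
Comprehensive dictionary of Ket, volume 1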
Example #2
def languoid_visitor(lang, row, _):
    try:
        add_language_codes(
            data, lang, lang.id.split('-')[0], glottolog, glottocode=row[2] or None)
    except Exception:  # report the offending row before re-raising
        print(row)
        raise
    second_languages[row[0]] = row[8]
Example #3
def test_add_language_codes(env):
    from clld.db.models.common import Language
    from clld.scripts.util import Data, add_language_codes

    add_language_codes(Data(),
                       Language(),
                       'iso',
                       glottocodes=dict(iso='glot1234'))
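
Taken together, the call sites in this listing suggest a helper that registers ISO 639-3 and Glottolog identifiers for a language, keyed either by an explicit glottocode or via a glottocodes mapping from ISO codes; some versions also take an extra glottolog argument (see Examples #2 and #7). The following is a minimal sketch of such a helper, inferred from these calls rather than from clld's actual clld.scripts.util implementation, and assuming clld's Identifier/LanguageIdentifier models and an iso member on the IdentifierType enum:

from clld.db.meta import DBSession
from clld.db.models.common import (
    Identifier, IdentifierType, LanguageIdentifier)


def add_language_codes_sketch(data, lang, isocode, glottocodes=None, glottocode=None):
    """Hypothetical reconstruction; the real helper lives in clld.scripts.util."""
    def link(type_, code):
        # Re-use an identifier if one was already registered for this code.
        key = '%s:%s' % (type_, code)
        identifier = data['Identifier'].get(key) or data.add(
            Identifier, key, id=key, name=code, type=type_)
        DBSession.add(LanguageIdentifier(language=lang, identifier=identifier))

    if isocode and len(isocode) == 3:
        link(IdentifierType.iso.value, isocode)
    if not glottocode and glottocodes:
        glottocode = glottocodes.get(isocode)
    if glottocode:
        link(IdentifierType.glottolog.value, glottocode)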
Example #4
def main(args):
    glottocodes = {}
    if getuser() == "robert":
        glottocodes = glottocodes_by_isocode("postgresql://robert@/glottolog3")

    data = Data()
    dataset = common.Dataset(id=autotyp.__name__, name="AUTOTYP", description="AUTOTYP", domain="autotyp.clld.org")
    DBSession.add(dataset)

    bib = Database.from_file(args.data_file("LenaBib.bib"), lowercase=True)

    for i, spec in enumerate(
        [
            ("bickel", "Balthasar Bickel", "University of Zurich"),
            ("nichols", "Johanna Nichols", "University of California, Berkeley"),
        ]
    ):
        contributor = data.add(common.Contributor, spec[0], id=spec[0], name=spec[1])
        DBSession.add(common.Editor(dataset=dataset, ord=i + 1, contributor=contributor))

    for l in rows(
        args.data_file("backbone_09Jan2014_directexport.tab"), newline="\r", encoding="macroman", namedtuples=True
    ):
        # LID	language	ISO639.3.2013	stock	continent	area	latitude	longitude
        if l.stock not in data["Stock"]:
            stock = data.add(models.Stock, l.stock, id=slug(l.stock), name=l.stock)
        else:
            stock = data["Stock"][l.stock]

        if l.continent not in data["Continent"]:
            continent = data.add(models.Continent, l.continent, id=slug(l.continent), name=l.continent)
        else:
            continent = data["Continent"][l.continent]

        if l.area not in data["Area"]:
            area = data.add(models.Area, l.area, id=slug(l.area), name=l.area, continent=continent)
        else:
            area = data["Area"][l.area]

        lang = data.add(
            models.Languoid,
            l.LID,
            id=l.LID,
            name=l.language,
            latitude=coord(l.latitude),
            longitude=coord(l.longitude),
            stock=stock,
            area=area,
        )
        add_language_codes(data, lang, l.ISO639_3_2013, glottocodes=glottocodes)

    loader.case_alignment(args, data, bib)
    loader.inclusive_excusive(args, data, bib)
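
Note the two shapes in which glottocodes_by_isocode is consumed in this listing: without a cols argument (here) its result is used as a plain isocode-to-glottocode mapping, while with cols (Example #8) each value is a tuple of the requested columns. A hedged illustration of the implied return values, using the Ket data from Example #1 purely as sample content:

# Without cols: {isocode: glottocode}, passed on as glottocodes=... above
glottocodes = glottocodes_by_isocode('postgresql://robert@/glottolog3')
# e.g. {'ket': 'kett1243', ...}

# With cols: {isocode: (id, name, latitude, longitude)}, unpacked in Example #8
rows = glottocodes_by_isocode(
    'postgresql://robert@/glottolog3',
    cols=['id', 'name', 'latitude', 'longitude'])
# e.g. {'ket': ('kett1243', 'Ket', 63.76, 87.55), ...}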
Example #5
def load_families(data,
                  languages,
                  glottolog=None,
                  icons=ORDERED_ICONS,
                  isolates_icon=ISOLATES_ICON):
    """Add Family objects to a database and update Language object from Glottolog.

    Family information is retrieved from Glottolog based on the id attribute of a
    language. This id must be either a glottocode or an ISO 639-3 code.

    :param data:
    :return:
    """
    icons = cycle([
        getattr(i, 'name', i) for i in icons
        if getattr(i, 'name', i) != isolates_icon
    ])
    glottolog = glottolog or Glottolog()

    for language in languages:
        if isinstance(language, (tuple, list)) and len(language) == 2:
            code, language = language
        else:
            code = language.id
        if code != '-':
            gl_language = glottolog.languoid(code)
            if gl_language:
                gl_family = gl_language.family
                if gl_family:
                    family = data['Family'].get(gl_family.id)
                    #print family
                    if not family:
                        family = data.add(
                            Family,
                            gl_family.id,
                            id=gl_family.id,
                            name=gl_family.name,
                            description=Identifier(
                                name=gl_family.id,
                                type=IdentifierType.glottolog.value).url(),
                            jsondata=dict(icon=next(icons)))
                    language.family = family

                language.macroarea = gl_language.macroareas[0]
                add_language_codes(data,
                                   language,
                                   gl_language.iso_code,
                                   glottocode=gl_language.id)
                for attr in 'latitude', 'longitude', 'name':
                    if getattr(language, attr) is None:
                        setattr(language, attr, getattr(gl_language, attr))
            else:
                language.macroarea = None
Example #6
def load_families(
        data,
        languages,
        glottolog=None,
        icons=ORDERED_ICONS,
        isolates_icon=ISOLATES_ICON):
    """Add Family objects to a database and update Language object from Glottolog.

    Family information is retrieved from Glottolog based on the id attribute of a
    language. This id must be either a glottocode or an ISO 639-3 code.

    :param data:
    :return:
    """
    icons = cycle([getattr(i, 'name', i) for i in icons
                   if getattr(i, 'name', i) != isolates_icon])
    glottolog = glottolog or Glottolog()

    for language in languages:
        if isinstance(language, (tuple, list)) and len(language) == 2:
            code, language = language
        else:
            code = language.id
        gl_language = glottolog.get(code) \
            if isinstance(glottolog, dict) else glottolog.languoid(code)
        if gl_language:
            gl_family = gl_language.family
            if not gl_family and getattr(gl_language, 'level', None) == 'family':
                # Make sure top-level families are not treated as isolates!
                gl_family = gl_language

            if gl_family:
                family = data['Family'].get(gl_family.id)
                if not family:
                    family = data.add(
                        Family,
                        gl_family.id,
                        id=gl_family.id,
                        name=gl_family.name,
                        description=Identifier(
                            name=gl_family.id, type=IdentifierType.glottolog.value).url(),
                        jsondata=dict(icon=next(icons)))
                language.family = family

            language.macroarea = gl_language.macroareas[0] if gl_language.macroareas else None
            add_language_codes(
                data, language, gl_language.iso_code, glottocode=gl_language.id)
            for attr in 'latitude', 'longitude', 'name':
                if getattr(language, attr) is None:
                    setattr(language, attr, getattr(gl_language, attr))
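
Compared with Example #5, this version also accepts a plain dict of pre-fetched Glottolog languoids (the isinstance(glottolog, dict) branch), promotes top-level families so they are not treated as isolates, and tolerates languages without macroareas. A hypothetical call using the dict path, where languoid_map is assumed to map codes to Glottolog languoid objects:

# languoid_map could be built once, e.g. {l.id: l for l in glottolog.languoids()}
load_families(
    data,
    [(l.id.split('-')[0], l) for l in data['Language'].values()],
    glottolog=languoid_map,
    isolates_icon='tcccccc')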
Example #7
def languoid_visitor(lang, row, _):
    add_language_codes(
        data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None)
    second_languages[row[0]] = row[8]
Example #8
def main(args):
    # determine if we run on a machine where other databases are available for lookup
    # locally:
    data = Data()
    genera = get_genera(data) if astroman else {}
    glottocodes, lnames, geocoords = {}, {}, {}
    if astroman:
        for k, v in glottocodes_by_isocode(
                'postgresql://robert@/glottolog3',
                cols=['id', 'name', 'latitude', 'longitude']).items():
            glottocodes[k] = v[0]
            lnames[k] = v[1]
            geocoords[k] = (v[2], v[3])

    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset, 'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
                'Creative Commons Attribution-ShareAlike 3.0 Unported License'})

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(common.Editor(
            dataset=dataset,
            ord=i + 1,
            contributor=common.Contributor(id=spec[0], name=spec[1])))

    squibs = defaultdict(list)
    for row in get_rows(args, 'Squib'):
        squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    aggregated = list(reader(args.data_file('phoible-aggregated.tsv'), namedtuples=True))
    inventory_names = {}
    for key, items in groupby(
            sorted(aggregated, key=lambda t: (t.LanguageCode, t.Source)),
            key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)

        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i + 1, key[1])

    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                    if not genus:
                        #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                        family = family_map.get(
                            (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                        genus = genera[genus_id] = data.add(
                            models.Genus, genus_id,
                            id=genus_id,
                            name=row.LanguageFamilyGenus,
                            description=family or row.LanguageFamilyRoot,
                            active=False,
                            root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot

                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            else:
                # guard against silently reusing coords from a previous iteration
                coords = (None, None)
            lang = data.add(
                models.Variety, row.LanguageCode,
                id=row.LanguageCode,
                name=lnames[row.LanguageCode],
                genus=genus,
                country=strip_quotes(row.Country),
                area=strip_quotes(row.Area),
                latitude=coords[0],
                longitude=coords[1],
                jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data, lang, row.LanguageCode, glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(
                common.Contributor, row.Source,
                id=row.Source,
                name=SOURCES[row.Source][0],
                description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(models.ContributorReference(
                    source=data['Source'][ref], contributor=contributor))

        contrib = data.add(
            models.Inventory, row.InventoryID,
            id=row.InventoryID,
            language=lang,
            source=row.Source,
            source_url=source_urls.get(row.InventoryID),
            internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[row.InventoryID],
            description=row.LanguageName)

        DBSession.add(common.ContributionContributor(
            contribution=contrib, contributor=contributor))

        for j, squib in enumerate(squibs.get(row.InventoryID, [])):
            f = common.Contribution_files(
                object=contrib,
                id='squib-%s-%s.pdf' % (contrib.id, j + 1),
                name='Phonological squib',
                description=squib,
                mime_type='application/pdf')
            assert f
            # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'), namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment, row.Phoneme,
                # md5 needs bytes and b16encode returns bytes on Python 3:
                id=b16encode(md5(description.encode('utf8')).digest()).decode('ascii'),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join(
                    [t[0] for t in unicode_desc
                     if t[1].split()[0] not in ['COMBINING', 'MODIFIER']]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(
            id=row.PhonemeID,
            contribution=inventory,
            language=inventory.language,
            parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(common.ValueSetReference(
                source=data['Source'][ref],
                valueset=vs))

        DBSession.add(common.Value(
            id=row.PhonemeID,
            name='%s %s' % (row.Phoneme, data['Inventory'][row.InventoryID].name),
            valueset=vs))
        DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(
                common.ContributionReference, '%s-%s' % (inventory_id, ref),
                source=data['Source'][ref],
                contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue

        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(common.Parameter_data(
                    key=features[j],
                    value=value,
                    ord=j,
                    object_pk=data['Segment'][row[0]].pk))
    DBSession.flush()
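
The equivalence_class computation above strips combining and modifier characters by inspecting Unicode character names. A quick, self-contained illustration of what it produces for an aspirated stop (values verifiable with Python's unicodedata):

import unicodedata

phoneme = 'p\u02b0'  # 'pʰ': LATIN SMALL LETTER P + MODIFIER LETTER SMALL H
unicode_desc = [(c, unicodedata.name(c)) for c in phoneme]
# [('p', 'LATIN SMALL LETTER P'), ('ʰ', 'MODIFIER LETTER SMALL H')]
equivalence_class = ''.join(
    c for c, name in unicode_desc
    if name.split()[0] not in ('COMBINING', 'MODIFIER'))
assert equivalence_class == 'p'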
Example #9
def main(args):
    data = Data()
    glottocodes, bibtex_keys = {}, defaultdict(set)
    for d in reader(
            args.data_file('repos', 'mappings',
                           'InventoryID-ISO-gcode-Bibkey-Source.tsv')):
        glottocodes[d['InventoryID']] = d['Glottocode']
        bibtex_keys[d['InventoryID']].add(d['BibtexKey'])

    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog3', 'glottolog'))
    languoids = {l.id: l for l in glottolog.languoids()}

    phonemes = sorted(list(
        reader(args.data_file('repos', 'data', 'phoible-by-phoneme.tsv'))),
                      key=lambda r: (r['InventoryID'], r['GlyphID']))

    inventories = defaultdict(set)
    for p in phonemes:
        if p['InventoryID'] in glottocodes:
            inventories[(languoids[glottocodes[p['InventoryID']]].name,
                         p['SpecificDialect'], p['Source'].upper())].add(
                             (p['InventoryID'], p['LanguageName']))

    inventory_names = {}
    for (glname, dname, source), invids in inventories.items():
        if len(invids) == 1:
            invid, lname = invids.pop()
            inventory_names[invid] = name_in_source(glname,
                                                    dname) + ' [%s]' % source
        else:
            use_lname = len(set(r[1] for r in invids)) == len(invids)
            for i, (invid,
                    lname) in enumerate(sorted(invids,
                                               key=lambda j: int(j[0]))):
                disambiguation = ' %s' % (i + 1, )
                if use_lname:
                    disambiguation = ' (%s)' % lname
                inventory_names[invid] = name_in_source(
                    glname, dname) + '%s [%s]' % (disambiguation, source)

    for (invid, lname, dname, source), ps in groupby(
            phonemes, lambda p: (p['InventoryID'], p['LanguageName'], p[
                'SpecificDialect'], p['Source'])):
        if invid not in glottocodes:
            continue
        ps = list(ps)
        gc = glottocodes[invid]
        lang = data['Variety'].get(gc)
        if not lang:
            languoid = languoids[gc]
            lang = data.add(
                models.Variety,
                gc,
                id=gc,
                language_code=ps[0]['LanguageCode'],
                name=languoid.name,
                level=text_type(languoid.level.name),
                latitude=languoid.latitude,
                longitude=languoid.longitude,
            )
            if lang.latitude is None and languoid.level == Level.dialect:
                ll = get_language(languoid)
                lang.latitude = ll.latitude
                lang.longitude = ll.longitude

        contrib = data.add(
            models.Inventory,
            invid,
            id=invid,
            #language=lang,
            source=source,
            #source_url=source_urls.get(row.InventoryID),
            #internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[invid],
            description=name_in_source(lname, dname))

    return

    # NB: everything below this point is unreachable -- leftover from an earlier
    # version of the script (it references names like lnames, genera and geocoords
    # that are never defined in this function).
    # FIXME: read from mappings file!
    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(
            common.Editor(dataset=dataset,
                          ord=i + 1,
                          contributor=common.Contributor(id=spec[0],
                                                         name=spec[1])))

    #squibs = defaultdict(list)
    #for row in get_rows(args, 'Squib'):
    #    squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    # FIXME: group phoible-by-phoneme by LanguageCode, Source (make sure this is unique!)
    aggregated = list(
        reader(args.data_file('phoible-aggregated.tsv'),
               delimiter='\t',
               namedtuples=True))
    inventory_names = {}
    for key, items in groupby(sorted(aggregated,
                                     key=lambda t: (t.LanguageCode, t.Source)),
                              key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)

        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i +
                                                                    1, key[1])

    # pull in Glottolog families instead? or in addition?

    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                    if not genus:
                        #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                        family = family_map.get(
                            (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                        genus = genera[genus_id] = data.add(
                            models.Genus,
                            genus_id,
                            id=genus_id,
                            name=row.LanguageFamilyGenus,
                            description=family or row.LanguageFamilyRoot,
                            active=False,
                            root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot

                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            else:
                # guard against silently reusing coords from a previous iteration
                coords = (None, None)
            lang = data.add(models.Variety,
                            row.LanguageCode,
                            id=row.LanguageCode,
                            name=lnames[row.LanguageCode],
                            genus=genus,
                            country=strip_quotes(row.Country),
                            area=strip_quotes(row.Area),
                            latitude=coords[0],
                            longitude=coords[1],
                            jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data,
                               lang,
                               row.LanguageCode,
                               glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(common.Contributor,
                                   row.Source,
                                   id=row.Source,
                                   name=SOURCES[row.Source][0],
                                   description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(
                    models.ContributorReference(source=data['Source'][ref],
                                                contributor=contributor))

        contrib = data.add(models.Inventory,
                           row.InventoryID,
                           id=row.InventoryID,
                           language=lang,
                           source=row.Source,
                           source_url=source_urls.get(row.InventoryID),
                           internetarchive_url=ia_urls.get(row.InventoryID),
                           name=inventory_names[row.InventoryID],
                           description=row.LanguageName)

        DBSession.add(
            common.ContributionContributor(contribution=contrib,
                                           contributor=contributor))

        #for j, squib in enumerate(squibs.get(row.InventoryID, [])):
        #    f = common.Contribution_files(
        #        object=contrib,
        #        id='squib-%s-%s.pdf' % (contrib.id, j + 1),
        #        name='Phonological squib',
        #        description=squib,
        #        mime_type='application/pdf')
        #    assert f
        #    # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'),
                      namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment,
                row.Phoneme,
                # md5 needs bytes and b16encode returns bytes on Python 3:
                id=b16encode(md5(description.encode('utf8')).digest()).decode('ascii'),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join([
                    t[0] for t in unicode_desc
                    if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
                ]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(id=row.PhonemeID,
                             contribution=inventory,
                             language=inventory.language,
                             parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))

        DBSession.add(
            common.Value(
                id=row.PhonemeID,
                name='%s %s' %
                (row.Phoneme, data['Inventory'][row.InventoryID].name),
                valueset=vs))
        DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(common.ContributionReference,
                     '%s-%s' % (inventory_id, ref),
                     source=data['Source'][ref],
                     contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(
            reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue

        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(
                    common.Parameter_data(
                        key=features[j],
                        value=value,
                        ord=j,
                        object_pk=data['Segment'][row[0]].pk))

    # FIXME: add allophones!

    DBSession.flush()
Example #10
def main(args):
    # determine if we run on a machine where other databases are available for lookup
    # locally:
    data = Data()
    genera = get_genera(data) if astroman else {}
    glottocodes, lnames, geocoords = {}, {}, {}
    if astroman:
        for k, v in glottocodes_by_isocode(
                'postgresql://robert@/glottolog3',
                cols=['id', 'name', 'latitude', 'longitude']).items():
            glottocodes[k] = v[0]
            lnames[k] = v[1]
            geocoords[k] = (v[2], v[3])

    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(
            common.Editor(dataset=dataset,
                          ord=i + 1,
                          contributor=common.Contributor(id=spec[0],
                                                         name=spec[1])))

    squibs = defaultdict(list)
    for row in get_rows(args, 'Squib'):
        squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    aggregated = list(
        reader(args.data_file('phoible-aggregated.tsv'), namedtuples=True))
    inventory_names = {}
    for key, items in groupby(sorted(aggregated,
                                     key=lambda t: (t.LanguageCode, t.Source)),
                              key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)

        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i +
                                                                    1, key[1])

    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                    if not genus:
                        #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                        family = family_map.get(
                            (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                        genus = genera[genus_id] = data.add(
                            models.Genus,
                            genus_id,
                            id=genus_id,
                            name=row.LanguageFamilyGenus,
                            description=family or row.LanguageFamilyRoot,
                            active=False,
                            root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot

                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            else:
                # guard against silently reusing coords from a previous iteration
                coords = (None, None)
            lang = data.add(models.Variety,
                            row.LanguageCode,
                            id=row.LanguageCode,
                            name=lnames[row.LanguageCode],
                            genus=genus,
                            country=strip_quotes(row.Country),
                            area=strip_quotes(row.Area),
                            latitude=coords[0],
                            longitude=coords[1],
                            jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data,
                               lang,
                               row.LanguageCode,
                               glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(common.Contributor,
                                   row.Source,
                                   id=row.Source,
                                   name=SOURCES[row.Source][0],
                                   description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(
                    models.ContributorReference(source=data['Source'][ref],
                                                contributor=contributor))

        contrib = data.add(models.Inventory,
                           row.InventoryID,
                           id=row.InventoryID,
                           language=lang,
                           source=row.Source,
                           source_url=source_urls.get(row.InventoryID),
                           internetarchive_url=ia_urls.get(row.InventoryID),
                           name=inventory_names[row.InventoryID],
                           description=row.LanguageName)

        DBSession.add(
            common.ContributionContributor(contribution=contrib,
                                           contributor=contributor))

        for j, squib in enumerate(squibs.get(row.InventoryID, [])):
            f = common.Contribution_files(object=contrib,
                                          id='squib-%s-%s.pdf' %
                                          (contrib.id, j + 1),
                                          name='Phonological squib',
                                          description=squib,
                                          mime_type='application/pdf')
            assert f
            # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'),
                      namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment,
                row.Phoneme,
                # md5 needs bytes and b16encode returns bytes on Python 3:
                id=b16encode(md5(description.encode('utf8')).digest()).decode('ascii'),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join([
                    t[0] for t in unicode_desc
                    if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
                ]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(id=row.PhonemeID,
                             contribution=inventory,
                             language=inventory.language,
                             parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))

        DBSession.add(
            common.Value(
                id=row.PhonemeID,
                name='%s %s' %
                (row.Phoneme, data['Inventory'][row.InventoryID].name),
                valueset=vs))
        DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(common.ContributionReference,
                     '%s-%s' % (inventory_id, ref),
                     source=data['Source'][ref],
                     contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(
            reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue

        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(
                    common.Parameter_data(
                        key=features[j],
                        value=value,
                        ord=j,
                        object_pk=data['Segment'][row[0]].pk))
    DBSession.flush()
Example #11
def main(args):
    #
    # order of init:
    # - villages
    # - files
    # - movies
    #
    videos = defaultdict(list)
    for f in util.iter_files(args):
        obj = models.File(**attr.asdict(f))
        if obj.mime_type.startswith('video'):
            videos[slug(obj.name.split('.')[0])].append(obj)
        DBSession.add(obj)

    lexicon = list(util.iter_lexicon(args))
    villages = util.get_villages(args)
    ff_images = list(util.ff_images(args))
    bib = list(util.get_bib(args))
    data = Data()

    dataset = common.Dataset(
        id=dogonlanguages.__name__,
        name="Dogon and Bangime Linguistics",
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='dogonlanguages.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'}
    )
    DBSession.add(dataset)

    if Glottolog:
        if socket.gethostname() == 'dlt5502178l':
            glottolog = Glottolog(
                Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath(
                    'glottolog3', 'glottolog'))
        else:
            glottolog = Glottolog(
                Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath(
                    'glottolog'))
        languoids = {l.id: l for l in glottolog.languoids()}
    else:
        languoids = {}
    print('got glottolog')

    for c in util.CONTRIBUTORS:
        id_ = slug(c.name.split()[-1])
        data.add(models.Member, id_, id=id_, **attr.asdict(c))
    data.add(
        models.Member, 'forkel',
        id='forkel',
        name='Robert Forkel',
        email='*****@*****.**',
        in_project=False)

    for i, id_ in enumerate(['moran', 'forkel', 'heath']):
        DBSession.add(common.Editor(
            dataset=dataset, ord=i + 1, contributor=data['Member'][id_]))

    contrib = data.add(common.Contribution, 'd', id='d', name='Dogon Languages')
    for doc in bib:
        obj = data.add(
            models.Document,
            doc.rec.id,
            _obj=bibtex2source(doc.rec, cls=models.Document))
        keywords = nfilter([s.strip() for s in doc.rec.get('keywords', '').split(',')])
        for dt in 'grammar lexicon typology texts'.split():
            if dt in keywords:
                obj.doctype = dt
                break
        obj.project_doc = ('DLP' in keywords) or bool(doc.files)
        if obj.project_doc:
            for i, cid in enumerate(util.get_contributors(doc.rec, data)):
                models.DocumentContributor(
                    document=obj, contributor=data['Member'][cid], ord=i)
        for i, (path, cdstar) in enumerate(doc.files):
            common.Source_files(
                id='%s-%s' % (obj.id, i + 1),
                name=path,
                object=obj,
                mime_type=guess_type(path)[0],
                jsondata=cdstar,
            )

    print('got bib')

    for name, (gc, desc) in LANGUAGES.items():
        gl_lang = languoids[gc]
        lat, lon = gl_lang.latitude, gl_lang.longitude
        lang = data.add(
            models.Languoid, gc,
            id=gc,
            name=name,
            description=desc,
            latitude=lat,
            longitude=lon,
            family=gl_lang.family.name if gl_lang and gl_lang.family else name,
        )
        if name == 'Penange' and lang.longitude > 0:
            lang.longitude = -lang.longitude
        if name == 'Bankan Tey':
            lang.latitude, lang.longitude = 15.07, -2.91
        if name == 'Ben Tey':
            lang.latitude, lang.longitude = 14.85, -2.95
        if name == 'Togo Kan':
            lang.latitude, lang.longitude = 14.00, -3.25
        add_language_codes(data, lang, gl_lang.iso, glottocode=gc)

    villages_by_name = defaultdict(list)
    contrib_by_initial = {c.abbr: c for c in data['Member'].values()}
    for i, village in enumerate(villages):
        lang = None
        if village.glottocode:
            lang = data['Languoid'].get(village.glottocode)
            if not lang:
                gl_lang = languoids[village.glottocode]
                lang = data.add(
                    models.Languoid, gl_lang.id,
                    id=gl_lang.id,
                    name=gl_lang.name,
                    in_project=False,
                    family=gl_lang.family.name if gl_lang.family else gl_lang.name)
        v = data.add(
            models.Village, str(i + 1),
            id=str(i + 1),
            name=village.name,
            description=village.data.pop('social info'),
            surnames=village.data.pop('surnames'),
            major_city=village.data['MajorCity'] == 'Y',
            transcribed_name=village.data.pop('Transcribed Village Name'),
            source_of_coordinates=village.data.pop('sourceOfCoordinates'),
            latitude=village.lat,
            longitude=village.lon,
            languoid=lang,
            jsondata=village.data,
        )
        villages_by_name[village.name].append(v)
        for img in village.images:
            mimetype = guess_type(img.name)[0]
            if mimetype:
                f = models.Village_files(
                    id=img.id,
                    name=img.name,
                    description=img.description,
                    date_created=img.date,
                    latitude=img.coords[0] if img.coords else None,
                    longitude=-img.coords[1] if img.coords else None,
                    object=v,
                    mime_type=mimetype,
                    jsondata=img.cdstar,
                )
                for initial in img.creators:
                    if initial in contrib_by_initial:
                        models.Fotographer(
                            foto=f, contributor=contrib_by_initial[initial])

    for cat, desc, place, name in MOVIES:
        s = slug(name)
        m = models.Movie(
            id=s,
            name=desc,
            description=cat,
            place=place,
        )
        if place in villages_by_name and len(villages_by_name[place]) == 1:
            m.village = villages_by_name[place][0]
            #print('found village: %s' % name)
        for v in videos[s]:
            #print('found video: %s' % name)
            v.movie = m
            m.duration = v.duration

    names = defaultdict(int)
    for concept in lexicon:
        add(concept, data, names, contrib)

    count = set()
    for img in ff_images:
        if img.id in count:
            continue
        count.add(img.id)
        if img.ref:
            if img.ref in data['Concept']:
                concept = data['Concept'][img.ref]
                if img.tsammalex_taxon and not concept.tsammalex_taxon:
                    concept.tsammalex_taxon = img.tsammalex_taxon
                    #print(concept.tsammalex_taxon)
                common.Parameter_files(
                    object=concept,
                    id=img.id,
                    name=img.name.decode('utf8'),
                    mime_type=guess_type(img.name)[0],
                    jsondata=img.cdstar)
            else:
                print('missing ref: %s' % img.ref)
Example #12
def main(args):  # pragma: no cover
    wl = Wordlist.from_metadata(args.data_file('cldf', 'cldf-metadata.json'))

    data = Data()
    data.add(
        common.Contributor, 'barthwolfgang',
        id='barthwolfgang',
        name="Wolfgang Barth",
        url="http://www.dynamicsoflanguage.edu.au/")
    #
    # FIXME: get dataset attributes from CLDF metadata!
    #
    dataset = common.Dataset(
        id='parabank',
        name='Parabank Pronouns',
        description='Database of pronouns',
        domain='parabank.clld.org',
        publisher_name="CoEDL Centre of Excellence for the Dynamics of Language",
        publisher_place="Canberra, Australia",
        publisher_url="http://www.dynamicsoflanguage.edu.au/",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0'})
    DBSession.add(dataset)

    for i, editor in enumerate(['barthwolfgang']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    contrib = common.Contribution(id='contrib', name='the contribution')

    for l in wl['LanguageTable']:
        lang = data.add(
            models.ParabankLanguage,
            l['ID'],
            id=l['ID'],
            name=l['Name'],
            description=l['Notes'],
            source=l['Source_Citation'],
            classification=l['Classification'],
        )
        add_language_codes(data, lang, None, glottocode=l['Glottocode'])

    for p in wl['ParameterTable']:
        data.add(
            common.Parameter,
            p['ID'],
            id=p['ID'],
            name='{0} ({1})'.format(p['Name'], p['ID']),
            #description=p['Description'],
        )

    for f in wl['FormTable']:
        vsid = '{0}-{1}'.format(f['Parameter_ID'], f['Language_ID'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id=vsid,
                language=data['ParabankLanguage'][f['Language_ID']],
                parameter=data['Parameter'][f['Parameter_ID']],
                contribution=contrib)

        DBSession.add(models.Word(
            id=f['ID'],
            name=f['Form'],
            comment=f.get('Comment'),
            original=f['Original_parameter'],
            valueset=vs))

    load_families(
        data,
        [(l.glottocode, l) for l in data['ParabankLanguage'].values()],
        glottolog_repos=args.data_file('glottolog'),
        isolates_icon='tcccccc')
Example #13
def load_families(data,
                  languages,
                  glottolog=None,
                  icons=ORDERED_ICONS,
                  isolates_icon=ISOLATES_ICON):
    """Add Family objects to a database and update Language object from Glottolog.

    Family information is retrieved from Glottolog based on the id attribute of a
    language. This id must be either a glottocode or an ISO 639-3 code.

    :param data:
    :return:
    """
    icons = cycle([
        getattr(i, 'name', i) for i in icons
        if getattr(i, 'name', i) != isolates_icon
    ])
    glottolog = glottolog or Glottolog()
    print(len(languages), languages)

    for language in languages:
        if isinstance(language, (tuple, list)) and len(language) == 2:
            code, language = language
        else:
            code = language.id
        print(language, code)
        if code != '-':
            gl_language = glottolog.languoid(code)
            if gl_language:
                gl_family = gl_language.family
                if gl_family and gl_family.name in [
                        'Sino-Tibetan', 'Dravidian', 'Indo-European',
                        'Austroasiatic'
                ]:  # the second condition is added by me (shafqat)
                    family = data['Family'].get(gl_family.id)
                    #print 'this one'
                    #print gl_family.name
                    if not family:
                        family = data.add(
                            Family,
                            gl_family.id,
                            id=gl_family.id,
                            name=gl_family.name,
                            description=Identifier(
                                name=gl_family.id,
                                type=IdentifierType.glottolog.value).url(),
                            ##jsondata=dict(icon=next(icons)))
                            jsondata=dict(icon=custom_icons[gl_family.name])
                        )  ## based on family, we can use different icons if we like as needed in case of LSI
                    language.family = family

                language.macroarea = gl_language.macroareas[0]
                add_language_codes(data,
                                   language,
                                   gl_language.iso_code,
                                   glottocode=gl_language.id)
                for attr in 'latitude', 'longitude', 'name':
                    if getattr(language, attr) is None:
                        setattr(language, attr, getattr(gl_language, attr))
            else:
                language.macroarea = None
Example #14
File: initializedb.py  Project: clld/abvd
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=abvd.__name__,
        name='ABVD',
        description='',
        domain='abvd.clld.org',
        published=date.today(),
        license='https://creativecommons.org/licenses/by/4.0/',
        contact='',
        jsondata={
            'doi': args.doi,
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    for name in ['Simon Greenhill', 'Robert Blust', 'Russell Gray']:
        common.Editor(contributor=contributor(data, name), dataset=dataset)

    cnames = Counter()
    families = Counter([l['Family'] for l in args.cldf['LanguageTable']])
    colors = dict(
        zip([i[0] for i in families.most_common()], color.qualitative_colors(len(families))))
    cid2l = {}
    for lang in args.cldf['LanguageTable']:
        lid = (lang['Name'], lang['Glottocode'])
        l = data['Language'].get(lid)
        if not l:
            l = data.add(
                common.Language, lid,
                id=lang['ID'],
                name=lang['Name'],
                latitude=lang['Latitude'],
                longitude=lang['Longitude'],
                jsondata=dict(
                    family=lang['Family'],
                    icon='{0}{1}'.format('c' if lang['Family'] else 't', colors[lang['Family']]),
                ),
            )
            if lang['Glottocode'] or lang['ISO639P3code']:
                add_language_codes(
                    data, l, isocode=lang['ISO639P3code'], glottocode=lang['Glottocode'])
        cid2l[lang['ID']] = l
        cname = '{0} ({1})'.format(lang['Name'], lang['author'])
        cnames.update([cname])
        if cnames[cname] > 1:
            cname += ' {0}'.format(cnames[cname])
        c = data.add(
            models.Wordlist, lang['ID'],
            id=lang['ID'],
            name=cname,
            description=lang['author'],
            language=l,
            notes=lang['notes'],
        )
        i = 0
        typers = (lang['typedby'] or '').split(' and ')
        checkers = (lang['checkedby'] or '').split(' and ')
        for name in typers:
            i += 1
            DBSession.add(common.ContributionContributor(
                contribution=c,
                contributor=contributor(data, name),
                ord=i,
                jsondata=dict(type='typedby and checkedby' if name in checkers else 'typedby'),
            ))
        for name in checkers:
            if name in typers:
                continue
            i += 1
            DBSession.add(common.ContributionContributor(
                contribution=c,
                contributor=contributor(data, name),
                ord=i,
                jsondata=dict(type='checkedby'),
            ))

    for param in args.cldf['ParameterTable']:
        data.add(
            common.Parameter, param['ID'],
            id=param['ID'],
            name=param['Name'],
        )

    #
    # FIXME: add sources!
    #
    vsrs = set()
    for row in args.cldf['FormTable']:
        vs = data['ValueSet'].get((row['Language_ID'], row['Parameter_ID']))
        if not vs:
            vs = data.add(
                common.ValueSet,
                (row['Language_ID'], row['Parameter_ID']),
                id='{0}-{1}'.format(row['Language_ID'], row['Parameter_ID']),
                language=cid2l[row['Language_ID']],
                parameter=data['Parameter'][row['Parameter_ID']],
                contribution=data['Wordlist'][row['Language_ID']],
            )
        v = data.add(
            common.Value,
            row['ID'],
            id=row['ID'],
            name=row['Form'],
            valueset=vs
        )

    for row in args.cldf['CognateTable']:
        cc = data['Cognateset'].get(row['Cognateset_ID'])
        if not cc:
            cc = data.add(Cognateset, row['Cognateset_ID'], id=row['Cognateset_ID'])
        data.add(
            Cognate,
            row['ID'],
            cognateset=cc,
            counterpart=data['Value'][row['Form_ID']],
            doubt=row['Doubt'],
        )
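One detail worth isolating from this example is how duplicate wordlist names are disambiguated with a Counter: the second and later occurrences of the same '<Name> (<author>)' string receive a numeric suffix. A self-contained sketch with made-up names:

from collections import Counter

cnames = Counter()
for cname in ['Bali (Smith)', 'Bali (Smith)', 'Bali (Smith)']:
    cnames.update([cname])
    if cnames[cname] > 1:
        cname += ' {0}'.format(cnames[cname])
    print(cname)
# Bali (Smith)
# Bali (Smith) 2
# Bali (Smith) 3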
Example #15
File: initdb.py  Project: clld/glottolog3
def load_languoid(data, lang, nodemap):
    dblang = data.add(
        models.Languoid,
        lang.id,
        id=lang.id,
        hid=lang.hid,
        name=lang.name,
        bookkeeping=lang.category == models.BOOKKEEPING,
        newick=lang.newick_node(nodemap).newick,
        latitude=lang.latitude,
        longitude=lang.longitude,
        #
        # TODO: switch to using the AES labels, i.e. lang.endangerment.description!
        #
        status=models.LanguoidStatus.get(
            lang.endangerment.name if lang.endangerment else 'safe'),
        level=models.LanguoidLevel.from_string(lang.level.name),
        father=data['Languoid'][lang.lineage[-1][1]] if lang.lineage else None)
    if lang.iso:
        add_language_codes(data, dblang, lang.iso)

    for prov, names in lang.names.items():
        for name in names:
            l = 'en'
            if '[' in name and name.endswith(']'):
                name, l = [s.strip() for s in name[:-1].split('[', 1)]
            add_identifier(dblang, data, name, 'name', prov, lang=l)

    for prov, ids in lang.identifier.items():
        for id_ in split_text(ids, separators=',;'):
            add_identifier(dblang, data, id_, prov, None)

    if not dblang.bookkeeping:
        # Languages in Bookkeeping do not have a meaningful classification!
        clf = lang.classification_comment
        if clf:
            for attr, pid in [('sub', 'sc'), ('family', 'fc')]:
                val = getattr(clf, attr)
                if attr == 'sub' and not val:
                    # Handle cases with subrefs but no sub comment.
                    val = getattr(clf, 'subrefs')
                    if val:
                        val = ', '.join('{0}'.format(r) for r in val)
                if attr == 'family' and not val:
                    # Handle cases with familyrefs but no family comment.
                    val = getattr(clf, 'familyrefs')
                    if val:
                        val = ', '.join('{0}'.format(r) for r in val)
                if not val:
                    continue
                vs = common.ValueSet(
                    id='%s-%s' % (pid, lang.id),
                    description=val,
                    language=dblang,
                    parameter=data['Parameter'][pid],
                    contribution=data['Contribution']['clf'])
                DBSession.add(common.Value(id='%s-%s' % (pid, lang.id), valueset=vs))

    iso_ret = lang.iso_retirement
    if iso_ret:
        DBSession.add(models.ISORetirement(
            id=iso_ret.code,
            name=iso_ret.name,
            description=iso_ret.comment,
            effective=iso_ret.effective,
            reason=iso_ret.reason,
            remedy=iso_ret.remedy,
            change_request=iso_ret.change_request,
            languoid=dblang))

    eth_cmt = lang.ethnologue_comment
    if eth_cmt:
        DBSession.add(models.EthnologueComment(
            comment=eth_cmt.comment,
            code=eth_cmt.isohid,
            type=eth_cmt.comment_type,
            affected=eth_cmt.ethnologue_versions,
            languoid=dblang))
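A small parsing detail in load_languoid: alternative names may carry a language tag in square brackets, which is split off and otherwise defaults to 'en'. The same logic in isolation:

name, l = 'Ket [ru]', 'en'
if '[' in name and name.endswith(']'):
    name, l = [s.strip() for s in name[:-1].split('[', 1)]
# name == 'Ket', l == 'ru'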
Example #16
def load_languoid(data, lang, nodemap):
    dblang = data.add(
        models.Languoid,
        lang.id,
        id=lang.id,
        hid=lang.hid,
        name=lang.name,
        bookkeeping=lang.category == models.BOOKKEEPING,
        newick=lang.newick_node(nodemap).newick,
        latitude=lang.latitude,
        longitude=lang.longitude,
        status=models.LanguoidStatus.get(
            lang.endangerment.name if lang.endangerment else 'safe'),
        level=models.LanguoidLevel.from_string(lang.level.name),
        father=data['Languoid'][lang.lineage[-1][1]] if lang.lineage else None)
    if lang.iso:
        add_language_codes(data, dblang, lang.iso)

    for prov, names in lang.names.items():
        for name in names:
            l = 'en'
            if '[' in name and name.endswith(']'):
                name, l = [s.strip() for s in name[:-1].split('[', 1)]
            add_identifier(dblang, data, name, 'name', prov, lang=l)

    for prov, ids in lang.identifier.items():
        for id_ in split_text(ids, separators=',;'):
            add_identifier(dblang, data, id_, prov, None)

    if not dblang.bookkeeping:
        # Languages in Bookkeeping do not have a meaningful classification!
        clf = lang.classification_comment
        if clf:
            for attr, pid in [('sub', 'sc'), ('family', 'fc')]:
                val = getattr(clf, attr)
                if attr == 'sub' and not val:
                    # Handle cases with subrefs but no sub comment.
                    val = getattr(clf, 'subrefs')
                    if val:
                        val = ', '.join('{0}'.format(r) for r in val)
                if not val:
                    continue
                vs = common.ValueSet(
                    id='%s-%s' % (pid, lang.id),
                    description=val,
                    language=dblang,
                    parameter=data['Parameter'][pid],
                    contribution=data['Contribution']['clf'])
                DBSession.add(common.Value(id='%s-%s' % (pid, lang.id), valueset=vs))

    iso_ret = lang.iso_retirement
    if iso_ret:
        DBSession.add(models.ISORetirement(
            id=iso_ret.code,
            name=iso_ret.name,
            description=iso_ret.comment,
            effective=iso_ret.effective,
            reason=iso_ret.reason,
            remedy=iso_ret.remedy,
            change_request=iso_ret.change_request,
            languoid=dblang))

    eth_cmt = lang.ethnologue_comment
    if eth_cmt:
        DBSession.add(models.EthnologueComment(
            comment=eth_cmt.comment,
            code=eth_cmt.isohid,
            type=eth_cmt.comment_type,
            affected=eth_cmt.ethnologue_versions,
            languoid=dblang))
Example #17
def load_languoid(glottolog, data, lang, nodemap):
    """
    Load data from one Languoid object.

    :param glottolog: A `pyglottolog.Glottolog` instance.
    :param data: A `dict` providing access to previously loaded data.
    :param lang: The `pyglottolog.languoids.Languoid` object.
    :param nodemap: A `dict` mapping glottocodes to `pyglottolog.languoids.Languoid`s.
    """
    dblang = data.add(
        models.Languoid,
        lang.id,
        id=lang.id,
        hid=lang.hid,
        name=lang.name,
        bookkeeping=lang.category == glottolog.language_types.bookkeeping.category,
        category=lang.category,
        newick=lang.newick_node(nodemap).newick,
        latitude=lang.latitude,
        longitude=lang.longitude,
        level=models.LanguoidLevel.from_string(lang.level.name),
        father=data['Languoid'][lang.lineage[-1][1]] if lang.lineage else None,
        jsondata=dict(
            iso_retirement=lang.iso_retirement.__json__() if lang.iso_retirement else None,
            ethnologue_comment=lang.ethnologue_comment.__json__()
            if lang.ethnologue_comment else None,
            links=[l.__json__() for l in lang.links],
        )
    )
    if lang.iso:
        add_language_codes(data, dblang, lang.iso)

    add_identifiers(data, dblang, lang.names, name_type=True)
    add_identifiers(data, dblang, lang.identifier, name_type=False)

    add = functools.partial(add_values, data, dblang)
    add('macroarea', [(m.id, m.name) for m in lang.macroareas])
    add('country', [(c.id, c.name) for c in lang.countries])
    if lang.endangerment:
        add('aes',
            [(lang.endangerment.status.id, lang.endangerment.status.name)],
            source=lang.endangerment.source.name,
            jsondata=attr.asdict(lang.endangerment.source),
            description=lang.endangerment.comment,
        )
    if lang.level == glottolog.languoid_levels.language:
        add('ltype', [(lang.category, lang.category)])

    if not dblang.bookkeeping:
        # Languages in Bookkeeping do not have a meaningful classification!
        clf = lang.classification_comment
        if clf:
            for attr_, pid in [('sub', 'sc'), ('family', 'fc')]:
                val = getattr(clf, attr_)
                if not val:
                    val = getattr(clf, attr_ + 'refs')
                    if val:
                        val = ', '.join('{0}'.format(r) for r in val)
                if val:
                    add(pid, [('1', '')], with_de=False, description=val)
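The functools.partial line above is plain argument freezing: the leading data and dblang arguments of add_values are fixed once, so the repeated calls stay short. A sketch of the mechanism with a stand-in function (the real add_values is project-internal and not shown here):

import functools

def add_values(data, dblang, pid, values, **kw):
    ...  # stand-in body only

data, dblang = {}, None  # placeholders for the cache and the Languoid
add = functools.partial(add_values, data, dblang)
# equivalent to: add_values(data, dblang, 'macroarea', [('eurasia', 'Eurasia')])
add('macroarea', [('eurasia', 'Eurasia')])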
Example #18
def main(args):
    """Fill the database with data retrieved from tabular files.

    'filltables()' iterates over each row of each table: 'typ' is the
    name of the table, 'name' a key column, and 'tupl' the remaining
    columns as a dict mapping column names to cell values."""

    data = Data()
    count = 0
    # dataset
    dataset = _addDataset(data)
    # load languages
    for typ, name, tupl in filltables():
        if not name or name == "na":
            continue
        # TODO: non-core languages are excluded here (only Ids starting with 'L_').
        if typ == "languages" and tupl.get('Id').startswith('L_'):
            lang = _addLang([
                name,
                tupl.get('Language', "na"),
                tupl.get('Family', "na"),
                tupl.get('fam_glottocode', ""),
                tupl.get('Area', "na"),
                tupl.get('Creator', "na"),
                tupl.get('Date', "na"),
                tupl.get('Archive', "na"),
                tupl.get('Archive_link', "na"),
                tupl.get('Translation', "na"),
                tupl.get('License', "na"),
                tupl.get('Audio license', "na"),
                tupl.get('NAKALA', "na"),
                tupl.get('Gloss', "na"),
                tupl.get('Words', 0),
                tupl.get('Speakers', 0),
                tupl.get('Texts', 0),
                tupl.get('Core words', 0),
                tupl.get('Core speakers', 0),
                tupl.get('Core texts', 0),
                tupl.get('Latitude', 0.0),
                tupl.get('Longitude', 0.0),
                tupl.get('Extended', "no")
            ])
            add_language_codes(data,
                               lang,
                               tupl.get('iso-639-3'),
                               glottocode=name)
        elif typ == "editors":
            dataset, count = _addEditor(dataset, count, [
                name,
                tupl.get('url', "na"),
                tupl.get('email', "na"),
                tupl.get('address', "na"),
                tupl.get('team', "na"),
                tupl.get('function', "na")
            ])
        elif typ == "sources":
            _addSource([
                name,
                tupl.get('bibtex_type', "na"),
                tupl.get('author', "na"),
                tupl.get('year', "na"),
                tupl.get('title', "na"),
                tupl.get('url', "na"),
                tupl.get('note', "na")
            ])
        else:
            # TODO: for texts, rows whose 'extended' column is neither 'no' nor
            # 'yes' (e.g. 'delete') are excluded here.
            if tupl.get('extended') in ['no', 'yes']:
                _addText([
                    typ, name,
                    tupl.get('name', "na"),
                    tupl.get('spk_code', "na"),
                    tupl.get('spk_age', '0'),
                    tupl.get('spk_age_c', "na"),
                    tupl.get('spk_sex', "na"),
                    tupl.get('rec_date', "na"),
                    tupl.get('rec_date_c', "na"),
                    tupl.get('genre', "na"),
                    tupl.get('genre_stim', "na"),
                    tupl.get('gloss', "na"),
                    tupl.get('transl', "na"),
                    tupl.get('sound', "na"),
                    tupl.get('overlap', "na"),
                    tupl.get('processed', "na"),
                    tupl.get('nakala', "na"),
                    tupl.get('words', 0),
                    tupl.get('extended', "no")
                ])
    # Note: the dataset is added only after the loop, once the editors
    # have been attached to it.

    DBSession.add(dataset)
    DBSession.flush()
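The dispatch in main assumes filltables() yields (typ, name, tupl) triples, one per table row, as described in the docstring. A hypothetical generator of the expected shape (sketch only; the real filltables() is project code):

def filltables():
    # (table name, key column, remaining columns as a dict)
    yield 'languages', 'exam1234', {'Id': 'L_1', 'Language': 'Example', 'iso-639-3': 'xyz'}
    yield 'editors', 'J. Doe', {'email': 'na', 'team': 'na'}
    yield 'sources', 'doe2020', {'bibtex_type': 'book', 'year': '2020'}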
Example #19
    def test_add_language_codes(self):
        from clld.db.models.common import Language
        from clld.scripts.util import Data, add_language_codes

        add_language_codes(Data(), Language(), 'iso', glottocodes=dict(iso='glot1234'))
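Taken together, the examples show two ways of supplying Glottocodes to add_language_codes: directly via the glottocode keyword, or via a glottocodes mapping from ISO code to Glottocode, as in this test. Side by side (codes are placeholders):

add_language_codes(data, lang, 'xyz', glottocode='exam1234')            # direct
add_language_codes(data, lang, 'iso', glottocodes={'iso': 'glot1234'})  # lookup dict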
Example #20
File: initializedb.py  Project: clld/csd
def main(args):
    sources = get_sources(args)
    Index('ducet1', collkey(common.Value.name)).create(DBSession.bind)
    Index('ducet2', collkey(models.Counterpart.phonetic)).create(DBSession.bind)
    data = Data()
    glottocodes, geocoords = {}, defaultdict(lambda: (None, None))
    for k, v in glottocodes_by_isocode(
            'postgresql://robert@/glottolog3',
            cols=['id', 'latitude', 'longitude']).items():
        glottocodes[k] = v[0]
        geocoords[k] = (v[1], v[2])
    geocoords['win'] = (43.50, -88.50)

    dataset = common.Dataset(
        id=csd.__name__,
        name="Comparative Siouan Dictionary",
        description="Comparative Siouan Dictionary",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='csd.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    contrib = common.Contribution(id='csd', name=dataset.name)
    for i, spec in enumerate([
        ('Robert L. Rankin', True),
        ('Richard T. Carter', True),
        ('A. Wesley Jones', True),
        ('John E. Koontz', True),
        ('David S. Rood', True),
        ('Iren Hartmann', True),
    ]):
        name, primary = spec
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i, primary=primary))

    d = Dictionary(
        args.data_file(TXT),
        entry_impl=CsdEntry,
        entry_sep='\\lx ')
    d.entries = list(filter(lambda r: r.get('lx'), d.entries))[1:]
    print(len(d.entries))

    for i, v in enumerate(_LANGUAGES):
        l = data.add(
            models.Languoid, v[0],
            id=v[0],
            name=v[1],
            ord=i,
            color=v[4].lower(),
            proto=v[0].startswith('p') and len(v[0]) == 3,
            latitude=geocoords[v[2]][0],
            longitude=geocoords[v[2]][1],
            parent=data['Languoid'].get(v[5]))
        if v[2]:
            add_language_codes(data, l, v[2], glottocodes=glottocodes)
        if l.id == 'pn':
            l.latitude, l.longitude = (42.75, -98.03)
        if l.id == 'op':
            l.latitude, l.longitude = (43.5, -96.6)
        if l.id == 'mo':
            l.latitude, l.longitude = (40.05, -95.52)

    pnames = set()

    def _get(d, marker):
        _l = set(nfilter(d.get(marker, [])))
        if _l:
            _l = list(_l)
            if marker not in ['oo', 'or']:
                assert len(_l) == 1
                _l = _l[0]
            return _l

    def add_counterpart(d, vs, id,
                        phonetic,  # forms
                        cognate,  # oo
                        me, cm, so, org):
        assert phonetic or cognate

        if not cognate:
            if vs.language.proto:
                cognate = phonetic
                phonetic = None
            else:
                cognate = '[%s]' % phonetic
        m = models.Counterpart(
            id=id,
            name=cognate,
            phonetic=phonetic,
            description=me or '[%s]' % vs.parameter.name,
            comment=cm,
            original_entry=org,
            other_reconstructions='; '.join(_get(d, 'or') or []) if vs.language.id == 'psi' else None,
            valueset=vs)
        if so:
            for sid in nfilter([s.strip() for s in SEP_PATTERN.split(so or '')]):
                match = SID_PATTERN.match(sid)
                if not match:
                    continue

                name = sid
                sid = normalize_sid(match.group('key'))
                source = data['Source'].get(sid)
                if not source:
                    if sid in sources:
                        s = sources[sid]
                        source = data.add(
                            common.Source, sid,
                            id=sid,
                            name=s['Name'].upper() if len(s['Name']) <= 3 else s['Name'],
                            description=s.get('Title', s['citation']),
                            author=s.get('Author'),
                            title=s.get('Title'),
                            year=s.get('Year'),
                        )
                    else:
                        source = data.add(
                            common.Source, sid,
                            id=sid,
                            name=name.upper() if len(name) <= 3 else name)
                m.references.append(models.ValueReference(
                    source=source, description=match.group('pages')))

    for i, entry in enumerate(sorted(d.entries, key=lambda d: d.get('lx'), reverse=True)):
        lemma = entry.get('lx')
        if not lemma or not lemma.strip():
            continue
        pname = lemma
        j = 1
        while pname in pnames:
            pname = '%s (%s)' % (lemma, j)
            j += 1
        pnames.add(pname)
        contrib = data.add(
            common.Contribution, pname, id=str(i + 1), name='Entry "%s"' % pname)
        meaning = data.add(
            models.Entry, pname,
            id=str(i + 1),
            name=pname,
            contribution=contrib,
            description=entry.get('com'),
            psi_reconstruction_with_root_extension_code=entry.get('lxcm'),
            sd=normalize_comma_separated(entry.get('sd'), SD, lower=True),
            ps=normalize_comma_separated(entry.get('ps'), PS),
            othlgs='\n---\n'.join(entry.getall('othlgs')))
        if meaning.description:
            meaning.description = meaning.description.replace('.\n', '.\n\n')

        for lid, words in entry.get_words().items():
            vsid = '%s-%s' % (lid, meaning.id)
            vs = data.add(
                common.ValueSet, vsid,
                id=vsid,
                parameter=meaning,
                contribution=contrib,
                language=data['Languoid'][lid])

            for j, d in enumerate(words):
                looped = False

                for k, (oo, me, so, cm, org) in enumerate(izip_longest(
                        *[d.get(_m, []) for _m in 'oo me so cm _org'.split()])):
                    if not oo:
                        continue
                    looped = True
                    add_counterpart(d,
                        vs,
                        '%s-%s-%s' % (vsid, j + 1, k + 1),
                        d['forms'],
                        oo,
                        me,
                        cm,
                        so,
                        org)

                if not looped:  # no 'oo' markers at all for this word
                    if not d['forms']:
                        print('--->', d)
                        continue
                    add_counterpart(d,
                        vs,
                        '%s-%s-%s' % (vsid, j + 1, 1),
                        d['forms'],
                        '; '.join(_get(d, 'oo') or []),
                        _get(d, 'me'),
                        _get(d, 'cm'),
                        _get(d, 'so'),
                        _get(d, '_org'))
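The izip_longest call near the top of the counterpart loop (Python 2; the Python 3 name is itertools.zip_longest) pads the shorter marker lists with None, so the oo/me/so/cm/org values stay aligned per word. In isolation:

from itertools import zip_longest  # izip_longest on Python 2

list(zip_longest(['oo1', 'oo2'], ['me1']))
# -> [('oo1', 'me1'), ('oo2', None)]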