Example #1
def main(args):
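    # Create a functional index over the DUCET collation key of Value.name,
    # so queries ordered via collkey() can use it.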
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    repos = Path(os.path.expanduser('~')).joinpath('venvs/lexirumah/lexirumah-data')

    with transaction.manager:
        dataset = common.Dataset(
            id=lexirumah.__name__,
            name="lexirumah",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexirumah.model-ling.eu',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name': 'Creative Commons Attribution 4.0 International License'})
        DBSession.add(dataset)

    glottolog_repos = Path(
        lexirumah.__file__).parent.parent.parent.parent.joinpath('glottolog3', 'glottolog')
    languoids = {l.id: l for l in Glottolog(glottolog_repos).languoids()}
    concepticon = Concepticon(
        Path(lexirumah.__file__).parent.parent.parent.parent.joinpath('concepticon', 'concepticon-data'))
    conceptsets = {c.id: c for c in concepticon.conceptsets.values()}

    skip = True
    for dname in sorted(repos.joinpath('datasets').iterdir(), key=lambda p: p.name):
        #if dname.name == 'benuecongo':
        #    skip = False
        #if skip:
        #    continue
        if dname.is_dir() and dname.name != '_template':
            mdpath = dname.joinpath('cldf', 'metadata.json')
            if mdpath.exists():
                print(dname.name)
                import_cldf(dname, load(mdpath), languoids, conceptsets)

    with transaction.manager:
        load_families(
            Data(),
            DBSession.query(LexiRumahLanguage),
            glottolog_repos=glottolog_repos,
            isolates_icon='tcccccc')
Example #2
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    repos = Path(os.path.expanduser('~')).joinpath('venvs/lexibank/lexibank-data')

    with transaction.manager:
        dataset = common.Dataset(
            id=lexibank.__name__,
            name="lexibank",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexibank.clld.org',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name': 'Creative Commons Attribution 4.0 International License'})
        DBSession.add(dataset)

    glottolog = Glottolog(
        Path(lexibank.__file__).parent.parent.parent.parent.joinpath('glottolog3', 'glottolog'))
    languoids = {l.id: l for l in glottolog.languoids()}
    concepticon = Concepticon(
        Path(lexibank.__file__).parent.parent.parent.parent.joinpath('concepticon', 'concepticon-data'))
    conceptsets = {c['ID']: c for c in concepticon.conceptsets()}

    for dname in repos.joinpath('datasets').iterdir():
        #if dname.name not in ['acbd']:
        #    continue
        if dname.is_dir() and dname.name != '_template':
            #if dname.name != 'zenodo34092':
            #    continue
            mdpath = dname.joinpath('metadata.json')
            if mdpath.exists():
                print(dname.name)
                import_cldf(dname, load(mdpath), languoids, conceptsets)

    with transaction.manager:
        load_families(
            Data(),
            DBSession.query(LexibankLanguage),
            glottolog=languoids,
            isolates_icon='tcccccc')
Example #3
def test_collkey(data):
    from clld.db.util import collkey
    from clld.db.models.common import Language

    collkey(Language.name)
Example #4
def order(self):
    return collkey(Value.name)
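For orientation, here is a minimal sketch (not taken from any project above) of how such an order expression is typically consumed: the collkey(...) construct is passed to SQLAlchemy's order_by, so rows come back in DUCET collation order and, on PostgreSQL, the 'ducet' functional index created in Example #1 can serve the query. The helper name below is made up for illustration.

from clld.db.meta import DBSession
from clld.db.models.common import Value
from clld.db.util import collkey


def values_in_collation_order():
    # Order Value rows by the collation key of their name.
    return DBSession.query(Value).order_by(collkey(Value.name))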
Example #5
def main(args):

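    # Functional collation index over Value.name with the IPA marks ˈ ː ˌ (and
    # commas) stripped by SQL translate(), so ordering ignores those symbols.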
    Index('ducet', collkey(func.translate(common.Value.name, 'ˈ,ː,ˌ', '')))\
        .create(DBSession.bind)

    data = Data()

    dataset = common.Dataset(
        id=numerals.__name__,
        name="Numeralbank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain="numerals.clld.org",
        jsondata={
            "license_icon": "cc-by.png",
            "license_name": "Creative Commons Attribution 4.0 International License",
        },
    )

    DBSession.add(dataset)

    for i, (id_, name) in enumerate(
        [("verkerkannemarie", "Annemarie Verkerk"), ("rzymskichristoph", "Christoph Rzymski")]
    ):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)

    DBSession.add(dataset)

    # Take meta data from curated CLDF data set
    ds = Wordlist.from_metadata(data_repos[1]['data_path'] / 'cldf' / 'cldf-metadata.json')
    # Parameters:
    for parameter in ds["ParameterTable"]:
        data.add(
            models.NumberParameter,
            parameter["ID"],
            id=parameter["ID"],
            name="{0}".format(parameter["ID"]),
            concepticon_id=parameter['Concepticon_ID'],
        )
    basis_parameter = data.add(
        models.NumberParameter,
        "0",
        id="0",
        name="Base",
    )
    load_family_langs = []
    for language in ds["LanguageTable"]:
        lang = data.add(
            models.Variety,
            language["ID"],
            id=language["ID"],
            name=language["Name"],
            latitude=language["Latitude"],
            longitude=language["Longitude"],
            creator=language["Contributor"],
            comment=language["Comment"],
            url_soure_name=language["SourceFile"],
        )
        if language["Glottocode"]:
            load_family_langs.append((language["Glottocode"], lang))

    # get original forms
    ds = Wordlist.from_metadata(data_repos[0]['data_path'] / 'cldf' / 'cldf-metadata.json')
    org_forms = {f["ID"]: f for f in ds["FormTable"]}

    d = data_repos[1]
    contrib = data.add(
        common.Contribution,
        d['id'],
        id=d['id'],
        name=d['name']
    )

    # process curated forms
    ds = Wordlist.from_metadata(data_repos[1]['data_path'] / 'cldf' / 'cldf-metadata.json')

    # Add Base info if given
    for language in ds["LanguageTable"]:
        if language["Base"]:
            basis = language["Base"]
            de = data["DomainElement"].get(basis)
            if not de:
                de = data.add(
                    common.DomainElement,
                    basis,
                    id=text_type(basis),
                    name=text_type(basis),
                    parameter=basis_parameter,
                )
            vs = data.add(
                common.ValueSet,
                data["Variety"][language["ID"]].id,
                id=data["Variety"][language["ID"]].id,
                language=data["Variety"][language["ID"]],
                parameter=basis_parameter,
                contribution=contrib,
            )

            common.Value(
                id=data["Variety"][language["ID"]].id,
                valueset=vs,
                domainelement=de
            )

    # Forms:
    for form in ds["FormTable"]:
        valueset_id = "{0}-{1}".format(form["Parameter_ID"], form["Language_ID"])
        vs = data["ValueSet"].get(valueset_id)

        # Create a ValueSet for this parameter/language pair unless we already
        # have one; skip forms whose language is unknown.
        if not vs:
            if form["Language_ID"] not in data["Variety"]:
                continue
            vs = data.add(
                common.ValueSet,
                valueset_id,
                id=valueset_id,
                language=data["Variety"][form["Language_ID"]],
                parameter=data["NumberParameter"][form["Parameter_ID"]],
                contribution=contrib,
            )

        org_form = ""
        if form["ID"] in org_forms:
            if unicodedata.normalize('NFC', org_forms[form["ID"]]["Form"].strip()) != form["Form"]:
                org_form = org_forms[form["ID"]]["Form"]
        else:
            org_form = "no original form"
        DBSession.add(
            models.NumberLexeme(
                id=form["ID"],
                name=form["Form"],
                comment=form["Comment"],
                is_loan=form["Loan"],
                other_form=form["Other_Form"],
                org_form=org_form,
                is_problematic=form["Problematic"],
                valueset=vs,
            )
        )

    load_families(
        Data(),
        load_family_langs,
        glottolog_repos=gl_repos,
        strict=False,
    )

    distinct_varieties = DBSession.query(models.Variety.family_pk).distinct().all()
    families = dict(
        zip([r[0] for r in distinct_varieties], color.qualitative_colors(len(distinct_varieties)))
    )

    for l in DBSession.query(models.Variety):
        l.jsondata = {"color": families[l.family_pk]}

    p = common.Parameter.get("0")
    colors = color.qualitative_colors(len(p.domain))

    for i, de in enumerate(p.domain):
        de.jsondata = {"color": colors[i]}
Example #6
    def test_collkey(self):
        from clld.db.util import collkey
        from clld.db.models.common import Language

        collkey(Language.name)
Example #7
def order(self):
    return collkey(Counterpart.phonetic)
Example #8
def main(args):
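    # The collation key function used by collkey() is only available on
    # PostgreSQL, so the functional index is created for that dialect only.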
    if DBSession.bind.dialect.name == 'postgresql':
        Index('ducet', collkey(common.Value.name)).create(DBSession.bind)

    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(
            data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub(r'\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):

        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue

            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue

            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))

            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
Example #9
def main(args):
    fts.index('fts_index', Word.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    if DBSession.bind.dialect.name == 'postgresql':
        Index('ducet', collkey(common.Unit.name)).create(DBSession.bind)
    data = Data()

    dataset = common.Dataset(
        id=dictionaria.__name__,
        name="Dictionaria",
        description="The Dictionary Journal",
        published=date(2017, 3, 30),
        contact='*****@*****.**',
        domain='dictionaria.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    for i, (id_, name) in enumerate([
        ('haspelmathmartin', 'Martin Haspelmath'),
        ('stiebelsbarbara', 'Barbara Stiebels')
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))

    comparison_meanings = {}

    print('loading concepts ...')

    glosses = set()
    concepticon = Concepticon(
        REPOS.joinpath('..', '..', 'concepticon', 'concepticon-data'))
    if not args.no_concepts:
        for conceptset in concepticon.conceptsets.values():
            if conceptset.gloss in glosses:
                continue
            glosses.add(conceptset.gloss)
            cm = data.add(
                ComparisonMeaning,
                conceptset.id,
                id=conceptset.id,
                name=conceptset.gloss.lower(),
                description=conceptset.definition,
                concepticon_url='http://concepticon.clld.org/parameters/%s' % conceptset.id)
            comparison_meanings[cm.id] = cm

    DBSession.flush()

    print('... done')

    comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()}
    submissions = []

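    # First pass: register dictionaries and their contributors; the heavy
    # per-dictionary loading happens below, one transaction per submission.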
    for submission in REPOS.joinpath(
            'submissions-internal' if args.internal else 'submissions').glob('*'):
        if not submission.is_dir():
            continue

        try:
            submission = Submission(submission)
        except ValueError:
            continue

        md = submission.md
        if md is None:
            print('no md', submission.id)
            continue

        if not md['date_published']:
            print('no date', submission.id)
            continue

        id_ = submission.id
        if args.dict and args.dict != id_ and args.dict != 'all':
            print('not selected', submission.id)
            continue
        lmd = md['language']
        props = md.get('properties', {})
        props.setdefault('custom_fields', [])
        props['metalanguage_styles'] = {}
        for v, s in zip(props.get('metalanguages', {}).values(),
                        ['success', 'info', 'warning', 'important']):
            props['metalanguage_styles'][v] = s
        props['custom_fields'] = ['lang-' + f if f in props['metalanguage_styles'] else f
                                  for f in props['custom_fields']]
        props.setdefault('choices', {})

        language = data['Variety'].get(lmd['glottocode'])
        if not language:
            language = data.add(
                Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name'])

        md['date_published'] = md['date_published'] or date.today().isoformat()
        if '-' not in md['date_published']:
            md['date_published'] = md['date_published'] + '-01-01'
        dictionary = data.add(
            Dictionary,
            id_,
            id=id_,
            number=md.get('number'),
            name=props.get('title', lmd['name'] + ' dictionary'),
            description=submission.description,
            language=language,
            published=date(*map(int, md['date_published'].split('-'))),
            doi=md.get('doi'),
            jsondata=props)

        for i, spec in enumerate(md['authors']):
            if not isinstance(spec, dict):
                cname, address = spec, None
                spec = {}
            else:
                cname, address = spec['name'], spec.get('affiliation')
            name = HumanName(cname)
            cid = slug('%s%s' % (name.last, name.first))
            contrib = data['Contributor'].get(cid)
            if not contrib:
                contrib = data.add(
                    common.Contributor,
                    cid,
                    id=cid,
                    name=cname,
                    address=address,
                    url=spec.get('url'),
                    email=spec.get('email'))
            DBSession.add(common.ContributionContributor(
                ord=i + 1,
                primary=spec.get('primary', True),
                contributor=contrib,
                contribution=dictionary))

        submissions.append((dictionary.id, language.id, submission))
    transaction.commit()

    for did, lid, submission in submissions:
        transaction.begin()
        print('loading %s ...' % submission.id)
        dictdata = Data()
        lang = Variety.get(lid)
        submission.load_sources(Dictionary.get(did), dictdata)
        submission.load_examples(Dictionary.get(did), dictdata, lang)
        submission.dictionary.load(
            submission,
            dictdata,
            Dictionary.get(did),
            lang,
            comparison_meanings,
            OrderedDict(submission.md.get('properties', {}).get('labels', [])))
        transaction.commit()
        print('... done')

    transaction.begin()
    load_families(
        Data(),
        [v for v in DBSession.query(Variety) if re.match('[a-z]{4}[0-9]{4}', v.id)],
        glottolog_repos='../../glottolog/glottolog')
Example #10
def order(self):
    return collkey(Word.name), Word.number
Example #11
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    data = Data()

    def read(table):
        return list(dsv.reader(
            args.data_file(table + '.csv'), delimiter=',', namedtuples=True))

    dataset = common.Dataset(
        id=ids.__name__,
        name="IDS",
        description="The Intercontinental Dictionary Series",
        #published=date(2009, 8, 15),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-nc-nd/2.0/de/88x31.png',
            'license_name':
                'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany License',
        },
        domain='ids.clld.org')

    DBSession.add(dataset)
    data_desc = defaultdict(dict)
    for l in read('x_lg_data'):
        data_desc[l.lg_id][l.map_ids_data] = l.header

    # language lang
    exclude = []
    for l in read('lang'):
        if l.status == '1':
            exclude.append(l.lg_id)
            continue
        lang = data.add(models.IdsLanguage, l.lg_id, id=l.lg_id, name=l.lg_name)
        data.add(
            models.Dictionary, l.lg_id,
            id=l.lg_id, name=l.lg_name,
            language=lang,
            default_representation=data_desc[l.lg_id].get('1'),
            alt_representation=data_desc[l.lg_id].get('2'),
            jsondata=dict(status=l.status, date=l.date))

    iso_codes = {l.id: l.sil_code for l in read('sil_lang')}
    languages = {l.lg_id: iso_codes[l.sil_id]
                 for l in read('x_lg_sil') if l.lg_id not in exclude}
    load_families(Data(), [(v, data['IdsLanguage'][k]) for k, v in languages.items()])

    contributors = defaultdict(list)
    sources = defaultdict(list)
    for l in read('lang_compilers'):
        if l.lg_id in exclude:
            continue
        if l.name == "BIBIKO":
            continue
        #name	lg_id	what_did_id
        if int(l.what_did_id) in models.ROLES:
            contributors[slug(l.name)].append((l.name, int(l.what_did_id), l.lg_id))
        else:
            if int(l.what_did_id) not in [4, 395]:
                print(l.what_did_id)
                raise ValueError
            sources[l.name].append(l.lg_id)

    for s, roles in contributors.items():
        name = roles[0][0]
        c = data.add(common.Contributor, s, id=s, name=name)
        if name == 'Mary Ritchie Key':
            c.address = 'University of California, Irvine'
        for lg, specs in groupby(sorted(roles, key=lambda r: r[2]), key=lambda r: r[2]):
            sroles = sorted(
                [s[1] for s in specs],
                reverse=True,
                key=lambda what: what + 2 if what == 2 else what)
            what = sroles[0]
            DBSession.add(common.ContributionContributor(
                contribution=data['Dictionary'][lg],
                contributor=c,
                ord=what,
                primary=what == 2))

    data.add(
        common.Contributor, 'bernardcomrie',
        id='bernardcomrie',
        name="Bernard Comrie",
        address="Max Planck Institute for Evolutionary Anthropology, Leipzig")

    for i, editor in enumerate(['maryritchiekey', 'bernardcomrie']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    for i, name in enumerate(sorted(sources.keys())):
        c = data.add(common.Source, name, id=str(i + 1), name=name, description=name)

    DBSession.flush()
    for name, lgs in sources.items():
        for lg in lgs:
            if lg in exclude:
                continue
            try:
                DBSession.add(common.LanguageSource(
                    language_pk=data['IdsLanguage'][lg].pk,
                    source_pk=data['Source'][name].pk))
            except KeyError:
                print(name, lgs)
                continue

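    # Register alternative language names as shared 'name' identifiers and
    # link them to the corresponding languages.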
    altnames = {}
    for i, l in enumerate(read('alt_names')):
        if l.name in altnames:
            identifier = altnames[l.name]
        else:
            identifier = data.add(
                common.Identifier, l.name,
                id='name-%s' % i, type='name', name=l.name, description='IDS')
            altnames[l.name] = identifier
        if l.lg_id not in exclude and l.name != data['IdsLanguage'][l.lg_id].name:
            DBSession.add(common.LanguageIdentifier(
                identifier=identifier,
                language=data['IdsLanguage'][l.lg_id]))

    # parameter chapter/entry
    for l in read('chapter'):
        data.add(models.Chapter, l.chap_id, id=l.chap_id, name=l.chap_title)

    entries = {}
    for l in read('entry'):
        id_ = '%s-%s' % (l.chap_id, l.entry_id)
        name = l.trans_english
        if name in entries:
            entries[name] += 1
            name = name + ' (%s)' % entries[name]
        else:
            entries[name] = 1
        kw = {'id': id_, 'name': name, 'chapter': data['Chapter'][l.chap_id]}
        for ll in 'french russian spanish portugese'.split():
            kw[ll] = getattr(l, 'trans_' + ll)
        data.add(models.Entry, id_, sub_code=l.entry_id, **kw)

    misaligned = []

    DBSession.flush()
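    # Only primary keys are needed from here on (they stay valid across the
    # per-language commits below), so replace the cached objects with their pks.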
    for entity in 'IdsLanguage Entry Chapter Dictionary'.split():
        for k in list(data[entity].keys()):
            data[entity][k] = data[entity][k].pk

    synsets = set()

    for lg_id, entries in groupby(
            sorted(read('ids'), key=lambda t: t.lg_id), lambda k: k.lg_id):
        if lg_id in exclude or not lg_id:
            continue

        # keep the memory footprint reasonable
        transaction.commit()
        transaction.begin()

        try:
            language = common.Language.get(data['IdsLanguage'][lg_id])
        except KeyError:
            print(list(entries))
            raise
        desc = data_desc.get(lg_id, {})
        words = defaultdict(list)
        for l in entries:
            if empty.match(l.data_1):
                continue

            entry_id = '%s-%s' % (l.chap_id, l.entry_id)
            if entry_id not in data['Entry']:
                data.add(
                    models.Entry, entry_id,
                    id=entry_id,
                    name=entry_id,
                    #active=False,
                    sub_code=l.entry_id,
                    chapter_pk=data['Chapter'][l.chap_id])
                DBSession.flush()
                data['Entry'][entry_id] = data['Entry'][entry_id].pk

            id_ = '%s-%s' % (entry_id, l.lg_id)
            if id_ in synsets:
                vs = models.Synset.get(id_)
            else:
                vs = models.Synset(
                    id=id_,
                    comment=get_string(l.comment or ''),
                    alt_representation=get_string(l.data_2),
                    language=language,
                    contribution_pk=data['Dictionary'][l.lg_id],
                    parameter_pk=data['Entry'][entry_id])
                synsets.add(id_)

            trans1 = list(split_counterparts(l.data_1))
            trans2 = None if empty.match(l.data_2) else list(split_counterparts(l.data_2))

            if trans2:
                if len(trans2) != len(trans1):
                    if language.id != '238':
                        misaligned.append((l.chap_id, l.entry_id, l.lg_id))
                        #print('===', language.id, language.name)
                        #print(l.data_1)
                        #print(l.data_2)
                    #assert language.id == '238'  # Rapa Nui has problems!
                    trans2 = None

            for i, word in enumerate(trans1):
                v = models.Counterpart(
                    id=id_ + '-' + str(i + 1 + len(vs.values)),
                    name=word,
                    description=desc.get('1'),
                    valueset=vs)
                words[word].append((v, trans2[i] if trans2 else None))

        for i, form in enumerate(words.keys()):
            # Since we identify words based on their string representation, we have to
            # make sure a word has the same alternative transcription for all meanings.
            if language.id == '238':
                alt_names = []
            else:
                alt_names = set(norm(w[1] or '', desc.get('2'), language.id)
                                for w in words[form])
            alt_names = nfilter(alt_names)
            try:
                assert len(alt_names) <= 1
            except AssertionError:
                print('---', language.id, language.name)
                print(alt_names)
            word = models.Word(
                id='%s-%s' % (language.id, i + 1),
                name=form,
                description=desc.get('1'),
                language=language,
                alt_name=', '.join(alt_names) if alt_names else None,
                alt_description=desc.get('2')
            )
            for v, _ in words[form]:
                word.counterparts.append(v)
            DBSession.add(word)

        DBSession.flush()

    with dsv.UnicodeWriter(args.data_file('misaligned.csv')) as fp:
        fp.writerows(misaligned)
Example #12
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    data = Data()
    concept_list = Concepticon(CONCEPTICON_REPOS).conceptlist('Key-2016-1310')

    def concepticon_id(ids_code):
        for item in concept_list:
            if item['IDS_ID'] == ids_code:
                return int(item['CONCEPTICON_ID']) if item['CONCEPTICON_ID'] else None

    def read(table):
        fname = args.data_file(table + '.all.csv')
        if not fname.exists():
            fname = args.data_file(table + '.csv')
        return list(dsv.reader(fname, namedtuples=True))

    dataset = common.Dataset(
        id=ids.__name__,
        name="IDS",
        description="The Intercontinental Dictionary Series",
        published=date(2015, 5, 25),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name':
                'Creative Commons Attribution 4.0 International License',
        },
        domain='ids.clld.org')

    DBSession.add(dataset)

    for rec in Database.from_file(args.data_file('sources.bib'), lowercase=True):
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()

    data_desc = defaultdict(dict)
    for l in read('x_lg_data'):
        data_desc[l.lg_id][l.map_ids_data] = l.header

    # language lang
    iso_codes = {l.id: l.sil_code for l in read('sil_lang')}
    iso_codes = {l.lg_id: iso_codes[l.sil_id] for l in read('x_lg_sil')}
    languages = []

    exclude = []
    for l in read('lang'):
        if l.status == '1':
            exclude.append(l.lg_id)
            continue
        lang_changed = LANGS.get(int(l.lg_id), {})
        code = lang_changed.get('glotto') or lang_changed.get('iso') or iso_codes.get(l.lg_id)
        lang = data.add(models.IdsLanguage, l.lg_id, id=l.lg_id, name=lang_changed.get('name', l.lg_name))
        if code:
            languages.append((code, lang))
        data.add(
            models.Dictionary, l.lg_id,
            id=l.lg_id, name=l.lg_name,
            language=lang,
            default_representation=data_desc[l.lg_id].get('1'),
            alt_representation=data_desc[l.lg_id].get('2'),
            jsondata=dict(status=l.status, date=l.date))

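    # Map ISO 639-3 codes to Glottocodes by walking the Glottolog languoid tree.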
    iso2glotto = {}
    for l in walk_tree(tree=languoids_path('tree', GLOTTOLOG_REPOS)):
        if l.iso:
            iso2glotto[l.iso] = l.id

    load_families(
        Data(),
        [(iso2glotto.get(c, c), l) for c, l in languages],
        glottolog=Glottolog(GLOTTOLOG_REPOS),
        isolates_icon='tcccccc')

    contributors = defaultdict(list)
    sources = defaultdict(list)
    for l in read('lang_compilers'):
        if l.lg_id in exclude:
            continue
        if l.name == "BIBIKO":
            continue
        #name	lg_id	what_did_id
        if int(l.what_did_id) in models.ROLES:
            contributors[slug(l.name)].append((l.name, int(l.what_did_id), l.lg_id))
        else:
            assert int(l.what_did_id) in [4, 395]
            sources[l.name].append(l.lg_id)

    for s, roles in contributors.items():
        name = roles[0][0]
        c = data.add(common.Contributor, s, id=s, name=name)
        if name == 'Mary Ritchie Key':
            c.address = 'University of California, Irvine'
        for lg, specs in groupby(sorted(roles, key=lambda r: r[2]), key=lambda r: r[2]):
            sroles = sorted(
                [s[1] for s in specs],
                reverse=True,
                key=lambda what: what + 2 if what == 2 else what)
            what = sroles[0]
            DBSession.add(common.ContributionContributor(
                contribution=data['Dictionary'][lg],
                contributor=c,
                ord=what,
                primary=what == 2))

    data.add(
        common.Contributor, 'bernardcomrie',
        id='bernardcomrie',
        name="Bernard Comrie",
        address="University of California, Santa Barbara")

    for i, editor in enumerate(['maryritchiekey', 'bernardcomrie']):
        common.Editor(dataset=dataset, contributor=data['Contributor'][editor], ord=i + 1)

    #for i, name in enumerate(sorted(sources.keys())):
    #    c = data.add(common.Source, name, id=str(i + 1), name=name, description=name)

    DBSession.flush()
    for name, lgs in sources.items():
        for _src in name.split(';'):
            src = data['Source'].get(_src.strip())
            if not src:
                print('-- missing source --', _src)
                raise ValueError
            for lg in lgs:
                if lg in exclude:
                    continue
                assert lg in data['Dictionary']
                DBSession.add(common.ContributionReference(
                    contribution_pk=data['Dictionary'][lg].pk, source_pk=src.pk))

    altnames = {}
    for i, l in enumerate(read('alt_names')):
        if l.name in altnames:
            identifier = altnames[l.name]
        else:
            identifier = data.add(
                common.Identifier, l.name,
                id='name-%s' % i, type='name', name=l.name, description='IDS')
            altnames[l.name] = identifier
        if l.lg_id not in exclude and l.name != data['IdsLanguage'][l.lg_id].name:
            DBSession.add(common.LanguageIdentifier(
                identifier=identifier,
                language=data['IdsLanguage'][l.lg_id]))

    # parameter chapter/entry
    for l in read('chapter'):
        data.add(models.Chapter, l.chap_id, id=l.chap_id, name=l.chap_title)

    entries = {}
    for l in read('entry'):
        id_ = '%s-%s' % (l.chap_id, l.entry_id)
        name = l.trans_english
        if name in entries:
            entries[name] += 1
            name = name + ' (%s)' % entries[name]
        else:
            entries[name] = 1
        kw = {
            'id': id_,
            'name': name,
            'concepticon_id': concepticon_id(id_),
            'chapter': data['Chapter'][l.chap_id]}
        for ll in 'french russian spanish portugese'.split():
            kw[ll] = getattr(l, 'trans_' + ll)
        data.add(models.Entry, id_, sub_code=l.entry_id, **kw)

    misaligned = []

    DBSession.flush()
    for entity in 'IdsLanguage Entry Chapter Dictionary'.split():
        for k in list(data[entity].keys()):
            data[entity][k] = data[entity][k].pk

    synsets = set()
    counterparts = set()
    problems = defaultdict(list)

    for lg_id, entries in groupby(
            sorted(read('ids'), key=lambda t: t.lg_id), lambda k: k.lg_id):
        if lg_id in exclude or not lg_id:
            continue

        # keep the memory footprint reasonable
        transaction.commit()
        transaction.begin()

        language = common.Language.get(data['IdsLanguage'][lg_id])
        desc = data_desc.get(lg_id, {})
        words = defaultdict(list)
        for l in entries:
            if empty.match(l.data_1):
                continue

            entry_id = '%s-%s' % (l.chap_id, l.entry_id)
            if entry_id not in data['Entry']:
                continue
                #data.add(
                #    models.Entry, entry_id,
                #    id=entry_id,
                #    name=entry_id,
                #    concepticon_id=concepticon_id(entry_id),
                #    sub_code=l.entry_id,
                #    chapter_pk=data['Chapter'][l.chap_id])
                #DBSession.flush()
                #data['Entry'][entry_id] = data['Entry'][entry_id].pk

            id_ = '%s-%s' % (entry_id, l.lg_id)
            if id_ in synsets:
                vs = models.Synset.get(id_)
            else:
                vs = models.Synset(
                    id=id_,
                    comment=get_string(l.comment or ''),
                    alt_representation=get_string(l.data_2),
                    language=language,
                    contribution_pk=data['Dictionary'][l.lg_id],
                    parameter_pk=data['Entry'][entry_id])
                synsets.add(id_)

            trans1 = list(split_counterparts(l.data_1))
            trans2 = None if empty.match(l.data_2) else list(split_counterparts(l.data_2))

            if trans2:
                if len(trans2) != len(trans1):
                    if language.id != '238':
                        misaligned.append((l.chap_id, l.entry_id, l.lg_id))
                        #print('===', language.id, language.name)
                        #print(l.data_1)
                        #print(l.data_2)
                    # 83 cases of misaligned transcriptions
                    trans2 = None

            for i, word in enumerate(trans1):
                cid = id_ + '-' + str(i + 1 + len(vs.values))
                if cid not in counterparts:
                    v = models.Counterpart(
                        id=cid,
                        name=word,
                        description=desc.get('1'),
                        valueset=vs)
                    words[word].append((v, trans2[i] if trans2 else None))
                    counterparts.add(cid)
                else:
                    print(cid)
                    #12 - 420 - 811 - 3
                    #5 - 390 - 818 - 3
                    #2 - 930 - 819 - 3
                    #2 - 930 - 819 - 3
                    #3 - 120 - 819 - 3
                    #10 - 140 - 822 - 3
                    #9 - 160 - 825 - 3
                    #2 - 430 - 829 - 4

        for i, form in enumerate(words.keys()):
            # Since we identify words based on their string representation, we have to
            # make sure a word has the same alternative transcription for all meanings.
            if language.id == '238':
                alt_names = []
            else:
                alt_names = set(norm(w[1] or '', desc.get('2'), language.id)
                                for w in words[form])
            alt_names = nfilter(alt_names)
            try:
                assert len(alt_names) <= 1
            except AssertionError:
                problems[(language.id, language.name)].append(alt_names)
            word = models.Word(
                id='%s-%s' % (language.id, i + 1),
                name=form,
                description=desc.get('1'),
                language=language,
                alt_name=', '.join(alt_names) if alt_names else None,
                alt_description=desc.get('2')
            )
            for v, _ in words[form]:
                word.counterparts.append(v)
            DBSession.add(word)

        DBSession.flush()

    with dsv.UnicodeWriter(args.data_file('misaligned.csv')) as fp:
        fp.writerows(misaligned)

    # about 250 cases where alternative transcriptions do not covary across meanings.
    for k, v in problems.items():
        print(k, len(v))
Example #13
def order(self):
    return collkey(func.translate(Value.name, 'ˈ,ː,ˌ', ''))
Example #14
def main(args):
    sources = get_sources(args)
    Index('ducet1', collkey(common.Value.name)).create(DBSession.bind)
    Index('ducet2', collkey(models.Counterpart.phonetic)).create(DBSession.bind)
    data = Data()
    glottocodes, geocoords = {}, defaultdict(lambda: (None, None))
    for k, v in glottocodes_by_isocode(
            'postgresql://robert@/glottolog3',
            cols=['id', 'latitude', 'longitude']).items():
        glottocodes[k] = v[0]
        geocoords[k] = (v[1], v[2])
    geocoords['win'] = (43.50, -88.50)

    dataset = common.Dataset(
        id=csd.__name__,
        name="Comparative Siouan Dictionary",
        description="Comparative Siouan Dictionary",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='csd.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    contrib = common.Contribution(id='csd', name=dataset.name)
    for i, spec in enumerate([
        ('Robert L. Rankin', True),
        ('Richard T. Carter', True),
        ('A. Wesley Jones', True),
        ('John E. Koontz', True),
        ('David S. Rood', True),
        ('Iren Hartmann', True),
    ]):
        name, primary = spec
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i, primary=primary))

    d = Dictionary(
        args.data_file(TXT),
        entry_impl=CsdEntry,
        entry_sep='\\lx ')
    d.entries = list(filter(lambda r: r.get('lx'), d.entries))[1:]
    print(len(d.entries))

    for i, v in enumerate(_LANGUAGES):
        l = data.add(
            models.Languoid, v[0],
            id=v[0],
            name=v[1],
            ord=i,
            color=v[4].lower(),
            proto=v[0].startswith('p') and len(v[0]) == 3,
            latitude=geocoords[v[2]][0],
            longitude=geocoords[v[2]][1],
            parent=data['Languoid'].get(v[5]))
        if v[2]:
            add_language_codes(data, l, v[2], glottocodes=glottocodes)
        if l.id == 'pn':
            l.latitude, l.longitude = (42.75, -98.03)
        if l.id == 'op':
            l.latitude, l.longitude = (43.5, -96.6)
        if l.id == 'mo':
            l.latitude, l.longitude = (40.05, -95.52)

    pnames = set()

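    # Collect the distinct non-empty values for a marker; except for 'oo' and
    # 'or', a marker is expected to occur at most once per entry.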
    def _get(d, marker):
        _l = set(nfilter(d.get(marker, [])))
        if _l:
            _l = list(_l)
            if marker not in ['oo', 'or']:
                assert len(_l) == 1
                _l = _l[0]
            return _l

    def add_counterpart(d, vs, id,
                        phonetic,  # forms
                        cognate,  # oo
                        me, cm, so, org):
        assert phonetic or cognate

        if not cognate:
            if vs.language.proto:
                cognate = phonetic
                phonetic = None
            else:
                cognate = '[%s]' % phonetic
        m = models.Counterpart(
            id=id,
            name=cognate,
            phonetic=phonetic,
            description=me or '[%s]' % vs.parameter.name,
            comment=cm,
            original_entry=org,
            other_reconstructions='; '.join(_get(d, 'or') or []) if vs.language.id == 'psi' else None,
            valueset=vs)
        if so:
            for sid in nfilter([s.strip() for s in SEP_PATTERN.split(so or '')]):
                match = SID_PATTERN.match(sid)
                if not match:
                    continue

                name = sid
                sid = normalize_sid(match.group('key'))
                source = data['Source'].get(sid)
                if not source:
                    if sid in sources:
                        s = sources[sid]
                        source = data.add(
                            common.Source, sid,
                            id=sid,
                            name=s['Name'].upper() if len(s['Name']) <= 3 else s['Name'],
                            description=s.get('Title', s['citation']),
                            author=s.get('Author'),
                            title=s.get('Title'),
                            year=s.get('Year'),
                        )
                    else:
                        source = data.add(
                            common.Source, sid,
                            id=sid,
                            name=name.upper() if len(name) <= 3 else name)
                m.references.append(models.ValueReference(
                    source=source, description=match.group('pages')))

    for i, entry in enumerate(sorted(d.entries, key=lambda d: d.get('lx'), reverse=True)):
        lemma = entry.get('lx')
        if not lemma or not lemma.strip():
            continue
        pname = lemma
        j = 1
        while pname in pnames:
            pname = '%s (%s)' % (lemma, j)
            j += 1
        pnames.add(pname)
        contrib = data.add(
            common.Contribution, pname, id=str(i + 1), name='Entry "%s"' % pname)
        meaning = data.add(
            models.Entry, pname,
            id=str(i + 1),
            name=pname,
            contribution=contrib,
            description=entry.get('com'),
            psi_reconstruction_with_root_extension_code=entry.get('lxcm'),
            sd=normalize_comma_separated(entry.get('sd'), SD, lower=True),
            ps=normalize_comma_separated(entry.get('ps'), PS),
            othlgs='\n---\n'.join(entry.getall('othlgs')))
        if meaning.description:
            meaning.description = meaning.description.replace('.\n', '.\n\n')

        for lid, words in entry.get_words().items():
            vsid = '%s-%s' % (lid, meaning.id)
            vs = data.add(
                common.ValueSet, vsid,
                id=vsid,
                parameter=meaning,
                contribution=contrib,
                language=data['Languoid'][lid])

            for j, d in enumerate(words):
                looped = False

                for k, (oo, me, so, cm, org) in enumerate(izip_longest(
                        *[d.get(_m, []) for _m in 'oo me so cm _org'.split()])):
                    if not oo:
                        continue
                    looped = True
                    add_counterpart(d,
                        vs,
                        '%s-%s-%s' % (vsid, j + 1, k + 1),
                        d['forms'],
                        oo,
                        me,
                        cm,
                        so,
                        org)

                if not looped:  # not oo
                    if not d['forms']:
                        print('--->', d)
                        continue
                    add_counterpart(d,
                        vs,
                        '%s-%s-%s' % (vsid, j + 1, 1),
                        d['forms'],
                        '; '.join(_get(d, 'oo') or []),
                        _get(d, 'me'),
                        _get(d, 'cm'),
                        _get(d, 'so'),
                        _get(d, '_org'))