Пример #1
0
def add_sources(sources_file_path, session):
    """
    Creates and adds to the given SQLAlchemy session the common.Source model
    instances that comprise the project's references. Expects the path to a
    bibtex file as its first argument.
    Returns a dict containing the added model instances with the bibtex IDs
    being the keys.
    Helper for the main function.
    """
    sources = {}

    for record in bibtex.Database.from_file(sources_file_path, encoding='utf-8'):
        # First occurrence of a bibtex ID wins; later duplicates are skipped.
        if record.id not in sources:
            source = bibtex2source(record)
            sources[record.id] = source
            session.add(source)

    # Flush so the new rows get primary keys before the caller uses them.
    session.flush()

    return sources
Пример #2
0
def main(args):
    """
    Populate the clld database for the LSI web app from the CLDF dataset.

    Creates the Dataset, Editors and a single 'cldf' Contribution, then
    loads languages, bibliography, concepts and forms, wiring ValueSets to
    their sources, computing per-variety segment inventories via CLTS and
    attaching Glottolog family information.

    :param args: CLI argument namespace; must provide ``cldf`` (a CLDF
        dataset reader) and ``glottolog`` (path to a Glottolog clone).
    """
    assert args.glottolog, 'The --glottolog option is required!'

    # Local clone of cldf-clts/clts, used below to map segments to BIPA sounds.
    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts')
    data = Data()
    ds = data.add(
        common.Dataset,
        lsi.__name__,
        id=lsi.__name__,
        name=
        'The Comparative Vocabularies of the "Linguistic Survey of India" Online',
        domain='lsi.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        },
    )

    # Editors are keyed by the slug of their last name, ordered as listed.
    for i, name in enumerate(
        ['Taraka Rama', 'Robert Forkel', 'Johann-Mattis List']):
        common.Editor(dataset=ds,
                      ord=i,
                      contributor=common.Contributor(id=slug(
                          HumanName(name).last),
                                                     name=name))

    # One contribution representing the whole CLDF dataset.
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'glottocode',
                          'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            order=int(lang['Order']),
            number=lang['NumberInSource'],
            family_in_source=lang['FamilyInSource'],
        )

    # Bibliography records from the dataset's BibTeX file.
    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    # (valueset-key, source-id) -> list of page specs; turned into
    # ValueSetReference rows after all forms have been loaded.
    refs = collections.defaultdict(list)

    for param in iteritems(args.cldf, 'ParameterTable', 'id',
                           'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id']),
            description=param['Concepticon_Gloss'],
            concepticon_id=param['concepticonReference'],
            pages=param['PageNumber'],
        )

    # Per-language set of segment strings, resolved to BIPA sounds below.
    inventories = collections.defaultdict(set)
    for form in iteritems(args.cldf, 'FormTable', 'id', 'form',
                          'languageReference', 'parameterReference', 'source'):
        inventories[form['languageReference']] = inventories[
            form['languageReference']].union(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            # One ValueSet per (language, concept) pair, created lazily.
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(
            models.Form,
            form['id'],
            id=form['id'],
            name=form['form'],
            description=''.join(form['Segments']).replace('+', ' '),
            segments=' '.join(form['Segments']),
            valueset=vs,
        )
    # Keep only segments CLTS recognizes (objects exposing a 'name' attribute).
    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(inventory=[(str(c), c.name)
                                                        for c in inv
                                                        if hasattr(c, 'name')])

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
    # Attach Glottolog family info; isolates get the grey icon 'tcccccc'.
    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
Пример #3
0
def main(args):  # pragma: no cover
    """
    Populate the clld database for the CLTS web app from ``args.cldf``.

    Loads bibliography, dataset metadata and editors, then the feature,
    sound, transcription-source and grapheme tables, linking sounds to
    their features and graphemes to per-transcription ValueSets.

    :param args: CLI argument namespace providing a ``cldf`` dataset reader.
    """
    data = Data()
    # Expect a checkout of cldf-clts/clts-data four levels above this file.
    clts_repos = Path(__file__).parent.parent.parent.parent.resolve() / 'clts-data'
    clts_repos = CLTS(clts_repos)
    print(clts_repos.repos)
    version = 'v2.1.0' # assert_release(clts_repos.repos)

    for rec in Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    dataset = common.Dataset(
        id='clts',
        name="CLTS {0}".format(version),
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='clts.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    for i, name in enumerate([
        'Johann-Mattis List',
        'Cormac Anderson',
        'Tiago Tresoldi',
        'Robert Forkel',
    ]):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    for line in args.cldf['data/features.tsv']:
        data.add(
            models.Feature,
            line['ID'],
            id=line['ID'],
            name='{} {}: {}'.format(line['TYPE'], line['FEATURE'], line['VALUE']),
            sound_type=line['TYPE'],
            feature=line['FEATURE'],
            value=line['VALUE'],
        )

    # Sentinel sound for markers that carry no phonetic content.
    DBSession.add(models.SoundSegment(
        id='NA',
        name='<NA>',
        description='<NA>',
        type='marker',
        generated=True,
        unicode='',
        color='#bbbbbb',
    ))
    for line in args.cldf['data/sounds.tsv']:
        s = data.add(
            models.SoundSegment,
            line['ID'],
            id=line['ID'],
            name=line['GRAPHEME'],
            description=line['NAME'],
            type=line['TYPE'],
            generated=line['GENERATED'],
            unicode=' / '.join(line['UNICODE']),
            color=clts_repos.soundclass('color').resolve_sound(line['GRAPHEME']),
        )
        if s.color == '0':
            # '0' means no color class was resolved; fall back to neutral grey.
            s.color = '#bbbbbb'
        assert s.color in LEGEND
    DBSession.flush()

    # Link each sound to its features, deduplicating (sound, feature) pairs.
    seen = set()
    for line in args.cldf['data/sounds.tsv']:
        for fid in line['FEATURES']:
            spk, fpk = data['SoundSegment'][line['ID']].pk, data['Feature'][fid].pk
            if (spk, fpk) not in seen:
                DBSession.add(models.SoundSegmentFeature(soundsegment_pk=spk, feature_pk=fpk))
                seen.add((spk, fpk))

    english = data.add(
        common.Language, 'eng',
        id='eng',
        name='English')

    # Transcription data/systems; REFS ties them to bibliography records.
    for line in args.cldf['sources/index.tsv']:
        c = data.add(
            models.Transcription,
            line['NAME'],
            id=line['NAME'],
            name=line['NAME'],
            description=line['DESCRIPTION'].replace(':bib:', '/sources/'),
            datatype=getattr(models.Datatype, line['TYPE'])
        )
        for ref in line.get('REFS', []):
            common.ContributionReference(source=data['Source'][ref], contribution=c)

    sound_url_template = args.cldf['data/graphemes.tsv', 'SOUND'].valueUrl
    image_url_template = args.cldf['data/graphemes.tsv', 'IMAGE'].valueUrl

    for line in args.cldf['data/graphemes.tsv']:
        key = line['DATASET'] + ':' + line['NAME'] + ':' + line['GRAPHEME']
        if key not in data['Grapheme']:
            sound_id = line['NAME'].replace(' ', '_')
            vs = data['ValueSet'].get((line['DATASET'], line['NAME']))
            if not vs:
                try:
                    vs = data.add(
                        common.ValueSet,
                        (line['DATASET'], line['NAME']),
                        id=key,
                        description=line['NAME'],
                        language=english,
                        contribution=data['Transcription'][line['DATASET']],
                        parameter=data['SoundSegment'][sound_id]
                    )
                except Exception:
                    # Narrowed from a bare `except:` so SystemExit and
                    # KeyboardInterrupt are not intercepted; print the
                    # offending row for debugging, then re-raise.
                    print(line)
                    raise
            data.add(
                models.Grapheme,
                key,
                id=key,
                name=line['GRAPHEME'],
                description=line['NAME'],
                url=line['URL'].unsplit() if line['URL'] else None,
                audio=sound_url_template.expand(line) if line['SOUND'] else None,
                image=image_url_template.expand(line) if line['IMAGE'] else None,
                valueset=vs
            )
Пример #4
0
def main(args):  # pragma: no cover
    """
    Populate the clld database for the TPPSR web app from ``args.cldf``.

    Creates dataset metadata and editors, loads varieties (colored by
    canton), bibliography, concepts, forms (with scan URLs) and examples,
    then resolves per-variety segment inventories via CLTS and ValueSet
    source references.

    :param args: CLI argument namespace providing a ``cldf`` dataset reader.
    """
    data = Data()
    # Local clone of cldf-clts/clts, used to resolve segments to BIPA sounds.
    clts = CLTS(input('Path to cldf-clts/clts:') or '../../cldf-clts/clts')
    ds = data.add(
        common.Dataset,
        tppsr.__name__,
        id=tppsr.__name__,
        name='Tableaux phonétiques des patois suisses romands Online',
        domain='tppsr.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'},
    )
    for i, name in enumerate(['Hans Geisler', 'Robert Forkel', 'Johann-Mattis List']):
        common.Editor(
            dataset=ds,
            ord=i,
            contributor=common.Contributor(id=slug(HumanName(name).last), name=name)
        )

    # One contribution representing the whole CLDF dataset.
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['Number'],
            name=lang['name'],
            description=lang['FullName'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            canton=lang['Canton'],
            group=lang['DialectGroup'],
            recorded=lang['DateOfRecording'],
            population=int(lang['Population']) if lang['Population'] else None,
            speaker_age=int(lang['SpeakerAge']) if lang['SpeakerAge'] else None,
            speaker_proficiency=lang['SpeakerProficiency'],
            speaker_language_use=lang['SpeakerLanguageUse'],
            speaker_gender=lang['SpeakerGender'],
            investigators=lang['Investigators'],
        )
    # Assign one color per canton so map markers group visually.
    colors = qualitative_colors(len(set(l.canton for l in data['Variety'].values())), set='tol')
    for i, (_, langs) in enumerate(itertools.groupby(
        sorted(data['Variety'].values(), key=lambda l: l.canton),  # groupby needs sorted input
        lambda l: l.canton,
    )):
        for lang in langs:
            lang.update_jsondata(color=colors[i])

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    # (valueset-key, source-id) -> list of page specs, resolved after the forms.
    refs = collections.defaultdict(list)
    for param in iteritems(args.cldf, 'ParameterTable', 'id', 'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['Number'],
            number=int(param['Number']),
            name='{} [{}]'.format(param['name'], param['Number']),
            latin_gloss=param['Latin_Gloss'],
            french_gloss=param['French_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
            concepticon_concept_id=param['id'].split('_')[0],
        )

    # Per-language set of segment strings, resolved to BIPA sounds below.
    inventories = collections.defaultdict(set)
    scan_url_template = args.cldf['FormTable', 'Scan'].valueUrl
    for form in iteritems(args.cldf, 'FormTable', 'id', 'value', 'form', 'languageReference', 'parameterReference', 'source'):
        if not form['form']:
            continue
        inventories[form['languageReference']] = inventories[form['languageReference']].union(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            # One ValueSet per (language, concept) pair, created lazily.
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        # Return value intentionally discarded (was bound to an unused `f`);
        # data.add registers the Form under form['id'] for later lookup.
        data.add(
            models.Form,
            form['id'],  # Gauchat-1925-480-1_
            id=form['id'],
            name=form['form'].replace('+', ' '),
            description=form['value'],
            segments=' '.join(form['Segments']),
            valueset=vs,
            scan=scan_url_template.expand(**form),
            prosodic_structure=form['ProsodicStructure'],
        )

    for example in args.cldf['ExampleTable']:
        sentence = models.Phrase(
            id=example['ID'],
            language=data['Variety'][example['Language_ID']],
            name=example['Primary_Text'],
            description=example['Translated_Text'],
            original_script=example['Alt_Transcription'],
        )
        for cid in example['Concept_ID']:
            DBSession.add(models.ConceptSentence(concept=data['Concept'][cid], sentence=sentence))
        for fid in example['Form_ID']:
            DBSession.add(common.ValueSentence(value=data['Form'][fid], sentence=sentence))

    # Keep only segments CLTS recognizes (objects exposing a 'name' attribute).
    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv if hasattr(c, 'name')])

    for (vsid, sid), pages in refs.items():
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vsid],
            source=data['Source'][sid],
            description='; '.join(nfilter(pages))
        ))
Пример #5
0
def main(args):  # pragma: no cover
    """
    Populate the clld database for Vanuatu Voices from ``args.cldf``.

    Loads dataset metadata, contributors and editors, one wordlist
    contribution per language, bibliography, concepts and forms (with
    audio), then resolves ValueSet source references and per-variety
    segment inventories via CLTS.

    :param args: CLI argument namespace providing a ``cldf`` dataset reader.
    """
    # Renamed from `license` to avoid shadowing the builtin of the same name.
    ds_license = licenses.find(args.cldf.properties['dc:license'])
    assert ds_license and ds_license.id.startswith('CC-')
    # Local clone of cldf-clts/clts, used to resolve segments to BIPA sounds.
    clts = CLTS(
        input('Path to cldf-clts/clts:') or '../../cldf-clts/clts-data')
    data = Data()
    ds = data.add(
        common.Dataset,
        vanuatuvoices.__name__,
        id=vanuatuvoices.__name__,
        name='Vanuatu Voices',
        domain='vanuatuvoices.clld.org',
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license=ds_license.url,
        jsondata={
            'license_icon':
            '{}.png'.format('-'.join(
                [p.lower() for p in ds_license.id.split('-')[:-1]])),
            'license_name':
            ds_license.name
        },
    )

    form2audio = audioutil.form2audio(args.cldf, 'audio/mpeg')

    # Contributors from the lexibank dataset metadata; a portrait image is
    # looked up in the app's static directory by contributor id.
    r = get_dataset('vanuatuvoices', ep='lexibank.dataset')
    authors, _ = r.get_creators_and_contributors()
    for author in authors:  # enumerate index was unused; dropped
        cid = slug(HumanName(author['name']).last)
        img = pathlib.Path(
            vanuatuvoices.__file__).parent / 'static' / '{}.jpg'.format(cid)
        data.add(
            common.Contributor,
            cid,
            id=cid,
            name=author['name'],
            description=author.get('description'),
            jsondata=dict(img=img.name if img.exists() else None),
        )
    data.add(
        common.Contributor,
        'forkel',
        id='forkel',
        name='Robert Forkel',
        description='Data curation and website implementation',
        jsondata=dict(img=None),
    )
    # `idx` instead of `ord` to avoid shadowing the builtin.
    for idx, cid in enumerate(['walworth', 'forkel', 'gray']):
        DBSession.add(
            common.Editor(ord=idx,
                          dataset=ds,
                          contributor=data['Contributor'][cid]))

    # language-id -> contributor-id -> list of roles played for that wordlist.
    contribs = collections.defaultdict(lambda: collections.defaultdict(list))
    for c in args.cldf.iter_rows('contributions.csv'):
        for role in ['phonetic_transcriptions', 'recording', 'sound_editing']:
            for name in c[role].split(' and '):
                if name:
                    cid = slug(HumanName(name).last)
                    contribs[c['Language_ID']][cid].append(role)

    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode',
                                    'name', 'latitude', 'longitude'):
        contrib = data.add(
            common.Contribution,
            lang['id'],
            id=lang['id'],
            name='Wordlist for {}'.format(lang['name']),
        )
        if lang['id'] in contribs:
            for cid, roles in contribs[lang['id']].items():
                DBSession.add(
                    common.ContributionContributor(
                        contribution=contrib,
                        contributor=data['Contributor'][cid],
                        jsondata=dict(roles=roles),
                    ))
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            contribution=contrib,
            island=lang['Island'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    # (valueset-key, source-id) -> list of page specs, resolved after the forms.
    refs = collections.defaultdict(list)

    for param in args.cldf.iter_rows('ParameterTable', 'id',
                                     'concepticonReference', 'name'):
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            description=param['Bislama_Gloss'],
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
        )
    # Per-language segment frequency counts, resolved to BIPA sounds below.
    inventories = collections.defaultdict(collections.Counter)
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference',
                                    'source'):
        inventories[form['languageReference']].update(form['Segments'])
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            # One ValueSet per (language, concept) pair, created lazily.
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=data['Contribution'][form['languageReference']],
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        data.add(Counterpart,
                 form['id'],
                 id=form['id'],
                 name=form['form'],
                 valueset=vs,
                 audio=form2audio.get(form['id']))

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))

    # Keep only segments CLTS recognizes (objects with a truthy 'name').
    for lid, inv in inventories.items():
        inv = [clts.bipa[c] for c in inv]
        data['Variety'][lid].update_jsondata(
            inventory=[(str(c), c.name) for c in inv
                       if getattr(c, 'name', None)])
Пример #6
0
def main(args):  # pragma: no cover
    data = Data()

    print("Setting up dataset…")
    dataset = common.Dataset(
        id=cariban.__name__,
        domain="cariban.clld.org",
        name="Comparative Cariban Database",
        description="Comparative Cariban Database",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_url="https://www.eva.mpg.de",
        publisher_place="Leipzig",
        license="https://creativecommons.org/licenses/by/4.0/",
        contact="*****@*****.**",
        jsondata={'function_paradigms': []},
    )

    fps = []
    for morph_func in args.cldf["ValueTable"]:
        for function in morph_func["Function"]:
            for cons in morph_func["Construction"]:
                fps.append({
                    'Function': function,
                    'Construction': cons,
                    'Morpheme': morph_func['Morpheme']})
    dataset.update_jsondata(function_paradigms=fps)

    DBSession.add(dataset)
    DBSession.flush()

    print("Adding contributors…")
    c = common.Contributor(id="fm",name="Florian Matter", email="*****@*****.**", url="https://florianmatter.gitlab.io/")
    dataset.editors.append(common.Editor(contributor=c, ord=1, primary=True))

    print("Adding languages…")
    dialect_mapping = {}
    lang_shorthands = {}
    glottocodes = {}
    lang_ids = {}
    for lang in args.cldf["LanguageTable"]:
        if lang["Sampled"] == "y":
            language = data.add(
                common.Language,
                lang["ID"],
                id=lang["ID"],
                name=lang["Name"],
                latitude=float(lang["Latitude"]) if lang["Latitude"] is not None else None,
                longitude=float(lang["Longitude"]) if lang["Longitude"] is not None else None,
                jsondata={'Shorthand': lang['Shorthand'], 'Glottocode': lang['Glottocode']},
            )
            add_language_codes(data, language, isocode=lang["ISO"], glottocode = lang["Glottocode"])
        if lang["Dialect_Of"] not in [None, "y"]:
            dialect_mapping[lang["ID"]] = lang["Dialect_Of"]
        lang_shorthands[lang["Shorthand"]] = {"ID": lang["ID"], "Name": lang["Name"]}
        glottocodes[lang["Glottocode"]] = {"ID": lang["ID"], "Name": lang["Name"], "Shorthand": lang["Shorthand"]}
        lang_ids[lang["ID"]] = {"Glottocode": lang["Glottocode"], "Name": lang["Name"], "Shorthand": lang["Shorthand"]}

    def get_lang_id(key):
        if key in lang_shorthands:
            lang_id = lang_shorthands[key]["ID"]
        elif key in glottocodes:
            lang_id = glottocodes[key]["ID"]
        elif key in lang_ids:
            lang_id = key
        else:
            print("Could not identify language %s" % key)
            return None
        if lang_id in dialect_mapping:
            lang_id = dialect_mapping[lang_id]
        return lang_id

    def get_key_and_page(source_string):
        if len(source_string.split("[")) > 1:
            bib_key = source_string.split("[")[0]
            pages = source_string.split("[")[1].split("]")[0]
        else:
            bib_key = source_string
            pages = ""
        return bib_key, pages

    print("Adding sources…")
    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    
    print("Adding language sources…")
    DBSession.flush()
    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        if "keywords" in rec:
            for keyword in rec["keywords"].split(","):
                if keyword in lang_shorthands:
                    lang_id = get_lang_id(keyword.strip(" "))
                    if lang_id in data["Language"]:
                        data.add(common.LanguageSource,
                        rec.id+lang_id,
                        language_pk=data["Language"][lang_id].pk,
                        source_pk=data["Source"][rec.id].pk
                        )
        
    data.add(
        common.Source,
        "pc",
        id="pc",
        name="Personal communication",
        description="Placeholder for data obtained from personal communication.",
        bibtex_type=bibtex.EntryType.misc
    )

#     print("Adding glossing abbreviations…")
#     length = len(pynterlinear.get_all_abbrevs().keys())
#     for i, (key, name) in enumerate(pynterlinear.get_all_abbrevs().items()):
#         print("%s/%s" % (i+1, length), end="\r")
#         DBSession.add(common.GlossAbbreviation(id=key, name=name))
#     print("")
#
    print("Adding examples…")
    gloss_replacements = {
        "S_A_": "Sa",
        "S_P_": "Sp"
    }
    def clldify_glosses(gloss_line):
        for orig, new in gloss_replacements.items():
            gloss_line = gloss_line.replace(orig,new)
        gloss_line = re.sub(r"(\d)([A-Z])", r"\1.\2", gloss_line)
        return gloss_line

    for ex in args.cldf["ExampleTable"]:
        lang_id = get_lang_id(ex["Language_ID"])
        new_ex = data.add(common.Sentence,
            ex["ID"],
            id=ex["ID"],
            name=ex["Name"],
            description=ex["Translated_Text"],
            analyzed="\t".join(ex["Analyzed_Word"]),
            gloss=clldify_glosses("\t".join(ex["Gloss"])),
            language=data["Language"][lang_id],
            comment=ex["Comment"],
            markup_gloss="\t".join(ex["Morpheme_IDs"])
        )
        
        if ex["Source"]:
            bib_key, pages = get_key_and_page(ex["Source"])
            if bib_key in data["Source"]:
                source = data["Source"][bib_key]
                DBSession.add(common.SentenceReference(
                    sentence=new_ex,
                    source=source,
                    key=source.id,
                    description=pages.replace("--","–"))
                )

    def add_morpheme_reference(morpheme, source_string):
        bib_key, pages = get_key_and_page(source_string)
        if bib_key in data["Source"]:
            source = data["Source"][bib_key]
            DBSession.add(models.MorphemeReference(
                morpheme=morpheme,
                source=source,
                key=source.id,
                description=pages.replace("--","–")
                )
            )

    print("Adding morphemes…")
    for morph in args.cldf["FormTable"]:
        lang_id = get_lang_id(morph["Language_ID"])
        form = util.merge_allomorphs("; ".join(morph["Form"])).split("; ")
        new_morph = data.add(models.Morpheme,
            morph["ID"],
            morpheme_type="grammatical",
            language=data["Language"][lang_id],
            name="/".join(form),
            id=morph["ID"],
        )
        
        if morph["Source"]: add_morpheme_reference(new_morph, morph["Source"][0])

    print("Adding constructions…")
    data.add(models.DeclarativeType, "imp", id="imp", name="imperative")
    data.add(models.DeclarativeType, "decl", id="decl", name="declarative")
    data.add(models.MainClauseVerb, "y", id="y", name="main clause construction")
    data.add(models.MainClauseVerb, "n", id="n", name="subordinate clause construction")

    for cons in args.cldf["ParameterTable"]:
        lang_id = get_lang_id(cons["Language_ID"])
        new_construction = data.add(
            models.Construction,
            cons["ID"],
            id=cons["ID"],
            language=data["Language"][lang_id],
            name=cons["Description"],
            mainclauseverb=data["MainClauseVerb"][cons["MainClauseVerb"]],
        )
        if cons["DeclarativeType"]: new_construction.declarativetype = data["DeclarativeType"][cons["DeclarativeType"]]

    def add_morph_func(morpheme, func_key, construction):
        data.add(models.MorphemeFunction,
            "%s:%s" % (morpheme, function),
            id="%s:%s" % (morpheme, func_key),
            name="MorphemeFunction %s:%s"% (morpheme, func_key),
            unit=data["Morpheme"][morpheme],
            unitparameter=data["Meaning"][function],
            construction=construction
        )

    print("Adding morpheme functions…")
    for morph_func in args.cldf["ValueTable"]:
        for function in morph_func["Function"]:
            func_key = function.replace(".","_")
            if ">" in function or function == "LK" or bool(re.search(r"\d[SP]$", function) or function == "3"):
                meaning_type="inflectional"
            else:
                meaning_type="derivational"
            if function not in data["Meaning"]:
                data.add(models.Meaning,
                    function,
                    id=func_key,
                    name=function,
                    meaning_type=meaning_type
                )
            #Only some morpheme functions are specified as occurring in specific constructions
            if len(morph_func["Construction"]) == 0:
                for morpheme in morph_func["Morpheme"]:
                    add_morph_func(morpheme, func_key, None)
            else:
                for construction in morph_func["Construction"]:
                    if len(morph_func["Morpheme"]) == 1 and morph_func["Morpheme"][0] != "?":
                        for morpheme in morph_func["Morpheme"]:
                            if data["Morpheme"][morpheme].language != data["Construction"][construction].language:
                                print("Warning: the %s Morpheme %s is stated to occur in the %s construction %s!" % (
                                data["Morpheme"][morpheme].language,
                                data["Morpheme"][morpheme],
                                data["Construction"][construction].language,
                                data["Construction"][construction]
                                )
                                )
                            cons_func_key = func_key + ":" + construction
                            add_morph_func(morpheme, cons_func_key, data["Construction"][construction])

    print("Checking examples for illustrated morphemes…")
    proto_languages = ["pc"]
    is_illustrated = {}
    for key, row in data["MorphemeFunction"].items():
        if row.unit.language.id in proto_languages: continue
        is_illustrated["%s:%s" % (row.unit.id, row.unitparameter.id)] = False
    for row in args.cldf["ExampleTable"]:
        for word in row["Morpheme_IDs"]:
            morph_ids = util.split_word(word)
            for unit_value in morph_ids:
                if unit_value in ["X","-","=", "~"]:
                    continue
                unitvaluesentence_key = "{0}-{1}".format(unit_value.replace(".","-"),row["ID"])
                if unitvaluesentence_key in data["UnitValueSentence"].keys():
                    continue
                is_illustrated[unit_value] = True
                morph_id = unit_value.split(":")[0]
                if morph_id not in data["Morpheme"].keys():
                    print("Warning: Example %s illustrates unknown morpheme %s" % (row["ID"], morph_id))
                elif data["Morpheme"][morph_id].language != data["Sentence"][row["ID"]].language:
                    print("Warning: The %s example %s claims to contain the %s morpheme %s." % (
                        data["Sentence"][row["ID"]].language,
                        row["ID"],
                        data["Morpheme"][morph_id].language,
                        data["Morpheme"][morph_id]
                    )
                    )
                if ":" not in unit_value:
                    print("%s in %s contains no defined function!" % (unit_value, row["ID"]))
                function = unit_value.split(":")[1]
                morph_function_id = "%s:%s" % (morph_id, function)
                if morph_function_id not in data["MorphemeFunction"].keys():
                    print("Warning: Example %s tries to illustrate inexistent morpheme function %s!" % (row["ID"], unit_value.replace(".","-")))
                    continue
                data.add(models.UnitValueSentence,
                unitvaluesentence_key,
                sentence=data["Sentence"][row["ID"]],
                unitvalue=data["MorphemeFunction"][morph_function_id],
                )


    # see how many morpheme functions are illustrated with example sentences
    good_ill = [key for key, value in is_illustrated.items() if value]
    not_ill = [key for key, value in is_illustrated.items() if not value]
    not_ill.sort()
    cov = len(good_ill)/len(is_illustrated)*100
    print("Morpheme exemplification coverage is at %s%%. List of unillustrated morphemes saved to unillustrated_morphemes.txt" % str(round(cov, 2)))
    f = open("../unillustrated_morphemes.txt", "w")
    for morph in not_ill:
        f.write(morph+"\n")
    f.close()

    print("Adding cognate sets…")
    for cogset in args.cldf["CognatesetTable"]:
        new_cset = data.add(models.Cognateset,
            cogset["ID"],
            id=cogset["ID"],
            name=cogset["Name"],
            description=cogset["Function"],
            cogset_type="grammatical"
        )
        if cogset["Source"]:
            for source in cogset["Source"]:
                bib_key, pages = get_key_and_page(source)
                if bib_key in data["Source"]:
                    source = data["Source"][bib_key]
                    DBSession.add(models.CognatesetReference(
                        cognateset=new_cset,
                        source=source,
                        key=source.id,
                        description=pages)
                        )

    print("Adding cognates…")
    for morph in args.cldf["FormTable"]:
        for cognate_ID in morph["Cognateset_ID"]:
            DBSession.add(models.Cognate(
                    cognateset=data["Cognateset"][cognate_ID],
                    counterpart=data["Morpheme"][morph["ID"]]
                    )
            )

    print("Adding morpheme comments…")
    for row in args.cldf["FormTable"]:
        data["Morpheme"][row["ID"]].markup_description=util.generate_markup(row["Comment"])

    print("Adding construction descriptions…")
    for cons in args.cldf["ParameterTable"]:
        if cons["Comment"] is None:
            description = ""
        else:
            description = util.generate_markup(cons["Comment"])
        description += "\n" + util.generate_markup(util.transitive_construction_paradigm(cons["ID"]))
        description += util.generate_markup(util.intransitive_construction_paradigm(cons["ID"]))
        data["Construction"][cons["ID"]].markup_description = description


    print("Adding cognate set descriptions…")
    for cogset in args.cldf["CognatesetTable"]:
        data["Cognateset"][cogset["ID"]].markup_description = util.generate_markup(cogset["Description"])
        # if cogset["ID"] == "13pro":
        #     data["Cognateset"][cogset["ID"]].markup_description += util.generate_markup(
        #         util.comparative_function_paradigm(
        #             ["apa_main", "tri_main", "way_main", "mak_main", "kar_main", "hix_main", "wai_main", "ara_main", "ikp_main", "wmr_main", "pan_old", "kax_main"],
        #             "1+3 scenarios",
        #             ["1+3S", "1+3>3", "3>1+3", "2>1+3", "1+3>2"]))

    
    def add_tree_labels(phylo):
        """Rewrite terminal node names of *phylo* to display names.

        For every named leaf node, the ``?``-stripped name is looked up in
        ``lang_ids`` (closure variable; presumably maps language ids to rows
        with a ``"Name"`` column — defined outside this view, confirm) and
        replaced with the display name, using a typographic apostrophe.
        Names containing ``?`` mark nodes whose placement is uncertain; the
        ``?`` is re-appended after renaming.

        Returns the (mutated) tree and the list of uncertain node ids so
        callers can flag the corresponding languages.
        """
        uncertain_nodes = []
        for node in phylo.find_clades():
            # Only named leaf nodes carry language labels.
            if node.name is None or not node.is_terminal():
                continue
            plain_name = node.name.replace("?", "")
            if "?" in node.name:
                uncertain_nodes.append(plain_name)
            if plain_name in lang_ids:
                node.name = lang_ids[plain_name]["Name"].replace("'", "’")
            if plain_name in uncertain_nodes:
                node.name += "?"
        return phylo, uncertain_nodes
        
    print("Adding trees…")
    own_trees = ["matter"]
    tree_path = str(args.cldf.tablegroup._fname.parent / '..' / 'raw')
    newick_files = {}
    for tree in args.cldf["cariban_trees.csv"]:
        if tree["ID"] in own_trees: continue
        newick_files[tree["ID"]] = {
            "orig": tree["ID"]+"_orig.newick",
            "norm": tree["ID"]+"_norm.newick",
            "source": tree["Source"],
            "comment": tree["Comment"],
            "o_comment": tree["Orig_Comment"]
        }
    #adding my own trees separately.
    for my_tree_count, tree_id in enumerate(own_trees):
        my_tree = Phylo.read(tree_path+"/"+"%s.newick" % tree_id, "newick")
        my_tree, uncertain_nodes = add_tree_labels(my_tree)
        
        edited_tree = io.StringIO()
        Phylo.write(my_tree, edited_tree, "newick")
        tree = edited_tree.getvalue().replace(":0.00000","")
        
        my_phylo = Phylogeny(
                tree_id,
                id=tree_id,
                name="Matter (2020)",# % str(my_tree_count+1),
                newick=tree,
                markup_description="My own, conservative, classification."
        )
        
        for l in DBSession.query(common.Language):
            lname = l.name.replace("'", "’")
            if l.id in uncertain_nodes: lname += "?"
            new_label = LanguageTreeLabel(
                language=l,
                treelabel=TreeLabel(
                    id="%s_%s" % (tree_id, l.id),
                    name=lname,
                    phylogeny=my_phylo
                )
            )
              
        DBSession.add(my_phylo)
        
    #adding the other trees
    for tree_id, values in newick_files.items():
        norm_biotree = Phylo.read(tree_path+"/"+values["norm"], "newick")
        orig_biotree = Phylo.read(tree_path+"/"+values["orig"], "newick")
        
        norm_biotree, uncertain_nodes = add_tree_labels(norm_biotree)
            
        edited_tree = io.StringIO()
        Phylo.write(norm_biotree, edited_tree, "newick")
        norm_tree = edited_tree.getvalue().replace(":0.00000","")
        
        edited_tree = io.StringIO()
        Phylo.write(orig_biotree, edited_tree, "newick")
        orig_tree = edited_tree.getvalue().replace(":0.00000","")
        
        norm_phylo = Phylogeny(
                id=tree_id+"_norm",
                name=str(data["Source"][values["source"]]) + " (Normalized)",
                markup_description=util.generate_markup("Source: src:"+values["source"])+
                "<br>This is a normalized version of <a href='/phylogeny/%s_orig'>this original tree</a>." % tree_id +
                util.generate_markup(
                    "<br>Comments: %s" % values["comment"]
                ),
                newick=norm_tree
        )
        
        if values["o_comment"] == None:
            o_comment = ""
        else:
            o_comment = values["o_comment"]
        orig_phylo = Phylogeny(
                id=tree_id+"_orig",
                name=str(data["Source"][values["source"]]) + " (Original)",
                markup_description=util.generate_markup("Source: src:"+values["source"])+
                    "<br>This is a representation of the original classification. A normalized version can be found <a href='/phylogeny/%s_norm'>here</a>." % tree_id +
                    util.generate_markup(
                    "<br>Comments: %s" % values["comment"] +
                    " " + o_comment
                    ),
                newick=orig_tree
        )
        for l in DBSession.query(common.Language):
            lname = l.name.replace("'", "’")
            if l.id in uncertain_nodes: lname += "?"
            new_label = LanguageTreeLabel(
                language=l,
                treelabel=TreeLabel(
                    id="%s_%s" % (tree_id, l.id),
                    name=lname,
                    phylogeny=norm_phylo
                )
            )
        DBSession.add(norm_phylo)
        DBSession.add(orig_phylo)

    print("Adding t-adding verb cognate sets…")
    for t_verb_set in args.cldf["cariban_t_cognates.csv"]:
        cognate_ID = "t"+t_verb_set["ID"]
        rec_t_form = "*[%s]%s" % (t_prefix_form(t_verb_set["Form"]), t_verb_set["Form"])
        t_cogset = data.add(models.Cognateset,
            cognate_ID,
            id=cognate_ID,
            name=rec_t_form,
            description="‘%s’ (*t-adding verb)" % t_verb_set["Parameter_ID"],
            cogset_type="t_adding"
        )
        if t_verb_set["Source"]:
            bib_key = t_verb_set["Source"].split("[")[0]
            if len(t_verb_set["Source"].split("[")) > 1:
                pages = t_verb_set["Source"].split("[")[1].split("]")[0]
            else:
                pages = " "
            if bib_key in data["Source"]:
                source = data["Source"][bib_key]
                DBSession.add(models.CognatesetReference(
                    cognateset=t_cogset,
                    source=source,
                    key=source.id,
                    description=pages)
                    )
    
    print("Adding t-adding verbs…")
    t_langs = {}
    t_verbs = {}
    non_t_adding_lgs = ["ing","mac","kar","wmr","pan"]
    data.add(models.Meaning,
        "t_verb",
        id="t-verb",
        name="t-adding verb",
    )
    for t_verb_entry in args.cldf["cariban_t_verbs.csv"]:
        if t_verb_entry["Language_ID"] == "cari1283": continue
        cognate_ID = "t"+t_verb_entry["Cognateset_ID"]
        lang_id = get_lang_id(t_verb_entry["Language_ID"])
        morph_id = lang_id+"_"+cognate_ID
        if morph_id in data["Morpheme"].keys():
            if morph_id + "_2" in data["Morpheme"].keys():
                morph_id += "_3"
            else:
                morph_id += "_2"
        t_verb = data.add(models.Morpheme,
            morph_id,
            id=morph_id,
            morpheme_type="t_adding",
            name=t_verb_entry["Form"],
            language=data["Language"][lang_id],
        )
        DBSession.add(models.Cognate(
                cognateset=data["Cognateset"][cognate_ID],
                counterpart=t_verb
            )
        )
        if t_verb_entry["t"] == "y":
            t_verb.name = "[%s]%s" % (t_prefix_form(t_verb.name), t_verb.name)
            t_verb.markup_description = util.generate_markup("Shows cogset:t")
        if t_verb_entry["t"] == "?" and lang_id not in non_t_adding_lgs:
            t_verb.name = "[t-?]"+t_verb.name
            t_verb.markup_description = util.generate_markup("It is not known if this verb shows cogset:t")
        if t_verb_entry["t"] == "n":
            t_verb.markup_description = util.generate_markup("Does not show cogset:t")
        if lang_id not in t_langs.keys():
            t_langs[lang_id] = {"y": 0, "n": 0, "?": 0}
        if cognate_ID not in t_verbs.keys():
            t_verbs[cognate_ID] = {"y": 0, "n": 0, "?": 0}
        t_langs[lang_id][t_verb_entry["t"]] += 1
        if lang_id not in non_t_adding_lgs:
            t_verbs[cognate_ID][t_verb_entry["t"]] += 1
        if t_verb_entry["Source"]:
            add_morpheme_reference(t_verb, t_verb_entry["Source"])

        data.add(models.MorphemeFunction,
            "t_"+t_verb_entry["ID"],
            id="t_"+t_verb_entry["ID"],
            name="t-Verb %s" % t_verb_entry["Parameter_ID"],
            unit=t_verb,
            unitparameter=data["Meaning"]["t_verb"],
            construction=None
        )
    for lang, values in t_langs.items():
        data["Language"][lang].update_jsondata(t_values=values)
    for verb, values in t_verbs.items():
        # data["Cognateset"][verb].description += " (%s/%s)" % (str(values["y"]), str(values["n"]+values["y"]+values["?"]))
        data["Cognateset"][verb].markup_description = util.generate_markup("This verb occurs with obj:t- in %s of %s languages which show reflexes of cogset:t." % (str(values["y"]), str(values["n"]+values["y"]+values["?"])))

    print("Adding reconstructed lexemes…")
    proto_forms = {}
    for cogset in args.cldf["cariban_lexical_reconstructions.csv"]:
        proto_forms[cogset["ID"]] = cogset["Form"]

    first_found = []
    for entry in args.cldf["cariban_swadesh_list.csv"]:
        cognateset_ID = entry["Parameter_ID"].replace("/","_")+"-"+entry["Cognateset_ID"]
        if cognateset_ID not in data["Cognateset"]:
            if cognateset_ID in proto_forms:
                form = "*" + proto_forms[cognateset_ID].replace("; ", " / ")
            # else:
            #     form = ""
                data.add(models.Cognateset,
                    cognateset_ID,
                    id=cognateset_ID,
                    name=form,
                    description=cognateset_ID,
                    cogset_type="lexical"
                )
        lang_id = get_lang_id(entry["Language_ID"])
        if lang_id not in data["Language"]: continue
        function = entry["Parameter_ID"].replace(".","_")
        morph_id = entry["Language_ID"] + "_" + function
        if morph_id in first_found: continue
        first_found.append(morph_id)
        if function not in data["Meaning"].keys():
            data.add(models.Meaning,
                function,
                id=function,
                name=function,
                meaning_type="lexical"
            )
        morpheme = data.add(models.Morpheme,
                    morph_id,
                    id=morph_id,
                    morpheme_type="lexical",
                    name=entry["Value"][0],
                    language=data["Language"][lang_id],
                )
        data.add(models.MorphemeFunction,
            "%s:%s" % (morph_id, function),
            id="%s:%s" % (morph_id, function),
            name="MorphemeFunction %s:%s"% (morph_id, function),
            unit=data["Morpheme"][morph_id],
            unitparameter=data["Meaning"][function],
            construction=None
        )
        if entry["Source"]:
            add_morpheme_reference(morpheme, entry["Source"])
        
        if cognateset_ID in proto_forms:
            DBSession.add(models.Cognate(
                    cognateset=data["Cognateset"][cognateset_ID],
                    counterpart=morpheme
                )
            )
Пример #7
0
def main(args):
    """Populate the polyglottaafricana CLLD database from its CLDF dataset.

    Loads the dataset record, the CLDF contribution, languages, sources,
    concepts, forms (as valuesets/values) and valueset references, then
    attaches Glottolog family information to the varieties.
    """
    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()

    # Dataset-level metadata record.
    data.add(
        common.Dataset,
        polyglottaafricana.__name__,
        id=polyglottaafricana.__name__,
        domain='',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License',
        },
    )

    # Single contribution wrapping the whole CLDF dataset.
    cldf_contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    # Languages.
    for row in iteritems(args.cldf, 'LanguageTable', 'id', 'glottocode',
                         'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            row['id'],
            id=row['id'],
            name=row['name'],
            latitude=row['latitude'],
            longitude=row['longitude'],
            glottocode=row['glottocode'],
        )

    # Sources from the dataset's BibTeX file.
    for record in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, record.id, _obj=bibtex2source(record))

    # Collects pages per (valueset, source) pair while iterating forms.
    refs_by_key = collections.defaultdict(list)

    # Concepts (parameters).
    for row in iteritems(args.cldf, 'ParameterTable', 'id',
                         'concepticonReference', 'name'):
        data.add(
            models.Concept,
            row['id'],
            id=row['id'],
            name='{} [{}]'.format(row['name'], row['id']),
        )

    # Forms: one ValueSet per (language, concept), one Value per form.
    for row in iteritems(args.cldf, 'FormTable', 'id', 'form',
                         'languageReference', 'parameterReference', 'source'):
        vs_key = (row['languageReference'], row['parameterReference'])
        valueset = data['ValueSet'].get(vs_key)
        if not valueset:
            valueset = data.add(
                common.ValueSet,
                vs_key,
                id='-'.join(vs_key),
                language=data['Variety'][row['languageReference']],
                parameter=data['Concept'][row['parameterReference']],
                contribution=cldf_contrib,
            )
        for ref in row.get('source', []):
            source_id, pages = Sources.parse(ref)
            refs_by_key[(vs_key, source_id)].append(pages)
        data.add(
            common.Value,
            row['id'],
            id=row['id'],
            name=row['form'],
            valueset=valueset,
        )

    # Flush the accumulated source references.
    for (vs_key, source_id), pages in refs_by_key.items():
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vs_key],
            source=data['Source'][source_id],
            description='; '.join(nfilter(pages)),
        ))

    # Attach Glottolog family info; isolates get a neutral icon.
    load_families(
        Data(),
        [(lg.glottocode, lg) for lg in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
Пример #8
0
def main(args):
    """Populate the Papuan Voices CLLD database from its CLDF dataset.

    Loads the dataset record (with license metadata), the contributors and
    editors, languages, sources, concepts, forms (with linked audio) and
    valueset references, then attaches Glottolog family information.
    """
    cc_license = licenses.find(args.cldf.properties['dc:license'])
    assert cc_license and cc_license.id.startswith('CC-')

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()

    # Icon filename is derived from the license id, e.g. CC-BY-4.0 -> cc-by.png.
    license_icon = '{}.png'.format('-'.join(
        [p.lower() for p in cc_license.id.split('-')[:-1]]))
    ds = data.add(
        common.Dataset,
        papuanvoices.__name__,
        id=papuanvoices.__name__,
        domain='papuanvoices.clld.org',
        name="Papuan Voices",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=cc_license.url,
        jsondata={'license_icon': license_icon, 'license_name': cc_license.name},
    )

    # Single contribution wrapping the whole CLDF dataset.
    cldf_contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    # Editors.
    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    for position, editor_id in enumerate(['gray']):
        data.add(
            common.Editor,
            editor_id,
            dataset=ds,
            contributor=data['Contributor'][editor_id],
            ord=position,
        )

    # Languages.
    for row in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode',
                                   'name', 'latitude', 'longitude'):
        data.add(
            models.Variety,
            row['id'],
            id=row['id'],
            name=row['name'],
            description=row['LongName'],
            latitude=row['latitude'],
            longitude=row['longitude'],
            glottocode=row['glottocode'],
        )

    # Sources from the dataset's BibTeX file.
    for record in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, record.id, _obj=bibtex2source(record))

    # Collects pages per (valueset, source) pair while iterating forms.
    refs_by_key = collections.defaultdict(list)

    # Concepts (parameters) with Concepticon links.
    for row in args.cldf.iter_rows('ParameterTable', 'id',
                                   'concepticonReference', 'name'):
        data.add(
            models.Concept,
            row['id'],
            id=row['id'],
            name='{} [{}]'.format(row['name'], row['id']),
            concepticon_id=row['concepticonReference'],
            concepticon_gloss=row['Concepticon_Gloss'],
        )

    # Forms: one ValueSet per (language, concept), one Counterpart per form,
    # with the matching audio file attached when available.
    audio_by_form = form2audio(args.cldf)
    for row in args.cldf.iter_rows('FormTable', 'id', 'form',
                                   'languageReference', 'parameterReference',
                                   'source'):
        vs_key = (row['languageReference'], row['parameterReference'])
        valueset = data['ValueSet'].get(vs_key)
        if not valueset:
            valueset = data.add(
                common.ValueSet,
                vs_key,
                id='-'.join(vs_key),
                language=data['Variety'][row['languageReference']],
                parameter=data['Concept'][row['parameterReference']],
                contribution=cldf_contrib,
            )
        for ref in row.get('source', []):
            source_id, pages = Sources.parse(ref)
            refs_by_key[(vs_key, source_id)].append(pages)
        data.add(
            Counterpart,
            row['id'],
            id=row['id'],
            name=row['form'],
            valueset=valueset,
            audio=audio_by_form.get(row['id']),
        )

    # Flush the accumulated source references.
    for (vs_key, source_id), pages in refs_by_key.items():
        DBSession.add(common.ValueSetReference(
            valueset=data['ValueSet'][vs_key],
            source=data['Source'][source_id],
            description='; '.join(nfilter(pages)),
        ))

    # Attach Glottolog family info; isolates get a neutral icon.
    load_families(
        Data(),
        [(lg.glottocode, lg) for lg in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )
def add_sources(sources_file_path, session):
    """
    Create and add to *session* the common.Source instances defined in the
    bibtex file at *sources_file_path*.

    Duplicate bibtex IDs are skipped (the first record wins), matching the
    other add_sources variant in this collection.

    Returns a dict mapping bibtex IDs to the added model instances; iterating
    the result still yields the record IDs in file order, so callers that
    looped over the old generator keep working.
    """
    sources = {}
    bibtex_db = bibtex.Database.from_file(sources_file_path, encoding='utf-8')
    for record in bibtex_db:
        if record.id in sources:
            # Skip duplicate bibtex keys instead of adding them twice.
            continue
        source = bibtex2source(record)
        sources[record.id] = source
        session.add(source)
    # Flush unconditionally: the previous generator version only added and
    # flushed when the caller exhausted it, and did nothing if never iterated.
    session.flush()
    return sources
Пример #10
0
def main(args):
    """Populate the GeLaTo CLLD database from its CLDF dataset.

    Loads the dataset record, editors, sources, panels (contributions),
    languoids/samples, measures and (float-valued) measurements into the
    SQLAlchemy session via the clld ``Data`` registry.
    """
    data = Data()

    # Cycle through the predefined map icons: one icon per language family,
    # assigned lazily below in order of first appearance.
    icons = cycle(ORDERED_ICONS)

    dataset = common.Dataset(
        id=gelato.__name__,
        name="GeLaTo",
        description="Genes and Languages together",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='gelato.clld.org',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })

    # Register the dataset editors, ordered as listed (ord starts at 1).
    for i, (id_, name) in enumerate([('barbierichiara', 'Chiara Barbieri'),
                                     ('blasidamian', 'Damián Blasi'),
                                     ('forkelrobert', 'Robert Forkel')]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)

    # Maps family glottocode -> assigned icon (filled lazily below).
    families = {}

    # Sources from the dataset's BibTeX file.
    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    # One Panel per CLDF contribution.
    for r in args.cldf.iter_rows('ContributionTable', 'id', 'name',
                                 'description'):
        ds = data.add(models.Panel,
                      r['id'],
                      id=r['id'],
                      name=r['name'],
                      description=r['description'])
    # Each LanguageTable row is a Sample; its Languoid is created on first
    # sight of the glottocode and shared by later samples of the same language.
    for row in args.cldf.iter_rows('LanguageTable', 'id', 'name',
                                   'contributionReference'):
        icon = families.get(row['LanguageFamily_Glottocode'])
        if not icon:
            families[row['LanguageFamily_Glottocode']] = icon = next(icons)
        lang = data['Languoid'].get(row['Glottocode'])
        if not lang:
            lang = data.add(
                models.Languoid,
                row['Glottocode'],
                id=row['Glottocode'],
                name=row['Language_Name'],
                family_id=row['LanguageFamily_Glottocode'],
                family_name=row['LanguageFamily'],
                jsondata=dict(icon=icon.name),
            )
        s = data.add(
            models.Sample,
            row['id'],
            id=row['id'],
            name=row['Name'],
            panel=data['Panel'][row['contributionReference']],
            languoid=lang,
            latitude=row['Latitude'],
            longitude=row['Longitude'],
            samplesize=int(row['samplesize']),
            #source=row.get('dataSet.of.origin'),
            region=row['geographicRegion'],
            #location=row['location'],
            jsondata=dict(color=REGIONS[row['geographicRegion']]),
        )
        # Flush so the sample has a primary key before linking its sources.
        DBSession.flush()
        for bibkey in row['Source']:
            DBSession.add(
                common.LanguageSource(language_pk=s.pk,
                                      source_pk=data['Source'][bibkey].pk))

    # Measures (parameters); remember each parameter's CLDF datatype so
    # raw string values can be parsed below.
    types = {}
    for row in args.cldf.iter_rows('ParameterTable', 'id', 'name',
                                   'description', 'contributionReference'):
        types[row['id']] = Datatype.fromvalue(row['datatype'])
        data.add(models.Measure,
                 row['id'],
                 id=row['id'],
                 name=row['name'],
                 description=row['description'],
                 panel=data['Panel'][row['contributionReference']])

    # Measurements: only float-valued measurements are loaded; other
    # datatypes are silently skipped.
    for row in args.cldf.iter_rows('ValueTable', 'id', 'parameterReference',
                                   'languageReference'):
        v = types[row['parameterReference']].read(row['Value'])
        if isinstance(v, float):
            vs = data.add(
                common.ValueSet,
                row['id'],
                id=row['id'],
                language=data['Sample'][row['languageReference']],
                parameter=data['Measure'][row['parameterReference']],
                #contribution=ds,
                #jsondata=dict(color=REGIONS[sample.region]),
            )
            data.add(models.Measurement,
                     row['id'],
                     id=row['id'],
                     valueset=vs,
                     name=row['Value'],
                     value=v)
Пример #11
0
def main(args):  # pragma: no cover
    """Populate the Grambank CLLD database.

    Unlike the other loaders, this one commits in several transactions:
    features are imported and committed first, then values are loaded one
    language sheet at a time, each in its own transaction, and family
    information is added last.
    """
    get_repos()
    api = Grambank(REPOS['Grambank'])
    cldf = args.cldf
    data = Data()
    dataset = models.Grambank(
        id=grambank.__name__,
        name="Grambank",
        description="Grambank",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='grambank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })
    # Register contributors as editors; keep their primary keys (plain ints)
    # so they stay usable after the transaction.commit() calls below detach
    # the ORM objects.
    contributors = {}
    for i, contrib in enumerate(api.contributors):
        contrib = common.Contributor(
            contrib.id,
            id=contrib.id,
            name=contrib.name,
        )
        common.Editor(dataset=dataset, contributor=contrib, ord=i)
        DBSession.add(contrib)
        DBSession.flush()
        contributors[contrib.id] = contrib.pk
    # Raw LanguageTable rows keyed by ID; passed per-sheet to import_values.
    contributions = {r['ID']: r for r in cldf['LanguageTable']}

    DBSession.add(dataset)

    # Sources from the dataset's BibTeX file.
    for rec in tqdm(list(Database.from_file(cldf.bibpath, lowercase=True)),
                    desc='sources'):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()
    # Same pattern as contributors: keep primary keys across commits.
    sources = {k: v.pk for k, v in data['Source'].items()}

    features, codes = import_features(cldf, contributors)
    transaction.commit()

    # Group values by language sheet so each sheet can be loaded in its own
    # transaction (groupby requires the pre-sort by the same key).
    values_by_sheet = [(lid, list(v)) for lid, v in itertools.groupby(
        sorted(cldf['ValueTable'], key=lambda r: r['Language_ID']),
        lambda r: r['Language_ID'],
    )]
    for lid, values in tqdm(values_by_sheet, desc='loading values'):
        transaction.begin()
        import_values(values, contributions[lid], features, codes,
                      contributors, sources)
        transaction.commit()

    transaction.begin()

    glottolog = Glottolog(REPOS['glottolog'])
    languoids = {l.id: l for l in glottolog.languoids()}
    gblangs = DBSession.query(models.GrambankLanguage).all()
    load_families(data,
                  gblangs,
                  glottolog_repos=REPOS['glottolog'],
                  isolates_icon='dcccccc')

    # Add isolates: languages without a Glottolog family get a singleton
    # family of their own, described by a link to their Glottolog page.
    for lg in gblangs:
        gl_language = languoids.get(lg.id)
        if not gl_language.family:
            family = data.add(
                Family,
                gl_language.id,
                id=gl_language.id,
                name=gl_language.name,
                description=common.Identifier(
                    name=gl_language.id,
                    type=common.IdentifierType.glottolog.value).url(),
                jsondata={"icon": 'tcccccc'})
            lg.family = family
    coverage.main(glottolog)
    return
Пример #12
0
def main(args):
    """Load the "Mixe-Zoquean Voices" CLDF dataset into the clld database.

    Expects ``args.cldf`` (a pycldf dataset) and ``args.glottolog`` (path to
    a Glottolog clone). Populates Dataset, Contribution, Variety, Concept,
    ValueSet, Counterpart and reference objects via the SQLAlchemy session.
    """
    assert args.glottolog, 'The --glottolog option is required!'
    # The dataset must be released under some Creative Commons license.
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')

    data = Data()
    ds = data.add(
        common.Dataset,
        mixezoqueanvoices.__name__,
        id=mixezoqueanvoices.__name__,
        name="Mixe-Zoquean Voices",
        domain='mixezoqueanvoices.clld.org',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=license.url,
        jsondata={
            # Derive the icon filename from the license id, e.g.
            # "CC-BY-4.0" -> "cc-by.png" (version component dropped).
            'license_icon':
            '{}.png'.format('-'.join(
                [p.lower() for p in license.id.split('-')[:-1]])),
            'license_name':
            license.name
        },
    )

    # Single "contribution" wrapping the whole CLDF dataset.
    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    data.add(common.Contributor, 'kondic', id='kondic', name='Ana Kondic')
    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    # Only Kondic is listed as contributor of the contribution; both act
    # as dataset editors below.
    DBSession.add(
        common.ContributionContributor(
            contribution=contrib,
            contributor=data['Contributor']['kondic'],
        ))
    for i, ed in enumerate(['kondic', 'gray']):
        data.add(common.Editor,
                 ed,
                 dataset=ds,
                 contributor=data['Contributor'][ed],
                 ord=i)

    # Map language id -> list of proto-language names applying to it,
    # derived from the Glottolog lineage of its glottocode.
    ancestors = collections.defaultdict(list)
    gl = Glottolog(args.glottolog)
    lnames = {}
    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode',
                                    'name', 'latitude', 'longitude'):
        lnames[lang['id']] = lang['name']
        glang = None
        if lang['glottocode']:
            glang = gl.languoid(lang['glottocode'])
            lineage = [i[0] for i in glang.lineage]
            if 'Mixe-Zoque' in lineage:
                ancestors[lang['id']].append('Protomixezoque')
            if 'Mixe' in lineage:
                ancestors[lang['id']].append('Protomixe')
            if 'Oaxaca Mixe' in lineage:
                ancestors[lang['id']].append('Protooaxacamixe')
        if not glang:
            # Only one variety is known to lack a glottocode.
            assert lang['name'] == 'Nizaviguiti'
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            # NOTE(review): 'LongName' is read from the row although it is
            # not in the iter_rows column spec — presumably a raw column of
            # the LanguageTable; confirm it exists in the dataset.
            description=lang['LongName'],
            subgroup=glang.lineage[1][0]
            if glang and len(glang.lineage) > 1 else None,
        )
    # Assign one distinct map color per subgroup.
    colors = dict(
        zip(
            set(l.subgroup for l in data['Variety'].values()),
            qualitative_colors(
                len(set(l.subgroup for l in data['Variety'].values())))))
    for l in data['Variety'].values():
        l.jsondata = dict(color=colors[l.subgroup].replace('#', ''))

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    # (valueset-id, source-id) -> list of page specifications.
    refs = collections.defaultdict(list)

    # Store proto-forms for later lookup:
    proto_forms = collections.defaultdict(
        lambda: collections.defaultdict(list))
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference'):
        if form['languageReference'].startswith('Proto'):
            proto_forms[form['languageReference']][
                form['parameterReference']].append(form['form'])

    for param in args.cldf.iter_rows('ParameterTable', 'id',
                                     'concepticonReference', 'name'):
        # Collect reconstructions per proto-language for this concept.
        proto = collections.OrderedDict()
        for lid, forms in proto_forms.items():
            f = forms.get(param['id'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            concepticon_id=param['concepticonReference'],
            # NOTE(review): 'Concepticon_Gloss' / 'Spanish_Gloss' are raw
            # column names not listed in iter_rows — verify against the CLDF.
            concepticon_gloss=param['Concepticon_Gloss'],
            description=param['Spanish_Gloss'],
            jsondata=dict(reconstructions=proto),
        )

    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference',
                                    'source'):
        # A '►' form is a placeholder that must have an audio recording.
        assert not (form['form'] == '►' and not f2a.get(form['id']))
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        # Attach the relevant proto-forms of this form's ancestor languages.
        proto = collections.OrderedDict()
        for lid in ancestors.get(form['languageReference'], []):
            f = proto_forms[lid].get(form['parameterReference'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
            jsondata=dict(reconstructions=proto),
        )

    # Create one ValueSetReference per (valueset, source) pair, joining pages.
    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
Пример #13
0
    def add_to_database(self, data, language_id_map, contrib):
        """Load this CLDF submission into the clld database.

        Imports languages, contributors, sources, parameters, codes,
        examples, constructions, values and construction values for the
        given contribution ``contrib``, de-duplicating against the shared
        ``data`` cache and recording old-id -> new-id mappings for
        languages in ``language_id_map``.
        """
        # Only import languages actually referenced by values, examples or
        # constructions.
        used_languages = {
            row['Language_ID']
            for row in chain(
                self.cldf.get('ValueTable') or (),
                self.cldf.get('ExampleTable') or (),
                self.cldf.get('constructions.csv') or ())
            if row.get('Language_ID')
        }

        local_lang_ids = set()
        for language_row in self.cldf['LanguageTable']:
            old_id = language_row.get('ID')
            if not old_id or old_id not in used_languages:
                continue

            # Apparently some datasets contain multiple languages sharing the
            # same Glottocode...  So try and use the name to distinguish them
            id_candidate = language_row.get('Glottocode') or old_id
            number = 1
            new_id = id_candidate
            lang = data['Variety'].get(new_id)
            while (lang and new_id in local_lang_ids
                   and slug(lang.name) != slug(language_row.get('Name'))):
                number += 1
                new_id = '{}-{}'.format(id_candidate, number)
                lang = data['Variety'].get(new_id)
            local_lang_ids.add(new_id)

            language_id_map[old_id] = new_id
            if not lang:
                lang = data.add(Variety,
                                new_id,
                                id=new_id,
                                **map_cols(LANG_MAP, language_row))

            DBSession.flush()
            # TODO add glottocode, iso code, and wals code if available

            DBSession.add(
                ContributionLanguage(language_pk=lang.pk,
                                     contribution_pk=contrib.pk))

        DBSession.flush()

        # Authors may be plain name strings or dicts with metadata.
        for i, spec in enumerate(self.authors):
            if not isinstance(spec, dict):
                spec = {'name': spec}
            name = spec.get('name', '')
            parsed_name = HumanName(name)
            author_id = slug('{}{}'.format(parsed_name.last,
                                           parsed_name.first))
            author = data['Contributor'].get(author_id)
            if not author:
                author = data.add(Contributor,
                                  author_id,
                                  id=author_id,
                                  name=parsed_name.full_name,
                                  address=spec.get('affiliation'),
                                  url=spec.get('url'),
                                  email=spec.get('email'))
                DBSession.flush()
            DBSession.add(
                ContributionContributor(ord=i + 1,
                                        primary=spec.get('primary', True),
                                        contribution=contrib,
                                        contributor=author))

        # Sources get contribution-prefixed ids to stay globally unique.
        biblio_map = {}
        if self.sources:
            for bibrecord in self.sources.records:
                source = bibtex2source(bibrecord, CrossgramDataSource)
                old_id = bibrecord.id
                new_id = '{}-{}'.format(contrib.id, old_id)
                source.id = new_id
                source.contribution = contrib
                biblio_map[old_id] = source

        # Parameters referenced from cvalues.csv are construction parameters
        # (CParameter); all others are language parameters (LParameter).
        cparam_ids = {
            row['Parameter_ID']
            for row in self.cldf.get('cvalues.csv', ())
            if 'Parameter_ID' in row
        }

        if self.cldf.get('ParameterTable'):
            for param_row in self.cldf.get('ParameterTable', ()):
                old_id = param_row.get('ID')
                if not old_id:
                    continue
                new_id = '{}-{}'.format(contrib.id, old_id)
                data.add(CParameter if old_id in cparam_ids else LParameter,
                         old_id,
                         contribution=contrib,
                         id=new_id,
                         **map_cols(PARAM_MAP, param_row))
        else:
            # If there is no parameter table fall back to Parameter_ID's in the
            # value tables
            for lvalue_row in self.cldf.get('ValueTable', ()):
                old_id = lvalue_row.get('Parameter_ID')
                if not old_id or old_id in data['LParameter']:
                    continue
                new_id = '{}-{}'.format(contrib.id, old_id)
                data.add(LParameter,
                         old_id,
                         contribution=contrib,
                         id=new_id,
                         name=old_id)
            for cvalue_row in self.cldf.get('cvalues.csv', ()):
                # BUG FIX: this loop previously read `lvalue_row` (the stale
                # loop variable from the ValueTable loop above) and created
                # an LParameter, so construction parameters were never found
                # in data['CParameter'] when loading cvalues below.
                old_id = cvalue_row.get('Parameter_ID')
                if not old_id or old_id in data['CParameter']:
                    continue
                new_id = '{}-{}'.format(contrib.id, old_id)
                data.add(CParameter,
                         old_id,
                         contribution=contrib,
                         id=new_id,
                         name=old_id)

        DBSession.flush()

        # Codes: unit-level domain elements for CParameters, plain domain
        # elements for LParameters.
        for code_row in self.cldf.get('CodeTable', ()):
            old_id = code_row.get('ID')
            param_id = code_row.get('Parameter_ID')
            if not old_id or not param_id:
                continue
            new_id = '{}-{}'.format(contrib.id, old_id)
            if param_id in cparam_ids:
                param = data['CParameter'].get(param_id)
                data.add(UnitDomainElement,
                         old_id,
                         parameter=param,
                         id=new_id,
                         **map_cols(CCODE_MAP, code_row))
            else:
                param = data['LParameter'].get(param_id)
                data.add(DomainElement,
                         old_id,
                         parameter=param,
                         id=new_id,
                         **map_cols(LCODE_MAP, code_row))

        for index, example_row in enumerate(self.cldf.get('ExampleTable', ())):
            old_id = example_row.get('ID')
            lang_new_id = language_id_map.get(example_row['Language_ID'])
            lang = data['Variety'].get(lang_new_id)
            if not old_id or not lang:
                continue
            # Examples are numbered sequentially within the contribution.
            new_id = '{}-{}'.format(contrib.number or contrib.id, index + 1)
            example_row = _merge_glosses(example_row)
            example = data.add(Example,
                               old_id,
                               language=lang,
                               contribution=contrib,
                               id=new_id,
                               **map_cols(EXAMPLE_MAP, example_row))

            DBSession.flush()
            st = parse_source(biblio_map, example_row.get('Source') or '')
            if st and st.source_pk is not None:
                DBSession.add(
                    SentenceReference(key=st.bibkey,
                                      description=st.pages,
                                      sentence_pk=example.pk,
                                      source_pk=st.source_pk))

        DBSession.flush()

        for constr_row in self.cldf.get('constructions.csv', ()):
            old_id = constr_row.get('ID')
            if not old_id:
                continue
            new_id = '{}-{}'.format(contrib.id, old_id)
            lang_new_id = language_id_map.get(constr_row['Language_ID'])
            lang = data['Variety'].get(lang_new_id)
            constr = data.add(Construction,
                              old_id,
                              language=lang,
                              contribution=contrib,
                              id=new_id,
                              **map_cols(CONSTR_MAP, constr_row))

            DBSession.flush()
            for source_string in sorted(set(constr_row.get('Source') or ())):
                st = parse_source(biblio_map, source_string)
                if st and st.source_pk is not None:
                    DBSession.add(
                        UnitReference(key=st.bibkey,
                                      description=st.pages,
                                      unit_pk=constr.pk,
                                      source_pk=st.source_pk))

            for ex_id in sorted(set(constr_row.get('Example_IDs', ()))):
                example = data['Example'].get(ex_id)
                if example:
                    DBSession.add(UnitSentence(unit=constr, sentence=example))

        DBSession.flush()

        valueset_refs = OrderedDict()
        for value_row in self.cldf.get('ValueTable', ()):
            old_id = value_row.get('ID')
            lang_new_id = language_id_map.get(value_row['Language_ID'])
            lang = data['Variety'].get(lang_new_id)
            param = data['LParameter'].get(value_row['Parameter_ID'])
            code = data['DomainElement'].get(value_row['Code_ID'])
            value_name = code.name if code and code.name else value_row['Value']
            if not old_id or not lang or not param or not value_name:
                continue
            new_id = '{}-{}'.format(contrib.id, old_id)

            # Values for the same (language, parameter) pair share a valueset.
            valueset = data['ValueSet'].get((lang.pk, param.pk))
            if not valueset:
                valueset = data.add(ValueSet, (lang.pk, param.pk),
                                    id=new_id,
                                    language=lang,
                                    parameter=param,
                                    contribution=contrib)

            DBSession.flush()
            lvalue = data['Value'].get((valueset.pk, value_name))
            if not lvalue:
                lvalue = data.add(Value, (valueset.pk, value_name),
                                  id=new_id,
                                  name=value_name,
                                  valueset=valueset,
                                  domainelement=code)

            for source_string in sorted(set(value_row.get('Source') or ())):
                st = parse_source(biblio_map, source_string)
                if st and st.source_pk is not None:
                    # collect sources for all values in the same value set
                    if valueset.pk not in valueset_refs:
                        valueset_refs[valueset.pk] = list()
                    valueset_refs[valueset.pk].append(st)

            DBSession.flush()
            for ex_id in sorted(set(value_row.get('Example_IDs', ()))):
                example = data['Example'].get(ex_id)
                if example:
                    DBSession.add(ValueSentence(value=lvalue,
                                                sentence=example))

        # attach collected sources from values to the value set
        valuesets = DBSession.query(ValueSet)\
            .filter(ValueSet.contribution == contrib)
        for valueset in valuesets:
            source_tuples = sorted(set(valueset_refs.get(valueset.pk, ())))
            for st in source_tuples:
                DBSession.add(
                    ValueSetReference(key=st.bibkey,
                                      description=st.pages or None,
                                      valueset_pk=valueset.pk,
                                      source_pk=st.source_pk))
            # NOTE(review): st[2] is assumed to be the formatted source
            # string of the parse_source result tuple — confirm its layout.
            valueset.source = ';'.join(st[2] for st in source_tuples)

        for cvalue_row in self.cldf.get('cvalues.csv', ()):
            old_id = cvalue_row.get('ID')
            constr = data['Construction'].get(cvalue_row['Construction_ID'])
            param = data['CParameter'].get(cvalue_row['Parameter_ID'])
            code = data['UnitDomainElement'].get(cvalue_row['Code_ID'])
            value_name = code.name if code else cvalue_row['Value']
            if not old_id or not constr or not param or not value_name:
                continue
            new_id = '{}-{}'.format(contrib.id, old_id)

            cvalue = data.add(UnitValue,
                              old_id,
                              id=new_id,
                              name=value_name,
                              contribution=contrib,
                              unit=constr,
                              unitparameter=param,
                              unitdomainelement=code)

            DBSession.flush()
            for ex_id in sorted(set(cvalue_row.get('Example_IDs') or ())):
                example = data['Example'].get(ex_id)
                if example:
                    DBSession.add(
                        UnitValueSentence(unitvalue=cvalue, sentence=example))

            for source_string in sorted(set(cvalue_row.get('Source') or ())):
                st = parse_source(biblio_map, source_string)
                if st and st.source_pk is not None:
                    DBSession.add(
                        UnitValueReference(key=st.bibkey,
                                           description=st.pages or None,
                                           unitvalue=cvalue,
                                           source_pk=st.source_pk))
Пример #14
0
def main(args):
    """Load the eWAVE CLDF dataset into the clld database.

    Reads the DOI of the released dataset interactively, then populates
    Dataset, contributors, feature categories, variety types, regions,
    varieties/contributions, features, codes, sources, examples and
    values via the SQLAlchemy session.
    """
    data = Data()
    doi = input('DOI of the released dataset: ')

    dataset = common.Dataset(
        id=ewave.__name__,
        name='eWAVE',
        description='The Electronic World Atlas of Varieties of English',
        domain='ewave-atlas.org',
        published=date.today(),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'doi': doi,
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'})
    DBSession.add(dataset)

    # Contributor ids like "ed1", "ed2" mark dataset editors; the numeric
    # suffix gives the editor ordering.
    ed_pattern = re.compile('ed(?P<ord>[0-9]+)$')
    for c in args.cldf['contributors.csv']:
        contrib = data.add(
            models.WaveContributor,
            c['ID'],
            id=c['ID'],
            name=c['Name'],
            email=c['Email'],
            url=c['URL'],
            address=c['Address'],
            sortkey=HumanName(c['Name']).last,
        )
        m = ed_pattern.match(c['ID'])
        if m:
            common.Editor(dataset=dataset, contributor=contrib, ord=int(m.group('ord')))

    for fc in args.cldf['featurecategories.csv']:
        data.add(
            models.FeatureCategory, fc['ID'],
            id=fc['ID'], name=fc['Name'], description=fc['Description'])

    for vt in args.cldf['varietytypes.csv']:
        data.add(
            models.VarietyType, vt['ID'],
            id=vt['ID'],
            name=vt['Name'],
            description=vt['Description'],
            # Map icon per variety type, keyed by type id.
            jsondata=VARIETY_TYPE_ICONS[vt['ID']],
        )

    for vt in args.cldf['regions.csv']:
        data.add(models.Region, vt['ID'], id=vt['ID'], name=vt['Name'])

    # Each language doubles as a contribution (one chapter per variety).
    for lang in args.cldf['LanguageTable']:
        l = data.add(
            models.Variety, lang['ID'],
            id=lang['ID'],
            name=lang['Name'],
            latitude=lang['Latitude'],
            longitude=lang['Longitude'],
            abbr=lang['abbr'],
            region=data['Region'][lang['Region_ID']],
            type=data['VarietyType'][lang['Type_ID']],
        )
        if lang['Glottocode']:
            add_language_codes(data, l, None, glottocode=lang['Glottocode'])
        c = data.add(
            models.WaveContribution, lang['ID'],
            id=str(lang['ID']),
            name=lang['Name'],
            description=lang['Description'],
            variety=l)
        for i, cid in enumerate(lang['Contributor_ID']):
            DBSession.add(common.ContributionContributor(
                contribution=c,
                contributor=data['WaveContributor'][cid],
                ord=i+1,
            ))

    for param in args.cldf['ParameterTable']:
        data.add(
            models.Feature, param['ID'],
            id=param['ID'],
            category=data['FeatureCategory'][param['Category_ID']],
            name=param['Name'],
            description=param['Description'],
            jsondata={'example_source': param['Example_Source']})


    for de in args.cldf['CodeTable']:
        data.add(
            common.DomainElement, de['ID'],
            id=de['ID'],
            parameter=data['Feature'][de['Parameter_ID']],
            name=de['Name'],
            description=de['Description'],
            # Map marker color, keyed by the code's name (e.g. rating letter).
            jsondata={'color': CODE_COLORS[de['Name']]},
            number=de['Number'])

    # Source ids are slugged here; Sources.parse ids below are used verbatim —
    # NOTE(review): this assumes bibliography keys are already slugs.
    for rec in bibtex.Database.from_file(args.cldf.bibpath):
        data.add(common.Source, slug(rec.id), _obj=bibtex2source(rec))

    for example in args.cldf['ExampleTable']:
        s = data.add(
            common.Sentence, example['ID'],
            id=example['ID'],
            name=example['Primary_Text'],
            gloss='\t'.join(example['Gloss']) if example['Gloss'] else None,
            comment=example['Comment'] or None,
            description=example['Translated_Text'] or None,
            language=data['Variety'][example['Language_ID']])

        for ref in example['Source']:
            sid, pages = Sources.parse(ref)
            DBSession.add(common.SentenceReference(
                sentence=s, source=data['Source'][sid], description=pages, key=sid))

    for value in args.cldf['ValueTable']:
        de = data['DomainElement'][value['Code_ID']]
        vs = data.add(
            common.ValueSet, value['ID'],
            id=value['ID'],
            contribution=data['WaveContribution'][value['Language_ID']],
            parameter=data['Feature'][value['Parameter_ID']],
            # Valueset inherits the code's color for map rendering.
            jsondata=de.jsondata,
            language=data['Variety'][value['Language_ID']])
        v = data.add(
            common.Value, value['ID'],
            id=value['ID'],
            domainelement=de,
            valueset=vs)

        for eid in value['Example_ID']:
            DBSession.add(common.ValueSentence(sentence=data['Sentence'][eid], value=v))
Пример #15
0
def main(args):
    """Load the Jambu CLDF lexical dataset into the clld database.

    Populates Dataset, Varieties, Sources, Cognates, Concepts (with a
    per-concept language count), Lexemes and valueset references via the
    SQLAlchemy session. Requires ``args.glottolog``.
    """

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    ds = data.add(
        common.Dataset,
        jambu.__name__,
        id=jambu.__name__,
        name='Jambu',
        domain='jambu-clld.herokuapp.com',
        publisher_name="Georgetown University",
        publisher_place="Washington",
        publisher_url="http://gucl.georgetown.edu/",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        },
    )

    for i, name in enumerate(['Aryaman Arora']):
        common.Editor(dataset=ds,
                      ord=i,
                      contributor=common.Contributor(id=slug(
                          HumanName(name).last),
                                                     name=name))

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )

    print("Languages...")
    for lang in iteritems(args.cldf, 'LanguageTable', 'id', 'name',
                          'glottocode', 'longitude', 'latitude', 'Clade'):
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            family=lang['Clade'],
        )

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    # (valueset-id, source-id) -> list of page specifications.
    refs = collections.defaultdict(list)

    print("Cognates...")
    for cognate in iteritems(args.cldf, 'CognateTable'):
        # print(cognate)
        # NOTE(review): cognates are keyed by Cognateset_ID, so later rows
        # with the same cognateset overwrite earlier ones in the cache —
        # confirm this is intended.
        data.add(models.Cognate_,
                 cognate['Cognateset_ID'],
                 name=cognate['Form'],
                 language=cognate['Language_ID'],
                 description=cognate['Description'])

    # First pass over forms: count distinct languages per concept.
    counts = collections.defaultdict(set)
    print("Forms...")
    for form in tqdm(
            iteritems(args.cldf, 'FormTable', 'id', 'form',
                      'languageReference', 'parameterReference', 'source')):
        counts[form['parameterReference']].add(form['languageReference'])

    print("Params...")
    for param in tqdm(
            iteritems(args.cldf, 'ParameterTable', 'ID', 'Name',
                      'Concepticon_ID', 'Description')):
        data.add(models.Concept,
                 param['ID'],
                 id=param['ID'],
                 name='{} [{}]'.format(param['Name'], param['ID']),
                 description=param['Description'],
                 count=len(counts[param['ID']]))

    # Second pass over forms: create valuesets and lexemes.
    print("Forms...")
    for form in tqdm(
            iteritems(args.cldf, 'FormTable', 'id', 'form',
                      'languageReference', 'parameterReference', 'source')):
        # A form may map to several concepts, separated by ';' or '+';
        # '?' marks an unknown concept and is skipped.
        l = re.split(r";|\+", form['parameterReference'])
        for i, paramref in enumerate(l):
            if paramref == '?': continue
            vsid = (form['languageReference'], paramref)
            vs = data['ValueSet'].get(vsid)
            if not vs:
                vs = data.add(
                    common.ValueSet,
                    vsid,
                    id='-'.join(vsid),
                    language=data['Variety'][form['languageReference']],
                    parameter=data['Concept'][paramref],
                    contribution=contrib,
                )

            for ref in form.get('source', []):
                sid, pages = Sources.parse(ref)
                refs[(vsid, sid)].append(pages)

            data.add(
                models.Lexeme,
                # Disambiguate multi-concept forms with an index suffix.
                form['id'] + '-' + str(i) if len(l) > 1 else form['id'],
                id=form['id'] + '-' + str(i) if len(l) > 1 else form['id'],
                name=form['form'],
                gloss=form['Gloss'],
                native=form['Native'],
                phonemic='/' + form['Phonemic'] +
                '/' if form['Phonemic'] else None,
                description=form['Description'],
                cognateset=form['Cognateset'],
                valueset=vs,
            )

    print("Refs...")
    for (vsid, sid), pages in tqdm(refs.items()):
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
Пример #16
0
def main(args):
    """Load the Tsammalex dataset into the clld database.

    Reads CSV/JSON/bibtex source files from ``args.data_repos`` and
    populates the Dataset, Contribution, sources, ecoregions, countries,
    languoids, categories, taxa, names and image files.
    """
    def data_file(*comps):
        """Return a path below the repository's ``tsammalexdata/data`` dir."""
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })
    data.add(common.Contribution,
             'tsammalex',
             name="Tsammalex",
             id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec,
                 rec.id,
                 _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    second_languages = {}

    def languoid_visitor(lang, row, _):
        """Attach ISO/Glottolog codes and remember 2nd-language links."""
        add_language_codes(data,
                           lang,
                           lang.id.split('-')[0],
                           None,
                           glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        # The "habitats" CSV reuses the Category model; flag those rows.
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        """Enrich a taxon from taxa.json and denormalize its regions."""
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        """Derive the thumbnail/web variant URL from an original image URL."""
        # FIX: the pattern is now a raw string; '\.' in a plain string is an
        # invalid escape sequence (DeprecationWarning, future SyntaxError).
        return re.sub(r'\.[a-zA-Z]+$', '.jpg',
                      source_url).replace('/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):

        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue

            # Only images hosted on the Edmond media server are imported.
            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue

            jsondata = dict(url=image.source_url,
                            thumbnail=image_url(image.source_url, 'thumbnail'),
                            web=image_url(image.source_url, 'web'))

            f = common.Parameter_files(object=data['Taxon'][image.taxa__id],
                                       id=image.id,
                                       name=image.tags,
                                       jsondata=jsondata,
                                       mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
Пример #17
0
def main(args):
    """
    Populate the pofatu clld database from the pofatu-data CLDF dataset.

    Loads dataset metadata, the bibliography, locations (with midpoint
    coordinates aggregated over all samples at a location), contributions,
    analytical methods, samples, analyses and individual measurements into
    the current ``DBSession``.
    """
    data = Data()
    ds = Pofatu(
        pathlib.Path(pofatu.__file__).parent.parent.parent / 'pofatu-data')

    dataset = common.Dataset(
        id=pofatu.__name__,
        name="POFATU",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        domain='pofatu.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })

    # Register the dataset editors.
    for i, (id_, name) in enumerate([
        ('hermannaymeric', 'Aymeric Hermann'),
        ('forkelrobert', 'Robert Forkel'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    # Bibliography: normalize BibTeX entry types and map 'date' -> 'year',
    # since clld sources expect a year field.
    for rec in ds.iterbib():
        rec.genre = bibtex.EntryType.from_string(
            ENTRY_TYPES.get(rec.genre, rec.genre))
        if 'date' in rec:
            rec['year'] = rec.pop('date')
        data.add(common.Source,
                 rec.id,
                 _obj=bibtex2source(rec, lowercase_id=False))

    analyses = list(ds.iterdata())

    def midpoint(coords):
        # Shift negative longitudes by 360 so point sets straddling the
        # antimeridian form a contiguous convex hull; shift the centroid
        # back into range afterwards.
        # NOTE(review): coordinates are passed as (lat, lon), so c.x is the
        # latitude and c.y the (shifted) longitude of the hull centroid.
        p = MultiPoint([(lat, lon + 360 if lon < 0 else lon)
                        for lat, lon in coords]).convex_hull
        c = p.centroid
        return c.x, (c.y - 360) if c.y > 180 else c.y

    # Collect, per location, the set of coordinates seen across all samples,
    # and per artefact the first non-empty value of a few attributes.
    artefacts = collections.defaultdict(dict)
    midpoints = {}
    for a in analyses:
        l = a.sample.location
        lid = l.id
        if lid not in midpoints:
            midpoints[lid] = set()
        if l.latitude is not None and l.longitude is not None:
            midpoints[lid].add((l.latitude, l.longitude))
        art = a.sample.artefact
        for attr_ in ['name', 'category', 'collection_type']:
            if not artefacts[slug(art.id)].get(attr_):
                artefacts[slug(art.id)][attr_] = getattr(art, attr_)

    # Reduce each coordinate set to a single representative point.
    midpoints = {
        k: midpoint(v) if v else (None, None)
        for k, v in midpoints.items()
    }

    # Add one Location (a clld Language subclass) per distinct location.
    for analysis in analyses:
        loc = analysis.sample.location
        if loc.id not in data['Location']:
            data.add(
                models.Location,
                loc.id,
                id=valid_id(loc.id),
                name=loc.label,
                latitude=midpoints[loc.id][0],
                longitude=midpoints[loc.id][1],
                region=loc.region.replace('_', ' '),
                subregion=loc.subregion,
                location=loc.locality,
            )

    # Add contributions
    for contrib in ds.itercontributions():
        contribution = data.add(
            common.Contribution,
            contrib.id,
            id=valid_id(contrib.id),
            name=contrib.label,
            description=contrib.description,
        )
        DBSession.flush()
        for i, name in enumerate(contrib.contributors):
            cid = slug(name)
            co = data['Contributor'].get(cid)
            if not co:
                co = data.add(common.Contributor, cid, id=cid, name=name)
            common.ContributionContributor(ord=i,
                                           contribution=contribution,
                                           contributor=co)

        for ref in contrib.source_ids:
            DBSession.add(
                common.ContributionReference(
                    contribution=contribution,
                    source=data['Source'][ref],
                ))
            # Also index the contribution by its source ids, so samples
            # (which carry a source_id) can be linked to it below.
            data['Contribution'][ref] = contribution

    # Add analytical methods, indexed by (code, parameter) for lookup.
    methods = collections.defaultdict(list)
    for method in ds.itermethods():
        m = data.add(
            models.Method,
            method.id,
            id=valid_id(method.id),
            name=method.label,
            code=method.code,
            parameter=method.parameter.strip(),
            instrument=method.instrument,
            number_of_replicates=method.number_of_replicates,
            date=method.date,
            comment=method.comment,
            detection_limit=method.detection_limit,
            detection_limit_unit=method.detection_limit_unit,
            total_procedural_blank_value=method.total_procedural_blank_value,
            total_procedural_unit=method.total_procedural_unit,
        )
        methods[(m.code.lower(), m.parameter.lower())].append(m)
        for ref in method.references:
            DBSession.add(
                models.MethodReference(
                    method=m,
                    sample_name=ref.sample_name,
                    sample_measured_value=ref.sample_measured_value,
                    uncertainty=ref.uncertainty,
                    uncertainty_unit=ref.uncertainty_unit,
                    number_of_measurements=ref.number_of_measurements,
                ))
        for ref in method.normalizations:
            DBSession.add(
                models.Normalization(
                    method=m,
                    reference_sample_name=ref.reference_sample_name,
                    reference_sample_accepted_value=ref.
                    reference_sample_accepted_value,
                    citation=ref.citation,
                ))

    # The single clld Parameter: sample category, with one DomainElement
    # per category option declared on the pypofatu Sample model.
    parameter = data.add(common.Parameter,
                         'c',
                         id='category',
                         name='Sample category')
    for i, opt in enumerate(attr.fields_dict(
            pypofatu.models.Sample)['sample_category'].validator.options,
                            start=1):
        data.add(common.DomainElement,
                 opt,
                 parameter=parameter,
                 id=str(i),
                 name=opt)

    DBSession.flush()
    assert parameter.pk

    # Add Samples and UnitParameters and Measurements
    for analysis in analyses:
        sample = analysis.sample
        # One ValueSet per (location, contribution) pair.
        vsid = '{0}-{1}'.format(sample.location.id,
                                data['Contribution'][sample.source_id].id)
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id=valid_id(vsid),
                language_pk=data['Location'][sample.location.id].pk,
                parameter_pk=parameter.pk,
                contribution_pk=data['Contribution'][sample.source_id].pk,
            )
        v = data['Sample'].get(sample.id)
        if not v:
            v = data.add(
                models.Sample,
                sample.id,
                id=valid_id(sample.id),
                name=sample.id,
                sample_name=sample.sample_name,
                sample_comment=sample.sample_comment,
                petrography=sample.petrography,
                latitude=sample.location.latitude,
                longitude=sample.location.longitude,
                elevation=sample.location.elevation,
                location_comment=sample.location.comment,
                site_name=sample.site.name,
                site_code=sample.site.code,
                site_context=sample.site.context,
                site_comment=sample.site.comment,
                site_stratigraphic_position=sample.site.stratigraphic_position,
                site_stratigraphy_comment=sample.site.stratigraphy_comment,
                domainelement=data['DomainElement'][sample.sample_category],
                valueset=vs,
                artefact_id=sample.artefact.id,
                artefact_name=sample.artefact.name,
                artefact_category=sample.artefact.category,
                artefact_comment=sample.artefact.comment,
                artefact_attributes=sample.artefact.attributes,
                artefact_collector=sample.artefact.collector,
                artefact_collection_type=sample.artefact.collection_type,
                artefact_collection_location=sample.artefact.
                collection_location,
                artefact_collection_comment=sample.artefact.collection_comment,
                artefact_fieldwork_date=sample.artefact.fieldwork_date,
            )
            DBSession.add(
                models.SampleReference(
                    description='sample',
                    sample=v,
                    source=data['Source'][sample.source_id]))
            for ref in sample.artefact.source_ids:
                DBSession.add(
                    models.SampleReference(description='artefact',
                                           sample=v,
                                           source=data['Source'][ref]))
            for ref in sample.site.source_ids:
                DBSession.add(
                    models.SampleReference(description='site',
                                           sample=v,
                                           source=data['Source'][ref]))

        a = data.add(
            models.Analysis,
            analysis.id,
            id=better_slug(analysis.id),
            name=analysis.id,
            sample=v,
        )

        for i, measurement in enumerate(analysis.measurements):
            if i == 0:
                # Method metadata is taken from the first measurement only.
                method = measurement.method
                if method:
                    # FIX: these assignments previously ended in trailing
                    # commas, which stored each value as a 1-tuple instead
                    # of the value itself.
                    a.analyzed_material_1 = method.analyzed_material_1
                    a.analyzed_material_2 = method.analyzed_material_2
                    a.sample_preparation = method.sample_preparation
                    a.chemical_treatment = method.chemical_treatment
                    a.technique = method.technique
                    a.laboratory = method.laboratory
                    a.analyst = method.analyst

            pid = slug(measurement.parameter, lowercase=False)
            p = data['Param'].get(pid)
            if not p:
                p = data.add(models.Param,
                             pid,
                             id=pid,
                             name=measurement.parameter)
            data.add(
                models.Measurement,
                None,
                id='{0}-{1}'.format(a.id, p.id),
                analysis=a,
                method=data['Method'].get(measurement.method.id)
                if measurement.method else None,
                value=measurement.value,
                less=measurement.less,
                precision=measurement.value_sd,
                sigma=measurement.sd_sigma,
                unitparameter=p,
            )
Пример #18
0
def main(args):  # pragma: no cover
    """
    Populate the Hindu Kush Areal Typology clld database.

    Expects ``args.cldf`` to point at a directory containing multiple CLDF
    datasets (a Wordlist and a StructureDataset) and ``args.glottolog`` to
    point at a clone of the Glottolog repository (used to attach language
    family information at the end).
    """
    #
    # FIXME: more generic:
    # - run iter_datasets(args.cldf) -> assuming args.cldf is a directory! -> must go in clld!
    # - Store datasets in defaultdict(list) keyed with module
    #
    # Index the CLDF datasets by their module ('Wordlist'/'StructureDataset');
    # a later dataset with the same module would overwrite an earlier one.
    datasets = {}
    for ds in iter_datasets(args.cldf.directory):
        datasets[ds.module] = ds

    assert args.glottolog, 'The --glottolog option is required!'

    data = Data()
    thedataset = data.add(
        common.Dataset,
        hindukush.__name__,
        id=hindukush.__name__,
        name='Hindu Kush Areal Typology',
        domain='hindukush.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        },
    )
    # Register editors, identified by the slug of their last name.
    for i, name in enumerate(
        ['Henrik Liljegren', 'Robert Forkel', 'Nina Knobloch', 'Noa Lange']):
        common.Editor(dataset=thedataset,
                      ord=i,
                      contributor=common.Contributor(id=slug(
                          HumanName(name).last),
                                                     name=name))

    # Sources: first the website bibliography shipped next to this script,
    # then the CLDF dataset's own bibliography.
    for rec in bibtex.Database.from_file(pathlib.Path(__file__).parent /
                                         'HK_website.bib',
                                         lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    # Collected source references, keyed by (valueset id, source id);
    # turned into ValueSetReference rows after all valuesets exist.
    refs = collections.defaultdict(list)
    # Process datasets in a deterministic (alphabetical-by-module) order.
    for module, ds in sorted(datasets.items(), key=lambda i: i[0]):
        # Languages may occur in several datasets; add each variety once.
        # NOTE(review): 'SubGroup', 'Location' and 'Elicitation' are accessed
        # as raw column names, so the LanguageTable is assumed to carry these
        # extra columns — confirm against the datasets.
        for lang in ds.iter_rows('LanguageTable', 'id', 'glottocode', 'name',
                                 'latitude', 'longitude'):
            if lang['id'] not in data['Variety']:
                data.add(
                    models.Variety,
                    lang['id'],
                    id=lang['id'],
                    name=lang['name'],
                    latitude=lang['latitude'],
                    longitude=lang['longitude'],
                    glottocode=lang['glottocode'],
                    subgroup=lang['SubGroup'],
                    location=lang['Location'],
                    elicitation=lang['Elicitation'],
                    jsondata=dict(shape=subgroup_shapes.get(lang['SubGroup'])),
                )

        # One Contribution per CLDF dataset/module.
        contrib = data.add(
            models.CLDFDataset,
            module,
            id=module,
            name='{} [{}]'.format(ds.properties.get('dc:title'), module),
            description=ds.properties.get('dc:bibliographicCitation'),
            module=module,
        )

        if module == 'Wordlist':
            # Lexical parameters; Numerals concepts get a zero-padded sortkey
            # so they order numerically rather than lexically.
            for param in ds.iter_rows('ParameterTable', 'id',
                                      'concepticonReference', 'name'):
                data.add(
                    models.Param,
                    param['id'],
                    id=param['id'],
                    name='{} [{}]'.format(param['name'], param['id']),
                    sortkey=param['id']
                    if not param['id'].startswith('Numerals') else
                    'Numerals-{0:04d}'.format(int(param['id'].split('-')[1])),
                    concepticon_id=param['concepticonReference'],
                    contribution=contrib,
                    category=param['domain'] or 'ASJPlist',
                )

            # Index MP3 media rows by ID for attaching audio to forms.
            audio = {
                r['ID']: r
                for r in ds.iter_rows('media.csv')
                if r['mimetype'] == 'audio/mpeg'
            }
            for form in ds.iter_rows('FormTable', 'id', 'form',
                                     'languageReference', 'parameterReference',
                                     'source'):
                # ValueSets are keyed by (language, parameter).
                vsid = (form['languageReference'], form['parameterReference'])
                vs = data['ValueSet'].get(vsid)
                if not vs:
                    vs = data.add(
                        common.ValueSet,
                        vsid,
                        id='-'.join(vsid),
                        language=data['Variety'][form['languageReference']],
                        parameter=data['Param'][form['parameterReference']],
                        contribution=contrib,
                    )
                # Accumulate source refs; materialized after the loop.
                for ref in form.get('source', []):
                    sid, pages = Sources.parse(ref)
                    refs[(vsid, sid)].append(pages)
                # First available audio file for this form, if any.
                mp3 = next(
                    iter([
                        audio[aid] for aid in form['Audio_Files']
                        if aid in audio
                    ]), None)
                data.add(
                    common.Value,
                    form['id'],
                    id=form['id'],
                    name=form['form'],
                    valueset=vs,
                    jsondata=dict(audio=ds.get_row_url('media.csv', mp3
                                                       ) if mp3 else None),
                )
        elif module == 'StructureDataset':
            # Structural features; descriptions are rendered to HTML.
            for param in ds.iter_rows('ParameterTable', 'id', 'name',
                                      'description'):
                data.add(
                    models.Param,
                    param['id'],
                    id=param['id'],
                    name=param['name'],
                    description=html(param['description'])
                    if param['description'] else None,
                    category=param['Category'],
                    contribution=contrib,
                )
            # Feature values; well-known codes get fixed map colors.
            for code in ds.iter_rows('CodeTable', 'id', 'name', 'description',
                                     'parameterReference'):
                data.add(common.DomainElement,
                         code['id'],
                         id=code['id'],
                         name=code['name'],
                         description=code['description'],
                         parameter=data['Param'][code['parameterReference']],
                         jsondata={
                             'color': {
                                 'absent': 'ff0000',
                                 'present': '0000ff',
                                 'indeterminate': 'cccccc',
                             }.get(code['description'])
                         })
            #
            # FIXME: read CodeTable!
            #
            for form in ds.iter_rows('ValueTable', 'id', 'value',
                                     'languageReference', 'parameterReference',
                                     'codeReference', 'source'):
                vsid = (form['languageReference'], form['parameterReference'])
                vs = data['ValueSet'].get(vsid)
                if not vs:
                    vs = data.add(
                        common.ValueSet,
                        vsid,
                        id='-'.join(vsid),
                        language=data['Variety'][form['languageReference']],
                        parameter=data['Param'][form['parameterReference']],
                        contribution=contrib,
                    )
                for ref in form.get('source', []):
                    sid, pages = Sources.parse(ref)
                    refs[(vsid, sid)].append(pages)
                data.add(
                    common.Value,
                    form['id'],
                    id=form['id'],
                    name=form['value'],
                    valueset=vs,
                    domainelement=data['DomainElement'][form['codeReference']])

    # Materialize the collected source references, joining page specs.
    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
    # Attach Glottolog family info; isolates get a dedicated map icon.
    load_families(
        Data(),
        [(l.glottocode, l) for l in data['Variety'].values()],
        glottolog_repos=args.glottolog,
        isolates_icon='tcccccc',
        strict=False,
    )