Example #1
def add_doculects(lang_dataset, session, sources=None):
    """
    Creates and adds to the given SQLAlchemy session the Doculect instances
    harvested from the given LangDataset instance. Returns a dict of the added
    model instances with the respective ISO codes being the keys.
    The optional arg should contain common.Source instances with the keys being
    strings starting with the ISO code of the language that the source is for.
    Helper for the main function.
    """
    sources = sources or {}
    d = {}

    for lang in lang_dataset.gen_langs():

        if not (lang.name and lang.subfamily and lang.iso_code and
                lang.glotto_code and lang.longitude and lang.latitude and
                lang.id):
            print(f'SKIP: Missing data for {lang.name or lang.id}.', file=sys.stderr)
            continue

        # Key by ISO code so the key[:3] lookup against the sources dict below works.
        d[lang.iso_code] = Doculect(id=lang.id, name=lang.name, subfamily=lang.subfamily,
                                    iso_code=lang.iso_code, glotto_code=lang.glotto_code,
                                    longitude=lang.longitude, latitude=lang.latitude)
        session.add(d[lang.iso_code])

    session.flush()

    for key, source in sources.items():
        if key[:3] in d:
            session.add(common.LanguageSource(
                language_pk=d[key[:3]].pk,
                source_pk=source.pk))
    session.flush()

    return d
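
A hedged usage sketch; lang_dataset, session and the (already flushed) Source instances are assumed to come from the surrounding import script:

# Hypothetical inputs; the sources keys start with the ISO code of the
# language they document, as the docstring requires.
sources = {'abc_grammar1990': common.Source(id='abc_grammar1990', name='A Grammar of Abc')}
doculects = add_doculects(lang_dataset, session, sources=sources)
# doculects now maps ISO codes to the persisted Doculect instances.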
Example #2
def compute_language_sources(*references):
    """compute relations between languages and sources by going through the relevant
    models derived from the HasSource mixin.
    """
    old_sl = set()
    for pair in DBSession.query(common.LanguageSource):
        old_sl.add((pair.source_pk, pair.language_pk))

    references = list(references)
    references.extend([(common.ValueSetReference, 'valueset'),
                       (common.SentenceReference, 'sentence')])
    sl = set()
    for model, attr in references:
        for ref in DBSession.query(model):
            sl.add((ref.source_pk, getattr(ref, attr).language_pk))

    for s, l in sl:
        if (s, l) not in old_sl:
            DBSession.add(common.LanguageSource(language_pk=l, source_pk=s))
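
Additional (model, attribute) pairs can be passed for other models derived from HasSource; a hedged sketch, where the reference model and its relation are hypothetical placeholders and the attribute must lead to an object exposing language_pk:

# models.WordReference and its 'word' relation are illustrative only.
compute_language_sources((models.WordReference, 'word'))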
Example #3
def load_ref(data, entry, lgcodes, lgsources):
    kw = {'jsondata': {}, 'language_note': entry.fields.get('lgcode')}
    for col in common.Source.__table__.columns:
        if col.name in entry.fields:
            kw[col.name] = entry.fields.get(col.name)
    for col in models.Ref.__table__.columns:
        if col.name in entry.fields:
            kw[col.name] = entry.fields.get(col.name)
    for col in entry.fields:
        if col not in kw:
            kw['jsondata'][col] = entry.fields[col]
    try:
        btype = EntryType.from_string(entry.type.lower())
    except ValueError:
        btype = EntryType.misc

    kw.update(
        publisher=entry.publisher_and_address[0],
        address=entry.publisher_and_address[1],
        year_int=entry.year_int,
        pages_int=entry.pages_int,
        med_index=-entry.weight[0],
        med_pages=entry.weight[1],
        med_type=entry.med_type.id,
        id=entry.fields['glottolog_ref_id'],
        fts=fts.tsvector('\n'.join(v for k, v in entry.fields.items() if k != 'abstract')),
        name='{} {}'.format(entry.fields.get('author', 'na'), entry.fields.get('year', 'nd')),
        description=entry.fields.get('title') or entry.fields.get('booktitle'),
        bibtex_type=btype)
    ref = models.Ref(**kw)
    DBSession.add(ref)
    DBSession.flush()

    reflangs, trigger = [], None
    no_ca = [{'degruyter'}, {'benjamins'}]
    provs = set()
    for key in entry.fields['srctrickle'].split(','):
        key = key.strip()
        if key:
            reflangs.extend(lgsources.get(key, []))
            prov, key = key.split('#', 1)
            provs.add(prov)
            DBSession.add(models.Refprovider(
                provider_pk=data['Provider'][prov].pk,
                ref_pk=ref.pk,
                id='{0}:{1}'.format(prov, key)))

    if not reflangs:
        reflangs, trigger = entry.languoids(lgcodes)
        if trigger and ((provs in no_ca) or (reflangs)):
            # Discard computer-assigned languoids for bibs where this does not make sense,
            # or for bib entries that have been manually assigned in a Languoid's ini file.
            reflangs, trigger = [], None

    for lid in set(reflangs):
        DBSession.add(
            common.LanguageSource(
                language_pk=data['Languoid'][lid].pk, source_pk=ref.pk, active=not bool(trigger)))
    if trigger:
        ref.ca_language_trigger = trigger

    doctypes, trigger = entry.doctypes(data['Doctype'])
    if trigger is None or provs not in no_ca:
        for dt in set(doctypes):
            DBSession.add(models.Refdoctype(doctype_pk=dt.pk, ref_pk=ref.pk))
    if trigger:
        ref.ca_doctype_trigger = trigger

    return ref
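
The expected shapes of lgcodes and lgsources can be read off the code above; an illustrative (not authoritative) sketch, with data and entry supplied by the surrounding import script:

lgcodes = {'eng': 'stan1293'}                  # language code -> languoid id
lgsources = {'hh#s:Doe:Eng': ['stan1293']}     # 'provider#key' -> languoid ids
ref = load_ref(data, entry, lgcodes, lgsources)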
Example #4
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    ordered_clade_colors = {k: v for k, v in DBSession.query(models.Clade.clade_name, models.Clade.color)
                            .filter(models.Clade.short_name != '')
                            .order_by(models.Clade.clade_level0).all()}
    # Iterate over cognate classes together with the clades of their member
    # varieties, grouped per cognate class.
    for _, cc in groupby(
            DBSession.query(models.CognateClass, models.Variety.clade_name)
            .join(clld_cognacy_plugin.models.Cognate,
                  and_(models.CognateClass.pk == clld_cognacy_plugin.models.Cognate.cognateset_pk))
            .join(models.Value,
                  and_(clld_cognacy_plugin.models.Cognate.counterpart_pk == models.Value.pk))
            .join(common.ValueSet,
                  and_(models.Value.valueset_pk == common.ValueSet.pk))
            .join(models.Variety,
                  and_(common.ValueSet.language_pk == models.Variety.pk))
            .distinct().order_by(models.CognateClass.pk), lambda c: c[0].pk):
        cc = sorted(list(cc))
        cc[0][0].count_clades = len(cc)  # cc[0][0] is the CognateClass instance
        involved_clades = [c[1] for c in cc]
        r = []
        for cl, col in ordered_clade_colors.items():
            if cl in involved_clades:
                r.append(col)
            else:
                r.append('0')
        cc[0][0].involved_clade_colors = ' '.join(r)
        cc[0][0].clades = ', '.join(involved_clades)

    for c in DBSession.query(models.CognateClass, func.count(models.CognateClass.id)) \
            .join(clld_cognacy_plugin.models.Cognate) \
            .group_by(models.CognateClass.pk, models.Cognateset.pk, models.CognateClass.id):
        c[0].count_lexemes = c[1]

    for _, ccs in groupby(
        DBSession.query(models.CognateClass).order_by(models.CognateClass.meaning_pk),
        lambda c: c.meaning_pk
    ):
        ccs = list(ccs)
        colors = qualitative_colors(len(ccs))
        for i, cc in enumerate(ccs):
            cc.color = colors[i]

    for meaning in DBSession.query(models.Meaning).options(
        joinedload(models.Meaning.cognateclasses),
        joinedload(common.Parameter.valuesets, common.ValueSet.language)
    ):
        meaning.count_cognateclasses = len(meaning.cognateclasses)
        meaning.count_languages = len([vs.language for vs in meaning.valuesets])
        meaning.count_loan_cognateclasses = len([cc for cc in meaning.cognateclasses
                                                 if cc.is_loan])

    for meaning in DBSession.query(
        models.Meaning, func.count(common.Parameter.pk))\
            .join(common.Parameter).join(common.ValueSet).join(common.Value).group_by(
                models.Meaning.pk, common.Parameter.pk):
        meaning[0].count_lexemes = meaning[1]

    for language in DBSession.query(common.Language).options(
        joinedload(common.Language.valuesets, common.ValueSet.references)
    ):
        language.count_meanings = len(language.valuesets)
        language.count_lexemes = len(DBSession.query(common.Value.id)
                                     .filter(common.ValueSet.language_pk == language.pk)
                                     .join(common.ValueSet).all())
        spks = set()
        for vs in language.valuesets:
            for ref in vs.references:
                spks.add(ref.source_pk)
        for spk in spks:
            DBSession.add(common.LanguageSource(language_pk=language.pk, source_pk=spk))
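
itertools.groupby only groups adjacent rows, which is why every grouped query above is ordered by its grouping key first; a minimal illustration of that constraint:

from itertools import groupby

rows = sorted([(1, 'a'), (2, 'b'), (1, 'c')], key=lambda r: r[0])
for key, grp in groupby(rows, key=lambda r: r[0]):
    print(key, [g[1] for g in grp])  # -> 1 ['a', 'c'] then 2 ['b']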
Example #5
def load_ref(data, entry, lgcodes, lgsources):
    kw = {'jsondata': {}, 'language_note': entry.fields.get('lgcode')}
    for col in common.Source.__table__.columns:
        if col.name in entry.fields:
            kw[col.name] = entry.fields.get(col.name)
    for col in models.Ref.__table__.columns:
        if col.name in entry.fields:
            kw[col.name] = entry.fields.get(col.name)
    for col in entry.fields:
        if col not in kw:
            kw['jsondata'][col] = entry.fields[col]
    try:
        btype = EntryType.from_string(entry.type.lower())
    except ValueError:
        btype = EntryType.misc

    # try to extract numeric year, startpage, endpage, numberofpages, ...
    if kw.get('year'):
        # prefer years in brackets over the first 4-digit number.
        match = PREF_YEAR_PATTERN.search(kw.get('year'))
        if match:
            kw['year_int'] = int(match.group('year'))
        else:
            match = YEAR_PATTERN.search(kw.get('year'))
            if match:
                kw['year_int'] = int(match.group('year'))
    if kw.get('publisher'):
        p = kw['publisher']
        if ':' in p:
            address, publisher = [s.strip() for s in p.split(':', 1)]
            if 'address' not in kw or kw['address'] == address:
                kw['address'], kw['publisher'] = address, publisher

    if kw.get('numberofpages'):
        try:
            kw['pages_int'] = int(kw.get('numberofpages').strip())
        except ValueError:
            pass

    if kw.get('pages'):
        start, end, number = compute_pages(kw['pages'])
        if start is not None:
            kw['startpage_int'] = start
        if end is not None:
            kw['endpage_int'] = end
        if number is not None and 'pages_int' not in kw:
            kw['pages_int'] = number

    kw.update(
        id=entry.fields['glottolog_ref_id'],
        fts=fts.tsvector(
            '\n'.join(v for k, v in entry.fields.items() if k != 'abstract')),
        name='%s %s' % (
            entry.fields.get('author', 'na'), entry.fields.get('year', 'nd')),
        description=entry.fields.get('title') or entry.fields.get('booktitle'),
        bibtex_type=btype)
    ref = models.Ref(**kw)
    DBSession.add(ref)
    DBSession.flush()

    reflangs = []
    no_ca = [{'degruyter'}, {'benjamins'}]
    provs = set()
    for key in entry.fields['srctrickle'].split(','):
        key = key.strip()
        if key:
            if key in lgsources:
                reflangs.extend(lgsources[key])
            prov, key = key.split('#', 1)
            provs.add(prov)
            DBSession.add(models.Refprovider(
                provider_pk=data['Provider'][prov].pk,
                ref_pk=ref.pk,
                id='{0}:{1}'.format(prov, key)))

    langs, trigger = entry.languoids(lgcodes)
    if trigger and ((provs in no_ca) or (reflangs)):
        # Discard computer-assigned languoids for bibs where this does not make sense,
        # or for bib entries that have been manually assigned in a Languoid's ini file.
        langs, trigger = [], None

    for lid in set(reflangs + langs):
        DBSession.add(
            common.LanguageSource(language_pk=data['Languoid'][lid].pk, source_pk=ref.pk))
    if trigger:
        ref.ca_language_trigger = trigger

    doctypes, trigger = entry.doctypes(data['Doctype'])
    if trigger is None or provs not in no_ca:
        for dt in set(doctypes):
            DBSession.add(models.Refdoctype(doctype_pk=dt.pk, ref_pk=ref.pk))
    if trigger:
        ref.ca_doctype_trigger = trigger

    return ref
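
PREF_YEAR_PATTERN and YEAR_PATTERN are module-level constants not shown in this excerpt; a plausible sketch consistent with the code above (a named group 'year', with bracketed years preferred over the first bare four-digit number):

import re

PREF_YEAR_PATTERN = re.compile(r'\[(?P<year>[12][0-9]{3})\]')  # assumption
YEAR_PATTERN = re.compile(r'(?P<year>[12][0-9]{3})')           # assumption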
Example #6
def main(args):
    meta = parse_meta(args)
    print(len(meta))  # number of wordlist metadata records
    print(sum(len(m.sources) for m in meta.values()))  # total source mentions
    sources = {}
    for m in meta.values():
        for s in m.sources:
            sources[s] = None
    print(len(sources), 'distinct')
    for i, s in enumerate(sources):
        sources[s] = get_source(s, i + 1)

    glottocodes = glottocodes_by_isocode('postgresql://robert@/glottolog3')

    data = Data()

    wals = create_engine('postgresql://robert@/wals3')
    wals_families = {}
    for row in wals.execute('select name, id from family'):
        wals_families[row[0]] = row[1]
        wals_families[row[1]] = row[1]

    #for item in reader(args.data_file('WALSFamilyAbbreviations.tab'), namedtuples=True, encoding='latin1'):
    #    name = item.FAMILY
    #    if name not in wals_families:
    #        name = slug(name)
    #        if name not in wals_families:
    #            print('missing wals family:', item.FAMILY)
    #            name = None
    #    if name:
    #        wals_families[item.ABBREVIATION] = wals_families[name]

    wals_genera = {
        row[0]: row[0]  # identity map: .get(slug(genus)) yields a known id or None
        for row in wals.execute('select id from genus')
    }

    with args.data_file('listss17.txt').open(encoding='latin1') as fp:
        wordlists = ['\n'.join(lines) for lines in parse(fp)]

    dataset = common.Dataset(
        id=asjp.__name__,
        name="The ASJP Database",
        contact="*****@*****.**",
        description="The Automated Similarity Judgment Program",
        domain='asjp.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'
        })
    DBSession.add(dataset)

    transcribers = get_transcriber_map(args)
    for i, spec in enumerate([
        ('SW', "Søren Wichmann"),
        ('AM', "André Müller"),
        ('AKW', "Annkathrin Wett"),
        ('VV', "Viveka Velupillai"),
        ('JB', "Julia Bischoffberger"),
        ('CB', "Cecil H. Brown"),
        ('EH', "Eric W. Holman"),
        ('SS', "Sebastian Sauppe"),
        ('ZM', "Zarina Molochieva"),
        ('PB', "Pamela Brown"),
        ('HH', "Harald Hammarström"),
        ('OB', "Oleg Belyaev"),
        ('JML', "Johann-Mattis List"),
        ('DBA', "Dik Bakker"),
        ('DE', "Dmitry Egorov"),
        ('MU', "Matthias Urban"),
        ('RM', "Robert Mailhammer"),
        ('AC', "Agustina Carrizo"),
        ('MSD', "Matthew S. Dryer"),
        ('EK', "Evgenia Korovina"),
        ('DB', "David Beck"),
        ('HG', "Helen Geyer"),
        ('PE', "Patience Epps"),
        ('AG', "Anthony Grant"),
        ('PS', "Paul Sidwell"),  # not in citation
        ('KTR', "K. Taraka Rama"),  # not in citation
        ('PV', "Pilar Valenzuela"),
        ('MD', "Mark Donohue"),  # not in citation
    ]):
        id_, name = spec
        if id_ in transcribers:
            assert name == transcribers.pop(id_)
        contributor = data.add(common.Contributor, id_, id=id_, name=name)
        if id_ in ['SW', 'CB', 'EH']:
            DBSession.add(
                common.Editor(dataset=dataset,
                              ord=i + 1,
                              contributor=contributor))
    for id_, name in transcribers.items():
        data.add(common.Contributor, id_, id=id_, name=name)

    for id_ in sorted(models.MEANINGS_ALL.keys()):
        data.add(models.Meaning,
                 id_,
                 id=str(id_),
                 name=models.MEANINGS_ALL[id_],
                 core=id_ in models.MEANINGS)

    for n, l in enumerate(wordlists):
        #if n > 100:
        #    break
        lang = models.Doculect.from_txt(l)
        if lang.classification_wals:
            family, genus = lang.classification_wals.split('.')
            lang.wals_family = wals_families.get(family)
            lang.wals_genus = wals_genera.get(slug(genus))
        lang.code_glottolog = glottocodes.get(lang.code_iso)
        add_codes(lang)
        data.add(models.Doculect, lang.id, _obj=lang)
        DBSession.flush()
        md = meta.pop(lang.id, None)
        assert md
        # associate transcribers and sources
        for i, transcriber in enumerate(md.transcribers):
            common.ContributionContributor(
                contribution=lang.wordlist,
                contributor=data['Contributor'][transcriber],
                ord=i + 1)
        for source in md.sources:
            DBSession.add(
                common.LanguageSource(language_pk=lang.pk,
                                      source_pk=sources[source].pk))

    assert not list(meta.keys())
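
The sources bookkeeping at the top of main uses a dict as an insertion-ordered set; an equivalent, slightly tighter sketch (get_source as defined elsewhere in the project):

sources = dict.fromkeys(s for m in meta.values() for s in m.sources)
for i, s in enumerate(sources):
    sources[s] = get_source(s, i + 1)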
Example #7
def _main(data, glottolog):
    languoids = list(glottolog.languoids())
    lbyi = {l.iso: l for l in languoids if l.iso}

    dataset = common.Dataset(
        id='ldh',
        name='Language Description Heritage',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        license="https://creativecommons.org/licenses/by/4.0/",
        domain='ldh.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'
        })
    DBSession.add(dataset)

    DBSession.add(
        common.Editor(dataset=dataset,
                      contributor=common.Contributor(id='forkel',
                                                     name='Robert Forkel')))

    ls = set()  # (item.id, iso) pairs already linked, to avoid duplicate LanguageSource rows
    for post in iter_posts():
        if post.pure_item_id:
            item = pure.Item.from_json(post.pure_item_id)
            src = data['Description'].get(item.id)
            if not src:
                src = data.add(
                    models.Description,
                    item.id,
                    id=item.id,
                    description=item.title,
                    name=item.name,
                    bibtex_type=EntryType.get(item.bibtex_type),
                    year=item.year,
                    title=item.title,
                    address=item.publisher.get('place')
                    if item.publisher else None,
                    publisher=item.publisher.get('publisher')
                    if item.publisher else None,
                    author=' and '.join(item.authors),
                    editor=' and '.join(item.editors),
                    pid=item.doi or item.pid,
                    pid_type='doi' if item.doi else 'hdl',
                )
                DBSession.flush()
                for file in item.files:
                    if file.visibility == 'PUBLIC' \
                            and file.metadata["contentCategory"] == "any-fulltext"\
                            and file.storage == 'INTERNAL_MANAGED':
                        assert file.mimeType == 'application/pdf'
                        DBSession.add(
                            common.Source_files(
                                id=file.pid.replace('/', '__'),
                                name=file.name,
                                object_pk=src.pk,
                                mime_type=file.mimeType,
                                jsondata=dict(size=file.size,
                                              license=attr.asdict(file.license)
                                              if file.license else None),
                            ))
            for iso in item.isocodes:
                if iso in lbyi:
                    gl = lbyi[iso]
                    l = data['LDHLanguage'].get(iso)
                    if not l:
                        l = data.add(models.LDHLanguage,
                                     iso,
                                     id=iso,
                                     name=gl.name)
                    DBSession.flush()
                    if (item.id, iso) not in ls:
                        DBSession.add(
                            common.LanguageSource(language_pk=l.pk,
                                                  source_pk=src.pk))
                        ls.add((item.id, iso))

    for item in zenodo.iter_items():
        src = data.add(
            models.Description,
            item.id,
            id=item.id,
            description=item['metadata']['title'],
            name=item.name,
            bibtex_type=EntryType.get(item.bibtex_type),
            year=item.year,
            title=item['metadata']['title'],
            publisher='Zenodo',
            author=' and '.join(a['name']
                                for a in item['metadata']['creators']),
            pid=item['metadata']['doi'],
            pid_type='doi',
        )
        DBSession.flush()
        for file in item['files']:
            license = licenses.find(item['metadata']['license']['id'])
            DBSession.add(
                common.Source_files(
                    id=file['checksum'].replace('md5:', ''),
                    name=file['key'],
                    object_pk=src.pk,
                    mime_type='application/' + file['type'],
                    jsondata=dict(
                        size=file['size'],
                        url=file['links']['self'],
                        license=attr.asdict(license) if license else None),
                ))

        for kw in item['metadata']['keywords']:
            if not kw.startswith('iso:'):
                continue
            iso = kw.replace('iso:', '')
            if iso in lbyi:
                gl = lbyi[iso]
                l = data['LDHLanguage'].get(iso)
                if not l:
                    l = data.add(models.LDHLanguage, iso, id=iso, name=gl.name)
                DBSession.flush()
                if (item.id, iso) not in ls:
                    DBSession.add(
                        common.LanguageSource(language_pk=l.pk,
                                              source_pk=src.pk))
                    ls.add((item.id, iso))

    load_families(data,
                  data['LDHLanguage'].values(),
                  glottolog_repos=glottolog.repos,
                  isolates_icon='tcccccc')
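
Both branches serialize license metadata with attr.asdict, which flattens an attrs-decorated instance into a plain dict suitable for jsondata; a minimal illustration with a hypothetical class:

import attr

@attr.s
class License:
    id = attr.ib()
    name = attr.ib()

attr.asdict(License(id='cc-by-4.0', name='CC-BY 4.0'))
# -> {'id': 'cc-by-4.0', 'name': 'CC-BY 4.0'}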
Example #8
def main(args):
    data = Data()

    icons = cycle(ORDERED_ICONS)

    dataset = common.Dataset(
        id=gelato.__name__,
        name="GeLaTo",
        description="Genes and Languages together",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='gelato.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'
        })
    DBSession.add(dataset)  # as in the other examples: ensure the dataset row is in the session

    for i, (id_, name) in enumerate([('barbierichiara', 'Chiara Barbieri'),
                                     ('blasidamian', 'Damián Blasi'),
                                     ('forkelrobert', 'Robert Forkel')]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)

    families = {}

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    for r in args.cldf.iter_rows('ContributionTable', 'id', 'name',
                                 'description'):
        ds = data.add(models.Panel,
                      r['id'],
                      id=r['id'],
                      name=r['name'],
                      description=r['description'])
    for row in args.cldf.iter_rows('LanguageTable', 'id', 'name',
                                   'contributionReference'):
        icon = families.get(row['LanguageFamily_Glottocode'])
        if not icon:
            families[row['LanguageFamily_Glottocode']] = icon = next(icons)
        lang = data['Languoid'].get(row['Glottocode'])
        if not lang:
            lang = data.add(
                models.Languoid,
                row['Glottocode'],
                id=row['Glottocode'],
                name=row['Language_Name'],
                family_id=row['LanguageFamily_Glottocode'],
                family_name=row['LanguageFamily'],
                jsondata=dict(icon=icon.name),
            )
        s = data.add(
            models.Sample,
            row['id'],
            id=row['id'],
            name=row['Name'],
            panel=data['Panel'][row['contributionReference']],
            languoid=lang,
            latitude=row['Latitude'],
            longitude=row['Longitude'],
            samplesize=int(row['samplesize']),
            #source=row.get('dataSet.of.origin'),
            region=row['geographicRegion'],
            #location=row['location'],
            jsondata=dict(color=REGIONS[row['geographicRegion']]),
        )
        DBSession.flush()
        for bibkey in row['Source']:
            DBSession.add(
                common.LanguageSource(language_pk=s.pk,
                                      source_pk=data['Source'][bibkey].pk))

    types = {}
    for row in args.cldf.iter_rows('ParameterTable', 'id', 'name',
                                   'description', 'contributionReference'):
        types[row['id']] = Datatype.fromvalue(row['datatype'])
        data.add(models.Measure,
                 row['id'],
                 id=row['id'],
                 name=row['name'],
                 description=row['description'],
                 panel=data['Panel'][row['contributionReference']])

    for row in args.cldf.iter_rows('ValueTable', 'id', 'parameterReference',
                                   'languageReference'):
        v = types[row['parameterReference']].read(row['Value'])
        if isinstance(v, float):
            vs = data.add(
                common.ValueSet,
                row['id'],
                id=row['id'],
                language=data['Sample'][row['languageReference']],
                parameter=data['Measure'][row['parameterReference']],
                #contribution=ds,
                #jsondata=dict(color=REGIONS[sample.region]),
            )
            data.add(models.Measurement,
                     row['id'],
                     id=row['id'],
                     valueset=vs,
                     name=row['Value'],
                     value=v)
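
Datatype.fromvalue comes from the csvw package and turns a CLDF datatype description into a reader/validator; a minimal sketch of the pattern the ValueTable loop relies on:

from csvw.metadata import Datatype

dt = Datatype.fromvalue('float')
v = dt.read('0.25')  # -> 0.25; rows whose datatype does not read as float are skipped above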