Example #1
def update(repos, gl_repos, year, title):
    societies_by_glottocode = {
        gc: list(socs)
        for gc, socs in itertools.groupby(
            sorted(repos.societies.values(), key=lambda s: s.glottocode),
            lambda s: s.glottocode)
    }
    api = Glottolog(gl_repos)
    langs = list(api.languoids())
    languoids(api, langs, repos.repos)
    trees(societies_by_glottocode, langs, repos.repos, year, title)
Example #2
def load(args):
    """
    clics load /path/to/concepticon-data /path/to/glottolog
    """
    if len(args.args) != 2:
        raise ParserError(
            'concepticon and glottolog repos locations must be specified!')
    concepticon = Path(args.args[0])
    if not concepticon.exists():
        raise ParserError('concepticon repository does not exist')
    glottolog = Path(args.args[1])
    if not glottolog.exists():
        raise ParserError('glottolog repository does not exist')

    args.api.db.create(exists_ok=True)
    args.log.info('loading datasets into {0}'.format(args.api.db.fname))
    in_db = args.api.db.datasets
    for ds in iter_datasets():
        if args.unloaded and ds.id in in_db:
            args.log.info('skipping {0} - already loaded'.format(ds.id))
            continue
        args.log.info('loading {0}'.format(ds.id))
        args.api.db.load(ds)
    args.log.info('loading Concepticon data')
    args.api.db.load_concepticon_data(Concepticon(str(concepticon)))
    args.log.info('loading Glottolog data')
    args.api.db.load_glottolog_data(Glottolog(str(glottolog)))
    return
Example #3
def main():  # pragma: no cover
    parser = ArgumentParserWithLogging('pyglottolog')
    parser.add_argument('--repos',
                        help="path to glottolog data repository",
                        type=Glottolog,
                        default=Glottolog())
    sys.exit(parser.main())
Example #4
def word_length(args):
    from pyconcepticon.api import Concepticon

    c = Concepticon(args.concepticon_repos)
    res = defaultdict(lambda: defaultdict(list))

    def _word_length(ds, **kw):
        ds.word_length(res)

    with_dataset(args, _word_length)
    concepts = c.conceptsets
    languoids = {l.id: l for l in Glottolog(args.glottolog_repos).languoids()}

    with UnicodeWriter('wordlength.csv') as writer:
        writer.writerow([
            'Concepticon_ID', 'Gloss', 'Semanticfield', 'Category',
            'Glottocode', 'Variety', 'Family', 'Form', 'Length'
        ])
        for pid, langs in res.items():
            if len(langs) >= 500:
                for (lang, variety), forms in langs.items():
                    if lang in languoids:
                        lengths = [len(f.split()) for f in forms]
                        lang = languoids[lang]
                        family = lang.lineage[0][0] if lang.lineage else ''
                        c = concepts[pid]
                        writer.writerow([
                            pid, c['GLOSS'], c['SEMANTICFIELD'],
                            c['ONTOLOGICAL_CATEGORY'], lang.id, variety,
                            family, forms[0],
                            sum(lengths) / len(lengths)
                        ])
Example #5
def iter_languages():
    ldstatus = load(
        GLOTTOLOG_VENV.joinpath('glottolog3/glottolog3/static/ldstatus.json'))
    for l in Glottolog(GLOTTOLOG_VENV.joinpath('glottolog')).languoids():
        if l.level == Level.language and not l.category.startswith('Pseudo'):
            yield Language(l, ((ldstatus.get(l.id) or [[0, None]])[0]
                               or [0, None])[1])
Example #6
def cldf(args):
    """
    Create CLDF datasets from the raw data for a dataset.

    lexibank --glottolog-repos PATH --concepticon-repos PATH cldf [DATASET_ID]
    """
    if not args.glottolog_repos or not Path(args.glottolog_repos).exists():
        raise ParserError('Invalid glottolog repository path given')

    if not args.concepticon_repos or not Path(args.concepticon_repos).exists():
        raise ParserError('Invalid concepticon repository path given')

    # FIXME: get dict of all glottolog langs right here, and attach to datasets!
    try:
        languoids = load('glottolog')
    except ValueError:
        languoids = {
            l.id: l
            for l in Glottolog(args.glottolog_repos).languoids()
        }
        dump(languoids, 'glottolog')

    def _cldf(ds, **kw):
        ds.glottolog_languoids = languoids
        ds.cldf(**kw)
        ds.write_cognates()

    with_dataset(args, _cldf)
Example #7
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    repos = Path(os.path.expanduser('~')).joinpath('venvs/lexibank/lexibank-data')

    with transaction.manager:
        dataset = common.Dataset(
            id=lexibank.__name__,
            name="lexibank",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexibank.clld.org',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name': 'Creative Commons Attribution 4.0 International License'})
        DBSession.add(dataset)

    glottolog = Glottolog(
        Path(lexibank.__file__).parent.parent.parent.parent.joinpath('glottolog3', 'glottolog'))
    languoids = {l.id: l for l in glottolog.languoids()}
    concepticon = Concepticon(
        Path(lexibank.__file__).parent.parent.parent.parent.joinpath('concepticon', 'concepticon-data'))
    conceptsets = {c['ID']: c for c in concepticon.conceptsets()}

    for dname in repos.joinpath('datasets').iterdir():
        #if dname.name not in ['acbd']:
        #    continue
        if dname.is_dir() and dname.name != '_template':
            #if dname.name != 'zenodo34092':
            #    continue
            mdpath = dname.joinpath('metadata.json')
            if mdpath.exists():
                print(dname.name)
                import_cldf(dname, load(mdpath), languoids, conceptsets)

    with transaction.manager:
        load_families(
            Data(),
            DBSession.query(LexibankLanguage),
            glottolog=languoids,
            isolates_icon='tcccccc')
Example #8
def update(repos, gl_repos, year, title):
    societies_by_glottocode = {
        gc: list(socs)
        for gc, socs in groupby(
            sorted(repos.societies.values(), key=lambda s: s.glottocode),
            lambda s: s.glottocode)
    }
    langs = list(Glottolog(gl_repos).languoids())
    languoids(langs, repos.dir)
    trees(societies_by_glottocode, langs, repos.dir, year, title)
Example #9
def main(args):
    #TODO explain etc diachronic_strength
    #sigtests of dependencies
    #isogloss-maps
    data = Data()
    dataset = common.Dataset(
        id=grambank.__name__,
        name="Grambank",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='grambank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    glottolog = Glottolog(GLOTTOLOG_REPOS)
    languoids = {l.id: l for l in glottolog.languoids()}

    import_gb20_features(GRAMBANK_REPOS, data)
    import_cldf(os.path.join(GRAMBANK_REPOS, 'datasets'), data, languoids)
    load_families(
        data,
        data['GrambankLanguage'].values(),
        glottolog=languoids,
        isolates_icon='tcccccc')

    # Add isolates
    for lg in data['GrambankLanguage'].values():
        gl_language = languoids.get(lg.id)
        if not gl_language.family:
            family = data.add(
                Family, gl_language.id,
                id=gl_language.id,
                name=gl_language.name,
                description=common.Identifier(
                    name=gl_language.id,
                    type=common.IdentifierType.glottolog.value).url(),
                jsondata={"icon": 'tcccccc'})
            lg.family = family
    return 
Example #10
def main():  # pragma: no cover
    pkg_dir = Path(glottolog3.__file__).parent
    parser = ArgumentParserWithLogging('glottolog3')
    parser.add_argument(
        '--repos',
        help="path to glottolog data repository",
        type=Glottolog,
        default=Glottolog(
            Path(glottolog3.__file__).parent.parent.parent.joinpath(
                'glottolog')))
    parser.add_argument('--pkg-dir', help=argparse.SUPPRESS, default=pkg_dir)
    sys.exit(parser.main())
Example #11
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    repos = Path(
        os.path.expanduser('~')).joinpath('venvs/lexirumah/lexirumah-data')

    with transaction.manager:
        dataset = common.Dataset(
            id=lexirumah.__name__,
            name="lexirumah",
            publisher_name=
            "Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexirumah.model-ling.eu',
            contact='*****@*****.**',
            jsondata={
                'license_icon':
                'cc-by.png',
                'license_name':
                'Creative Commons Attribution 4.0 International License'
            })
        DBSession.add(dataset)

    glottolog_repos = Path(
        lexirumah.__file__).parent.parent.parent.parent.joinpath(
            'glottolog3', 'glottolog')
    languoids = {l.id: l for l in Glottolog(glottolog_repos).languoids()}
    concepticon = Concepticon(
        Path(lexirumah.__file__).parent.parent.parent.parent.joinpath(
            'concepticon', 'concepticon-data'))
    conceptsets = {c.id: c for c in concepticon.conceptsets.values()}

    skip = True
    for dname in sorted(repos.joinpath('datasets').iterdir(),
                        key=lambda p: p.name):
        #if dname.name == 'benuecongo':
        #    skip = False
        #if skip:
        #    continue
        if dname.is_dir() and dname.name != '_template':
            mdpath = dname.joinpath('cldf', 'metadata.json')
            if mdpath.exists():
                print(dname.name)
                import_cldf(dname, load(mdpath), languoids, conceptsets)

    with transaction.manager:
        load_families(Data(),
                      DBSession.query(LexiRumahLanguage),
                      glottolog_repos=glottolog_repos,
                      isolates_icon='tcccccc')
Example #12
def tree(glottocodes, gl_repos):
    label_pattern = re.compile(r"'[^\[]+\[([a-z0-9]{4}[0-9]{4})[^']*'")

    def rename(n):
        n.name = label_pattern.match(n.name).groups()[0]
        n.length = 1

    glottocodes = set(glottocodes)
    glottocodes_in_global_tree = set()
    languoids = {}
    families = []

    for lang in Glottolog(gl_repos).languoids():
        if not lang.lineage:  # a top-level node
            if not lang.category.startswith('Pseudo '):
                families.append(lang)
        languoids[lang.id] = lang

    glob = Tree()
    glob.name = 'glottolog_global'

    for family in families:
        node = family.newick_node(nodes=languoids)
        node.visit(rename)
        langs_in_tree = set(n.name for n in node.walk())
        langs_selected = glottocodes.intersection(langs_in_tree)

        if not langs_selected:
            continue

        tree = Tree("({0});".format(node.newick), format=3)
        tree.name = 'glottolog_{0}'.format(family.id)

        if family.level.name == 'family':
            tree.prune([n for n in langs_selected])
            glottocodes_in_global_tree = glottocodes_in_global_tree.union(
                set(n.name for n in tree.traverse()))
        else:
            glottocodes_in_global_tree = glottocodes_in_global_tree.union(
                langs_in_tree)

        glob.add_child(tree)

    # global
    nodes = glottocodes_in_global_tree.intersection(glottocodes)
    glob.prune([n for n in nodes])

    return glob.write(format=9), nodes
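A minimal usage sketch for tree() above; the glottocodes and the repository path below are placeholders, not values from the original script:

# Placeholder inputs -- substitute real glottocodes and a path to a local Glottolog clone.
selected = {'abcd1234', 'efgh5678'}
newick, covered = tree(selected, '/path/to/glottolog')
# `newick` is the pruned global tree serialized with ete3 format 9 (leaf names only);
# `covered` is the subset of the requested glottocodes that actually appear in the tree.
print(len(covered))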
Example #13
def make_tree(*taxa):
    # We create a dict to lookup Glottolog languoids by name, ISO- or Glottocode.
    langs = {}
    for lang in Glottolog().languoids():
        if lang.iso:
            langs[lang.iso] = lang
        langs[lang.name] = lang
        langs[lang.id] = lang

    t = TreeMaker()
    for taxon in taxa:
        if taxon not in langs:
            print('unknown taxon: {0}'.format(taxon))
            continue
        t.add(taxon, ', '.join(l[1] for l in langs[taxon].lineage))
    return t
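A quick sketch of calling make_tree() above; taxa may be given as Glottolog names, ISO 639-3 codes, or glottocodes, since the lookup dict indexes all three. The ISO codes below are real, and any taxon not found is simply reported and skipped:

# Build a classification tree for three taxa identified by ISO 639-3 codes.
t = make_tree('deu', 'fra', 'eng')
# `t` is the populated TreeMaker instance; serialize it with whatever method
# the TreeMaker class imported by the original script provides.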
Example #14
def prime_cache(args):  # pragma: no cover
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    if 1:
        langs = {l.pk: l for l in DBSession.query(models.GrambankLanguage)}
        features = {f.pk: f for f in DBSession.query(models.Feature)}

        for lpk, nf in DBSession.query(common.ValueSet.language_pk, func.count(common.ValueSet.pk)) \
                .join(common.Value, common.Value.valueset_pk == common.ValueSet.pk) \
                .group_by(common.ValueSet.language_pk):
            langs[lpk].representation = nf

        for fpk, nl in DBSession.query(common.ValueSet.parameter_pk, func.count(common.ValueSet.pk))\
                .join(common.Value, common.Value.valueset_pk == common.ValueSet.pk)\
                .group_by(common.ValueSet.parameter_pk):
            features[fpk].representation = nl

        compute_language_sources()

    get_repos()

    for obj in DBSession.query(LanguageTreeLabel).all():
        DBSession.delete(obj)
    for obj in DBSession.query(TreeLabel).all():
        DBSession.delete(obj)
    for obj in DBSession.query(Phylogeny).all():
        DBSession.delete(obj)
    DBSession.flush()

    for tree in tqdm(
            iter_trees([l.id for l in DBSession.query(common.Language)],
                       Glottolog(REPOS['glottolog']))):
        nodes = set(n.name for n in tree.traverse())
        phylo = Phylogeny(id=tree.name.split('_')[1],
                          name=tree.name,
                          newick=tree.write(format=9))
        for l in DBSession.query(common.Language).filter(
                common.Language.id.in_(nodes)):
            LanguageTreeLabel(language=l,
                              treelabel=TreeLabel(id=l.id,
                                                  name=l.id,
                                                  phylogeny=phylo))
        DBSession.add(phylo)
Example #15
def main(args=sys.argv):
    """The main CLI"""
    # Parse options
    parser = argparse.ArgumentParser(description=__doc__.split("\n")[0])
    parser.add_argument(
        'dataset', type=Path,
        help="Path to the CLDF dataset's JSON description")
    parser.add_argument(
        "output",
        help="File name to write output to")
    parser.add_argument(
        "--glottolog-repos", default=None,
        help="Path to local clone or export of clld/glottolog")
    parser.add_argument(
        "--cmap", type=plt.get_cmap, default=plt.get_cmap("magma_r"),
        help="Colormap to be used for the parameter counts")
    options = parser.parse_args()

    dataset = pycldf.Dataset.from_metadata(options.dataset)

    # Try to load language locations from the dataset
    locations = {}
    try:
        idcol = dataset["LanguageTable", "id"].name
        latcol = dataset["LanguageTable", "latitude"].name
        loncol = dataset["LanguageTable", "longitude"].name
        for row in dataset["LanguageTable"]:
            if row[latcol] is not None:
                locations[row[idcol]] = row[latcol], row[loncol]
    except ValueError:
        # No language table
        pass

    for lang in Glottolog(options.glottolog_repos).languoids():
        if lang.latitude is not None:
            if lang.id not in locations:
                locations[lang.id] = (lang.latitude, lang.longitude)
            if lang.iso and lang.iso not in locations:
                locations[lang.iso] = (lang.latitude, lang.longitude)

    # Aggregate the data
    lats, lons, sizes = [], [], []

    for language, sample_size in parameters_sampled(dataset).items():
        if language in locations:
            lat, lon = locations[language]
            lats.append(float(lat))
            lons.append(float(lon))
            sizes.append(sample_size)

    assert len(sizes) == len(lats) == len(lons)

    # Calculate coordinate boundaries
    min_lat, max_lat = min(lats), max(lats)
    d_lat = max_lat - min_lat
    min_lat = max(-90, min_lat - 0.1 * d_lat)
    max_lat = min(90, max_lat + 0.1 * d_lat)

    min_lon, max_lon = min(lons), max(lons)
    d_lon = max_lon - min_lon
    min_lon = max(-180, min_lon - 0.1 * d_lon)
    max_lon = min(180, max_lon + 0.1 * d_lon)

    # Draw the base map
    # TODO: Get coordinates from commandline, fallback to bounding box of data
    # TODO: Give more control over map drawing to user (projection, level of
    # detail, drawing other patterns (countries, eg.) instead of just coast
    # lines, continent color) – What is a good way to do that?
    map = Basemap(llcrnrlat=min_lat, llcrnrlon=min_lon, urcrnrlat=max_lat, urcrnrlon=max_lon,
                  # projection='lcc',
                  resolution='h', area_thresh=10)
    map.drawcoastlines()
    map.fillcontinents(color='#fff7ee', zorder=0)

    # Plot the sample sizes
    map.scatter(lons, lats, c=sizes, cmap=options.cmap, latlon=True)

    # TODO: Improve shape of components: Colorbar is very huge, margins are quite large
    plt.colorbar()
    plt.gcf().set_size_inches(12, 9)

    plt.savefig(options.output)
    return 0
Example #16
def main(args):  # pragma: no cover
    get_repos()
    api = Grambank(REPOS['Grambank'])
    cldf = args.cldf
    data = Data()
    dataset = models.Grambank(
        id=grambank.__name__,
        name="Grambank",
        description="Grambank",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="https://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='grambank.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })
    contributors = {}
    for i, contrib in enumerate(api.contributors):
        contrib = common.Contributor(
            contrib.id,
            id=contrib.id,
            name=contrib.name,
        )
        common.Editor(dataset=dataset, contributor=contrib, ord=i)
        DBSession.add(contrib)
        DBSession.flush()
        contributors[contrib.id] = contrib.pk
    contributions = {r['ID']: r for r in cldf['LanguageTable']}

    DBSession.add(dataset)

    for rec in tqdm(list(Database.from_file(cldf.bibpath, lowercase=True)),
                    desc='sources'):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    DBSession.flush()
    sources = {k: v.pk for k, v in data['Source'].items()}

    features, codes = import_features(cldf, contributors)
    transaction.commit()

    values_by_sheet = [(lid, list(v)) for lid, v in itertools.groupby(
        sorted(cldf['ValueTable'], key=lambda r: r['Language_ID']),
        lambda r: r['Language_ID'],
    )]
    for lid, values in tqdm(values_by_sheet, desc='loading values'):
        transaction.begin()
        import_values(values, contributions[lid], features, codes,
                      contributors, sources)
        transaction.commit()

    transaction.begin()

    glottolog = Glottolog(REPOS['glottolog'])
    languoids = {l.id: l for l in glottolog.languoids()}
    gblangs = DBSession.query(models.GrambankLanguage).all()
    load_families(data,
                  gblangs,
                  glottolog_repos=REPOS['glottolog'],
                  isolates_icon='dcccccc')

    # Add isolates
    for lg in gblangs:
        gl_language = languoids.get(lg.id)
        if not gl_language.family:
            family = data.add(
                Family,
                gl_language.id,
                id=gl_language.id,
                name=gl_language.name,
                description=common.Identifier(
                    name=gl_language.id,
                    type=common.IdentifierType.glottolog.value).url(),
                jsondata={"icon": 'tcccccc'})
            lg.family = family
    coverage.main(glottolog)
    return
Example #17
def setUp(self):
    WithTempDir.setUp(self)
    self.repos = self.tmp_path('repos')
    copytree(Path(__file__).parent.joinpath('data'), self.repos)
    self.api = Glottolog(self.repos)
Example #18
""" Small script from xrotwang to get Glottolog code to ISO 639-3 code mappings """

import csv
from pyglottolog.api import Glottolog

api = Glottolog('/Users/stiv/Github/glottolog/')
gc2iso = {l.id: l.iso for l in api.languoids() if l.iso}

# csv.writer needs a text-mode file on Python 3 (the original 'wb' is a Python 2 idiom).
with open('gc2iso.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in gc2iso.items():
        writer.writerow([key, value])
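For a quick round-trip check, the mapping written above can be read back like this (a small sketch, not part of the original script):

# Read the two-column CSV back into a dict (glottocode -> ISO 639-3 code).
with open('gc2iso.csv', newline='') as csv_file:
    gc2iso_roundtrip = dict(csv.reader(csv_file))
print(len(gc2iso_roundtrip), 'mappings')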
Example #19
import pandas as pd
from fuzzywuzzy import fuzz
from pyglottolog.api import Glottolog

api = Glottolog("/Users/stiv/Github/glottolog")


def matching_dialect(glottocode, name):
    print(type(glottocode), glottocode, type(name), name)

    if glottocode == "NA":
        return glottocode
    lang = api.languoid(glottocode)

    if lang is None:
        # print("glottocode has been updated:", glottocode)
        return glottocode

    if fuzz.ratio(lang.name, name) < 95:
        ratios = []
        for dialect in lang.children:
            ratios.append((dialect.id, dialect.name, fuzz.ratio(dialect.name, name)))
        if ratios and max(r[2] for r in ratios) >= 95:
            return sorted(ratios, key=lambda r: r[2], reverse=True)[0]
        else:
            return glottocode
    else:
        return glottocode


def get_code(glottocode, name):
Example #20
def main(args):
    data = Data()
    glottocodes, bibtex_keys = {}, defaultdict(set)
    for d in reader(
            args.data_file('repos', 'mappings',
                           'InventoryID-ISO-gcode-Bibkey-Source.tsv')):
        glottocodes[d['InventoryID']] = d['Glottocode']
        bibtex_keys[d['InventoryID']].add(d['BibtexKey'])

    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog3', 'glottolog'))
    languoids = {l.id: l for l in glottolog.languoids()}

    phonemes = sorted(list(
        reader(args.data_file('repos', 'data', 'phoible-by-phoneme.tsv'))),
                      key=lambda r: (r['InventoryID'], r['GlyphID']))

    inventories = defaultdict(set)
    for p in phonemes:
        if p['InventoryID'] in glottocodes:
            inventories[(languoids[glottocodes[p['InventoryID']]].name,
                         p['SpecificDialect'], p['Source'].upper())].add(
                             (p['InventoryID'], p['LanguageName']))

    inventory_names = {}
    for (glname, dname, source), invids in inventories.items():
        if len(invids) == 1:
            invid, lname = invids.pop()
            inventory_names[invid] = name_in_source(glname,
                                                    dname) + ' [%s]' % source
        else:
            use_lname = len(set(r[1] for r in invids)) == len(invids)
            for i, (invid,
                    lname) in enumerate(sorted(invids,
                                               key=lambda j: int(j[0]))):
                disambiguation = ' %s' % (i + 1, )
                if use_lname:
                    disambiguation = ' (%s)' % lname
                inventory_names[invid] = name_in_source(
                    glname, dname) + '%s [%s]' % (disambiguation, source)

    for (invid, lname, dname, source), ps in groupby(
            phonemes, lambda p: (p['InventoryID'], p['LanguageName'], p[
                'SpecificDialect'], p['Source'])):
        if invid not in glottocodes:
            continue
        ps = list(ps)
        gc = glottocodes[invid]
        lang = data['Variety'].get(gc)
        if not lang:
            languoid = languoids[gc]
            lang = data.add(
                models.Variety,
                gc,
                id=gc,
                language_code=ps[0]['LanguageCode'],
                name=languoid.name,
                level=text_type(languoid.level.name),
                latitude=languoid.latitude,
                longitude=languoid.longitude,
            )
            if lang.latitude is None and languoid.level == Level.dialect:
                ll = get_language(languoid)
                lang.latitude = ll.latitude
                lang.longitude = ll.longitude

        contrib = data.add(
            models.Inventory,
            invid,
            id=invid,
            #language=lang,
            source=source,
            #source_url=source_urls.get(row.InventoryID),
            #internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[invid],
            description=name_in_source(lname, dname))

    return

    # FIXME: read from mappings file!
    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(
            common.Editor(dataset=dataset,
                          ord=i + 1,
                          contributor=common.Contributor(id=spec[0],
                                                         name=spec[1])))

    #squibs = defaultdict(list)
    #for row in get_rows(args, 'Squib'):
    #    squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    # FIXME: group phoible-by-phoneme by LanguageCode, Source (make sure this is unique!)
    aggregated = list(
        reader(args.data_file('phoible-aggregated.tsv'),
               delimiter='\t',
               namedtuples=True))
    inventory_names = {}
    for key, items in groupby(sorted(aggregated,
                                     key=lambda t: (t.LanguageCode, t.Source)),
                              key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)

        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i +
                                                                    1, key[1])

    # pull in Glottolog families instead? or in addition?

    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                    if not genus:
                        #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                        family = family_map.get(
                            (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                        genus = genera[genus_id] = data.add(
                            models.Genus,
                            genus_id,
                            id=genus_id,
                            name=row.LanguageFamilyGenus,
                            description=family or row.LanguageFamilyRoot,
                            active=False,
                            root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot

                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            lang = data.add(models.Variety,
                            row.LanguageCode,
                            id=row.LanguageCode,
                            name=lnames[row.LanguageCode],
                            genus=genus,
                            country=strip_quotes(row.Country),
                            area=strip_quotes(row.Area),
                            latitude=coords[0],
                            longitude=coords[1],
                            jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data,
                               lang,
                               row.LanguageCode,
                               glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(common.Contributor,
                                   row.Source,
                                   id=row.Source,
                                   name=SOURCES[row.Source][0],
                                   description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(
                    models.ContributorReference(source=data['Source'][ref],
                                                contributor=contributor))

        contrib = data.add(models.Inventory,
                           row.InventoryID,
                           id=row.InventoryID,
                           language=lang,
                           source=row.Source,
                           source_url=source_urls.get(row.InventoryID),
                           internetarchive_url=ia_urls.get(row.InventoryID),
                           name=inventory_names[row.InventoryID],
                           description=row.LanguageName)

        DBSession.add(
            common.ContributionContributor(contribution=contrib,
                                           contributor=contributor))

        #for j, squib in enumerate(squibs.get(row.InventoryID, [])):
        #    f = common.Contribution_files(
        #        object=contrib,
        #        id='squib-%s-%s.pdf' % (contrib.id, j + 1),
        #        name='Phonological squib',
        #        description=squib,
        #        mime_type='application/pdf')
        #    assert f
        #    # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'),
                      namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment,
                row.Phoneme,
                id=b16encode(md5(description).digest()),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join([
                    t[0] for t in unicode_desc
                    if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
                ]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(id=row.PhonemeID,
                             contribution=inventory,
                             language=inventory.language,
                             parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))

        DBSession.add(
            common.Value(
                id=row.PhonemeID,
                name='%s %s' %
                (row.Phoneme, data['Inventory'][row.InventoryID].name),
                valueset=vs))
        DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(common.ContributionReference,
                     '%s-%s' % (inventory_id, ref),
                     source=data['Source'][ref],
                     contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(
            reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue

        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(
                    common.Parameter_data(
                        key=features[j],
                        value=value,
                        ord=j,
                        object_pk=data['Segment'][row[0]].pk))

    # FIXME: add allophones!

    DBSession.flush()
Example #21
def get_names():
    return {l.id: l.name for l in Glottolog(GLOTTOLOG_REPOS).languoids()}
Example #22
def get_clf_paths(lgs):
    glottolog = Glottolog(GLOTTOLOG_REPOS)
    return [
        tuple([ll.id for ll in l.ancestors] + [l.id])
        for l in glottolog.languoids(lgs)
    ]
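As an illustration, calling get_clf_paths() with a list of glottocodes (placeholders below) yields one classification path per languoid found in the Glottolog checkout:

# Placeholder glottocodes -- replace with codes that exist in GLOTTOLOG_REPOS.
paths = get_clf_paths(['abcd1234', 'efgh5678'])
# Each entry is a tuple of glottocodes running from the top-level family
# down to the requested languoid itself.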
Example #23
        `pip install --upgrade --force-reinstall pyglottolog`
        `pip install fuzzywuzzy`

    Usage:
        $ python match.py nyun1247 Bibbulman
        (u'bibb1234', u'Bibbulman', 100)
        $ python match.py nyun1247 Balardung
        None
"""

from fuzzywuzzy import fuzz


def matching_dialect(glottolog, glottocode, name):
    lang = glottolog.languoid(glottocode)
    if fuzz.ratio(lang.name, name) < 95:
        ratios = []
        for dialect in lang.children:
            ratios.append(
                (dialect.id, dialect.name, fuzz.ratio(dialect.name, name)))
        if ratios and max(r[2] for r in ratios) >= 95:
            return sorted(ratios, key=lambda r: r[2], reverse=True)[0]


if __name__ == "__main__":
    import sys
    from pyglottolog.api import Glottolog
    print(
        matching_dialect(Glottolog("/Users/stiv/Github/glottolog"),
                         *sys.argv[1:]))
Example #24
def get_clf_paths(lgs):
    glottolog = Glottolog()
    return [
        tuple([ll.id for ll in l.ancestors] + [l.id]) for l in glottolog.languoids(lgs)]
Example #25
from clldutils.dsv import UnicodeWriter
from pyglottolog.api import Glottolog
from pyglottolog.objects import Level


def locations(glottolog, fid, outpath):
    with UnicodeWriter(outpath) as writer:
        writer.writerow(['name', 'glottocode', 'latitude', 'longitude'])
        for lang in glottolog.languoids():
            if lang.level == Level.language and lang.latitude is not None:
                if fid in [l[1] for l in lang.lineage]:
                    writer.writerow(
                        [lang.name, lang.id, lang.latitude, lang.longitude])


if __name__ == '__main__':
    import sys

    locations(Glottolog(sys.argv[1]), sys.argv[2], sys.argv[3])
Example #26
'''Imports Glottolog data needed for Pshrimp to a Postgres database.'''

from db_postgres import init_db
from pyglottolog.api import Glottolog
from import_postgres import insert, get_id
from collections import OrderedDict
import csv

from os.path import expanduser
GLOTTOLOG_LOCATION = expanduser('~/Documents/glottolog-3.4')

api = Glottolog(GLOTTOLOG_LOCATION)


def language(glottocode):
    '''Dialects don't have most information defined, so go upstairs to a language.'''

    # Ideally there would be error handling here in case there's a family.
    # In practice, it just crashed and I edited the csv file.
    languoid = api.languoid(glottocode)

    if languoid.level.name == 'dialect':
        while languoid.level.name == 'dialect':
            languoid = languoid.parent

    return languoid


def data(glottocode):
    languoid = language(glottocode)
    print(languoid)
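To illustrate the dialect-to-language walk in language() above, a small sketch with a placeholder glottocode:

# 'xxxx1234' is a placeholder; with a real dialect glottocode, language()
# follows .parent until the languoid is no longer at the 'dialect' level.
lg = language('xxxx1234')
print(lg.id, lg.name, lg.level.name)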
Example #27
def main(args):  # pragma: no cover
    ds = StructureDataset.from_metadata(DS)
    data = Data()
    for source in ds.sources:
        data.add(common.Source, source.id, _obj=bibtex2source(source))

    ext = [
        Record.from_string('@' + s, lowercase=True)
        for s in nfilter(BIB.split('@'))
    ]
    for rec in ext:
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    for contrib in ds['contributors.csv']:
        o = data.add(
            common.Contributor,
            contrib['ID'],
            id=contrib['ID'].upper(),
            name=contrib['Name'],
            description=contrib['Description'],
            url=contrib['URL'],
            jsondata={
                'readme': contrib['Readme'],
                'contents': contrib['Contents']
            },
        )
        for src in contrib['Source']:
            DBSession.add(
                models.ContributorReference(source=data['Source'][src],
                                            contributor=o))

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE 2.0',
        description='PHOIBLE 2.0',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='https://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'https://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, (cid, name) in enumerate([
        ('UZ', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
    ],
                                    start=1):
        contrib = data['Contributor'].get(cid)
        if not contrib:
            contrib = common.Contributor(id=cid, name=name)
        DBSession.add(
            common.Editor(dataset=dataset, ord=i, contributor=contrib))

    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog', 'glottolog'))

    for lang in ds['LanguageTable']:
        l = data.add(
            models.Variety,
            lang['ID'],
            id=lang['ID'],
            name=lang['Name'],
        )

    load_families(data, [(l.id, l)
                         for l in data['Variety'].values() if len(l.id) == 8],
                  glottolog.repos)
    DBSession.flush()

    # assign color codes:
    families = defaultdict(list)
    for l in data['Variety'].values():
        families[l.family_pk].append(l)

    colors = color.qualitative_colors(len(families))
    for i, langs in enumerate(sorted(families.values(),
                                     key=lambda v: -len(v))):
        for l in langs:
            l.jsondata = {'color': colors[i]}

    for segment in ds['ParameterTable']:
        equivalence_class = ''.join([
            t[0] for t in [(c, unicodedata.name(c)) for c in segment['Name']]
            if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
        ])  # no trailing comma: equivalence_class should be a string, not a 1-tuple
        data.add(models.Segment,
                 segment['ID'],
                 id=segment['ID'],
                 name=segment['Name'],
                 description=segment['Description'],
                 segment_class=segment['SegmentClass'],
                 equivalence_class=equivalence_class)
    DBSession.flush()

    # Add redirects for old language pages! get relevant ISO codes and map to Glottocode!
    for model, repls in load(
            Path(phoible.__file__).parent.parent /
            'replacements.json').items():
        if model == 'Language':
            languoids = {l.id: l for l in glottolog.languoids()}
            iso_languoids = {l.iso: l for l in languoids.values() if l.iso}
            gl_in_phoible = set(data['Variety'].keys())
            for oid, nid in repls.items():
                gls = descendants_from_nodemap(
                    iso_languoids.get(oid),
                    languoids).intersection(gl_in_phoible)
                if gls:
                    nid = gls.pop()
                    if len(gls) > 1:
                        print('+++', oid, gls)
                else:
                    print('---', oid)
                common.Config.add_replacement(oid, nid, common.Language)
        elif model == 'Parameter':
            segments_in_phoible = set(data['Segment'].keys())
            for oid, nid in repls.items():
                id_ = nid if nid in segments_in_phoible else None
                common.Config.add_replacement(oid, id_, common.Parameter)

    for segment in ds['ParameterTable']:
        for i, (k, v) in enumerate(sorted(segment.items())):
            if k not in ['ID', 'Name', 'Description', 'SegmentClass']:
                DBSession.add(
                    common.Parameter_data(
                        key=feature_name(k),
                        value=v,
                        ord=i,
                        object_pk=data['Segment'][segment['ID']].pk))

    for inventory in ds['contributions.csv']:
        inv = data.add(
            models.Inventory,
            inventory['ID'],
            id=inventory['ID'],
            name='{0} ({1} {2})'.format(
                inventory['Name'],
                inventory['Contributor_ID'].upper(),
                inventory['ID'],
            ),
            source_url=inventory['URL'],
            count_tone=inventory['count_tones'],
            count_vowel=inventory['count_vowels'],
            count_consonant=inventory['count_consonants'],
        )
        DBSession.add(
            common.ContributionContributor(
                contribution=inv,
                contributor=data['Contributor'][
                    inventory['Contributor_ID'].upper()]))
        for src in inventory['Source']:
            DBSession.add(
                common.ContributionReference(contribution=inv,
                                             source=data['Source'][src]))

    for phoneme in ds['ValueTable']:
        lang = data['Variety'][phoneme['Language_ID']]
        inv = data['Inventory'][phoneme['Contribution_ID']]
        if not inv.language:
            inv.language = lang
        vs = common.ValueSet(
            id=phoneme['ID'],
            contribution=inv,
            language=lang,
            parameter=data['Segment'][phoneme['Parameter_ID']])

        for ref in phoneme['Source']:
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))

        DBSession.add(
            models.Phoneme(
                id=phoneme['ID'],
                name='%s %s' %
                (phoneme['Value'],
                 data['Inventory'][phoneme['Contribution_ID']].name),
                allophones=' '.join(phoneme['Allophones']),
                marginal=phoneme['Marginal'],
                valueset=vs))

    return
Example #28
def main(args):
    #
    # order of init:
    # - villages
    # - files
    # - movies
    #
    videos = defaultdict(list)
    for f in util.iter_files(args):
        obj = models.File(**attr.asdict(f))
        if obj.mime_type.startswith('video'):
            videos[slug(obj.name.split('.')[0])].append(obj)
        DBSession.add(obj)

    lexicon = list(util.iter_lexicon(args))
    villages = util.get_villages(args)
    ff_images = list(util.ff_images(args))
    bib = list(util.get_bib(args))
    data = Data()

    dataset = common.Dataset(
        id=dogonlanguages.__name__,
        name="Dogon and Bangime Linguistics",
        contact="*****@*****.**",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='dogonlanguages.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'}
    )
    DBSession.add(dataset)

    if Glottolog:
        if socket.gethostname() == 'dlt5502178l':
            glottolog = Glottolog(
                Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath(
                    'glottolog3', 'glottolog'))
        else:
            glottolog = Glottolog(
                Path(dogonlanguages.__file__).parent.parent.parent.parent.joinpath(
                    'glottolog'))
        languoids = {l.id: l for l in glottolog.languoids()}
    else:
        languoids = {}
    print('got glottolog')

    for c in util.CONTRIBUTORS:
        id_ = slug(c.name.split()[-1])
        data.add(models.Member, id_, id=id_, **attr.asdict(c))
    data.add(
        models.Member, 'forkel',
        id='forkel',
        name='Robert Forkel',
        email='*****@*****.**',
        in_project=False)

    for i, id_ in enumerate(['moran', 'forkel', 'heath']):
        DBSession.add(common.Editor(
            dataset=dataset, ord=i + 1, contributor=data['Member'][id_]))

    contrib = data.add(common.Contribution, 'd', id='d', name='Dogon Languages')
    for doc in bib:
        obj = data.add(
            models.Document,
            doc.rec.id,
            _obj=bibtex2source(doc.rec, cls=models.Document))
        keywords = nfilter([s.strip() for s in doc.rec.get('keywords', '').split(',')])
        for dt in 'grammar lexicon typology texts'.split():
            if dt in keywords:
                obj.doctype = dt
                break
        obj.project_doc = ('DLP' in keywords) or bool(doc.files)
        if obj.project_doc:
            for i, cid in enumerate(util.get_contributors(doc.rec, data)):
                models.DocumentContributor(
                    document=obj, contributor=data['Member'][cid], ord=i)
        for i, (path, cdstar) in enumerate(doc.files):
            common.Source_files(
                id='%s-%s' % (obj.id, i + 1),
                name=path,
                object=obj,
                mime_type=guess_type(path)[0],
                jsondata=cdstar,
            )

    print('got bib')

    for name, (gc, desc) in LANGUAGES.items():
        gl_lang = languoids[gc]
        lat, lon = gl_lang.latitude, gl_lang.longitude
        lang = data.add(
            models.Languoid, gc,
            id=gc,
            name=name,
            description=desc,
            latitude=lat,
            longitude=lon,
            family=gl_lang.family.name if gl_lang and gl_lang.family else name,
        )
        if name == 'Penange' and lang.longitude > 0:
            lang.longitude = -lang.longitude
        if name == 'Bankan Tey':
            lang.latitude, lang.longitude = 15.07, -2.91
        if name == 'Ben Tey':
            lang.latitude, lang.longitude = 14.85, -2.95
        if name == 'Togo Kan':
            lang.latitude, lang.longitude = 14.00, -3.25
        add_language_codes(data, lang, gl_lang.iso, glottocode=gc)

    villages_by_name = defaultdict(list)
    contrib_by_initial = {c.abbr: c for c in data['Member'].values()}
    for i, village in enumerate(villages):
        lang = None
        if village.glottocode:
            lang = data['Languoid'].get(village.glottocode)
            if not lang:
                gl_lang = languoids[village.glottocode]
                lang = data.add(
                    models.Languoid, gl_lang.id,
                    id=gl_lang.id,
                    name=gl_lang.name,
                    in_project=False,
                    family=gl_lang.family.name if gl_lang.family else gl_lang.name)
        v = data.add(
            models.Village, str(i + 1),
            id=str(i + 1),
            name=village.name,
            description=village.data.pop('social info'),
            surnames=village.data.pop('surnames'),
            major_city=village.data['MajorCity'] == 'Y',
            transcribed_name=village.data.pop('Transcribed Village Name'),
            source_of_coordinates=village.data.pop('sourceOfCoordinates'),
            latitude=village.lat,
            longitude=village.lon,
            languoid=lang,
            jsondata=village.data,
        )
        villages_by_name[village.name].append(v)
        for img in village.images:
            mimetype = guess_type(img.name)[0]
            if mimetype:
                f = models.Village_files(
                    id=img.id,
                    name=img.name,
                    description=img.description,
                    date_created=img.date,
                    latitude=img.coords[0] if img.coords else None,
                    longitude=-img.coords[1] if img.coords else None,
                    object=v,
                    mime_type=mimetype,
                    jsondata=img.cdstar,
                )
                for initial in img.creators:
                    if initial in contrib_by_initial:
                        models.Fotographer(
                            foto=f, contributor=contrib_by_initial[initial])

    for cat, desc, place, name in MOVIES:
        s = slug(name)
        m = models.Movie(
            id=s,
            name=desc,
            description=cat,
            place=place,
        )
        if place in villages_by_name and len(villages_by_name[place]) == 1:
            m.village = villages_by_name[place][0]
            #print('found village: %s' % name)
        for v in videos[s]:
            #print('found video: %s' % name)
            v.movie = m
            m.duration = v.duration

    names = defaultdict(int)
    for concept in lexicon:
        add(concept, data, names, contrib)

    count = set()
    for img in ff_images:
        if img.id in count:
            continue
        count.add(img.id)
        if img.ref:
            if img.ref in data['Concept']:
                concept = data['Concept'][img.ref]
                if img.tsammalex_taxon and not concept.tsammalex_taxon:
                    concept.tsammalex_taxon = img.tsammalex_taxon
                    #print(concept.tsammalex_taxon)
                common.Parameter_files(
                    object=concept,
                    id=img.id,
                    name=img.name.decode('utf8'),
                    mime_type=guess_type(img.name)[0],
                    jsondata=img.cdstar)
            else:
                print('missing ref: %s' % img.ref)