Example #1
def test_glottolog_invalid_repos(tmpdir):
    import pytest
    from pyglottolog import Glottolog
    with pytest.raises(ValueError, match=r'missing tree dir'):
        Glottolog(str(tmpdir))

    tmpdir.join('languoids').mkdir()
    tmpdir.join('languoids', 'tree').mkdir()

    with pytest.raises(ValueError, match=r'missing references subdir'):
        Glottolog(str(tmpdir))
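
The two error messages above spell out the minimal layout the constructor checks for: a languoids/tree directory and a references directory. A minimal sketch that satisfies just these checks (an assumption drawn from the test itself; a usable repository is a full clone of glottolog/glottolog):

import pathlib
from pyglottolog import Glottolog

repo = pathlib.Path('my-glottolog')  # hypothetical empty directory
(repo / 'languoids' / 'tree').mkdir(parents=True)
(repo / 'references').mkdir()
g = Glottolog(str(repo))  # the constructor's sanity checks now pass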
Example #2
def from_bibkey(glottocode, bibkey):
	"""Look up source info from the Glottolog bibkey of the source.
	You'll probably want to change GLOTTOLOG_PATH here."""
	from os import path
	GLOTTOLOG_PATH = path.expanduser('~/Documents/glottolog-4.0')

	from pyglottolog import Glottolog
	g = Glottolog(GLOTTOLOG_PATH)

	languoid = g.languoid(glottocode)	
	ref = [x for x in languoid.sources if x.key == bibkey]
	if not ref:
		raise Exception('Reference not found')

	source = ref[0].get_source(g)
	return source.fields
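
A hypothetical call, with placeholder arguments (assumption: the returned source.fields behaves like a dict of BibTeX fields):

fields = from_bibkey('abui1241', 'kratochvil2007')
print(fields.get('title'))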
Example #3
File: initializedb.py Project: clld/ldh
def main(args):
    gl_dir = PROJECT_DIR.parent / 'glottolog' / 'glottolog'
    gl_dir = pathlib.Path(
        input('Path to clone of glottolog/glottolog [{}]: '.format(gl_dir))
        or gl_dir)
    assert gl_dir.exists()
    with Catalog(gl_dir, tag=input('Glottolog version: ') or None) as cat:
        _main(Data(), Glottolog(gl_dir))
Example #4
def test_numeral_tables(tmprepo):
    glottolog = Glottolog(tmprepo['glottolog'])
    d = list(find_tables([tmprepo['raw'] / 'Abui.htm']))[0]
    assert len(d) == 7
    entry = NumeralsEntry(
        base_name=d[0],
        tables=d[1],
        file_name=d[2],
        title_name=d[3],
        codes=glottolog.languoids_by_code(),
        iso=glottolog.iso.languages,
        source=d[4],
        base=d[5],
        comment=d[6],
    )
    assert len(entry.tables) == 8
    assert entry.get_numeral_lexemes()[0][0][6][0] == 'tä.ˈlä.mä'
Example #5
def _main(commands,
          args=None,
          catch_all=False,
          parsed_args=None,
          log=None,
          test=False):
    try:
        repos = Config.from_file().get_clone('glottolog')
    except KeyError:  # pragma: no cover
        repos = pathlib.Path('.')
    parser, subparsers = get_parser_and_subparsers('glottolog')
    parser.add_argument('--repos',
                        help="clone of glottolog/glottolog",
                        default=repos,
                        type=pathlib.Path)
    parser.add_argument(
        '--repos-version',
        help="version of repository data. Requires a git clone!",
        default=None)
    parser.add_argument('--pkg-dir',
                        help=argparse.SUPPRESS,
                        default=pathlib.Path(__file__).parent)
    register_subcommands(subparsers, commands)

    args = parsed_args or parser.parse_args(args=args)
    args.test = test

    if not hasattr(args, "main"):
        parser.print_help()
        return 1

    with contextlib.ExitStack() as stack:
        if not log:  # pragma: no cover
            stack.enter_context(Logging(args.log, level=args.log_level))
        else:
            args.log = log
        if args.repos_version:  # pragma: no cover
            # If a specific version of the data is to be used, we make
            # use of a Catalog as context manager:
            stack.enter_context(Catalog(args.repos, tag=args.repos_version))
        try:
            args.repos = Glottolog(args.repos)
        except Exception as e:
            print(e)
            return _main(commands, args=[args._command, '-h'])
        args.log.info('glottolog/glottolog at {0}'.format(args.repos.repos))
        try:
            return args.main(args) or 0
        except KeyboardInterrupt:  # pragma: no cover
            return 0
        except ParserError as e:
            print(e)
            return _main(commands, args=[args._command, '-h'])
        except Exception as e:  # pragma: no cover
            if catch_all:
                print(e)
                return 1
            raise
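
The Catalog context manager used above (see also Example #3) pins the repository data to a release. A standalone sketch, assuming Catalog comes from cldfcatalog, repos is a pathlib.Path to a git clone of glottolog/glottolog, and the tag exists in that clone:

from cldfcatalog import Catalog
from pyglottolog import Glottolog

with Catalog(repos, tag='v4.4') as cat:  # 'v4.4' is a hypothetical tag
    glottolog = Glottolog(repos)  # reads now see the checked-out version
# on exit, the previous checkout is restored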
Example #6
def test_num_entry(tmprepo, x, expected):
    raw_htmls = tmprepo['raw']
    glottolog = Glottolog(tmprepo['glottolog'])
    f = raw_htmls / x
    d = list(find_tables([f]))[0]
    entry = NumeralsEntry(
        base_name=d[0],
        tables=d[1],
        file_name=d[2],
        title_name=d[3],
        codes=glottolog.languoids_by_code(),
        iso=glottolog.iso.languages,
        source=d[4],
        base=d[5],
        comment=d[6],
    )
    assert entry.base_name == Path(f).stem
    assert entry.glottocodes[0] == expected
Example #7
def main():  # pragma: no cover
    pkg_dir = Path(glottolog3.__file__).parent
    parser = ArgumentParserWithLogging('glottolog3')
    parser.add_argument(
        '--repos',
        help="path to glottolog data repository",
        type=Glottolog,
        default=Glottolog(
            Path(glottolog3.__file__).parent.parent.parent.joinpath(
                'glottolog')))
    parser.add_argument('--pkg-dir', help=argparse.SUPPRESS, default=pkg_dir)
    sys.exit(parser.main())
Example #8
def test_dbinit(mocker, testdb, capsys):
    from glottolog3.scripts import check_db_consistency

    mocker.patch('glottolog3.initdb.assert_release', mocker.Mock(return_value='1.0'))
    args = mocker.Mock(
        repos=Glottolog(pathlib.Path(__file__).parent / 'repos'),
        pkg_dir=pathlib.Path(__file__).parent,
    )
    __main__.dbload(args)
    __main__.dbprime(args)
    assert testdb.query(models.Ref).one().name == 'Huang, Shuanfan 2013'
    check_db_consistency.main(args)
    out, _ = capsys.readouterr()
    assert out.count(': OK') == 15
Example #9
def test_fuzzy_number_matching(tmprepo):
    glottolog = Glottolog(tmprepo['glottolog'])
    d = list(find_tables([tmprepo['raw'] / 'Aari.htm']))[0]
    entry = NumeralsEntry(
        base_name=d[0],
        tables=d[1],
        file_name=d[2],
        title_name=d[3],
        codes=glottolog.languoids_by_code(),
        iso=glottolog.iso.languages,
        source=d[4],
        base=d[5],
        comment=d[6],
    )
    numeral_table = entry.tables[1]
    table_elements = numeral_table.find_all('tr')
    cell_content = []

    for row in table_elements:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        cell_content.append([ele for ele in cols if ele])

    # Table is roughly structured like this:
    # 1 | 21
    # 2 | 22
    # 3 | 23
    # ...
    # 10 | 30
    # ..
    # 20 | 2000

    assert parse_number(cell_content[0][0]) == 1
    assert parse_number(cell_content[0][1]) == 21
    assert parse_number(cell_content[9][0]) == 10
    assert parse_number(cell_content[19][1]) == 2000
Example #10
def make_tree(*taxa):
    # We create a dict to look up Glottolog languoids by name, ISO code, or Glottocode.
    langs = {}
    for lang in Glottolog().languoids():
        if lang.iso:
            langs[lang.iso] = lang
        langs[lang.name] = lang
        langs[lang.id] = lang

    t = TreeMaker()
    for taxon in taxa:
        if taxon not in langs:
            print('unknown taxon: {0}'.format(taxon))
            continue
        t.add(taxon, ', '.join(l[1] for l in langs[taxon].lineage))
    return t
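
Hypothetical usage: taxa may be a mix of language names, ISO codes, and Glottocodes, since all three are keys of langs (the write() call assumes TreeMaker's Newick serializer):

tree = make_tree('deu', 'French', 'russ1263')
print(tree.write())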
Example #11
def test_glottolog_invalid_repos(tmpdir):
    import pytest
    from pyglottolog import Glottolog
    with pytest.raises(ValueError, match=r'missing tree dir'):
        Glottolog(str(tmpdir))
Example #12
def main(args):
    assert args.glottolog, 'The --glottolog option is required!'
    license = licenses.find(args.cldf.properties['dc:license'])
    assert license and license.id.startswith('CC-')

    data = Data()
    ds = data.add(
        common.Dataset,
        mixezoqueanvoices.__name__,
        id=mixezoqueanvoices.__name__,
        name="Mixe-Zoquean Voices",
        domain='mixezoqueanvoices.clld.org',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license=license.url,
        jsondata={
            'license_icon':
            '{}.png'.format('-'.join(
                [p.lower() for p in license.id.split('-')[:-1]])),
            'license_name':
            license.name
        },
    )

    contrib = data.add(
        common.Contribution,
        None,
        id='cldf',
        name=args.cldf.properties.get('dc:title'),
        description=args.cldf.properties.get('dc:bibliographicCitation'),
    )
    data.add(common.Contributor, 'kondic', id='kondic', name='Ana Kondic')
    data.add(common.Contributor, 'gray', id='gray', name='Russell Gray')
    DBSession.add(
        common.ContributionContributor(
            contribution=contrib,
            contributor=data['Contributor']['kondic'],
        ))
    for i, ed in enumerate(['kondic', 'gray']):
        data.add(common.Editor,
                 ed,
                 dataset=ds,
                 contributor=data['Contributor'][ed],
                 ord=i)

    ancestors = collections.defaultdict(list)
    gl = Glottolog(args.glottolog)
    lnames = {}
    for lang in args.cldf.iter_rows('LanguageTable', 'id', 'glottocode',
                                    'name', 'latitude', 'longitude'):
        lnames[lang['id']] = lang['name']
        glang = None
        if lang['glottocode']:
            glang = gl.languoid(lang['glottocode'])
            lineage = [i[0] for i in glang.lineage]
            if 'Mixe-Zoque' in lineage:
                ancestors[lang['id']].append('Protomixezoque')
            if 'Mixe' in lineage:
                ancestors[lang['id']].append('Protomixe')
            if 'Oaxaca Mixe' in lineage:
                ancestors[lang['id']].append('Protooaxacamixe')
        if not glang:
            assert lang['name'] == 'Nizaviguiti'
        data.add(
            models.Variety,
            lang['id'],
            id=lang['id'],
            name=lang['name'],
            latitude=lang['latitude'],
            longitude=lang['longitude'],
            glottocode=lang['glottocode'],
            description=lang['LongName'],
            subgroup=glang.lineage[1][0]
            if glang and len(glang.lineage) > 1 else None,
        )
    colors = dict(
        zip(
            set(l.subgroup for l in data['Variety'].values()),
            qualitative_colors(
                len(set(l.subgroup for l in data['Variety'].values())))))
    for l in data['Variety'].values():
        l.jsondata = dict(color=colors[l.subgroup].replace('#', ''))

    for rec in bibtex.Database.from_file(args.cldf.bibpath, lowercase=True):
        data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    refs = collections.defaultdict(list)

    # Store proto-forms for later lookup:
    proto_forms = collections.defaultdict(
        lambda: collections.defaultdict(list))
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference'):
        if form['languageReference'].startswith('Proto'):
            proto_forms[form['languageReference']][
                form['parameterReference']].append(form['form'])

    for param in args.cldf.iter_rows('ParameterTable', 'id',
                                     'concepticonReference', 'name'):
        proto = collections.OrderedDict()
        for lid, forms in proto_forms.items():
            f = forms.get(param['id'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            models.Concept,
            param['id'],
            id=param['id'],
            name='{} [{}]'.format(param['name'], param['id'].split('_')[0]),
            concepticon_id=param['concepticonReference'],
            concepticon_gloss=param['Concepticon_Gloss'],
            description=param['Spanish_Gloss'],
            jsondata=dict(reconstructions=proto),
        )

    f2a = form2audio(args.cldf)
    for form in args.cldf.iter_rows('FormTable', 'id', 'form',
                                    'languageReference', 'parameterReference',
                                    'source'):
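        # Assumption: '►' marks a form that exists only as a recording, so
        # every such form must have an associated audio file.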
        assert not (form['form'] == '►' and not f2a.get(form['id']))
        vsid = (form['languageReference'], form['parameterReference'])
        vs = data['ValueSet'].get(vsid)
        if not vs:
            vs = data.add(
                common.ValueSet,
                vsid,
                id='-'.join(vsid),
                language=data['Variety'][form['languageReference']],
                parameter=data['Concept'][form['parameterReference']],
                contribution=contrib,
            )
        for ref in form.get('source', []):
            sid, pages = Sources.parse(ref)
            refs[(vsid, sid)].append(pages)
        proto = collections.OrderedDict()
        for lid in ancestors.get(form['languageReference'], []):
            f = proto_forms[lid].get(form['parameterReference'])
            if f:
                proto[lnames[lid]] = f
        data.add(
            Counterpart,
            form['id'],
            id=form['id'],
            name=form['form'],
            valueset=vs,
            audio=f2a.get(form['id']),
            jsondata=dict(reconstructions=proto),
        )

    for (vsid, sid), pages in refs.items():
        DBSession.add(
            common.ValueSetReference(valueset=data['ValueSet'][vsid],
                                     source=data['Source'][sid],
                                     description='; '.join(nfilter(pages))))
Example #13
def get_glottolog_api(repos):
    return Glottolog(repos or pathlib.Path(glottolog3.__file__).parent.parent.parent.joinpath('glottolog'))
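
Hypothetical calls illustrating the repos or ... fallback:

api = get_glottolog_api(None)  # falls back to a 'glottolog' clone next to the glottolog3 checkout
api = get_glottolog_api(pathlib.Path('/data/glottolog'))  # hypothetical explicit path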
Example #14
from clldutils.dsv import UnicodeWriter
from pyglottolog import Glottolog
from pyglottolog.languoids import Level


def locations(glottolog, fid, outpath):
    with UnicodeWriter(outpath) as writer:
        writer.writerow(['name', 'glottocode', 'latitude', 'longitude'])
        for lang in glottolog.languoids():
            if lang.level == Level.language and lang.latitude is not None:
                if fid in [l[1] for l in lang.lineage]:
                    writer.writerow(
                        [lang.name, lang.id, lang.latitude, lang.longitude])


if __name__ == '__main__':
    import sys

    locations(Glottolog(sys.argv[1]), sys.argv[2], sys.argv[3])
Example #15
    def cmd_makecldf(self, args):

        glottolog = Glottolog(args.glottolog.dir)
        clts = CLTS(Config.from_file().get_clone('clts'))
        bipa = clts.bipa
        clts_eurasian = clts.transcriptiondata_dict['eurasian']

        args.writer.cldf.add_columns("ValueTable", {
            "name": "Marginal",
            "datatype": "boolean"
        }, {
            "name": "Value_in_Source",
            "datatype": "string"
        })

        args.writer.cldf.add_columns('ParameterTable', {
            'name': 'CLTS_BIPA',
            'datatype': 'string'
        }, {
            'name': 'CLTS_Name',
            'datatype': 'string'
        })
        args.writer.cldf.add_component("LanguageTable", "Family",
                                       "Glottolog_Name")

        # load language mapping and build inventory info
        languages = []
        lang_map = {}
        all_glottolog = {lng.id: lng for lng in glottolog.languoids()}
        unknowns = defaultdict(list)
        for row in progressbar(
                self.etc_dir.read_csv("languages.csv", dicts=True)):
            lang_map[row["name"]] = slug(row["name"])
            lang_dict = {"ID": slug(row["name"]), "Name": row["name"]}
            if row["glottocode"] in all_glottolog:
                lang = all_glottolog[row["glottocode"]]
                lang_dict.update({
                    "Family": lang.family if lang.lineage else None,
                    "Glottocode": lang.id,
                    "ISO639P3code": lang.iso_code,
                    "Latitude": lang.latitude,
                    "Longitude": lang.longitude,
                    "Macroarea": lang.macroareas[0].name if lang.macroareas else None,
                    "Glottolog_Name": lang.name,
                })
            languages.append(lang_dict)

        # Read raw data
        with open(self.raw_dir.joinpath(
                'phono_dbase.json').as_posix()) as handler:
            raw_data = json.load(handler)

        # Iterate over raw data
        values = []
        parameters = []
        inventories = []
        counter = 1
        segment_set = set()
        with open(self.raw_dir.joinpath('sources.txt').as_posix()) as f:
            sources = [source.strip() for source in f.readlines()][1:]
        sources_ = Sources.from_file(self.raw_dir / "sources.bib")
        args.writer.cldf.add_sources(*sources_)
        for idx, (language, langdata) in enumerate(raw_data.items()):
            cons = langdata["cons"]
            vows = langdata["vows"]
            tones = [tone for tone in langdata["tones"] if tone]
            source = sources[idx]
            # Prepare language key
            lang_key = language.split("#")[0].replace(",", "")

            # Add consonants and vowels to values, also collecting parameters
            for segment in cons + vows:
                marginal = bool(segment[0] == "(")

                # Obtain the corresponding BIPA grapheme, if possible
                normalized = normalize_grapheme(segment)
                par_id = compute_id(normalized)
                if normalized in clts_eurasian.grapheme_map:
                    sound = bipa[clts_eurasian.grapheme_map[normalized]]
                else:
                    sound = bipa['<NA>']
                    unknowns[normalized] += [(segment, lang_key)]
                if sound.type == 'unknownsound':
                    bipa_grapheme = ''
                    desc = ''
                else:
                    bipa_grapheme = str(sound)
                    desc = sound.name
                parameters.append((par_id, normalized, bipa_grapheme, desc))

                values.append({
                    "ID": str(counter),
                    "Language_ID": lang_map[lang_key],
                    "Marginal": marginal,
                    "Parameter_ID": par_id,
                    "Value": normalized,
                    "Value_in_Source": segment,
                    "Source": [source],
                })
                counter += 1

        # Build segment data
        segments = [{
            "ID": id,
            "Name": normalized,
            "BIPA": bipa_grapheme,
            "Description": desc
        } for id, normalized, bipa_grapheme, desc in set(parameters)]

        # Write data and validate
        args.writer.write(
            **{
                "ValueTable": values,
                "LanguageTable": languages,
                "ParameterTable": segments,
            })
        for g, rest in unknowns.items():
            print('\t'.join([repr(g), str(len(rest)), g]))
Example #16
File: conftest.py Project: cldf/cldfviz
def glottolog(glottolog_dir):
    return Glottolog(glottolog_dir)
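
For context, a test consuming this pytest fixture might look like the following sketch ('abcd1234' is a placeholder glottocode assumed to exist in the test repository):

def test_languoid(glottolog):
    assert glottolog.languoid('abcd1234').id == 'abcd1234'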
Example #17
    def cmd_makecldf(self, args):

        # Add sources
        sources = Sources.from_file(self.raw_dir / "sources.bib")
        args.writer.cldf.add_sources(*sources)

        glottolog = Glottolog(args.glottolog.dir)
        clts = CLTS(Config.from_file().get_clone('clts'))
        bipa = clts.bipa
        clts_saphon = clts.transcriptiondata_dict['saphon']

        # Add components
        args.writer.cldf.add_columns("ValueTable", {
            "name": "Value_in_Source",
            "datatype": "string"
        })

        cltstable = Terms()["cltsReference"].to_column().asdict()
        cltstable["datatype"]["format"] = "[a-z_-]+|NA"
        args.writer.cldf.add_columns('ParameterTable', cltstable, {
            'name': 'CLTS_BIPA',
            'datatype': 'string'
        }, {
            'name': 'CLTS_Name',
            'datatype': 'string'
        })
        args.writer.cldf.add_component("LanguageTable", "Family",
                                       "Glottolog_Name")

        languages = []
        #all_glottolog = {lng.id: lng for lng in glottolog.languoids()}
        #iso2glot = {lng.iso: lng.glottocode for lng in all_glottolog.values()}
        #args.log.info("loaded glottolog")
        for row in progressbar(
                self.etc_dir.read_csv("languages.csv", dicts=True)):
            #if row["SAPHON_Code"] in iso2glot:
            #    glottocode = iso2glot[row["SAPHON_Code"]]
            #elif row["SAPHON_Code"][:3] in iso2glot:
            #    glottocode = iso2glot[row["SAPHON_Code"][:3]]
            #else:
            #    glottocode = ""

            #if glottocode and glottocode in all_glottolog:
            #    lang = all_glottolog[glottocode]
            #    update = {
            #        "Family": lang.family.name if lang.family else '',
            #        "Glottocode": glottocode,
            #        "Latitude": lang.latitude,
            #        "Longitude": lang.longitude,
            #        "Macroarea": lang.macroareas[0].name if lang.macroareas else None,
            #        "Glottolog_Name": lang.name,
            #    }
            #    row.update(update)
            languages.append(row)

        # Build source map from language
        source_map = {
            k: v
            for k, v in self.raw_dir.read_csv("references.tsv", delimiter="\t")
        }

        # Parse sources
        segments = []
        values = []
        counter = 1
        unknowns = defaultdict(list)
        for lid, segment in self.raw_dir.read_csv('inventories.tsv',
                                                  delimiter="\t"):
            normalized = normalize_grapheme(segment)
            if normalized in clts_saphon.grapheme_map:
                sound = bipa[clts_saphon.grapheme_map[normalized]]
            else:
                sound = bipa['<NA>']
                unknowns[normalized] += [(lid, segment)]
            par_id = compute_id(normalized)
            if sound.type == 'unknownsound':
                bipa_grapheme = ''
                desc = ''
            else:
                bipa_grapheme = str(sound)
                desc = sound.name

            segments.append((par_id, normalized, bipa_grapheme, desc))

            values.append({
                "ID": str(counter),
                "Language_ID": lid,
                "Parameter_ID": par_id,
                "Value_in_Source": segment,
                "Value": normalized,
                "Source": [source_map[lid]]
            })
            counter += 1

        # Build segment data
        parameters = [{
            "ID": ID,
            "Name": normalized,
            "Description": '',
            "CLTS_ID": desc.replace(' ', '_') if desc.strip() else "NA",
            "CLTS_BIPA": bipa_grapheme,
            "CLTS_Name": desc
        } for ID, normalized, bipa_grapheme, desc in set(segments)]

        # Write data and validate
        args.writer.write(
            **{
                "ValueTable": values,
                "LanguageTable": languages,
                "ParameterTable": parameters,
            })
        for g, rest in unknowns.items():
            print('\t'.join([repr(g), str(len(rest)), g]))
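
The grapheme-lookup pattern shared by Examples #15 and #17, in isolation (a sketch assuming a CLTS clone configured via cldfcatalog, with 'ts' as a sample grapheme):

from pyclts import CLTS
from cldfcatalog import Config

clts = CLTS(Config.from_file().get_clone('clts'))
bipa = clts.bipa
sound = bipa['ts']
if sound.type == 'unknownsound':
    print('no BIPA mapping')
else:
    print(str(sound), sound.name)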
Example #18
def load_datasets(dirname):
    datasets = defaultdict(list)
    for p in dirname.iterdir():
        if p.stem != '.DS_Store':
            datasets[p.stem].append(p)
    return datasets


if __name__ == '__main__':
    
    rawdata = {"varikin": VARIKIN, 'parabank': PARABANK}
    sources = {}
    clashes = []
    languages = {}
    concepts = Counter()
    glottolog = {l.id: l for l in Glottolog(gdir).languoids()}

    language_id = 1
    for dlabel, dpath in rawdata.items():
        for d in sorted(load_datasets(dpath).items()):
            try:
                ds = Dataset(*d)
            except Exception:
                print("ERROR loading %s - %s." % d)
                raise

            if len(ds.data) == 0:
                print("EMPTY %s - ignoring." % ds)
                continue

            # check glottocode
Example #19
def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'),
                       bib_format='bibtex')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))

    glottolog = Glottolog(glr)
    ds = StructureDataset.in_dir(cldf_dir)
    ds.tablegroup.notes.append(
        OrderedDict([('dc:title', 'environment'),
                     ('properties',
                      OrderedDict([
                          ('glottolog_version', git_describe(glottolog.repos)),
                      ]))]))
    ds.add_columns('ValueTable', {
        'name': 'Marginal',
        'datatype': 'boolean'
    }, {
        'name': 'Allophones',
        'separator': ' '
    }, 'Contribution_ID')
    features = [
        "tone", "stress", "syllabic", "short", "long", "consonantal",
        "sonorant", "continuant", "delayedRelease", "approximant", "tap",
        "trill", "nasal", "lateral", "labial", "round", "labiodental",
        "coronal", "anterior", "distributed", "strident", "dorsal", "high",
        "low", "front", "back", "tense", "retractedTongueRoot",
        "advancedTongueRoot", "periodicGlottalSource", "epilaryngealSource",
        "spreadGlottis", "constrictedGlottis", "fortis",
        "raisedLarynxEjective", "loweredLarynxImplosive", "click"
    ]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable')
    ds.add_table(
        'contributions.csv', 'ID', 'Name', 'Contributor_ID', {
            'name': 'Source',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
            'separator': ';'
        }, 'URL')
    ds.add_table(
        'contributors.csv',
        'ID',
        'Name',
        'Description',
        'Readme',
        'Contents',
        {
            'name': 'Source',
            'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source',
            'separator': ';'
        },
        'URL',
    )

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = [], [], {}, {}, []
    for contrib in read('contributors.csv'):
        sources.append(
            dict(
                ID=contrib.Name,
                Name=contrib.Contributor,
                Description=contrib.Description,
                Readme=desc(dev, contrib.Name),
                Contents=contrib.Contents,
                Source=[
                    c.strip().lower() for c in contrib.Citation.split(';')
                ],
                URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
            ))

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = pid
        segments.append(
            dict(ID=pid,
                 Name=row.Name,
                 Description=row.Description,
                 SegmentClass=row.SegmentClass,
                 **{f: getattr(row, f)
                    for f in features}))

    src = {}
    for row in read('contributions.csv'):
        src[row.ID] = row.References.split(
            ';') if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in src[row.ID]]
        inventories[row.ID] = dict(ID=row.ID,
                                   Name=row.Name,
                                   Contributor_ID=row.Contributor_ID,
                                   URL=row.URI if row.URI != 'NA' else '',
                                   Source=src[row.ID])

    uniq = set()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids else slug(
            inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code
                if row.ISO639P3code != 'NA' else None,
            )
        values.append(
            dict(
                ID=row.ID,
                Language_ID=lid,
                Parameter_ID=pid_map[row.Parameter_ID],
                Contribution_ID=row.Contribution_ID,
                Value=row.Name,
                Marginal=None if row.Marginal == 'NA' else eval(
                    row.Marginal.lower().capitalize()),  # FALSE|TRUE|NA
                Allophones=row.Allophones.split()
                if row.Allophones != 'NA' else [],
                Source=src[row.Contribution_ID],
            ))

    ds.write(
        **{
            'ValueTable': values,
            'LanguageTable': languages.values(),
            'ParameterTable': segments,
            'contributions.csv': inventories.values(),
            'contributors.csv': sources
        })
    ds.validate(logging.getLogger(__name__))
Example #20
def main(scripts, dev, glr):
    cldf_dir = Path('cldf')
    bib = parse_string(read_text(cldf_dir / 'sources.bib'), bib_format='bibtex')
    for _, e in bib.entries.items():
        for field in e.fields:
            e.fields[field] = e.fields[field].replace('\\', '')
    write_text(cldf_dir / 'sources.bib', bib.lower().to_string('bibtex'))

    glottolog = Glottolog(glr)

    ds = StructureDataset.in_dir(cldf_dir)

    def describe_repos(r, org, name=None):
        return OrderedDict([
            ('dc:title', '{0}/{1}'.format(org, name or r.name)),
            ('dc:description', git_describe(r))])

    ds.tablegroup.common_props['prov:wasDerivedFrom'] = [
        describe_repos(dev, 'phoible'),
        describe_repos(scripts, 'bambooforest'),
        describe_repos(glottolog.repos, 'clld'),
    ]
    ds.tablegroup.common_props['prov:wasGeneratedBy'] = describe_repos(
        Path(__file__).parent, 'cldf-datasets', name='phoible')

    ds.add_columns(
        'ValueTable',
        {'name': 'Marginal', 'datatype': 'boolean'},
        {'name': 'Allophones', 'separator': ' '},
        'Contribution_ID')
    features = ["tone","stress","syllabic","short","long","consonantal","sonorant","continuant","delayedRelease","approximant","tap","trill","nasal","lateral","labial","round","labiodental","coronal","anterior","distributed","strident","dorsal","high","low","front","back","tense","retractedTongueRoot","advancedTongueRoot","periodicGlottalSource","epilaryngealSource","spreadGlottis","constrictedGlottis","fortis","raisedLarynxEjective","loweredLarynxImplosive","click"]
    ds.add_component('ParameterTable', 'SegmentClass', *features)
    ds.add_component('LanguageTable', 'Family_Glottocode', 'Family_Name')
    table = ds.add_table(
        'contributions.csv', 
        'ID', 
        'Name', 
        'Contributor_ID', 
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
        {'name': 'count_phonemes', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_consonants', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_vowels', 'required': True, 'datatype': {'base': 'integer', 'minimum': 0}},
        {'name': 'count_tones', 'datatype': {'base': 'integer', 'minimum': 0}, 'null': 'NA'},
    )
    table.tableSchema.primaryKey = ['ID']
    table.tableSchema.foreignKeys.append(ForeignKey.fromdict(dict(
        columnReference='Contributor_ID',
        reference=dict(resource='contributors.csv', columnReference='ID'))))
    table.common_props['dc:conformsTo'] = None
    table = ds.add_table(
        'contributors.csv',
        'ID',
        'Name',
        'Description',
        'Readme',
        'Contents',
        {'name': 'Source', 'propertyUrl': 'http://cldf.clld.org/v1.0/terms.rdf#source', 'separator': ';'},
        'URL',
        {'name': 'with_tones', 'datatype': {'base': 'boolean', 'format': '1|0'}},
    )
    table.tableSchema.primaryKey = ['ID']
    table.common_props['dc:conformsTo'] = None

    def read(what):
        return reader(scripts / 'to_cldf' / 'cldf' / what, namedtuples=True)

    languoids = {l.id: l for l in glottolog.languoids()}

    values, segments, languages, inventories, sources = [], [], OrderedDict(), OrderedDict(), []
    with_tones = {}
    for contrib in read('contributors.csv'):
        sources.append(dict(
            ID=contrib.Name,
            Name=contrib.Contributor,
            Description=contrib.Description,
            Readme=desc(dev, contrib.Name),
            Contents=contrib.Contents,
            Source=[c.strip().lower() for c in contrib.Citation.split(';')],
            URL=contrib.SourceURL if contrib.SourceURL != 'NA' else '',
            with_tones=contrib.with_tones == '1',
        ))
        with_tones[contrib.Name] = contrib.with_tones == '1'

    pid_map = {}
    for row in read('parameters.csv'):
        pid = md5(row.Description.encode('utf8')).hexdigest().upper()
        pid_map[row.ID] = (pid, row.SegmentClass)
        segments.append(dict(
            ID=pid,
            Name=row.Name,
            Description=row.Description,
            SegmentClass=row.SegmentClass,
            **{f: getattr(row, f) for f in features}
        ))

    src = {}
    for row in read('contributions.csv'):
        src[row.ID] = row.References.split(';') if row.References != 'no source given' else []
        src[row.ID] = [sid.lower() for sid in src[row.ID]]
        inventories[row.ID] = dict(
            ID=row.ID, 
            Name=row.Name, 
            Contributor_ID=row.Contributor_ID.upper(), 
            URL=row.URI if row.URI != 'NA' else '',
            Source=src[row.ID],
            count_phonemes=0,
            count_consonants=0,
            count_vowels=0,
            count_tones=0,
        )

    uniq, counts = set(), Counter()
    for row in read('values.csv'):
        pk = (row.Language_ID, row.Parameter_ID, row.Contribution_ID)
        if pk in uniq:
            print('skipping duplicate phoneme {0}'.format(pk))
            continue
        uniq.add(pk)
        lid = row.Language_ID if row.Language_ID in languoids else slug(inventories[row.Contribution_ID]['Name'])
        if lid not in languages:
            #
            # FIXME: Language_ID == 'NA' for three inventories! This must be mapped!
            #
            lang = languoids.get(lid)
            fam = lang.lineage[0] if lang and lang.lineage else None
            languages[lid] = dict(
                ID=lid,
                Name=lang.name if lang else None,
                Glottocode=lang.id if lang else None,
                ISO639P3code=row.ISO639P3code if row.ISO639P3code != 'NA' else None,
                Macroarea=lang.macroareas[0].value if lang and lang.macroareas else None,
                Latitude=lang.latitude if lang else None,
                Longitude=lang.longitude if lang else None,
                Family_Glottocode=fam[1] if fam else None,
                Family_Name=fam[0] if fam else None,
            )
        pid, sc = pid_map[row.Parameter_ID]
        counts.update([(row.Contribution_ID, sc)])
        values.append(dict(
            ID=row.ID,
            Language_ID=lid,
            Parameter_ID=pid,
            Contribution_ID=row.Contribution_ID,
            Value=row.Name,
            Marginal=None if row.Marginal == 'NA' else eval(row.Marginal.lower().capitalize()),  # FALSE|TRUE|NA
            Allophones=row.Allophones.split() if row.Allophones != 'NA' else [],
            Source=src[row.Contribution_ID],
        ))
    for key, count in counts.items():
        inventories[key[0]]['count_{0}s'.format(key[1])] = count
        inventories[key[0]]['count_phonemes'] += count

    for inv in inventories.values():
        if not with_tones[inv['Contributor_ID']]:
            assert inv['count_tones'] == 0
            inv['count_tones'] = 'NA'

    ds.write(**{
        'ValueTable': values,
        'LanguageTable': languages.values(),
        'ParameterTable': segments,
        'contributions.csv': inventories.values(),
        'contributors.csv': sources
    })
    ds.validate(logging.getLogger(__name__))
Example #21
import sys

try:
    import zenodo
except ImportError:
    from langsci import zenodo

glottolog = False
from pyglottolog import Glottolog
try:
    glottolog = Glottolog('.')
except ValueError:
    print("Glottolog tree directory not found. Glottocodes will not work. Please symlink the directories glottolog/languoids and glottolog/references")

"""
usage: > python3 zenodo.py 7
The script looks for all include'd files from the folder chapters/ in main.tex
It will ignore the first n files, where n is the argument of the script
If no argument is given, processing will start with the first file. 
For each file, the script will extract metadata from the file itself 
and from the file collection_tmp.bib generated by biber. 
The metadata is collected and a corresponding entry is created on Zenodo. 
The DOI assigned by Zenodo is collected and inserted into the file. 
"""

offset = 0
try:
    offset = int(sys.argv[1])
except IndexError:
    pass
extracommunities = []
Example #22
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """

    print('Parsing markdown intros...')
    for contrib in DBSession.query(models.Contribution):
        if contrib.description:
            contrib.markup_description = markdown(contrib.description)
        else:
            contrib.markup_description = None
    print('... done')

    print('Retrieving language data from glottolog...')

    catconf = cldfcatalog.Config.from_file()
    glottolog_path = catconf.get_clone('glottolog')
    glottolog = Glottolog(glottolog_path)

    lang_ids = [lang.id for lang in DBSession.query(common.Language)]
    languoids = {l.id: l for l in glottolog.languoids(lang_ids)}

    glottocodes = [(l.id,
                    common.Identifier(id=l.id, name=l.id, type='glottolog'))
                   for l in languoids.values()]
    glottocodes = OrderedDict(sorted(glottocodes, key=lambda t: t[0]))

    isocodes = [(l.iso, common.Identifier(id=l.iso,
                                          name=l.iso,
                                          type='iso639-3'))
                for l in languoids.values() if l.iso]
    isocodes = OrderedDict(sorted(isocodes, key=lambda t: t[0]))

    DBSession.add_all(glottocodes.values())
    DBSession.add_all(isocodes.values())
    DBSession.flush()

    for lang in DBSession.query(common.Language):
        if lang.id not in languoids:
            continue
        languoid = languoids[lang.id]
        lang.name = languoid.name
        lang.latitude = languoid.latitude
        lang.longitude = languoid.longitude
        lang.macroarea = (
            languoid.macroareas[0].name if languoid.macroareas else '')

        DBSession.add(
            common.LanguageIdentifier(
                language=lang, identifier_pk=glottocodes[languoid.id].pk))

        if languoid.iso in isocodes:
            DBSession.add(
                common.LanguageIdentifier(
                    language=lang, identifier_pk=isocodes[languoid.iso].pk))

    DBSession.flush()
    print('... done')

    print('Making pretty colourful dots for parameter values...')
    all_icons = [icon.name for icon in ORDERED_ICONS]

    code_query = DBSession.query(common.DomainElement)\
        .order_by(common.DomainElement.parameter_pk, common.DomainElement.id)
    for _, param_codes in groupby(code_query, lambda c: c.parameter_pk):
        icons = cycle(all_icons)
        for code in param_codes:
            code.update_jsondata(icon=next(icons))

    DBSession.flush()
    print('... done')