Пример #1
0
def test_glottocodes_by_isocode(mocker, env):
    """Exercise ``glottocodes_by_isocode`` with a database URI and with ``None``.

    For the first call ``create_engine`` is patched with a mock engine whose
    ``execute`` returns a single two-item row.  For the second call
    ``requests`` is patched with a stub whose ``get`` serves a canned JSON
    document, and the result is requested with explicit ``cols``.
    """
    from clld.scripts.util import glottocodes_by_isocode

    def run_query(*args):
        # Whatever is executed, answer with one fixed row.
        return [('iso', 'abcd1234')]

    engine_factory = mocker.Mock(return_value=mocker.Mock(execute=run_query))
    mocker.patch('clld.scripts.util.create_engine', engine_factory)
    mapping = glottocodes_by_isocode('dburi')
    assert mapping['iso'] == 'abcd1234'

    payload = """{
        "properties": {
            "dataset": "glottolog",
            "uri_template": "http://glottolog.org/resource/languoid/id/{id}"
        },
        "resources": [
            {
                "id": "aant1238",
                "identifiers": [
                    {
                        "identifier": "tbg-aan",
                        "type": "multitree"
                    }
                ],
                "latitude": null,
                "longitude": null,
                "name": "Aantantara"
            },
            {
                "id": "aari1239",
                "identifiers": [
                    {
                        "identifier": "aiw",
                        "type": "iso639-3"
                    },
                    {
                        "identifier": "aiw",
                        "type": "multitree"
                    }
                ],
                "latitude": 5.95034,
                "longitude": 36.5721,
                "name": "Aari"
            }]}"""

    class StubRequests(mocker.Mock):
        def get(self, *args):
            # Every request is answered with the parsed canned document.
            reply = mocker.Mock()
            reply.json = mocker.Mock(return_value=loads(payload))
            return reply

    mocker.patch('clld.scripts.util.requests', StubRequests())
    mapping = glottocodes_by_isocode(None, cols=['id', 'latitude'])
    assert mapping['aiw'][0] == 'aari1239'
Пример #2
0
    def test_glottocodes_by_isocode(self):
        """Exercise ``glottocodes_by_isocode`` with a database URI and with ``None``.

        The first call runs while ``create_engine`` is patched with a mock
        engine whose ``execute`` returns a single two-item row; the second
        call runs while ``requests`` is patched with a stub whose ``get``
        serves a canned JSON document.
        """
        from clld.scripts.util import glottocodes_by_isocode

        def run_query(*args):
            # Whatever is executed, answer with one fixed row.
            return [('iso', 'abcd1234')]

        engine_factory = Mock(return_value=Mock(execute=run_query))

        with patch('clld.scripts.util.create_engine', engine_factory):
            mapping = glottocodes_by_isocode('dburi')
            assert mapping['iso'] == 'abcd1234'

        payload = """{
            "properties": {
                "dataset": "glottolog",
                "uri_template": "http://glottolog.org/resource/languoid/id/{id}"
            },
            "resources": [
                {
                    "id": "aant1238",
                    "identifiers": [
                        {
                            "identifier": "tbg-aan",
                            "type": "multitree"
                        }
                    ],
                    "latitude": null,
                    "longitude": null,
                    "name": "Aantantara"
                },
                {
                    "id": "aari1239",
                    "identifiers": [
                        {
                            "identifier": "aiw",
                            "type": "iso639-3"
                        },
                        {
                            "identifier": "aiw",
                            "type": "multitree"
                        }
                    ],
                    "latitude": 5.95034,
                    "longitude": 36.5721,
                    "name": "Aari"
                }]}"""

        class StubRequests(Mock):
            def get(self, *args):
                # Every request is answered with the parsed canned document.
                reply = Mock()
                reply.json = Mock(return_value=loads(payload))
                return reply

        with patch('clld.scripts.util.requests', StubRequests()):
            mapping = glottocodes_by_isocode(None, cols=['id', 'latitude'])
            assert mapping['aiw'][0] == 'aari1239'
Пример #3
0
def main(args):
    """Populate the AUTOTYP database.

    Creates the dataset and its two editors, reads the bibliography from
    ``LenaBib.bib``, creates stocks, continents, areas and languoids from the
    backbone export and finally hands over to the ``loader`` routines.

    :param args: script arguments; ``args.data_file`` locates the input files.
    """
    # The glottolog3 database is only queried when running as user "robert";
    # otherwise an empty mapping is passed on to ``add_language_codes``.
    if getuser() == "robert":
        glottocodes = glottocodes_by_isocode("postgresql://robert@/glottolog3")
    else:
        glottocodes = {}

    data = Data()
    dataset = common.Dataset(
        id=autotyp.__name__,
        name="AUTOTYP",
        description="AUTOTYP",
        domain="autotyp.clld.org",
    )
    DBSession.add(dataset)

    bib = Database.from_file(args.data_file("LenaBib.bib"), lowercase=True)

    editors = [
        ("bickel", "Balthasar Bickel", "University of Zurich"),
        ("nichols", "Johanna Nichols", "University of California, Berkeley"),
    ]
    # The third item of each editor tuple is not stored.
    for position, (key, full_name, _affiliation) in enumerate(editors, start=1):
        person = data.add(common.Contributor, key, id=key, name=full_name)
        DBSession.add(
            common.Editor(dataset=dataset, ord=position, contributor=person))

    backbone = rows(
        args.data_file("backbone_09Jan2014_directexport.tab"),
        newline="\r",
        encoding="macroman",
        namedtuples=True,
    )
    for record in backbone:
        # columns:
        # LID	language	ISO639.3.2013	stock	continent	area	latitude	longitude

        # Stock, continent and area are created the first time they are seen
        # and looked up afterwards.
        if record.stock in data["Stock"]:
            stock = data["Stock"][record.stock]
        else:
            stock = data.add(
                models.Stock, record.stock,
                id=slug(record.stock), name=record.stock)

        if record.continent in data["Continent"]:
            continent = data["Continent"][record.continent]
        else:
            continent = data.add(
                models.Continent, record.continent,
                id=slug(record.continent), name=record.continent)

        if record.area in data["Area"]:
            area = data["Area"][record.area]
        else:
            area = data.add(
                models.Area, record.area,
                id=slug(record.area), name=record.area, continent=continent)

        languoid = data.add(
            models.Languoid,
            record.LID,
            id=record.LID,
            name=record.language,
            latitude=coord(record.latitude),
            longitude=coord(record.longitude),
            stock=stock,
            area=area,
        )
        add_language_codes(
            data, languoid, record.ISO639_3_2013, glottocodes=glottocodes)

    loader.case_alignment(args, data, bib)
    loader.inclusive_excusive(args, data, bib)
Пример #4
0
def main(args):
    """Load the contribution 'fb' of the JCLD dataset from ``fb_jcld.tab``.

    For every row of the input file a parameter, a domain element and a
    language are created on first sight (and reused afterwards), then a
    value set with one value is created.  Rows whose language is unknown to
    glottolog are reported on stdout and skipped.

    :param args: script arguments; ``args.data_file`` locates the input file.
    """
    data = Data()

    # fetch language data from glottolog:
    glottolog = glottocodes_by_isocode(
        'postgresql://robert@/glottolog3', ['id', 'name', 'latitude', 'longitude'])

    dataset = common.Dataset(
        id=jcld.__name__,
        name="Journal of Cross-Linguistic Databases",
        domain='jcld.clld.org')
    DBSession.add(dataset)

    contribution = data.add(common.Contribution, '1', id='1', name='fb')

    # ``open`` replaces the Python-2-only ``file`` builtin, and the ``with``
    # block makes sure the handle is closed once all rows have been consumed.
    # NOTE(review): on Python 3 the handle is opened in text mode while
    # ``reader`` is also given an encoding - confirm how ``reader`` treats that.
    with open(args.data_file('fb_jcld.tab')) as fp:
        for i, row in enumerate(reader(fp, namedtuples=True, encoding='latin1')):
            if row.Feature not in data['Parameter']:
                # NOTE(review): every parameter is created with id '1'; this
                # only works if the file holds a single feature - confirm.
                parameter = data.add(
                    common.Parameter, row.Feature, id='1', name=row.Feature)
            else:
                parameter = data['Parameter'][row.Feature]

            if row.Value not in data['DomainElement']:
                de = data.add(
                    common.DomainElement, row.Value,
                    id='%s-%s' % (parameter.id, slug(row.Value)),
                    parameter=parameter,
                    name=row.Value)
            else:
                de = data['DomainElement'][row.Value]

            if row.Language not in data['Language']:
                if row.Language not in glottolog:
                    # A single pre-formatted argument prints the same text
                    # with the print statement and the print function.
                    print('---> %s' % row.Language)
                    continue
                # The glottocode itself is not stored on the language.
                glottocode, name, lat, lon = glottolog[row.Language]
                language = data.add(
                    common.Language, row.Language,
                    id=slug(row.Language), name=name, latitude=lat, longitude=lon)
            else:
                language = data['Language'][row.Language]

            # Value sets are numbered by row; an alternative id would be
            # '%s-%s' % (parameter.id, language.id).
            id_ = str(i + 1)
            vs = common.ValueSet(
                id=id_,
                parameter=parameter,
                language=language,
                contribution=contribution,
                description=row.Comment,
                source=row.Source)
            common.Value(valueset=vs, name=row.Value, domainelement=de)
Пример #5
0
def main(args):
    """Load the PHOIBLE data files into the database.

    Steps, in order:

    1. When the module-level ``astroman`` flag is set, fetch genera via
       ``get_genera`` and glottocodes, names and coordinates from the local
       glottolog3 database; otherwise start with empty lookups.
    2. Collect the bibliography keys per inventory and add the sources.
    3. Create the dataset and its editors.
    4. Create varieties, genera, contributors and inventories from
       ``phoible-aggregated.tsv``.
    5. Create segments, value sets and values from ``phoible-phonemes.tsv``.
    6. Link inventories to their sources.
    7. Store the feature vectors from ``phoible-segments-features.tsv``.

    :param args: script arguments; ``args.data_file`` locates the input \
    files, and ``args`` is passed on to ``get_rows`` and ``add_sources``.
    """
    # determine if we run on a machine where other databases are available for lookup
    # locally:
    data = Data()
    genera = get_genera(data) if astroman else {}
    glottocodes, lnames, geocoords = {}, {}, {}
    if astroman:
        for k, v in glottocodes_by_isocode(
                'postgresql://robert@/glottolog3',
                cols=['id', 'name', 'latitude', 'longitude']).items():
            glottocodes[k] = v[0]
            lnames[k] = v[1]
            geocoords[k] = (v[2], v[3])

    # inventory id -> list of bibtex keys
    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset, 'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
                'Creative Commons Attribution-ShareAlike 3.0 Unported License'})

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(common.Editor(
            dataset=dataset,
            ord=i + 1,
            contributor=common.Contributor(id=spec[0], name=spec[1])))

    # inventory id -> list of squib descriptions
    squibs = defaultdict(list)
    for row in get_rows(args, 'Squib'):
        squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    aggregated = list(reader(args.data_file('phoible-aggregated.tsv'), namedtuples=True))

    # Name each inventory after its language and source; inventories sharing
    # both are numbered to keep the names apart.
    inventory_names = {}
    for key, items in groupby(
            sorted(aggregated, key=lambda t: (t.LanguageCode, t.Source)),
            key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)

        lname = lnames.get(key[0])
        if not lname:
            # No name from glottolog: fall back to the name in the data file.
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i + 1, key[1])

    # (genus name, root code) -> family name
    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    # NOTE(review): the codes 'saso' and 'macg' occur twice in family_map, so
    # only one family per code survives in this mapping - confirm intent.
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                # Look the genus up by slugified name, then by language code,
                # and create an inactive one if both lookups fail.
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                    if not genus:
                        family = family_map.get(
                            (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                        genus = genera[genus_id] = data.add(
                            models.Genus, genus_id,
                            id=genus_id,
                            name=row.LanguageFamilyGenus,
                            description=family or row.LanguageFamilyRoot,
                            active=False,
                            root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot

                # A description that is a known root code is replaced by the
                # family name mapped to that code.
                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            else:
                # No coordinates known.  Without this branch ``coords`` kept
                # the value computed for the previously created variety (or
                # was unbound for the very first row).
                coords = (None, None)
            lang = data.add(
                models.Variety, row.LanguageCode,
                id=row.LanguageCode,
                name=lnames[row.LanguageCode],
                genus=genus,
                country=strip_quotes(row.Country),
                area=strip_quotes(row.Area),
                latitude=coords[0],
                longitude=coords[1],
                jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data, lang, row.LanguageCode, glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            # SOURCES[key] is indexed as (name, reference keys, description).
            contributor = data.add(
                common.Contributor, row.Source,
                id=row.Source,
                name=SOURCES[row.Source][0],
                description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(models.ContributorReference(
                    source=data['Source'][ref], contributor=contributor))

        contrib = data.add(
            models.Inventory, row.InventoryID,
            id=row.InventoryID,
            language=lang,
            source=row.Source,
            source_url=source_urls.get(row.InventoryID),
            internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[row.InventoryID],
            description=row.LanguageName)

        DBSession.add(common.ContributionContributor(
            contribution=contrib, contributor=contributor))

        for j, squib in enumerate(squibs.get(row.InventoryID, [])):
            # Only the file record is created here; no file content is written.
            f = common.Contribution_files(
                object=contrib,
                id='squib-%s-%s.pdf' % (contrib.id, j + 1),
                name='Phonological squib',
                description=squib,
                mime_type='application/pdf')
            assert f

    DBSession.flush()
    # reference keys already reported as missing (each is printed only once)
    unknown_refs = set()

    for row in reader(args.data_file('phoible-phonemes.tsv'), namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment, row.Phoneme,
                # The id is the hex MD5 digest of the description.  md5 needs
                # bytes and b16encode returns bytes on Python 3, hence the
                # explicit encode/decode.
                id=b16encode(
                    md5(description.encode('utf8')).digest()).decode('ascii'),
                name=row.Phoneme,
                description=description,
                # the phoneme without combining and modifier characters
                equivalence_class=''.join(
                    [t[0] for t in unicode_desc
                     if t[1].split()[0] not in ['COMBINING', 'MODIFIER']]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(
            id=row.PhonemeID,
            contribution=inventory,
            language=inventory.language,
            parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs.add(ref)
                continue
            DBSession.add(common.ValueSetReference(
                source=data['Source'][ref],
                valueset=vs))

        DBSession.add(common.Value(
            id=row.PhonemeID,
            name='%s %s' % (row.Phoneme, inventory.name),
            valueset=vs))
        DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(
                common.ContributionReference, '%s-%s' % (inventory_id, ref),
                source=data['Source'][ref],
                contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            # The header row provides the feature names.
            features = list(map(feature_name, row))
            continue

        if row[0] not in data['Segment']:
            # feature vector of a segment which is not in any inventory
            continue
        for j, value in enumerate(row):
            # Column 0 holds the segment itself; '0' values are not stored.
            if j and value != '0':
                DBSession.add(common.Parameter_data(
                    key=features[j],
                    value=value,
                    ord=j,
                    object_pk=data['Segment'][row[0]].pk))
    DBSession.flush()
Пример #6
0
def main(args):
    """Import the PHOIBLE data files into the database.

    The function first prepares lookups (genera, glottocodes, language names
    and coordinates - only filled when the module-level ``astroman`` flag is
    set), bibliography keys, squibs and URLs.  It then creates the dataset
    with its editors, the varieties/genera/contributors/inventories listed in
    ``phoible-aggregated.tsv``, the segments and values listed in
    ``phoible-phonemes.tsv``, the inventory references and finally the
    feature data from ``phoible-segments-features.tsv``.

    :param args: script arguments; ``args.data_file`` locates the input \
    files, and ``args`` is passed on to ``get_rows`` and ``add_sources``.
    """
    # determine if we run on a machine where other databases are available for lookup
    # locally:
    data = Data()
    genera = get_genera(data) if astroman else {}
    glottocodes, lnames, geocoords = {}, {}, {}
    if astroman:
        for k, v in glottocodes_by_isocode(
                'postgresql://robert@/glottolog3',
                cols=['id', 'name', 'latitude', 'longitude']).items():
            glottocodes[k] = v[0]
            lnames[k] = v[1]
            geocoords[k] = (v[2], v[3])

    # inventory id -> list of bibtex keys
    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(
            common.Editor(dataset=dataset,
                          ord=i + 1,
                          contributor=common.Contributor(id=spec[0],
                                                         name=spec[1])))

    # inventory id -> list of squib descriptions
    squibs = defaultdict(list)
    for row in get_rows(args, 'Squib'):
        squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    aggregated = list(
        reader(args.data_file('phoible-aggregated.tsv'), namedtuples=True))

    # Inventories are named after language and source; several inventories
    # with the same language and source get a running number.
    inventory_names = {}
    for key, items in groupby(sorted(aggregated,
                                     key=lambda t: (t.LanguageCode, t.Source)),
                              key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)

        lname = lnames.get(key[0])
        if not lname:
            # No name from glottolog: use the name given in the data file.
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i +
                                                                    1, key[1])

    # (genus name, root code) -> family name
    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    # NOTE(review): 'saso' and 'macg' each appear under two genus names, so
    # this mapping keeps only one family per code - confirm intent.
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                # Try the slugified genus name, then the language code; create
                # an inactive genus when neither is known.
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                    if not genus:
                        family = family_map.get(
                            (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                        genus = genera[genus_id] = data.add(
                            models.Genus,
                            genus_id,
                            id=genus_id,
                            name=row.LanguageFamilyGenus,
                            description=family or row.LanguageFamilyRoot,
                            active=False,
                            root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot

                # Swap a description that is a known root code for the family
                # name mapped to that code.
                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            else:
                # Coordinates are unknown.  Previously this case left
                # ``coords`` unbound (first row) or carrying the coordinates
                # of the variety created before this one.
                coords = (None, None)
            lang = data.add(models.Variety,
                            row.LanguageCode,
                            id=row.LanguageCode,
                            name=lnames[row.LanguageCode],
                            genus=genus,
                            country=strip_quotes(row.Country),
                            area=strip_quotes(row.Area),
                            latitude=coords[0],
                            longitude=coords[1],
                            jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data,
                               lang,
                               row.LanguageCode,
                               glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            # SOURCES[key] is indexed as (name, reference keys, description).
            contributor = data.add(common.Contributor,
                                   row.Source,
                                   id=row.Source,
                                   name=SOURCES[row.Source][0],
                                   description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(
                    models.ContributorReference(source=data['Source'][ref],
                                                contributor=contributor))

        contrib = data.add(models.Inventory,
                           row.InventoryID,
                           id=row.InventoryID,
                           language=lang,
                           source=row.Source,
                           source_url=source_urls.get(row.InventoryID),
                           internetarchive_url=ia_urls.get(row.InventoryID),
                           name=inventory_names[row.InventoryID],
                           description=row.LanguageName)

        DBSession.add(
            common.ContributionContributor(contribution=contrib,
                                           contributor=contributor))

        for j, squib in enumerate(squibs.get(row.InventoryID, [])):
            # Only the file record is created here; no file content is written.
            f = common.Contribution_files(object=contrib,
                                          id='squib-%s-%s.pdf' %
                                          (contrib.id, j + 1),
                                          name='Phonological squib',
                                          description=squib,
                                          mime_type='application/pdf')
            assert f

    DBSession.flush()
    # reference keys already reported as missing (each is printed only once)
    unknown_refs = set()

    for row in reader(args.data_file('phoible-phonemes.tsv'),
                      namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            # The id is the hex MD5 digest of the description.  md5 needs
            # bytes and b16encode returns bytes on Python 3, hence the
            # explicit encode/decode.
            segment_id = b16encode(
                md5(description.encode('utf8')).digest()).decode('ascii')
            segment = data.add(
                models.Segment,
                row.Phoneme,
                id=segment_id,
                name=row.Phoneme,
                description=description,
                # the phoneme without combining and modifier characters
                equivalence_class=''.join([
                    t[0] for t in unicode_desc
                    if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
                ]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(id=row.PhonemeID,
                             contribution=inventory,
                             language=inventory.language,
                             parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs.add(ref)
                continue
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))

        DBSession.add(
            common.Value(id=row.PhonemeID,
                         name='%s %s' % (row.Phoneme, inventory.name),
                         valueset=vs))
        DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(common.ContributionReference,
                     '%s-%s' % (inventory_id, ref),
                     source=data['Source'][ref],
                     contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(
            reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            # The header row provides the feature names.
            features = list(map(feature_name, row))
            continue

        if row[0] not in data['Segment']:
            # feature vector of a segment which is not in any inventory
            continue
        for j, value in enumerate(row):
            # Column 0 holds the segment itself; '0' values are not stored.
            if j and value != '0':
                DBSession.add(
                    common.Parameter_data(
                        key=features[j],
                        value=value,
                        ord=j,
                        object_pk=data['Segment'][row[0]].pk))
    DBSession.flush()
Пример #7
0
def main(args):
    """Load the Tsammalex data from the ``tsammalexdata`` repository.

    Creates a collation index on value names, the dataset and contribution
    records, the bibliography, ecoregions and countries, then loads the CSV
    backed models via ``from_csv`` (with per-model visitor hooks), links
    second languages and finally registers the images hosted on
    ``edmond.mpdl.mpg.de``.

    :param args: script arguments; ``args.data_repos`` is the directory \
    containing the ``tsammalexdata`` checkout.
    """
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)

    def data_file(*comps):
        """Return the path below ``tsammalexdata/data`` for the given components."""
        return path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset, 'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")
    glottolog = glottocodes_by_isocode('postgresql://robert@/glottolog3')

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    # languoid key (row[0]) -> raw second-languages cell (row[8]); resolved
    # once all languoids have been loaded.
    second_languages = {}

    def languoid_visitor(lang, row, _):
        """Add language codes for ``lang`` and remember its second languages."""
        try:
            # The part of the languoid id before the first '-' is passed as
            # code; row[2] is passed as glottocode (empty values become None).
            add_language_codes(
                data, lang, lang.id.split('-')[0], glottolog, glottocode=row[2] or None)
        except Exception:
            # Show the offending row before propagating the error.
            print(row)
            raise
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        """Mark a category loaded from the habitats file as habitat."""
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        """Enrich ``taxon`` from ``taxa.json`` and cache its relation id lists."""
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    # taxon id -> record from taxa.json
    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        """Derive the URL of the ``type_`` rendition from the original image URL.

        The file extension is replaced by ``.jpg`` and the ``/original/`` path
        segment by ``/<type_>/``.
        """
        # Raw string: '\.' in a plain literal is an invalid escape sequence.
        return re.sub(r'\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):
        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue

            # Only images hosted on edmond.mpdl.mpg.de are registered.
            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue

            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))

            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            # Non-empty metadata columns become key/value records of the image.
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)
Пример #8
0
def main(args):
    """Populate the ASJP database from the ``listss17.txt`` wordlists.

    Steps, in order:

    1. Parse the metadata and resolve every distinct source key through
       ``get_source``, numbering the sources from 1.
    2. Fetch ISO-to-glottocode mappings and the WALS family/genus ids.
    3. Create the dataset record, the contributors and editors, and all
       meanings.
    4. Create one doculect per wordlist and link it to its transcribers
       and sources.

    Fails with an ``AssertionError`` if a hard-coded contributor name
    disagrees with the transcriber map, if a wordlist has no metadata, or
    if metadata entries are left over once all wordlists are processed.

    :param args: Script arguments; must provide ``data_file``.
    """
    meta = parse_meta(args)
    print(len(meta))
    print(sum(len(m.sources) for m in meta.values()))

    # Register the distinct source keys first, then resolve each of them
    # with a 1-based running number.
    sources = {key: None for m in meta.values() for key in m.sources}
    print(len(sources), 'distinct')
    for number, key in enumerate(sources, 1):
        sources[key] = get_source(key, number)

    glottocodes = glottocodes_by_isocode('postgresql://robert@/glottolog3')

    data = Data()

    wals = create_engine('postgresql://robert@/wals3')

    # Families can be looked up both by name and by id.
    wals_families = {}
    for family_name, family_id in wals.execute('select name, id from family'):
        wals_families[family_name] = family_id
        wals_families[family_id] = family_id

    # NOTE: a mapping of WALS family abbreviations (read from
    # 'WALSFamilyAbbreviations.tab') used to be merged into wals_families
    # here; that step is currently disabled.

    wals_genera = {}
    for row in wals.execute('select id from genus'):
        wals_genera[row[0]] = row[0]

    wordlists = []
    with args.data_file('listss17.txt').open(encoding='latin1') as fp:
        for lines in parse(fp):
            wordlists.append('\n'.join(lines))

    license_info = {
        'license_icon': 'cc-by.png',
        'license_name': 'Creative Commons Attribution 4.0 International License',
    }
    dataset = common.Dataset(
        id=asjp.__name__,
        name="The ASJP Database",
        contact="*****@*****.**",
        description="The Automated Similarity Judgment Program",
        domain='asjp.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        jsondata=license_info)
    DBSession.add(dataset)

    transcribers = get_transcriber_map(args)
    editor_ids = ('SW', 'CB', 'EH')
    named_contributors = [
        ('SW', "Søren Wichmann"),
        ('AM', "André Müller"),
        ('AKW', "Annkathrin Wett"),
        ('VV', "Viveka Velupillai"),
        ('JB', "Julia Bischoffberger"),
        ('CB', "Cecil H. Brown"),
        ('EH', "Eric W. Holman"),
        ('SS', "Sebastian Sauppe"),
        ('ZM', "Zarina Molochieva"),
        ('PB', "Pamela Brown"),
        ('HH', "Harald Hammarström"),
        ('OB', "Oleg Belyaev"),
        ('JML', "Johann-Mattis List"),
        ('DBA', "Dik Bakker"),
        ('DE', "Dmitry Egorov"),
        ('MU', "Matthias Urban"),
        ('RM', "Robert Mailhammer"),
        ('AC', "Agustina Carrizo"),
        ('MSD', "Matthew S. Dryer"),
        ('EK', "Evgenia Korovina"),
        ('DB', "David Beck"),
        ('HG', "Helen Geyer"),
        ('PE', "Patience Epps"),
        ('AG', "Anthony Grant"),
        ('PS', "Paul Sidwell"),  # not in citation
        ('KTR', "K. Taraka Rama"),  # not in citation
        ('PV', "Pilar Valenzuela"),
        ('MD', "Mark Donohue"),  # not in citation
    ]
    for position, (id_, name) in enumerate(named_contributors, 1):
        if id_ in transcribers:
            # The hard-coded name must agree with the transcriber map; the
            # entry is consumed so that only unlisted transcribers remain.
            assert name == transcribers.pop(id_)
        contributor = data.add(common.Contributor, id_, id=id_, name=name)
        if id_ in editor_ids:
            editor = common.Editor(
                dataset=dataset, ord=position, contributor=contributor)
            DBSession.add(editor)

    # Transcribers which are not part of the hard-coded list above.
    for extra_id, extra_name in transcribers.items():
        data.add(common.Contributor, extra_id, id=extra_id, name=extra_name)

    for meaning_id in sorted(models.MEANINGS_ALL.keys()):
        data.add(
            models.Meaning,
            meaning_id,
            id=str(meaning_id),
            name=models.MEANINGS_ALL[meaning_id],
            core=meaning_id in models.MEANINGS)

    for text in wordlists:
        lang = models.Doculect.from_txt(text)
        if lang.classification_wals:
            family, genus = lang.classification_wals.split('.')
            lang.wals_family = wals_families.get(family)
            lang.wals_genus = wals_genera.get(slug(genus))
        lang.code_glottolog = glottocodes.get(lang.code_iso)
        add_codes(lang)
        data.add(models.Doculect, lang.id, _obj=lang)
        # Flush before lang.pk is used for the source links below.
        DBSession.flush()

        md = meta.pop(lang.id, None)
        assert md

        # Associate transcribers and sources with the doculect.
        for rank, transcriber in enumerate(md.transcribers, 1):
            common.ContributionContributor(
                contribution=lang.wordlist,
                contributor=data['Contributor'][transcriber],
                ord=rank)
        for source in md.sources:
            link = common.LanguageSource(
                language_pk=lang.pk, source_pk=sources[source].pk)
            DBSession.add(link)

    # Every metadata record must have been consumed by a wordlist.
    assert not list(meta.keys())
Пример #9
0
def main(args):
    """Populate the ASJP database from the ``listss18.txt`` wordlists.

    Resolves all sources referenced in the parsed metadata, loads
    glottocodes and WALS family/genus ids, creates the dataset record,
    contributors, editors and meanings, and then creates one doculect per
    wordlist, linked to its transcribers and sources. The ids of metadata
    records that were not matched by any wordlist are printed at the end.

    Fails with an ``AssertionError`` if a hard-coded contributor name
    disagrees with the transcriber map or if a wordlist has no metadata.

    :param args: Script arguments; must provide ``data_file``.
    """
    meta = parse_meta(args)

    # Collect distinct source keys, then resolve each one with a 1-based
    # running number.
    sources = {key: None for record in meta.values() for key in record.sources}
    for number, key in enumerate(sources, 1):
        sources[key] = get_source(key, number)

    glottocodes = glottocodes_by_isocode('postgresql://robert@/glottolog3')

    data = Data()

    wals = create_engine('postgresql://robert@/wals3')

    # Families are addressable by name as well as by id.
    wals_families = {}
    for family_name, family_id in wals.execute('select name, id from family'):
        wals_families[family_name] = family_id
        wals_families[family_id] = family_id

    # NOTE: a mapping of WALS family abbreviations (read from
    # 'WALSFamilyAbbreviations.tab') used to be merged into wals_families
    # here; that step is currently disabled.

    wals_genera = {}
    for row in wals.execute('select id from genus'):
        wals_genera[row[0]] = row[0]

    wordlists = []
    with args.data_file('listss18.txt').open(encoding='latin1') as fp:
        for lines in parse(fp):
            wordlists.append('\n'.join(lines))

    license_info = {
        'license_icon': 'cc-by.png',
        'license_name': 'Creative Commons Attribution 4.0 International License',
    }
    dataset = common.Dataset(
        id=asjp.__name__,
        name="The ASJP Database",
        contact="*****@*****.**",
        description="The Automated Similarity Judgment Program",
        domain='asjp.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        jsondata=license_info)
    DBSession.add(dataset)

    transcribers = get_transcriber_map(args)
    editor_ids = ('SW', 'EH', 'CB')
    named_contributors = [
        ('SW', "Søren Wichmann"),
        ('AM', "André Müller"),
        ('AKW', "Annkathrin Wett"),
        ('VV', "Viveka Velupillai"),
        ('JB', "Julia Bischoffberger"),
        ('CB', "Cecil H. Brown"),
        ('EH', "Eric W. Holman"),
        ('SS', "Sebastian Sauppe"),
        ('ZM', "Zarina Molochieva"),
        ('PB', "Pamela Brown"),
        ('HH', "Harald Hammarström"),
        ('OB', "Oleg Belyaev"),
        ('JML', "Johann-Mattis List"),
        ('DBA', "Dik Bakker"),
        ('DE', "Dmitry Egorov"),
        ('MU', "Matthias Urban"),
        ('RM', "Robert Mailhammer"),
        ('AC', "Agustina Carrizo"),
        ('MSD', "Matthew S. Dryer"),
        ('EK', "Evgenia Korovina"),
        ('DB', "David Beck"),
        ('HG', "Helen Geyer"),
        ('PE', "Patience Epps"),
        ('AG', "Anthony Grant"),
        ('PS', "Paul Sidwell"),  # not in citation
        ('KTR', "K. Taraka Rama"),  # not in citation
        ('PV', "Pilar Valenzuela"),
        ('MD', "Mark Donohue"),  # not in citation
    ]
    for position, (id_, name) in enumerate(named_contributors, 1):
        if id_ in transcribers:
            # The hard-coded name must agree with the transcriber map; the
            # entry is consumed so that only unlisted transcribers remain.
            assert name == transcribers.pop(id_)
        contributor = data.add(common.Contributor, id_, id=id_, name=name)
        if id_ in editor_ids:
            editor = common.Editor(
                dataset=dataset, ord=position, contributor=contributor)
            DBSession.add(editor)

    # Transcribers which are not part of the hard-coded list above.
    for extra_id, extra_name in transcribers.items():
        data.add(common.Contributor, extra_id, id=extra_id, name=extra_name)

    for meaning_id in sorted(models.MEANINGS_ALL.keys()):
        data.add(
            models.Meaning,
            meaning_id,
            id=str(meaning_id),
            name=models.MEANINGS_ALL[meaning_id],
            core=meaning_id in models.MEANINGS)

    for text in wordlists:
        lang = models.Doculect.from_txt(text)
        if lang.classification_wals:
            family, genus = lang.classification_wals.split('.')
            lang.wals_family = wals_families.get(family)
            lang.wals_genus = wals_genera.get(slug(genus))
        lang.code_glottolog = glottocodes.get(lang.code_iso)
        add_codes(lang)
        data.add(models.Doculect, lang.id, _obj=lang)
        # Flush before lang.pk is used for the source links below.
        DBSession.flush()

        md = meta.pop(lang.id, None)
        assert md

        # Associate transcribers and sources with the doculect.
        for rank, transcriber in enumerate(md.transcribers, 1):
            common.ContributionContributor(
                contribution=lang.wordlist,
                contributor=data['Contributor'][transcriber],
                ord=rank)
        for source in md.sources:
            link = common.LanguageSource(
                language_pk=lang.pk, source_pk=sources[source].pk)
            DBSession.add(link)

    # Report metadata records that no wordlist claimed.
    print(list(meta.keys()))
Пример #10
0
def main(args):
    """Load the Comparative Siouan Dictionary into the database.

    Creates two collation-key indexes, the dataset record with its
    editors, one ``Languoid`` per item of ``_LANGUAGES`` and, for every
    dictionary entry with a non-blank ``lx`` field, a contribution, an
    ``Entry`` and the value sets and counterparts for each language listed
    in the entry.

    :param args: Script arguments; must provide ``data_file``.
    """
    sources = get_sources(args)
    Index('ducet1', collkey(common.Value.name)).create(DBSession.bind)
    Index('ducet2', collkey(models.Counterpart.phonetic)).create(DBSession.bind)
    data = Data()
    # Codes missing from the lookup yield (None, None) as coordinates.
    glottocodes, geocoords = {}, defaultdict(lambda: (None, None))
    for k, v in glottocodes_by_isocode(
            'postgresql://robert@/glottolog3',
            cols=['id', 'latitude', 'longitude']).items():
        glottocodes[k] = v[0]
        geocoords[k] = (v[1], v[2])
    # Hard-coded coordinates for the code 'win'.
    geocoords['win'] = (43.50, -88.50)

    dataset = common.Dataset(
        id=csd.__name__,
        name="Comparative Siouan Dictionary",
        description="Comparative Siouan Dictionary",
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='csd.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    # NOTE(review): this object is not referenced again in the code below;
    # `contrib` is rebound for every entry. Confirm whether it is needed.
    contrib = common.Contribution(id='csd', name=dataset.name)
    for i, spec in enumerate([
        ('Robert L. Rankin', True),
        ('Richard T. Carter', True),
        ('A. Wesley Jones', True),
        ('John E. Koontz', True),
        ('David S. Rood', True),
        ('Iren Hartmann', True),
    ]):
        name, primary = spec
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i, primary=primary))

    d = Dictionary(
        args.data_file(TXT),
        entry_impl=CsdEntry,
        entry_sep='\\lx ')
    # Keep only entries with a truthy 'lx' value and drop the first of those.
    d.entries = list(filter(lambda r: r.get('lx'), d.entries))[1:]
    print(len(d.entries))

    for i, v in enumerate(_LANGUAGES):
        l = data.add(
            models.Languoid, v[0],
            id=v[0],
            name=v[1],
            ord=i,
            color=v[4].lower(),
            # Three-letter ids starting with 'p' are flagged as proto-languages.
            proto=v[0].startswith('p') and len(v[0]) == 3,
            latitude=geocoords[v[2]][0],
            longitude=geocoords[v[2]][1],
            parent=data['Languoid'].get(v[5]))
        if v[2]:
            add_language_codes(data, l, v[2], glottocodes=glottocodes)
        # Hard-coded coordinate overrides for individual languoids.
        if l.id == 'pn':
            l.latitude, l.longitude = (42.75, -98.03)
        if l.id == 'op':
            l.latitude, l.longitude = (43.5, -96.6)
        if l.id == 'mo':
            l.latitude, l.longitude = (40.05, -95.52)

    # Entry names used so far; needed to disambiguate repeated lemmas.
    pnames = set()

    def _get(d, marker):
        """Return the distinct non-empty values stored under ``marker``.

        For the markers ``'oo'`` and ``'or'`` a list of values is returned;
        for every other marker exactly one distinct value is expected
        (asserted) and returned on its own. Returns ``None`` when there is
        no value at all.
        """
        _l = set(nfilter(d.get(marker, [])))
        if _l:
            _l = list(_l)
            if marker not in ['oo', 'or']:
                assert len(_l) == 1
                _l = _l[0]
            return _l

    def add_counterpart(d, vs, id,
                        phonetic,  # forms
                        cognate,  # oo
                        me, cm, so, org):
        """Create a ``Counterpart`` in value set ``vs`` and attach its references.

        :param d: Word record the counterpart is derived from; consulted for \
        the ``'or'`` marker when the language id is ``'psi'``.
        :param vs: Value set the counterpart belongs to.
        :param id: Identifier of the new counterpart.
        :param phonetic: Phonetic form(s); may be empty if ``cognate`` is given.
        :param cognate: Value used as the counterpart's name; may be empty if \
        ``phonetic`` is given.
        :param me: Description; defaults to the bracketed parameter name.
        :param cm: Comment.
        :param so: Source specification; split with ``SEP_PATTERN`` and each \
        part matched against ``SID_PATTERN``.
        :param org: Original entry text.
        """
        assert phonetic or cognate

        if not cognate:
            if vs.language.proto:
                # For proto-languages the form itself serves as the name.
                cognate = phonetic
                phonetic = None
            else:
                cognate = '[%s]' % phonetic
        m = models.Counterpart(
            id=id,
            name=cognate,
            phonetic=phonetic,
            description=me or '[%s]' % vs.parameter.name,
            comment=cm,
            original_entry=org,
            other_reconstructions='; '.join(_get(d, 'or') or []) if vs.language.id == 'psi' else None,
            valueset=vs)
        if so:
            for sid in nfilter([s.strip() for s in SEP_PATTERN.split(so)]):
                match = SID_PATTERN.match(sid)
                if not match:
                    # Skip parts which do not look like a source reference.
                    continue

                name = sid
                sid = normalize_sid(match.group('key'))
                source = data['Source'].get(sid)
                if not source:
                    if sid in sources:
                        s = sources[sid]
                        source = data.add(
                            common.Source, sid,
                            id=sid,
                            name=s['Name'].upper() if len(s['Name']) <= 3 else s['Name'],
                            description=s.get('Title', s['citation']),
                            author=s.get('Author'),
                            title=s.get('Title'),
                            year=s.get('Year'),
                        )
                    else:
                        # Unknown source: create a stub named after the raw
                        # reference text.
                        source = data.add(
                            common.Source, sid,
                            id=sid,
                            name=name.upper() if len(name) <= 3 else name)
                m.references.append(models.ValueReference(
                    source=source, description=match.group('pages')))

    # The sorted list is built once, before the loop body rebinds `d` below.
    for i, entry in enumerate(sorted(d.entries, key=lambda d: d.get('lx'), reverse=True)):
        lemma = entry.get('lx')
        if not lemma or not lemma.strip():
            continue
        # Make the entry name unique by appending a running number.
        pname = lemma
        j = 1
        while pname in pnames:
            pname = '%s (%s)' % (lemma, j)
            j += 1
        pnames.add(pname)
        contrib = data.add(
            common.Contribution, pname, id=str(i + 1), name='Entry "%s"' % pname)
        meaning = data.add(
            models.Entry, pname,
            id=str(i + 1),
            name=pname,
            contribution=contrib,
            description=entry.get('com'),
            psi_reconstruction_with_root_extension_code=entry.get('lxcm'),
            sd=normalize_comma_separated(entry.get('sd'), SD, lower=True),
            ps=normalize_comma_separated(entry.get('ps'), PS),
            othlgs='\n---\n'.join(entry.getall('othlgs')))
        if meaning.description:
            meaning.description = meaning.description.replace('.\n', '.\n\n')

        for lid, words in entry.get_words().items():
            vsid = '%s-%s' % (lid, meaning.id)
            vs = data.add(
                common.ValueSet, vsid,
                id=vsid,
                parameter=meaning,
                contribution=contrib,
                language=data['Languoid'][lid])

            # NOTE: `d` is rebound to the current word record from here on.
            for j, d in enumerate(words):
                looped = False

                # One counterpart per 'oo' value, paired position-wise with
                # the values of the other markers.
                for k, (oo, me, so, cm, org) in enumerate(izip_longest(
                        *[d.get(_m, []) for _m in 'oo me so cm _org'.split()])):
                    if not oo:
                        continue
                    looped = True
                    add_counterpart(
                        d,
                        vs,
                        '%s-%s-%s' % (vsid, j + 1, k + 1),
                        d['forms'],
                        oo,
                        me,
                        cm,
                        so,
                        org)

                if not looped:  # not oo
                    if not d['forms']:
                        # Single-argument call form: valid on Python 2 and 3
                        # and prints the same text as the former print
                        # statement.
                        print('---> %s' % (d,))
                        continue
                    add_counterpart(
                        d,
                        vs,
                        '%s-%s-%s' % (vsid, j + 1, 1),
                        d['forms'],
                        '; '.join(_get(d, 'oo') or []),
                        _get(d, 'me'),
                        _get(d, 'cm'),
                        _get(d, 'so'),
                        _get(d, '_org'))