Example #1
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=u'An Crúbadán',
        name=u'An Crúbadán',
        publisher_name="Saint Louis University",
        publisher_place="Saint Louis, USA",
        publisher_url="http://www.slu.edu/",
        description="Linguistic datasets for over 2000 languages created from web-crawled text corpora",
        contact="*****@*****.**",
        license='http://creativecommons.org/licenses/by/4.0/',
        jsondata={
            'license_icon':
            'https://licensebuttons.net/l/by/4.0/88x31.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License',
        },
        domain='crubadan.org',
    )

    DBSession.add(dataset)
    DBSession.flush()

    editor = data.add(common.Contributor,
                      "Kevin Scannell",
                      id="Kevin Scannell",
                      name="Kevin Scannell",
                      email="*****@*****.**")
    common.Editor(dataset=dataset, contributor=editor, ord=0)
    DBSession.flush()

    fillTable(DBSession)
Example #2
def test_add_language_codes(env):
    from clld.db.models.common import Language
    from clld.scripts.util import Data, add_language_codes

    add_language_codes(Data(),
                       Language(),
                       'iso',
                       glottocodes=dict(iso='glot1234'))
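Example #13 below builds ISO and Glottolog identifiers by hand; a helper like add_language_codes presumably wraps that same pattern. A rough, hypothetical sketch (the _add_codes name and the exact keyword handling are assumptions, not the library's actual implementation):

from clld.db.meta import DBSession
from clld.db.models import common


def _add_codes(data, lang, isocode=None, glottocode=None):
    # One Identifier per code, linked to the language via LanguageIdentifier,
    # mirroring the manual loop over language_code rows in Example #13.
    for type_, code in [('iso639-3', isocode),
                        (common.IdentifierType.glottolog.value, glottocode)]:
        if code:
            identifier = data.add(common.Identifier, code,
                                  id=code, type=type_, name=code)
            DBSession.add(common.LanguageIdentifier(language=lang,
                                                    identifier=identifier))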
Example #3
    def test_Data(self):
        from clld.db.models.common import Language
        from clld.scripts.util import Data

        session = set()
        with patch('clld.scripts.util.DBSession', session):
            d = Data(jsondata={})
            d.add(Language, 'l', id='l', name='l')
            assert session
            d.add(Language, 'l2', _obj=5)
            self.assertRaises(ValueError, d.add, Language, 'l3', id='l.3')
Example #4
def test_Data(mocker):
    from clld.db.models.common import Language
    from clld.scripts.util import Data

    session = set()
    mocker.patch('clld.scripts.util.DBSession', session)
    d = Data(jsondata={})
    d.add(Language, 'l', id='l', name='l')
    assert session
    d.add(Language, 'l2', _obj=5)
    with pytest.raises(ValueError):
        d.add(Language, 'l3', id='l.3')
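Examples #3 and #4 pin down a small contract for Data: add() instantiates the model (or accepts a ready-made object via _obj=), registers it with the patched DBSession, caches it under the class name for lookups like data['Language'][...] in the examples below, and rejects ids containing a dot. A minimal, self-contained sketch of that contract (a hypothetical DataSketch stand-in, not the actual clld.scripts.util.Data code; it takes the session explicitly instead of using the module-global DBSession that the tests patch):

from collections import defaultdict


class DataSketch(defaultdict):
    """Toy registry covering only the behaviour the tests above exercise."""

    def __init__(self, session, **kw):
        super().__init__(dict)   # values are per-model dicts, e.g. self['Language']
        self.session = session
        self.kw = kw             # Data(jsondata={}) in the tests passes extra kwargs like this

    def add(self, model, key, _obj=None, **kw):
        if '.' in kw.get('id', ''):
            raise ValueError('invalid id: %s' % kw['id'])  # expected by the tests
        obj = _obj if _obj is not None else model(**kw)
        self.session.add(obj)    # a plain set() suffices, as in the tests
        self[model.__name__][key] = obj
        return obj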
Example #5
def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    repos = Path(
        os.path.expanduser('~')).joinpath('venvs/lexirumah/lexirumah-data')

    with transaction.manager:
        dataset = common.Dataset(
            id=lexirumah.__name__,
            name="lexirumah",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexirumah.model-ling.eu',
            contact='*****@*****.**',
            jsondata={
                'license_icon':
                'cc-by.png',
                'license_name':
                'Creative Commons Attribution 4.0 International License'
            })
        DBSession.add(dataset)

    glottolog_repos = Path(
        lexirumah.__file__).parent.parent.parent.parent.joinpath(
            'glottolog3', 'glottolog')
    languoids = {l.id: l for l in Glottolog(glottolog_repos).languoids()}
    concepticon = Concepticon(
        Path(lexirumah.__file__).parent.parent.parent.parent.joinpath(
            'concepticon', 'concepticon-data'))
    conceptsets = {c.id: c for c in concepticon.conceptsets.values()}

    skip = True
    for dname in sorted(repos.joinpath('datasets').iterdir(),
                        key=lambda p: p.name):
        #if dname.name == 'benuecongo':
        #    skip = False
        #if skip:
        #    continue
        if dname.is_dir() and dname.name != '_template':
            mdpath = dname.joinpath('cldf', 'metadata.json')
            if mdpath.exists():
                print(dname.name)
                import_cldf(dname, load(mdpath), languoids, conceptsets)

    with transaction.manager:
        load_families(Data(),
                      DBSession.query(LexiRumahLanguage),
                      glottolog_repos=glottolog_repos,
                      isolates_icon='tcccccc')
Example #6
def main(args):
    data = Data()
    data.add(
        common.Dataset,
        "starling",
        id="starling",
        name="Starling",
        domain="starling.rinet.ru",
        description=" The Global Lexicostatistical Database",
        publisher_name="Russian State University for the Humanities, Moscow")
    data.add(common.Contribution, "starling", name="Starling", id="starling")

    def row_to_dict(row_entry):
        swadesh_id_idx, swadesh_word_idx, form_idx, cognation_idx, notes_idx = range(5)
        return {
            "swadesh_id": row_entry[swadesh_id_idx].value,
            "swadesh_word": row_entry[swadesh_word_idx].value,
            "form": row_entry[form_idx].value,
            "cognation_index": row_entry[cognation_idx].value,
            "notes": row_entry[notes_idx].value,
        }

    data_dir = "./gld/scripts/data/"
    for path in os.listdir(data_dir):
        data_file_path = os.path.join(data_dir, path)
        book = load_workbook(data_file_path)
        sheet = book.active
        lang_name = sheet["C1"].value
        data.add(common.Language,
                 lang_name,
                 id=lang_name,
                 name=lang_name,
                 latitude=52.0,
                 longitude=0.0)

        fields = [
            "swadesh_id", "swadesh_word", "form", "cognation_index", "notes"
        ]
        for row in sheet.iter_rows(min_row=2, min_col=1):
            row_data = row_to_dict(row)
            w = data.add(Word,
                         "%s_%s" % (row_data["swadesh_id"], row_data["form"]),
                         name=row_data["form"],
                         description="Description",
                         jsondata={k: row_data[k]
                                   for k in fields})
            w.language = data["Language"][lang_name]

        DBSession.flush()
Example #7
def import_cldf(srcdir, md, languoids, conceptsets):
    with transaction.manager:
        contrib = Provider(
            id=srcdir.name,
            name=md['dc:title'],
            description=md.get('dc:bibliographicCitation'),
            url=md.get('dc:identifier'),
            license=md.get('dc:license'),
            aboutUrl=md.get('aboutUrl'),
        )
        DBSession.add(contrib)
        sources = {}
        cldfdir = srcdir.joinpath('cldf')
        values = Data()
        for fname in tqdm(list(cldfdir.glob('*' + MD_SUFFIX)), leave=False):
            ds = Dataset.from_metadata(fname)
            for src in ds.sources.items():
                if src.id not in sources:
                    sources[src.id] = cldf2clld(src, contrib, len(sources) + 1)
            import_dataset(ds, contrib, languoids, conceptsets, sources,
                           values)
            DBSession.flush()
        # import cognates:
        if cldfdir.joinpath('cognates.csv').exists():
            for csid, cognates in groupby(
                    reader(cldfdir.joinpath('cognates.csv'), dicts=True),
                    lambda i: i['Cognate_set_ID']):
                cs = Cognateset(id=unique_id(contrib, csid),
                                contribution=contrib)
                for cognate in cognates:
                    cp = values['Counterpart'].get(cognate['Word_ID'])
                    if cp:
                        DBSession.add(
                            CognatesetCounterpart(
                                cognateset=cs,
                                counterpart=cp,
                                cognate_detection_method=cognate[
                                    'Cognate_detection_method'],
                                alignment=cognate['Alignment'],
                                alignment_method=cognate['Alignment_method'],
                                doubt=cognate['Doubt'] == 'True'))
Example #8
    def test_load_families(self):
        from clld_glottologfamily_plugin.util import load_families

        class Languoid(object):
            id = 'abcd1234'
            iso_code = 'abc'
            name = 'language'
            latitude = 1.0
            longitude = 1.0
            macroareas = ['Area']

            @property
            def family(self):
                return self

        class Glottolog(object):
            def languoid(self, code):
                return Languoid()

        load_families(Data(),
                      DBSession.query(LanguageWithFamily),
                      glottolog=Glottolog())
Example #9
def main(args):
    data = Data()

    dataset = common.Dataset(id=moslex.__name__,
                             domain='moslex.clld.org',
                             name='MosLex',
                             license='https://creativecommons.org'
                             '/licenses/by-nc/4.0/',
                             jsondata={
                                 'license_icon':
                                 'cc-by-nc.png',
                                 'license_name':
                                 'Creative Commons '
                                 'Attribution-NC 4.0 '
                                 'International License'
                             })

    editor = data.add(common.Contributor,
                      'editor',
                      id='editor',
                      name='Alexei Kassian')
    common.Editor(dataset=dataset, contributor=editor)

    with open('languoids.json') as file:
        languoids = json.load(file)

    with open('concepts.json') as file:
        concepts = json.load(file)

    with open('forms.json') as file:
        forms = json.load(file)

    add_languoids(data, languoids)
    add_concepts(data, concepts)
    add_forms(data, forms)

    DBSession.add(dataset)
Example #10
def main(args):
    data = Data()

    dataset = common.Dataset(id=moslex.__name__,
                             domain='moslex.clld.org',
                             name='MosLex',
                             license='https://creativecommons.org'
                             '/licenses/by-nc/4.0/',
                             jsondata={
                                 'license_icon':
                                 'cc-by-nc.png',
                                 'license_name':
                                 'Creative Commons '
                                 'Attribution-NC 4.0 '
                                 'International License'
                             })

    editor = data.add(common.Contributor,
                      'editor',
                      id='editor',
                      name='Alexei Kassian',
                      email='*****@*****.**')
    common.Editor(dataset=dataset, contributor=editor)

    with open(os.environ['MOSLEX_CONCEPTS']) as file:
        concepts = json.load(file)
    add_concepts(data, concepts)

    data_folders = [
        path
        for path in glob.glob(os.path.join(os.environ['MOSLEX_DATA'], '*'))
        if os.path.isdir(path)
    ]
    for folder in data_folders:
        add_data_folder(folder, data)

    DBSession.add(dataset)
Example #11
def main(args):
    fts.index('fts_index', Word.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    data = Data()

    dataset = common.Dataset(
        id=dictionaria.__name__,
        name="Dictionaria",
        description="The Dictionary Journal",
        published=date(2017, 3, 30),
        contact='*****@*****.**',
        domain='dictionaria.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    for i, (id_, name) in enumerate([
        ('haspelmathmartin', 'Martin Haspelmath'),
        ('moselulrike', 'Ulrike Mosel'),
        ('stiebelsbarbara', 'Barbara Stiebels')
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))

    comparison_meanings = {}

    print('loading concepts ...')

    glosses = set()
    concepticon = Concepticon(
        REPOS.joinpath('..', '..', 'concepticon', 'concepticon-data'))
    if not args.no_concepts:
        for conceptset in concepticon.conceptsets.values():
            if conceptset.gloss in glosses:
                continue
            glosses.add(conceptset.gloss)
            cm = data.add(
                ComparisonMeaning,
                conceptset.id,
                id=conceptset.id,
                name=conceptset.gloss.lower(),
                description=conceptset.definition,
                concepticon_url='http://concepticon.clld.org/parameters/%s' % conceptset.id)
            comparison_meanings[cm.id] = cm

    DBSession.flush()

    print('... done')

    comparison_meanings = {k: v.pk for k, v in comparison_meanings.items()}
    submissions = []

    for submission in REPOS.joinpath(
            'submissions-internal' if args.internal else 'submissions').glob('*'):
        if not submission.is_dir():
            continue

        try:
            submission = Submission(submission)
        except ValueError:
            continue

        md = submission.md
        if md is None:
            continue

        if not md['date_published']:
            continue

        id_ = submission.id
        if args.dict and args.dict != id_ and args.dict != 'all':
            continue
        lmd = md['language']
        props = md.get('properties', {})
        props.setdefault('custom_fields', [])
        props['metalanguage_styles'] = {}
        for v, s in zip(props.get('metalanguages', {}).values(),
                        ['success', 'info', 'warning', 'important']):
            props['metalanguage_styles'][v] = s
        props['custom_fields'] = ['lang-' + f if f in props['metalanguage_styles'] else f
                                  for f in props['custom_fields']]

        language = data['Variety'].get(lmd['glottocode'])
        if not language:
            language = data.add(
                Variety, lmd['glottocode'], id=lmd['glottocode'], name=lmd['name'])

        md['date_published'] = md['date_published'] or date.today().isoformat()
        if '-' not in md['date_published']:
            md['date_published'] = md['date_published'] + '-01-01'
        dictionary = data.add(
            Dictionary,
            id_,
            id=id_,
            number=md.get('number'),
            name=props.get('title', lmd['name'] + ' dictionary'),
            description=submission.description,
            language=language,
            published=date(*map(int, md['date_published'].split('-'))),
            jsondata=props)

        for i, spec in enumerate(md['authors']):
            if not isinstance(spec, dict):
                cname, address = spec, None
                spec = {}
            else:
                cname, address = spec['name'], spec.get('affiliation')
            name = HumanName(cname)
            cid = slug('%s%s' % (name.last, name.first))
            contrib = data['Contributor'].get(cid)
            if not contrib:
                contrib = data.add(
                    common.Contributor,
                    cid,
                    id=cid,
                    name=cname,
                    address=address,
                    url=spec.get('url'),
                    email=spec.get('email'))
            DBSession.add(common.ContributionContributor(
                ord=i + 1,
                primary=True,
                contributor=contrib,
                contribution=dictionary))

        submissions.append((dictionary.id, language.id, submission))
    transaction.commit()

    for did, lid, submission in submissions:
        #if submission.id != 'sidaama':
        #    continue
        transaction.begin()
        print('loading %s ...' % submission.id)
        dictdata = Data()
        lang = Variety.get(lid)
        submission.load_examples(Dictionary.get(did), dictdata, lang)
        submission.dictionary.load(
            submission,
            dictdata,
            Dictionary.get(did),
            lang,
            comparison_meanings,
            OrderedDict(submission.md.get('properties', {}).get('labels', [])))
        transaction.commit()
        print('... done')

    transaction.begin()
    load_families(
        Data(),
        [v for v in DBSession.query(Variety) if re.match('[a-z]{4}[0-9]{4}', v.id)],
        glottolog_repos='../../glottolog3/glottolog')
Example #12
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=plld_app.__name__,
        name=plld_app.__name__,
        domain='plld.clld.org',
        description="Database of Papuan Language and Culture",
    )
    DBSession.add(dataset)

    # Load the list of languages
    languages = pandas.ExcelFile(
        os.path.join(DBPATH, "Languages and Coordinates.xlsx")).parse(0)

    parameters = {}
    for i, language in languages.iterrows():
        # Generate the database object for each language
        print("\nCreating language", language['Language name (-dialect)'])
        lang = models.Lect(
            id=((language['Language name (-dialect)'].lower()[:4] + "x" +
                 newid()) if pandas.isnull(language['Glottolog']) else
                language['Glottolog'].strip()),
            region=language['Region'],
            family=language['Family'],
            name=language['Language name (-dialect)'].strip(),
            latitude=language['Lat'],
            longitude=language['Lon'])

        # Check what data files we have that say they are about that language.
        if pandas.isnull(language['ISO_code']):
            # The convention for file-names of varieties without iso
            # code is ad-hoc, skip those until we have established a
            # good convention.
            files_concerning = [
                file for file in os.listdir(DBPATH)
                if file.lower().startswith(language['Internal'].lower() + '_')
            ]
        else:
            # Otherwise, the convention is that languages are
            # described by files starting with their iso code and an
            # underscore.
            files_concerning = [
                file for file in os.listdir(DBPATH)
                if file.lower().startswith(language['ISO_code'].lower() + '_')
            ]

        # For each such language, we might have typological, sociolinguistic and lexical (small vocabulary or big vocabulary or kinship terms) data. Deal with them in order.

        # Try to load the corresponding typology questionnaire
        typology_files = [
            file for file in files_concerning if 'typolog' in file.lower()
        ]
        if len(typology_files) == 1:
            try:
                add_typological_data(typology_files[0], parameters, lang)
                print("Typological data read.")
            except UnexpectedTypologyFormatError:
                print(
                    "File", typology_files[0],
                    "had an unexpected format for a typology questionnaire!")
        else:
            print("There were not one, but", len(typology_files),
                  "possible questionnaires.")

        # Try to load the corresponding cultural features questionnaire
        culture_files = [
            file for file in files_concerning if 'cult' in file.lower()
        ]
        if len(culture_files) == 1:
            try:
                add_cultural_data(culture_files[0], parameters, lang)
                print("Cultural data read.")
            except UnexpectedCultureFormatError:
                print("File", culture_files[0],
                      "had an unexpected format for a culture questionnaire!")
        else:
            print("There were not one, but", len(culture_files),
                  "possible questionnaires.")
Example #13
def main(args):
    old_db = create_engine(DB)
    data = Data()

    #
    # migrate contributor table: complete
    #
    for row in old_db.execute("select * from contributor"):
        data.add(common.Contributor,
                 row['id'],
                 id=row['id'],
                 name='%(firstname)s %(lastname)s' % row,
                 url=row['homepage'],
                 description=row['note'],
                 email=row['email'],
                 address=row['address'])
    data.add(common.Contributor,
             'haspelmathmartin',
             id='haspelmathmartin',
             name="Martin Haspelmath",
             url="http://email.eva.mpg.de/~haspelmt/")
    DBSession.flush()

    dataset = common.Dataset(
        id='wold',
        name='WOLD',
        description='World Loanword Database',
        domain='wold.clld.org',
        published=date(2009, 8, 15),
        license='http://creativecommons.org/licenses/by/3.0/de/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'http://i.creativecommons.org/l/by/3.0/de/88x31.png',
            'license_name': 'Creative Commons Attribution 3.0 Germany License'
        })
    DBSession.add(dataset)

    for i, editor in enumerate(['haspelmathmartin', 'tadmoruri']):
        common.Editor(dataset=dataset,
                      contributor=data['Contributor'][editor],
                      ord=i + 1)

    #
    # migrate semantic_field table: complete
    #
    for row in old_db.execute("select * from semantic_field"):
        if row['id'] != 25:
            kw = dict((key, row[key]) for key in ['id', 'name', 'description'])
            data.add(models.SemanticField, row['id'], **kw)

    #
    # migrate language table: complete
    # recipient flag is replaced by vocabulary_pk!
    #
    for row in old_db.execute("select * from language order by id"):
        kw = dict((key, row[key]) for key in [
            'fm_dl_id', 'name', 'latitude', 'longitude', 'wals_equivalent',
            'affiliation', 'family', 'genus', 'countries'
        ])
        data.add(models.WoldLanguage, row['id'], id=str(row['id']), **kw)

    #
    # migrate language_code table: complete
    #
    for row in old_db.execute("select * from language_code"):
        _id = '%(type)s-%(code)s' % row
        data.add(common.Identifier,
                 _id,
                 id=_id,
                 type=row['type'],
                 name=row['code'])
        if row['type'] == 'iso639-3' and row['code'] in glottocodes:
            gc = glottocodes[row['code']]
            data.add(common.Identifier,
                     gc,
                     id=gc,
                     type=common.IdentifierType.glottolog.value,
                     name=gc)
    DBSession.flush()

    #
    # migrate language_code_language table: complete
    #
    for row in old_db.execute("select * from language_code_language"):
        _id = '%(type)s-%(code)s' % row
        data.add(common.LanguageIdentifier,
                 '%s-%s' % (_id, row['language_id']),
                 identifier_pk=data['Identifier'][_id].pk,
                 language_pk=data['WoldLanguage'][row['language_id']].pk)
        if row['type'] == 'iso639-3' and row['code'] in glottocodes:
            gc = glottocodes[row['code']]
            data.add(common.LanguageIdentifier,
                     '%s-%s' % (gc, row['language_id']),
                     identifier_pk=data['Identifier'][gc].pk,
                     language_pk=data['WoldLanguage'][row['language_id']].pk)
    DBSession.flush()

    #
    # migrate vocabulary table: complete
    #
    for row in old_db.execute("select * from vocabulary order by id"):
        jsondata = {}
        for key in row.keys():
            if key.startswith('fd_') or key in [
                    'other_information', 'abbreviations'
            ]:
                jsondata[key] = row[key]
        vocab = data.add(models.Vocabulary,
                         row['id'],
                         id=str(row['id']),
                         name=row['name'],
                         color=row['color'],
                         jsondata=jsondata)
        DBSession.flush()
        data['WoldLanguage'][row['language_id']].vocabulary_pk = vocab.pk
    DBSession.flush()

    #
    # migrate contact_situation and age tables: complete
    # contact situations and ages are unitdomainelements!
    #
    contact_situation = common.UnitParameter(id='cs', name='Contact Situation')
    age = common.UnitParameter(id='a', name='Age')

    DBSession.add(contact_situation)
    DBSession.add(age)
    DBSession.flush()

    for row in old_db.execute("select * from contact_situation"):
        if row['vocabulary_id'] is None:
            continue
        kw = dict((key, row[key]) for key in ['description', 'id', 'name'])
        kw['id'] = 'cs-%s' % kw['id']
        p = data.add(models.WoldUnitDomainElement, row['id'], **kw)
        p.vocabulary = data['Vocabulary'][row['vocabulary_id']]
        p.unitparameter_pk = contact_situation.pk

    for row in old_db.execute("select * from age"):
        id_ = '%(vocabulary_id)s-%(label)s' % row
        kw = dict((key, row[key]) for key in ['start_year', 'end_year'])
        p = data.add(models.WoldUnitDomainElement,
                     id_,
                     id='a-%s' % id_,
                     name=row['label'],
                     description=row['description'],
                     jsondata=kw)
        p.vocabulary = data['Vocabulary'][row['vocabulary_id']]
        p.unitparameter_pk = age.pk

    #
    # migrate meaning table: complete
    #
    for row in old_db.execute("select * from meaning"):
        kw = dict((key, row[key]) for key in [
            'description', 'core_list', 'ids_code', 'typical_context',
            'semantic_category'
        ])
        p = data.add(
            models.Meaning,
            row['id'],
            id=row['id'].replace('.', '-'),
            name=row['label'],
            sub_code=row['id'].split('.')[1] if '.' in row['id'] else '',
            semantic_field=data['SemanticField'][row['semantic_field_id']],
            **kw)
        DBSession.flush()

        for field in ['french', 'spanish', 'german', 'russian']:
            DBSession.add(
                models.Translation(name=row[field], lang=field, meaning=p))

        for key in data['WoldLanguage']:
            lang = data['WoldLanguage'][key]
            data.add(common.ValueSet,
                     '%s-%s' % (key, row['id']),
                     id='%s-%s' % (key, row['id'].replace('.', '-')),
                     language=lang,
                     contribution=lang.vocabulary,
                     parameter=p)

    DBSession.flush()

    #
    # migrate word table:
    # TODO: all the other word properties!!
    #
    fields = [
        'age_label',
        'original_script',
        'grammatical_info',
        'comment_on_word_form',
        'gloss',
        "comment_on_borrowed",
        "calqued",
        "borrowed_base",
        "numeric_frequency",
        "relative_frequency",
        "effect",
        "integration",
        "salience",
        "reference",
        "other_comments",
        "register",
        "loan_history",
        'colonial_word',
        'paraphrase_in_dutch',
        'word_source',
        'paraphrase_in_german',
        'lexical_stratum',
        'comparison_with_mandarin',
        'year',
        'comparison_with_korean',
        'czech_translation',
        'hungarian_translation',
        'early_romani_reconstruction',
        'etymological_note',
        'boretzky_and_igla_etymology',
        'manuss_et_al_etymology',
        'vekerdi_etymology',
        'turner_etymology',
        'other_etymologies',
        'mayrhofer_etymology',
    ]
    word_to_vocab = {}
    for row in old_db.execute("select * from word"):
        word_to_vocab[row['id']] = row['vocabulary_id']
        kw = dict((key, row[key]) for key in [
            'id', 'age_score', 'borrowed', 'borrowed_score', 'analyzability',
            'simplicity_score'
        ])
        w = data.add(models.Word,
                     row['id'],
                     name=row['form'],
                     description=row['free_meaning'],
                     jsondata={k: row[k]
                               for k in fields},
                     **kw)
        w.language = data['Vocabulary'][row['vocabulary_id']].language

        if row['age_label']:
            DBSession.add(
                common.UnitValue(
                    id='%(id)s-a' % row,
                    unit=w,
                    unitparameter=age,
                    unitdomainelement=data['WoldUnitDomainElement'][
                        '%(vocabulary_id)s-%(age_label)s' % row],
                    contribution=data['Vocabulary'][row['vocabulary_id']]))

        if row['contact_situation_id'] and row[
                'contact_situation_id'] != '9129144185487768':
            DBSession.add(
                common.UnitValue(
                    id='%(id)s-cs' % row,
                    unit=w,
                    unitparameter=contact_situation,
                    unitdomainelement=data['WoldUnitDomainElement'][
                        row['contact_situation_id']],
                    contribution=data['Vocabulary'][row['vocabulary_id']]))

    DBSession.flush()

    #
    # migrate word_meaning table: complete
    #
    for i, row in enumerate(old_db.execute("select * from word_meaning")):
        data.add(
            models.Counterpart,
            i,
            id=i,
            description='%(relationship)s (%(comment_on_relationship)s)' % row,
            name=data['Word'][row['word_id']].name,
            valueset=data['ValueSet']['%s-%s' % (word_to_vocab[row['word_id']],
                                                 row['meaning_id'])],
            word=data['Word'][row['word_id']])
    DBSession.flush()

    #
    # migrate vocabulary_contributor table: complete
    #
    for row in old_db.execute("select * from vocabulary_contributor"):
        DBSession.add(
            common.ContributionContributor(
                ord=row['ordinal'],
                primary=row['primary'],
                contributor_pk=data['Contributor'][row['contributor_id']].pk,
                contribution_pk=data['Vocabulary'][row['vocabulary_id']].pk))

    DBSession.flush()

    #
    # source words: we have to make sure a word does only belong to one language.
    # thus, we have to reassign identifier!
    #
    # loop over source_word, source_word_donor_language pairs keeping track of source_word ids:
    known_ids = {}

    for row in old_db.execute(
            "select sw.id, sw.meaning, sw.form, dl.language_id from source_word as sw, source_word_donor_language as dl where sw.id = dl.source_word_id"
    ):
        if row['id'] in known_ids:
            # source_word was already seen associated to a different donor language!
            assert row['language_id'] not in known_ids[row['id']]
            known_ids[row['id']].append(row['language_id'])
            id_ = '%s-%s' % (row['id'], len(known_ids[row['id']]))
        else:
            id_ = '%s-%s' % (row['id'], 1)
            known_ids[row['id']] = [row['language_id']]

        new = data.add(models.Word,
                       id_,
                       id=id_,
                       name=row['form'],
                       description=row['meaning'])
        new.language = data['WoldLanguage'][row['language_id']]

    # source words may end up as words without language!
    for row in old_db.execute(
            "select id, meaning, form from source_word where id not in (select source_word_id from source_word_donor_language)"
    ):
        id_ = '%s-%s' % (row['id'], 1)
        new = data.add(models.Word,
                       id_,
                       id=id_,
                       name=row['form'],
                       description=row['meaning'])

    DBSession.flush()

    #
    # migrate word_source_word relations
    # TODO: should be modelled as UnitParameter!
    #
    j = 0
    for row in old_db.execute("select * from word_source_word"):
        # there may be more than one word associated with a source_word_id (see above)
        source_words = []
        for i in range(4):  # but we guess no more than 4 :)
            id_ = '%s-%s' % (row['source_word_id'], i + 1)
            if id_ in data['Word']:
                source_words.append(data['Word'][id_])
        if not source_words:
            j += 1
            #print(row['source_word_id'])
            #raise ValueError(row['source_word_id'])

        for sw in source_words:
            DBSession.add(
                models.Loan(source_word=sw,
                            target_word=data['Word'][row['word_id']],
                            relation=row['relationship'],
                            certain=len(source_words) == 1))

    print('%s source words not migrated because they have no donor language!' %
          j)
Example #14
File: initializedb.py  Project: clld/abvd
def main(args):
    data = Data()

    dataset = common.Dataset(
        id=abvd.__name__,
        name='ABVD',
        description='',
        domain='abvd.clld.org',
        published=date.today(),
        license='https://creativecommons.org/licenses/by/4.0/',
        contact='',
        jsondata={
            'doi': args.doi,
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)

    for name in ['Simon Greenhill', 'Robert Blust', 'Russell Gray']:
        common.Editor(contributor=contributor(data, name), dataset=dataset)

    cnames = Counter()
    families = Counter([l['Family'] for l in args.cldf['LanguageTable']])
    colors = dict(
        zip([i[0] for i in families.most_common()], color.qualitative_colors(len(families))))
    cid2l = {}
    for lang in args.cldf['LanguageTable']:
        lid = (lang['Name'], lang['Glottocode'])
        l = data['Language'].get(lid)
        if not l:
            l = data.add(
                common.Language, lid,
                id=lang['ID'],
                name=lang['Name'],
                latitude=lang['Latitude'],
                longitude=lang['Longitude'],
                jsondata=dict(
                    family=lang['Family'],
                    icon='{0}{1}'.format('c' if lang['Family'] else 't', colors[lang['Family']]),
                ),
            )
            if lang['Glottocode'] or lang['ISO639P3code']:
                add_language_codes(
                    data, l, isocode=lang['ISO639P3code'], glottocode=lang['Glottocode'])
        cid2l[lang['ID']] = l
        cname = '{0} ({1})'.format(lang['Name'], lang['author'])
        cnames.update([cname])
        if cnames[cname] > 1:
            cname += ' {0}'.format(cnames[cname])
        c = data.add(
            models.Wordlist, lang['ID'],
            id=lang['ID'],
            name=cname,
            description=lang['author'],
            language=l,
            notes=lang['notes'],
        )
        i = 0
        typers = (lang['typedby'] or '').split(' and ')
        checkers = (lang['checkedby'] or '').split(' and ')
        for name in typers:
            i += 1
            DBSession.add(common.ContributionContributor(
                contribution=c,
                contributor=contributor(data, name),
                ord=i,
                jsondata=dict(type='typedby and checkedby' if name in checkers else 'typedby'),
            ))
        for name in checkers:
            if name in typers:
                continue
            i += 1
            DBSession.add(common.ContributionContributor(
                contribution=c,
                contributor=contributor(data, name),
                ord=i,
                jsondata=dict(type='checkedby'),
            ))

    for param in args.cldf['ParameterTable']:
        data.add(
            common.Parameter, param['ID'],
            id=param['ID'],
            name=param['Name'],
        )

    #
    # FIXME: add sources!
    #
    vsrs = set()
    for row in args.cldf['FormTable']:
        vs = data['ValueSet'].get((row['Language_ID'], row['Parameter_ID']))
        if not vs:
            vs = data.add(
                common.ValueSet,
                (row['Language_ID'], row['Parameter_ID']),
                id='{0}-{1}'.format(row['Language_ID'], row['Parameter_ID']),
                language=cid2l[row['Language_ID']],
                parameter=data['Parameter'][row['Parameter_ID']],
                contribution=data['Wordlist'][row['Language_ID']],
            )
        v = data.add(
            common.Value,
            row['ID'],
            id=row['ID'],
            name=row['Form'],
            valueset=vs
        )

    for row in args.cldf['CognateTable']:
        cc = data['Cognateset'].get(row['Cognateset_ID'])
        if not cc:
            cc = data.add(Cognateset, row['Cognateset_ID'], id=row['Cognateset_ID'])
        data.add(
            Cognate,
            row['ID'],
            cognateset=cc,
            counterpart=data['Value'][row['Form_ID']],
            doubt=row['Doubt'],
        )
Example #15
def main(args):
    data = Data()
    lotw_conn = sqlite3.connect("lotw_base.sqlite")
    lotw_base = lotw_conn.cursor()
    contrib = common.Contribution(id="initial_contrib",
                                  name="Initial contribution")

    dataset = common.Dataset(id=lotw_dev.__name__,
                             domain='lotw_dev.clld.org',
                             name="Languages of the World",
                             publisher_name="IL RAS",
                             publisher_place="Moscow",
                             publisher_url="http://iling-ran.ru/main/",
                             jsondata={
                                 'license_name': 'Creative Commons Attribution 4.0 International License'}
                             )
    DBSession.add(dataset)
    feature_dict = {}
    unnamed_feature_count = 0
    features = lotw_base.execute("SELECT * FROM Feature").fetchall()
    names = [y[2] for y in features]
    feat_name_counts = {x[2]: [names.count(x[2]), 0] for x in features if names.count(x[2]) > 1}

    # features = [convert_feature(x) for x in features]

    for feature in features:
        name = feature[2]
        # if name == ".О":
        #     continue
        if name in feat_name_counts.keys():
            temp_name = name
            name += ("_" + str(feat_name_counts[name][1]))
            feat_name_counts[temp_name][1] += 1

        feature_dict[feature[0]] = TreeFeature(pk=feature[0],
                                               id=feature[0],
                                               name=name,
                                               father_pk=feature[5])
        print("Added feature %s" % feature[2])

    langs = lotw_base.execute("SELECT * FROM Language").fetchall()
    assert len(set([lang[0] for lang in langs])) == len([lang[0] for lang in langs])
    for language in langs:
        value_sets = []
        geodata = lotw_base.execute("SELECT * FROM Geographical_index WHERE Lang_id=?", (str(language[0]), )).fetchone()
        famdata = lotw_base.execute("SELECT * FROM Genealogical_index WHERE Lang_id=?", (str(language[0]), )).fetchone()
        famname = lotw_base.execute("SELECT * FROM Family where Id=?", (famdata[2], )).fetchone()[1]
        branchname = lotw_base.execute("SELECT * FROM Branch where Id=?", (famdata[3], )).fetchone()[1]
        if not geodata:
            geodata = [0.0 for x in range(7)]
        data.add(lotw_devLanguage, language[0],
                 id=str(language[0]),
                 iso=language[3],
                 family=famname,
                 branch=branchname,
                 name=language[1],
                 latitude=geodata[5],
                 longitude=geodata[6])

        print("Added language %s" % language[3])
        # Lang_id=language["Lang_id"], Order_of_addition=language["Order_of_addition"],
                # Sorting_number=language["Sorting_number"], Code_ISO_639_3=language["Code_ISO_639_3"]
        language_features = lotw_base.execute("SELECT * FROM Binary_data WHERE Lang_id=? AND Feature_value=1", (str(language[0]), ))
        for l_feat in language_features.fetchall():
            feat_id = l_feat[0]
            try:
                feat_name = feature_dict[l_feat[2]].name
            except KeyError:
                continue

            vs = common.ValueSet(id=feat_id,
                                 language=data["lotw_devLanguage"][language[0]],
                                 parameter=feature_dict[l_feat[2]],
                                 contribution=contrib)
            DBSession.add(common.Value(id=feat_id,
                                       name=feat_name,
                                       valueset=vs))
            print("Added value %s" % feat_id)

    lotw_conn.close()
Example #16
def main(args):  # pragma: no cover
    ds = StructureDataset.from_metadata(DS)
    data = Data()
    for source in ds.sources:
        data.add(common.Source, source.id, _obj=bibtex2source(source))

    ext = [
        Record.from_string('@' + s, lowercase=True)
        for s in nfilter(BIB.split('@'))
    ]
    for rec in ext:
        if rec.id not in data['Source']:
            data.add(common.Source, rec.id, _obj=bibtex2source(rec))

    for contrib in ds['contributors.csv']:
        o = data.add(
            common.Contributor,
            contrib['ID'],
            id=contrib['ID'].upper(),
            name=contrib['Name'],
            description=contrib['Description'],
            url=contrib['URL'],
            jsondata={
                'readme': contrib['Readme'],
                'contents': contrib['Contents']
            },
        )
        for src in contrib['Source']:
            DBSession.add(
                models.ContributorReference(source=data['Source'][src],
                                            contributor=o))

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE 2.0',
        description='PHOIBLE 2.0',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='https://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'https://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, (cid, name) in enumerate([
        ('UZ', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
    ], start=1):
        contrib = data['Contributor'].get(cid)
        if not contrib:
            contrib = common.Contributor(id=cid, name=name)
        DBSession.add(
            common.Editor(dataset=dataset, ord=i, contributor=contrib))

    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog', 'glottolog'))

    for lang in ds['LanguageTable']:
        l = data.add(
            models.Variety,
            lang['ID'],
            id=lang['ID'],
            name=lang['Name'],
        )

    load_families(data, [(l.id, l)
                         for l in data['Variety'].values() if len(l.id) == 8],
                  glottolog.repos)
    DBSession.flush()

    # assign color codes:
    families = defaultdict(list)
    for l in data['Variety'].values():
        families[l.family_pk].append(l)

    colors = color.qualitative_colors(len(families))
    for i, langs in enumerate(sorted(families.values(),
                                     key=lambda v: -len(v))):
        for l in langs:
            l.jsondata = {'color': colors[i]}

    for segment in ds['ParameterTable']:
        equivalence_class = ''.join([
            t[0] for t in [(c, unicodedata.name(c)) for c in segment['Name']]
            if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
        ])
        data.add(models.Segment,
                 segment['ID'],
                 id=segment['ID'],
                 name=segment['Name'],
                 description=segment['Description'],
                 segment_class=segment['SegmentClass'],
                 equivalence_class=equivalence_class)
    DBSession.flush()

    # Add redirects for old language pages! get relevant ISO codes and map to Glottocode!
    for model, repls in load(
            Path(phoible.__file__).parent.parent /
            'replacements.json').items():
        if model == 'Language':
            languoids = {l.id: l for l in glottolog.languoids()}
            iso_languoids = {l.iso: l for l in languoids.values() if l.iso}
            gl_in_phoible = set(data['Variety'].keys())
            for oid, nid in repls.items():
                gls = descendants_from_nodemap(
                    iso_languoids.get(oid),
                    languoids).intersection(gl_in_phoible)
                if gls:
                    nid = gls.pop()
                    if len(gls) > 1:
                        print('+++', oid, gls)
                else:
                    print('---', oid)
                common.Config.add_replacement(oid, nid, common.Language)
        elif model == 'Parameter':
            segments_in_phoible = set(data['Segment'].keys())
            for oid, nid in repls.items():
                id_ = nid if nid in segments_in_phoible else None
                common.Config.add_replacement(oid, id_, common.Parameter)

    for segment in ds['ParameterTable']:
        for i, (k, v) in enumerate(sorted(segment.items())):
            if k not in ['ID', 'Name', 'Description', 'SegmentClass']:
                DBSession.add(
                    common.Parameter_data(
                        key=feature_name(k),
                        value=v,
                        ord=i,
                        object_pk=data['Segment'][segment['ID']].pk))

    for inventory in ds['contributions.csv']:
        inv = data.add(
            models.Inventory,
            inventory['ID'],
            id=inventory['ID'],
            name='{0} ({1} {2})'.format(
                inventory['Name'],
                inventory['Contributor_ID'].upper(),
                inventory['ID'],
            ),
            source_url=inventory['URL'],
            count_tone=inventory['count_tones'],
            count_vowel=inventory['count_vowels'],
            count_consonant=inventory['count_consonants'],
        )
        DBSession.add(
            common.ContributionContributor(
                contribution=inv,
                contributor=data['Contributor'][
                    inventory['Contributor_ID'].upper()]))
        for src in inventory['Source']:
            DBSession.add(
                common.ContributionReference(contribution=inv,
                                             source=data['Source'][src]))

    for phoneme in ds['ValueTable']:
        lang = data['Variety'][phoneme['Language_ID']]
        inv = data['Inventory'][phoneme['Contribution_ID']]
        if not inv.language:
            inv.language = lang
        vs = common.ValueSet(
            id=phoneme['ID'],
            contribution=inv,
            language=lang,
            parameter=data['Segment'][phoneme['Parameter_ID']])

        for ref in phoneme['Source']:
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))

        DBSession.add(
            models.Phoneme(
                id=phoneme['ID'],
                name='%s %s' %
                (phoneme['Value'],
                 data['Inventory'][phoneme['Contribution_ID']].name),
                allophones=' '.join(phoneme['Allophones']),
                marginal=phoneme['Marginal'],
                valueset=vs))

    return
Example #17
def load(args):
    fts.index('fts_index', models.Ref.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")

    dataset = common.Dataset(
        id='glottolog',
        name="Glottolog {0}".format(args.args[0]),
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='glottolog.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    data = Data()
    for i, (id_, name) in enumerate([
        ('hammarstroem', 'Harald Hammarström'),
        ('bank', 'Sebastian Bank'),
        ('forkel', 'Robert Forkel'),
        ('haspelmath', 'Martin Haspelmath'),
    ]):
        ed = data.add(common.Contributor, id_, id=id_, name=name)
        common.Editor(dataset=dataset, contributor=ed, ord=i + 1)
    DBSession.add(dataset)

    clf = data.add(common.Contribution, 'clf', id='clf', name='Classification')
    DBSession.add(common.ContributionContributor(
        contribution=clf, contributor=data['Contributor']['hammarstroem']))

    for pid, pname in [
        ('fc', 'Family classification'),
        ('sc', 'Subclassification'),
        ('vitality', 'Degree of endangerment'),
    ]:
        data.add(common.Parameter, pid, id=pid, name=pname)

    legacy = jsonlib.load(gc2version(args))
    for gc, version in legacy.items():
        data.add(models.LegacyCode, gc, id=gc, version=version)

    glottolog = args.repos
    for ma in Macroarea:
        data.add(
            models.Macroarea,
            ma.name,
            id=ma.name,
            name=ma.value,
            description=ma.description)

    for country in glottolog.countries:
        data.add(models.Country, country.id, id=country.id, name=country.name)

    lgcodes, mas, countries, lgsources = {}, {}, {}, defaultdict(list)
    languoids = list(glottolog.languoids())
    nodemap = {l.id: l for l in languoids}
    for lang in languoids:
        for ref in lang.sources:
            lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id)
        load_languoid(data, lang, nodemap)
        mas[lang.id] = [ma.name for ma in lang.macroareas]
        countries[lang.id] = [c.id for c in lang.countries]
        lgcodes[lang.id] = lang.id
        if lang.hid:
            lgcodes[lang.hid] = lang.id
        if lang.iso:
            lgcodes[lang.iso] = lang.id

    for gc in glottolog.glottocodes:
        if gc not in data['Languoid'] and gc not in legacy:
            common.Config.add_replacement(gc, None, model=common.Language)

    for obj in jsonlib.load(glottolog.references_path('replacements.json')):
        common.Config.add_replacement(
            '{0}'.format(obj['id']),
            '{0}'.format(obj['replacement']) if obj['replacement'] else None,
            model=common.Source)

    DBSession.flush()
    for lid, maids in mas.items():
        for ma in maids:
            DBSession.add(models.Languoidmacroarea(
                languoid_pk=data['Languoid'][lid].pk,
                macroarea_pk=data['Macroarea'][ma].pk))

    for lid, cids in countries.items():
        for cid in cids:
            DBSession.add(models.Languoidcountry(
                languoid_pk=data['Languoid'][lid].pk,
                country_pk=data['Country'][cid].pk))

    for doctype in glottolog.hhtypes:
        data.add(
            models.Doctype, doctype.id, id=doctype.id,
            name=doctype.name,
            description=doctype.description,
            abbr=doctype.abbv,
            ord=doctype.rank)

    for bib in glottolog.bibfiles:
        data.add(
            models.Provider,
            bib.id,
            id=bib.id,
            name=bib.title,
            description=bib.description,
            abbr=bib.abbr,
            url=bib.url)
    DBSession.flush()

    s = time()
    for i, entry in enumerate(
            BibFile(glottolog.build_path('monster-utf8.bib')).iterentries()):
        if i % 10000 == 0:
            args.log.info('{0}: {1:.3}'.format(i, time() - s))
            s = time()
        ref = load_ref(data, entry, lgcodes, lgsources)
        if 'macro_area' in entry.fields:
            for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True):
                ma = 'North America' if ma == 'Middle America' else ma
                ma = Macroarea.get('Papunesia' if ma == 'Papua' else ma)
                DBSession.add(models.Refmacroarea(
                    ref_pk=ref.pk, macroarea_pk=data['Macroarea'][ma.name].pk))
Example #18
def main(args):
    data = Data()
    data_path = lambda *cs: args.data_file('concepticon-data',
                                           'concepticondata', *cs)

    dataset = common.Dataset(
        id=concepticon.__name__,
        name="Concepticon 1.0",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='concepticon.clld.org',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })
    DBSession.add(dataset)
    for i, name in enumerate(
        ['Johann-Mattis List', 'Michael Cysouw', 'Robert Forkel']):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    english = data.add(common.Language, 'eng', id='eng', name='English')

    files = {}
    for fname in data_path('sources').iterdir():
        files[fname.stem] = \
            "https://github.com/clld/concepticon-data/blob/master/concepticondata/sources/%s" % fname.name

    for rec in Database.from_file(data_path('references', 'references.bib'),
                                  lowercase=True):
        source = data.add(common.Source, rec.id, _obj=bibtex2source(rec))
        if rec.id in files:
            DBSession.flush()
            DBSession.add(
                common.Source_files(mime_type='application/pdf',
                                    object_pk=source.pk,
                                    jsondata=dict(url=files[rec.id])))

    for concept in reader(data_path('concepticon.tsv'), namedtuples=True):
        data.add(models.ConceptSet,
                 concept.ID,
                 id=concept.ID,
                 name=concept.GLOSS,
                 description=concept.DEFINITION,
                 semanticfield=concept.SEMANTICFIELD,
                 ontological_category=concept.ONTOLOGICAL_CATEGORY)

    for rel in reader(data_path('conceptrelations.tsv'), namedtuples=True):
        DBSession.add(
            models.Relation(source=data['ConceptSet'][rel.SOURCE],
                            target=data['ConceptSet'][rel.TARGET],
                            description=rel.RELATION))

    unmapped = Counter()
    number_pattern = re.compile('(?P<number>[0-9]+)(?P<suffix>.*)')

    for cl in reader(data_path('conceptlists.tsv'), dicts=True):
        concepts = data_path('conceptlists', '%(ID)s.tsv' % cl)
        if not concepts.exists():
            continue
        langs = [l.lower() for l in split(cl['SOURCE_LANGUAGE'])]
        conceptlist = data.add(
            models.Conceptlist,
            cl['ID'],
            id=cl['ID'],
            name=' '.join(cl['ID'].split('-')),
            description=cl['NOTE'],
            target_languages=cl['TARGET_LANGUAGE'],
            source_languages=' '.join(langs),
            year=int(cl['YEAR']) if cl['YEAR'] else None,
        )
        for id_ in split(cl['REFS']):
            common.ContributionReference(source=data['Source'][id_],
                                         contribution=conceptlist)
        for i, name in enumerate(split(cl['AUTHOR'], sep=' and ')):
            name = strip_braces(name)
            contrib = data['Contributor'].get(name)
            if not contrib:
                contrib = data.add(common.Contributor,
                                   name,
                                   id=slug(name),
                                   name=name)
            DBSession.add(
                common.ContributionContributor(ord=i,
                                               contribution=conceptlist,
                                               contributor=contrib))
        for k in 'ID NOTE TARGET_LANGUAGE SOURCE_LANGUAGE YEAR REFS AUTHOR'.split():
            del cl[k]
        DBSession.flush()
        for k, v in cl.items():
            DBSession.add(
                common.Contribution_data(object_pk=conceptlist.pk,
                                         key=k,
                                         value=v))

        for concept in reader(concepts, namedtuples=True):
            if not concept.ID or not concept.CONCEPTICON_ID or concept.CONCEPTICON_ID == 'NAN':
                #print conceptlist.id, getattr(concept, 'ENGLISH', getattr(concept, 'GLOSS', None))
                unmapped.update([conceptlist.id])
                continue

            lgs = {}
            for lang in langs:
                v = getattr(concept, lang.upper())
                if v:
                    lgs[lang] = v

            match = number_pattern.match(concept.NUMBER)
            if not match:
                print(concept.ID)
                raise ValueError
            vs = common.ValueSet(
                id=concept.ID,
                description=getattr(concept, 'GLOSS',
                                    getattr(concept, 'ENGLISH', None)),
                language=english,
                contribution=conceptlist,
                parameter=data['ConceptSet'][concept.CONCEPTICON_ID])
            d = {}
            for key, value in concept.__dict__.items():
                if not key.startswith('CONCEPTICON_') and \
                        key not in ['NUMBER', 'ID', 'GLOSS'] + [l.upper() for l in langs]:
                    d[key.lower()] = value
            v = models.Concept(
                id=concept.ID,
                valueset=vs,
                description=getattr(concept, 'GLOSS',
                                    None),  # our own gloss, if available
                name='; '.join('%s [%s]' % (lgs[l], l)
                               for l in sorted(lgs.keys())),
                number=int(match.group('number')),
                number_suffix=match.group('suffix'),
                jsondata=d)
            DBSession.flush()
            for key, value in lgs.items():
                DBSession.add(
                    common.Value_data(key='lang_' + key,
                                      value=value,
                                      object_pk=v.pk))

    print('Unmapped concepts:')
    for clid, no in unmapped.most_common():
        print(clid, no)
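    # Each TSV under concept_set_meta/ is paired with a '<name>.tsv-metadata.json' file describing its provider.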

    for fname in data_path('concept_set_meta').iterdir():
        if fname.suffix == '.tsv':
            md = load(fname.parent.joinpath(fname.name + '-metadata.json'))
            provider = models.MetaProvider(id=fname.stem,
                                           name=md['dc:title'],
                                           description=md['dc:description'],
                                           url=md['dc:source'],
                                           jsondata=md)
            for meta in reader(fname, dicts=True):
                try:
                    for k, v in meta.items():
                        if v and k != 'CONCEPTICON_ID':
                            models.ConceptSetMeta(metaprovider=provider,
                                                  conceptset=data['ConceptSet']
                                                  [meta['CONCEPTICON_ID']],
                                                  key=k,
                                                  value=v)
                except:
                    print(fname)
                    print(meta)
                    raise
Example #19
def main(args):
    data = Data(created=utc.localize(datetime(2013, 11, 15)),
                updated=utc.localize(datetime(2013, 12, 12)))
    icons = issues.Icons()
    #print icons
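    # The DELETE statements below clear previously imported rows so the load starts from an empty database.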

    DBSession.execute("delete from Language")
    DBSession.execute("delete from Unit")
    DBSession.execute("delete from featuredomain")
    DBSession.execute("delete from family")
    DBSession.execute("delete from source")
    DBSession.execute("delete from parameter")
    DBSession.execute("delete from feature")
    DBSession.execute("delete from domainelement")
    DBSession.execute("delete from valueset")
    DBSession.execute("delete from value")
    DBSession.execute("delete from lsivalue")
    DBSession.execute("delete from dataset")
    DBSession.execute("delete from contributor")
    DBSession.execute("delete from lsilanguage")
    DBSession.execute("delete from contribution")
    DBSession.execute("delete from designer")

    DBSession.flush()

    dtab = partial(_dtab, args.data_file())

    #Languages

    #print args.data_file()
    #tabfns = ['%s' % fn.basename() for fn in args.data_file().files('nts_*.tab')]
    #tabfns = ['nts_18.tab']
    ##tabfns = os.listdir('/Users/virk/shafqat/postDoc-Swe/project/clld/clld/lsi/lsi/data')[1:]
    tabfns = os.listdir(
        '/Users/virk/shafqat/postDoc-Swe/project/clld/clld/lsi/lsi/lsi_data'
    )[1:]
    #print tabfns
    args.log.info("Sheets found: %s" % tabfns)
    ldps = []
    lgs = {}
    nfeatures = Counter()
    nlgs = Counter()

    for fn in tabfns:
        for ld in dtab(fn):

            if ld['language_id'] in ('qgr', '---', ''):  # exclude languages which do not have an iso-code
                continue
            if "feature_alphanumid" not in ld:
                args.log.info("NO FEATUREID %s %s" % (len(ld), ld))
            if not ld["feature_alphanumid"].startswith("DRS") \
                    and ld["feature_alphanumid"].find(".") == -1:
                ldps.append(dp_dict(ld))
                ##print ld
                lgs[ld['language_id']] = unescape(ld['language_name'])
                if ld["value"] != "?":
                    nfeatures.update([ld['language_id']])
                    nlgs.update([ld['feature_alphanumid']])

    ldps = sorted(ldps, key=lambda d: d['feature_alphanumid'])

    #lgs["ygr"] = "Hua"

    for lgid, lgname in lgs.items():
        data.add(models.lsiLanguage,
                 lgid,
                 id=lgid,
                 name=lgname,
                 representation=nfeatures.get(lgid, 0))
    DBSession.flush()
    #print "I am here"
    #print data['ntsLanguage'].values()[1].id
    load_families(
        data,
        ##[(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l) for l in data['lsiLanguage'].values()],
        [(NOCODE_TO_GLOTTOCODE.get(l.id, l.id), l)
         for l in data['lsiLanguage'].values()
         if l.id != '---' and l.id != ''],
        isolates_icon='tcccccc')
    #print 'family'
    #print data['Family'].get('sino1245').jsondata
    #Domains
    for domain in set(ld['feature_domain'] for ld in ldps):
        data.add(models.FeatureDomain, domain, name=domain)
    DBSession.flush()

    #Designers
    #for i, info in enumerate(dtab("ntscontributions.tab") + dtab("ntscontacts.tab")):
    for i, info in enumerate([{
            'designer': 'shafqat',
            'domain': '',
            'pdflink': '',
            'citation': ''
    }, {
            'designer': '-',
            'domain': '',
            'pdflink': '',
            'citation': ''
    }]):
        designer_id = str(i + 1)
        data.add(models.Designer,
                 info['designer'],
                 id=designer_id,
                 name=designer_id,
                 domain=info["domain"],
                 contributor=info['designer'],
                 pdflink=info["pdflink"],
                 citation=info["citation"])
    DBSession.flush()

    #Sources
    '''for k, (typ, bibdata) in [
        ktfbib(bibsource) for ld in ldps
        if ld.get(u'bibsources') for bibsource in ld['bibsources'].split(",,,")
    ]:
        if k not in data["Source"]:
            data.add(common.Source, k, _obj=bibtex2source(Record(typ, k, **bibdata)))
    DBSession.flush()'''

    #Features
    fs = [(fid, mergeds(lds))
          for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])]

    fvdesc = [(fid, [
        (ld.get("feature_possible_values"), ld.get("fromfile")) for ld in lds
        if ld.get("feature_possible_values")
    ]) for fid, lds in groupby(ldps, key=lambda d: d['feature_alphanumid'])]
    fvdt = [(fid, grp2(vdescs)) for (fid, vdescs) in fvdesc]
    fvmis = [(fid, vdescs) for (fid, vdescs) in fvdt if len(vdescs) > 1]

    for _, dfsids in groupby(sorted(
        (f.get('feature_name', fid), fid) for fid, f in fs),
                             key=lambda t: t[0]):
        ##print [(k,v) for (k,v) in list(dfsids)],len(list(dfsids))
        assert len(list(dfsids)) == 1
    #print 'here is nlgs'

    for fid, f in fs:
        #print "lang name"
        #print ldps
        #print f.get('feature_possible_values', ""),
        if not fid.isdigit():
            args.log.info("NO INT FID %s" % f)
        feature = data.add(
            models.Feature,
            fid,
            id=fid,
            name=f.get('feature_name', f['feature_alphanumid']),
            doc=f.get('feature_information', ""),
            vdoc=f.get('feature_possible_values', ""),
            representation=nlgs.get(fid, 0),
            designer=data["Designer"][f['designer']],
            dependson=f.get("depends_on", ""),
            abbreviation=f.get("abbreviation", ""),
            featuredomain=data['FeatureDomain'][f["feature_domain"]],
            name_french=f.get('francais', ""),
            clarification=f.get(
                "draft of clarifying comments to outsiders (hedvig + dunn + harald + suzanne)",
                ""),
            alternative_id=f.get("old feature number", ""),
            jl_relevant_unit=f.get("relevant unit(s)", ""),
            jl_function=f.get("function", ""),
            jl_formal_means=f.get("formal means", ""),
            sortkey_str="",
            sortkey_int=int(fid))

        vdesclist = [veq.split("==") for veq in feature.vdoc.split("||")]
        vdesc = {v.replace(".", "-"): desc for [v, desc] in vdesclist}
        ##vdesc = {fmly+val2icon(v): desc for ((v,desc),fmly) in itertools.product([(vv,desc) for [vv, desc] in vdesclist],['c','d','f','t'])}

        vdesc.setdefault('?', 'Not known')
        if 'N/A' not in vdesc and feature.dependson:
            vdesc["N/A"] = "Not Applicable"
        vi = {v: i for (i, v) in enumerate(sorted(vdesc.keys()))}
        ##vicons = {f+val2icon(v):f+val2icon(v) for (v,f) in itertools.product(['0','1','2','3'],['c','d','f','t'])}
        ##vicons['?'] = 'c00ffff'
        ##vicons['N/A'] = 'c00ffff'
        ##vicons = icons.iconize(vi.keys())
        for (v, desc) in vdesc.items():
            #print v,vicons[v]
            data.add(common.DomainElement, (fid, v),
                     id='%s-%s' % (fid, v),
                     name=v,
                     description=desc,
                     jsondata={"icon": Colors[v]},
                     number=vi[v],
                     parameter=feature)
    DBSession.flush()

    for ((f, lg), ixs) in grp2([((ld['feature_alphanumid'], ld['language_id']),
                                 i) for i, ld in enumerate(ldps)]):
        ixvs = set([ldps[ix]['value'] for ix in ixs])
        if len(ixvs) == 1:
            continue
        args.log.warn("Dup value %s %s %s" %
                      (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile'])
                               for ix in ixs]))
        ##print "Dup value %s %s %s" % (f, lg, [(ldps[ix]['value'], ldps[ix]['fromfile'], ldps[ix].get('provenance')) for ix in ixs])
    errors = {}
    done = set()
    glottolog = Glottolog()
    for ld in ldps:

        ############################### for printing different map markers for different families for features: shafqat
        #print data['Family']

        language = data['lsiLanguage'][ld['language_id']]
        if isinstance(language, (tuple, list)) and len(language) == 2:
            code, language = language
        else:
            code = language.id
        if code != '-':
            gl_language = glottolog.languoid(code)
            if gl_language:
                gl_family = gl_language.family
                if gl_family:
                    family = data['Family'].get(gl_family.id)

        ##ld['value'] = ld['value']+'-'+str(family)
        ##ld['value'] = combineValueFamily(ld['value'],str(family))
        #print family
        #####################################
        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['lsiLanguage'][ld['language_id']]

        id_ = '%s-%s' % (parameter.id, language.id)
        if id_ in done:
            continue

        if (ld['feature_alphanumid'],
                ld['value']) not in data['DomainElement']:
            if not ld["value"].strip():
                continue
            info = (ld['feature_alphanumid'],
                    ld.get('feature_name', "[Feature Name Lacking]"),
                    ld['language_id'], ld['value'], ld['fromfile'])
            msg = u"%s %s %s %s %s not in the set of legal values ({0})" % info
            args.log.error(
                msg.format(
                    sorted([
                        y for (x, y) in data['DomainElement'].keys()
                        if x == ld['feature_alphanumid']
                    ])))
            ##print msg.format(sorted(
            ##  [y for (x, y) in data['DomainElement'].keys()
            ## if x == ld['feature_alphanumid']]))
            errors[(ld['feature_alphanumid'], ld['language_id'])] = info
            continue

        vs = common.ValueSet(
            id=id_,
            language=language,
            parameter=parameter,
            source=ld["source"] or None,
            ##contribution=parameter.designer
        )
        #print
        #print "this one"
        #print ld['value'],family
        models.lsiValue(
            id=id_,
            domainelement=data['DomainElement'][(ld['feature_alphanumid'],
                                                 ld['value'])],
            jsondata={
                "icon":
                data['DomainElement'][(ld['feature_alphanumid'],
                                       ld['value'])].jsondata,
                "family":
                FamilyCodes[str(family)]
            },
            comment=ld["comment"],
            valueset=vs,
            contributed_datapoint=ld["contributor"])
        done.add(id_)
        '''if not ld.get('bibsources'):
            if 'bibsources' not in ld:
                args.log.warn("no bibsource %s" % ld)
            continue
        for k, _ in [ktfbib(bibsource) for bibsource in ld['bibsources'].split(",,,")]:
            common.ValueSetReference(valueset=vs, source=data['Source'][k])'''
    DBSession.flush()

    #To CLDF
    cldf = {}
    for ld in ldps:
        parameter = data['Feature'][ld['feature_alphanumid']]
        language = data['lsiLanguage'][ld['language_id']]
        id_ = '%s-%s' % (parameter.id, language.id)
        if id_ not in done:
            continue
        dt = (lgs[ld['language_id']], ld['language_id'],
              ld['feature_alphanumid'] + ". " + ld['feature_name'],
              ld["value"])
        cldf[dt] = None

    tab = lambda rows: u''.join([u'\t'.join(row) + u"\n" for row in rows])
    savu(tab([("Language", "iso-639-3", "Feature", "Value")] + list(cldf.keys())),
         "lsi.cldf")

    args.log.info('%s Errors' % len(errors))

    dataset = common.Dataset(
        id="LSI",
        name='Linguistic Survey of India',
        publisher_name="Sprakbanken",
        publisher_place="Gothenburg",
        publisher_url="to be given",
        description="this is to be followed",
        domain='http://lsi.clld.org',
        published=date(2016, 5, 16),
        contact='*****@*****.**',
        license='http://creativecommons.org/licenses/by-nc-nd/2.0/de/deed.en',
        jsondata={
            'license_icon':
            'http://wals.info/static/images/cc_by_nc_nd.png',
            'license_name':
            'Creative Commons Attribution-NonCommercial-NoDerivs 2.0 Germany'
        })

    # disabled for experimental purposes, names were appearing multiple times
    for i, contributor in enumerate([
            common.Contributor(id="Lars Borin",
                               name="Lars Borin",
                               email="*****@*****.**"),
            common.Contributor(id="Shafqat Mumtaz Virk",
                               name="Shafqat Mumtaz Virk",
                               email="*****@*****.**"),
            common.Contributor(id="Anju Saxena",
                               name="Anju Saxena",
                               email="*****@*****.**"),
            common.Contributor(id="Harald Hammarstrom",
                               name="Harald Hammarstrom",
                               email="*****@*****.**")
    ]):
        #print i
        common.Editor(dataset=dataset, contributor=contributor, ord=i)
    '''cont1 = common.Contributor(
            id="Harald Hammarstrom",
            name="Harald Hammarstrom",
            email="*****@*****.**")
    cont2= common.Contributor(
            id="Shafqat Mumtaz Virk",
            name="Shafqat Mumtaz Virk",
            email="*****@*****.**")
    cont3 = common.Contributor(
            id="Lars Borin",
            name="Lars Borin",
            email="*****@*****.**")
    for contributor in [cont1,cont2,cont3]:
        common.Editor(dataset=dataset, contributor=contributor,ord=1)'''

    DBSession.add(dataset)
    DBSession.flush()
Example #20
def main(args):
    meta = parse_meta(args)
    print(len(meta))
    print(sum(len(m.sources) for m in meta.values()))
    sources = {}
    for m in meta.values():
        for s in m.sources:
            sources[s] = None
    print(len(sources), 'distinct')
    for i, s in enumerate(sources):
        sources[s] = get_source(s, i + 1)

    glottocodes = glottocodes_by_isocode('postgresql://robert@/glottolog3')

    data = Data()

    wals = create_engine('postgresql://robert@/wals3')
    wals_families = {}
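    # Index WALS families by both name and id so either can be used as a lookup key.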
    for row in wals.execute('select name, id from family'):
        wals_families[row[0]] = row[1]
        wals_families[row[1]] = row[1]

    #for item in reader(args.data_file('WALSFamilyAbbreviations.tab'), namedtuples=True, encoding='latin1'):
    #    name = item.FAMILY
    #    if name not in wals_families:
    #        name = slug(name)
    #        if name not in wals_families:
    #            print('missing wals family:', item.FAMILY)
    #            name = None
    #    if name:
    #        wals_families[item.ABBREVIATION] = wals_families[name]

    wals_genera = {
        row[0]: row[0]
        for row in wals.execute('select id from genus')
    }

    with args.data_file('listss17.txt').open(encoding='latin1') as fp:
        wordlists = ['\n'.join(lines) for lines in parse(fp)]

    dataset = common.Dataset(
        id=asjp.__name__,
        name="The ASJP Database",
        contact="*****@*****.**",
        description="The Automated Similarity Judgment Program",
        domain='asjp.clld.org',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        license='http://creativecommons.org/licenses/by/4.0/',
        jsondata={
            'license_icon':
            'cc-by.png',
            'license_name':
            'Creative Commons Attribution 4.0 International License'
        })
    DBSession.add(dataset)

    transcribers = get_transcriber_map(args)
    for i, spec in enumerate([
        ('SW', "Søren Wichmann"),
        ('AM', "André Müller"),
        ('AKW', "Annkathrin Wett"),
        ('VV', "Viveka Velupillai"),
        ('JB', "Julia Bischoffberger"),
        ('CB', "Cecil H. Brown"),
        ('EH', "Eric W. Holman"),
        ('SS', "Sebastian Sauppe"),
        ('ZM', "Zarina Molochieva"),
        ('PB', "Pamela Brown"),
        ('HH', "Harald Hammarström"),
        ('OB', "Oleg Belyaev"),
        ('JML', "Johann-Mattis List"),
        ('DBA', "Dik Bakker"),
        ('DE', "Dmitry Egorov"),
        ('MU', "Matthias Urban"),
        ('RM', "Robert Mailhammer"),
        ('AC', "Agustina Carrizo"),
        ('MSD', "Matthew S. Dryer"),
        ('EK', "Evgenia Korovina"),
        ('DB', "David Beck"),
        ('HG', "Helen Geyer"),
        ('PE', "Patience Epps"),
        ('AG', "Anthony Grant"),
        ('PS', "Paul Sidwell"),  # not in citation
        ('KTR', "K. Taraka Rama"),  # not in citation
        ('PV', "Pilar Valenzuela"),
        ('MD', "Mark Donohue"),  # not in citation
    ]):
        id_, name = spec
        if id_ in transcribers:
            assert name == transcribers.pop(id_)
        contributor = data.add(common.Contributor, id_, id=id_, name=name)
        if id_ in ['SW', 'CB', 'EH']:
            DBSession.add(
                common.Editor(dataset=dataset,
                              ord=i + 1,
                              contributor=contributor))
    for id_, name in transcribers.items():
        data.add(common.Contributor, id_, id=id_, name=name)

    for id_ in sorted(models.MEANINGS_ALL.keys()):
        data.add(models.Meaning,
                 id_,
                 id=str(id_),
                 name=models.MEANINGS_ALL[id_],
                 core=id_ in models.MEANINGS)

    for n, l in enumerate(wordlists):
        #if n > 100:
        #    break
        lang = models.Doculect.from_txt(l)
        if lang.classification_wals:
            family, genus = lang.classification_wals.split('.')
            lang.wals_family = wals_families.get(family)
            lang.wals_genus = wals_genera.get(slug(genus))
        lang.code_glottolog = glottocodes.get(lang.code_iso)
        add_codes(lang)
        data.add(models.Doculect, lang.id, _obj=lang)
        DBSession.flush()
        md = meta.pop(lang.id, None)
        assert md
        # associate transcribers and sources
        for i, transcriber in enumerate(md.transcribers):
            common.ContributionContributor(
                contribution=lang.wordlist,
                contributor=data['Contributor'][transcriber],
                ord=i + 1)
        for source in md.sources:
            DBSession.add(
                common.LanguageSource(language_pk=lang.pk,
                                      source_pk=sources[source].pk))

    assert not list(meta.keys())
Example #21
def main(args):
    data = Data()
    glottocodes, bibtex_keys = {}, defaultdict(set)
    for d in reader(
            args.data_file('repos', 'mappings',
                           'InventoryID-ISO-gcode-Bibkey-Source.tsv')):
        glottocodes[d['InventoryID']] = d['Glottocode']
        bibtex_keys[d['InventoryID']].add(d['BibtexKey'])

    glottolog = Glottolog(
        Path(phoible.__file__).parent.parent.parent.parent.joinpath(
            'glottolog3', 'glottolog'))
    languoids = {l.id: l for l in glottolog.languoids()}

    phonemes = sorted(list(
        reader(args.data_file('repos', 'data', 'phoible-by-phoneme.tsv'))),
                      key=lambda r: (r['InventoryID'], r['GlyphID']))

    inventories = defaultdict(set)
    for p in phonemes:
        if p['InventoryID'] in glottocodes:
            inventories[(languoids[glottocodes[p['InventoryID']]].name,
                         p['SpecificDialect'], p['Source'].upper())].add(
                             (p['InventoryID'], p['LanguageName']))

    inventory_names = {}
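    # Build display names; when several inventories share language, dialect and source,
    # disambiguate with the inventory's language name or a running number.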
    for (glname, dname, source), invids in inventories.items():
        if len(invids) == 1:
            invid, lname = invids.pop()
            inventory_names[invid] = name_in_source(glname,
                                                    dname) + ' [%s]' % source
        else:
            use_lname = len(set(r[1] for r in invids)) == len(invids)
            for i, (invid,
                    lname) in enumerate(sorted(invids,
                                               key=lambda j: int(j[0]))):
                disambiguation = ' %s' % (i + 1, )
                if use_lname:
                    disambiguation = ' (%s)' % lname
                inventory_names[invid] = name_in_source(
                    glname, dname) + '%s [%s]' % (disambiguation, source)

    for (invid, lname, dname, source), ps in groupby(
            phonemes, lambda p: (p['InventoryID'], p['LanguageName'], p[
                'SpecificDialect'], p['Source'])):
        if invid not in glottocodes:
            continue
        ps = list(ps)
        gc = glottocodes[invid]
        lang = data['Variety'].get(gc)
        if not lang:
            languoid = languoids[gc]
            lang = data.add(
                models.Variety,
                gc,
                id=gc,
                language_code=ps[0]['LanguageCode'],
                name=languoid.name,
                level=text_type(languoid.level.name),
                latitude=languoid.latitude,
                longitude=languoid.longitude,
            )
            if lang.latitude is None and languoid.level == Level.dialect:
                ll = get_language(languoid)
                lang.latitude = ll.latitude
                lang.longitude = ll.longitude

        contrib = data.add(
            models.Inventory,
            invid,
            id=invid,
            #language=lang,
            source=source,
            #source_url=source_urls.get(row.InventoryID),
            #internetarchive_url=ia_urls.get(row.InventoryID),
            name=inventory_names[invid],
            description=name_in_source(lname, dname))

    return  # NOTE: the code below this early return is currently not executed

    # FIXME: read from mappings file!
    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="https://www.shh.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(
            common.Editor(dataset=dataset,
                          ord=i + 1,
                          contributor=common.Contributor(id=spec[0],
                                                         name=spec[1])))

    #squibs = defaultdict(list)
    #for row in get_rows(args, 'Squib'):
    #    squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    # FIXME: group phoible-by-phoneme by LanguageCode, Source (make sure this is unique!)
    aggregated = list(
        reader(args.data_file('phoible-aggregated.tsv'),
               delimiter='\t',
               namedtuples=True))
    inventory_names = {}
    for key, items in groupby(sorted(aggregated,
                                     key=lambda t: (t.LanguageCode, t.Source)),
                              key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)

        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (lname, i +
                                                                    1, key[1])

    # pull in Glottolog families instead? or in addition?

    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                    if not genus:
                        #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                        family = family_map.get(
                            (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                        genus = genera[genus_id] = data.add(
                            models.Genus,
                            genus_id,
                            id=genus_id,
                            name=row.LanguageFamilyGenus,
                            description=family or row.LanguageFamilyRoot,
                            active=False,
                            root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot

                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            lang = data.add(models.Variety,
                            row.LanguageCode,
                            id=row.LanguageCode,
                            name=lnames[row.LanguageCode],
                            genus=genus,
                            country=strip_quotes(row.Country),
                            area=strip_quotes(row.Area),
                            latitude=coords[0],
                            longitude=coords[1],
                            jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data,
                               lang,
                               row.LanguageCode,
                               glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(common.Contributor,
                                   row.Source,
                                   id=row.Source,
                                   name=SOURCES[row.Source][0],
                                   description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(
                    models.ContributorReference(source=data['Source'][ref],
                                                contributor=contributor))

        contrib = data.add(models.Inventory,
                           row.InventoryID,
                           id=row.InventoryID,
                           language=lang,
                           source=row.Source,
                           source_url=source_urls.get(row.InventoryID),
                           internetarchive_url=ia_urls.get(row.InventoryID),
                           name=inventory_names[row.InventoryID],
                           description=row.LanguageName)

        DBSession.add(
            common.ContributionContributor(contribution=contrib,
                                           contributor=contributor))

        #for j, squib in enumerate(squibs.get(row.InventoryID, [])):
        #    f = common.Contribution_files(
        #        object=contrib,
        #        id='squib-%s-%s.pdf' % (contrib.id, j + 1),
        #        name='Phonological squib',
        #        description=squib,
        #        mime_type='application/pdf')
        #    assert f
        #    # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'),
                      namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
            segment = data.add(
                models.Segment,
                row.Phoneme,
                id=b16encode(md5(description.encode('utf-8')).digest()).decode('ascii'),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join([
                    t[0] for t in unicode_desc
                    if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
                ]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(id=row.PhonemeID,
                             contribution=inventory,
                             language=inventory.language,
                             parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))

        DBSession.add(
            common.Value(
                id=row.PhonemeID,
                name='%s %s' %
                (row.Phoneme, data['Inventory'][row.InventoryID].name),
                valueset=vs))
        DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(common.ContributionReference,
                     '%s-%s' % (inventory_id, ref),
                     source=data['Source'][ref],
                     contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(
            reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue

        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(
                    common.Parameter_data(
                        key=features[j],
                        value=value,
                        ord=j,
                        object_pk=data['Segment'][row[0]].pk))

    # FIXME: add allophones!

    DBSession.flush()
Example #22
        if len(res) > 100:
            break
    return res.strip()


def get_vs2008(args):  # pragma: no cover
    vs2008 = {}
    for row in reader(args.data_file('datapoints_2008.csv'), delimiter=','):
        vs2008[(row[0], '%sA' % row[1])] = int(row[2])
    return vs2008


E2008 = utc.localize(datetime(2008, 4, 21))
E2011 = utc.localize(datetime(2011, 4, 28))
E2013 = utc.localize(datetime(2013, 11, 15))
data = Data(created=E2008, updated=E2008)


def migrate(from_, to_, converter):  # pragma: no cover
    for row in DB.execute("select * from %s" % from_):
        res = converter(row)
        if not res:
            continue
        if isinstance(res, dict):
            DBSession.add(to_(**res))
        else:
            data.add(to_, res[0], **res[1])
    DBSession.flush()
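
# A minimal sketch of a converter usable with migrate(); the table and column
# names ('language', 'id', 'name') are assumptions for illustration only, not
# the actual 2008 schema.
def example_language_converter(row):  # hypothetical helper
    if not row['name']:
        return None  # falsy results are skipped by migrate()
    # returning a (key, kwargs) tuple routes the row through data.add()
    return row['id'], dict(id=row['id'], name=row['name'])
# usage: migrate('language', common.Language, example_language_converter)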


def main(args):  # pragma: no cover
Example #23
def main(args):
    data = Data()

    editors = OrderedDict()
    editors['Susanne Maria Michaelis'] = None
    editors['Philippe Maurer'] = None
    editors['Martin Haspelmath'] = None
    editors['Magnus Huber'] = None

    for row in read(args, 'People'):
        name = row['First name'] + ' ' if row['First name'] else ''
        name += row['Last name']
        kw = dict(
            name=name,
            id=slug('%(Last name)s%(First name)s' % row),
            url=row['Contact Website'].split()[0]
            if row['Contact Website'] else None,
            address=row['Comments on database'],
        )
        contrib = data.add(common.Contributor, row['Author ID'], **kw)
        if kw['name'] in editors:
            editors[kw['name']] = contrib

    DBSession.flush()

    dataset = common.Dataset(
        id='apics',
        name='APiCS Online',
        description='Atlas of Pidgin and Creole Language Structures Online',
        domain='apics-online.info',
        published=date(2013, 11, 4),
        license='http://creativecommons.org/licenses/by/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 3.0 Unported License'
        })
    DBSession.add(dataset)
    for i, editor in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=editor, ord=i + 1)

    colors = dict(
        (row['ID'], row['RGB_code']) for row in read(args, 'Colours'))

    abbrs = {}
    for id_, name in LGR_ABBRS.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for id_, name in {
            'C**T': 'clitic',
            'IMPF': 'imperfect',
            'INTERM': 'intermediate',
            'NCOMPL': 'noncompletive',
            'NONFUT': 'nonfuture',
            'NPROX': 'nonproximal',
            'NSG': 'nonsingular',
            'PP': 'past participle',
            'PROP': 'proprietive',
            'TMA': 'tense-mood-aspect',
    }.items():
        DBSession.add(common.GlossAbbreviation(id=id_, name=name))
        abbrs[id_] = 1

    for row in reader(args.data_file('non-lgr-gloss-abbrs.csv'),
                      delimiter=',',
                      namedtuples=True):
        for match in GLOSS_ABBR_PATTERN.finditer(row.standard):
            if match.group('abbr') not in abbrs:
                abbrs[match.group('abbr')] = 1
                DBSession.add(
                    common.GlossAbbreviation(id=match.group('abbr'),
                                             name=row.meaning))

    non_bibs = {}
    for row in read(args, 'References', 'Reference_ID'):
        if row['Reference_type'] == 'Non-bib':
            non_bibs[row['Reference_ID']] = row['Reference_name']
            continue

        if isinstance(row['Year'], int):
            year_int = row['Year']
            year = str(row['Year'])
        elif row['Year']:
            year_int = None
            for m in re.finditer('(?P<year>(1|2)[0-9]{3})', row['Year']):
                year_int = int(m.group('year'))
                break
            year = row['Year']
        else:
            year, year_int = None, None

        title = row['Article_title'] or row['Book_title']
        attrs = {}
        jsondata = {}
        for attr, field in {
                'Additional_information': 'note',
                'Article_title': 'title',
                'Book_title': 'booktitle',
                'City': 'address',
                'Editors': 'editor',
                'Full_reference': None,
                'Issue': None,
                'Journal': 'journal',
                'Language_codes': None,
                'LaTeX_cite_key': None,
                'Pages': 'pages',
                'Publisher': 'publisher',
                'Reference_type': 'type',
                'School': 'school',
                'Series_title': 'series',
                'URL': 'url',
                'Volume': 'volume',
        }.items():
            value = row.get(attr)
            if not isinstance(value, int):
                value = (value or '').strip()
            if attr == 'Issue' and value:
                try:
                    value = str(int(value))
                except ValueError:
                    pass
            if value:
                if field:
                    attrs[field] = value
                else:
                    jsondata[attr] = value
        p = data.add(common.Source,
                     row['Reference_ID'],
                     id=str(row['Reference_ID']),
                     name=row['Reference_name'],
                     description=title,
                     author=row['Authors'],
                     year=year,
                     year_int=year_int,
                     bibtex_type=getattr(EntryType, row['BibTeX_type']
                                         or 'misc'),
                     jsondata=jsondata,
                     **attrs)
        if p.bibtex_type.value == 'misc' and not p.description:
            p.description = p.note
        DBSession.flush()

    DBSession.flush()

    infobox = jsonload(args.data_file('infobox.json'))
    glottocodes = jsonload(args.data_file('glottocodes.json'))
    for row in read(args, 'Languages', 'Order_number'):
        lon, lat = [
            float(c.strip()) for c in row['map_coordinates'].split(',')
        ]
        kw = dict(
            name=row['Language_name'],
            id=str(row['Order_number']),
            latitude=lat,
            longitude=lon,
            region=row['Category_region'],
        )
        lect = data.add(models.Lect, row['Language_ID'], **kw)
        DBSession.flush()

        for i, item in enumerate(infobox[lect.id]):
            DBSession.add(
                common.Language_data(object_pk=lect.pk,
                                     ord=i,
                                     key=item[0],
                                     value=item[1]))

        if row["Languages_contribution_documentation::Lect_description_checked_status"] \
                != "Checked":
            print('unchecked! ---', row['Language_name'])

        desc = row.get(
            'Languages_contribution_documentation::Lect description', '')
        markup_desc = normalize_markup(row[
            'Languages_contribution_documentation::z_calc_GetAsCSS_Lect_description']
                                       )

        c = data.add(
            models.ApicsContribution,
            row['Language_ID'],
            id=str(row['Order_number']),
            name=row['Language_name'],
            description=desc,
            markup_description=markup_desc,
            survey_reference=data['Source'][row['Survey_reference_ID']],
            language=lect)

        for ext, label, mtype in [
            ('pdf', 'Glossed text', 'application/pdf'),
            ('mp3', 'Glossed text audio', 'audio/mpeg'),
        ]:
            fid = '%s-gt.%s' % (c.id, ext)
            if args.data_file('files', 'contribution', c.id, fid).exists():
                common.Contribution_files(object=c,
                                          id=fid,
                                          name=label,
                                          mime_type=mtype)
            else:
                print(label, 'missing for:', row['Language_name'])

        #
        # TODO: for michif, 75, add link http://www.youtube.com/watch?v=f0C4cODsSyE
        #

        iso = None
        if row['ISO_code'] and len(row['ISO_code']) == 3:
            iso = row['ISO_code'].lower()
            if 'iso:%s' % row['ISO_code'] not in data['Identifier']:
                data.add(common.Identifier,
                         'iso:%s' % row['ISO_code'],
                         id=row['ISO_code'].lower(),
                         name=row['ISO_code'].lower(),
                         type=common.IdentifierType.iso.value)

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=data['Identifier']['iso:%s' % row['ISO_code']]))

        if lect.id in glottocodes:
            identifier = data.add(common.Identifier,
                                  'gc:%s' % glottocodes[lect.id],
                                  id=glottocodes[lect.id],
                                  name=glottocodes[lect.id],
                                  type=common.IdentifierType.glottolog.value)

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=identifier))

        if row['Language_name_ethnologue']:
            if row['Language_name_ethnologue'] not in data['Identifier']:
                data.add(common.Identifier,
                         row['Language_name_ethnologue'],
                         id=iso
                         or 'ethnologue:%s' % row['Language_name_ethnologue'],
                         name=row['Language_name_ethnologue'],
                         type='ethnologue')

            DBSession.add(
                common.LanguageIdentifier(
                    language=data['Lect'][row['Language_ID']],
                    identifier=data['Identifier'][
                        row['Language_name_ethnologue']]))

    example_count = {}
    for row in read(args, 'Examples', 'Order_number'):
        assert row['Language_ID']
        lang = data['Lect'][row['Language_ID']]
        id_ = '%(Language_ID)s-%(Example_number)s' % row
        atext, gloss = igt(row)
        example_count[row['Language_ID']] = max(
            [example_count.get(row['Language_ID'], 1), row['Example_number']])
        p = add_sentence(
            args,
            data,
            id_,
            id='%s-%s' % (lang.id, row['Example_number']),
            name=row['Text'] or row['Analyzed_text'],
            description=row['Translation'],
            type=row['Type'].strip().lower() if row['Type'] else None,
            comment=row['Comments'],
            gloss=gloss,
            analyzed=atext,
            markup_text=normalize_markup(row['z_calc_Text_CSS']),
            markup_gloss=normalize_markup(row['z_calc_Gloss_CSS']),
            markup_comment=normalize_markup(row['z_calc_Comments_CSS']),
            markup_analyzed=normalize_markup(row['z_calc_Analyzed_text_CSS']),
            original_script=row['Original_script'],
            jsondata={
                'sort': row['Order_number'],
                'alt_translation': (row['Translation_other'] or '').strip()
                or None
            },
            language=lang)

        if row['Reference_ID']:
            if row['Reference_ID'] in data['Source']:
                source = data['Source'][row['Reference_ID']]
                DBSession.add(
                    common.SentenceReference(
                        sentence=p,
                        source=source,
                        key=source.id,
                        description=row['Reference_pages']))
            else:
                p.source = non_bibs[row['Reference_ID']]

    DBSession.flush()

    for row in read(args, 'Language_references'):
        if row['Reference_ID'] not in data['Source']:
            assert row['Reference_ID'] in non_bibs
            continue
        assert row['Language_ID'] in data['ApicsContribution']
        source = data['Source'][row['Reference_ID']]
        DBSession.add(
            common.ContributionReference(
                contribution=data['ApicsContribution'][row['Language_ID']],
                source=source,
                description=row['Pages'],
                key=source.id))

    #
    # global counter for features - across feature types
    #
    feature_count = 0
    for row in read(args, 'Features', 'Feature_number'):
        id_ = str(row['Feature_number'])
        if int(id_) > feature_count:
            feature_count = int(id_)
        wals_id = None
        desc = row['Feature_annotation_publication']
        if row['WALS_match'] == 'Total':
            if isinstance(row['WALS_No.'], int):
                wals_id = row['WALS_No.']
            else:
                wals_id = int(row['WALS_No.'].split('.')[0].strip())

        p = data.add(models.Feature,
                     row['Feature_code'],
                     name=row['Feature_name'],
                     id=id_,
                     description=desc,
                     markup_description=normalize_markup(
                         row['z_calc_Feature_annotation_publication_CSS']),
                     feature_type='primary',
                     multivalued=row['Value_relation_type'] != 'Single',
                     area=row['Feature_area'],
                     wals_id=wals_id)

        names = {}
        for i in range(1, 10):
            if not row['Value%s_publication' % i] \
                    or not row['Value%s_publication' % i].strip():
                continue
            name = row['Value%s_publication' % i].strip()
            if name in names:
                name += ' (%s)' % i
            names[name] = 1
            de = data.add(
                common.DomainElement,
                '%s-%s' % (row['Feature_code'], i),
                id='%s-%s' % (id_, i),
                name=name,
                parameter=p,
                abbr=row['Value%s_for_book_maps' % i] if p.id != '0' else name,
                number=int(row['Value%s_value_number_for_publication' % i]),
                jsondata={'color': colors[row['Value_%s_colour_ID' % i]]},
            )
            assert de

        if row['Authors_FeatureArticles']:
            authors, _ = row['Authors_FeatureArticles'].split('and the APiCS')
            authors = authors.strip()
            if authors.endswith(','):
                authors = authors[:-1].strip()
            for i, name in enumerate(authors.split(',')):
                assert name.strip() in editors
                p._authors.append(
                    models.FeatureAuthor(ord=i + 1,
                                         contributor=editors[name.strip()]))

        DBSession.flush()

    primary_to_segment = {123: 63, 126: 35, 128: 45, 130: 41}
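    # segment_to_primary is the inverse lookup: segment feature number -> primary feature number.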
    segment_to_primary = dict(
        zip(primary_to_segment.values(), primary_to_segment.keys()))
    number_map = {}
    names = {}
    for row in read(args, 'Segment_features', 'Order_number'):
        symbol = row['Segment_symbol']
        if row['Segment_name'] == 'voiceless dental/alveolar sibilant affricate':
            symbol = 't\u0361s'
        truth = lambda s: s and s.strip().lower() == 'yes'
        name = '%s - %s' % (symbol, row['Segment_name'])

        if name in names:
            number_map[row['Segment_feature_number']] = names[name]
            continue

        number_map[
            row['Segment_feature_number']] = row['Segment_feature_number']
        names[name] = row['Segment_feature_number']
        feature_count += 1
        if row['Segment_feature_number'] in segment_to_primary:
            primary_to_segment[segment_to_primary[row['Segment_feature_number']]]\
                = str(feature_count)
        p = data.add(models.Feature,
                     row['Segment_feature_number'],
                     name=name,
                     id=str(feature_count),
                     feature_type='segment',
                     area='Vowels' if truth(row['Vowel']) else
                     ('Obstruent consonants'
                      if truth(row['Obstruent']) else 'Sonorant consonants'),
                     jsondata=dict(
                         number=int(row['Segment_feature_number']),
                         vowel=truth(row['Vowel']),
                         consonant=truth(row['Consonant']),
                         obstruent=truth(row['Obstruent']),
                         core_list=truth(row['Core_list_segment']),
                         symbol=symbol,
                     ))

        for i, spec in SEGMENT_VALUES.items():
            data.add(common.DomainElement,
                     '%s-%s' % (row['Segment_feature_number'], spec[0]),
                     id='%s-%s' % (p.id, i),
                     name=spec[0],
                     parameter=p,
                     jsondata={'color': spec[1]},
                     number=i)

    print('--> remapped:', primary_to_segment)
    DBSession.flush()

    for row in read(args, 'Sociolinguistic_features',
                    'Sociolinguistic_feature_number'):
        feature_count += 1
        p = data.add(models.Feature,
                     row['Sociolinguistic_feature_code'],
                     name=row['Sociolinguistic_feature_name'],
                     id='%s' % feature_count,
                     description=row['Sociolinguistic_feature_annotation'],
                     area='Sociolinguistic',
                     feature_type='sociolinguistic')

        names = {}

        for i in range(1, 10):
            id_ = '%s-%s' % (row['Sociolinguistic_feature_code'], i)
            if row.get('Value%s' % i) and row['Value%s' % i].strip():
                name = row['Value%s' % i].strip()
                if name in names:
                    name += ' (%s)' % i
                names[name] = 1
            else:
                continue
            kw = dict(id='%s-%s' % (p.id, i), name=name, parameter=p, number=i)
            data.add(common.DomainElement,
                     id_,
                     id='%s-%s' % (p.id, i),
                     name=name,
                     parameter=p,
                     number=i,
                     jsondata={
                         'color':
                         colors.get(row['Value%s_colour_ID' % i],
                                    list(colors.values())[i])
                     })

    sd = {}
    for row in read(args, 'Segment_data'):
        if row['Segment_feature_number'] not in number_map:
            continue
        number = number_map[row['Segment_feature_number']]

        if not row['Presence_in_the_language']:
            continue

        lang = data['Lect'][row['Language_ID']]
        param = data['Feature'][number]
        id_ = '%s-%s' % (lang.id, param.id)
        if id_ in sd:
            assert row['c_Record_is_a_duplicate'] == 'Yes'
            continue
        sd[id_] = 1
        valueset = data.add(
            common.ValueSet,
            id_,
            id=id_,
            parameter=param,
            language=lang,
            contribution=data['ApicsContribution'][row['Language_ID']],
            description=row['Comments'],
            markup_description=normalize_markup(row['z_calc_Comments_CSS']),
        )
        v = data.add(
            common.Value,
            id_,
            id=id_,
            frequency=float(100),
            valueset=valueset,
            domainelement=data['DomainElement'][
                '%s-%s' % (number, row['Presence_in_the_language'])],
        )
        if row['Example_word'] and row['Example_word_gloss']:
            example_count[row['Language_ID']] += 1
            p = add_sentence(args,
                             data,
                             '%s-p%s' % (lang.id, data['Feature'][number].id),
                             id='%s-%s' %
                             (lang.id, example_count[row['Language_ID']]),
                             name=row['Example_word'],
                             description=row['Example_word_gloss'],
                             language=lang)
            DBSession.add(common.ValueSentence(value=v, sentence=p))

        source = data['Source'].get(row['Refers_to_references_Reference_ID'])
        if source:
            DBSession.add(
                common.ValueSetReference(valueset=valueset,
                                         source=source,
                                         key=source.id))
        elif row['Refers_to_references_Reference_ID'] in non_bibs:
            valueset.source = non_bibs[
                row['Refers_to_references_Reference_ID']]

    lects = defaultdict(lambda: 1)
    lect_map = {}
    records = {}
    false_values = {}
    no_values = {}
    wals_value_number = {}
    for row in read(args, 'wals'):
        if row['z_calc_WALS_value_number']:
            wals_value_number[
                row['Data_record_id']] = row['z_calc_WALS_value_number']

    def prefix(attr, _prefix):
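        # For illustration: prefix('data', 'Sociolinguistic') -> 'Sociolinguistic_data',
        # while prefix('data', '') -> 'Data', matching the table names read below.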
        if _prefix:
            return '%s_%s' % (_prefix, attr)
        return attr.capitalize()

    for _prefix, abbr in [('', ''), ('Sociolinguistic', 'sl')]:
        num_values = 10
        for row in read(args, prefix('data', _prefix)):
            if not row[prefix('feature_code', _prefix)]:
                print('no associated feature for', prefix('data', _prefix),
                      row[prefix('data_record_id', _prefix)])
                continue

            lid = row['Language_ID']
            lect_attr = row.get('Lect_attribute', 'my default lect').lower()
            if lect_attr != 'my default lect':
                if (row['Language_ID'], row['Lect_attribute']) in lect_map:
                    lid = lect_map[(row['Language_ID'], row['Lect_attribute'])]
                else:
                    lang = data['Lect'][row['Language_ID']]
                    c = lects[row['Language_ID']]
                    lid = '%s-%s' % (row['Language_ID'], c)
                    kw = dict(
                        name='%s (%s)' % (lang.name, row['Lect_attribute']),
                        id='%s' % (1000 + 10 * int(lang.id) + c),
                        latitude=lang.latitude,
                        longitude=lang.longitude,
                        description=row['Lect_attribute'],
                        language=lang,
                    )
                    data.add(models.Lect, lid, **kw)
                    lects[row['Language_ID']] += 1
                    lect_map[(row['Language_ID'], row['Lect_attribute'])] = lid

            id_ = abbr + str(row[prefix('data_record_id', _prefix)])
            assert id_ not in records
            records[id_] = 1

            assert row[prefix('feature_code', _prefix)] in data['Feature']
            language = data['Lect'][lid]
            parameter = data['Feature'][row[prefix('feature_code', _prefix)]]
            valueset = common.ValueSet(
                id='%s-%s' % (language.id, parameter.id),
                description=row['Comments_on_value_assignment'],
                markup_description=normalize_markup(
                    row.get('z_calc_Comments_on_value_assignment_CSS')),
            )
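            # Note: parameter, language and contribution are only attached further below,
            # once at least one 'true' value has been found; otherwise this ValueSet is
            # discarded without ever being added to the session.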

            values_found = {}
            for i in range(1, num_values):
                if not row['Value%s_true_false' % i]:
                    continue

                if row['Value%s_true_false' % i].strip().lower() != 'true':
                    assert row['Value%s_true_false' %
                               i].strip().lower() == 'false'
                    false_values[row[prefix('data_record_id', _prefix)]] = 1
                    continue

                iid = '%s-%s' % (row[prefix('feature_code', _prefix)], i)
                if iid not in data['DomainElement']:
                    print(iid, row[prefix('data_record_id',
                                          _prefix)], '--> no domainelement!')
                    continue
                values_found['%s-%s' % (id_, i)] = dict(
                    id='%s-%s' % (valueset.id, i),
                    domainelement=data['DomainElement']['%s-%s' % (row[prefix(
                        'feature_code', _prefix)], i)],
                    confidence=row['Value%s_confidence' % i],
                    frequency=float(row['c_V%s_frequency_normalised' %
                                        i]) if _prefix == '' else 100)

            if values_found:
                if row[prefix('data_record_id', _prefix)] in wals_value_number:
                    valueset.jsondata = {
                        'wals_value_number':
                        wals_value_number.pop(row[prefix(
                            'data_record_id', _prefix)])
                    }
                valueset.parameter = parameter
                valueset.language = language
                valueset.contribution = data['ApicsContribution'][
                    row['Language_ID']]
                valueset = data.add(common.ValueSet, id_, _obj=valueset)
                for i, item in enumerate(values_found.items()):
                    if i > 0 and not parameter.multivalued:
                        print('multiple values for single-valued parameter: %s' % id_)
                        break
                    id_, kw = item
                    kw['valueset'] = valueset
                    value = data.add(common.Value, id_, **kw)

                #
                # store references to additional data for segments which should be reused
                # for corresponding primary features!
                #
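                # (primary_to_segment is defined earlier in this script; it is assumed to
                # map a primary feature id to the id of the corresponding segment feature,
                # so the segment-level ValueSet/Value loaded above can be looked up per
                # language.)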
                if int(parameter.id) in primary_to_segment:
                    assert len(values_found) == 1
                    seg_id = '%s-%s' % (language.id, primary_to_segment[int(
                        parameter.id)])
                    seg_valueset = data['ValueSet'][seg_id]
                    seg_value = data['Value'][seg_id]
                    if not valueset.description and seg_valueset.description:
                        valueset.description = seg_valueset.description

                    for s in seg_value.sentence_assocs:
                        DBSession.add(
                            common.ValueSentence(value=value,
                                                 sentence=s.sentence))

                    for r in seg_valueset.references:
                        DBSession.add(
                            common.ValueSetReference(valueset=valueset,
                                                     source=r.source,
                                                     key=r.key))

                    if not valueset.source and seg_valueset.source:
                        valueset.source = seg_valueset.source

                DBSession.flush()
            else:
                no_values[id_] = 1

    DBSession.flush()

    for prefix, abbr, num_values in [
        ('D', '', 10),
        ('Sociolinguistic_d', 'sl', 7),
    ]:
        for row in read(args, prefix + 'ata_references'):
            assert row['Reference_ID'] in data['Source'] \
                or row['Reference_ID'] in non_bibs
            try:
                vs = data['ValueSet'][abbr +
                                      str(row[prefix + 'ata_record_id'])]
                if row['Reference_ID'] in data['Source']:
                    source = data['Source'][row['Reference_ID']]
                    DBSession.add(
                        common.ValueSetReference(
                            valueset=vs,
                            source=source,
                            key=source.id,
                            description=row['Pages'],
                        ))
                else:
                    if vs.source:
                        vs.source += '; ' + non_bibs[row['Reference_ID']]
                    else:
                        vs.source = non_bibs[row['Reference_ID']]
            except KeyError:
                continue

    DBSession.flush()

    missing = 0
    for row in read(args, 'Value_examples'):
        try:
            DBSession.add(
                common.ValueSentence(
                    value=data['Value']['%(Data_record_id)s-%(Value_number)s' %
                                        row],
                    sentence=data['Sentence'][
                        '%(Language_ID)s-%(Example_number)s' % row],
                    description=row['Notes'],
                ))
        except KeyError:
            missing += 1
    print('%s Value_examples are missing data' % missing)

    print('%s data sets with false values' % len(false_values))
    print('%s data sets without values' % len(no_values))

    for k, v in wals_value_number.items():
        print('unclaimed wals value number:', k, v)

    for i, row in enumerate(read(args, 'Contributors')):
        kw = dict(contribution=data['ApicsContribution'][row['Language ID']],
                  contributor=data['Contributor'][row['Author ID']])
        if row['Order_of_appearance']:
            kw['ord'] = int(float(row['Order_of_appearance']))
        data.add(common.ContributionContributor, i, **kw)

    DBSession.flush()
Example #24
0
def main(args):
    """Fills the database with data retrieved from tabular files.
    'filltables()' iterates over each row of each table.
    'typ' is the name of the table, 'name' a key column and 'tupl'
    the other columns as a dict {column_name,cell_value}."""
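
    # Illustration only (the exact tuple shape is an assumption; filltables() is
    # defined elsewhere in this module):
    #   ('languages', '<glottocode>', {'Id': 'L_xxx', 'Language': '...', ...})
    #   ('editors', '<editor name>', {'url': '...', 'email': '...', ...})
    # The first element selects one of the branches in the loop below.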

    data = Data()
    count = 0
    # dataset
    dataset = _addDataset(data)
    # load languages
    for typ, name, tupl in filltables():
        if not name or name == "na":
            continue
        # TODO: non-core languages (Id not starting with 'L_') are excluded for now
        if typ == "languages" and (tupl.get('Id') or '').startswith('L_'):
            #print(name, tupl)
            lang = _addLang([
                name,
                tupl.get('Language', "na"),
                tupl.get('Family', "na"),
                tupl.get('fam_glottocode', ""),
                tupl.get('Area', "na"),
                tupl.get('Creator', "na"),
                tupl.get('Date', "na"),
                tupl.get('Archive', "na"),
                tupl.get('Archive_link', "na"),
                tupl.get('Translation', "na"),
                tupl.get('License', "na"),
                tupl.get('Audio license', "na"),
                tupl.get('NAKALA', "na"),
                tupl.get('Gloss', "na"),
                tupl.get('Words', 0),
                tupl.get('Speakers', 0),
                tupl.get('Texts', 0),
                tupl.get('Core words', 0),
                tupl.get('Core speakers', 0),
                tupl.get('Core texts', 0),
                tupl.get('Latitude', 0.0),
                tupl.get('Longitude', 0.0),
                tupl.get('Extended', "no")
            ])
            add_language_codes(data,
                               lang,
                               tupl.get('iso-639-3'),
                               glottocode=name)
        elif typ == "editors":
            dataset, count = _addEditor(dataset, count, [
                name,
                tupl.get('url', "na"),
                tupl.get('email', "na"),
                tupl.get('address', "na"),
                tupl.get('team', "na"),
                tupl.get('function', "na")
            ])
        elif typ == "sources":
            _addSource([
                name,
                tupl.get('bibtex_type', "na"),
                tupl.get('author', "na"),
                tupl.get('year', "na"),
                tupl.get('title', "na"),
                tupl.get('url', "na"),
                tupl.get('note', "na")
            ])
        else:
            # TODO: for texts, rows flagged as deleted etc. in the 'extended' column are excluded
            if tupl.get('extended') in ['no', 'yes']:
                # if typ == 'dolg1241': print(tupl)
                _addText([
                    typ, name,
                    tupl.get('name', "na"),
                    tupl.get('spk_code', "na"),
                    tupl.get('spk_age', '0'),
                    tupl.get('spk_age_c', "na"),
                    tupl.get('spk_sex', "na"),
                    tupl.get('rec_date', "na"),
                    tupl.get('rec_date_c', "na"),
                    tupl.get('genre', "na"),
                    tupl.get('genre_stim', "na"),
                    tupl.get('gloss', "na"),
                    tupl.get('transl', "na"),
                    tupl.get('sound', "na"),
                    tupl.get('overlap', "na"),
                    tupl.get('processed', "na"),
                    tupl.get('nakala', "na"),
                    tupl.get('words', 0),
                    tupl.get('extended', "no")
                ])
        # dataset
        # Note: needs to run after loading (for editors)

    DBSession.add(dataset)
    DBSession.flush()
Example #25
0
def create(args):
    args.log.info('starting migration ...')
    data = Data()
    db = create_engine('postgresql://robert@/glottolog2')

    with transaction.manager:
        sn = data.add(common.Contributor,
                      'sn',
                      id='sn',
                      name='Sebastian Nordhoff')
        hh = data.add(common.Contributor,
                      'hh',
                      id='hh',
                      name='Harald Hammarström')
        rf = data.add(common.Contributor,
                      'rf',
                      id='rf',
                      name='Robert Forkel',
                      url="https://github.com/xrotwang")
        mh = data.add(common.Contributor,
                      'mh',
                      id='mh',
                      name='Martin Haspelmath')
        contrib = data.add(common.Contribution,
                           'c',
                           id='classification',
                           name='Classification')
        data.add(common.ContributionContributor,
                 'hh',
                 contribution=contrib,
                 contributor=hh)
        params = dict(
            fc=data.add(common.Parameter,
                        'fc',
                        id='fc',
                        name='Family classification'),
            sc=data.add(common.Parameter,
                        'sc',
                        id='sc',
                        name='Subclassification'),
        )

        dataset = data.add(
            common.Dataset,
            'd',
            id='glottolog',
            name='Glottolog 2.0',
            description='',
            published=datetime.date(2013, 8, 15),
            domain='glottolog.org',
            contact='*****@*****.**',
            jsondata={
                'license_icon':
                'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
                'license_name':
                'Creative Commons Attribution-ShareAlike 3.0 Unported License'
            })
        for i, ed in enumerate([sn, hh, rf, mh]):
            DBSession.add(
                common.Editor(dataset=dataset, contributor=ed, ord=i + 1))

        valuesets = {}

        def create_languoid(row, father_pk=None):
            glottocode = {
                'akun1242': 'akun1241'
            }.get(row['alnumcode'], row['alnumcode'])
            attrs = dict(
                pk=row['id'],
                id=glottocode,
                name=row['primaryname'],
                description=row['globalclassificationcomment'],
                level=getattr(models2.LanguoidLevel, row['level']),
                status=getattr(models2.LanguoidStatus,
                               (row['status'] or '').replace(' ', '_'), None),
                father_pk=father_pk,
                created=row['updated'],
                jsondata={} if not row['hname'] else {'hname': row['hname']},
            )
            for attr in [
                    'active',
                    'updated',
                    'hid',
                    'latitude',
                    'longitude',
            ]:
                attrs[attr] = row[attr]
            l = data.add(models2.Languoid, row['id'], **attrs)
            for type_ in params:
                id_ = '%s%s' % (type_, row['id'])
                vs = data.add(common.ValueSet,
                              id_,
                              id=id_,
                              description=row['classificationcomment'] if type_
                              == 'fc' else row['subclassificationcomment'],
                              language=l,
                              parameter=params[type_],
                              contribution=contrib)
                data.add(common.Value,
                         id_,
                         id=id_,
                         name='%s - %s' % (row['level'], row['status']),
                         valueset=vs)
                DBSession.flush()
                valuesets[id_] = vs.pk
            return str(row['id'])

        level = 0
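        # Load the languoid tree breadth-first: start with the root nodes (father_id is
        # null), then repeatedly fetch the children of the languoids created at the
        # previous level.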
        parents = [
            create_languoid(row) for row in db.execute(
                'select * from languoidbase where father_id is null')
        ]
        while parents:
            args.log.info('level: %s' % level)
            level += 1
            parents = [
                create_languoid(
                    row, father_pk=data['Languoid'][row['father_id']].pk)
                for row in db.execute(
                    'select * from languoidbase where father_id in (%s)' %
                    ','.join(parents))
            ]

    def handler(offset, batch):
        svalues = []
        rvalues = []
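        # Each refbase row becomes two bulk-insert payloads: 's' for the generic clld
        # Source table and 'r' for the glottolog-specific Ref table.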
        for row in batch:
            jsondata = json.loads(row['jsondata'] or "{}")
            jsondata['bibtexkey'] = row['bibtexkey']
            dicts = {
                's':
                dict(pk=row['id'],
                     polymorphic_type='base',
                     id=str(row['id']),
                     name='%(author)s %(year)s' % row,
                     description=row['title'],
                     bibtex_type=getattr(EntryType, row['type']),
                     jsondata=jsondata),
                'r':
                dict(pk=row['id']),
            }
            for model, map_ in {
                    's': {
                        'author': None,
                        'yearstring': 'year',
                        'year': 'year_int',
                        'startpage': 'startpage_int',
                        'numberofpages': 'pages_int',
                        'pages': None,
                        'edition': None,
                        'school': None,
                        'address': None,
                        'url': None,
                        'note': None,
                        'number': None,
                        'series': None,
                        'editor': None,
                        'booktitle': None,
                        'journal': None,
                        'volume': None,
                        'publisher': None,
                    },
                    'r': {
                        'endpage': 'endpage_int',
                        'inlg': None,
                        'inlg_code': None,
                        'subject': None,
                        'subject_headings': None,
                        'keywords': None,
                        'normalizedauthorstring': None,
                        'normalizededitorstring': None,
                        'ozbib_id': None,
                    }
            }.items():
                for okey, nkey in map_.items():
                    dicts[model][nkey or okey] = row[okey]
            svalues.append(dicts['s'])
            rvalues.append(dicts['r'])
        DBSession.execute(common.Source.__table__.insert(), svalues)
        DBSession.execute(models2.Ref.__table__.insert(), rvalues)

    select(db, 'select * from refbase order by id', handler)
    DBSession.execute('COMMIT')

    for table, model, value, order in [
        ('macroarea', models2.Macroarea,
         lambda i, row: dict(pk=row['id'],
                             id=slug(row['name']),
                             name=row['name'],
                             description=row['description']), None),
        ('country', models2.Country,
         lambda i, row: dict(pk=row['id'], id=row['alpha2'], name=row['name']),
         None),
        ('provider', models2.Provider,
         lambda i, row: dict(pk=row['id'],
                             id=slug(row['name']),
                             name=row['description'],
                             description=row['comment'],
                             abbr=row['abbr'],
                             url=row['url'],
                             refurl=row['refurl'],
                             bibfield=row['bibfield']), None),
        ('doctype', models2.Doctype,
         lambda i, row: dict(pk=row['id'],
                             id=slug(row['name']),
                             abbr=row['abbr'],
                             name=row['name'],
                             description=row['description']), None),
        ('refprovider', models2.Refprovider, lambda i, row: dict(
            pk=i, provider_pk=row['provider_id'], ref_pk=row['refbase_id']),
         ('provider_id', 'refbase_id')),
        ('refdoctype', models2.Refdoctype, lambda i, row: dict(
            pk=i, doctype_pk=row['doctype_id'], ref_pk=row['refbase_id']),
         ('doctype_id', 'refbase_id')),
    ]:
        insert(db, table, model, value, order=order)

    names = dict(
        (int(d['id']), d['pk'])
        for d in insert(db,
                        'namebase',
                        common.Identifier,
                        lambda i, row: dict(pk=i,
                                            id=str(row['id']),
                                            name=row['namestring'],
                                            type='name',
                                            description=row['nameprovider'],
                                            lang=row['inlg'] if row['inlg'] and
                                            len(row['inlg']) <= 3 else 'en'),
                        order='id'))

    codes = dict(
        (int(d['id']), d['pk'])
        for d in insert(db,
                        'codebase',
                        common.Identifier,
                        lambda i, row: dict(
                            pk=i,
                            id=str(row['id']),
                            name=row['codestring'],
                            type=common.IdentifierType.iso.value
                            if row['codeprovider'] == 'ISO'
                            else row['codeprovider']),
                        start=len(names),
                        order='id'))

    res = insert(
        db, 'nodecodes', common.LanguageIdentifier,
        lambda i, row: dict(pk=i,
                            language_pk=row['languoidbase_id'],
                            identifier_pk=codes[row['codebase_id']]))

    insert(db,
           'nodenames',
           common.LanguageIdentifier,
           lambda i, row: dict(pk=i,
                               language_pk=row['languoidbase_id'],
                               identifier_pk=names[row['namebase_id']]),
           start=len(res))

    for table, model, value in [
        ('languoidmacroarea', models2.Languoidmacroarea,
         lambda i, row: dict(pk=i,
                             languoid_pk=row['languoidbase_id'],
                             macroarea_pk=row['macroarea_id'])),
        ('languoidcountry', models2.Languoidcountry,
         lambda i, row: dict(pk=i,
                             languoid_pk=row['languoidbase_id'],
                             country_pk=row['country_id'])),
        ('noderefs', common.LanguageSource,
         lambda i, row: dict(pk=i,
                             language_pk=row['languoidbase_id'],
                             source_pk=row['refbase_id'])),
        ('refmacroarea', models2.Refmacroarea, lambda i, row: dict(
            pk=i, macroarea_pk=row['macroarea_id'], ref_pk=row['refbase_id'])),
        ('refcountry', models2.Refcountry, lambda i, row: dict(
            pk=i, country_pk=row['country_id'], ref_pk=row['refbase_id'])),
        ('spuriousreplacements', models2.Superseded,
         lambda i, row: dict(pk=i,
                             languoid_pk=row['languoidbase_id'],
                             replacement_pk=row['replacement_id'],
                             description=row['relation'])),
        ('justification', common.ValueSetReference, lambda i, row: dict(
            pk=i,
            valueset_pk=valuesets['%s%s' % ('fc' if row[
                'type'] == 'family' else 'sc', row['languoidbase_id'])],
            source_pk=row['refbase_id'],
            description=row['pages'])),
    ]:
        insert(db, table, model, value)
Example #26
0
def import_dataset(ds, contrib, languoids, conceptsets, sources, values):
    data = Data()
    concepts = {p.id: p for p in DBSession.query(Concept)}
    langs = {l.id: l for l in DBSession.query(LexiRumahLanguage)}

    for i, row in enumerate(ds.rows):
        if not row['Value'] or not row['Parameter_ID'] or not row[
                'Language_ID']:
            continue

        lid = row['Language_ID'].lower()
        if lid == 'none':
            continue

        if not row['Parameter_ID'].strip():
            continue

        language = langs.get(lid)
        if language is None:
            languoid = languoids.get(lid)
            if not languoid:
                continue
            langs[lid] = language = LexiRumahLanguage(
                id=lid,
                name=languoid.name,
                level=text_type(languoid.level.name),
                latitude=languoid.latitude
                if languoid.id != 'plau1238' else -10,
                longitude=languoid.longitude)

        concept = concepts.get(row['Parameter_ID'])
        if concept is None:
            cs = conceptsets[row['Parameter_ID']]
            concepts[row['Parameter_ID']] = concept = Concept(
                id=row['Parameter_ID'],
                name=cs.gloss,
                description=cs.definition,
                semanticfield=cs.semanticfield)

        vsid = unique_id(contrib,
                         '%s-%s-%s' % (ds.name, language.id, concept.id))
        vid = unique_id(contrib, row['ID'])
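        # unique_id (defined elsewhere in this module) is assumed to combine the
        # contribution with the local id, keeping ids unique across imported datasets.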

        vs = data['ValueSet'].get(vsid)
        if vs is None:
            vs = data.add(ValueSet,
                          vsid,
                          id=vsid,
                          parameter=concept,
                          language=language,
                          contribution=contrib,
                          source=None)  # FIXME: add sources!

        counterpart = values.add(
            Counterpart,
            row['ID'],
            id=vid,
            valueset=vs,
            name=row['Form'],
            description=row.get('Comment'),
            context=row['Value'],
            variety_name=row.get('Language_name'),
            loan=row.get('Loan', False),
        )

        for ref in row.refs:
            CounterpartReference(counterpart=counterpart,
                                 source=sources[ref.source.id],
                                 description=ref.description)
Example #27
0
def main(args):

    data = Data()

    dataset = common.Dataset(
        id=amsd.__name__,
        name="AMSD",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        domain='amsd.clld.org',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})

    DBSession.add(dataset)

    editors = OrderedDict([('Piers Kelly', None)])

    # data_entry => Contributor
    for row in sorted(dicts('data_entry'), key=lambda x: [
                x['name'].lower()] ):
        if row['name'] in editors:
            editors[row['name']] = row['pk']
        data.add(
            common.Contributor,
            row['pk'],
            id=row['pk'],
            name=row['name']
        )

    for i, cid in enumerate(editors.values()):
        common.Editor(dataset=dataset, contributor=data['Contributor'][cid], ord=i + 1)

    for row in dicts('source_citation'):
        data.add(
            common.Source,
            row['pk'],
            id=row['pk'],
            note=row['name'],
            name=row['name'],
        )

    for row in dicts('ling_area'):
        data.add(
            models.ling_area,
            row['pk'],
            chirila_name = row['chirila_name'],
            austlang_code = row['austlang_code'],
            austlang_name = row['austlang_name'],
            glottolog_code = row['glottolog_code'],
        )
    fd = {}
    for row in dicts('linked_filenames'):
        if row['name'] not in ['00-Text_reference.png', '00-No_image_available.png']:
            fd[row['pk']] = dict(
                name = row['name'],
                oid = row['oid'],
                path = row['path'],
                mimetype = mimetypes.guess_type(row['path'])[0],
            )

    for m in 'item_type technique keywords material source_type sem_domain holder_file'.split():
        for row in dicts(m):
            data.add(
                getattr(models, m),
                row['pk'],
                name = row['name'],
            )

    DBSession.flush()

    # sticks => MessageStick
    no_fts_cols = ['pk', 'latitude', 'longitude', 'item_type',
        'irn', 'data_entry', 'dim_1', 'dim_2', 'dim_3', 'data_entry',
        'ling_area_1', 'ling_area_2', 'ling_area_3', 'holder_file']
    x_cols = ['sem_domain', 'material', 'source_type', 'technique', 'keywords', 'holder_file', 'item_type']
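    # fts_items collects the searchable strings of each record; they are folded into a
    # tsvector below (the `fts=` keyword of MessageStick) to power full-text search.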
    for i, row in enumerate(dicts('sticks')):

        fts_items = []
        for col in row.keys():
            if col:
                if col == 'amsd_id':
                    fts_items.append(row['amsd_id'].replace('.', '_') or "amsd_{:05d}".format(i),)
                elif col not in no_fts_cols and not col.endswith('_pk'):
                    fts_items.append(row[col])

        for t in x_cols:
            if row[t]:
                for _, k in enumerate(row[t].split(';')):
                    fts_items.append(str(data[t][k]))
                    fts_items.extend(str(data[t][k]).split('_'))

        for t in ['ling_area_1', 'ling_area_2', 'ling_area_3']:
            if row[t]:
                for _, k in enumerate(row[t].split(';')):
                    fts_items.append(data['ling_area'][k].chirila_name)
                    fts_items.append(data['ling_area'][k].austlang_code)
                    fts_items.append(data['ling_area'][k].austlang_name)
                    fts_items.append(data['ling_area'][k].glottolog_code)

        if row['source_citation']:
            for k in row['source_citation'].split(';'):
                data.add(
                    common.ContributionReference,
                    k,
                    contribution_pk = int(row['pk']),
                    source_pk = int(k),
                )
                fts_items.append(str(data['Source'][k]))

        if row['linked_filenames']:
            for j, k in enumerate(row['linked_filenames'].split(';')):
                if k in fd:
                    oid = fd[k].get('oid')
                    mt = fd[k].get('mimetype')
                    refobjid = ''
                    if mt == 'application/pdf':
                        refobjid = oid
                        # for the web and thumbnail views, use a placeholder image instead of the PDF
                        oid = 'EAEA0-52CC-0295-6B71-0'
                    n = fd[k].get('name')
                    data.add(
                        common.Contribution_files,
                        k,
                        id='%s-%s-%i' % (k, row['pk'], j),
                        object_pk = int(row['pk']),
                        name = n,
                        jsondata = dict(
                                original = fd[k].get('path'),
                                objid = oid,
                                refobjid = refobjid,
                                web = 'web.jpg',
                                thumbnail = 'thumbnail.jpg',
                            ),
                        ord=j,
                        mime_type = mt,
                    )
                    fts_items.append(n)
                    fts_items.extend(nfilter(re.split(r'[_\-.]', n)))

        data.add(
            models.MessageStick,
            row['pk'],
            id = row['amsd_id'].replace('.', '_') or "amsd_{:05d}".format(i),
            title = row['title'],
            description = row['description'],
            obj_creator = row['obj_creator'],
            date_created = row['date_created'],
            note_place_created = row['note_place_created'],
            place_created = row['place_created'],
            item_type_pk = row['item_type'] or None,
            ling_area_1_pk = row['ling_area_1'] or None,
            ling_area_2_pk = row['ling_area_2'] or None,
            ling_area_3_pk = row['ling_area_3'] or None,
            notes_ling_area = row['notes_ling_area'],
            stick_term = row['stick_term'],
            message = row['message'],
            motifs = row['motifs'],
            motif_transcription = row['motif_transcription'],
            dim_1 = row['dim_1'],
            dim_2 = row['dim_2'],
            dim_3 = row['dim_3'],
            date_collected = row['date_collected'],
            holder_file_pk = row['holder_file'] or None,
            holder_obj_id = row['holder_obj_id'],
            collector = row['collector'],
            place_collected = row['place_collected'],
            creator_copyright = row['creator_copyright'],
            file_copyright = row['file_copyright'],
            latitude = row['lat'] or None,
            longitude = row['long'] or None,
            notes_coords = row['notes_coords'],
            url_institution = row['url_institution'],
            url_source_1 = row['url_source_1'],
            url_source_2 = row['url_source_2'],
            irn = row['irn'],
            notes = row['notes'],
            data_entry = row['data_entry'],
            fts = fts.tsvector('\n'.join(re.sub(r'[_\-]', '.', v) for v in fts_items)),
        )

    DBSession.flush()
    for row in dicts('sticks'):
        for t in ['sem_domain', 'material', 'source_type', 'technique', 'keywords']:
            if row[t]:
                for _, k in enumerate(row[t].split(';')):
                    data.add(
                        getattr(models, 'x_%s' % (t)),
                        k,
                        object_pk = int(row['pk']),
                        item_pk = int(k),
                    )
Example #28
0
def load(args):
    glottolog = args.repos
    fts.index('fts_index', models.Ref.fts, DBSession.bind)
    DBSession.execute("CREATE EXTENSION IF NOT EXISTS unaccent WITH SCHEMA public;")
    version = assert_release(glottolog.repos)
    dataset = common.Dataset(
        id='glottolog',
        name="{0} {1}".format(glottolog.publication.web.name, version),
        publisher_name=glottolog.publication.publisher.name,
        publisher_place=glottolog.publication.publisher.place,
        publisher_url=glottolog.publication.publisher.url,
        license=glottolog.publication.license.url,
        domain=purl.URL(glottolog.publication.web.url).domain(),
        contact=glottolog.publication.web.contact,
        jsondata={'license_icon': 'cc-by.png', 'license_name': glottolog.publication.license.name},
    )
    data = Data()

    for e in glottolog.editors.values():
        if e.current:
            ed = data.add(common.Contributor, e.id, id=e.id, name=e.name)
            common.Editor(dataset=dataset, contributor=ed, ord=int(e.ord))
    DBSession.add(dataset)

    contrib = data.add(common.Contribution, 'glottolog', id='glottolog', name='Glottolog')
    DBSession.add(common.ContributionContributor(
        contribution=contrib, contributor=data['Contributor']['hammarstroem']))

    #
    # Add Parameters:
    #
    add = functools.partial(add_parameter, data)
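    # add_parameter (defined elsewhere in this module) is assumed to create a Parameter
    # plus its DomainElements; `pkw` passes extra Parameter kwargs and `dekw` maps each
    # domain entry to DomainElement kwargs.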
    add('fc', name='Family classification')
    add('sc', name='Subclassification')
    add('aes',
        args.repos.aes_status.values(),
        name=args.repos.aes_status.__defaults__['name'],
        pkw=dict(
            jsondata=dict(
                reference_id=args.repos.aes_status.__defaults__['reference_id'],
                sources=[attr.asdict(v) for v in args.repos.aes_sources.values()],
                scale=[attr.asdict(v) for v in args.repos.aes_status.values()])),
        dekw=lambda de: dict(name=de.name, number=de.ordinal, jsondata=dict(icon=de.icon)),
    )
    add('med',
        args.repos.med_types.values(),
        name='Most Extensive Description',
        dekw=lambda de: dict(
            name=de.name, description=de.description, number=de.rank, jsondata=dict(icon=de.icon)),
    )
    add('macroarea',
        args.repos.macroareas.values(),
        pkw=dict(
            description=args.repos.macroareas.__defaults__['description'],
            jsondata=dict(reference_id=args.repos.macroareas.__defaults__['reference_id'])),
        dekw=lambda de: dict(
            name=de.name,
            description=de.description,
            jsondata=dict(geojson=read_macroarea_geojson(args.repos, de.name, de.description)),
        ),
    )
    add('ltype',
        args.repos.language_types.values(),
        name='Language Type',
        dekw=lambda de: dict(name=de.category, description=de.description),
        delookup='category',
    )
    add('country',
        args.repos.countries,
        dekw=lambda de: dict(name=de.id, description=de.name),
    )

    legacy = jsonlib.load(gc2version(args))
    for gc, version in legacy.items():
        data.add(models.LegacyCode, gc, id=gc, version=version)

    #
    # Now load languoid data, keeping track of relations that can only be inserted later.
    #
    lgsources = defaultdict(list)
    # Note: We rely on languoids() yielding languoids in the "right" order, i.e. such that top-level
    # nodes will precede nested nodes. This order must be preserved using an `OrderedDict`:
    nodemap = OrderedDict([(l.id, l) for l in glottolog.languoids()])
    lgcodes = {k: v.id for k, v in args.repos.languoids_by_code(nodemap).items()}
    for lang in nodemap.values():
        for ref in lang.sources:
            lgsources['{0.provider}#{0.bibkey}'.format(ref)].append(lang.id)
        load_languoid(glottolog, data, lang, nodemap)

    for gc in glottolog.glottocodes:
        if gc not in data['Languoid'] and gc not in legacy:
            common.Config.add_replacement(gc, None, model=common.Language)

    for obj in jsonlib.load(glottolog.references_path('replacements.json')):
        common.Config.add_replacement(
            '{0}'.format(obj['id']),
            '{0}'.format(obj['replacement']) if obj['replacement'] else None,
            model=common.Source)

    DBSession.flush()

    for doctype in glottolog.hhtypes:
        data.add(
            models.Doctype, doctype.id, id=doctype.id,
            name=doctype.name,
            description=doctype.description,
            abbr=doctype.abbv,
            ord=doctype.rank)

    for bib in glottolog.bibfiles:
        data.add(
            models.Provider,
            bib.id,
            id=bib.id,
            name=bib.title,
            description=bib.description,
            abbr=bib.abbr,
            url=bib.url)
    DBSession.flush()

    s = time()
    for i, entry in enumerate(
            BibFile(glottolog.build_path('monster-utf8.bib'), api=glottolog).iterentries()):
        if i % 10000 == 0:
            args.log.info('{0}: {1:.3}'.format(i, time() - s))
            s = time()
        ref = load_ref(data, entry, lgcodes, lgsources)
        if 'macro_area' in entry.fields:
            mas = []
            for ma in split_text(entry.fields['macro_area'], separators=',;', strip=True):
                ma = 'North America' if ma == 'Middle America' else ma
                ma = glottolog.macroareas.get('Papunesia' if ma == 'Papua' else ma)
                mas.append(ma.name)
            ref.macroareas = ', '.join(mas)
Example #29
0
def main(args):
    # determine if we run on a machine where other databases are available for lookup
    # locally:
    data = Data()
    genera = get_genera(data) if astroman else {}
    glottocodes, lnames, geocoords = {}, {}, {}
    if astroman:
        for k, v in glottocodes_by_isocode(
                'postgresql://robert@/glottolog3',
                cols=['id', 'name', 'latitude', 'longitude']).items():
            glottocodes[k] = v[0]
            lnames[k] = v[1]
            geocoords[k] = (v[2], v[3])

    refs = defaultdict(list)
    for row in get_rows(args, 'BibtexKey'):
        if row[1] == 'NO SOURCE GIVEN':
            refs[row[0]] = []
        else:
            refs[row[0]].append(row[1])
    add_sources(args, data)

    dataset = data.add(
        common.Dataset,
        'phoible',
        id='phoible',
        name='PHOIBLE Online',
        description='PHOIBLE Online',
        publisher_name="Max Planck Institute for Evolutionary Anthropology",
        publisher_place="Leipzig",
        publisher_url="http://www.eva.mpg.de",
        domain='phoible.org',
        license='http://creativecommons.org/licenses/by-sa/3.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon':
            'http://i.creativecommons.org/l/by-sa/3.0/88x31.png',
            'license_name':
            'Creative Commons Attribution-ShareAlike 3.0 Unported License'
        })

    for i, spec in enumerate([
        ('moran', "Steven Moran"),
        ('mccloy', "Daniel McCloy"),
        ('wright', "Richard Wright"),
    ]):
        DBSession.add(
            common.Editor(dataset=dataset,
                          ord=i + 1,
                          contributor=common.Contributor(id=spec[0],
                                                         name=spec[1])))

    squibs = defaultdict(list)
    for row in get_rows(args, 'Squib'):
        squibs[row[0]].append(row[1])

    source_urls = dict(get_rows(args, 'URL'))
    ia_urls = dict(get_rows(args, 'InternetArchive'))

    aggregated = list(
        reader(args.data_file('phoible-aggregated.tsv'), namedtuples=True))
    inventory_names = {}
    for key, items in groupby(sorted(aggregated,
                                     key=lambda t: (t.LanguageCode, t.Source)),
                              key=lambda t: (t.LanguageCode, t.Source)):
        items = list(items)
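        # Disambiguate inventory names: a (language, source) pair with a single inventory
        # gets "Name (SOURCE)"; multiple inventories get "Name 1 (SOURCE)", "Name 2 (SOURCE)", ...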

        lname = lnames.get(key[0])
        if not lname:
            lname = items[0].LanguageName
            lnames[key[0]] = lname

        if len(items) == 1:
            inventory_names[items[0].InventoryID] = '%s (%s)' % (lname, key[1])
        else:
            for i, item in enumerate(items):
                inventory_names[item.InventoryID] = '%s %s (%s)' % (
                    lname, i + 1, key[1])

    family_map = {
        ("Arawakan", "arwk"): "Arawakan",
        ("Trans-New Guinea", "trng"): "Trans-New Guinea",
        ("Moklen", "anes"): "Austronesian",
        ("Oko", "ncon"): "Niger-Congo",
        ("Muniche", "saso"): "Muniche",
        ("Tinigua", "saso"): "Tinigua",
        ("Vilela", "luvi"): "Vilela",
        ("Ofayé", "macg"): "Kamakanan",
        ("Purian", "macg"): "PurianPrint",
        ("Mixed language", "saml"): "Mixed language",
        ("Tupian", "tupi"): "Tupian",
        ("Yuwana", "saun"): "YuwanaPrint",
    }
    family_code_map = {k[1]: v for k, v in family_map.items()}

    for row in aggregated:
        lang = data['Variety'].get(row.LanguageCode)
        if not lang:
            if row.LanguageFamilyGenus == 'UNCLASSIFIED':
                genus = None
            else:
                genus_id = slug(strip_quotes(row.LanguageFamilyGenus))
                genus = genera.get(genus_id)
                if not genus:
                    genus = genera.get(row.LanguageCode)
                    if not genus:
                        #print(row.LanguageFamilyGenus, row.LanguageFamilyRoot)
                        family = family_map.get(
                            (row.LanguageFamilyGenus, row.LanguageFamilyRoot))
                        genus = genera[genus_id] = data.add(
                            models.Genus,
                            genus_id,
                            id=genus_id,
                            name=row.LanguageFamilyGenus,
                            description=family or row.LanguageFamilyRoot,
                            active=False,
                            root=row.LanguageFamilyRoot)

                if not genus.root:
                    genus.root = row.LanguageFamilyRoot

                if genus.description in family_code_map:
                    genus.description = family_code_map[genus.description]

            if row.LanguageCode in geocoords:
                coords = geocoords[row.LanguageCode]
            elif row.Latitude != 'NULL' and row.Longitude != 'NULL':
                coords = (float(row.Latitude), float(row.Longitude))
            lang = data.add(models.Variety,
                            row.LanguageCode,
                            id=row.LanguageCode,
                            name=lnames[row.LanguageCode],
                            genus=genus,
                            country=strip_quotes(row.Country),
                            area=strip_quotes(row.Area),
                            latitude=coords[0],
                            longitude=coords[1],
                            jsondata=dict(inventory_id=row.InventoryID))
            add_language_codes(data,
                               lang,
                               row.LanguageCode,
                               glottocodes=glottocodes)

        contributor = data['Contributor'].get(row.Source)
        if not contributor:
            contributor = data.add(common.Contributor,
                                   row.Source,
                                   id=row.Source,
                                   name=SOURCES[row.Source][0],
                                   description=SOURCES[row.Source][2])
            for ref in SOURCES[row.Source][1]:
                DBSession.add(
                    models.ContributorReference(source=data['Source'][ref],
                                                contributor=contributor))

        contrib = data.add(models.Inventory,
                           row.InventoryID,
                           id=row.InventoryID,
                           language=lang,
                           source=row.Source,
                           source_url=source_urls.get(row.InventoryID),
                           internetarchive_url=ia_urls.get(row.InventoryID),
                           name=inventory_names[row.InventoryID],
                           description=row.LanguageName)

        DBSession.add(
            common.ContributionContributor(contribution=contrib,
                                           contributor=contributor))

        for j, squib in enumerate(squibs.get(row.InventoryID, [])):
            f = common.Contribution_files(object=contrib,
                                          id='squib-%s-%s.pdf' %
                                          (contrib.id, j + 1),
                                          name='Phonological squib',
                                          description=squib,
                                          mime_type='application/pdf')
            assert f
            # f.create(files_dir, file(args.data_file('phonological_squibs', src)).read())

    DBSession.flush()
    unknown_refs = {}

    for row in reader(args.data_file('phoible-phonemes.tsv'),
                      namedtuples=True):
        inventory = data['Inventory'][row.InventoryID]
        segment = data['Segment'].get(row.Phoneme)
        if not segment:
            unicode_desc = [(c, unicodedata.name(c)) for c in row.Phoneme]
            description = ' - '.join([t[1] for t in unicode_desc])
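            # description is the sequence of Unicode character names making up the phoneme;
            # the equivalence class below drops COMBINING/MODIFIER marks to group variants.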
            segment = data.add(
                models.Segment,
                row.Phoneme,
                id=b16encode(md5(description.encode('utf-8')).digest()).decode('ascii'),
                name=row.Phoneme,
                description=description,
                equivalence_class=''.join([
                    t[0] for t in unicode_desc
                    if t[1].split()[0] not in ['COMBINING', 'MODIFIER']
                ]),
                segment_class=row.Class,
                combined_class=row.CombinedClass)
            DBSession.flush()

        vs = common.ValueSet(id=row.PhonemeID,
                             contribution=inventory,
                             language=inventory.language,
                             parameter=segment)

        for ref in refs.get(row.InventoryID, []):
            if ref not in data['Source']:
                if ref not in unknown_refs:
                    print('-------', ref)
                unknown_refs[ref] = 1
                continue
            DBSession.add(
                common.ValueSetReference(source=data['Source'][ref],
                                         valueset=vs))

        DBSession.add(
            common.Value(
                id=row.PhonemeID,
                name='%s %s' %
                (row.Phoneme, data['Inventory'][row.InventoryID].name),
                valueset=vs))
        DBSession.flush()

    for inventory_id in refs:
        for ref in refs[inventory_id]:
            if ref not in data['Source']:
                continue
            data.add(common.ContributionReference,
                     '%s-%s' % (inventory_id, ref),
                     source=data['Source'][ref],
                     contribution=data['Inventory'][inventory_id])

    for i, row in enumerate(
            reader(args.data_file('phoible-segments-features.tsv'))):
        if i == 0:
            features = list(map(feature_name, row))
            continue

        if row[0] not in data['Segment']:
            # print('skipping feature vector:', row)
            continue
        for j, value in enumerate(row):
            if j and value != '0':
                DBSession.add(
                    common.Parameter_data(
                        key=features[j],
                        value=value,
                        ord=j,
                        object_pk=data['Segment'][row[0]].pk))
    DBSession.flush()
Example #30
0
def main(args):
    data = Data()

    for rec in Database.from_file(
            data_path('references.bib'), lowercase=False):
        source = data.add(common.Source, rec.id, _obj=bibtex2source(rec))
    
    dataset = common.Dataset(
        id=clts.__name__,
        name="CLTS",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        license="http://creativecommons.org/licenses/by/4.0/",
        contact='*****@*****.**',
        domain='clts.clld.org',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    DBSession.add(dataset)
    for i, name in enumerate(['Johann-Mattis List', 'Cormac Anderson', 'Tiago Tresoldi', 
        'Thiago Chacon', 'Robert Forkel']):
        c = common.Contributor(id=slug(name), name=name)
        dataset.editors.append(common.Editor(contributor=c, ord=i))

    for i, line in enumerate(reader(data_path('sounds.tsv'), delimiter='\t',
            namedtuples=True)):
        if not i % 100:
            print('-', end="")
        key = line.NAME.replace(' ', '_')
        data.add(
            models.SoundSegment,
            key,
            id=key,
            name=line.NAME,
            grapheme=line.GRAPHEME,
            aliases=line.ALIASES,
            representation=len(line.REFLEXES.split(',')),
            reflexes=line.REFLEXES,
            generated=bool(line.GENERATED),
            unicode=line.UNICODE,
        )
    print('')
    english = data.add(
        common.Language, 'eng',
        id='eng',
        name='English')



    contributions = {}
    for line in reader(data_path('datasets.tsv'),
            delimiter='\t', namedtuples=True):
        contributions[line.NAME] = data.add(
                models.CLTSDataSet,
                line.NAME,
                id=line.NAME,
                name=line.NAME,
                description=line.DESCRIPTION,
                datatype=line.TYPE
                )
        for id_ in line.REFS.split(', '):
            common.ContributionReference(
                    source=data['Source'][id_],
                    contribution=contributions[line.NAME])
    
        
    visited = set()
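    # Each (dataset, sound, grapheme) combination becomes one ValueSet plus one Grapheme;
    # `visited` skips duplicate rows in graphemes.tsv.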
    for i, line in enumerate(reader(data_path('graphemes.tsv'), delimiter="\t",
            namedtuples=True)):
        if not i % 100: print('-', end='')
        key = line.DATASET + ':' + line.NAME+':'+line.GRAPHEME
        if key not in visited:
            sound_id = line.NAME.replace(' ', '_')
            vs = common.ValueSet(
                    id=key,
                    description=line.NAME,
                    language=english,
                    contribution=contributions[line.DATASET],
                    parameter=data['SoundSegment'][sound_id]
                    )
            data.add(
                    models.Grapheme,
                    key,
                    id=key,
                    grapheme=line.GRAPHEME,
                    bipa_grapheme=line.BIPA,
                    name=line.NAME,
                    dataset=line.DATASET,
                    datatype=line.DATATYPE,
                    frequency=line.FREQUENCY or 0,
                    image=line.IMAGE,
                    url=line.URL,
                    valueset=vs
                    )
            visited.add(key)
    print('-')