Example #1
    def test_add_delete_rows(self):
        from clldutils.dsv import reader
        from pytsammalex.util import add_rows, filter_rows

        csv_path = self.tmp_path('test.csv')
        add_rows(csv_path, ['a', 'b'], [1, 2], [3, 4])
        self.assertEqual(len(list(reader(csv_path, dicts=True))), 2)
        filter_rows(csv_path, lambda item: item['a'] == '1')
        self.assertEqual(len(list(reader(csv_path, dicts=True))), 1)
        add_rows(csv_path, [1, 2], [3, 4])
        self.assertEqual(len(list(reader(csv_path, dicts=True))), 3)
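For reference, a minimal sketch of the behaviour these tests rely on, assuming the clldutils.dsv API used throughout these examples (the file name is made up):

from clldutils.dsv import add_rows, reader

add_rows('demo.csv', ['a', 'b'], [1, 2], [3, 4])   # the first row becomes the header
rows = list(reader('demo.csv', dicts=True))
# each row comes back as a dict keyed by the header, with string values:
# [{'a': '1', 'b': '2'}, {'a': '3', 'b': '4'}]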
Example #2
def metadata(write_stats=True):
    """Writes statistics on metadata to readme."""
    txt = '# Basic Statistics on Metadata\n\n'
    cnc = list(reader(data_path('concepticon.tsv'), namedtuples=True, delimiter="\t"))
    for i,cl in enumerate(PKG_PATH.joinpath('concept_set_meta').glob('*.tsv')):
        data = list(reader(cl, namedtuples=True, delimiter="\t"))
        txt += '* {0} covers {1} concept sets ({2:.2f} %)\n'.format(cl.name[:-4], len(data), 100 * len(data) / len(cnc))
    if write_stats:
        with PKG_PATH.joinpath('concept_set_meta', 'README.md').open('w', encoding='utf8') as fp:
            fp.write(txt)
Example #3
def test_add_delete_rows(tmpdir):
    csv_path = Path(tmpdir.join('test.csv'))
    add_rows(csv_path, ['a', 'b'], [1, 2], [3, 4])
    assert (len(list(reader(csv_path, dicts=True))) == 2)

    filter_rows(csv_path, lambda item: item['a'] == '1')
    assert (len(list(reader(csv_path, dicts=True))) == 1)

    add_rows(csv_path, [1, 2], [3, 4])
    assert (len(list(reader(csv_path, dicts=True))) == 3)
Example #4
    def test_rewrite(self):
        from clldutils.dsv import reader, rewrite

        tmp = self.tmp_path('test')
        shutil.copy(FIXTURES.joinpath('tsv.txt').as_posix(), tmp.as_posix())
        rewrite(tmp.as_posix(), lambda i, row: [len(row)], delimiter='\t')
        self.assertEquals(list(reader(tmp))[0], ['2'])

        shutil.copy(FIXTURES.joinpath('csv.txt').as_posix(), tmp.as_posix())
        rewrite(tmp, lambda i, row: row)
        self.assertEquals(list(reader(tmp)), list(reader(FIXTURES.joinpath('csv.txt'))))
Example #5
 def __init__(self, dir_):
     self.dir = dir_
     self.datasets = [
         Dataset(base_dir=self.dir.joinpath('datasets'), **r)
         for r in reader(self.dir.joinpath('datasets', 'index.csv'),
                         dicts=True)
     ]
     self.phylogenies = [
         Phylogeny(base_dir=self.dir.joinpath('phylogenies'), **r)
         for r in reader(self.dir.joinpath('phylogenies', 'index.csv'),
                         dicts=True)
     ]
Example #6
    def test_add_delete_rows(self):
        from clldutils.dsv import add_rows, filter_rows_as_dict, reader

        csv_path = self.tmp_path('test.csv')
        add_rows(csv_path, ['a', 'b'], [1, 2], [3, 4])
        self.assertEqual(len(list(reader(csv_path, dicts=True))), 2)
        filter_rows_as_dict(csv_path, lambda item: item['a'] == '1')
        self.assertEqual(len(list(reader(csv_path, dicts=True))), 1)
        add_rows(csv_path, [2, 2], [2, 4])
        self.assertEqual(len(list(reader(csv_path, dicts=True))), 3)
        res = filter_rows_as_dict(csv_path, lambda item: item['a'] == '1')
        self.assertEqual(res, 2)
Example #7
def ff_images(args):
    tsammalex = {
        i.id: i.taxa__id for i in
        reader(args.data_file('repos', 'tsammalex_images.csv'), namedtuples=True)}

    ref_pattern = re.compile('(?P<ref>[0-9]{5})')
    uploaded = load(args.data_file('repos', 'cdstar.json'))
    files = load(args.data_file('repos', 'Heath_flora_fauna_images.json'))
    files.update(load(args.data_file('repos', 'ffmissing.json')))
    path_to_md5 = {}
    for md5, paths in files.items():
        for path in paths:
            path_to_md5[Path(path.encode('utf8')).stem] = md5
    missed, found, uploaded_ = 0, 0, 0
    for i, img in enumerate(reader(args.data_file('repos', 'dogon_flora-fauna.csv'), delimiter=',', namedtuples=True)):
        stem = Path(img.filenames.encode('utf8')).stem
        assert stem in path_to_md5
        found += 1
        if path_to_md5[stem] in uploaded:
            m = ref_pattern.search(stem)
            uploaded_ += 1
            yield FFImage(
                path_to_md5[stem],
                Path(files[path_to_md5[stem]][0].encode('utf8')).name,
                None,
                m.group('ref') if m else None,
                None,
                [],
                uploaded[path_to_md5[stem]],
                tsammalex.get(path_to_md5[stem]))

    videos = load(args.data_file('repos', 'videos_from_website.json'))
    videos.update(load(args.data_file('repos', 'videos.json')))

    for md5, paths in videos.items():
        if md5 in uploaded:
            path = Path(paths[0].encode('utf8'))
            m = ref_pattern.search(path.stem)
            uploaded_ += 1
            yield FFImage(
                md5,
                path.name,
                None,
                m.group('ref') if m else None,
                None,
                [],
                uploaded[md5],
                tsammalex.get(md5))
        else:
            missed += 1

    print('ff_images', missed, uploaded_)
Example #8
 def __init__(self, dir_):
     self.dir = Path(dir_)
     self.datasets = [
         Dataset(base_dir=self.dir.joinpath('datasets'), **r) for r in
         reader(self.dir.joinpath('datasets', 'index.csv'), dicts=True)]
     self.phylogenies = [
         Phylogeny(base_dir=self.dir.joinpath('phylogenies'), **r) for r in
         reader(self.dir.joinpath('phylogenies', 'index.csv'), dicts=True)]
     self.societies = {
         s.id: s for s in chain.from_iterable(d.societies for d in self.datasets)
     }
     self.variables = {
         v.id: v for v in chain.from_iterable(d.variables for d in self.datasets)
     }
     self.sources = BibFile(self.dir.joinpath('datasets', 'sources.bib'))
Example #9
def metadata(write_stats=True):
    """Writes statistics on metadata to readme."""
    txt = '# Basic Statistics on Metadata\n\n'
    cnc = list(
        reader(data_path('concepticon.tsv'), namedtuples=True, delimiter="\t"))
    for i, cl in enumerate(
            PKG_PATH.joinpath('concept_set_meta').glob('*.tsv')):
        data = list(reader(cl, namedtuples=True, delimiter="\t"))
        txt += '* {0} covers {1} concept sets ({2:.2f} %)\n'.format(
            cl.name[:-4], len(data),
            100 * len(data) / len(cnc))
    if write_stats:
        with PKG_PATH.joinpath('concept_set_meta',
                               'README.md').open('w', encoding='utf8') as fp:
            fp.write(txt)
Example #10
def fill(dataset, data, socids):
    lines_old = set(open(data, encoding="utf8").readlines())
    res = defaultdict(list)
    for item in reader(data, dicts=True):
        res[(item["Dataset"], item["VarID"], item["soc_id"])].append(item)
        keys = list(item.keys())

    print(dataset, len(socids), "societies")

    for var_id, socs in groupby(sorted(res.keys(), key=lambda t: t[1]), key=lambda t: t[1]):
        for ds, soc_id in socids.difference(set((s[0], s[2]) for s in socs)):
            rec = OrderedDict()
            for key in keys:
                rec[key] = ""
            rec.update(soc_id=soc_id, Dataset=ds, Code="NA", VarID=var_id)
            res[(ds, var_id, soc_id)].append(rec)
        assert sum(len(v) for k, v in res.items() if k[1] == var_id) >= len(socids)

    with UnicodeWriter(data) as fp:
        fp.writerow(keys)
        for key in sorted(res.keys()):
            fp.writerows(row.values() for row in res[key])

    # fix line endings:
    with open(data, encoding="utf8") as fp:
        c = fp.read()

    with open(data, "w", encoding="utf8") as fp:
        fp.write(c.replace("\r\n", "\n"))

    lines_new = set(open(data, encoding="utf8").readlines())
    assert lines_old.issubset(lines_new)
    print(len(lines_new.difference(lines_old)), "NA values added")
Example #11
def read_scorer(path):
    """
    Read a scoring function in a file into a ScoreDict object.

    Parameters
    ----------
    path : Path
        The path to the input file that shall be read as a scoring dictionary.
        The matrix format is a simple csv-file in which the scoring matrix is
        displayed, with negative values indicating high differences between
        sound segments (or sound classes) and positive values indicating high
        similarity. The matrix should be symmetric, columns should be separated
        by tabstops, and the first column should provide the alphabet for which
        the scoring function is defined.

    Returns
    -------
    scoredict : ~lingpy.algorithm.ScoreDict
        A ScoreDict instance which can be directly passed to LingPy's alignment
        functions.
    """
    chars, matrix = [], []
    for row in reader(path, delimiter='\t'):
        if row:
            chars.append(row[0])
            matrix.append(list(map(float, row[1:])))  # list() so the matrix holds concrete rows on Python 3
    return ScoreDict(chars, matrix)
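A hedged usage sketch of the matrix format the docstring describes; the file name and the two-symbol alphabet are made up:

# Hypothetical 'scorer.tsv' (tab-separated, first column holds the alphabet, matrix symmetric):
#
#   p   10  -5
#   b   -5  10
#
from pathlib import Path
scoredict = read_scorer(Path('scorer.tsv'))  # ScoreDict over 'p' and 'b', usable by LingPy's alignment functions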
Example #12
def stats():
    lines = [
        '## Concept Lists',
        '',
        ' name | mapped | mergers ',
        ' ---- | ------ | ------- ',
    ]
    
    for cl in sorted(
            PKG_PATH.joinpath('conceptlists').glob('*.tsv'), key=lambda _cl: _cl.name):
        concepts = list(reader(cl, namedtuples=True, delimiter='\t'))
        mapped = len([c for c in concepts if c.CONCEPTICON_ID])
        mapped_ratio = int((mapped / len(concepts)) * 100)
        concepticon_ids = Counter(
            [c.CONCEPTICON_ID for c in concepts if c.CONCEPTICON_ID])
        mergers = len([k for k, v in concepticon_ids.items() if v > 1])

        line = [
            '[%s](%s) ' % (cl.stem, cl.name),
            badge('mapped', '%s%%' % mapped_ratio, Colors.red if mapped_ratio < 99 else Colors.brightgreen),
            badge('mergers', '%s' % mergers, Colors.red if mergers else Colors.brightgreen),
        ]

        lines.append(' | '.join(line))

    with PKG_PATH.joinpath('conceptlists', 'README.md').open('w', encoding='utf8') as fp:
        fp.write('\n'.join(lines))
Example #13
def import_gb20_features(datadir, data):
    for feature in reader(os.path.join(datadir, 'gb20features.tsv'),
                          delimiter='\t',
                          dicts=True):
        feature = FeatureSpec(feature)
        f = data.add(Feature,
                     feature.id,
                     id=feature.id,
                     name=feature.name,
                     doc=feature.doc,
                     patron=feature.patron,
                     std_comments=feature.std_comments,
                     name_french=feature.name_french,
                     jl_relevant_unit=feature.jl_relevant_unit,
                     jl_function=feature.jl_function,
                     jl_form=feature.jl_form,
                     hard_to_deny=feature.hard_to_deny,
                     prone_misunderstanding=feature.prone_misunderstanding,
                     requires_extensive_data=feature.requires_extensive_data,
                     last_edited=feature.last_edited,
                     other_survey=feature.other_survey)
        for i, (deid, desc) in enumerate(feature.domain.items()):
            DomainElement(id='%s-%s' % (f.id, deid),
                          parameter=f,
                          abbr=deid,
                          name='%s - %s' % (deid, desc),
                          number=int(deid) if deid != '?' else 999,
                          description=desc,
                          jsondata=dict(icon=ORDERED_ICONS[i].name))
Example #14
    def _from(cls, data, container=None, skip_on_error=False):
        container = container or data.parent
        dataset = cls(data.stem)
        dataset.metadata.read(Dataset.filename(data, 'metadata'), container)
        dataset._table = dataset.metadata.get_table()
        dataset.sources.read(Dataset.filename(data, 'sources'), container)
        delimiter = ','
        if dataset.table:
            delimiter = dataset.table.dialect.delimiter
        if data.suffix in TAB_SUFFIXES:
            delimiter = '\t'

        if isinstance(container, Archive):
            rows = container.read_text(data.name).split('\n')
        else:
            rows = data

        for i, row in enumerate(reader(rows, delimiter=delimiter)):
            if i == 0:
                dataset.fields = tuple(row)
            else:
                try:
                    dataset.add_row(row)
                except ValueError as e:
                    if skip_on_error:
                        log.warn('skipping row in line %s: %s' % (i + 1, e))
                    else:
                        raise e
        dataset.table.dialect.delimiter = delimiter
        dataset.table.url = data.name
        return dataset
Example #15
def _dtab(dir_, fn):
    lpd = []
    for d in reader(dir_.joinpath(fn), dicts=True, delimiter='\t', quoting=csv.QUOTE_NONE):
        lpd.append({
            k.replace('\ufeff', ''): (v or '').strip()
            for k, v in list(d.items()) + [("fromfile", fn)]})
    return lpd
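What the key clean-up in the comprehension does, as a small self-contained illustration (row content and file name are made up):

row = {'\ufeffID': ' 1 ', 'FORM': 'kola '}   # BOM-prefixed key, padded values
cleaned = {k.replace('\ufeff', ''): (v or '').strip()
           for k, v in list(row.items()) + [('fromfile', 'lex.tsv')]}
# -> {'ID': '1', 'FORM': 'kola', 'fromfile': 'lex.tsv'}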
Example #16
def load(table, csv, engine):
    schema = jsonlib.load(
        csv.parent.joinpath(csv.stem + '.' + CsvmJsonAdapter.extension))
    converter = get_converter(schema['tableSchema'], table)
    engine.execute(table.insert(),
                   [converted(d, converter) for d in reader(csv, dicts=True)])
    return schema.get("dc:identifier")
Example #17
def stats():
    lines = [
        '## Concept Lists',
        '',
        ' name | mapped | mergers ',
        ' ---- | ------ | ------- ',
    ]

    for cl in sorted(PKG_PATH.joinpath('conceptlists').glob('*.tsv'),
                     key=lambda _cl: _cl.name):
        concepts = list(reader(cl, namedtuples=True, delimiter='\t'))
        mapped = len([c for c in concepts if c.CONCEPTICON_ID])
        mapped_ratio = int((mapped / len(concepts)) * 100)
        concepticon_ids = Counter(
            [c.CONCEPTICON_ID for c in concepts if c.CONCEPTICON_ID])
        mergers = len([k for k, v in concepticon_ids.items() if v > 1])

        line = [
            '[%s](%s) ' % (cl.stem, cl.name),
            badge('mapped', '%s%%' % mapped_ratio,
                  Colors.red if mapped_ratio < 99 else Colors.brightgreen),
            badge('mergers', '%s' % mergers,
                  Colors.red if mergers else Colors.brightgreen),
        ]

        lines.append(' | '.join(line))

    with PKG_PATH.joinpath('conceptlists',
                           'README.md').open('w', encoding='utf8') as fp:
        fp.write('\n'.join(lines))
Example #18
def upgrade():
    conn = Connection(op.get_bind())
    example_map = {}

    sid = 204
    for example in jsonload(data_file('lingala_examples.json')):
        sid += 1
        kw = {
            'id': '60-%s' % sid,
            'language_pk': conn.pk(Language, '60'),
            'name': example['Text'],
            'description': example['Translation'],
            'gloss': '\t'.join(example['Gloss'].split()),
            'analyzed': '\t'.join(example['Text'].split()),
            'type': example['Type'].strip().lower(),
            'jsondata': {'sort': int(example['Order_number']), 'alt_translation': None}
        }
        example_map[example['Example_number']] = conn.insert(Sentence, **kw)

    for ve in jsonload(data_file('lingala_value_examples.json')):
        vspk = conn.pk(ValueSet, '60-%s' % ve['Features::Feature_number'])
        vpk = conn.pk(Value, vspk, attr='valueset_pk')
        conn.insert(
            ValueSentence, value_pk=vpk, sentence_pk=example_map[ve['Example_number']])

    for i, comment in enumerate(reader(data_file('lingala_valueset_comments.tab'), delimiter='\t', dicts=True)):
        vspk = conn.pk(ValueSet, '60-%s' % comment['Features::Feature_number'])
        comment['Comments_on_value_assignment'] = comment['Comments_on_value_assignment'].replace('\x0b', '\n')
        conn.update(
            ValueSet,
            {
                'description': comment['Comments_on_value_assignment'],
                'markup_description': None,
            },
            pk=vspk)
Example #19
def iter_lexicon(args):
    for fname in ['dogon_lexicon', 'flora_Dogon_Unicode', 'fauna_Dogon_Unicode']:
        for concept in reader(args.data_file('repos', fname + '.csv'), dicts=True):
            entry = Entry(**{v: concept.get(k) for k, v in FIELD_MAP.items()})
            for name, gc in LEX_LANGS[fname].items():
                entry.forms[gc] = concept[name].strip()
            if entry.ref and entry.ref != 'zzz':
                yield entry
Example #20
 def read(self, path, sep="\t", comment="#"):
     with Path(path).open(encoding='utf-8') as handle:
         lines = [
             unicodedata.normalize('NFC', hline)
             for hline in handle.readlines()
             if hline and not hline.startswith(comment)
         ]
     self.extend(list(reader(lines, dicts=True, delimiter=sep)))
Example #21
 def _iter_etc(self, what):
     delimiter = '\t'
     path = self.dir / 'etc' / (what + '.tsv')
     if not path.exists():
         delimiter = ','
         path = path.parent / (what + '.csv')
     return reader(path, dicts=True,
                   delimiter=delimiter) if path.exists() else []
Example #22
 def __init__(self, name_and_date, fp):
     parts = name_and_date.split('_')
     self.date = date(int(parts[-1][:4]), int(parts[-1][4:6]), int(parts[-1][6:8]))
     name = '_'.join(parts[:-1])
     if name.startswith('_') or name.startswith('-'):
         name = name[1:]
     if not name:
         name = 'Codes'
     self.name = name
     list.__init__(self, reader(fp.splitlines(), dicts=True, delimiter='\t'))
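An illustrative trace of the name/date split above, with a made-up name_and_date value:

from datetime import date

parts = 'EA_Codes_20160801'.split('_')
date(int(parts[-1][:4]), int(parts[-1][4:6]), int(parts[-1][6:8]))  # -> datetime.date(2016, 8, 1)
'_'.join(parts[:-1])                                                # -> 'EA_Codes'; an empty result falls back to 'Codes'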
Example #23
def read_win1252(fname, ignore_dataset=False):
    with open(fname, 'rb') as fp:
        c = fp.read()

    with open(fname, 'wb') as fp:
        fp.write(c.replace(b'\x9d', b''))

    for r in reader(fname, dicts=True, encoding='cp1252'):
        if ignore_dataset or (r.get('dataset') == 'SCCS') or (
                r.get('Dataset') == 'SCCS') or (r.get('Datset') == 'SCCS'):
            yield r
Example #24
 def from_file(cls, fname):
     """
     Orthography profiles must be
     - tab-separated CSV files
     - encoded in UTF-8
     - with a header containing a column "Grapheme"
     """
     return cls(*list(
         reader(readlines(fname, normalize='NFD'),
                dicts=True,
                delimiter='\t',
                quotechar=None)))
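A hedged sketch of a profile satisfying the requirements listed in the docstring; the rows are made up, and reader is given a list of lines, as other examples here do:

from clldutils.dsv import reader

lines = ['Grapheme\tIPA', 'th\tθ', 'a\ta']   # tab-separated, header contains "Grapheme"
rows = list(reader(lines, dicts=True, delimiter='\t'))
# -> [{'Grapheme': 'th', 'IPA': 'θ'}, {'Grapheme': 'a', 'IPA': 'a'}]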
Example #25
File: util.py Project: clld/csd
def get_sources(args):
    res = {}

    for d in reader(args.data_file('sources_CSD.csv'), delimiter=',', dicts=True):
        res[normalize_sid(d['Abbreviation'])] = d

    for sid in list(SOURCES.keys()):
        _sid = normalize_sid(sid)
        if _sid not in res:
            print('missing sid: %s' % sid)
            res[_sid] = dict(citation=SOURCES[sid], Name=sid, title=SOURCES[sid])

    return res
Example #26
 def cmd_install(self, **kw):
     with self.cldf as ds:
         ds.add_concepts(id_factory=lambda d: d.number.replace('.', '-'))
         lmap = ds.add_languages()
         for p in self.raw.glob('*.csv'):
             lid = p.stem.split('-')[1]
             if lid in lmap:
                 for item in reader(p, dicts=True):
                     if item['Phonetic']:
                         ds.add_lexemes(
                             Language_ID=lid,
                             Parameter_ID=item['BNC ID'].replace('.', '-'),
                             Value=unicodedata.normalize('NFC', item['Phonetic']))
Example #27
    def __init__(self, path):
        """
        A dataset is initialized by passing its directory path.
        """
        path = Path(path)
        self.id = path.name
        self.log = logging.getLogger(pylexibank.__name__)
        self.dir = path
        self.raw = self.dir.joinpath('raw', 'data')
        if not self.raw.exists():
            self.raw.mkdir()
        self.cldf_dir = self.dir.joinpath('cldf')
        if not self.cldf_dir.exists():
            self.cldf_dir.mkdir()
        self.commands = import_module(self.dir)
        self.md = jsonlib.load(self.dir.joinpath('metadata.json'))
        self.languages = []
        lpath = self.dir.joinpath('languages.csv')
        if lpath.exists():
            for item in reader(lpath, dicts=True):
                if item['GLOTTOCODE'] and not GC_PATTERN.match(
                        item['GLOTTOCODE']):
                    raise ValueError("Wrong glottocode for item {0}".format(
                        item['GLOTTOCODE']))
                self.languages.append(item)
        self.conceptlist = None
        url = self.md.get('dc:conformsTo')
        if url and url.startswith(
                'http://concepticon.clld.org/contributions/'):
            self.conceptlist = url.split('/')[-1]
        self.concepts = []
        cpath = self.dir.joinpath('concepts.csv')
        if cpath.exists():
            self.concepts = list(reader(cpath, dicts=True))
        self.cognates = Cognates()

        # the following attributes are only set when a dataset's cldf method is run:
        self.glottolog_languoids = {}
        self.glottolog_version, self.concepticon_version = None, None
Example #28
def pytest_generate_tests(metafunc):
    if 'test_sounds' == metafunc.function.__name__:
        fixturenames = None
        tests = []
        for i, test in enumerate(
                reader(Path(__file__).parent / 'data' / 'test_data.tsv',
                       delimiter='\t',
                       dicts=True)):
            if i == 0:
                fixturenames = list(test.keys())
                fixturenames.pop(fixturenames.index('bipa'))
            del test['bipa']
            if None in test:
                del test[None]
            if len(fixturenames) != len(test.keys()):
                raise ValueError(set(test.keys()) - set(fixturenames))
            tests.append(test)

        attrs = [
            'nfd-normalized', 'clts-normalized', 'aliased', 'generated',
            'stressed'
        ]
        tests = sorted(tests, key=lambda t: tuple([t[a] for a in attrs]))
        batches = []
        for _, ts in groupby(tests, lambda t: tuple([t[a] for a in attrs])):
            for test in ts:
                batches.append(tuple(test.values()))
                break

        metafunc.parametrize(
            ','.join(n.replace('-', '_') for n in fixturenames), batches)
    elif 'test_clicks' == metafunc.function.__name__:
        tests = []
        for test in reader(Path(__file__).parent / 'data' / 'clicks.tsv',
                           delimiter='\t',
                           dicts=True):
            tests.append((test['GRAPHEME'], test['MANNER']))
        metafunc.parametrize('grapheme,gtype', tests)
Example #29
def get_wordlist(path,
                 delimiter=",",
                 quotechar='"',
                 normalization_form="NFC",
                 **keywords):
    """
    Load a wordlist from a normal CSV file.

    Parameters
    ----------
    path : str
        The path to your CSV file.
    delimiter : str
        The delimiter in the CSV file.
    quotechar : str
        The quote character in your data.
    row : str (default = "concept")
        A string indicating the name of the row that shall be taken as the
        basis for the tabular representation of the word list.
    col : str (default = "doculect")
        A string indicating the name of the column that shall be taken as the
        basis for the tabular representation of the word list.
    conf : string (default='')
        A string defining the path to the configuration file.
    
    Notes
    -----
    This function returns a :py:class:`~lingpy.basic.wordlist.Wordlist` object.
    In contrast to the normal way to load a wordlist from a tab-separated file,
    however, this allows one to directly load a wordlist from any "normal"
    csv-file, with your own specified delimiters and quote characters. If the
    first cell in the first row of your CSV file is not named "ID", the integer
    identifiers, which are required by LingPy, will be automatically created.

    """
    kw = dict(conf="", col="doculect", row="concept")
    kw.update(keywords)
    data = list(dsv.reader(path, delimiter=delimiter, quotechar=quotechar))
    header = [h.lower() for h in data[0]]
    data = data[1:]
    D = {}
    if header[0] == 'id':  # the header was lower-cased above
        D[0] = header[1:]
        for row in data:
            D[row[0]] = [normalize(normalization_form, n) for n in row[1:]]
    else:
        D[0] = header
        for idx, row in enumerate(data):
            D[idx + 1] = row
    return Wordlist(D, row=kw['row'].lower(), col=kw['col'].lower())
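A hedged usage sketch; the file name and its columns are made up, and the default row/col keywords described above are assumed:

# Hypothetical 'list.csv':
#
#   doculect,concept,ipa
#   German,hand,hant
#   English,hand,hænd
#
wl = get_wordlist('list.csv')   # row='concept', col='doculect' by default
wl.height, wl.width             # number of concepts / number of doculects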
Example #30
def read_csv(fname, data):
    concepts = None

    for i, row in enumerate(reader(fname)):
        if i == 0:
            concepts = {j: c for j, c in enumerate(row[1:])}
        else:
            for j, c in enumerate(row[1:]):
                if j % 2 == 0:  # even number
                    loan, form = get_loan_and_form(c)
                else:
                    if form.strip():
                        data[row[0]][concepts[j]] = (form, loan, c)
    return data
Example #31
    def __init__(self, clid):
        self.clid = clid
        self.concepts = {
            'CONCEPTICON_ID': {},  # maps ID to GLOSS
            'CONCEPTICON_GLOSS': {},  # maps GLOSS to ID
        }
        for cs in reader(data_path('concepticon.tsv'), dicts=True, delimiter='\t'):
            self.concepts['CONCEPTICON_ID'][cs['ID']] = cs['GLOSS']
            self.concepts['CONCEPTICON_GLOSS'][cs['GLOSS']] = cs['ID']

        self._cid_index = None
        self._cgloss_index = None
        self._link_col = (None, None)
        self._number_index = None
Example #32
def stats(args):
    sounds = {}
    for row in reader(args.repos.data_path('sounds.tsv'),
                      delimiter='\t',
                      dicts=True):
        sounds[row['NAME']] = row
    graphs = {}
    for row in reader(args.repos.data_path('graphemes.tsv'),
                      delimiter='\t',
                      dicts=True):
        graphs['{GRAPHEME}-{NAME}-{DATASET}'.format(**row)] = row

    graphdict = defaultdict(list)
    for id_, row in graphs.items():
        graphdict[row['GRAPHEME']] += [row['DATASET']]

    text = [['DATA', 'STATS', 'PERC']]
    text.append([
        'Unique graphemes',
        len(set(row['GRAPHEME'] for row in graphs.values())), ''
    ])
    text.append(['different sounds', len(sounds), ''])
    text.append([
        'singletons',
        len([g for g in graphdict if len(set(graphdict[g])) == 1]), ''
    ])
    text.append([
        'multiples',
        len([g for g in graphdict if len(set(graphdict[g])) > 1]), ''
    ])
    total = len(sounds)
    for type_, count in Counter([s['TYPE']
                                 for s in sounds.values()]).most_common():
        text.append([type_ + 's', count, '{0:.2f}'.format(count / total)])

    print(tabulate.tabulate(text, headers='firstrow'))
Example #33
    def __init__(self, clid):
        self.clid = clid
        self.concepts = {
            'CONCEPTICON_ID': {},  # maps ID to GLOSS
            'CONCEPTICON_GLOSS': {},  # maps GLOSS to ID
        }
        for cs in reader(data_path('concepticon.tsv'),
                         dicts=True,
                         delimiter='\t'):
            self.concepts['CONCEPTICON_ID'][cs['ID']] = cs['GLOSS']
            self.concepts['CONCEPTICON_GLOSS'][cs['GLOSS']] = cs['ID']

        self._cid_index = None
        self._cgloss_index = None
        self._link_col = (None, None)
        self._number_index = None
Example #34
def list_attributes(write_stats=True):
    """Calculate the addditional attributes in the lists."""
    D = {}
    for i,cl in enumerate(PKG_PATH.joinpath('conceptlists').glob('*.tsv')):
        header = list(reader(cl, delimiter="\t"))[0]
        header = [h for h in header if h not in ['ID', 'CONCEPTICON_ID', 
            'CONCEPTICON_GLOSS', 'ENGLISH', 'GLOSS', 'NUMBER']]
        for h in header:
            try:
                D[h] += [cl.name]
            except KeyError:
                D[h] = [cl.name]
    txt = '# Common Additional Columns of Concept Lists\n'
    for k,v in sorted(D.items(), key=lambda x: len(x[1]), reverse=True):
        txt += '* {2} occurrences: {0}, {1}\n'.format(k, ', '.join(v), len(v))
    print(txt)
Example #35
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    concepticon = {
        c.GLOSS: c.CONCEPTICON_ID for c in
        reader(args.data_file('repos', 'conceptlist.tsv'), delimiter='\t', namedtuples=True)
        if c.CONCEPTICON_ID}
    sdata = jsonlib.load(args.data_file('repos', 'classification.json'))
    for concept in DBSession.query(models.Concept).options(joinedload(common.Parameter._files)):
        for t_ in ['image', 'video']:
            setattr(concept, 'count_{0}s'.format(t_), len(getattr(concept, t_ + 's')))
        if concept.jsondata['ref'] in sdata:
            util.update_species_data(concept, sdata[concept.jsondata['ref']])
        if concept.name in concepticon:
            concept.concepticon_id = int(concepticon[concept.name])
Example #36
def get_wordlist(path, delimiter=",", quotechar='"', normalization_form="NFC", **keywords):
    """
    Load a wordlist from a normal CSV file.

    Parameters
    ----------
    path : str
        The path to your CSV file.
    delimiter : str
        The delimiter in the CSV file.
    quotechar : str
        The quote character in your data.
    row : str (default = "concept")
        A string indicating the name of the row that shall be taken as the
        basis for the tabular representation of the word list.
    col : str (default = "doculect")
        A string indicating the name of the column that shall be taken as the
        basis for the tabular representation of the word list.
    conf : string (default='')
        A string defining the path to the configuration file.
    
    Notes
    -----
    This function returns a :py:class:`~lingpy.basic.wordlist.Wordlist` object.
    In contrast to the normal way to load a wordlist from a tab-separated file,
    however, this allows one to directly load a wordlist from any "normal"
    csv-file, with your own specified delimiters and quote characters. If the
    first cell in the first row of your CSV file is not named "ID", the integer
    identifiers, which are required by LingPy, will be automatically created.

    """
    kw = dict(conf="", col="doculect", row="concept")
    kw.update(keywords)
    data = list(dsv.reader(path, delimiter=delimiter, quotechar=quotechar))
    header = [h.lower() for h in data[0]]
    data = data[1:]
    D = {}
    if header[0] == 'id':  # the header was lower-cased above
        D[0] = header[1:]
        for row in data:
            D[row[0]] = [normalize(normalization_form, n) for n in row[1:]]
    else:
        D[0] = header
        for idx, row in enumerate(data):
            D[idx + 1] = row
    return Wordlist(D, row=kw['row'].lower(), col=kw['col'].lower())
Example #37
def from_csv(data_file, model, data, name=None, visitor=None, filter_=None):
    if filter_ is None:
        filter_ = lambda r: True
    kw = {'delimiter': ',', 'lineterminator': str('\r\n'), 'quotechar': '"'}
    for fname in data_files(data_file, (name or model.__csv_name__) + '.csv'):
        for row in list(reader(fname, **kw))[1:]:
            if row and filter_(row):
                try:
                    obj = model.from_csv(row, data)
                except (KeyError, IndexError):
                    obj = None
                    print(fname)
                    print(row)
                    raise
                if obj:
                    obj = data.add(model, row[0], _obj=obj)
                    if visitor:
                        visitor(obj, row, data)
Example #38
def viewCsvImport(request):
    '''
    report stores how the import went.
    Its structure must be an iterable containing dicts
    with str data for the keys 'heading' and 'body'.
    '''
    report = []
    if request.method == 'POST' and 'CsvImportForm' in request.POST:
        importMethod = request.POST['tableType']
        fileDicts = list(
            dsv.reader(
                request.FILES['csvFile'].read().decode('utf8').splitlines(),
                dicts=True))
        handlerFunctions = {'ms*l': handleMeaningsLanguageImport}
        if importMethod in handlerFunctions:
            report = handlerFunctions[importMethod](fileDicts, request)

    return render_template(request, "admin/viewCsvImport.html",
                           {'report': report})
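For reference, the report shape the docstring describes, with made-up content:

report = [{'heading': 'ms*l import', 'body': '42 rows processed.'}]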
Example #39
def list_attributes(write_stats=True):
    """Calculate the addditional attributes in the lists."""
    D = {}
    for i, cl in enumerate(PKG_PATH.joinpath('conceptlists').glob('*.tsv')):
        header = list(reader(cl, delimiter="\t"))[0]
        header = [
            h for h in header if h not in [
                'ID', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'ENGLISH',
                'GLOSS', 'NUMBER'
            ]
        ]
        for h in header:
            try:
                D[h] += [cl.name]
            except KeyError:
                D[h] = [cl.name]
    txt = '# Common Additional Columns of Concept Lists\n'
    for k, v in sorted(D.items(), key=lambda x: len(x[1]), reverse=True):
        txt += '* {2} occurrences: {0}, {1}\n'.format(k, ', '.join(v), len(v))
    print(txt)
Example #40
def upgrade():
    conn = Connection(op.get_bind())
    example_map = {}

    sid = 204
    for example in jsonload(data_file('lingala_examples.json')):
        sid += 1
        kw = {
            'id': '60-%s' % sid,
            'language_pk': conn.pk(Language, '60'),
            'name': example['Text'],
            'description': example['Translation'],
            'gloss': '\t'.join(example['Gloss'].split()),
            'analyzed': '\t'.join(example['Text'].split()),
            'type': example['Type'].strip().lower(),
            'jsondata': {
                'sort': int(example['Order_number']),
                'alt_translation': None
            }
        }
        example_map[example['Example_number']] = conn.insert(Sentence, **kw)

    for ve in jsonload(data_file('lingala_value_examples.json')):
        vspk = conn.pk(ValueSet, '60-%s' % ve['Features::Feature_number'])
        vpk = conn.pk(Value, vspk, attr='valueset_pk')
        conn.insert(ValueSentence,
                    value_pk=vpk,
                    sentence_pk=example_map[ve['Example_number']])

    for i, comment in enumerate(
            reader(data_file('lingala_valueset_comments.tab'),
                   delimiter='\t',
                   dicts=True)):
        vspk = conn.pk(ValueSet, '60-%s' % comment['Features::Feature_number'])
        comment['Comments_on_value_assignment'] = comment[
            'Comments_on_value_assignment'].replace('\x0b', '\n')
        conn.update(ValueSet, {
            'description': comment['Comments_on_value_assignment'],
            'markup_description': None,
        },
                    pk=vspk)
Example #41
def import_cldf(srcdir, md, languoids, conceptsets):
    with transaction.manager:
        contrib = Provider(
            id=srcdir.name,
            name=md['dc:title'],
            description=md.get('dc:bibliographicCitation'),
            url=md.get('dc:identifier'),
            license=md.get('dc:license'),
            aboutUrl=md.get('aboutUrl'),
        )
        DBSession.add(contrib)
        sources = {}
        cldfdir = srcdir.joinpath('cldf')
        values = Data()
        for fname in tqdm(list(cldfdir.glob('*' + MD_SUFFIX)), leave=False):
            ds = Dataset.from_metadata(fname)
            for src in ds.sources.items():
                if src.id not in sources:
                    sources[src.id] = cldf2clld(src, contrib, len(sources) + 1)
            import_dataset(ds, contrib, languoids, conceptsets, sources,
                           values)
            DBSession.flush()
        # import cognates:
        if cldfdir.joinpath('cognates.csv').exists():
            for csid, cognates in groupby(
                    reader(cldfdir.joinpath('cognates.csv'), dicts=True),
                    lambda i: i['Cognate_set_ID']):
                cs = Cognateset(id=unique_id(contrib, csid),
                                contribution=contrib)
                for cognate in cognates:
                    cp = values['Counterpart'].get(cognate['Word_ID'])
                    if cp:
                        DBSession.add(
                            CognatesetCounterpart(
                                cognateset=cs,
                                counterpart=cp,
                                cognate_detection_method=cognate[
                                    'Cognate_detection_method'],
                                alignment=cognate['Alignment'],
                                alignment_method=cognate['Alignment_method'],
                                doubt=cognate['Doubt'] == 'True'))
Example #42
def import_cldf(srcdir, md, languoids, conceptsets):
    with transaction.manager:
        contrib = Provider(
            id=srcdir.name,
            name=md['dc:title'],
            description=md.get('dc:bibliographicCitation'),
            url=md.get('dc:identifier'),
            license=md.get('dc:license'),
            aboutUrl=md.get('aboutUrl'),
        )
        DBSession.add(contrib)
        sources = {}
        cldfdir = srcdir.joinpath('cldf')
        values = Data()
        for fname in tqdm(list(cldfdir.glob('*' + MD_SUFFIX)), leave=False):
            ds = Dataset.from_metadata(fname)
            for src in ds.sources.items():
                if src.id not in sources:
                    sources[src.id] = cldf2clld(src, contrib, len(sources) + 1)
            import_dataset(ds, contrib, languoids, conceptsets, sources, values)
            DBSession.flush()
        # import cognates:
        if cldfdir.joinpath('cognates.csv').exists():
            for csid, cognates in groupby(
                    reader(cldfdir.joinpath('cognates.csv'), dicts=True),
                    lambda i: i['Cognate_set_ID']):
                cs = Cognateset(id=unique_id(contrib, csid), contribution=contrib)
                for cognate in cognates:
                    cp = values['Counterpart'].get(cognate['Word_ID'])
                    if cp:
                        DBSession.add(CognatesetCounterpart(
                            cognateset=cs,
                            counterpart=cp,
                            cognate_detection_method=cognate['Cognate_detection_method'],
                            alignment=cognate['Alignment'],
                            alignment_method=cognate['Alignment_method'],
                            doubt=cognate['Doubt'] == 'True'))
Example #43
 def _read_tsv(self, path):
     return set(tuple(row[1:]) for row in reader(path, delimiter="\t"))
Example #44
 def _read_tsv(self, path):
     return set(tuple(row[1:]) for row in reader(path, delimiter='\t'))
Example #45
 def read(table):
     fname = args.data_file(table + '.all.csv')
     if not fname.exists():
         fname = args.data_file(table + '.csv')
     return list(dsv.reader(fname, namedtuples=True))
Example #46
def get_tab(name):
    """Generator for entries in a tab file specified by name."""
    return dsv.reader(
        get(get_taburls()[name]).split('\n'), namedtuples=True, delimiter='\t')
Example #47
def main():
    socs = read_win1252(
        'ALL_soc_ids_to_lang_wAltNames_sources_5Sept2017_win1252.csv')
    links = {
        r['soc_id']: r
        for r in read_win1252(
            'ALL_soc_links_to_other_databases_30Aug2017_win1252.csv')
    }
    locations = {
        'SCCS' + r['soc_id']: r
        for r in reader('../../legacy/LatLong_data.csv', dicts=True)
    }
    for row in reader(
            '../WNAI/DPLACE_RevisedLatLong_27April2017_inclWNAI_SCCS.csv',
            dicts=True):
        if row['Dataset'] == 'SCCS':
            locations[row['soc_id']]['Lat'] = row['soc.latitude']
            locations[row['soc_id']]['Long'] = row['soc.longitude']

    with UnicodeWriter('societies.csv') as w:
        w.writerow([f.name for f in attr.fields(Society)])
        for soc in socs:
            kw = {
                'id': soc['soc_id'],
                'glottocode': soc['glottolog_id'],
                'glottocode_comment': 'Lang_assignment_change_notes'
            }
            for col in [
                    'xd_id',
                    'pref_name_for_society',
                    'ORIG_name_and_ID_in_this_dataset',
                    'alt_names_by_society',
                    'main_focal_year',
            ]:
                kw[col] = soc[col]

            for col in ['Lat', 'Long', 'origLat', 'origLong', 'Comment']:
                kw[col] = locations[soc['soc_id']][col]

            kw['HRAF_name_ID'] = links[soc['soc_id']]['HRAF_name_ID']
            kw['HRAF_link'] = links[soc['soc_id']]['HRAF_link']
            w.writerow(attr.astuple(Society(**kw)))

    with UnicodeWriter('societies_mapping.csv') as w:
        w.writerow(['id', 'related'])
        for sid, l in links.items():
            rels = []
            for dsid, suffix in [
                ('EA', '1'),
                ('EA', '2'),
                ('Binford', '1'),
                ('Binford', '2'),
                ('Binford', '3'),
                ('SCCS', ''),
                ('WNAI', '1'),
                ('WNAI', '2'),
                ('WNAI', '3'),
                ('WNAI', '4'),
                ('WNAI', '5'),
            ]:
                if dsid == 'SCCS':
                    label = l['{0}_society_equivalent{1}'.format(dsid, suffix)]
                else:
                    label = l['{0}_label_society_equivalent{1}'.format(
                        dsid, suffix)]
                id = l['{0}_id_society_equivalent{1}'.format(dsid, suffix)]
                if label and id:
                    rels.append('{0}: {1} [{2}]'.format(dsid, label, id))
            w.writerow([sid, '; '.join(rels)])

    var_info = {
        r['source']: r['APA_reference']
        for r in read_win1252('SCCS_variable_sources_bibtex_to_APA.csv',
                              ignore_dataset=True)
    }

    with UnicodeWriter('variables.csv') as w:
        fm = OrderedDict([
            ('VarID', 'id'),
            ('Category', 'category'),
            ('VarTitle', 'title'),
            ('VarDefinition', 'definition'),
            ('VarType', 'type'),
            ('UserNotes', 'notes'),
            ('source', 'source'),
            ('VarTitleShort', 'changes'),
            ('Unit', 'units'),
        ])
        w.writerow(fm.values())
        for row in read_win1252(
                'SCCS_Full_VariableList_12Sept2017_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            row['VarType'] = row['VarType'].capitalize()
            if row['VarDefinition']:
                row['VarDefinition'] += '\n\n'
            row['VarDefinition'] += var_info.get(row['source'], row['source'])
            w.writerow([row[f] for f in fm.keys()])

    with UnicodeWriter('codes.csv') as w:
        fm = OrderedDict([
            ('VarID', 'var_id'),
            ('Code', 'code'),
            ('CodeDescription', 'description'),
            ('ShortName', 'name'),
        ])
        w.writerow(fm.values())
        for row in read_win1252(
                'SCCS_CodeDescriptions_12Sept2017_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            w.writerow([row[f] for f in fm.keys()])

    with UnicodeWriter('data.csv') as w:
        fm = OrderedDict([
            ('soc_id', 'soc_id'),
            ('SubCase', 'sub_case'),
            ('Year', 'year'),
            ('VarID', 'var_id'),
            ('Code', 'code'),
            ('EthnoReferences', 'references'),
            ('AdminComment', 'admin_comment'),
            ('UserComment', 'comment'),
            ('SourceCodedData', 'source_coded_data'),
        ])
        w.writerow(fm.values())
        for row in read_win1252(
                'Full_SCCS_data_12Sept2017_FINAL_329451rows_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            w.writerow([row[f] for f in fm.keys()])
Example #48
    def load_glottolog_data(self):
        """
        Loads the Glottolog classification information from the appropriate
        newick file, parses it and stores the required data structure in
        self.classifications.
        """
        # Don't load if the analysis doesn't use it
        if not self.check_glottolog_required():
            return
        # Don't load if we already have - can this really happen?
        if self.glottolog_loaded:
            return
        self.glottolog_loaded = True

        label2name = {}
        glottocode2node = {}

        def parse_label(label):
            match = GLOTTOLOG_NODE_LABEL.match(label)
            label2name[label] = (match.group('name').strip().replace("\\'","'"), match.group('glottocode'))
            return (
                match.group('name').strip(),
                match.group('glottocode'),
                match.group('isocode'))

        def get_classification(node):
            res = []
            ancestor = node.ancestor
            while ancestor:
                res.append(label2name[ancestor.name])
                ancestor = ancestor.ancestor
            return list(reversed(res))

        # Walk the tree and build the classifications dictionary
        glottolog_trees = newick.read(get_glottolog_data('newick', self.glottolog_release))
        for tree in glottolog_trees:
            for node in tree.walk():
                name, glottocode, isocode = parse_label(node.name)
                classification = get_classification(node)
                self.classifications[glottocode] = classification
                if isocode:
                    self.classifications[isocode] = classification
                glottocode2node[glottocode] = node

        # Load geographic metadata
        for t in reader(
                get_glottolog_data('geo', self.glottolog_release), namedtuples=True):
            if t.macroarea:
                self.glotto_macroareas[t.glottocode] = t.macroarea
                for isocode in t.isocodes.split():
                    self.glotto_macroareas[isocode] = t.macroarea
            if self.location_data:
                continue # Use user-supplied data instead

            if t.latitude and t.longitude:
                latlon = (float(t.latitude), float(t.longitude))
                self.locations[t.glottocode] = latlon
                for isocode in t.isocodes.split():
                    self.locations[isocode] = latlon

        if self.location_data:
            return

        # Second pass of geographic data to handle dialects, which inherit
        # their parent language's location
        for t in reader(
                get_glottolog_data('geo', self.glottolog_release), namedtuples=True):
            if t.level == "dialect":
                failed = False
                node = glottocode2node[t.glottocode]
                ancestor = node.ancestor
                while label2name[ancestor.name][1] not in self.locations:
                    if not ancestor.ancestor:
                        # We've hit the root without finding an ancestral node
                        # with location data!
                        failed = True
                        break
                    else:
                        ancestor = ancestor.ancestor
                if failed:
                    continue
                latlon = self.locations[label2name[ancestor.name][1]]
                self.locations[t.glottocode] = latlon
                for isocode in t.isocodes.split():
                    self.locations[isocode] = latlon
Example #49
def read_csv(path):
    return list(reader(path, dicts=True))
Example #50
def get_rows(args, name):  # pragma: no cover
    for i, row in enumerate(
            reader(args.data_file('InventoryID-%s.csv' % name), delimiter='\t')):
        if i and row[1] != 'NA':
            yield row
Example #51
def read_all(fname, **kw):
    kw.setdefault('delimiter', '\t')
    if not kw.get('dicts'):
        kw.setdefault('namedtuples', True)
    return list(dsv.reader(fname, **kw))
Example #52
def tsv_items(path, ordered=False):
    return list(reader(path, delimiter='\t', dicts=True))
Example #53
 def read_csv(self, fname, **kw):
     return list(reader(self.joinpath(fname), **kw))
Example #54
    def test_reader(self):
        from clldutils.dsv import reader

        lines = ['first\tline', 'sücond\tläneß']
        encoded_lines = [l.encode('utf8') for l in lines]
        csv_lines = [l.replace('\t', ',') for l in lines]

        def check(r):
            res = list(r)
            assert len(res) == 2
            assert res[1][1] == 'läneß'

        check(reader(lines, delimiter='\t'))
        for lt in ['\n', '\r\n', '\r']:
            if PY3:  # pragma: no cover
                # Simulate file opened in text mode:
                fp = StringIO(lt.join(lines), newline='')
            else:
                # Simulate file opened in binary mode:
                fp = BytesIO(to_binary(lt).join(encoded_lines))
            check(reader(fp, delimiter='\t'))
        check(reader(FIXTURES.joinpath('csv.txt')))

        res = list(reader(FIXTURES.joinpath('tsv.txt'), namedtuples=True, delimiter='\t'))
        assert res[0].a_name == 'b'
        # Missing column values should be set to None:
        assert res[2].a_name is None

        r = list(reader(lines, dicts=True, delimiter='\t'))
        assert len(r) == 1 and r[0]['first'] == 'sücond'
        r = list(reader(lines, namedtuples=True, delimiter='\t'))
        assert len(r) == 1 and r[0].first == 'sücond'
        r = list(reader(csv_lines, namedtuples=True))
        assert len(r) == 1 and r[0].first == 'sücond'
        self.assertEqual(list(reader([], dicts=True, delimiter='\t')), [])
        self.assertEqual(
            list(reader([''], dicts=True, fieldnames=['a', 'b'], delimiter='\t')), [])
        self.assertEqual(list(reader(['a,b', ''], dicts=True, delimiter='\t')), [])

        r = reader(
            ['a,b', '1,2,3,4', '1'], dicts=True, restkey='x', restval='y', delimiter=',')
        self.assertEqual(list(r), [dict(a='1', b='2', x=['3', '4']), dict(a='1', b='y')])
Example #55
def reflexes(write_stats=True, path='concepticondata'):
    """
    Returns two dictionaries: the first maps concept set labels to (concept
    list name, concept label) pairs, the second maps concept labels to
    (Concepticon ID, Concepticon gloss, concept list name) triples.
    """
    D, G = {}, {}
    cpl = 0
    cln = 0
    clb = set([])
    
    dpath = Path(path) if path else PKG_PATH
    
    for i, cl in enumerate(dpath.joinpath('conceptlists').glob('*.tsv')):
        concepts = list(reader(cl, namedtuples=True, delimiter="\t"))
        for j,concept in enumerate([c for c in concepts if c.CONCEPTICON_ID]):
            label = concept.GLOSS if hasattr(concept, 'GLOSS') else concept.ENGLISH
            name = cl.name
            try:
                D[concept.CONCEPTICON_GLOSS] += [(name, label)]
            except KeyError:
                D[concept.CONCEPTICON_GLOSS] = [(name, label)]
            try:
                G[label] += [(concept.CONCEPTICON_ID, concept.CONCEPTICON_GLOSS, name)]
            except KeyError:
                G[label] = [(concept.CONCEPTICON_ID, concept.CONCEPTICON_GLOSS, name)]
            clb.add(label)
            cpl += 1
        cln += 1
    # write basic statistics and most frequent glosses
    if write_stats:
        txt = """# Concepticon Statistics
* concept sets (used): {0}
* concept lists: {1}
* concept labels: {2}
* concept labels (unique): {3}
* Ø concepts per list: {4:.2f}
* Ø concepts per concept set: {5:.2f}
* Ø unique concept labels per concept set: {6:.2f}

"""
        txt = txt.format(
            len(D),
            cln,
            cpl,
            len(clb),
            cpl / cln,
            sum([len(v) for k,v in D.items()]) / len(D),
            sum([len(set([label for _,label in v])) for k,v in D.items()]) / len(D)
            )
        
        txt += '# Twenty Most Diverse Concept Sets\n\n'
        txt += '| No. | concept set | distinct labels | concept lists | examples |\n'
        txt += '| --- | --- | --- | --- | --- |\n'
        for i,(k,v) in enumerate(sorted(D.items(), key=lambda x: len(set([label for _,label in
            x[1]])), reverse=True)[:20]):
            txt += '| {0} | {1} | {2} | {3} | {4} |\n'.format(
                    i+1,
                    k,
                    len(set([label for _,label in v])),
                    len(set([clist for clist,_ in v])),
                    ', '.join(sorted(set(['«{0}»'.format(label.replace('*','`*`')) for _,label in
                        v])))
                    )

        txt += '# Twenty Most Frequent Concept Sets\n\n'
        txt += '| No. | concept set | distinct labels | concept lists | examples |\n'
        txt += '| --- | --- | --- | --- | --- |\n'
        for i,(k,v) in enumerate(sorted(D.items(), key=lambda x: len(set([clist for clist,_ in
            x[1]])), reverse=True)[:20]):
            txt += '| {0} | {1} | {2} | {3} | {4} |\n'.format(
                    i+1,
                    k,
                    len(set([label for _,label in v])),
                    len(set([clist for clist,_ in v])),
                    ', '.join(sorted(set(['«{0}»'.format(label.replace('*','`*`')) for _,label in
                        v])))
                    )

        with dpath.joinpath('README.md').open('w', encoding='utf8') as fp:
            fp.write(txt)

    return D, G
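A hedged sketch of the return value, assuming the concepticondata layout the function expects (glosses and file names are made up):

D, G = reflexes(write_stats=False)
# D: Concepticon gloss -> [(conceptlist filename, concept label), ...], e.g.
#    D['HAND'] -> [('Swadesh-1955-100.tsv', 'hand'), ...]
# G: concept label -> [(CONCEPTICON_ID, CONCEPTICON_GLOSS, conceptlist filename), ...]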
Example #56
def gps(args):
    """
    Multilingual,
    lg family, -> Dogon, Atlantic, Mande, Berber, ...
    family code ,
    Language (group), -> Names of Dogon languages
    alternate lg (group), -> comma separated list of language names
    language code, -> ISO 639-3
    Language (based on native name),
    dialect code,
    ISO 3 Letter country code,
    OfficialVillageName, -> name!
    MajorCity,
    PopulationNumber,
    village (RB),
    village (DNAFLA),
    village (SIL),
    village (map), -> alternative name
    Transcribed Village Name, -> keep
    N Lat,
    W Lon,
    NFr,
    WFr,
    Nmn60,
    Wmn60,
    NMinFr,
    WMinFr,
    Ndg,
    Wdg,
    N Lat_2, -> 12 12.123
    W Lon_2, -> 12 12.123
    N SIL,
    W SIL,
    N Lat source,
    WLon source,
    N Lat map,
    WLon map,
    N Lat us,
    W Long us,
    sourceOfCoordinates, -> keep
    name of map, -> keep
    lg comment, -> keep
    industries, -> keep
    weekly market, -> keep
    surnames, -> keep
    social info, -> keep
    Image,
    ImageDescription,
    Audio,
    AudioTranscription,
    Video,
    VideoTranscription
    """
    full_name_map = {
        'Oualo (upper)': 'walo_upper',
        'Oualo (lower)': 'walo_lower',
        'Kenntaba-Leye': 'kentabaley',
        'Djimerou-Doungo': 'djimeroudungo',
        'Sassourou': 'sassouru',
        'Sege-Bougie': 'seguebougie',
        'Fiko': 'ficko',
        'Iribanga (Fulbe)': 'iribanga_fulbe',
        'Madina (near Banggel-Toupe)': 'madina_near_bangueltoupe)',
        'Dourou Tanga (1)': 'douroutanga_1',
        'Dourou Tanga (2)': 'douroutanga_2',
        'Dourou Tanga (3)': 'douroutanga_3',
        'Tena (Tere)': 'tena_aka_tere',
        'Anakaga (Amamounou)': 'anakaga_in_amamounou',
        'Dari (near Hombori)': 'dari_near_hombori',
        'Bamba Tene': 'bambatende',
        'Kenntaba-Do': 'kentabado',
        'Tialegel': 'tialeggel',
        'Bani-Banggou': 'banibangou',
        'Ourobangourdi': 'ourobaangourdi',
        'Ourodjougal': 'ourodiouggal',
        'Yadianga (Fulbe)': 'yadiangapoulogoro',
        'Gueourou (Fulbe)': 'gueouroupulogoro',
        'Tongoro-Legu': 'tongorolegou',
        'Koundougou-Mossi': 'koundougoumouniougoro',
        'Billanto-Bella': 'bella',
        'Dianggassagou (Diemessogou)': 'diangassagou_aka_diemessogou)',
    }
    name_map = {
        'kelmita': 'kelmitaa',
        'yrebann': 'yreban',
        'aouguine': 'aougine',
        'bendielysigen': 'bendielisigen',
        'bendielydana': 'bendielidana',
        'ourongeou': 'ourongueou',
        'oukoulourou': 'oukolourou',
        'bendielygirikombo': 'bendieligirikombo',
        'dianggassagou': 'diangassagou',
        'komokanina': 'komokaninaa',
        'dourouna': 'dourounaa',
        'idielina': 'idielinaa',
        'woltigueri': 'woltiguere',
        'irelikanaw': 'ireli_kanaw',
        'korimaounde': 'kori_maounde',
        'yandaguinedia': 'yandaginedia',
        'boudoufolii': 'boudoufoli_section1',
        'boudoufoliii': 'boudoufoli_section2',
    }

    def location(d):
        if d['OfficialVillageName'] == 'Balaguina (Balaguina-Baboye)':
            d['N Lat'] = d['N Lat'].replace(' 115.3', ' 15.3')
        if d['OfficialVillageName'] == 'Daidourou':
            return None, None
        #if d['W Lon us'] and d['N Lat us']:
        #    return parse_deg(d['N Lat us']), parse_deg(d['W Lon us'])
        lat, lon = parse_deg(d['N Lat']), parse_deg(d['W Lon'])
        if lon:
            lon = -lon
        if lon and lon < -10:
            lon += 10
        return lat, lon

    for d in reader(
            args.data_file('repos', 'GPS_Dogon.csv'), dicts=True):
        for k in d:
            d[k] = d[k].strip()
        if not d['OfficialVillageName']:
            continue
        normname = full_name_map.get(d['OfficialVillageName'].strip())
        if normname is None:
            normname = d['OfficialVillageName'].replace('-', '').replace(' ', '').replace('(', '_aka_').replace(')', '').split(',')[0].strip().lower()
            normname = name_map.get(normname, normname)
        v = Village(
            d['OfficialVillageName'],
            normname,
            GPS_LANGS.get(d['Language (group)']),
            data=d)
        v.lat, v.lon = location(d)
        yield v
Example #57
 def read(self, path, sep="\t", comment="#"):
     with Path(path).open(encoding='utf-8') as handle:
         lines = [unicodedata.normalize('NFC', hline) for hline in handle.readlines()
                  if hline and not hline.startswith(comment)]
     self.extend(list(reader(lines, dicts=True, delimiter=sep)))
Example #58
def main(args):
    if DBSession.bind.dialect.name == 'postgresql':
        Index('ducet', collkey(common.Value.name)).create(DBSession.bind)

    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(
            data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub('\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):

        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue

            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue

            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))

            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)