Example No. 1
    def test_add_delete_rows(self):
        from clldutils.dsv import reader
        from pytsammalex.util import add_rows, filter_rows

        csv_path = self.tmp_path('test.csv')
        add_rows(csv_path, ['a', 'b'], [1, 2], [3, 4])
        self.assertEqual(len(list(reader(csv_path, dicts=True))), 2)
        filter_rows(csv_path, lambda item: item['a'] == '1')
        self.assertEqual(len(list(reader(csv_path, dicts=True))), 1)
        add_rows(csv_path, [1, 2], [3, 4])
        self.assertEqual(len(list(reader(csv_path, dicts=True))), 3)
Example No. 2
def metadata(write_stats=True):
    """Writes statistics on metadata to readme."""
    txt = '# Basic Statistics on Metadata\n\n'
    cnc = list(reader(data_path('concepticon.tsv'), namedtuples=True, delimiter="\t"))
    for i,cl in enumerate(PKG_PATH.joinpath('concept_set_meta').glob('*.tsv')):
        data = list(reader(cl, namedtuples=True, delimiter="\t"))
        txt += '* {0} covers {1} concept sets ({2:.2f} %)\n'.format(cl.name[:-4], len(data), 100 * len(data) / len(cnc))
    if write_stats:
        with PKG_PATH.joinpath('concept_set_meta', 'README.md').open('w', encoding='utf8') as fp:
            fp.write(txt)
Example No. 3
def test_add_delete_rows(tmpdir):
    csv_path = Path(tmpdir.join('test.csv'))
    add_rows(csv_path, ['a', 'b'], [1, 2], [3, 4])
    assert (len(list(reader(csv_path, dicts=True))) == 2)

    filter_rows(csv_path, lambda item: item['a'] == '1')
    assert (len(list(reader(csv_path, dicts=True))) == 1)

    add_rows(csv_path, [1, 2], [3, 4])
    assert (len(list(reader(csv_path, dicts=True))) == 3)
Example No. 4
    def test_rewrite(self):
        from clldutils.dsv import reader, rewrite

        tmp = self.tmp_path('test')
        shutil.copy(FIXTURES.joinpath('tsv.txt').as_posix(), tmp.as_posix())
        rewrite(tmp.as_posix(), lambda i, row: [len(row)], delimiter='\t')
        self.assertEqual(list(reader(tmp))[0], ['2'])

        shutil.copy(FIXTURES.joinpath('csv.txt').as_posix(), tmp.as_posix())
        rewrite(tmp, lambda i, row: row)
        self.assertEqual(list(reader(tmp)), list(reader(FIXTURES.joinpath('csv.txt'))))
Example No. 5
 def __init__(self, dir_):
     self.dir = dir_
     self.datasets = [
         Dataset(base_dir=self.dir.joinpath('datasets'), **r)
         for r in reader(self.dir.joinpath('datasets', 'index.csv'),
                         dicts=True)
     ]
     self.phylogenies = [
         Phylogeny(base_dir=self.dir.joinpath('phylogenies'), **r)
         for r in reader(self.dir.joinpath('phylogenies', 'index.csv'),
                         dicts=True)
     ]
Example No. 6
    def test_add_delete_rows(self):
        from clldutils.dsv import add_rows, filter_rows_as_dict, reader

        csv_path = self.tmp_path('test.csv')
        add_rows(csv_path, ['a', 'b'], [1, 2], [3, 4])
        self.assertEqual(len(list(reader(csv_path, dicts=True))), 2)
        filter_rows_as_dict(csv_path, lambda item: item['a'] == '1')
        self.assertEqual(len(list(reader(csv_path, dicts=True))), 1)
        add_rows(csv_path, [2, 2], [2, 4])
        self.assertEqual(len(list(reader(csv_path, dicts=True))), 3)
        res = filter_rows_as_dict(csv_path, lambda item: item['a'] == '1')
        self.assertEqual(res, 2)
Example No. 7
def ff_images(args):
    tsammalex = {
        i.id: i.taxa__id for i in
        reader(args.data_file('repos', 'tsammalex_images.csv'), namedtuples=True)}

    ref_pattern = re.compile('(?P<ref>[0-9]{5})')
    uploaded = load(args.data_file('repos', 'cdstar.json'))
    files = load(args.data_file('repos', 'Heath_flora_fauna_images.json'))
    files.update(load(args.data_file('repos', 'ffmissing.json')))
    path_to_md5 = {}
    for md5, paths in files.items():
        for path in paths:
            path_to_md5[Path(path.encode('utf8')).stem] = md5
    missed, found, uploaded_ = 0, 0, 0
    for i, img in enumerate(reader(args.data_file('repos', 'dogon_flora-fauna.csv'), delimiter=',', namedtuples=True)):
        stem = Path(img.filenames.encode('utf8')).stem
        assert stem in path_to_md5
        found += 1
        if path_to_md5[stem] in uploaded:
            m = ref_pattern.search(stem)
            uploaded_ += 1
            yield FFImage(
                path_to_md5[stem],
                Path(files[path_to_md5[stem]][0].encode('utf8')).name,
                None,
                m.group('ref') if m else None,
                None,
                [],
                uploaded[path_to_md5[stem]],
                tsammalex.get(path_to_md5[stem]))

    videos = load(args.data_file('repos', 'videos_from_website.json'))
    videos.update(load(args.data_file('repos', 'videos.json')))

    for md5, paths in videos.items():
        if md5 in uploaded:
            path = Path(paths[0].encode('utf8'))
            m = ref_pattern.search(path.stem)
            uploaded_ += 1
            yield FFImage(
                md5,
                path.name,
                None,
                m.group('ref') if m else None,
                None,
                [],
                uploaded[md5],
                tsammalex.get(md5))
        else:
            missed += 1

    print('ff_images', missed, uploaded_)
Example No. 8
 def __init__(self, dir_):
     self.dir = Path(dir_)
     self.datasets = [
         Dataset(base_dir=self.dir.joinpath('datasets'), **r) for r in
         reader(self.dir.joinpath('datasets', 'index.csv'), dicts=True)]
     self.phylogenies = [
         Phylogeny(base_dir=self.dir.joinpath('phylogenies'), **r) for r in
         reader(self.dir.joinpath('phylogenies', 'index.csv'), dicts=True)]
     self.societies = {
         s.id: s for s in chain.from_iterable(d.societies for d in self.datasets)
     }
     self.variables = {
         v.id: v for v in chain.from_iterable(d.variables for d in self.datasets)
     }
     self.sources = BibFile(self.dir.joinpath('datasets', 'sources.bib'))
Example No. 9
def metadata(write_stats=True):
    """Writes statistics on metadata to readme."""
    txt = '# Basic Statistics on Metadata\n\n'
    cnc = list(
        reader(data_path('concepticon.tsv'), namedtuples=True, delimiter="\t"))
    for i, cl in enumerate(
            PKG_PATH.joinpath('concept_set_meta').glob('*.tsv')):
        data = list(reader(cl, namedtuples=True, delimiter="\t"))
        txt += '* {0} covers {1} concept sets ({2:.2f} %)\n'.format(
            cl.name[:-4], len(data),
            100 * len(data) / len(cnc))
    if write_stats:
        with PKG_PATH.joinpath('concept_set_meta',
                               'README.md').open('w', encoding='utf8') as fp:
            fp.write(txt)
Example No. 10
def fill(dataset, data, socids):
    lines_old = set(open(data, encoding="utf8").readlines())
    res = defaultdict(list)
    for item in reader(data, dicts=True):
        res[(item["Dataset"], item["VarID"], item["soc_id"])].append(item)
        keys = list(item.keys())

    print(dataset, len(socids), "societies")

    for var_id, socs in groupby(sorted(res.keys(), key=lambda t: t[1]), key=lambda t: t[1]):
        for ds, soc_id in socids.difference(set((s[0], s[2]) for s in socs)):
            rec = OrderedDict()
            for key in keys:
                rec[key] = ""
            rec.update(soc_id=soc_id, Dataset=ds, Code="NA", VarID=var_id)
            res[(ds, var_id, soc_id)].append(rec)
        assert sum(len(v) for k, v in res.items() if k[1] == var_id) >= len(socids)

    with UnicodeWriter(data) as fp:
        fp.writerow(keys)
        for key in sorted(res.keys()):
            fp.writerows(row.values() for row in res[key])

    # fix line endings:
    with open(data, encoding="utf8") as fp:
        c = fp.read()

    with open(data, "w", encoding="utf8") as fp:
        fp.write(c.replace("\r\n", "\n"))

    lines_new = set(open(data, encoding="utf8").readlines())
    assert lines_old.issubset(lines_new)
    print(len(lines_new.difference(lines_old)), "NA values added")
Example No. 11
def read_scorer(path):
    """
    Read a scoring function in a file into a ScoreDict object.

    Parameters
    ----------
    path : Path
        The path to the input file that shall be read as a scoring dictionary.
        The matrix format is a simple csv-file in which the scoring matrix is
        displayed, with negative values indicating high differences between
        sound segments (or sound classes) and positive values indicating high
        similarity. The matrix should be symmetric, columns should be separated
        by tabstops, and the first column should provide the alphabet for which
        the scoring function is defined.

    Returns
    -------
    scoredict : ~lingpy.algorithm.ScoreDict
        A ScoreDict instance which can be directly passed to LingPy's alignment
        functions.
    """
    chars, matrix = [], []
    for row in reader(path, delimiter='\t'):
        if row:
            chars.append(row[0])
            matrix.append(map(float, row[1:]))
    return ScoreDict(chars, matrix)
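The docstring above already specifies the expected file layout; the following is a minimal sketch of it (the file name scorer.tsv and the values are made up, and read_scorer refers to the function defined above):

from pathlib import Path

# A tiny symmetric, tab-separated scoring matrix: the first column lists the
# alphabet, positive values mark similarity, negative values mark difference.
Path('scorer.tsv').write_text(
    'a\t1.0\t-1.0\t-1.0\n'
    'e\t-1.0\t1.0\t-1.0\n'
    'i\t-1.0\t-1.0\t1.0\n',
    encoding='utf8')

scoredict = read_scorer(Path('scorer.tsv'))
# scoredict can now be passed on to LingPy's alignment functions.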
Example No. 12
def stats():
    lines = [
        '## Concept Lists',
        '',
        ' name | mapped | mergers ',
        ' ---- | ------ | ------- ',
    ]
    
    for cl in sorted(
            PKG_PATH.joinpath('conceptlists').glob('*.tsv'), key=lambda _cl: _cl.name):
        concepts = list(reader(cl, namedtuples=True, delimiter='\t'))
        mapped = len([c for c in concepts if c.CONCEPTICON_ID])
        mapped_ratio = int((mapped / len(concepts)) * 100)
        concepticon_ids = Counter(
            [c.CONCEPTICON_ID for c in concepts if c.CONCEPTICON_ID])
        mergers = len([k for k, v in concepticon_ids.items() if v > 1])

        line = [
            '[%s](%s) ' % (cl.stem, cl.name),
            badge('mapped', '%s%%' % mapped_ratio, Colors.red if mapped_ratio < 99 else Colors.brightgreen),
            badge('mergers', '%s' % mergers, Colors.red if mergers else Colors.brightgreen),
        ]

        lines.append(' | '.join(line))

    with PKG_PATH.joinpath('conceptlists', 'README.md').open('w', encoding='utf8') as fp:
        fp.write('\n'.join(lines))
Example No. 13
def import_gb20_features(datadir, data):
    for feature in reader(os.path.join(datadir, 'gb20features.tsv'),
                          delimiter='\t',
                          dicts=True):
        feature = FeatureSpec(feature)
        f = data.add(Feature,
                     feature.id,
                     id=feature.id,
                     name=feature.name,
                     doc=feature.doc,
                     patron=feature.patron,
                     std_comments=feature.std_comments,
                     name_french=feature.name_french,
                     jl_relevant_unit=feature.jl_relevant_unit,
                     jl_function=feature.jl_function,
                     jl_form=feature.jl_form,
                     hard_to_deny=feature.hard_to_deny,
                     prone_misunderstanding=feature.prone_misunderstanding,
                     requires_extensive_data=feature.requires_extensive_data,
                     last_edited=feature.last_edited,
                     other_survey=feature.other_survey)
        for i, (deid, desc) in enumerate(feature.domain.items()):
            DomainElement(id='%s-%s' % (f.id, deid),
                          parameter=f,
                          abbr=deid,
                          name='%s - %s' % (deid, desc),
                          number=int(deid) if deid != '?' else 999,
                          description=desc,
                          jsondata=dict(icon=ORDERED_ICONS[i].name))
Example No. 14
    def _from(cls, data, container=None, skip_on_error=False):
        container = container or data.parent
        dataset = cls(data.stem)
        dataset.metadata.read(Dataset.filename(data, 'metadata'), container)
        dataset._table = dataset.metadata.get_table()
        dataset.sources.read(Dataset.filename(data, 'sources'), container)
        delimiter = ','
        if dataset.table:
            delimiter = dataset.table.dialect.delimiter
        if data.suffix in TAB_SUFFIXES:
            delimiter = '\t'

        if isinstance(container, Archive):
            rows = container.read_text(data.name).split('\n')
        else:
            rows = data

        for i, row in enumerate(reader(rows, delimiter=delimiter)):
            if i == 0:
                dataset.fields = tuple(row)
            else:
                try:
                    dataset.add_row(row)
                except ValueError as e:
                    if skip_on_error:
                        log.warn('skipping row in line %s: %s' % (i + 1, e))
                    else:
                        raise e
        dataset.table.dialect.delimiter = delimiter
        dataset.table.url = data.name
        return dataset
Example No. 15
def _dtab(dir_, fn):
    lpd = []
    for d in reader(dir_.joinpath(fn), dicts=True, delimiter='\t', quoting=csv.QUOTE_NONE):
        lpd.append({
            k.replace('\ufeff', ''): (v or '').strip()
            for k, v in list(d.items()) + [("fromfile", fn)]})
    return lpd
Example No. 16
def load(table, csv, engine):
    schema = jsonlib.load(
        csv.parent.joinpath(csv.stem + '.' + CsvmJsonAdapter.extension))
    converter = get_converter(schema['tableSchema'], table)
    engine.execute(table.insert(),
                   [converted(d, converter) for d in reader(csv, dicts=True)])
    return schema.get("dc:identifier")
Example No. 17
def stats():
    lines = [
        '## Concept Lists',
        '',
        ' name | mapped | mergers ',
        ' ---- | ------ | ------- ',
    ]

    for cl in sorted(PKG_PATH.joinpath('conceptlists').glob('*.tsv'),
                     key=lambda _cl: _cl.name):
        concepts = list(reader(cl, namedtuples=True, delimiter='\t'))
        mapped = len([c for c in concepts if c.CONCEPTICON_ID])
        mapped_ratio = int((mapped / len(concepts)) * 100)
        concepticon_ids = Counter(
            [c.CONCEPTICON_ID for c in concepts if c.CONCEPTICON_ID])
        mergers = len([k for k, v in concepticon_ids.items() if v > 1])

        line = [
            '[%s](%s) ' % (cl.stem, cl.name),
            badge('mapped', '%s%%' % mapped_ratio,
                  Colors.red if mapped_ratio < 99 else Colors.brightgreen),
            badge('mergers', '%s' % mergers,
                  Colors.red if mergers else Colors.brightgreen),
        ]

        lines.append(' | '.join(line))

    with PKG_PATH.joinpath('conceptlists',
                           'README.md').open('w', encoding='utf8') as fp:
        fp.write('\n'.join(lines))
Example No. 18
def upgrade():
    conn = Connection(op.get_bind())
    example_map = {}

    sid = 204
    for example in jsonload(data_file('lingala_examples.json')):
        sid += 1
        kw = {
            'id': '60-%s' % sid,
            'language_pk': conn.pk(Language, '60'),
            'name': example['Text'],
            'description': example['Translation'],
            'gloss': '\t'.join(example['Gloss'].split()),
            'analyzed': '\t'.join(example['Text'].split()),
            'type': example['Type'].strip().lower(),
            'jsondata': {'sort': int(example['Order_number']), 'alt_translation': None}
        }
        example_map[example['Example_number']] = conn.insert(Sentence, **kw)

    for ve in jsonload(data_file('lingala_value_examples.json')):
        vspk = conn.pk(ValueSet, '60-%s' % ve['Features::Feature_number'])
        vpk = conn.pk(Value, vspk, attr='valueset_pk')
        conn.insert(
            ValueSentence, value_pk=vpk, sentence_pk=example_map[ve['Example_number']])

    for i, comment in enumerate(reader(data_file('lingala_valueset_comments.tab'), delimiter='\t', dicts=True)):
        vspk = conn.pk(ValueSet, '60-%s' % comment['Features::Feature_number'])
        comment['Comments_on_value_assignment'] = comment['Comments_on_value_assignment'].replace('\x0b', '\n')
        conn.update(
            ValueSet,
            {
                'description': comment['Comments_on_value_assignment'],
                'markup_description': None,
            },
            pk=vspk)
Example No. 19
def iter_lexicon(args):
    for fname in ['dogon_lexicon', 'flora_Dogon_Unicode', 'fauna_Dogon_Unicode']:
        for concept in reader(args.data_file('repos', fname + '.csv'), dicts=True):
            entry = Entry(**{v: concept.get(k) for k, v in FIELD_MAP.items()})
            for name, gc in LEX_LANGS[fname].items():
                entry.forms[gc] = concept[name].strip()
            if entry.ref and entry.ref != 'zzz':
                yield entry
Example No. 20
 def read(self, path, sep="\t", comment="#"):
     with Path(path).open(encoding='utf-8') as handle:
         lines = [
             unicodedata.normalize('NFC', hline)
             for hline in handle.readlines()
             if hline and not hline.startswith(comment)
         ]
     self.extend(list(reader(lines, dicts=True, delimiter=sep)))
Example No. 21
 def _iter_etc(self, what):
     delimiter = '\t'
     path = self.dir / 'etc' / (what + '.tsv')
     if not path.exists():
         delimiter = ','
         path = path.parent / (what + '.csv')
     return reader(path, dicts=True,
                   delimiter=delimiter) if path.exists() else []
Example No. 22
 def __init__(self, name_and_date, fp):
     parts = name_and_date.split('_')
     self.date = date(int(parts[-1][:4]), int(parts[-1][4:6]), int(parts[-1][6:8]))
     name = '_'.join(parts[:-1])
     if name.startswith('_') or name.startswith('-'):
         name = name[1:]
     if not name:
         name = 'Codes'
     self.name = name
     list.__init__(self, reader(fp.splitlines(), dicts=True, delimiter='\t'))
Example No. 23
def read_win1252(fname, ignore_dataset=False):
    with open(fname, 'rb') as fp:
        c = fp.read()

    with open(fname, 'wb') as fp:
        fp.write(c.replace(b'\x9d', b''))

    for r in reader(fname, dicts=True, encoding='cp1252'):
        if ignore_dataset or (r.get('dataset') == 'SCCS') or (
                r.get('Dataset') == 'SCCS') or (r.get('Datset') == 'SCCS'):
            yield r
Example No. 24
 def from_file(cls, fname):
     """
     Orthography profiles must be
     - tab-separated CSV files
     - encoded in UTF-8
     - with a header containing a column "Grapheme"
     """
     return cls(*list(
         reader(readlines(fname, normalize='NFD'),
                dicts=True,
                delimiter='\t',
                quotechar=None)))
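A sketch of a profile file that satisfies the requirements listed in the docstring (the file name, the class name Profile, and the graphemes are purely illustrative):

# Contents of a hypothetical, UTF-8 encoded "profile.tsv"
# (columns separated by a single tab):
#
#   Grapheme    IPA
#   sch         ʃ
#   th          θ
#   a           a
#
# It could then be loaded via the classmethod above, e.g.:
#
#   profile = Profile.from_file('profile.tsv')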
Example No. 25
File: util.py Project: clld/csd
def get_sources(args):
    res = {}

    for d in reader(args.data_file('sources_CSD.csv'), delimiter=',', dicts=True):
        res[normalize_sid(d['Abbreviation'])] = d

    for sid in list(SOURCES.keys()):
        _sid = normalize_sid(sid)
        if _sid not in res:
            print('missing sid: %s' % sid)
            res[_sid] = dict(citation=SOURCES[sid], Name=sid, title=SOURCES[sid])

    return res
Example No. 26
 def cmd_install(self, **kw):
     with self.cldf as ds:
         ds.add_concepts(id_factory=lambda d: d.number.replace('.', '-'))
         lmap = ds.add_languages()
         for p in self.raw.glob('*.csv'):
             lid = p.stem.split('-')[1]
             if lid in lmap:
                 for item in reader(p, dicts=True):
                     if item['Phonetic']:
                         ds.add_lexemes(
                             Language_ID=lid,
                             Parameter_ID=item['BNC ID'].replace('.', '-'),
                             Value=unicodedata.normalize('NFC', item['Phonetic']))
Example No. 27
    def __init__(self, path):
        """
        A dataset is initialized by passing its directory path.
        """
        path = Path(path)
        self.id = path.name
        self.log = logging.getLogger(pylexibank.__name__)
        self.dir = path
        self.raw = self.dir.joinpath('raw', 'data')
        if not self.raw.exists():
            self.raw.mkdir()
        self.cldf_dir = self.dir.joinpath('cldf')
        if not self.cldf_dir.exists():
            self.cldf_dir.mkdir()
        self.commands = import_module(self.dir)
        self.md = jsonlib.load(self.dir.joinpath('metadata.json'))
        self.languages = []
        lpath = self.dir.joinpath('languages.csv')
        if lpath.exists():
            for item in reader(lpath, dicts=True):
                if item['GLOTTOCODE'] and not GC_PATTERN.match(
                        item['GLOTTOCODE']):
                    raise ValueError("Wrong glottocode for item {0}".format(
                        item['GLOTTOCODE']))
                self.languages.append(item)
        self.conceptlist = None
        url = self.md.get('dc:conformsTo')
        if url and url.startswith(
                'http://concepticon.clld.org/contributions/'):
            self.conceptlist = url.split('/')[-1]
        self.concepts = []
        cpath = self.dir.joinpath('concepts.csv')
        if cpath.exists():
            self.concepts = list(reader(cpath, dicts=True))
        self.cognates = Cognates()

        # the following attributes are only set when a dataset's cldf method is run:
        self.glottolog_languoids = {}
        self.glottolog_version, self.concepticon_version = None, None
Example No. 28
def pytest_generate_tests(metafunc):
    if 'test_sounds' == metafunc.function.__name__:
        fixturenames = None
        tests = []
        for i, test in enumerate(
                reader(Path(__file__).parent / 'data' / 'test_data.tsv',
                       delimiter='\t',
                       dicts=True)):
            if i == 0:
                fixturenames = list(test.keys())
                fixturenames.pop(fixturenames.index('bipa'))
            del test['bipa']
            if None in test:
                del test[None]
            if len(fixturenames) != len(test.keys()):
                raise ValueError(set(test.keys()) - set(fixturenames))
            tests.append(test)

        attrs = [
            'nfd-normalized', 'clts-normalized', 'aliased', 'generated',
            'stressed'
        ]
        tests = sorted(tests, key=lambda t: tuple([t[a] for a in attrs]))
        batches = []
        for _, ts in groupby(tests, lambda t: tuple([t[a] for a in attrs])):
            for test in ts:
                batches.append(tuple(test.values()))
                break

        metafunc.parametrize(
            ','.join(n.replace('-', '_') for n in fixturenames), batches)
    elif 'test_clicks' == metafunc.function.__name__:
        tests = []
        for test in reader(Path(__file__).parent / 'data' / 'clicks.tsv',
                           delimiter='\t',
                           dicts=True):
            tests.append((test['GRAPHEME'], test['MANNER']))
        metafunc.parametrize('grapheme,gtype', tests)
Example No. 29
def get_wordlist(path,
                 delimiter=",",
                 quotechar='"',
                 normalization_form="NFC",
                 **keywords):
    """
    Load a wordlist from a normal CSV file.

    Parameters
    ----------
    path : str
        The path to your CSV file.
    delimiter : str
        The delimiter in the CSV file.
    quotechar : str
        The quote character in your data.
    row : str (default = "concept")
        A string indicating the name of the row that shall be taken as the
        basis for the tabular representation of the word list.
    col : str (default = "doculect")
        A string indicating the name of the column that shall be taken as the
        basis for the tabular representation of the word list.
    conf : string (default='')
        A string defining the path to the configuration file.
    
    Notes
    -----
    This function returns a :py:class:`~lingpy.basic.wordlist.Wordlist` object.
    In contrast to the normal way of loading a wordlist from a tab-separated
    file, this allows you to load a wordlist directly from any "normal"
    CSV file, with your own delimiters and quote characters. If the first
    cell in the first row of your CSV file is not named "ID", the integer
    identifiers required by LingPy will be created automatically.

    """
    kw = dict(conf="", col="doculect", row="concept")
    kw.update(keywords)
    data = list(dsv.reader(path, delimiter=delimiter, quotechar=quotechar))
    header = [h.lower() for h in data[0]]
    data = data[1:]
    D = {}
    if header[0] == 'id':
        D[0] = header[1:]
        for row in data:
            D[row[0]] = [normalize(normalization_form, n) for n in row[1:]]
    else:
        D[0] = header
        for idx, row in enumerate(data):
            D[idx + 1] = row
    return Wordlist(D, row=kw['row'].lower(), col=kw['col'].lower())
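A minimal usage sketch for the function above (the file name words.csv and its contents are illustrative only):

# words.csv, comma-separated:
#
#   doculect,concept,ipa
#   German,hand,hant
#   English,hand,hænd
#
# Since the first header cell is not "ID", integer row IDs are generated:
#
#   wl = get_wordlist('words.csv')
#   wl.height   # number of distinct concepts
#   wl.width    # number of doculects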
Example No. 30
def read_csv(fname, data):
    concepts = None

    for i, row in enumerate(reader(fname)):
        if i == 0:
            concepts = {j: c for j, c in enumerate(row[1:])}
        else:
            for j, c in enumerate(row[1:]):
                if j % 2 == 0:  # even number
                    loan, form = get_loan_and_form(c)
                else:
                    if form.strip():
                        data[row[0]][concepts[j]] = (form, loan, c)
    return data
Example No. 31
    def __init__(self, clid):
        self.clid = clid
        self.concepts = {
            'CONCEPTICON_ID': {},  # maps ID to GLOSS
            'CONCEPTICON_GLOSS': {},  # maps GLOSS to ID
        }
        for cs in reader(data_path('concepticon.tsv'), dicts=True, delimiter='\t'):
            self.concepts['CONCEPTICON_ID'][cs['ID']] = cs['GLOSS']
            self.concepts['CONCEPTICON_GLOSS'][cs['GLOSS']] = cs['ID']

        self._cid_index = None
        self._cgloss_index = None
        self._link_col = (None, None)
        self._number_index = None
Example No. 32
def stats(args):
    sounds = {}
    for row in reader(args.repos.data_path('sounds.tsv'),
                      delimiter='\t',
                      dicts=True):
        sounds[row['NAME']] = row
    graphs = {}
    for row in reader(args.repos.data_path('graphemes.tsv'),
                      delimiter='\t',
                      dicts=True):
        graphs['{GRAPHEME}-{NAME}-{DATASET}'.format(**row)] = row

    graphdict = defaultdict(list)
    for id_, row in graphs.items():
        graphdict[row['GRAPHEME']] += [row['DATASET']]

    text = [['DATA', 'STATS', 'PERC']]
    text.append([
        'Unique graphemes',
        len(set(row['GRAPHEME'] for row in graphs.values())), ''
    ])
    text.append(['different sounds', len(sounds), ''])
    text.append([
        'singletons',
        len([g for g in graphdict if len(set(graphdict[g])) == 1]), ''
    ])
    text.append([
        'multiples',
        len([g for g in graphdict if len(set(graphdict[g])) > 1]), ''
    ])
    total = len(sounds)
    for type_, count in Counter([s['TYPE']
                                 for s in sounds.values()]).most_common():
        text.append([type_ + 's', count, '{0:.2f}'.format(count / total)])

    print(tabulate.tabulate(text, headers='firstrow'))
Example No. 33
    def __init__(self, clid):
        self.clid = clid
        self.concepts = {
            'CONCEPTICON_ID': {},  # maps ID to GLOSS
            'CONCEPTICON_GLOSS': {},  # maps GLOSS to ID
        }
        for cs in reader(data_path('concepticon.tsv'),
                         dicts=True,
                         delimiter='\t'):
            self.concepts['CONCEPTICON_ID'][cs['ID']] = cs['GLOSS']
            self.concepts['CONCEPTICON_GLOSS'][cs['GLOSS']] = cs['ID']

        self._cid_index = None
        self._cgloss_index = None
        self._link_col = (None, None)
        self._number_index = None
Example No. 34
def list_attributes(write_stats=True):
    """Calculate the addditional attributes in the lists."""
    D = {}
    for i,cl in enumerate(PKG_PATH.joinpath('conceptlists').glob('*.tsv')):
        header = list(reader(cl, delimiter="\t"))[0]
        header = [h for h in header if h not in ['ID', 'CONCEPTICON_ID', 
            'CONCEPTICON_GLOSS', 'ENGLISH', 'GLOSS', 'NUMBER']]
        for h in header:
            try:
                D[h] += [cl.name]
            except KeyError:
                D[h] = [cl.name]
    txt = '# Common Additional Columns of Concept Lists\n'
    for k, v in sorted(D.items(), key=lambda x: len(x[1]), reverse=True):
        txt += '* {2} occurrences: {0}, {1}\n'.format(k, ', '.join(v), len(v))
    print(txt)
Example No. 35
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    concepticon = {
        c.GLOSS: c.CONCEPTICON_ID for c in
        reader(args.data_file('repos', 'conceptlist.tsv'), delimiter='\t', namedtuples=True)
        if c.CONCEPTICON_ID}
    sdata = jsonlib.load(args.data_file('repos', 'classification.json'))
    for concept in DBSession.query(models.Concept).options(joinedload(common.Parameter._files)):
        for t_ in ['image', 'video']:
            setattr(concept, 'count_{0}s'.format(t_), len(getattr(concept, t_ + 's')))
        if concept.jsondata['ref'] in sdata:
            util.update_species_data(concept, sdata[concept.jsondata['ref']])
        if concept.name in concepticon:
            concept.concepticon_id = int(concepticon[concept.name])
Example No. 36
def get_wordlist(path, delimiter=",", quotechar='"', normalization_form="NFC", **keywords):
    """
    Load a wordlist from a normal CSV file.

    Parameters
    ----------
    path : str
        The path to your CSV file.
    delimiter : str
        The delimiter in the CSV file.
    quotechar : str
        The quote character in your data.
    row : str (default = "concept")
        A string indicating the name of the row that shall be taken as the
        basis for the tabular representation of the word list.
    col : str (default = "doculect")
        A string indicating the name of the column that shall be taken as the
        basis for the tabular representation of the word list.
    conf : string (default='')
        A string defining the path to the configuration file.
    
    Notes
    -----
    This function returns a :py:class:`~lingpy.basic.wordlist.Wordlist` object.
    In contrast to the normal way of loading a wordlist from a tab-separated
    file, this allows you to load a wordlist directly from any "normal"
    CSV file, with your own delimiters and quote characters. If the first
    cell in the first row of your CSV file is not named "ID", the integer
    identifiers required by LingPy will be created automatically.

    """
    kw = dict(conf="", col="doculect", row="concept")
    kw.update(keywords)
    data = list(dsv.reader(path, delimiter=delimiter, quotechar=quotechar))
    header = [h.lower() for h in data[0]]
    data = data[1:]
    D = {}
    if header[0] == 'id':
        D[0] = header[1:]
        for row in data:
            D[row[0]] = [normalize(normalization_form, n) for n in row[1:]]
    else:
        D[0] = header
        for idx, row in enumerate(data):
            D[idx + 1] = row
    return Wordlist(D, row=kw['row'].lower(), col=kw['col'].lower())
Example No. 37
def from_csv(data_file, model, data, name=None, visitor=None, filter_=None):
    if filter_ is None:
        filter_ = lambda r: True
    kw = {'delimiter': ',', 'lineterminator': str('\r\n'), 'quotechar': '"'}
    for fname in data_files(data_file, (name or model.__csv_name__) + '.csv'):
        for row in list(reader(fname, **kw))[1:]:
            if row and filter_(row):
                try:
                    obj = model.from_csv(row, data)
                except (KeyError, IndexError):
                    obj = None
                    print(fname)
                    print(row)
                    raise
                if obj:
                    obj = data.add(model, row[0], _obj=obj)
                    if visitor:
                        visitor(obj, row, data)
Example No. 39
def viewCsvImport(request):
    '''
    report stores how the import went.
    Its structure must be an iterable of dicts
    with str values for the keys 'heading' and 'body'.
    '''
    report = []
    if request.method == 'POST' and 'CsvImportForm' in request.POST:
        importMethod = request.POST['tableType']
        fileDicts = list(
            dsv.reader(
                request.FILES['csvFile'].read().decode('utf8').splitlines(),
                dicts=True))
        handlerFunctions = {'ms*l': handleMeaningsLanguageImport}
        if importMethod in handlerFunctions:
            report = handlerFunctions[importMethod](fileDicts, request)

    return render_template(request, "admin/viewCsvImport.html",
                           {'report': report})
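A sketch of the report structure the docstring describes, using a hypothetical handler (not the project's actual implementation):

def handleExampleImport(fileDicts, request):
    # Must return an iterable of dicts whose 'heading' and 'body' values are str:
    return [{'heading': 'Import finished',
             'body': '%s rows read from the uploaded CSV.' % len(fileDicts)}]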
Example No. 40
def list_attributes(write_stats=True):
    """Calculate the addditional attributes in the lists."""
    D = {}
    for i, cl in enumerate(PKG_PATH.joinpath('conceptlists').glob('*.tsv')):
        header = list(reader(cl, delimiter="\t"))[0]
        header = [
            h for h in header if h not in [
                'ID', 'CONCEPTICON_ID', 'CONCEPTICON_GLOSS', 'ENGLISH',
                'GLOSS', 'NUMBER'
            ]
        ]
        for h in header:
            try:
                D[h] += [cl.name]
            except KeyError:
                D[h] = [cl.name]
    txt = '# Common Additional Columns of Concept Lists\n'
    for k, v in sorted(D.items(), key=lambda x: len(x[1]), reverse=True):
        txt += '* {2} occurrences: {0}, {1}\n'.format(k, ', '.join(v), len(v))
    print(txt)
Example No. 41
def upgrade():
    conn = Connection(op.get_bind())
    example_map = {}

    sid = 204
    for example in jsonload(data_file('lingala_examples.json')):
        sid += 1
        kw = {
            'id': '60-%s' % sid,
            'language_pk': conn.pk(Language, '60'),
            'name': example['Text'],
            'description': example['Translation'],
            'gloss': '\t'.join(example['Gloss'].split()),
            'analyzed': '\t'.join(example['Text'].split()),
            'type': example['Type'].strip().lower(),
            'jsondata': {
                'sort': int(example['Order_number']),
                'alt_translation': None
            }
        }
        example_map[example['Example_number']] = conn.insert(Sentence, **kw)

    for ve in jsonload(data_file('lingala_value_examples.json')):
        vspk = conn.pk(ValueSet, '60-%s' % ve['Features::Feature_number'])
        vpk = conn.pk(Value, vspk, attr='valueset_pk')
        conn.insert(ValueSentence,
                    value_pk=vpk,
                    sentence_pk=example_map[ve['Example_number']])

    for i, comment in enumerate(
            reader(data_file('lingala_valueset_comments.tab'),
                   delimiter='\t',
                   dicts=True)):
        vspk = conn.pk(ValueSet, '60-%s' % comment['Features::Feature_number'])
        comment['Comments_on_value_assignment'] = comment[
            'Comments_on_value_assignment'].replace('\x0b', '\n')
        conn.update(ValueSet, {
            'description': comment['Comments_on_value_assignment'],
            'markup_description': None,
        },
                    pk=vspk)
Example No. 42
def import_cldf(srcdir, md, languoids, conceptsets):
    with transaction.manager:
        contrib = Provider(
            id=srcdir.name,
            name=md['dc:title'],
            description=md.get('dc:bibliographicCitation'),
            url=md.get('dc:identifier'),
            license=md.get('dc:license'),
            aboutUrl=md.get('aboutUrl'),
        )
        DBSession.add(contrib)
        sources = {}
        cldfdir = srcdir.joinpath('cldf')
        values = Data()
        for fname in tqdm(list(cldfdir.glob('*' + MD_SUFFIX)), leave=False):
            ds = Dataset.from_metadata(fname)
            for src in ds.sources.items():
                if src.id not in sources:
                    sources[src.id] = cldf2clld(src, contrib, len(sources) + 1)
            import_dataset(ds, contrib, languoids, conceptsets, sources,
                           values)
            DBSession.flush()
        # import cognates:
        if cldfdir.joinpath('cognates.csv').exists():
            for csid, cognates in groupby(
                    reader(cldfdir.joinpath('cognates.csv'), dicts=True),
                    lambda i: i['Cognate_set_ID']):
                cs = Cognateset(id=unique_id(contrib, csid),
                                contribution=contrib)
                for cognate in cognates:
                    cp = values['Counterpart'].get(cognate['Word_ID'])
                    if cp:
                        DBSession.add(
                            CognatesetCounterpart(
                                cognateset=cs,
                                counterpart=cp,
                                cognate_detection_method=cognate[
                                    'Cognate_detection_method'],
                                alignment=cognate['Alignment'],
                                alignment_method=cognate['Alignment_method'],
                                doubt=cognate['Doubt'] == 'True'))
Example No. 43
def import_cldf(srcdir, md, languoids, conceptsets):
    with transaction.manager:
        contrib = Provider(
            id=srcdir.name,
            name=md['dc:title'],
            description=md.get('dc:bibliographicCitation'),
            url=md.get('dc:identifier'),
            license=md.get('dc:license'),
            aboutUrl=md.get('aboutUrl'),
        )
        DBSession.add(contrib)
        sources = {}
        cldfdir = srcdir.joinpath('cldf')
        values = Data()
        for fname in tqdm(list(cldfdir.glob('*' + MD_SUFFIX)), leave=False):
            ds = Dataset.from_metadata(fname)
            for src in ds.sources.items():
                if src.id not in sources:
                    sources[src.id] = cldf2clld(src, contrib, len(sources) + 1)
            import_dataset(ds, contrib, languoids, conceptsets, sources, values)
            DBSession.flush()
        # import cognates:
        if cldfdir.joinpath('cognates.csv').exists():
            for csid, cognates in groupby(
                    reader(cldfdir.joinpath('cognates.csv'), dicts=True),
                    lambda i: i['Cognate_set_ID']):
                cs = Cognateset(id=unique_id(contrib, csid), contribution=contrib)
                for cognate in cognates:
                    cp = values['Counterpart'].get(cognate['Word_ID'])
                    if cp:
                        DBSession.add(CognatesetCounterpart(
                            cognateset=cs,
                            counterpart=cp,
                            cognate_detection_method=cognate['Cognate_detection_method'],
                            alignment=cognate['Alignment'],
                            alignment_method=cognate['Alignment_method'],
                            doubt=cognate['Doubt'] == 'True'))
Example No. 44
 def _read_tsv(self, path):
     return set(tuple(row[1:]) for row in reader(path, delimiter="\t"))
Example No. 45
 def _read_tsv(self, path):
     return set(tuple(row[1:]) for row in reader(path, delimiter='\t'))
Example No. 46
 def read(table):
     fname = args.data_file(table + '.all.csv')
     if not fname.exists():
         fname = args.data_file(table + '.csv')
     return list(dsv.reader(fname, namedtuples=True))
Example No. 47
def get_tab(name):
    """Generator for entries in a tab file specified by name."""
    return dsv.reader(
        get(get_taburls()[name]).split('\n'), namedtuples=True, delimiter='\t')
Example No. 48
def main():
    socs = read_win1252(
        'ALL_soc_ids_to_lang_wAltNames_sources_5Sept2017_win1252.csv')
    links = {
        r['soc_id']: r
        for r in read_win1252(
            'ALL_soc_links_to_other_databases_30Aug2017_win1252.csv')
    }
    locations = {
        'SCCS' + r['soc_id']: r
        for r in reader('../../legacy/LatLong_data.csv', dicts=True)
    }
    for row in reader(
            '../WNAI/DPLACE_RevisedLatLong_27April2017_inclWNAI_SCCS.csv',
            dicts=True):
        if row['Dataset'] == 'SCCS':
            locations[row['soc_id']]['Lat'] = row['soc.latitude']
            locations[row['soc_id']]['Long'] = row['soc.longitude']

    with UnicodeWriter('societies.csv') as w:
        w.writerow([f.name for f in attr.fields(Society)])
        for soc in socs:
            kw = {
                'id': soc['soc_id'],
                'glottocode': soc['glottolog_id'],
                'glottocode_comment': 'Lang_assignment_change_notes'
            }
            for col in [
                    'xd_id',
                    'pref_name_for_society',
                    'ORIG_name_and_ID_in_this_dataset',
                    'alt_names_by_society',
                    'main_focal_year',
            ]:
                kw[col] = soc[col]

            for col in ['Lat', 'Long', 'origLat', 'origLong', 'Comment']:
                kw[col] = locations[soc['soc_id']][col]

            kw['HRAF_name_ID'] = links[soc['soc_id']]['HRAF_name_ID']
            kw['HRAF_link'] = links[soc['soc_id']]['HRAF_link']
            w.writerow(attr.astuple(Society(**kw)))

    with UnicodeWriter('societies_mapping.csv') as w:
        w.writerow(['id', 'related'])
        for sid, l in links.items():
            rels = []
            for dsid, suffix in [
                ('EA', '1'),
                ('EA', '2'),
                ('Binford', '1'),
                ('Binford', '2'),
                ('Binford', '3'),
                ('SCCS', ''),
                ('WNAI', '1'),
                ('WNAI', '2'),
                ('WNAI', '3'),
                ('WNAI', '4'),
                ('WNAI', '5'),
            ]:
                if dsid == 'SCCS':
                    label = l['{0}_society_equivalent{1}'.format(dsid, suffix)]
                else:
                    label = l['{0}_label_society_equivalent{1}'.format(
                        dsid, suffix)]
                id = l['{0}_id_society_equivalent{1}'.format(dsid, suffix)]
                if label and id:
                    rels.append('{0}: {1} [{2}]'.format(dsid, label, id))
            w.writerow([sid, '; '.join(rels)])

    var_info = {
        r['source']: r['APA_reference']
        for r in read_win1252('SCCS_variable_sources_bibtex_to_APA.csv',
                              ignore_dataset=True)
    }

    with UnicodeWriter('variables.csv') as w:
        fm = OrderedDict([
            ('VarID', 'id'),
            ('Category', 'category'),
            ('VarTitle', 'title'),
            ('VarDefinition', 'definition'),
            ('VarType', 'type'),
            ('UserNotes', 'notes'),
            ('source', 'source'),
            ('VarTitleShort', 'changes'),
            ('Unit', 'units'),
        ])
        w.writerow(fm.values())
        for row in read_win1252(
                'SCCS_Full_VariableList_12Sept2017_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            row['VarType'] = row['VarType'].capitalize()
            if row['VarDefinition']:
                row['VarDefinition'] += '\n\n'
            row['VarDefinition'] += var_info.get(row['source'], row['source'])
            w.writerow([row[f] for f in fm.keys()])

    with UnicodeWriter('codes.csv') as w:
        fm = OrderedDict([
            ('VarID', 'var_id'),
            ('Code', 'code'),
            ('CodeDescription', 'description'),
            ('ShortName', 'name'),
        ])
        w.writerow(fm.values())
        for row in read_win1252(
                'SCCS_CodeDescriptions_12Sept2017_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            w.writerow([row[f] for f in fm.keys()])

    with UnicodeWriter('data.csv') as w:
        fm = OrderedDict([
            ('soc_id', 'soc_id'),
            ('SubCase', 'sub_case'),
            ('Year', 'year'),
            ('VarID', 'var_id'),
            ('Code', 'code'),
            ('EthnoReferences', 'references'),
            ('AdminComment', 'admin_comment'),
            ('UserComment', 'comment'),
            ('SourceCodedData', 'source_coded_data'),
        ])
        w.writerow(fm.values())
        for row in read_win1252(
                'Full_SCCS_data_12Sept2017_FINAL_329451rows_win1252.csv'):
            row['VarID'] = 'SCCS' + row['VarID']
            w.writerow([row[f] for f in fm.keys()])
Example No. 49
    def load_glottolog_data(self):
        """
        Loads the Glottolog classification information from the appropriate
        newick file, parses it and stores the required datastructure in
        self.classification.
        """
        # Don't load if the analysis doesn't use it
        if not self.check_glottolog_required():
            return
        # Don't load if we already have - can this really happen?
        if self.glottolog_loaded:
            return
        self.glottolog_loaded = True

        label2name = {}
        glottocode2node = {}

        def parse_label(label):
            match = GLOTTOLOG_NODE_LABEL.match(label)
            label2name[label] = (match.group('name').strip().replace("\\'","'"), match.group('glottocode'))
            return (
                match.group('name').strip(),
                match.group('glottocode'),
                match.group('isocode'))

        def get_classification(node):
            res = []
            ancestor = node.ancestor
            while ancestor:
                res.append(label2name[ancestor.name])
                ancestor = ancestor.ancestor
            return list(reversed(res))

        # Walk the tree and build the classifications dictionary
        glottolog_trees = newick.read(get_glottolog_data('newick', self.glottolog_release))
        for tree in glottolog_trees:
            for node in tree.walk():
                name, glottocode, isocode = parse_label(node.name)
                classification = get_classification(node)
                self.classifications[glottocode] = classification
                if isocode:
                    self.classifications[isocode] = classification
                glottocode2node[glottocode] = node

        # Load geographic metadata
        for t in reader(
                get_glottolog_data('geo', self.glottolog_release), namedtuples=True):
            if t.macroarea:
                self.glotto_macroareas[t.glottocode] = t.macroarea
                for isocode in t.isocodes.split():
                    self.glotto_macroareas[isocode] = t.macroarea
            if self.location_data:
                continue # Use user-supplied data instead

            if t.latitude and t.longitude:
                latlon = (float(t.latitude), float(t.longitude))
                self.locations[t.glottocode] = latlon
                for isocode in t.isocodes.split():
                    self.locations[isocode] = latlon

        if self.location_data:
            return

        # Second pass of geographic data to handle dialects, which inherit
        # their parent language's location
        for t in reader(
                get_glottolog_data('geo', self.glottolog_release), namedtuples=True):
            if t.level == "dialect":
                failed = False
                node = glottocode2node[t.glottocode]
                ancestor = node.ancestor
                while label2name[ancestor.name][1] not in self.locations:
                    if not ancestor.ancestor:
                        # We've hit the root without finding an ancestral node
                        # with location data!
                        failed = True
                        break
                    else:
                        ancestor = ancestor.ancestor
                if failed:
                    continue
                latlon = self.locations[label2name[ancestor.name][1]]
                self.locations[t.glottocode] = latlon
                for isocode in t.isocodes.split():
                    self.locations[isocode] = latlon
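For orientation, a sketch of what parse_label extracts, assuming Glottolog newick node labels of the shape "Name [glottocode][isocode]" (the concrete label below is made up):

#   name, glottocode, isocode = parse_label("Dogon [dogo1299][dts]")
#   # -> ('Dogon', 'dogo1299', 'dts'); label2name keeps (name, glottocode).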
Example No. 50
def read_csv(path):
    return list(reader(path, dicts=True))
Example No. 51
File: util.py Project: clld/phoible
def get_rows(args, name):  # pragma: no cover
    for i, row in enumerate(
            reader(args.data_file('InventoryID-%s.csv' % name), delimiter='\t')):
        if i and row[1] != 'NA':
            yield row
Example No. 52
def read_all(fname, **kw):
    kw.setdefault('delimiter', '\t')
    if not kw.get('dicts'):
        kw.setdefault('namedtuples', True)
    return list(dsv.reader(fname, **kw))
Example No. 53
def tsv_items(path, ordered=False):
    return list(reader(path, delimiter='\t', dicts=True))
Example No. 54
 def read_csv(self, fname, **kw):
     return list(reader(self.joinpath(fname), **kw))
Example No. 55
    def test_reader(self):
        from clldutils.dsv import reader

        lines = ['first\tline', 'sücond\tläneß']
        encoded_lines = [l.encode('utf8') for l in lines]
        csv_lines = [l.replace('\t', ',') for l in lines]

        def check(r):
            res = list(r)
            assert len(res) == 2
            assert res[1][1] == 'läneß'

        check(reader(lines, delimiter='\t'))
        for lt in ['\n', '\r\n', '\r']:
            if PY3:  # pragma: no cover
                # Simulate file opened in text mode:
                fp = StringIO(lt.join(lines), newline='')
            else:
                # Simulate file opened in binary mode:
                fp = BytesIO(to_binary(lt).join(encoded_lines))
            check(reader(fp, delimiter='\t'))
        check(reader(FIXTURES.joinpath('csv.txt')))

        res = list(reader(FIXTURES.joinpath('tsv.txt'), namedtuples=True, delimiter='\t'))
        assert res[0].a_name == 'b'
        # Missing column values should be set to None:
        assert res[2].a_name is None

        r = list(reader(lines, dicts=True, delimiter='\t'))
        assert len(r) == 1 and r[0]['first'] == 'sücond'
        r = list(reader(lines, namedtuples=True, delimiter='\t'))
        assert len(r) == 1 and r[0].first == 'sücond'
        r = list(reader(csv_lines, namedtuples=True))
        assert len(r) == 1 and r[0].first == 'sücond'
        self.assertEqual(list(reader([], dicts=True, delimiter='\t')), [])
        self.assertEqual(
            list(reader([''], dicts=True, fieldnames=['a', 'b'], delimiter='\t')), [])
        self.assertEqual(list(reader(['a,b', ''], dicts=True, delimiter='\t')), [])

        r = reader(
            ['a,b', '1,2,3,4', '1'], dicts=True, restkey='x', restval='y', delimiter=',')
        self.assertEqual(list(r), [dict(a='1', b='2', x=['3', '4']), dict(a='1', b='y')])
Example No. 56
def reflexes(write_stats=True, path='concepticondata'):
    """
    Returns a dictionary with concept set labels as keys and tuples of concept
    list identifier and concept label as values.
    """
    D, G = {}, {}
    cpl = 0
    cln = 0
    clb = set([])
    
    dpath = Path(path) if path else PKG_PATH
    
    for i, cl in enumerate(dpath.joinpath('conceptlists').glob('*.tsv')):
        concepts = list(reader(cl, namedtuples=True, delimiter="\t"))
        for j,concept in enumerate([c for c in concepts if c.CONCEPTICON_ID]):
            label = concept.GLOSS if hasattr(concept, 'GLOSS') else concept.ENGLISH
            name = cl.name
            try:
                D[concept.CONCEPTICON_GLOSS] += [(name, label)]
            except KeyError:
                D[concept.CONCEPTICON_GLOSS] = [(name, label)]
            try:
                G[label] += [(concept.CONCEPTICON_ID, concept.CONCEPTICON_GLOSS, name)]
            except KeyError:
                G[label] = [(concept.CONCEPTICON_ID, concept.CONCEPTICON_GLOSS, name)]
            clb.add(label)
            cpl += 1
        cln += 1
    # write basic statistics and most frequent glosses
    if write_stats:
        txt = """# Concepticon Statistics
* concept sets (used): {0}
* concept lists: {1}
* concept labels: {2}
* concept labels (unique): {3}
* Ø concepts per list: {4:.2f}
* Ø concepts per concept set: {5:.2f}
* Ø unique concept labels per concept set: {6:.2f}

"""
        txt = txt.format(
            len(D),
            cln,
            cpl,
            len(clb),
            cpl / cln,
            sum([len(v) for k,v in D.items()]) / len(D),
            sum([len(set([label for _,label in v])) for k,v in D.items()]) / len(D)
            )
        
        txt += '# Twenty Most Diverse Concept Sets\n\n'
        txt += '| No. | concept set | distinct labels | concept lists | examples |\n'
        txt += '| --- | --- | --- | --- | --- |\n'
        for i,(k,v) in enumerate(sorted(D.items(), key=lambda x: len(set([label for _,label in
            x[1]])), reverse=True)[:20]):
            txt += '| {0} | {1} | {2} | {3} | {4} |\n'.format(
                    i+1,
                    k,
                    len(set([label for _,label in v])),
                    len(set([clist for clist,_ in v])),
                    ', '.join(sorted(set(['«{0}»'.format(label.replace('*','`*`')) for _,label in
                        v])))
                    )

        txt += '# Twenty Most Frequent Concept Sets\n\n'
        txt += '| No. | concept set | distinct labels | concept lists | examples |\n'
        txt += '| --- | --- | --- | --- | --- |\n'
        for i,(k,v) in enumerate(sorted(D.items(), key=lambda x: len(set([clist for clist,_ in
            x[1]])), reverse=True)[:20]):
            txt += '| {0} | {1} | {2} | {3} | {4} |\n'.format(
                    i+1,
                    k,
                    len(set([label for _,label in v])),
                    len(set([clist for clist,_ in v])),
                    ', '.join(sorted(set(['«{0}»'.format(label.replace('*','`*`')) for _,label in
                        v])))
                    )

        with dpath.joinpath('README.md').open('w', encoding='utf8') as fp:
            fp.write(txt)

    return D, G
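A hypothetical call of the function above; the keys and tuples shown only illustrate the structure of the two returned dictionaries, not actual data:

#   D, G = reflexes(write_stats=False, path='concepticondata')
#   D['HAND']   # e.g. [('Swadesh-1955-100.tsv', 'hand'), ...]
#   G['hand']   # e.g. [('1277', 'HAND', 'Swadesh-1955-100.tsv'), ...]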
Example No. 57
def gps(args):
    """
    Multilingual,
    lg family, -> Dogon, Atlantic, Mande, Berber, ...
    family code ,
    Language (group), -> Names of Dogon languages
    alternate lg (group), -> comma separated list of language names
    language code, -> ISO 639-3
    Language (based on native name),
    dialect code,
    ISO 3 Letter country code,
    OfficialVillageName, -> name!
    MajorCity,
    PopulationNumber,
    village (RB),
    village (DNAFLA),
    village (SIL),
    village (map), -> alternative name
    Transcribed Village Name, -> keep
    N Lat,
    W Lon,
    NFr,
    WFr,
    Nmn60,
    Wmn60,
    NMinFr,
    WMinFr,
    Ndg,
    Wdg,
    N Lat_2, -> 12 12.123
    W Lon_2, -> 12 12.123
    N SIL,
    W SIL,
    N Lat source,
    WLon source,
    N Lat map,
    WLon map,
    N Lat us,
    W Long us,
    sourceOfCoordinates, -> keep
    name of map, -> keep
    lg comment, -> keep
    industries, -> keep
    weekly market, -> keep
    surnames, -> keep
    social info, -> keep
    Image,
    ImageDescription,
    Audio,
    AudioTranscription,
    Video,
    VideoTranscription
    """
    full_name_map = {
        'Oualo (upper)': 'walo_upper',
        'Oualo (lower)': 'walo_lower',
        'Kenntaba-Leye': 'kentabaley',
        'Djimerou-Doungo': 'djimeroudungo',
        'Sassourou': 'sassouru',
        'Sege-Bougie': 'seguebougie',
        'Fiko': 'ficko',
        'Iribanga (Fulbe)': 'iribanga_fulbe',
        'Madina (near Banggel-Toupe)': 'madina_near_bangueltoupe)',
        'Dourou Tanga (1)': 'douroutanga_1',
        'Dourou Tanga (2)': 'douroutanga_2',
        'Dourou Tanga (3)': 'douroutanga_3',
        'Tena (Tere)': 'tena_aka_tere',
        'Anakaga (Amamounou)': 'anakaga_in_amamounou',
        'Dari (near Hombori)': 'dari_near_hombori',
        'Bamba Tene': 'bambatende',
        'Kenntaba-Do': 'kentabado',
        'Tialegel': 'tialeggel',
        'Bani-Banggou': 'banibangou',
        'Ourobangourdi': 'ourobaangourdi',
        'Ourodjougal': 'ourodiouggal',
        'Yadianga (Fulbe)': 'yadiangapoulogoro',
        'Gueourou (Fulbe)': 'gueouroupulogoro',
        'Tongoro-Legu': 'tongorolegou',
        'Koundougou-Mossi': 'koundougoumouniougoro',
        'Billanto-Bella': 'bella',
        'Dianggassagou (Diemessogou)': 'diangassagou_aka_diemessogou)',
    }
    name_map = {
        'kelmita': 'kelmitaa',
        'yrebann': 'yreban',
        'aouguine': 'aougine',
        'bendielysigen': 'bendielisigen',
        'bendielydana': 'bendielidana',
        'ourongeou': 'ourongueou',
        'oukoulourou': 'oukolourou',
        'bendielygirikombo': 'bendieligirikombo',
        'dianggassagou': 'diangassagou',
        'komokanina': 'komokaninaa',
        'dourouna': 'dourounaa',
        'idielina': 'idielinaa',
        'woltigueri': 'woltiguere',
        'irelikanaw': 'ireli_kanaw',
        'korimaounde': 'kori_maounde',
        'yandaguinedia': 'yandaginedia',
        'boudoufolii': 'boudoufoli_section1',
        'boudoufoliii': 'boudoufoli_section2',
    }

    def location(d):
        if d['OfficialVillageName'] == 'Balaguina (Balaguina-Baboye)':
            d['N Lat'] = d['N Lat'].replace(' 115.3', ' 15.3')
        if d['OfficialVillageName'] == 'Daidourou':
            return None, None
        #if d['W Lon us'] and d['N Lat us']:
        #    return parse_deg(d['N Lat us']), parse_deg(d['W Lon us'])
        lat, lon = parse_deg(d['N Lat']), parse_deg(d['W Lon'])
        if lon:
            lon = -lon
        if lon and lon < -10:
            lon += 10
        return lat, lon

    for d in reader(
            args.data_file('repos', 'GPS_Dogon.csv'), dicts=True):
        for k in d:
            d[k] = d[k].strip()
        if not d['OfficialVillageName']:
            continue
        normname = full_name_map.get(d['OfficialVillageName'].strip())
        if normname is None:
            normname = d['OfficialVillageName'].replace('-', '').replace(' ', '').replace('(', '_aka_').replace(')', '').split(',')[0].strip().lower()
            normname = name_map.get(normname, normname)
        v = Village(
            d['OfficialVillageName'],
            normname,
            GPS_LANGS.get(d['Language (group)']),
            data=d)
        v.lat, v.lon = location(d)
        yield v
Example No. 58
 def read(self, path, sep="\t", comment="#"):
     with Path(path).open(encoding='utf-8') as handle:
         lines = [unicodedata.normalize('NFC', hline) for hline in handle.readlines()
                  if hline and not hline.startswith(comment)]
     self.extend(list(reader(lines, dicts=True, delimiter=sep)))
Example No. 59
def main(args):
    if DBSession.bind.dialect.name == 'postgresql':
        Index('ducet', collkey(common.Value.name)).create(DBSession.bind)

    def data_file(*comps):
        return Path(args.data_repos).joinpath('tsammalexdata', 'data', *comps)

    data = Data()
    data.add(
        common.Dataset,
        'tsammalex',
        id="tsammalex",
        name="Tsammalex",
        description="Tsammalex: A lexical database on plants and animals",
        publisher_name="Max Planck Institute for the Science of Human History",
        publisher_place="Jena",
        publisher_url="http://www.shh.mpg.de",
        domain='tsammalex.clld.org',
        license='http://creativecommons.org/licenses/by/4.0/',
        contact='*****@*****.**',
        jsondata={
            'license_icon': 'cc-by.png',
            'license_name': 'Creative Commons Attribution 4.0 International License'})
    data.add(common.Contribution, 'tsammalex', name="Tsammalex", id="tsammalex")

    for rec in Database.from_file(data_file('sources.bib'), lowercase=True):
        data.add(models.Bibrec, rec.id, _obj=bibtex2source(rec, cls=models.Bibrec))

    load_ecoregions(data_file, data)
    load_countries(data)
    second_languages = {}

    def languoid_visitor(lang, row, _):
        add_language_codes(
            data, lang, lang.id.split('-')[0], None, glottocode=row[2] or None)
        second_languages[row[0]] = row[8]

    def habitat_visitor(cat, *_):
        cat.is_habitat = True

    def taxon_visitor(auto, taxon, *_):
        if auto.get(taxon.id):
            update_taxon_data(taxon, auto[taxon.id], data)
        else:
            print('--> missing in taxa.json:', taxon.id, taxon.name)
        taxon.countries_str = ' '.join([e.id for e in taxon.countries])
        taxon.ecoregions_str = ' '.join([e.id for e in taxon.ecoregions])

    auto = {s['id']: s for s in jsonload(data_file('taxa.json'))}
    for model, kw in [
        (models.Lineage, {}),
        (models.Use, {}),
        (models.TsammalexContributor, {}),
        (models.Languoid, dict(visitor=languoid_visitor)),
        (models.Category, dict(name='categories')),
        (models.Category, dict(name='habitats', visitor=habitat_visitor)),
        (models.Taxon, dict(visitor=partial(taxon_visitor, auto))),
        (models.Name, dict(filter_=lambda r: 'xxx' not in r[1])),
    ]:
        from_csv(data_file, model, data, **kw)

    for key, ids in second_languages.items():
        target = data['Languoid'][key]
        for lid in models.split_ids(ids):
            if lid in data['Languoid']:
                # we ignore 2nd languages which are not yet in Tsammalex.
                target.second_languages.append(data['Languoid'][lid])

    def image_url(source_url, type_):
        return re.sub(r'\.[a-zA-Z]+$', '.jpg', source_url).replace(
            '/original/', '/%s/' % type_)

    for fname in data_files(data_file, 'images.csv'):

        for image in reader(fname, namedtuples=True, delimiter=","):
            if image.taxa__id not in data['Taxon']:
                continue

            url = URL(image.source_url)
            if url.host() != 'edmond.mpdl.mpg.de':
                continue

            jsondata = dict(
                url=image.source_url,
                thumbnail=image_url(image.source_url, 'thumbnail'),
                web=image_url(image.source_url, 'web'))

            f = common.Parameter_files(
                object=data['Taxon'][image.taxa__id],
                id=image.id,
                name=image.tags,
                jsondata=jsondata,
                mime_type=image.mime_type)
            for k in 'source creator date place comments permission'.split():
                v = getattr(image, k)
                if v:
                    models.ImageData(key=k, value=v, image=f)