Example #1
    def from_metadata(cls, fname):
        fname = pathlib.Path(fname)
        if fname.is_dir():
            name = '{0}{1}'.format(cls.__name__, MD_SUFFIX)
            tablegroup = TableGroup.from_file(pkg_path('modules', name))
            # adapt the path of the metadata file such that paths to tables are resolved
            # correctly:
            tablegroup._fname = fname.joinpath(name)
        else:
            tablegroup = TableGroup.from_file(fname)

        comps = collections.Counter()
        for table in tablegroup.tables:
            try:
                dt = Dataset.get_tabletype(table)
                if dt:
                    comps.update([dt])
            except ValueError:
                pass
        if comps and comps.most_common(1)[0][1] > 1:
            raise ValueError('{0}: duplicate components!'.format(fname))

        for mod in get_modules():
            if mod.match(tablegroup):
                return mod.cls(tablegroup)
        return cls(tablegroup)
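
A minimal usage sketch for the classmethod above, assuming it is exposed as pycldf's Dataset.from_metadata; the metadata path is a placeholder:

# Hypothetical usage (path is a placeholder):
from pycldf import Dataset

ds = Dataset.from_metadata('cldf/Wordlist-metadata.json')
print(ds.module)  # the module matched via get_modules(), e.g. 'Wordlist'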
Example #2
    def from_metadata(cls, fname):
        fname = Path(fname)
        if fname.is_dir():
            name = '{0}{1}'.format(cls.__name__, MD_SUFFIX)
            tablegroup = TableGroup.from_file(pkg_path('modules', name))
            # adapt the path of the metadata file such that paths to tables are resolved
            # correctly:
            tablegroup._fname = fname.joinpath(name)
        else:
            tablegroup = TableGroup.from_file(fname)

        for mod in get_modules():
            if mod.match(tablegroup):
                return mod.cls(tablegroup)
        return cls(tablegroup)
Example #3
def iteraliases(name, table_name=None):
    for table in TableGroup.from_file(data_path(name, 'metadata.json')).tables:
        if table_name is None or table.local_name == table_name:
            for row in table:
                if row:
                    for alias in row['alias']:
                        yield char(alias), char(row['char'])
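
iteraliases is a generator; a hedged usage sketch follows, where the dataset name 'charset' is an invented placeholder (the snippet does not show which metadata files exist under data_path()):

# Hypothetical usage: stream (alias, canonical) character pairs.
for alias, canonical in iteraliases('charset'):
    print('{0} -> {1}'.format(alias, canonical))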
Example #4
 def metadata(self):
     md = self.path.parent.joinpath(self.path.name + '-metadata.json')
     if not md.exists():
         ddir = self._api.data_path() if hasattr(self._api, 'data_path') \
             else REPOS_PATH.joinpath('concepticondata')
         if self.local:
             md = ddir.joinpath('conceptlists', 'local-metadata.json')
         if not md.exists():
             md = ddir.joinpath('conceptlists', 'default-metadata.json')
     tg = TableGroup.from_file(md)
     if isinstance(self._api, Path):
         tg._fname = self._api.parent.joinpath(self._api.name + '-metadata.json')
     tg.tables[0].url = Link('{0}.tsv'.format(self.id))
     return tg.tables[0]
Example #5
    def from_metadata(cls, fname):
        fname = Path(fname)
        if fname.is_dir():
            name = '{0}{1}'.format(cls.__name__, MD_SUFFIX)
            tablegroup = TableGroup.from_file(pkg_path('modules', name))
            # adapt the path of the metadata file such that paths to tables are resolved
            # correctly:
            tablegroup._fname = fname.joinpath(name)
        else:
            tablegroup = TableGroup.from_file(fname)

        comps = Counter()
        for table in tablegroup.tables:
            try:
                comps.update([Dataset.get_tabletype(table)])
            except ValueError:
                pass
        if comps and comps.most_common(1)[0][1] > 1:
            raise ValueError('{0}: duplicate components!'.format(fname))

        for mod in get_modules():
            if mod.match(tablegroup):
                return mod.cls(tablegroup)
        return cls(tablegroup)
Example #6
def get_modules():
    global _modules
    if not _modules:
        ds = sys.modules[__name__]
        for p in pkg_path('modules').glob('*{0}'.format(MD_SUFFIX)):
            tg = TableGroup.from_file(p)
            mod = Module(
                tg.common_props['dc:conformsTo'],
                tg.tables[0].url.string if tg.tables else None)
            mod.cls = getattr(ds, mod.id)
            _modules.append(mod)
        # prefer Wordlist over ParallelText (forms.csv)
        _modules = sorted(
            _modules,
            key=lambda m: (m.cls in (Wordlist, ParallelText), m.cls is ParallelText))
    return _modules
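
Because get_modules() sorts the generic forms.csv matchers (Wordlist, ParallelText) last, more specific modules win in from_metadata's matching loop. A small sketch of inspecting the registry, using the attribute names shown in the snippet above:

# Hypothetical usage: list registered CLDF modules in match order.
for mod in get_modules():
    print(mod.id, mod.cls.__name__)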
Example #7
 def tg(self):
     md = self.path.parent.joinpath(self.path.name + MD_SUFFIX)
     if not md.exists():
         if hasattr(self._api, 'repos'):
             ddir = self._api.path('concepticondata')
             if self.local:
                 md = ddir.joinpath('conceptlists', 'local' + MD_SUFFIX)
             if not md.exists():
                 md = ddir.joinpath('conceptlists', 'default' + MD_SUFFIX)
         else:
             md = Path(__file__).parent / 'conceptlist-metadata.json'
     tg = TableGroup.from_file(md)
     if isinstance(self._api, Path):
         tg._fname = self._api.parent.joinpath(self._api.name + MD_SUFFIX)
     tg.tables[0].url = Link('{0}.tsv'.format(self.id))
     return tg
Example #8
    def validate(self, log=None, validators=None):
        validators = validators or []
        validators.extend(VALIDATORS)
        success = True
        default_tg = TableGroup.from_file(
            pkg_path('modules', '{0}{1}'.format(self.module, MD_SUFFIX)))
        for default_table in default_tg.tables:
            dtable_uri = default_table.common_props['dc:conformsTo']
            try:
                table = self[dtable_uri]
            except KeyError:
                log_or_raise('{0} requires {1}'.format(self.module, dtable_uri), log=log)
                success = False
                table = None

            if table:
                default_cols = {
                    c.propertyUrl.uri for c in default_table.tableSchema.columns
                    if c.required or c.common_props.get('dc:isRequiredBy')}
                cols = {
                    c.propertyUrl.uri for c in table.tableSchema.columns
                    if c.propertyUrl}
                table_uri = table.common_props['dc:conformsTo']
                for col in default_cols - cols:
                    log_or_raise('{0} requires column {1}'.format(table_uri, col), log=log)
                    success = False

        for table in self.tables:
            type_uri = table.common_props.get('dc:conformsTo')
            if type_uri:
                try:
                    TERMS.is_cldf_uri(type_uri)
                except ValueError:
                    success = False
                    log_or_raise('invalid CLDF URI: {0}'.format(type_uri), log=log)

            # FIXME: check whether table.common_props['dc:conformsTo'] is in validators!
            validators_ = []
            for col in table.tableSchema.columns:
                if col.propertyUrl:
                    col_uri = col.propertyUrl.uri
                    try:
                        TERMS.is_cldf_uri(col_uri)
                    except ValueError:
                        success = False
                        log_or_raise('invalid CLDF URI: {0}'.format(col_uri), log=log)
                for table_, col_, v_ in validators:
                    if (not table_ or table is self.get(table_)) and col is self.get((table, col_)):
                        validators_.append((col, v_))

            fname = Path(table.url.resolve(table._parent.base))
            if fname.exists():
                for fname, lineno, row in table.iterdicts(log=log, with_metadata=True):
                    for col, validate in validators_:
                        try:
                            validate(self, table, col, row)
                        except ValueError as e:
                            log_or_raise(
                                '{0}:{1}:{2} {3}'.format(fname.name, lineno, col.name, e),
                                log=log)
                            success = False
                if not table.check_primary_key(log=log):
                    success = False

        if not self.tablegroup.check_referential_integrity(log=log):
            success = False

        return success
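
A hedged usage sketch for validate(), assuming a pycldf-style Dataset instance ds obtained as in the earlier examples; passing a logger makes log_or_raise record problems as warnings instead of raising:

# Hypothetical usage: validate a dataset, logging problems instead of raising.
import logging

log = logging.getLogger('cldf')
ok = ds.validate(log=log)
print('valid' if ok else 'invalid')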
Example #9
    def validate(self, log=None, validators=None):
        validators = validators or []
        validators.extend(VALIDATORS)
        success = True
        default_tg = TableGroup.from_file(
            pkg_path('modules', '{0}{1}'.format(self.module, MD_SUFFIX)))
        for default_table in default_tg.tables:
            dtable_uri = default_table.common_props['dc:conformsTo']
            try:
                table = self[dtable_uri]
            except KeyError:
                log_or_raise('{0} requires {1}'.format(self.module, dtable_uri), log=log)
                success = False
                table = None

            if table:
                default_cols = {
                    c.propertyUrl.uri for c in default_table.tableSchema.columns
                    if c.required or c.common_props.get('dc:isRequiredBy')}
                cols = {
                    c.propertyUrl.uri for c in table.tableSchema.columns
                    if c.propertyUrl}
                table_uri = table.common_props['dc:conformsTo']
                for col in default_cols - cols:
                    log_or_raise('{0} requires column {1}'.format(table_uri, col), log=log)
                    success = False

        for table in self.tables:
            type_uri = table.common_props.get('dc:conformsTo')
            if type_uri:
                try:
                    TERMS.is_cldf_uri(type_uri)
                except ValueError:
                    success = False
                    log_or_raise('invalid CLDF URI: {0}'.format(type_uri), log=log)

            # FIXME: check whether table.common_props['dc:conformsTo'] is in validators!
            validators_ = []
            for col in table.tableSchema.columns:
                if col.propertyUrl:
                    col_uri = col.propertyUrl.uri
                    try:
                        TERMS.is_cldf_uri(col_uri)
                    except ValueError:
                        success = False
                        log_or_raise('invalid CLDF URI: {0}'.format(col_uri), log=log)
                for table_, col_, v_ in validators:
                    if (not table_ or table is self.get(table_)) and col is self.get((table, col_)):
                        validators_.append((col, v_))

            fname = Path(table.url.resolve(table._parent.base))
            if fname.exists():
                for fname, lineno, row in table.iterdicts(log=log, with_metadata=True):
                    for col, validate in validators_:
                        try:
                            validate(self, table, col, row)
                        except ValueError as e:
                            log_or_raise(
                                '{0}:{1}:{2} {3}'.format(fname.name, lineno, col.name, e),
                                log=log)
                            success = False
                if not table.check_primary_key(log=log):
                    success = False
            else:
                log_or_raise('{0} does not exist'.format(fname), log=log)
                success = False

        if not self.tablegroup.check_referential_integrity(log=log):
            success = False

        return success
Example #10
from csvw.metadata import TableGroup
from lingpy import util
from lingpy.convert.html import template_path

# retrieve the splitstree template path from lingpy
tpath = util.Path(template_path('splitstree.nex'))
if tpath.exists():
    _template = util.read_text_file(tpath.as_posix())
else:  # pragma: no cover
    raise IOError("Unknown template %s" % tpath)

tbg = TableGroup.from_file('cldf/StructureDataset-metadata.json')
taxa = {t['ID']: (i, t['Name']) for i, t in
        enumerate(tbg.tabledict['languages.csv'])}
params = {t['ID']: (i, t['Name']) for i, t in
        enumerate(tbg.tabledict['parameters.csv'])}
matrix = [[0 for p in params] for t in taxa]
for row in tbg.tabledict['values.csv']:
    tidx, tname = taxa[row['Language_ID']]
    pidx, pname = params[row['Parameter_ID']]
    if row['Value'] == '+':
        matrix[tidx][pidx] = 1
        
alpha = 'abcdefghijklmnopqrstuvwxyz'
alpha += alpha.upper()
alpha += '0123456789'

matrix_string = ''
tax_list = sorted([t[1] for t in taxa.items()], key=lambda x: x[0])
for i, line in enumerate(matrix):
    # The snippet breaks off mid-expression here; a plausible completion keeps
    # only NEXUS-safe name characters and appends the row of 0/1 values:
    matrix_string += '{0:12}'.format(''.join([x for x in tax_list[i][1] if x in alpha]))
    matrix_string += ''.join([str(x) for x in line]) + '\n'
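
A hedged sketch of how such a script typically finishes; the format keys passed to the template are assumptions for illustration, not lingpy's documented placeholder names:

# Hypothetical continuation: fill the splitstree template and write the file.
# The format keys below are assumed placeholders, not verified template fields.
util.write_text_file('structure.nex', _template.format(
    matrix=matrix_string, ntax=len(taxa), nchar=len(params)))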
Example #11
def add_metadata(fname: Path, logger: cli.logging.Logger = cli.logger):
    if fname.name != "forms.csv":
        cli.Exit.CLI_ARGUMENT_ERROR(
            "A metadata-free Wordlist must be in a file called 'forms.csv'.")
    default_wordlist = TableGroup.from_file(
        pycldf.util.pkg_path("modules", "Wordlist-metadata.json"))
    default_wordlist._fname = fname.with_name("Wordlist-metadata.json")
    ds = pycldf.Wordlist(default_wordlist)

    # `from_data` checks that the required columns of the FormTable are present,
    # but it does not consolidate the columns further.

    colnames = next(iterrows(fname))

    understood_colnames = {
        c.name
        for c in ds[ds.primary_table].tableSchema.columns if c.name in colnames
    }
    more_columns = {
        c.propertyUrl.uri: c
        for c in ds[ds.primary_table].tableSchema.columns
        if c.name not in understood_colnames
    }
    logger.info(
        "CLDF freely understood the columns %s in your forms.csv.",
        sorted(understood_colnames),
    )

    # Consider the columns that were not understood.
    columns_without_metadata = set(colnames) - understood_colnames
    for column_name in columns_without_metadata:
        column: Column
        # Maybe they are known CLDF properties?
        if column_name in pycldf.terms.TERMS:
            column = pycldf.TERMS[column_name].to_column()
        # Maybe they are CLDF default column names?
        elif column_name in DEFAULT_NAME_COLUMNS:
            column = DEFAULT_NAME_COLUMNS[column_name]
        # Maybe they are columns that Lexedata knows to handle?
        elif column_name in LEXEDATA_COLUMNS:
            column = LEXEDATA_COLUMNS[column_name]
        # Maybe they are columns inherited from LingPy?
        elif column_name.upper() in LINGPY_COLUMNS:
            column = LINGPY_COLUMNS[column_name.upper()]
        # Maybe they are some name we have seen before?
        elif column_name in OTHER_KNOWN_COLUMNS:
            column = OTHER_KNOWN_COLUMNS[column_name]
        else:
            # TODO: Maybe they look like they have a specific type?
            ...
            # Otherwise, they are probably just text to be kept.
            column = Column(
                datatype=Datatype(base="string"),
                default="",
                null=[""],
                name=column_name,
            )
        column.name = column_name

        ds[ds.primary_table].tableSchema.columns.append(column)
        summary = column.propertyUrl or column.datatype
        logger.info(f"Column {column_name} seems to be a {summary} column.")
        if column.propertyUrl:
            to_be_replaced = more_columns.pop(column.propertyUrl.uri, None)
            if to_be_replaced is not None:
                ds[ds.primary_table].tableSchema.columns.remove(to_be_replaced)

    for column in more_columns.values():
        logger.info(
            f"Also added column {column.name}, as expected for a FormTable.")

    ds[ds.primary_table].tableSchema.columns.sort(
        key=lambda k: colnames.index(k.name) if k.name in colnames else 1e10)

    # TODO: Once lexedata is properly published, we can give a better URL.
    ds.properties["dc:contributor"] = [
        "https://github.com/Anaphory/lexedata/blob/master/src/lexedata/edit/add_metadata.py"
    ]
    return ds
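
A hedged usage sketch: a pycldf Dataset can write its (amended) metadata back to disk via write_metadata(), which is how a bare forms.csv gains a Wordlist-metadata.json:

# Hypothetical usage: derive metadata for a bare forms.csv and save it.
from pathlib import Path

ds = add_metadata(Path('forms.csv'))
ds.write_metadata()  # writes Wordlist-metadata.json next to forms.csv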
Example #12
        try:
            alignments = [
                len(
                    split_segments(data[value]['Alignment'])[
                        data[value]['Cognate_Sets'].index(key)])
                for value in values
            ]
            if len(set(alignments)) != 1:
                errors += [key]
        except IndexError:
            errors += [key]
    return errors


if __name__ == '__main__':
    tg = TableGroup.from_file('Wordlist-metadata.json')
    problems = []
    count = 1
    wordlist = {}
    for item in tg.tables[0]:
        morphemes = validate_morphemes(item)
        wordlist[item['ID']] = item
        struc = validate_structure(item)
        if not morphemes:
            problems += [[
                count, 'morphemes',
                str(item['ID']), item['Language_Name'], item['Parameter_name'],
                ' '.join(item['Segments'])
            ]]
            count += 1
        if not struc:
            # The snippet is truncated here; by analogy with the morphemes
            # check above, a plausible completion is:
            problems += [[
                count, 'structure',
                str(item['ID']), item['Language_Name'], item['Parameter_name'],
                ' '.join(item['Segments'])
            ]]
            count += 1
Example #13
 def tablegroup(self):
     return TableGroup.from_file(self.dir.joinpath(self.fname + '-metadata.json'))
Example #14
 def table(self):
     return TableGroup.from_file(self.path).tabledict[self.id + '.tsv']
Example #15
def from_cldf(path, to=Wordlist, concept='Name', concepticon='Concepticon_ID',
        glottocode='Glottocode', language='Name'
        ):
    """
    Load data from CLDF into a LingPy Wordlist object or similar.

    Parameters
    ----------
    path : str
        The path to the metadata-file of your CLDF dataset.
    to : ~lingpy.basic.wordlist.Wordlist
        A ~lingpy.basic.wordlist.Wordlist object or one of its descendants
        (LexStat, Alignment).
    concept : str (default='Name')
        The name used for the basic gloss in the `parameters.csv` table.
    glottocode : str (default='Glottocode')
        The default name for the column storing the Glottolog ID in the
        `languages.csv` table.
    language : str (default='Name')
        The default name for the language name in the `languages.csv` table.
    concepticon : str (default='Concepticon_ID')
        The default name for the concept set in the `parameters.csv` table.
        
    Notes
    -----
    This function does not yet offer full flexibility regarding the input
    data. However, it reliably reads regular CLDF-formatted data into LingPy,
    so CLDF datasets can be used directly in LingPy analyses.

    """
    tbg = TableGroup.from_file(path)
    forms = tbg.tabledict['forms.csv']

    # obtain the dictionaries to convert ids to values
    taxa = {t['ID']: (t[language], t[glottocode]) for t in
            tbg.tabledict['languages.csv']}
    concepts = {c['ID']: (c[concept], c[concepticon]) for c in 
            tbg.tabledict['parameters.csv']}

    # create dictionary
    D = {}
    id2idx = {}
    for i, row in enumerate(forms):
        # check for numeric ID
        if row['ID'].isdigit():
            idx = int(row['ID'])
        else:
            idx = i+1
        id2idx[row['ID']] = idx

        doculect, glottocode = taxa[row['Language_ID']]
        concept, concepticon_id = concepts[row['Parameter_ID']]
        D[idx] = [doculect, glottocode, concept, concepticon_id] + [row.get(f,
            '') or '' for f in ['form_in_source', 'Form', 'Segments',
                'Comment', 'Source']]
    # add the header
    D[0] = ['doculect', 'glottocode', 'concept', 'concepticon_id', 'value',
            'form', 'tokens', 'note', 'source']

    # convert to wordlist (simplifies handling)
    wordlist = to(D)

    # add cognates if they are needed and provided
    if 'cognates.csv' in tbg.tabledict:
        cognates = {id2idx[row['Form_ID']]: (row['Cognateset_ID'],
            row['Alignment']) for row in tbg.tabledict['cognates.csv']}
        if cognates:
            wordlist.add_entries('cogid', cognates, lambda x: x[0] or 0)
            wordlist.add_entries('alignment', cognates, lambda x: x[1] or '')

    return wordlist
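
A usage sketch for from_cldf; the metadata path is a placeholder, and passing a Wordlist subclass such as LexStat follows the docstring above:

# Hypothetical usage: read a CLDF wordlist into LingPy for analysis.
from lingpy import LexStat

wl = from_cldf('cldf/Wordlist-metadata.json', to=LexStat)
print(wl.height, wl.width)  # number of concepts and of doculects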