Example #1
from pathlib import Path

import pycldf


def get_dataset(fname):
    """Load a CLDF dataset.

    Load the file as a `json` CLDF metadata description file, or as a
    metadata-free dataset contained in a single csv file.

    The distinction is made depending on the file extension: `.json` files are
    loaded as metadata descriptions, all other files are matched against the
    CLDF module specifications. Directories are checked for the presence of
    any CLDF dataset, in unspecified order of the dataset types.

    Parameters
    ----------
    fname : str or Path
        Path to a CLDF dataset

    Returns
    -------
    Dataset
    """
    fname = Path(fname)
    if not fname.exists():
        raise FileNotFoundError(
            '{:} does not exist'.format(fname))
    if fname.suffix == '.json':
        return pycldf.dataset.Dataset.from_metadata(fname)
    return pycldf.dataset.Dataset.from_data(fname)
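
A usage sketch (the file names are placeholders): a `.json` path loads the
metadata description, any other path is treated as a metadata-free dataset:

ds = get_dataset("Wordlist-metadata.json")  # metadata description file
ds = get_dataset("forms.csv")               # metadata-free single-file dataset
print(ds.module)                            # e.g. "Wordlist"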
Example #2
import csv
from pathlib import Path

import chardet


def sniff(filename):
    """Read the beginning of the file and guess its csv dialect.

    Parameters
    ----------
    filename: str or pathlib.Path
        Path to a csv file to be sniffed

    Returns
    -------
    csv.Dialect
    """
    with Path(filename).open("rb") as fp:
        # On large files, csv.Sniffer seems to need a lot of data to make a
        # successful inference...
        sample = fp.read(1024)
        encoding = chardet.detect(sample)["encoding"]
        sample = sample.decode(encoding)
        while True:
            try:
                dialect = csv.Sniffer().sniff(sample, [",", "\t"])
                dialect.encoding = encoding
                return dialect
            except csv.Error:  # pragma: no cover
                blob = fp.read(1024).decode(encoding)
                sample += blob
                if not blob:
                    raise
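
A usage sketch ("data.csv" is a placeholder): the detected encoding is
attached to the returned dialect object, so the file can be reopened with the
right codec:

dialect = sniff("data.csv")
with open("data.csv", encoding=dialect.encoding, newline="") as fp:
    for row in csv.reader(fp, dialect):
        print(row)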
Example #3
import csv
from pathlib import Path

# Assumption: the excerpt omits its imports; the PY2 flag presumably comes
# from a Python 2/3 compatibility shim such as six.
from six import PY2


def sniff(filename):
    """Read the beginning of the file and guess its csv dialect.

    Parameters
    ----------
    filename: str or pathlib.Path
        Path to a csv file to be sniffed

    Returns
    -------
    csv.Dialect
    """
    with Path(filename).open("rb" if PY2 else "r") as fp:
        # On large files, csv.Sniffer seems to need a lot of data to make a
        # successful inference...
        sample = fp.read(1024)
        while True:
            try:
                return csv.Sniffer().sniff(sample, [",", "\t"])
            except csv.Error:  # pragma: no cover
                blob = fp.read(1024)
                sample += blob
                if not blob:
                    raise
Example #4
    def from_cldf(cls, path, columns=[], filter=lambda row: row["Form"], *args, **kwargs):
        """Load a CLDF dataset.

        Open a CLDF Dataset – with metadata or metadata-free – (only Wordlist
        datasets are supported for now, because other modules don't seem to
        make sense for LingPy) and transform it into this class. Columns from
        the FormTable are imported in lowercase; columns from LanguageTable,
        ParameterTable and CognateTable are prefixed with `language_`,
        `concept_` and `cogid_` and converted to lowercase.

        Notes
        -----
        CLDF's default column names for wordlists are different from LingPy's,
        so you probably have to use

        >>> lingpy.Wordlist.from_cldf(
        ...     "Wordlist-metadata.json",
        ...     col="language_id", row="parameter_id",
        ...     segments="segments", transcription="form")

        in order to avoid errors from LingPy not finding required columns.

        Parameters
        ----------
        path : str or pathlib.Path
          Path to a CLDF dataset (metadata description file or single data file).

        columns : list of str
          The list of columns to import. (default: all columns)

        filter : function: rowdict → bool
          A condition function for importing only some rows.
          (default: ``lambda row: row["Form"]``)

        All other parameters are passed on to `cls`.

        Returns
        -------
        A `cls` object representing the CLDF dataset

        """
        # Load the dataset.
        fname = Path(path)
        if not fname.exists():
            raise compat.FileNotFoundError(
                '{:} does not exist'.format(fname))
        if fname.suffix == '.json':
            dataset = pycldf.dataset.Dataset.from_metadata(fname)
        else:
            dataset = pycldf.dataset.Dataset.from_data(fname)

        if dataset.module == "Wordlist":
            # First, make a list of cognate codes if they are in a separate table.
            cognateset_assignments = {}
            try:
                form_reference = dataset["CognateTable", "formReference"].name
                for row in dataset["CognateTable"].iterdicts():
                    cognateset_assignments[row[form_reference]] = row
            except KeyError:
                # Either there are no cognate codes, or they are in the form
                # table. Both options are fine.
                pass

            f_id = dataset["FormTable", "id"].name

            # Access columns by type, not by name.
            language_column = dataset["FormTable", "languageReference"].name
            parameter_column = dataset["FormTable", "parameterReference"].name

            try:
                l_id = dataset["LanguageTable", "id"].name
                languages = {l[l_id]: l
                             for l in dataset["LanguageTable"].iterdicts()}
            except KeyError:
                l_id = "ID"
                languages = bounce_as_id

            try:
                c_id = dataset["ParameterTable", "id"].name
                concepts = {c[c_id]: c
                            for c in dataset["ParameterTable"].iterdicts()}
            except KeyError:
                c_id = "ID"
                concepts = bounce_as_id

            # create dictionary
            D = {0: columns}  # Reserve the header
            for row in dataset["FormTable"].iterdicts():
                # TODO: Improve prefixing behaviour
                s = {"Cogid_{:}".format(key): value
                     for key, value in cognateset_assignments.get(
                             row[f_id], {}).items()}
                s.update(
                    {"Language_{:}".format(key): value
                     for key, value in languages[row[language_column]].items()})
                s.update(
                    {"Concept_{:}".format(key): value
                     for key, value in concepts[row[parameter_column]].items()})
                s.update(row)

                if not filter(s):
                    continue

                # check for numeric ID
                try:
                    idx = int(row[f_id])
                except ValueError:
                    idx = len(D)
                while idx in D:
                    idx += 1

                if not D[0]:
                    columns = list(s.keys())
                    D[0] = [c.lower() for c in columns]

                D[idx] = [s.get(column) for column in columns]

            # convert to wordlist and return
            return cls(D, *args, **kwargs)
        else:
            # For most LingPy applications, it might be best to see whether we got
            # a Wordlist module.
            raise ValueError("LingPy has no procedures for CLDF {:} data.".format(
                dataset.module))
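
The `bounce_as_id` fallback used above is not defined in the excerpt. From the
way it is indexed, it has to behave like a mapping that echoes any key back as
a one-column row; a minimal sketch (the class name is an assumption):

class _BounceAsID(dict):
    """Fallback for datasets without a LanguageTable or ParameterTable."""
    def __missing__(self, key):
        # Any lookup returns the key itself, filed under the "ID" header.
        return {"ID": key}

bounce_as_id = _BounceAsID()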
Example #5
File: wordlist.py  Project: javiervz/lingpy
    def from_cldf(cls,
                  path,
                  columns=[],
                  filter=lambda row: row["Form"],
                  *args,
                  **kwargs):
        """Load a CLDF dataset.

        Open a CLDF Dataset – with metadata or metadata-free – (only Wordlist
        datasets are supported for now, because other modules don't seem to
        make sense for LingPy) and transform it into this class. Columns from
        the FormTable are imported in lowercase; columns from LanguageTable,
        ParameterTable and CognateTable are prefixed with `language_`,
        `concept_` and `cogid_` and converted to lowercase.

        Notes
        -----
        CLDF's default column names for wordlists are different from LingPy's,
        so you probably have to use

        >>> lingpy.Wordlist.from_cldf(
        ...     "Wordlist-metadata.json",
        ...     col="language_id", row="parameter_id",
        ...     segments="segments", transcription="form")

        in order to avoid errors from LingPy not finding required columns.

        Parameters
        ----------
        path : str or pathlib.Path
          Path to a CLDF dataset (metadata description file or single data file).

        columns : list of str
          The list of columns to import. (default: all columns)

        filter : function: rowdict → bool
          A condition function for importing only some rows.
          (default: ``lambda row: row["Form"]``)

        All other parameters are passed on to `cls`.

        Returns
        -------
        A `cls` object representing the CLDF dataset

        """
        # Load the dataset.
        fname = Path(path)
        if not fname.exists():
            raise compat.FileNotFoundError('{:} does not exist'.format(fname))
        if fname.suffix == '.json':
            dataset = pycldf.dataset.Dataset.from_metadata(fname)
        else:
            dataset = pycldf.dataset.Dataset.from_data(fname)

        if dataset.module == "Wordlist":
            # First, make a list of cognate codes if they are in a separate table.
            cognateset_assignments = {}
            try:
                form_reference = dataset["CognateTable", "formReference"].name
                for row in dataset["CognateTable"].iterdicts():
                    cognateset_assignments[row[form_reference]] = row
            except KeyError:
                # Either there are no cognate codes, or they are in the form
                # table. Both options are fine.
                pass

            f_id = dataset["FormTable", "id"].name

            # Access columns by type, not by name.
            language_column = dataset["FormTable", "languageReference"].name
            parameter_column = dataset["FormTable", "parameterReference"].name

            try:
                l_id = dataset["LanguageTable", "id"].name
                languages = {
                    l[l_id]: l
                    for l in dataset["LanguageTable"].iterdicts()
                }
            except KeyError:
                l_id = "ID"
                languages = bounce_as_id

            try:
                c_id = dataset["ParameterTable", "id"].name
                concepts = {
                    c[c_id]: c
                    for c in dataset["ParameterTable"].iterdicts()
                }
            except KeyError:
                c_id = "ID"
                concepts = bounce_as_id

            # create dictionary
            D = {0: columns}  # Reserve the header
            for row in dataset["FormTable"].iterdicts():
                # TODO: Improve prefixing behaviour
                s = {
                    "Cogid_{:}".format(key): value
                    for key, value in cognateset_assignments.get(
                        row[f_id], {}).items()
                }
                s.update({
                    "Language_{:}".format(key): value
                    for key, value in languages[row[language_column]].items()
                })
                s.update({
                    "Concept_{:}".format(key): value
                    for key, value in concepts[row[parameter_column]].items()
                })
                s.update(row)

                if not filter(s):
                    continue

                # check for numeric ID
                try:
                    idx = int(row[f_id])
                except ValueError:
                    idx = len(D)
                while idx in D:
                    idx += 1

                if not D[0]:
                    columns = list(s.keys())
                    D[0] = [c.lower() for c in columns]

                D[idx] = [s.get(column) for column in columns]

            # convert to wordlist and return
            return cls(D, *args, **kwargs)
        else:
            # For most LingPy applications, it might be best to see whether we got
            # a Wordlist module.
            raise ValueError(
                "LingPy has no procedures for CLDF {:} data.".format(
                    dataset.module))
Example #6
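# NOTE: the excerpt begins mid-function; the signature, sentinel handling,
# and loop header below are a plausible reconstruction (the names are
# assumptions) so that the fragment parses.
def clean_segments(row):
    # Guard the segment list with word-boundary sentinels; they are
    # stripped off again at the end.
    segments = ["#"] + list(row["segments"]) + ["#"]
    # Walk backwards so deletions do not shift the indices still to visit.
    for s in range(len(segments) - 1, 0, -1):
        if not segments[s - 1]: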
            del segments[s - 1]
            continue
        if segments[s - 1] == "0":
            del segments[s - 1]
            continue
        if segments[s - 1] in "_#◦+→←" and segments[s] in "_#◦+→←":
            del segments[s - 1]
            continue
    row["segments"] = segments[1:-1]
    return row["segments"]


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__.split("\n")[0])
    parser.add_argument("input",
                        default=Path("Wordlist-metadata.json"),
                        nargs="?",
                        type=Path,
                        help="Input file containing the CLDF word list."
                        " (default: ./Wordlist-metadata.json")
    parser.add_argument(
        "output",
        nargs="?",
        # type=argparse.FileType('w'),
        default="aligned",
        help="Output file to write segmented data to,"
        " without extension .tsv (automatically added)")
    parser.add_argument("--soundclass",
                        default="sca",
                        choices=["sca", "dolgo", "asjp", "art"],
                        help="Sound class model to use. (default: sca)")
Example #7
    def from_cldf(cls,
                  path,
                  columns=('parameter_id', 'concept_name', 'language_id',
                           'language_name', 'value', 'form', 'segments',
                           'language_glottocode', 'concept_concepticon_id',
                           'language_latitude', 'language_longitude',
                           'cognacy'),
                  namespace=(('concept_name', 'concept'),
                             ('language_id', 'doculect'),
                             ('segments', 'tokens'),
                             ('language_glottocode', 'glottolog'),
                             ('concept_concepticon_id', 'concepticon'),
                             ('language_latitude', 'latitude'),
                             ('language_longitude', 'longitude'),
                             ('cognacy', 'cognacy'),
                             ('cogid_cognateset_id', 'cogid')),
                  filter=lambda row: row["form"],
                  **kwargs):
        """Load a CLDF dataset.

        Open a CLDF Dataset – with metadata or metadata-free – (only Wordlist
        datasets are supported for now, because other modules don't seem to
        make sense for LingPy) and transform it into this class. Columns from
        the FormTable are imported in lowercase; columns from LanguageTable,
        ParameterTable and CognateTable are prefixed with `language_`,
        `concept_` and `cogid_` and converted to lowercase.

        Notes
        -----
        CLDF's default column names for wordlists are different from LingPy's,
        so you probably have to use

        >>> lingpy.Wordlist.from_cldf("Wordlist-metadata.json")

        in order to avoid errors from LingPy not finding required columns.

        Parameters
        ----------
        path : str or pathlib.Path
          Path to a CLDF dataset.

        columns : list or tuple
          The list of columns to import. (default: all columns)

        namespace : tuple of pairs or dict
          A mapping from CLDF column names to LingPy-internal column names.

        filter : function: rowdict → bool
          A condition function for importing only some rows.
          (default: ``lambda row: row["form"]``)

        All other parameters are passed on to `cls`.

        Returns
        -------
        A `cls` object representing the CLDF dataset

        """
        kw = {
            'row': 'concept',
            'col': 'doculect',
            'conf': util.data_path('conf', 'wordlist.rc'),
        }
        kwargs.update(kw)

        if isinstance(namespace, tuple):
            namespace = dict(namespace)

        # get the datatypes from configuration as to namespace
        datatypes = read_conf(kwargs['conf'])[1]

        # Load the dataset.
        fname = Path(path)
        if not fname.exists():
            raise compat.FileNotFoundError('{:} does not exist'.format(fname))
        if fname.suffix == '.json':
            dataset = pycldf.dataset.Dataset.from_metadata(fname)
        else:
            dataset = pycldf.dataset.Dataset.from_data(fname)

        if dataset.module == "Wordlist":
            # First, make a list of cognate codes if they are in a separate table.
            cognateset_assignments = {}
            try:
                form_reference = dataset["CognateTable", "formReference"].name
                for row in dataset["CognateTable"].iterdicts():
                    cognateset_assignments[row[form_reference]] = row
            except KeyError:
                # Either there are no cognate codes, or they are in the form
                # table. Both options are fine.
                pass

            f_id = dataset["FormTable", "id"].name

            # Access columns by type, not by name.
            language_column = dataset["FormTable", "languageReference"].name
            parameter_column = dataset["FormTable", "parameterReference"].name

            try:
                l_id = dataset["LanguageTable", "id"].name
                languages = {
                    l[l_id]: l
                    for l in dataset["LanguageTable"].iterdicts()
                }
            except KeyError:
                l_id = "ID"
                languages = bounce_as_id

            try:
                c_id = dataset["ParameterTable", "id"].name
                concepts = {
                    c[c_id]: c
                    for c in dataset["ParameterTable"].iterdicts()
                }
            except KeyError:
                c_id = "ID"
                concepts = bounce_as_id

            # create dictionary
            D = {0: columns}  # Reserve the header
            for row in dataset["FormTable"].iterdicts():
                # TODO: Improve prefixing behaviour
                s = {
                    "cogid_{:}".format(key).lower(): value
                    for key, value in cognateset_assignments.get(
                        row[f_id], {}).items()
                }
                s.update({
                    "language_{:}".format(key).lower(): value
                    for key, value in languages[row[language_column]].items()
                })
                s.update({
                    "concept_{:}".format(key).lower(): value
                    for key, value in concepts[row[parameter_column]].items()
                })
                s.update({k.lower(): v for k, v in row.items()})

                if not filter(s):
                    continue

                # check for numeric ID
                try:
                    idx = int(row[f_id])
                except ValueError:
                    idx = len(D)
                while idx in D:
                    idx += 1

                if not D[0]:
                    columns = list(s.keys())
                    D[0] = [c.lower() for c in columns]

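                # Convert each cell with the datatype registered for the
                # column's LingPy name (via the namespace mapping), falling
                # back to the identity function.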
                D[idx] = [
                    datatypes.get(namespace.get(column, ''),
                                  lambda x: x)(s.get(column, ''))
                    for column in columns
                ]
            D[0] = [namespace.get(c, c) for c in columns]
            if len(D[0]) != len(set(D[0])):
                log.warning('|'.join(columns))
                log.warning('|'.join(D[0]))
                raise ValueError('name space clashes, cannot parse data')

            # convert to wordlist and return
            return cls(D, **kwargs)
        else:
            # For most LingPy applications, it might be best to see whether we got
            # a Wordlist module.
            raise ValueError(
                "LingPy has no procedures for CLDF {:} data.".format(
                    dataset.module))
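
A usage sketch: with the default `namespace`, the imported columns are already
renamed to LingPy's conventions ('concept', 'doculect', 'tokens', ...), so a
plain call is usually sufficient:

wl = lingpy.Wordlist.from_cldf("Wordlist-metadata.json")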