def get_dataset(fname):
    """Open the CLDF dataset located at *fname*.

    A ``.json`` suffix is treated as a CLDF metadata description file;
    any other suffix is interpreted as a metadata-free single-table
    dataset matched against the CLDF module specifications. Directories
    are checked for the presence of any CLDF datasets in undefined
    order of the dataset types.

    Parameters
    ----------
    fname : str or Path
        Path to a CLDF dataset

    Returns
    -------
    Dataset

    Raises
    ------
    FileNotFoundError
        If *fname* does not exist.
    """
    path = Path(fname)
    if not path.exists():
        raise FileNotFoundError(
            '{:} does not exist'.format(path))
    loader = (pycldf.dataset.Dataset.from_metadata
              if path.suffix == '.json'
              else pycldf.dataset.Dataset.from_data)
    return loader(path)
def sniff(filename):
    """Read the beginning of the file and guess its csv dialect.

    Parameters
    ----------
    filename: str or pathlib.Path
        Path to a csv file to be sniffed

    Returns
    -------
    csv.Dialect
        The sniffed dialect, with the detected text encoding attached
        as an ``encoding`` attribute.
    """
    with Path(filename).open("rb") as fp:
        # On large files, csv.Sniffer seems to need a lot of data to make a
        # successful inference...
        raw = fp.read(1024)
        # chardet may return None for very short or binary-looking
        # samples; fall back to utf-8 rather than crash in decode().
        encoding = chardet.detect(raw)["encoding"] or "utf-8"
        while True:
            try:
                # Decode the *accumulated* bytes in one go. Decoding each
                # 1024-byte chunk separately (as a per-chunk decode would)
                # can raise UnicodeDecodeError when a multi-byte character
                # straddles a read() boundary; a split character at the
                # current end of the buffer is handled by reading more.
                sample = raw.decode(encoding)
                dialect = csv.Sniffer().sniff(sample, [",", "\t"])
                dialect.encoding = encoding
                return dialect
            except (csv.Error, UnicodeDecodeError):  # pragma: no cover
                blob = fp.read(1024)
                raw += blob
                if not blob:
                    # End of file and still undecidable: re-raise.
                    raise
def sniff(filename):
    """Guess the csv dialect used in *filename*.

    Only the leading portion of the file is inspected; additional data
    is pulled in for as long as the sniffer cannot reach a decision.

    Parameters
    ----------
    filename: str or pathlib.Path
        Path to a csv file to be sniffed

    Returns
    -------
    csv.Dialect
    """
    # Python 2's csv module wants bytes, Python 3's wants text.
    mode = "rb" if PY2 else "r"
    with Path(filename).open(mode) as handle:
        # csv.Sniffer can need a surprisingly large sample on big files.
        sample = handle.read(1024)
        while True:
            try:
                return csv.Sniffer().sniff(sample, [",", "\t"])
            except csv.Error:  # pragma: no cover
                more = handle.read(1024)
                if not more:
                    # Exhausted the file without a verdict: re-raise.
                    raise
                sample += more
def from_cldf(cls, path, columns=None, filter=lambda row: row["Form"],
              *args, **kwargs):
    """Load a CLDF dataset.

    Open a CLDF Dataset – with metadata or metadata-free – (only
    Wordlist datasets are supported for now, because other modules
    don't seem to make sense for LingPy) and transform it into this
    Class. Columns from the FormTable are imported in lowercase,
    columns from LanguageTable, ParameterTable and CognateTable are
    prefixed with `language_`, `concept_` and `cogid_` and converted
    to lowercase.

    Notes
    -----
    CLDFs default column names for wordlists are different from
    LingPy's, so you probably have to use::

    >>> lingpy.Wordlist.from_cldf(
        "Wordlist-metadata.json",
        col="language_id", row="parameter_id", segments="segments",
        transcription="form")

    in order to avoid errors from LingPy not finding required columns.

    Parameters
    ----------
    columns: list of strings, optional
        The list of columns to import. (default: all columns)
    filter: function: rowdict → bool
        A condition function for importing only some rows.
        (default: lambda row: row["Form"])

    All other parameters are passed on to the `cls`

    Returns
    -------
    A `cls` object representing the CLDF dataset

    Raises
    ------
    FileNotFoundError
        If `path` does not exist.
    ValueError
        If the dataset is not a CLDF Wordlist module.
    """
    # A `columns=[]` default would be a shared mutable default argument
    # (flake8-bugbear B006); normalize None to a fresh list instead.
    columns = [] if columns is None else list(columns)

    # Load the dataset.
    fname = Path(path)
    if not fname.exists():
        raise compat.FileNotFoundError(
            '{:} does not exist'.format(fname))
    if fname.suffix == '.json':
        dataset = pycldf.dataset.Dataset.from_metadata(fname)
    else:
        dataset = pycldf.dataset.Dataset.from_data(fname)

    if dataset.module == "Wordlist":
        # First, make a list of cognate codes if they are in a separate
        # table.
        cognateset_assignments = {}
        try:
            form_reference = dataset["CognateTable", "formReference"].name
            for row in dataset["CognateTable"].iterdicts():
                cognateset_assignments[row[form_reference]] = row
        except KeyError:
            # Either there are no cognate codes, or they are in the form
            # table. Both options are fine.
            pass

        f_id = dataset["FormTable", "id"].name

        # Access columns by type, not by name.
        language_column = dataset["FormTable", "languageReference"].name
        parameter_column = dataset["FormTable", "parameterReference"].name

        try:
            l_id = dataset["LanguageTable", "id"].name
            languages = {l[l_id]: l
                         for l in dataset["LanguageTable"].iterdicts()}
        except KeyError:
            # No LanguageTable: bounce language references through as IDs.
            l_id = "ID"
            languages = bounce_as_id

        try:
            c_id = dataset["ParameterTable", "id"].name
            concepts = {c[c_id]: c
                        for c in dataset["ParameterTable"].iterdicts()}
        except KeyError:
            # No ParameterTable: bounce parameter references through as IDs.
            c_id = "ID"
            concepts = bounce_as_id

        # create dictionary
        D = {0: columns}  # Reserve the header
        for row in dataset["FormTable"].iterdicts():
            # TODO: Improve prefixing behaviour
            s = {"Cogid_{:}".format(key): value
                 for key, value in cognateset_assignments.get(
                     row[f_id], {}).items()}
            s.update(
                {"Language_{:}".format(key): value
                 for key, value in languages[row[language_column]].items()})
            s.update(
                {"Concept_{:}".format(key): value
                 for key, value in concepts[row[parameter_column]].items()})
            # FormTable columns win over prefixed metadata columns.
            s.update(row)

            if not filter(s):
                continue

            # check for numeric ID
            try:
                idx = int(row[f_id])
            except ValueError:
                idx = len(D)
            while idx in D:
                idx += 1

            if not D[0]:
                # No explicit column selection: take all columns from the
                # first surviving row.
                columns = list(s.keys())
                D[0] = [c.lower() for c in columns]

            D[idx] = [s.get(column) for column in columns]

        # convert to wordlist and return
        return cls(D, *args, **kwargs)
    else:
        # For most LingPy applications, it might be best to see whether we
        # got a Wordlist module.
        raise ValueError("LingPy has no procedures for CLDF {:} data.".format(
            dataset.module))
def from_cldf(cls, path, columns=None, filter=lambda row: row["Form"],
              *args, **kwargs):
    """Load a CLDF dataset.

    Open a CLDF Dataset – with metadata or metadata-free – (only
    Wordlist datasets are supported for now, because other modules
    don't seem to make sense for LingPy) and transform it into this
    Class. Columns from the FormTable are imported in lowercase,
    columns from LanguageTable, ParameterTable and CognateTable are
    prefixed with `language_`, `concept_` and `cogid_` and converted
    to lowercase.

    Notes
    -----
    CLDFs default column names for wordlists are different from
    LingPy's, so you probably have to use::

    >>> lingpy.Wordlist.from_cldf(
        "Wordlist-metadata.json",
        col="language_id", row="parameter_id", segments="segments",
        transcription="form")

    in order to avoid errors from LingPy not finding required columns.

    Parameters
    ----------
    columns: list of strings, optional
        The list of columns to import. (default: all columns)
    filter: function: rowdict → bool
        A condition function for importing only some rows.
        (default: lambda row: row["Form"])

    All other parameters are passed on to the `cls`

    Returns
    -------
    A `cls` object representing the CLDF dataset

    Raises
    ------
    FileNotFoundError
        If `path` does not exist.
    ValueError
        If the dataset is not a CLDF Wordlist module.
    """
    # A `columns=[]` default would be a shared mutable default argument
    # (flake8-bugbear B006); normalize None to a fresh list instead.
    columns = [] if columns is None else list(columns)

    # Load the dataset.
    fname = Path(path)
    if not fname.exists():
        raise compat.FileNotFoundError('{:} does not exist'.format(fname))
    if fname.suffix == '.json':
        dataset = pycldf.dataset.Dataset.from_metadata(fname)
    else:
        dataset = pycldf.dataset.Dataset.from_data(fname)

    if dataset.module == "Wordlist":
        # First, make a list of cognate codes if they are in a separate
        # table.
        cognateset_assignments = {}
        try:
            form_reference = dataset["CognateTable", "formReference"].name
            for row in dataset["CognateTable"].iterdicts():
                cognateset_assignments[row[form_reference]] = row
        except KeyError:
            # Either there are no cognate codes, or they are in the form
            # table. Both options are fine.
            pass

        f_id = dataset["FormTable", "id"].name

        # Access columns by type, not by name.
        language_column = dataset["FormTable", "languageReference"].name
        parameter_column = dataset["FormTable", "parameterReference"].name

        try:
            l_id = dataset["LanguageTable", "id"].name
            languages = {
                l[l_id]: l
                for l in dataset["LanguageTable"].iterdicts()
            }
        except KeyError:
            # No LanguageTable: bounce language references through as IDs.
            l_id = "ID"
            languages = bounce_as_id

        try:
            c_id = dataset["ParameterTable", "id"].name
            concepts = {
                c[c_id]: c
                for c in dataset["ParameterTable"].iterdicts()
            }
        except KeyError:
            # No ParameterTable: bounce parameter references through as IDs.
            c_id = "ID"
            concepts = bounce_as_id

        # create dictionary
        D = {0: columns}  # Reserve the header
        for row in dataset["FormTable"].iterdicts():
            # TODO: Improve prefixing behaviour
            s = {
                "Cogid_{:}".format(key): value
                for key, value in cognateset_assignments.get(
                    row[f_id], {}).items()
            }
            s.update({
                "Language_{:}".format(key): value
                for key, value in languages[row[language_column]].items()
            })
            s.update({
                "Concept_{:}".format(key): value
                for key, value in concepts[row[parameter_column]].items()
            })
            # FormTable columns win over prefixed metadata columns.
            s.update(row)

            if not filter(s):
                continue

            # check for numeric ID
            try:
                idx = int(row[f_id])
            except ValueError:
                idx = len(D)
            while idx in D:
                idx += 1

            if not D[0]:
                # No explicit column selection: take all columns from the
                # first surviving row.
                columns = list(s.keys())
                D[0] = [c.lower() for c in columns]

            D[idx] = [s.get(column) for column in columns]

        # convert to wordlist and return
        return cls(D, *args, **kwargs)
    else:
        # For most LingPy applications, it might be best to see whether we
        # got a Wordlist module.
        raise ValueError(
            "LingPy has no procedures for CLDF {:} data.".format(
                dataset.module))
del segments[s - 1] continue if segments[s - 1] == "0": del segments[s - 1] continue if segments[s - 1] in "_#◦+→←" and segments[s] in "_#◦+→←": del segments[s - 1] continue row["segments"] = segments[1:-1] return row["segments"] if __name__ == "__main__": parser = argparse.ArgumentParser(description=__doc__.split("\n")[0]) parser.add_argument("input", default=Path("Wordlist-metadata.json"), nargs="?", type=Path, help="Input file containing the CLDF word list." " (default: ./Wordlist-metadata.json") parser.add_argument( "output", nargs="?", # type=argparse.FileType('w'), default="aligned", help="Output file to write segmented data to," " without extension .tsv (automatically added)") parser.add_argument("--soundclass", default="sca", choices=["sca", "dolgo", "asjp", "art"], help="Sound class model to use. (default: sca)")
def from_cldf(cls, path,
              columns=('parameter_id', 'concept_name', 'language_id',
                       'language_name', 'value', 'form', 'segments',
                       'language_glottocode', 'concept_concepticon_id',
                       'language_latitude', 'language_longitude',
                       'cognacy'),
              namespace=(('concept_name', 'concept'),
                         ('language_id', 'doculect'),
                         ('segments', 'tokens'),
                         ('language_glottocode', 'glottolog'),
                         ('concept_concepticon_id', 'concepticon'),
                         ('language_latitude', 'latitude'),
                         ('language_longitude', 'longitude'),
                         ('cognacy', 'cognacy'),
                         ('cogid_cognateset_id', 'cogid')),
              filter=lambda row: row["form"],
              **kwargs):
    """Load a CLDF dataset.

    Open a CLDF Dataset – with metadata or metadata-free – (only
    Wordlist datasets are supported for now, because other modules
    don't seem to make sense for LingPy) and transform it into this
    Class. Columns from the FormTable are imported in lowercase,
    columns from LanguageTable, ParameterTable and CognateTable are
    prefixed with `language_`, `concept_` and `cogid_` and converted
    to lowercase.

    Notes
    -----
    CLDFs default column names for wordlists are different from
    LingPy's, so you probably have to use::

    >>> lingpy.Wordlist.from_cldf(
        "Wordlist-metadata.json",
        )

    in order to avoid errors from LingPy not finding required columns.

    Parameters
    ----------
    columns: list or tuple
        The list of columns to import. (default: all columns)
    namespace: dict or tuple of pairs
        Mapping from CLDF column names to LingPy column names.
    filter: function: rowdict → bool
        A condition function for importing only some rows.
        (default: lambda row: row["form"])

    All other parameters are passed on to the `cls`

    Returns
    -------
    A `cls` object representing the CLDF dataset

    Raises
    ------
    FileNotFoundError
        If `path` does not exist.
    ValueError
        If the dataset is not a CLDF Wordlist module, or if the
        namespace mapping produces duplicate column names.
    """
    kw = {
        'row': 'concept',
        'col': 'doculect',
        'conf': util.data_path('conf', 'wordlist.rc'),
    }
    # Fill in defaults WITHOUT clobbering caller-supplied values:
    # `kwargs.update(kw)` would silently override an explicit
    # row=/col=/conf= keyword argument with the defaults above.
    for key, value in kw.items():
        kwargs.setdefault(key, value)

    if isinstance(namespace, tuple):
        namespace = dict(namespace)
    # get the datatypes from configuration as to namespace
    datatypes = read_conf(kwargs['conf'])[1]

    # Load the dataset.
    fname = Path(path)
    if not fname.exists():
        raise compat.FileNotFoundError('{:} does not exist'.format(fname))
    if fname.suffix == '.json':
        dataset = pycldf.dataset.Dataset.from_metadata(fname)
    else:
        dataset = pycldf.dataset.Dataset.from_data(fname)

    if dataset.module == "Wordlist":
        # First, make a list of cognate codes if they are in a separate
        # table.
        cognateset_assignments = {}
        try:
            form_reference = dataset["CognateTable", "formReference"].name
            for row in dataset["CognateTable"].iterdicts():
                cognateset_assignments[row[form_reference]] = row
        except KeyError:
            # Either there are no cognate codes, or they are in the form
            # table. Both options are fine.
            pass

        f_id = dataset["FormTable", "id"].name

        # Access columns by type, not by name.
        language_column = dataset["FormTable", "languageReference"].name
        parameter_column = dataset["FormTable", "parameterReference"].name

        try:
            l_id = dataset["LanguageTable", "id"].name
            languages = {
                l[l_id]: l
                for l in dataset["LanguageTable"].iterdicts()
            }
        except KeyError:
            # No LanguageTable: bounce language references through as IDs.
            l_id = "ID"
            languages = bounce_as_id

        try:
            c_id = dataset["ParameterTable", "id"].name
            concepts = {
                c[c_id]: c
                for c in dataset["ParameterTable"].iterdicts()
            }
        except KeyError:
            # No ParameterTable: bounce parameter references through as IDs.
            c_id = "ID"
            concepts = bounce_as_id

        # create dictionary
        D = {0: columns}  # Reserve the header
        for row in dataset["FormTable"].iterdicts():
            # TODO: Improve prefixing behaviour
            s = {
                "cogid_{:}".format(key).lower(): value
                for key, value in cognateset_assignments.get(
                    row[f_id], {}).items()
            }
            s.update({
                "language_{:}".format(key).lower(): value
                for key, value in languages[row[language_column]].items()
            })
            s.update({
                "concept_{:}".format(key).lower(): value
                for key, value in concepts[row[parameter_column]].items()
            })
            # FormTable columns win over prefixed metadata columns.
            s.update({k.lower(): v for k, v in row.items()})

            if not filter(s):
                continue

            # check for numeric ID
            try:
                idx = int(row[f_id])
            except ValueError:
                idx = len(D)
            while idx in D:
                idx += 1

            if not D[0]:
                # No explicit column selection: take all columns from the
                # first surviving row.
                columns = list(s.keys())
                D[0] = [c.lower() for c in columns]

            # Coerce each cell through the datatype registered for its
            # (namespaced) column, defaulting to identity.
            D[idx] = [
                datatypes.get(namespace.get(column, ''), lambda x: x)(
                    s.get(column, ''))
                for column in columns
            ]

        # Rename the header according to the namespace mapping and make
        # sure no two source columns collapsed onto the same name.
        D[0] = [namespace.get(c, c) for c in columns]
        if len(D[0]) != len(set(D[0])):
            log.warning('|'.join(columns))
            log.warning('|'.join(D[0]))
            raise ValueError('name space clashes, cannot parse data')

        # convert to wordlist and return
        return cls(D, **kwargs)
    else:
        # For most LingPy applications, it might be best to see whether we
        # got a Wordlist module.
        raise ValueError(
            "LingPy has no procedures for CLDF {:} data.".format(
                dataset.module))