    choices=['global', 'local', 'overlap', 'dialign'],
    help="Select the mode for the alignment analysis."
    " (default: overlap)")
parser.add_argument("--ratio", default=1.5, type=float,
                    help="Ratio of language-pair specific vs. general"
                    " scores in the LexStat algorithm. (default: 1.5)")
parser.add_argument("--initial-threshold", default=0.7, type=float,
                    help="Threshold value for the initial pairs used to"
                    " bootstrap the calculation. (default: 0.7)")
args = parser.parse_args()

dataset = get_dataset(args.input)

lex = lingpy.compare.partial.Partial.from_cldf(
    args.input,
    filter=clean_segments,
    model=lingpy.data.model.Model(args.soundclass),
    check=True)

# Translate the --ratio argument into the (pair-specific, general) weight
# pair and the suffix used to label the corresponding output.
if args.ratio != 1.5:
    if args.ratio == float("inf"):
        ratio_pair = (1, 0)
        ratio_str = "-inf"
    elif args.ratio == int(args.ratio) and args.ratio >= 0:
        r = int(args.ratio)
        ratio_pair = (r, 1)
        ratio_str = "-{:d}".format(r)
parser = argparse.ArgumentParser(
    description="Import word lists from a new source into LexiRumah.")
parser.add_argument("directory", nargs="?", type=Path, default="./",
                    help="The folder containing the wordlist description,"
                    " derived from the standard template. (default: The"
                    " current working directory.)")
parser.add_argument("--wordlist", type=Path, default=repository,
                    help="The Wordlist to expand. (default: LexiRumah.)")
args = parser.parse_args()

dataset = get_dataset(args.wordlist)
if dataset.module != 'Wordlist':
    raise ValueError(
        "This script can only import wordlist data to a CLDF Wordlist.")

# Define how to find the relevant changed files
try:
    import pygit2

    def changed_files(path, extension):
        ...
        raise NotImplementedError("No git support yet.")
except ImportError:
    print(
        "WARNING: No pygit2 module found, relying on heuristics for finding"
import json
from pathlib import Path
from collections import OrderedDict, Counter

import xlrd
import pybtex
import pyglottolog
from pyglottolog.fts import search_langs

from pylexirumah import get_dataset
from pylexirumah.geo_lookup import geonames, get_region
from pylexirumah.util import identifier

gl = pyglottolog.Glottolog(
    Path(pyglottolog.__file__).parent.parent.parent.parent / "glottolog")
lr = get_dataset()

# concepts.json maps Indonesian glosses to LexiRumah concepts, together with
# any comments the mapping requires. Most of the matches there were found
# automatically through very close matches of the Indonesian or English
# gloss, with some manual corrections.
concepts = json.load((Path(__file__).parent / "concepts.json").open())

new_sources = pybtex.database.BibliographyData()
new_lects = list(lr["LanguageTable"].iterdicts())
new_forms = list(lr["FormTable"].iterdicts())
synonym_counts = Counter()

header = None
for row in xlrd.open_workbook(
        str(Path(__file__).parent / "Buton Muna Wordlists.xlsx")
        ).sheet_by_index(0).get_rows():
    score = Column(Float)

    def __repr__(self):
        return "{:}/{:}:{:}".format(self.form1, self.form2, self.score)


Base.metadata.create_all(engine)

from sqlalchemy.orm import sessionmaker
Session = sessionmaker(bind=engine)
Session.configure(bind=engine)  # once engine is available
session = Session()

import pylexirumah
dataset = pylexirumah.get_dataset()

from lingpy.convert.strings import scorer2str
from lingpy.read.qlc import read_scorer
from lingpy.compare.partial import Partial
import lingpy


def import_all_languages():
    # Add every lect from the LexiRumah LanguageTable to the database,
    # skipping (and rolling back) those that are already there.
    for language in dataset["LanguageTable"].iterdicts():
        session.add(Language(id=language["ID"], language=language["Name"]))
        try:
            session.commit()
        except sqlalchemy.exc.IntegrityError:
            print("Language {:} already exists.".format(language["ID"]))
            session.rollback()
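# --- Illustrative usage sketch (not part of the original script). ----------
# Assuming the mapped class whose tail is shown above is called `Score` and
# stores pairwise similarities in its `form1`, `form2` and `score` columns,
# the session set up above could be queried like this. The function name and
# the `minimum` threshold parameter are hypothetical.
def scores_for_form(form_id, minimum=0.0):
    """Return all stored pairwise scores involving `form_id` (sketch only)."""
    return session.query(Score).filter(
        sqlalchemy.or_(Score.form1 == form_id, Score.form2 == form_id),
        Score.score >= minimum).all()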
import argparse
import collections

import pyclts
bipa = pyclts.TranscriptionSystem()

from pylexirumah import get_dataset, repository

parser = argparse.ArgumentParser(
    description="List the sound inventories contained in a CLDF Wordlist")
parser.add_argument("--dataset", default=None)
args = parser.parse_args()

dataset = get_dataset(args.dataset)

inventories = collections.defaultdict(collections.Counter)
c_language = dataset["FormTable", "languageReference"].name
c_segments = dataset["FormTable", "segments"].name
for row in dataset["FormTable"].iterdicts():
    # Normalise every segment to its canonical BIPA representation before
    # counting, so variant transcriptions are tallied together.
    normalized = [str(bipa[x]) for x in row[c_segments]]
    inventories[row[c_language]].update(normalized)

all = collections.Counter()
for language, inventory in inventories.items():
    print(language)
    for item, frequency in inventory.most_common():
        print("\t{:}\t{:d}".format(item, frequency))
    all.update(inventory)
    print()

print("Summa")
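# --- Illustrative usage sketch (not part of the original script). ----------
# With the per-language Counters built above, inventories can be compared
# directly, e.g. to list segments attested for one lect but not for another.
# The function name and its parameters are hypothetical.
def inventory_difference(lect_a, lect_b):
    """Segments counted for `lect_a` that never occur for `lect_b` (sketch)."""
    return set(inventories[lect_a]) - set(inventories[lect_b])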
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--force", action='store_true')
    parser.add_argument("dataset", nargs="+")
    args = parser.parse_args()

    for argument in args.dataset:
        if argument in stats and not args.force:
            continue
        datafile = Path(argument)
        print(datafile)
        basename = datafile.stem
        dataset = get_dataset(datafile)

        c_language = dataset["FormTable", "languageReference"].name
        c_concept = dataset["FormTable", "parameterReference"].name
        c_form = dataset["FormTable", "form"].name
        c_segments = dataset["FormTable", "segments"].name

        lects = list(
            set(row[c_language] for row in dataset["FormTable"].iterdicts()))

        concepts = []
        wordlengths = []
        synonyms = []
        raw_segments = {}
        for l, language in enumerate(lects):
            print(language)
            c = {}
            concepts.append(c)
        try:
            if d == "ADM1":
                continue
            else:
                element = geonames.reverse(
                    (latitude, longitude), feature_code=d,
                    find_nearby_type='findNearby', exactly_one=False)[0]
                address.insert(0, element.raw["name"])
        except (geopy.exc.GeocoderServiceError, TypeError):
            continue
    return address


if __name__ == "__main__":
    data = get_dataset()
    lang = data["LanguageTable"]
    updated = []
    for language in lang.iterdicts():
        if not language["Latitude"]:
            updated.append(language)
            continue
        print(language["Name"])
        latlon = (language["Latitude"], language["Longitude"])
        print("{:10.5f} {:10.5f}".format(*latlon))
        region = get_region(*latlon)
        # Pause between lookups to avoid hammering the GeoNames web service.
        sleep(1)
        print(region)
        if not region:
            continue
        language["Region"] = ", ".join(region)