def create(self, req, filename=None, verbose=True, outfile=None):
    with safe_overwrite(outfile or self.abspath(req)) as tmp:
        if self.rdf:
            # We do not create archives with a README for RDF downloads, because each
            # RDF entity points to the dataset, and the VoID description of the dataset
            # covers all relevant metadata.
            #
            # TODO: add a test for the file name handling!
            #
            with closing(GzipFile(
                    filename=Path(tmp.stem).stem, fileobj=tmp.open('wb'))) as fp:
                self.before(req, fp)
                for i, item in enumerate(page_query(self.query(req), verbose=verbose)):
                    self.dump(req, fp, item, i)
                self.after(req, fp)
        else:
            with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
                if not filename:
                    fp = self.get_stream()
                    self.before(req, fp)
                    for i, item in enumerate(page_query(self.query(req), verbose=verbose)):
                        self.dump(req, fp, item, i)
                    self.after(req, fp)
                    zipfile.writestr(self.name, self.read_stream(fp))
                else:  # pragma: no cover
                    zipfile.write(Path(filename).as_posix(), self.name)
                zipfile.writestr(
                    'README.txt',
                    format_readme(req, req.db.query(Dataset).first()).encode('utf8'))

def langsearch(args):
    """Search Glottolog languoids.

    glottolog --repos=. langsearch "QUERY"
    """
    def highlight(text):
        res, i = '', 0
        for m in re.finditer(r'\[\[(?P<m>[^\]]+)\]\]', text):
            res += text[i:m.start()]
            res += colored(m.group('m'), 'red', attrs=['bold'])
            i = m.end()
        res += text[i:]
        return res + '\n'

    count, results = fts.search_langs(args.repos, args.args[0])
    cwd = os.getcwd()
    print('{} matches'.format(count))
    for res in results:
        try:
            p = Path(res.fname).relative_to(Path(cwd))
        except ValueError:
            p = res.fname
        sprint('{0.name} [{0.id}] {0.level}'.format(res), color=None, attrs=['bold'])
        sprint(p, color='green')
        sprint(highlight(res.highlights) if res.highlights else '')
    print('{} matches'.format(count))

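# The `highlight` helper above assumes the full-text index wraps hits in
# [[...]] markers. A minimal standalone sketch of the same transformation,
# assuming `colored` is termcolor.colored (the snippet string is made up):
import re
from termcolor import colored

def demo_highlight(text):
    """Render [[...]] search-hit markers as bold red terminal text."""
    out, i = '', 0
    for m in re.finditer(r'\[\[(?P<m>[^\]]+)\]\]', text):
        out += text[i:m.start()] + colored(m.group('m'), 'red', attrs=['bold'])
        i = m.end()
    return out + text[i:]

print(demo_highlight('name: [[German]] iso: [[deu]]'))  # hypothetical highlighted hit
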
def iso2codes(args):
    """
    Map ISO codes to the list of all Glottolog languages and dialects subsumed
    "under" them.
    """
    nodes = list(args.repos.languoids())

    res = {}
    for node in nodes:
        if node.iso:
            res[node.id] = (node.iso, set())

    for node in nodes:
        if node.level == args.repos.languoid_levels.family or node.id in res:
            continue
        for nid in res:
            matched = False
            for l in node.lineage:
                if l[1] == nid:
                    res[nid][1].add(node.id)
                    matched = True
                    break
            if matched:
                break

    outdir = Path('.') if not args.args else Path(args.args[0])
    with UnicodeWriter(outdir / 'iso2glottocodes.csv') as writer:
        writer.writerow(['iso', 'glottocodes'])
        for gc, (iso, gcs) in res.items():
            writer.writerow([iso, ';'.join([gc] + list(gcs))])

def test_catalogue_of_life(tmpdir):
    data = fixtures('data_providers', 'catalogueoflife')
    id_ = '9249d9473aac5c8e99fb9d758ced91ec'
    repos = create_repos(tmpdir)

    with patch('pytsammalex.util.requests', MockRequests(content=data['identify'])):
        prov = CatalogueOfLife(Path(repos))
        assert prov.identify('x') == id_

    with patch('pytsammalex.util.requests', MockRequests(content=data['metadata'])):
        prov = CatalogueOfLife(Path(repos))
        md = prov.cached_metadata('test', id_)
        taxon = {}
        prov.update(taxon, md)
        assert taxon == {
            'catalogueoflife_url':
                'http://www.catalogueoflife.org/col/browse/tree/id/'
                '9249d9473aac5c8e99fb9d758ced91ec',
            'class': 'Mammalia',
            'family': 'Felidae',
            'kingdom': 'Animalia',
            'order': 'Carnivora',
            'phylum': 'Chordata',
        }

def __init__(self, repos=None):
    self.repos = (Path(repos) if repos else Path(__file__).parent.parent).resolve()
    self.tree = self.repos / 'languoids' / 'tree'
    if not self.tree.exists():
        raise ValueError('repos dir %s missing tree dir: %s' % (self.repos, self.tree))

def cldf(args):
    """
    Create CLDF datasets from the raw data for a dataset.

    lexibank --glottolog-repos PATH --concepticon-repos PATH cldf [DATASET_ID]
    """
    if not args.glottolog_repos or not Path(args.glottolog_repos).exists():
        raise ParserError('Invalid glottolog repository path given')

    if not args.concepticon_repos or not Path(args.concepticon_repos).exists():
        raise ParserError('Invalid concepticon repository path given')

    # FIXME: get dict of all glottolog langs right here, and attach to datasets!
    try:
        languoids = load('glottolog')
    except ValueError:
        languoids = {l.id: l for l in Glottolog(args.glottolog_repos).languoids()}
        dump(languoids, 'glottolog')

    def _cldf(ds, **kw):
        ds.glottolog_languoids = languoids
        ds.cldf(**kw)
        ds.write_cognates()

    with_dataset(args, _cldf)

def test_data_url_from_string():
    from clldutils.path import Path

    assert data_url('ü') == 'data:application/octet-stream;base64,w7w='
    assert data_url(Path(__file__)).startswith('data:')
    assert data_url(Path(__file__), mimetype='text/plain').startswith('data:text/plain')

def create_repos(dir_):
    tsammalexdata = dir_.join('tsammalexdata')
    tsammalexdata.mkdir()
    data = tsammalexdata.join('data')
    data.mkdir()

    with data.join('test.csv').open('w', encoding='utf8') as fp:
        fp.write("""\
a,b,c
1,2,3
4,5,6""")

    with data.join('distribution.csv').open('w', encoding='utf8') as fp:
        fp.write("id,coregions__ids,countries_ids")

    test_eco_path = fixture_path('test_ecoregions.json')
    eco_path = data.join('ecoregions.json')
    copy(Path(test_eco_path), Path(eco_path))

    external = data.join('external')
    external.mkdir()
    with external.join('test.csv').open('w', encoding='utf8') as fp:
        fp.write("""\
a,b,c
1,2,3
4,5,6""")

    external.join('gbif').mkdir()
    occurrences = fixture_path('abelmoschusesculentus.json')
    copy(Path(occurrences), Path(external.join('gbif', occurrences.name)))
    return dir_

def get_ini(fname, **kw):
    fname = Path(fname)
    if not fname.exists():
        # For old-style (<=3.4) repository layout we ship the config data with pyglottolog:
        name = fname.name if fname.name != 'hhtype.ini' else 'document_types.ini'
        fname = Path(__file__).parent / name
    assert fname.exists()
    return INI.from_file(fname, **kw)

def create_archive(args):
    rels = get_release_config()
    for section in rels.sections():
        _load_sql_dump(rels[section], args.log)
    out = Path('archive')
    if args.args:
        out = Path(args.args[0])
    static_archive.create(
        [rels.get(sec, 'version') for sec in rels.sections()], out)
    args.log.info('static archive created in {0}'.format(out))

def new_dataset(args):
    """
    lexibank new-dataset OUTDIR [ID]
    """
    if not args.args:
        raise ParserError('you must specify an existing directory')
    outdir = Path(args.args.pop(0))
    if not outdir.exists():
        raise ParserError('you must specify an existing directory')

    id_pattern = re.compile('[a-z_0-9]+$')
    md = {}
    if args.args:
        md['id'] = args.args.pop(0)
    else:
        md['id'] = input('Dataset ID: ')
    while not id_pattern.match(md['id']):
        print('dataset id must only consist of lowercase ascii letters, digits and _ (underscore)!')
        md['id'] = input('Dataset ID: ')

    outdir = outdir / md['id']
    if not outdir.exists():
        outdir.mkdir()

    for key in ['title', 'url', 'license', 'conceptlist', 'citation']:
        md[key] = input('Dataset {0}: '.format(key))

    # check license!
    # check conceptlist!

    for path in Path(pylexibank.__file__).parent.joinpath('dataset_template').iterdir():
        if path.is_file():
            if path.suffix in ['.pyc']:
                continue  # pragma: no cover
            target = path.name
            content = read_text(path)
            if '+' in path.name:
                target = re.sub(
                    r'\+([a-z]+)\+',
                    lambda m: '{' + m.groups()[0] + '}',
                    path.name).format(**md)
            if target.endswith('_tmpl'):
                target = target[:-5]
                content = content.format(**md)
            write_text(outdir / target, content)
        else:
            target = outdir / path.name
            if target.exists():
                shutil.rmtree(str(target))
            shutil.copytree(str(path), str(target))

    del md['id']
    jsonlib.dump(md, outdir / 'metadata.json', indent=4)

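# The template loop above rewrites file names by turning +key+ markers into
# str.format fields and filling them from the collected metadata; the _tmpl
# suffix additionally marks files whose content is formatted. A self-contained
# sketch of that substitution (file name and metadata are made up):
import re

md = {'id': 'mydataset'}
name = 'lexibank_+id+.py_tmpl'

target = re.sub(r'\+([a-z]+)\+', lambda m: '{' + m.group(1) + '}', name).format(**md)
if target.endswith('_tmpl'):
    target = target[:-5]
assert target == 'lexibank_mydataset.py'
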
def test_json_data(tmpdir):
    tmp_ = create_repos(tmpdir)

    with JsonData('test.json', repos=Path(tmp_)) as jdat:
        jdat['a'] = 1
    assert data_file('test.json', repos=Path(tmp_)).exists()

    with JsonData('test.json', repos=Path(tmp_)) as jdat:
        assert len(jdat) == 1
        assert jdat['a'] == 1

def main():  # pragma: no cover
    pkg_dir = Path(glottolog3.__file__).parent
    parser = ArgumentParserWithLogging('glottolog3')
    parser.add_argument(
        '--repos',
        help="path to glottolog data repository",
        type=Glottolog,
        default=Glottolog(
            Path(glottolog3.__file__).parent.parent.parent.joinpath('glottolog')))
    parser.add_argument('--pkg-dir', help=argparse.SUPPRESS, default=pkg_dir)
    sys.exit(parser.main())

def __init__(self, dataset):
    self._count = defaultdict(int)
    self._cognate_count = defaultdict(int)
    self.dataset = dataset

    md = self.dataset.cldf_dir / MD_NAME
    if not md.exists():
        md = self.dataset.cldf_dir / ALT_MD_NAME
        if not md.exists():
            md = self.dataset.cldf_dir / MD_NAME
            copy(Path(__file__).parent / MD_NAME, md)
    self.wl = Wordlist.from_metadata(md)
    default_cldf = Wordlist.from_metadata(Path(__file__).parent / 'cldf-metadata.json')

    self.objects = {}
    self._obj_index = {}
    for cls in [
        self.dataset.lexeme_class,
        self.dataset.language_class,
        self.dataset.concept_class,
        self.dataset.cognate_class,
    ]:
        self.objects[cls.__cldf_table__()] = []
        self._obj_index[cls.__cldf_table__()] = set()

        cols = set(
            col.header for col in self.wl[cls.__cldf_table__()].tableSchema.columns)
        properties = set(
            col.propertyUrl.uri
            for col in self.wl[cls.__cldf_table__()].tableSchema.columns
            if col.propertyUrl)
        for field in cls.fieldnames():
            try:
                col = default_cldf[cls.__cldf_table__(), field]
                #
                # We added Latitude and Longitude to the default metadata later, and want
                # to make sure existing datasets are upgraded silently.
                #
                if field in ['Latitude', 'Longitude'] \
                        and cls.__cldf_table__() == 'LanguageTable':
                    properties.add(col.propertyUrl.uri)
                    self.wl[cls.__cldf_table__(), field].propertyUrl = col.propertyUrl
                    self.wl[cls.__cldf_table__(), field].datatype = col.datatype
            except KeyError:
                col = Column(name=field, datatype="string")
            if (col.propertyUrl and col.propertyUrl.uri not in properties) or \
                    ((not col.propertyUrl) and (field not in cols)):
                self.wl[cls.__cldf_table__()].tableSchema.columns.append(col)

def main(args):
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    repos = Path(os.path.expanduser('~')).joinpath('venvs/lexirumah/lexirumah-data')

    with transaction.manager:
        dataset = common.Dataset(
            id=lexirumah.__name__,
            name="lexirumah",
            publisher_name="Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexirumah.model-ling.eu',
            contact='*****@*****.**',
            jsondata={
                'license_icon': 'cc-by.png',
                'license_name': 'Creative Commons Attribution 4.0 International License',
            })
        DBSession.add(dataset)

        glottolog_repos = Path(lexirumah.__file__).parent.parent.parent.parent.joinpath(
            'glottolog3', 'glottolog')
        languoids = {l.id: l for l in Glottolog(glottolog_repos).languoids()}
        concepticon = Concepticon(
            Path(lexirumah.__file__).parent.parent.parent.parent.joinpath(
                'concepticon', 'concepticon-data'))
        conceptsets = {c.id: c for c in concepticon.conceptsets.values()}

        for dname in sorted(repos.joinpath('datasets').iterdir(), key=lambda p: p.name):
            if dname.is_dir() and dname.name != '_template':
                mdpath = dname.joinpath('cldf', 'metadata.json')
                if mdpath.exists():
                    print(dname.name)
                    import_cldf(dname, load(mdpath), languoids, conceptsets)

    with transaction.manager:
        load_families(
            Data(),
            DBSession.query(LexiRumahLanguage),
            glottolog_repos=glottolog_repos,
            isolates_icon='tcccccc')

def test_update(tmpdir):
    repos = create_repos(tmpdir)

    with patch.multiple(
            'pytsammalex.distribution',
            shape=Mock(return_value=Mock(return_value=True)),
            Point=Mock()):
        update(Path(repos), log=Mock())

    data = CsvData('distribution', repos=Path(repos))
    assert len(data) == 1
    assert data.items[0].ecoregions__ids == [
        'AT0110', 'AT0111', 'AT0112', 'AT0113', 'AT0114',
        'AT0115', 'AT0116', 'AT0117', 'AT0118', 'AT0119',
    ]

def lff2tree(tree=TREE, outdir=None, builddir=None, lffs=None):
    """
    - get mapping glottocode -> Languoid from old tree
    - assemble new directory tree
      - for each path component in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
      - for each language/dialect in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
    - rm old tree
    - copy new tree
    """
    # FIXME: instead of removing trees, we should just move the current one
    # from outdir to build, and then recreate in outdir.
    builddir = Path(builddir) if builddir else build_path('tree')
    old_tree = {l.id: l for l in walk_tree(tree)} if tree else {}
    out = Path(outdir or tree)
    if not out.parent.exists():
        out.parent.mkdir()

    if out.exists():
        if builddir.exists():
            try:
                rmtree(builddir)
            except:  # pragma: no cover
                pass
            if builddir.exists():  # pragma: no cover
                raise ValueError('please remove %s before proceeding' % builddir)
        # move the old tree out of the way
        shutil.move(out.as_posix(), builddir.as_posix())
    out.mkdir()

    lffs = lffs or {}
    languages = {}
    for lang in read_lff(Level.language, fp=lffs.get(Level.language)):
        languages[lang.id] = lang
        lang2tree(lang, lang.lineage, out, old_tree)

    for lang in read_lff(Level.dialect, fp=lffs.get(Level.dialect)):
        if not lang.lineage or lang.lineage[0][1] not in languages:
            raise ValueError('unattached dialect')  # pragma: no cover

        lang2tree(
            lang, languages[lang.lineage[0][1]].lineage + lang.lineage, out, old_tree)

def setUp(self):
    WithTempDir.setUp(self)
    self.repos = self.tmp_path()
    self.languoids = self.tmp_path('languoids')
    copytree(Path(__file__).parent.joinpath('data', 'languoids'), self.languoids)
    self.tree = self.languoids.joinpath('tree')
    self.references = self.tmp_path('references')
    copytree(Path(__file__).parent.joinpath('data', 'references'), self.references)
    self.tmp_path('build').mkdir()

def download_tables(outdir=None):
    match = ZIP_NAME_PATTERN.search(urlopen(BASE_URL + 'download.asp').read())
    if not match:
        raise ValueError('no matching zip file name found')  # pragma: no cover
    target = Path(outdir or '.').joinpath(match.group('name'))
    urlretrieve(BASE_URL + match.group('name'), target.as_posix())
    return target

def main():
    parser = ArgumentParser(
        'pytsammalex', update_taxa, upload_images, update_distribution)
    parser.add_argument(
        '--tsammalex-data',
        help="path to tsammalex-data repository",
        default=Path(pytsammalex.__file__).parent.parent)
    sys.exit(parser.main())

def __init__(self, fname):
    """
    A `Database` instance is initialized with a file path.

    :param fname: Path to a file in the file system where the db is to be stored.
    """
    self.fname = Path(fname)

def main():  # pragma: no cover
    parser = ArgumentParser(
        __name__,
        link,
        stats,
        attributes,
        intersection,
        union,
        upload_sources,
        map_concepts,
        lookup)
    parser.add_argument(
        '--data',
        help="path to concepticon-data",
        default=Path(pyconcepticon.__file__).parent.parent)
    parser.add_argument(
        '--full_search',
        help="select between approximate search (default) and full search",
        default=False,
        action='store_true')
    parser.add_argument(
        '--output',
        help="specify output file",
        default=None)
    parser.add_argument(
        '--similarity',
        help="specify level of similarity for concept mapping",
        default=5,
        type=int)
    parser.add_argument(
        '--language',
        help="specify your desired language for mapping",
        default='en',
        type=text_type)
    sys.exit(parser.main())

def get_config(p):
    """Read a config file.

    :return: dict of ('section.option', value) pairs.
    """
    if not isinstance(p, Path):
        p = Path(p)
    cfg = {}

    parser = ConfigParser()
    parser.readfp(p.open(encoding='utf8'))

    for section in parser.sections():
        getters = {
            'int': partial(parser.getint, section),
            'boolean': partial(parser.getboolean, section),
            'float': partial(parser.getfloat, section),
            'list': lambda option: parser.get(section, option).split(),
        }
        default = partial(parser.get, section)
        for option in parser.options(section):
            type_ = option.rpartition('_')[2] if '_' in option else None
            value = getters.get(type_, default)(option)
            cfg['{0}.{1}'.format(section, option)] = value

    return cfg

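# get_config derives each value's type from the option-name suffix: _int,
# _boolean, _float and _list pick the typed getters, everything else stays a
# string. A usage sketch (the file name and options are made up):
import io

with io.open('settings.ini', 'w', encoding='utf8') as fp:
    fp.write(u'[web]\n'
             u'host = localhost\n'
             u'port_int = 8080\n'
             u'debug_boolean = true\n'
             u'timeout_float = 2.5\n'
             u'plugins_list = search export\n')

cfg = get_config('settings.ini')
assert cfg['web.host'] == 'localhost'  # no suffix: plain string
assert cfg['web.port_int'] == 8080  # _int: parser.getint
assert cfg['web.debug_boolean'] is True  # _boolean: parser.getboolean
assert cfg['web.timeout_float'] == 2.5  # _float: parser.getfloat
assert cfg['web.plugins_list'] == ['search', 'export']  # _list: whitespace split
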
def __call__(self, parser, namespace, values, option_string=None):
    path_ = Path(values)
    if not path_.exists():
        raise argparse.ArgumentError(self, 'path does not exist')
    if not path_.is_dir():
        raise argparse.ArgumentError(self, 'path is not a directory')
    setattr(namespace, self.dest, path_)

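# The __call__ above belongs to a custom argparse.Action that validates
# directory arguments. Wiring it up could look like this sketch (the class
# name ExistingDirAction is an assumption, not the original name):
import argparse
from pathlib import Path

class ExistingDirAction(argparse.Action):
    """Store the argument as a Path, rejecting anything but an existing directory."""

    def __call__(self, parser, namespace, values, option_string=None):
        path_ = Path(values)
        if not path_.exists():
            raise argparse.ArgumentError(self, 'path does not exist')
        if not path_.is_dir():
            raise argparse.ArgumentError(self, 'path is not a directory')
        setattr(namespace, self.dest, path_)

parser = argparse.ArgumentParser()
parser.add_argument('--repos', action=ExistingDirAction)
args = parser.parse_args(['--repos', '.'])  # args.repos is now a validated Path
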
class Tests(TestWithApp):
    __cfg__ = Path(plld_app.__file__).parent.joinpath('..', 'development.ini').resolve()
    __setup_db__ = False

    def test_home(self):
        res = self.app.get('/', status=200)

class Dataset(pylexibank.Dataset):
    dir = Path(__file__).parent
    id = "othaniel2017"

    form_spec = pylexibank.FormSpec(
        brackets={"(": ")"},
        missing_data=("", " ", "-"),
        replacements=[("ɗɨ̀ŋvi ̀", "ɗɨ̀ŋvi"), (" ", "_")],
        strip_inside_brackets=True)

    def cmd_makecldf(self, args):
        data = self.raw_dir.read_csv("raw.csv", dicts=True)
        args.writer.add_sources()
        languages = args.writer.add_languages(lookup_factory="Name")
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name")

        for row in pylexibank.progressbar(data):
            for language, lexeme in row.items():
                if language in languages:
                    args.writer.add_forms_from_value(
                        Language_ID=languages[language],
                        Parameter_ID=concepts[row["Gloss"]],
                        Value=lexeme,
                        Source="Othaniel2017",
                    )

def ls(args):
    """
    gelato ls [COLS]+

    column specification:
    - license
    - macroareas
    """
    table = Table('ID', 'Title')
    cols = [col for col in args.args if col in ['license', 'macroareas']]
    tl = 40
    if args.args:
        tl = 25
    table.columns.extend(col.capitalize() for col in cols)

    for d in data_path(repos=Path(args.gelato_repos)).iterdir():
        if is_dataset_dir(d):
            ds = Dataset(d)
            row = [d.name, ds.md['dc:title']]
            for col in cols:
                if col == 'license':
                    lic = licenses.find(ds.md.get('dc:license') or '')
                    row.append(lic.id if lic else ds.md.get('dc:license'))
            table.append(row)

    print(table.render(tablefmt='simple', sortkey=lambda r: r[0], condensed=False))

def __init__(self, name, default=None, **kw):
    """Initialization.

    :param name: Basename for the config file (suffix .ini will be appended).
    :param default: Default content of the config file.
    """
    self.name = name
    self.default = default
    config_dir = Path(kw.pop('config_dir', None) or DIR)

    RawConfigParser.__init__(self, kw, allow_no_value=True)
    if self.default:
        if PY3:
            fp = io.StringIO(self.default)
        else:
            fp = io.BytesIO(self.default.encode('utf8'))
        self.readfp(fp)

    cfg_path = config_dir.joinpath(name + '.ini')
    if cfg_path.exists():
        assert cfg_path.is_file()
        self.read(cfg_path.as_posix())
    else:
        if not config_dir.exists():
            try:
                config_dir.mkdir()
            except OSError:  # pragma: no cover
                # this happens when run on travis-ci, by a system user.
                pass
        if config_dir.exists():
            with open(cfg_path.as_posix(), 'w') as fp:
                self.write(fp)
    self.path = cfg_path

def test_Manifest(tmppath):
    d = Path(__file__).parent
    m = {k: v for k, v in Manifest.from_dir(d).items()}
    copytree(d, tmppath / 'd')
    assert m == Manifest.from_dir(tmppath / 'd')
    copytree(d, tmppath / 'd' / 'd')
    assert m != Manifest.from_dir(tmppath / 'd')

def __init__(self, name, dir_=None, default=None, **kw):
    """Initialization.

    :param name: Basename for the config file (suffix .ini will be appended).
    :param default: Default content of the config file.
    """
    INI.__init__(self, kw, allow_no_value=True)
    self.name = name
    config_dir = Path(dir_ or CONFIG_DIR)
    if default:
        if isinstance(default, text_type):
            self.read_string(default)
        # elif isinstance(default, (dict, OrderedDict)):
        #     self.read_dict(default)

    cfg_path = config_dir.joinpath(name + '.ini')
    if cfg_path.exists():
        assert cfg_path.is_file()
        self.read(cfg_path.as_posix())
    else:
        if not config_dir.exists():
            try:
                config_dir.mkdir()
            except OSError:  # pragma: no cover
                # this happens when run on travis-ci, by a system user.
                pass
        if config_dir.exists():
            self.write(cfg_path.as_posix())
    self.path = cfg_path

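# Both config initializers above implement the same pattern: read the ini file
# if it already exists, otherwise seed it from `default` and persist it. A
# hedged usage sketch, assuming the surrounding class is called Config and
# accepts the dir_ keyword defined above:
cfg = Config('cache', dir_='/tmp/myapp-config',
             default='[api]\nurl = https://example.org\n')

# First run: /tmp/myapp-config/cache.ini is created with the default content.
# Later runs: the existing file is read instead, so user edits survive.
print(cfg.get('api', 'url'))
print(cfg.path)  # the on-disk location backing this instance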