Пример #1
0
 def create(self, req, filename=None, verbose=True, outfile=None):
     """Create the download file, either as a gzipped dump (RDF) or a zip archive.

     :param req: the current request, used to run `self.query` and to look up metadata.
     :param filename: optional path of a pre-built file to package instead of \
dumping query results (zip case only).
     :param verbose: passed on to `page_query` to control progress reporting.
     :param outfile: optional explicit target path; defaults to `self.abspath(req)`.
     """
     # Write to a temporary file first so a failed run does not clobber an
     # existing download (safe_overwrite swaps it in on success).
     with safe_overwrite(outfile or self.abspath(req)) as tmp:
         if self.rdf:
             # we do not create archives with a readme for rdf downloads, because each
             # RDF entity points to the dataset and the void description of the dataset
             # covers all relevant metadata.
             #
             # TODO: write test for the file name things!?
             #
             with closing(
                     GzipFile(filename=Path(tmp.stem).stem,
                              fileobj=tmp.open('wb'))) as fp:
                 self.before(req, fp)
                 for i, item in enumerate(
                         page_query(self.query(req), verbose=verbose)):
                     self.dump(req, fp, item, i)
                 self.after(req, fp)
         else:
             with ZipFile(tmp.as_posix(), 'w', ZIP_DEFLATED) as zipfile:
                 if not filename:
                     # Dump the query results into a stream and add it to the
                     # archive under the download's name.
                     fp = self.get_stream()
                     self.before(req, fp)
                     for i, item in enumerate(
                             page_query(self.query(req), verbose=verbose)):
                         self.dump(req, fp, item, i)
                     self.after(req, fp)
                     zipfile.writestr(self.name, self.read_stream(fp))
                 else:  # pragma: no cover
                     # A pre-built file was supplied; package it as-is.
                     zipfile.write(Path(filename).as_posix(), self.name)
                 # Zip downloads ship with a README describing the dataset.
                 zipfile.writestr(
                     'README.txt',
                     format_readme(
                         req,
                         req.db.query(Dataset).first()).encode('utf8'))
Пример #2
0
def langsearch(args):
    """Search Glottolog languoids

    glottolog --repos=. langsearch "QUERY"
    """
    def highlight(text):
        """Return *text* with ``[[...]]`` spans rendered red/bold for the terminal."""
        res, i = '', 0
        # Raw string literal: '\[' and '\]' in a plain string are invalid
        # escape sequences (DeprecationWarning, future SyntaxError).
        for m in re.finditer(r'\[\[(?P<m>[^\]]+)\]\]', text):
            res += text[i:m.start()]
            res += colored(m.group('m'), 'red', attrs=['bold'])
            i = m.end()
        res += text[i:]
        return res + '\n'

    count, results = fts.search_langs(args.repos, args.args[0])
    cwd = os.getcwd()
    print('{} matches'.format(count))
    for res in results:
        try:
            # Prefer a path relative to the working directory for readability.
            p = Path(res.fname).relative_to(Path(cwd))
        except ValueError:
            p = res.fname
        sprint('{0.name} [{0.id}] {0.level}'.format(res),
               color=None,
               attrs=['bold'])
        sprint(p, color='green')
        sprint(highlight(res.highlights) if res.highlights else '')
    print('{} matches'.format(count))
Пример #3
0
def iso2codes(args):
    """
    Map ISO codes to the list of all Glottolog languages and dialects subsumed "under" it.
    """
    languoids = list(args.repos.languoids())

    # glottocode of each ISO-coded languoid -> (iso code, set of subsumed glottocodes)
    mapping = {lang.id: (lang.iso, set()) for lang in languoids if lang.iso}

    for lang in languoids:
        if lang.level == args.repos.languoid_levels.family or lang.id in mapping:
            continue
        # Attach the languoid to the first ISO-coded ancestor found in its lineage.
        for gc in mapping:
            if any(entry[1] == gc for entry in lang.lineage):
                mapping[gc][1].add(lang.id)
                break

    outdir = Path(args.args[0]) if args.args else Path('.')
    with UnicodeWriter(outdir / 'iso2glottocodes.csv') as writer:
        writer.writerow(['iso', 'glottocodes'])
        for gc, (iso, subsumed) in mapping.items():
            writer.writerow([iso, ';'.join([gc] + list(subsumed))])
Пример #4
0
def test_catalogue_of_life(tmpdir):
    """Exercise identify() and cached_metadata()/update() against canned responses."""
    fixture_data = fixtures('data_providers', 'catalogueoflife')
    expected_id = '9249d9473aac5c8e99fb9d758ced91ec'
    repos = create_repos(tmpdir)

    with patch('pytsammalex.util.requests',
               MockRequests(content=fixture_data['identify'])):
        provider = CatalogueOfLife(Path(repos))
        assert provider.identify('x') == expected_id

    with patch('pytsammalex.util.requests',
               MockRequests(content=fixture_data['metadata'])):
        provider = CatalogueOfLife(Path(repos))
        metadata = provider.cached_metadata('test', expected_id)
        taxon = {}
        provider.update(taxon, metadata)
        expected = {
            'catalogueoflife_url': (
                'http://www.catalogueoflife.org/col/'
                'browse/tree/id/'
                '9249d9473aac5c8e99fb9d758ced91ec'),
            'class': 'Mammalia',
            'family': 'Felidae',
            'kingdom': 'Animalia',
            'order': 'Carnivora',
            'phylum': 'Chordata',
        }
        assert taxon == expected
Пример #5
0
 def __init__(self, repos=None):
     self.repos = (Path(repos)
                   if repos else Path(__file__).parent.parent).resolve()
     self.tree = self.repos / 'languoids' / 'tree'
     if not self.tree.exists():
         raise ValueError('repos dir %s missing tree dir: %s' %
                          (self.repos, self.tree))
Пример #6
0
def cldf(args):
    """
    Create CLDF datasets from the raw data for a dataset.

    lexibank --glottolog-repos PATH --concepticon-repos PATH cldf [DATASET_ID]
    """
    if not args.glottolog_repos or not Path(args.glottolog_repos).exists():
        raise ParserError('Invalid glottolog repository path given')

    if not args.concepticon_repos or not Path(args.concepticon_repos).exists():
        raise ParserError('Invalid concepticon repository path given')

    # FIXME: get dict of all glottolog langs right here, and attach to datasets!
    try:
        languoids = load('glottolog')
    except ValueError:
        # Cache miss: build the languoid index from the repository and persist it.
        languoids = {}
        for languoid in Glottolog(args.glottolog_repos).languoids():
            languoids[languoid.id] = languoid
        dump(languoids, 'glottolog')

    def _cldf(ds, **kw):
        # Attach the languoid index before writing CLDF data and cognates.
        ds.glottolog_languoids = languoids
        ds.cldf(**kw)
        ds.write_cognates()

    with_dataset(args, _cldf)
Пример #7
0
def test_data_url_from_string():
    from clldutils.path import Path

    # Non-ASCII text is base64-encoded with the generic octet-stream mimetype.
    encoded = data_url('ü')
    assert encoded == 'data:application/octet-stream;base64,w7w='

    this_file = Path(__file__)
    assert data_url(this_file).startswith('data:')
    plain = data_url(this_file, mimetype='text/plain')
    assert plain.startswith('data:text/plain')
Пример #8
0
def create_repos(dir_):
    """Populate *dir_* with a minimal tsammalex-data repository layout for tests.

    Creates ``tsammalexdata/data`` (plus ``external`` and ``external/gbif``
    subdirectories) and fills them with small CSV/JSON fixtures.

    :param dir_: a ``py.path.local`` directory (e.g. pytest's ``tmpdir``).
    :return: *dir_*, for convenient chaining.
    """
    # The same sample CSV was previously duplicated inline twice.
    sample_csv = "a,b,c\n1,2,3\n4,5,6"

    def _write(path_, text):
        # Helper: write *text* to the py.path *path_* as UTF-8.
        with path_.open('w', encoding='utf8') as fp:
            fp.write(text)

    tsammalexdata = dir_.join('tsammalexdata')
    tsammalexdata.mkdir()
    data = tsammalexdata.join('data')
    data.mkdir()

    _write(data.join('test.csv'), sample_csv)
    _write(data.join('distribution.csv'), "id,coregions__ids,countries_ids")

    # Copy the ecoregions fixture into the repository layout.
    copy(Path(fixture_path('test_ecoregions.json')),
         Path(data.join('ecoregions.json')))

    external = data.join('external')
    external.mkdir()
    _write(external.join('test.csv'), sample_csv)

    external.join('gbif').mkdir()
    occurrences = fixture_path('abelmoschusesculentus.json')
    copy(Path(occurrences), Path(external.join('gbif', occurrences.name)))

    return dir_
Пример #9
0
def get_ini(fname, **kw):
    """Read an INI file, falling back to the config data shipped with pyglottolog.

    :param fname: path of the INI file to read.
    :param kw: keyword arguments passed on to `INI.from_file`.
    :return: an `INI` instance.
    :raises FileNotFoundError: if neither *fname* nor a packaged fallback exists.
    """
    fname = Path(fname)
    if not fname.exists():
        # For old-style (<=3.4) repository layout we ship the config data with pyglottolog:
        name = fname.name if fname.name != 'hhtype.ini' else 'document_types.ini'
        fname = Path(__file__).parent / name
    if not fname.exists():
        # Explicit raise instead of `assert`: asserts are stripped under -O.
        raise FileNotFoundError('config file not found: %s' % fname)
    return INI.from_file(fname, **kw)
Пример #10
0
def create_archive(args):
    """Load all release SQL dumps and build a static archive from them."""
    releases = get_release_config()
    for section in releases.sections():
        _load_sql_dump(releases[section], args.log)

    # Target directory defaults to ./archive unless given as first argument.
    out = Path(args.args[0]) if args.args else Path('archive')
    versions = [releases.get(section, 'version')
                for section in releases.sections()]
    static_archive.create(versions, out)
    args.log.info('static archive created in {0}'.format(out))
Пример #11
0
def new_dataset(args):
    """
    lexibank new-dataset OUTDIR [ID]

    Interactively scaffold a new dataset from the bundled template directory.

    :raises ParserError: if no existing output directory was specified.
    """
    if not args.args:
        raise ParserError('you must specify an existing directory')
    outdir = Path(args.args.pop(0))
    if not outdir.exists():
        raise ParserError('you must specify an existing directory')

    # Raw string: regex literals should not rely on non-raw escape handling.
    id_pattern = re.compile(r'[a-z_0-9]+$')
    md = {}
    if args.args:
        md['id'] = args.args.pop(0)
    else:
        md['id'] = input('Dataset ID: ')

    while not id_pattern.match(md['id']):
        print(
            'dataset id must only consist of lowercase ascii letters, digits and _ (underscore)!'
        )
        md['id'] = input('Dataset ID: ')

    outdir = outdir / md['id']
    if not outdir.exists():
        outdir.mkdir()

    for key in ['title', 'url', 'license', 'conceptlist', 'citation']:
        md[key] = input('Dataset {0}: '.format(key))

    # check license!
    # check conceptlist!

    for path in Path(
            pylexibank.__file__).parent.joinpath('dataset_template').iterdir():
        if path.is_file():
            if path.suffix in ['.pyc']:
                continue  # pragma: no cover
            target = path.name
            content = read_text(path)
            if '+' in path.name:
                # File names like "+id+.py" are templates: "+key+" expands
                # to the metadata value md[key]. Raw string avoids the
                # invalid escape sequence '\+' in a plain string literal.
                target = re.sub(r'\+([a-z]+)\+',
                                lambda m: '{' + m.groups()[0] + '}',
                                path.name).format(**md)
            if target.endswith('_tmpl'):
                # "_tmpl" files have their *content* templated as well.
                target = target[:-5]
                content = content.format(**md)
            write_text(outdir / target, content)
        else:
            # Directories are copied wholesale, replacing stale copies.
            target = outdir / path.name
            if target.exists():
                shutil.rmtree(str(target))
            shutil.copytree(str(path), str(target))
    del md['id']
    jsonlib.dump(md, outdir / 'metadata.json', indent=4)
Пример #12
0
def test_json_data(tmpdir):
    """JsonData persists mapping entries across context-manager sessions."""
    repos = Path(create_repos(tmpdir))

    with JsonData('test.json', repos=repos) as jdat:
        jdat['a'] = 1

    assert data_file('test.json', repos=repos).exists() is True

    # Re-open and verify the previously written entry survived.
    with JsonData('test.json', repos=repos) as jdat:
        assert len(jdat) == 1
        assert jdat['a'] == 1
Пример #13
0
def main():  # pragma: no cover
    """Entry point for the glottolog3 command line interface."""
    pkg_dir = Path(glottolog3.__file__).parent
    # Default repository: a "glottolog" checkout two levels above the package.
    default_repos = Glottolog(pkg_dir.parent.parent.joinpath('glottolog'))
    parser = ArgumentParserWithLogging('glottolog3')
    parser.add_argument(
        '--repos',
        help="path to glottolog data repository",
        type=Glottolog,
        default=default_repos)
    parser.add_argument('--pkg-dir', help=argparse.SUPPRESS, default=pkg_dir)
    sys.exit(parser.main())
Пример #14
0
    def __init__(self, dataset):
        """Set up the CLDF writer state for *dataset*.

        Loads the dataset's CLDF metadata (seeding it from the packaged
        default if none exists) and extends the table schemas with any
        fields declared on the dataset's object classes that are missing.

        :param dataset: the dataset whose CLDF data is to be written.
        """
        self._count = defaultdict(int)
        self._cognate_count = defaultdict(int)
        self.dataset = dataset

        # Prefer the primary metadata file, then the alternative name; if
        # neither exists, copy the packaged default into the dataset.
        md = self.dataset.cldf_dir / MD_NAME
        if not md.exists():
            md = self.dataset.cldf_dir / ALT_MD_NAME
            if not md.exists():
                md = self.dataset.cldf_dir / MD_NAME
                copy(Path(__file__).parent / MD_NAME, md)
        self.wl = Wordlist.from_metadata(md)
        default_cldf = Wordlist.from_metadata(
            Path(__file__).parent / 'cldf-metadata.json')

        self.objects = {}
        self._obj_index = {}
        for cls in [
                self.dataset.lexeme_class,
                self.dataset.language_class,
                self.dataset.concept_class,
                self.dataset.cognate_class,
        ]:
            self.objects[cls.__cldf_table__()] = []
            self._obj_index[cls.__cldf_table__()] = set()

            # Column headers and propertyUrls already present in the schema,
            # used below to decide whether a field still needs a column.
            cols = set(
                col.header
                for col in self.wl[cls.__cldf_table__()].tableSchema.columns)
            properties = set(
                col.propertyUrl.uri
                for col in self.wl[cls.__cldf_table__()].tableSchema.columns
                if col.propertyUrl)
            for field in cls.fieldnames():
                try:
                    col = default_cldf[cls.__cldf_table__(), field]
                    #
                    # We added Latitude and Longitude to the default metadata later, and want to
                    # make sure, existing datasets are upgraded silently.
                    #
                    if field in ['Latitude', 'Longitude'] \
                            and cls.__cldf_table__() == 'LanguageTable':
                        properties.add(col.propertyUrl.uri)
                        self.wl[cls.__cldf_table__(),
                                field].propertyUrl = col.propertyUrl
                        self.wl[cls.__cldf_table__(),
                                field].datatype = col.datatype
                except KeyError:
                    # Field unknown to the default metadata: fall back to a
                    # plain string column.
                    col = Column(name=field, datatype="string")
                # Append the column unless an equivalent one (matched by
                # propertyUrl, or by header when there is none) already exists.
                if (col.propertyUrl and col.propertyUrl.uri not in properties) or \
                        ((not col.propertyUrl) and (field not in cols)):
                    self.wl[cls.__cldf_table__()].tableSchema.columns.append(
                        col)
Пример #15
0
def main(args):
    """Populate the lexirumah database from the lexirumah-data repository.

    Creates the collation index and dataset record, then imports every CLDF
    dataset found under ``datasets/`` in the repository checkout.
    """
    Index('ducet', collkey(common.Value.name)).create(DBSession.bind)
    # assumes the data repository lives in a fixed location under ~/venvs -
    # TODO confirm this layout on the deployment host.
    repos = Path(
        os.path.expanduser('~')).joinpath('venvs/lexirumah/lexirumah-data')

    with transaction.manager:
        dataset = common.Dataset(
            id=lexirumah.__name__,
            name="lexirumah",
            publisher_name=
            "Max Planck Institute for the Science of Human History",
            publisher_place="Jena",
            publisher_url="http://shh.mpg.de",
            license="http://creativecommons.org/licenses/by/4.0/",
            domain='lexirumah.model-ling.eu',
            contact='*****@*****.**',
            jsondata={
                'license_icon':
                'cc-by.png',
                'license_name':
                'Creative Commons Attribution 4.0 International License'
            })
        DBSession.add(dataset)

    # Glottolog languoids and Concepticon conceptsets are read from sibling
    # checkouts located relative to this package - TODO confirm layout.
    glottolog_repos = Path(
        lexirumah.__file__).parent.parent.parent.parent.joinpath(
            'glottolog3', 'glottolog')
    languoids = {l.id: l for l in Glottolog(glottolog_repos).languoids()}
    concepticon = Concepticon(
        Path(lexirumah.__file__).parent.parent.parent.parent.joinpath(
            'concepticon', 'concepticon-data'))
    conceptsets = {c.id: c for c in concepticon.conceptsets.values()}

    # NOTE(review): `skip` and the commented-out lines below were used to
    # resume an aborted import at a particular dataset; currently unused.
    skip = True
    for dname in sorted(repos.joinpath('datasets').iterdir(),
                        key=lambda p: p.name):
        #if dname.name == 'benuecongo':
        #    skip = False
        #if skip:
        #    continue
        if dname.is_dir() and dname.name != '_template':
            mdpath = dname.joinpath('cldf', 'metadata.json')
            if mdpath.exists():
                print(dname.name)
                import_cldf(dname, load(mdpath), languoids, conceptsets)

    with transaction.manager:
        load_families(Data(),
                      DBSession.query(LexiRumahLanguage),
                      glottolog_repos=glottolog_repos,
                      isolates_icon='tcccccc')
Пример #16
0
def test_update(tmpdir):
    """update() writes a single distribution row with the expected ecoregion ids."""
    repos_path = Path(create_repos(tmpdir))

    # Stub out geometry handling so no real shapefile work happens.
    with patch.multiple('pytsammalex.distribution',
                        shape=Mock(return_value=Mock(return_value=True)),
                        Point=Mock()):
        update(repos_path, log=Mock())

    data = CsvData('distribution', repos=repos_path)
    assert len(data) == 1
    expected_ids = ['AT011{0}'.format(i) for i in range(10)]
    assert data.items[0].ecoregions__ids == expected_ids
Пример #17
0
def lff2tree(tree=TREE, outdir=None, builddir=None, lffs=None):
    """
    - get mapping glottocode -> Languoid from old tree
    - assemble new directory tree
      - for each path component in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
      - for each language/dialect in lff/dff:
        - create new dir
        - copy info file from old tree (possibly updating the name) or
        - create info file
    - rm old tree
    - copy new tree

    :param tree: path of the existing languoid tree.
    :param outdir: target directory for the new tree; defaults to *tree*.
    :param builddir: scratch directory the old tree is moved to.
    :param lffs: optional mapping of `Level` to file objects to read from.
    :raises ValueError: if *builddir* cannot be cleared, or a dialect has
        no language parent among the languages read.
    """
    # FIXME: instead of removing trees, we should just move the current one
    # from outdir to build, and then recreate in outdir.
    builddir = Path(builddir) if builddir else build_path('tree')
    old_tree = {l.id: l for l in walk_tree(tree)} if tree else {}
    out = Path(outdir or tree)
    if not out.parent.exists():
        out.parent.mkdir()

    if out.exists():
        if builddir.exists():
            try:
                rmtree(builddir)
            except OSError:  # pragma: no cover
                # Best effort only: narrowed from a bare `except` - a failed
                # removal is detected and reported right below.
                pass
            if builddir.exists():  # pragma: no cover
                raise ValueError('please remove %s before proceeding' %
                                 builddir)
        # move the old tree out of the way
        shutil.move(out.as_posix(), builddir.as_posix())
    out.mkdir()

    lffs = lffs or {}
    languages = {}
    for lang in read_lff(Level.language, fp=lffs.get(Level.language)):
        languages[lang.id] = lang
        lang2tree(lang, lang.lineage, out, old_tree)

    for lang in read_lff(Level.dialect, fp=lffs.get(Level.dialect)):
        # Dialects must hang off a language read in the previous pass.
        if not lang.lineage or lang.lineage[0][1] not in languages:
            raise ValueError('unattached dialect')  # pragma: no cover

        lang2tree(lang, languages[lang.lineage[0][1]].lineage + lang.lineage,
                  out, old_tree)
Пример #18
0
    def setUp(self):
        """Create a temporary repository clone with languoids, references and a build dir."""
        WithTempDir.setUp(self)
        self.repos = self.tmp_path()

        fixture_root = Path(__file__).parent.joinpath('data')

        self.languoids = self.tmp_path('languoids')
        copytree(fixture_root.joinpath('languoids'), self.languoids)
        self.tree = self.languoids.joinpath('tree')

        self.references = self.tmp_path('references')
        copytree(fixture_root.joinpath('references'), self.references)

        self.tmp_path('build').mkdir()
Пример #19
0
def download_tables(outdir=None):
    """Download the zip file advertised on the download page; return its local path."""
    page = urlopen(BASE_URL + 'download.asp').read()
    match = ZIP_NAME_PATTERN.search(page)
    if match is None:
        raise ValueError('no matching zip file name found')  # pragma: no cover
    name = match.group('name')
    target = Path(outdir or '.').joinpath(name)
    urlretrieve(BASE_URL + name, target.as_posix())
    return target
Пример #20
0
def main():
    """Entry point for the pytsammalex command line interface."""
    default_repos = Path(pytsammalex.__file__).parent.parent
    parser = ArgumentParser('pytsammalex', update_taxa, upload_images,
                            update_distribution)
    parser.add_argument('--tsammalex-data',
                        help="path to tsammalex-data repository",
                        default=default_repos)
    sys.exit(parser.main())
Пример #21
0
    def __init__(self, fname):
        """
        Initialize the database wrapper with its storage location.

        :param fname: Path to a file in the file system where the db is to be stored.
        """
        # Normalize to a Path object regardless of the input type.
        self.fname = Path(fname)
Пример #22
0
def main():  # pragma: no cover
    """Entry point for the pyconcepticon command line interface."""
    subcommands = (
        link,
        stats,
        attributes,
        intersection,
        union,
        upload_sources,
        map_concepts,
        lookup,
    )
    parser = ArgumentParser(__name__, *subcommands)
    parser.add_argument(
        '--data',
        help="path to concepticon-data",
        default=Path(pyconcepticon.__file__).parent.parent)
    parser.add_argument(
        '--full_search',
        help="select between approximate search (default) and full search",
        action='store_true',
        default=False)
    parser.add_argument(
        '--output',
        help="specify output file",
        default=None)
    parser.add_argument(
        '--similarity',
        type=int,
        help="specify level of similarity for concept mapping",
        default=5)
    parser.add_argument(
        '--language',
        type=text_type,
        help="specify your desired language for mapping",
        default='en')
    sys.exit(parser.main())
Пример #23
0
def get_config(p):
    """Read a config file.

    Option values are coerced according to the option name's suffix after
    the last underscore: ``_int``, ``_boolean``, ``_float`` and ``_list``
    select the corresponding typed getter; anything else stays a string.

    :param p: path of the config file (`Path` or string).
    :return: dict of ('section.option', value) pairs.
    """
    if not isinstance(p, Path):
        p = Path(p)
    cfg = {}

    parser = ConfigParser()
    # Use a context manager so the file handle is always closed, and
    # read_file instead of readfp (deprecated, removed in Python 3.12).
    with p.open(encoding='utf8') as fp:
        parser.read_file(fp)

    for section in parser.sections():
        getters = {
            'int': partial(parser.getint, section),
            'boolean': partial(parser.getboolean, section),
            'float': partial(parser.getfloat, section),
            'list': lambda option: parser.get(section, option).split(),
        }
        default = partial(parser.get, section)
        for option in parser.options(section):
            # The suffix after the last '_' selects the typed getter, if any.
            type_ = option.rpartition('_')[2] if '_' in option else None
            value = getters.get(type_, default)(option)
            cfg['{0}.{1}'.format(section, option)] = value

    return cfg
Пример #24
0
 def __call__(self, parser, namespace, values, option_string=None):
     """Validate that *values* names an existing directory and store it as a Path."""
     candidate = Path(values)
     if not candidate.exists():
         raise argparse.ArgumentError(self, 'path does not exist')
     if not candidate.is_dir():
         raise argparse.ArgumentError(self, 'path is no directory')
     setattr(namespace, self.dest, candidate)
Пример #25
0
class Tests(TestWithApp):
    # Pyramid config used to set up the test application; resolved relative
    # to the installed package.
    __cfg__ = Path(plld_app.__file__).parent.joinpath(
        '..', 'development.ini').resolve()
    # These tests do not need a database fixture.
    __setup_db__ = False

    def test_home(self):
        # Smoke test: the start page must be served with HTTP 200.
        res = self.app.get('/', status=200)
Пример #26
0
class Dataset(pylexibank.Dataset):
    # Dataset location and identifier within the lexibank ecosystem.
    dir = Path(__file__).parent
    id = "othaniel2017"

    # Normalization applied to raw form values before writing CLDF:
    # bracketed material, missing-data markers and literal replacements
    # (including a fix for a stray combining mark in the source data).
    form_spec = pylexibank.FormSpec(brackets={"(": ")"},
                                    missing_data=("", " ", "-"),
                                    replacements=[("ɗɨ̀ŋvi ̀", "ɗɨ̀ŋvi"),
                                                  (" ", "_")],
                                    strip_inside_brackets=True)

    def cmd_makecldf(self, args):
        """Convert the raw CSV data into a CLDF wordlist."""
        data = self.raw_dir.read_csv("raw.csv", dicts=True)
        args.writer.add_sources()
        # Map language column headers to CLDF language IDs.
        languages = args.writer.add_languages(lookup_factory="Name")

        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split("-")[-1] + "_" + slug(c.english),
            lookup_factory="Name")

        # Each row holds one gloss; every known language column contributes
        # a form for that gloss.
        for row in pylexibank.progressbar(data):
            for language, lexeme in row.items():
                if language in languages:
                    args.writer.add_forms_from_value(
                        Language_ID=languages[language],
                        Parameter_ID=concepts[row["Gloss"]],
                        Value=lexeme,
                        Source="Othaniel2017",
                    )
Пример #27
0
def ls(args):
    """
    gelato ls [COLS]+

    column specification:
    - license
    - macroareas
    """
    table = Table('ID', 'Title')
    # Only the known column specifications are honored.
    cols = [col for col in args.args if col in ['license', 'macroareas']]
    if args.args:
        table.columns.extend(col.capitalize() for col in cols)
    for d in data_path(repos=Path(args.gelato_repos)).iterdir():
        if is_dataset_dir(d):
            ds = Dataset(d)
            row = [d.name, ds.md['dc:title']]
            for col in cols:
                if col == 'license':
                    lic = licenses.find(ds.md.get('dc:license') or '')
                    row.append(lic.id if lic else ds.md.get('dc:license'))
                # TODO(review): 'macroareas' is accepted above but no value
                # is appended for it, leaving the row one column short.
            table.append(row)
    print(
        table.render(tablefmt='simple',
                     sortkey=lambda r: r[0],
                     condensed=False))
Пример #28
0
    def __init__(self, name, default=None, **kw):
        """Initialization.

        :param name: Basename for the config file (suffix .ini will be appended).
        :param default: Default content of the config file.
        :param kw: May contain `config_dir` to override the default config \
directory; the remainder is passed to `RawConfigParser.__init__`.
        """
        self.name = name
        self.default = default
        config_dir = Path(kw.pop('config_dir', None) or DIR)
        RawConfigParser.__init__(self, kw, allow_no_value=True)
        if self.default:
            # Seed the parser with the default content; StringIO on Python 3,
            # encoded BytesIO on Python 2.
            if PY3:
                fp = io.StringIO(self.default)
            else:
                fp = io.BytesIO(self.default.encode('utf8'))
            self.readfp(fp)

        cfg_path = config_dir.joinpath(name + '.ini')
        if cfg_path.exists():
            assert cfg_path.is_file()
            self.read(cfg_path.as_posix())
        else:
            # No config file yet: create the directory (best effort) and
            # persist the current (default) configuration there.
            if not config_dir.exists():
                try:
                    config_dir.mkdir()
                except OSError:  # pragma: no cover
                    # this happens when run on travis-ci, by a system user.
                    pass
            if config_dir.exists():
                with open(cfg_path.as_posix(), 'w') as fp:
                    self.write(fp)
        self.path = cfg_path
Пример #29
0
def test_Manifest(tmppath):
    """Manifest equality holds for an identical copy and breaks for a nested one."""
    source = Path(__file__).parent
    manifest = dict(Manifest.from_dir(source).items())
    copytree(source, tmppath / 'd')
    assert manifest == Manifest.from_dir(tmppath / 'd')
    # A second nested copy changes the directory contents, so manifests differ.
    copytree(source, tmppath / 'd' / 'd')
    assert manifest != Manifest.from_dir(tmppath / 'd')
Пример #30
0
    def __init__(self, name, dir_=None, default=None, **kw):
        """Initialization.

        :param name: Basename for the config file (suffix .ini will be appended).
        :param dir_: Directory holding the config file; defaults to CONFIG_DIR.
        :param default: Default content of the config file.
        :param kw: Passed on to `INI.__init__`.
        """
        INI.__init__(self, kw, allow_no_value=True)
        self.name = name
        config_dir = Path(dir_ or CONFIG_DIR)

        if default:
            # Seed the parser with the default content, if given as text.
            if isinstance(default, text_type):
                self.read_string(default)
            #elif isinstance(default, (dict, OrderedDict)):
            #    self.read_dict(default)

        cfg_path = config_dir.joinpath(name + '.ini')
        if cfg_path.exists():
            assert cfg_path.is_file()
            self.read(cfg_path.as_posix())
        else:
            # No config file yet: create the directory (best effort) and
            # persist the current (default) configuration there.
            if not config_dir.exists():
                try:
                    config_dir.mkdir()
                except OSError:  # pragma: no cover
                    # this happens when run on travis-ci, by a system user.
                    pass
            if config_dir.exists():
                self.write(cfg_path.as_posix())
        self.path = cfg_path