Example #1
def prepare_release(repos, version=None):
    edition = get_edition(repos, version=version)
    cit = citation(repos, edition=edition, version=version)
    dump(
        {
            "title": repos.publication.zenodo.title_format.format(edition['version']),
            "description": to_html(cit, repos.publication.web.url),
            "license": {"id": repos.publication.zenodo.license_id},
            "keywords": repos.publication.zenodo.keywords.split(),
            "communities": [
                {"identifier": cid} for cid in repos.publication.zenodo.communities.split()],
            "creators": [editor_to_dict(n, repos.editors) for n in edition['editors']],
            "access_right": "open",
            "upload_type": "dataset",
        },
        repos.path('.zenodo.json'),
        indent=4)
    return cit
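
Nearly all of the snippets on this page call clldutils.jsonlib.dump (Example #8 below imports it together with its counterpart load). A minimal, self-contained sketch of the round trip these examples rely on, with an illustrative file name:

from clldutils.jsonlib import dump, load

metadata = {"title": "Dataset 1.0", "access_right": "open"}
dump(metadata, "metadata.json", indent=4)  # indent=4 is forwarded to the underlying json.dump
assert load("metadata.json")["title"] == "Dataset 1.0"
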
Example #2
def freeze_func(args, dataset=None, with_history=True):
    dataset = dataset or args.env["request"].dataset
    dump_dir = args.data_file("dumps")
    if not dump_dir.exists():
        dump_dir.mkdir()
    dump_dir = dump_dir.resolve()

    with dump_dir.joinpath("README.txt").open("w", encoding="utf8") as fp:
        fp.write(freeze_readme(dataset, args.env["request"]))

    db_version = get_alembic_version(DBSession)

    for table in Base.metadata.sorted_tables:
        csv = dump_dir.joinpath("%s.csv" % table.name)
        if with_history or not table.name.endswith("_history"):
            _freeze(table, csv)

        if csv.exists():
            csvm = "%s.%s" % (table.name, CsvmJsonAdapter.extension)
            doc = CsvmJsonAdapter.csvm_doc(csvm, args.env["request"], [(col.name, col) for col in table.columns])
            if db_version:
                # We (ab)use a dc:identifier property to pass the alembic revision of the
                # database to the unfreeze script.
                doc["dc:identifier"] = db_version
            jsonlib.dump(doc, dump_dir.joinpath(csvm))

    with ZipFile(as_posix(args.data_file("..", "data.zip")), "w", ZIP_DEFLATED) as zipfile:
        for f in dump_dir.iterdir():
            if f.is_file():
                with f.open("rb") as fp:
                    zipfile.writestr(f.name, fp.read())
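
The dc:identifier trick above stores the alembic revision of the database inside the per-table CSVW metadata. A hedged sketch of how a consumer (e.g. an unfreeze step) might read it back; the file name is hypothetical and this is not the actual unfreeze implementation:

from clldutils import jsonlib

doc = jsonlib.load("dumps/language.csvm")  # hypothetical metadata file written by freeze_func
db_version = doc.get("dc:identifier")      # alembic revision, if one was recorded
if db_version:
    print("dump was created from alembic revision", db_version)
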
Example #3
def prepare_release(repos, version):
    for v, year, editors in read_editors(repos):
        if v == version:
            break
    else:  # pragma: no cover
        raise ValueError('Add version to CONTRIBUTORS.md first!')

    citation = "{0}. {1}. {2} {3}. {4}: {5}. (Available online at {6})".format(
        ' & '.join('{0.last}, {0.first}'.format(HumanName(e)) for e in editors),
        year,
        repos.publication.web.name,
        version,
        repos.publication.publisher.place,
        repos.publication.publisher.name,
        repos.publication.web.url,
    )
    dump(
        {
            "title": repos.publication.zenodo.title_format.format(version),
            "description": to_html(citation, repos.publication.web.url),
            "license": {"id": repos.publication.zenodo.license_id},
            "keywords": repos.publication.zenodo.keywords.split(),
            "communities": [
                {"identifier": cid} for cid in repos.publication.zenodo.communities.split()],
            "creators": [editor_to_dict(n, repos.editors) for n in editors],
            "access_right": "open"
        },
        repos.path('.zenodo.json'),
        indent=4)
Example #4
def freeze_func(args, dataset=None, with_history=True):
    dataset = dataset or args.env['request'].dataset
    dump_dir = args.data_file('dumps')
    if not dump_dir.exists():
        dump_dir.mkdir()
    dump_dir = dump_dir.resolve()

    with dump_dir.joinpath('README.txt').open('w', encoding='utf8') as fp:
        fp.write(freeze_readme(dataset, args.env['request']))

    db_version = get_alembic_version(DBSession)

    for table in Base.metadata.sorted_tables:
        csv = dump_dir.joinpath('%s.csv' % table.name)
        if with_history or not table.name.endswith('_history'):
            _freeze(table, csv)

        if csv.exists():
            csvm = '%s.%s' % (table.name, CsvmJsonAdapter.extension)
            doc = CsvmJsonAdapter.csvm_doc(
                csvm, args.env['request'], [(col.name, col) for col in table.columns])
            if db_version:
                # We (ab)use a dc:identifier property to pass the alembic revision of the
                # database to the unfreeze script.
                doc["dc:identifier"] = db_version  # pragma: no cover
            jsonlib.dump(doc, dump_dir.joinpath(csvm))

    with ZipFile(
            as_posix(args.data_file('..', 'data.zip')), 'w', ZIP_DEFLATED) as zipfile:
        for f in dump_dir.iterdir():
            if f.is_file():
                with f.open('rb') as fp:
                    zipfile.writestr(f.name, fp.read())
Example #5
def llod_func(args):  # pragma: no cover
    """Create an RDF dump and compute some statistics about it."""
    tmp = Path(mkdtemp())
    count_rsc = 0
    count_triples = 0

    tmp_dump = tmp.joinpath('rdf.n3')
    with open(as_posix(tmp_dump), 'w') as fp:
        for rsc in RESOURCES:
            args.log.info('Resource type %s ...' % rsc.name)
            try:
                q = DBSession.query(rsc.model)
            except InvalidRequestError:
                args.log.info('... skipping')
                continue
            for obj in page_query(q.order_by(rsc.model.pk), n=10000, verbose=True):
                graph = get_graph(obj, args.env['request'], rsc.name)
                count_triples += len(graph)
                count_rsc += 1
                fp.write(n3(graph, with_head=count_rsc == 1))
            args.log.info('... finished')

    # put in args.data_file('..', 'static', 'download')?
    md = {'path': as_posix(tmp), 'resources': count_rsc, 'triples': count_triples}
    md.update(count_links(as_posix(tmp_dump)))
    jsonlib.dump(md, args.data_file('rdf-metadata.json'))
    print(md)

    dataset = Dataset.first()
    rdf_dump = args.module_dir.joinpath(
        'static', 'download', '%s-dataset.n3' % dataset.id)
    tmp_dump.copy(rdf_dump)
    check_call('gzip -f %s' % rdf_dump, shell=True)
    print(str(rdf_dump))
Example #6
def freeze_func(args, dataset=None, with_history=True):
    dataset = dataset or args.env['request'].dataset
    dump_dir = args.data_file('dumps')
    if not dump_dir.exists():
        dump_dir.mkdir()
    dump_dir = dump_dir.resolve()

    with dump_dir.joinpath('README.txt').open('w', encoding='utf8') as fp:
        fp.write(freeze_readme(dataset, args.env['request']))

    db_version = get_alembic_version(DBSession)

    for table in Base.metadata.sorted_tables:
        csv = dump_dir.joinpath('%s.csv' % table.name)
        if with_history or not table.name.endswith('_history'):
            _freeze(table, csv)

        if csv.exists():
            csvm = '%s.%s' % (table.name, CsvmJsonAdapter.extension)
            doc = CsvmJsonAdapter.csvm_doc(csvm, args.env['request'],
                                           [(col.name, col)
                                            for col in table.columns])
            if db_version:
                # We (ab)use a dc:identifier property to pass the alembic revision of the
                # database to the unfreeze script.
                doc["dc:identifier"] = db_version
            jsonlib.dump(doc, dump_dir.joinpath(csvm))

    with ZipFile(as_posix(args.data_file('..', 'data.zip')), 'w',
                 ZIP_DEFLATED) as zipfile:
        for f in dump_dir.iterdir():
            if f.is_file():
                with f.open('rb') as fp:
                    zipfile.writestr(f.name, fp.read())
Example #7
 def __exit__(self, exc_type, exc_val, exc_tb):
     jsonlib.dump(collections.OrderedDict([
         (k, collections.OrderedDict([i for i in sorted(v.items())]))
         for k, v in sorted(self.items.items())
     ]),
                  self.path,
                  indent=4)
Example #8
    def test_json(self):
        from clldutils.jsonlib import dump, load

        d = {'a': 234, 'ä': 'öäüß'}
        p = self.tmp_path('test')
        dump(d, p)
        for k, v in load(p).items():
            assert d[k] == v
Example #9
 def to_replacements(self, filename):
     """Write a JSON file with 301s from merged glottolog_ref_ids."""
     with self.connect() as conn:
         conn.row_factory = sqlite3.Row
         cursor = conn.execute('SELECT refid AS id, id AS replacement '
             'FROM entry WHERE id != refid ORDER BY id')
         pairs = map(dict, cursor)
     jsonlib.dump(pairs, filename, indent=4)
Example #10
 def to_replacements(self, filename):
     """Write a JSON file with 301s from merged glottolog_ref_ids."""
     with self.connect() as conn:
         conn.row_factory = sqlite3.Row
         cursor = conn.execute('SELECT refid AS id, id AS replacement '
                               'FROM entry WHERE id != refid ORDER BY id')
         pairs = map(dict, cursor)
     jsonlib.dump(pairs, filename, indent=4)
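
Note that map(dict, cursor) is a lazy iterator on Python 3 and json cannot serialize it directly; a sketch of the same query that materializes the rows into a list before dumping (conn and the entry table are assumed to exist as above):

import sqlite3

from clldutils import jsonlib

def to_replacements(conn, filename):
    """Sketch: write 301s from merged glottolog_ref_ids, materialized as a list."""
    conn.row_factory = sqlite3.Row
    cursor = conn.execute('SELECT refid AS id, id AS replacement '
                          'FROM entry WHERE id != refid ORDER BY id')
    pairs = [dict(row) for row in cursor]  # a list, unlike map(), is JSON-serializable
    jsonlib.dump(pairs, filename, indent=4)
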
Example #11
def ldstatus(args):
    from glottolog3.langdocstatus import extract_data

    endangerment = {
        l.id: l.cfg['endangerment']
        for l in args.repos.languoids() if 'endangerment' in l.cfg}
    with_session(args)
    dump(extract_data(endangerment), 'glottolog3/static/ldstatus.json', indent=4)
Example #12
def test_profile_with_bad_metadata(tmpdir):
    mdpath = tmpdir / 'md.json'
    md = deepcopy(Profile.MD)
    md['tables'].append({'tableSchema': {'columns': []}})
    jsonlib.dump(md, str(mdpath))

    with pytest.raises(ValueError):
        Profile.from_file(str(mdpath))
Example #13
def run(args):  # pragma: no cover
    if Repo:
        assert str(Repo(str(args.repos.repos)).active_branch) == 'master', \
            'Command should be run on master branch'
    res = {'language': [], 'family': [], 'dialect': []}
    for lang in args.repos.languoids():
        res[lang.level.name].append(lang.id)
    jsonlib.dump(res, args.repos.build_path('languoids.json'))
Example #14
def main(args):  # pragma: no cover
    ldstatus = {}
    limit = 200
    q = language_query().order_by(Language.pk)
    offset = 0
    # we merge information about extinct languages from unesco and Harald.
    if 1:
        # loop over active, established languages with geo-coords
        while True:
            transaction.begin()
            langs = [l for l in q.offset(offset).limit(limit)]
            if not langs:
                break
            offset += limit
            # let's collect the relevant sources in a way that allows computation of med.
            # Note: we limit refs to the ones without computerized assignments.
            lsources = list(DBSession.query(Ref).join(LanguageSource)\
                .filter(LanguageSource.language_pk.in_([l.pk for l in langs])) \
                .filter(Ref.ca_doctype_trigger == None)\
                .filter(Ref.ca_language_trigger == None)\
                .options(joinedload(Ref.doctypes), joinedload(Source_.languages)))
            for l in langs:
                sources = [s for s in lsources if l in s.languages]
                sources = sorted(map(Source, sources))

                # keep the overall med
                # note: this source may not be included in the potential meds computed
                # below,
                # e.g. because it may not have a year.
                med = sources[0].__json__() if sources else None

                # now we have to compute meds respecting a cut-off year.
                # to do so, we collect eligible sources per year and then
                # take the med of this collection.
                potential_meds = []

                # we only have to loop over publication years within all sources, because
                # only in these years something better might have come along.
                for year in set(s.year for s in sources if s.year):
                    # let's see if something better was published!
                    eligible = [
                        s for s in sources if s.year and s.year <= year
                    ]
                    if eligible:
                        potential_meds.append(sorted(eligible)[0])

                # we store the precomputed sources information as jsondata:
                ldstatus[l.id] = [
                    med,
                    [
                        s.__json__() for s in sorted(set(potential_meds),
                                                     key=lambda s: -s.year)
                    ]
                ]
            print(offset)
            transaction.abort()

    dump(ldstatus, 'glottolog3/static/ldstatus.json', indent=4)
Example #15
def run(args):
    ordered = [d['species'].lower() for d in reader(args.ordered, dicts=True)]
    ranks = ['phylum', 'klass', 'order', 'family', 'genus']

    ordered_ranks = {r: {} for r in ranks}
    seen = {}
    augmented_species = []
    for ex in args.api.experiments:
        species = ex.gbif.cname
        if species not in seen:
            seen[species] = (ex.gbif.classification, ex.species_latin)
            skey = species.lower()
            if skey not in ordered:
                skey = ' '.join(skey.split()[:2])
            if skey not in ordered:
                skey = [n for n in ordered if n.split()[0] == skey.split()[0]]
                if skey:
                    skey = skey[0]

            if skey in ordered:
                augmented_species.append((species, ordered.index(skey)))
            else:
                augmented_species.append((species, len(ordered) + 1))

    for s, i in sorted(augmented_species, key=lambda t: t[1], reverse=True):
        for r in ranks:
            ordered_ranks[r][getattr(seen[s][0], r)] = i

    fully_augmented_species = {
        s: (ordered_ranks['phylum'][seen[s][0].phylum],
            ordered_ranks['klass'][seen[s][0].klass],
            ordered_ranks['order'][seen[s][0].order],
            ordered_ranks['family'][seen[s][0].family],
            ordered_ranks['genus'][seen[s][0].genus], i)
        for s, i in sorted(augmented_species, key=lambda t: t[1])
    }
    clf = collections.defaultdict(lambda: [-1, None])
    prefix = {}
    for k, _ in sorted(fully_augmented_species.items(),
                       key=lambda i: i[1],
                       reverse=True):
        for j, a in enumerate(ranks):
            if clf[a][1] != getattr(seen[k][0], a):
                for aa in ranks[j + 1:]:
                    clf[aa][0] = -1
                if a == 'genus':
                    # reset prefix index for all deeper taxonomy ranks:
                    clf['species'][0] = -1
                clf[a][0] += 1
                clf[a][1] = getattr(seen[k][0], a)
                node_name = '_'.join(
                    getattr(seen[k][0], aa) for aa in ranks[:j + 1])
                prefix[node_name] = string.ascii_lowercase[clf[a][0]]
        if clf['species'][1] != k:
            clf['species'][0] += 1
            clf['species'][1] = k
            prefix[k.lower()] = string.ascii_lowercase[clf['species'][0]]
    dump(prefix, args.api.path('taxa_sortkeys.json'), indent=4)
Example #16
def jsondump(obj, fname, log=None):
    fname = Path(fname)
    if fname.exists():
        d = jsonlib.load(fname)
        d.update(obj)
        obj = d
    jsonlib.dump(sorted_obj(obj), fname, indent=4)
    log_dump(fname, log=log)
    return obj
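
The helper above merges the new object into whatever is already stored in the file before rewriting it. A minimal sketch of the same merge-then-dump pattern using only clldutils.jsonlib (sorted_obj and log_dump are project-specific helpers and are left out here):

from pathlib import Path

from clldutils import jsonlib

def merge_dump(obj, fname):
    fname = Path(fname)
    if fname.exists():
        merged = jsonlib.load(fname)
        merged.update(obj)  # keys in obj overwrite the ones already on disk
        obj = merged
    jsonlib.dump(obj, fname, indent=4)
    return obj
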
Example #17
def parse(soup, id_, path, with_items=True):
    props = {}
    for i, dl in enumerate(soup.find_all('dl')):
        props.update(dict(list(parse_dl(dl))))
    if with_items:
        parse_table(soup, props)
    props['name'] = soup.find('h2').get_text()
    props['id'] = id_
    jsonlib.dump(props, path, indent=4)
Example #18
def rename(args):  # pragma: no cover
    api = Concepticon(args.repos)

    from_, to_ = args.args
    assert CONCEPTLIST_ID_PATTERN.match(to_)
    cl = api.conceptlists[from_]

    # write the adapted concept list to the new path:
    with UnicodeWriter(cl.path.parent / cl.path.name.replace(from_, to_),
                       delimiter='\t') as writer:
        header = []
        for i, row in enumerate(reader(cl.path, delimiter='\t')):
            if i == 0:
                header = row
                writer.writerow(row)
                header = {v: k
                          for k, v in enumerate(header)
                          }  # Map col name to row index
            else:
                oid = row[header['ID']]
                assert oid.startswith(from_)
                nid = oid.replace(from_, to_)
                api.add_retirement(
                    'Concept', dict(id=oid,
                                    comment='renaming',
                                    replacement=nid))
                row[header['ID']] = nid
                writer.writerow(row)

    # write adapted metadata to the new path:
    fname = cl.path.name.replace(from_, to_) + MD_SUFFIX
    md = jsonlib.load(cl.path.parent / (cl.path.name + MD_SUFFIX),
                      object_pairs_hook=OrderedDict)
    md['tables'][0]['url'] = fname
    jsonlib.dump(md, cl.path.parent / fname, indent=4)

    # remove obsolete concept list and metadata:
    cl.path.unlink()
    cl.path.parent.joinpath(cl.path.name + MD_SUFFIX).unlink()

    # adapt conceptlists.tsv
    rows = []
    for row in reader(api.data_path('conceptlists.tsv'), delimiter='\t'):
        rows.append([col.replace(from_, to_) if col else col for col in row])

    with UnicodeWriter(api.data_path('conceptlists.tsv'),
                       delimiter='\t') as writer:
        writer.writerows(rows)

    api.add_retirement('Conceptlist',
                       dict(id=from_, comment='renaming', replacement=to_))

    print("""Please run
grep -r "{0}" concepticondata/ | grep -v retired.json

to confirm the renaming was complete!""".format(from_))
Example #19
def run(args):
    acc = Taxa()
    seen = set()
    for ex in args.api.experiments:
        species = ex.gbif.name
        if species not in seen:
            seen.add(species)
            acc.add(ex.gbif.classification)
    #print(acc)
    gbif, head = Taxa(), None
    for i, line in enumerate(args.taxa.open(encoding='utf8').readlines()):
        if i == 0:
            head = line.strip().split('\t')
            continue
        cols = line.strip().split('\t')
        d = dict(zip(head, cols))

        if d['kingdom'] != 'Animalia':
            continue
        if d['taxonomicStatus'] != 'accepted':
            continue
        if d['taxonRank'] != 'species':
            continue
        if 'genus' in d:
            gbif.add(d)

    coverage = collections.OrderedDict()
    for phylum, classes in acc.items():
        print('Phylum {}: {}/{} classes'.format(phylum, len(classes),
                                                len(gbif[phylum])))
        coverage[(phylum, )] = (len(classes), len(gbif[phylum]))
        for klass, orders in classes.items():
            print('  Class {}: {}/{} orders'.format(klass, len(orders),
                                                    len(gbif[phylum][klass])))
            coverage[(phylum, klass)] = (len(orders), len(gbif[phylum][klass]))
            for order, families in orders.items():
                print('    Order {}: {}/{} families'.format(
                    order, len(families), len(gbif[phylum][klass][order])))
                coverage[(phylum, klass,
                          order)] = (len(families),
                                     len(gbif[phylum][klass][order]))
                for family, genera in families.items():
                    print('      Family {}: {}/{} genera'.format(
                        family, len(genera),
                        len(gbif[phylum][klass][order][family])))
                    coverage[(phylum, klass, order, family)] = (
                        len(genera), len(gbif[phylum][klass][order][family]))
                    for genus, nspec in genera.items():
                        print('        Genus {}: {}/{} species'.format(
                            genus, nspec,
                            gbif[phylum][klass][order][family].get(genus)))
                        coverage[(phylum, klass, order, family, genus)] = (
                            nspec, gbif[phylum][klass][order][family][genus])
    coverage = collections.OrderedDict([('_'.join(k), v)
                                        for k, v in coverage.items()])
    jsonlib.dump(coverage, args.api.path('gbif_coverage.json'), indent=4)
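
The '_'.join step right before the dump is needed because JSON object keys must be strings, while coverage is keyed by tuples of taxon names. A tiny illustration with made-up data:

import json

coverage = {('Chordata', 'Aves'): (10, 30)}
# json.dumps(coverage) would raise TypeError: keys must be str, int, float, bool or None
flattened = {'_'.join(k): v for k, v in coverage.items()}
print(json.dumps(flattened))  # {"Chordata_Aves": [10, 30]}
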
Example #20
 def new(self, alpha, dry_run=False):
     num = self._store.get(alpha, 1233) + 1
     if not dry_run:
         self._store[alpha] = num
         # Store the updated dictionary of glottocodes back.
         ordered = OrderedDict()
         for k in sorted(self._store.keys()):
             ordered[k] = self._store[k]
         jsonlib.dump(ordered, self._fname, indent=4)
     return Glottocode('%s%s' % (alpha, num))
Example #21
 def new(self, alpha, dry_run=False):
     num = self._store.get(alpha, 1233) + 1
     if not dry_run:
         self._store[alpha] = num
         # Store the updated dictionary of glottocodes back.
         ordered = OrderedDict()
         for k in sorted(self._store.keys()):
             ordered[k] = self._store[k]
         jsonlib.dump(ordered, self._fname, indent=4)
     return Glottocode('%s%s' % (alpha, num))
Example #22
def run(args):
    ds = Dataset()
    comments = {}
    for p in ds.raw_dir.glob('blog_comments/comments*.html'):
        for c in iter_comments(p):
            comments[c['id']] = c
    comments = sorted(comments.values(),
                      key=lambda c: int(c['id'].split('comment-')[-1]))
    dump(comments, ds.etc_dir / 'comments.json', indent=4)
    args.log.info('{} comments'.format(len(comments)))
Example #23
def new_dataset(args):
    """
    lexibank new-dataset OUTDIR [ID]
    """
    if not args.args:
        raise ParserError('you must specify an existing directory')
    outdir = Path(args.args.pop(0))
    if not outdir.exists():
        raise ParserError('you must specify an existing directory')

    id_pattern = re.compile('[a-z_0-9]+$')
    md = {}
    if args.args:
        md['id'] = args.args.pop(0)
    else:
        md['id'] = input('Dataset ID: ')

    while not id_pattern.match(md['id']):
        print(
            'dataset id must only consist of lowercase ascii letters, digits and _ (underscore)!'
        )
        md['id'] = input('Dataset ID: ')

    outdir = outdir / md['id']
    if not outdir.exists():
        outdir.mkdir()

    for key in ['title', 'url', 'license', 'conceptlist', 'citation']:
        md[key] = input('Dataset {0}: '.format(key))

    # check license!
    # check conceptlist!

    for path in Path(
            pylexibank.__file__).parent.joinpath('dataset_template').iterdir():
        if path.is_file():
            if path.suffix in ['.pyc']:
                continue  # pragma: no cover
            target = path.name
            content = read_text(path)
            if '+' in path.name:
                target = re.sub(r'\+([a-z]+)\+',
                                lambda m: '{' + m.groups()[0] + '}',
                                path.name).format(**md)
            if target.endswith('_tmpl'):
                target = target[:-5]
                content = content.format(**md)
            write_text(outdir / target, content)
        else:
            target = outdir / path.name
            if target.exists():
                shutil.rmtree(str(target))
            shutil.copytree(str(path), str(target))
    del md['id']
    jsonlib.dump(md, outdir / 'metadata.json', indent=4)
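
File names in the dataset template may contain +key+ placeholders, which the re.sub call rewrites into str.format fields before filling them from the collected metadata. A small demonstration of that substitution (the file name and metadata are made up):

import re

md = {'id': 'mydataset'}
name = 'lexibank_+id+.py'
target = re.sub(r'\+([a-z]+)\+', lambda m: '{' + m.group(1) + '}', name).format(**md)
print(target)  # lexibank_mydataset.py
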
Example #24
def write_CLPA(clpadata, path):
    """
    Basic function to write clpa-data.
    """
    if isinstance(path, Path):
        outdir, fname = path.parent, path.name
    else:
        outdir, fname = local_path(), path  # pragma: no cover
    old_clpa = load_CLPA()
    jsonlib.dump(old_clpa, outdir.joinpath(fname + '.bak'), indent=4)
    jsonlib.dump(clpadata, outdir.joinpath(fname), indent=4)
Example #25
def ldstatus(args):
    from glottolog3.langdocstatus import extract_data

    endangerment = {
        l.id: l.cfg['endangerment']
        for l in args.repos.languoids() if 'endangerment' in l.cfg
    }
    with_session(args)
    dump(extract_data(endangerment),
         'glottolog3/static/ldstatus.json',
         indent=4)
Example #26
def run(args):  # pragma: no cover
    auth = HTTPBasicAuth(args.user, args.token)
    issues = list(iteritems(auth, '/repos/{0}/issues'.format(REPOS), state='all'))
    jsonlib.dump(issues, args.repos.issues_path, indent=4)

    res = {}
    for issue in issues:
        if issue['comments']:
            res[issue['number']] = list(iteritems(auth, issue['comments_url']))

    jsonlib.dump(res, args.repos.comments_path, indent=4)
Example #27
def write_CLPA(clpadata, path):
    """
    Basic function to write clpa-data.
    """
    if isinstance(path, Path):
        outdir, fname = path.parent, path.name
    else:
        outdir, fname = local_path(), path  # pragma: no cover
    old_clpa = load_CLPA()
    jsonlib.dump(old_clpa, outdir.joinpath(fname + '.bak'), indent=4)
    jsonlib.dump(clpadata, outdir.joinpath(fname), indent=4)
Example #28
 def add_retirement(self, type_, repl):
     obj = collections.OrderedDict()
     for k in ['id', 'comment', 'replacement']:
         obj[k] = repl[k]
         assert obj[k]
     if type_ not in self.retirements:
         self.retirements[type_] = []
     self.retirements[type_].append(obj)
     jsonlib.dump(self.retirements,
                  self.data_path('retired.json'),
                  indent=2)
Example #29
    def test_Glottocodes(self):
        gcjson = self.tmp_path('glottocodes.json')
        jsonlib.dump({}, gcjson)

        glottocodes = Glottocodes(gcjson)
        gc = glottocodes.new('a', dry_run=True)
        self.assertTrue(gc.startswith('aaaa'))
        self.assertNotIn(gc, glottocodes)
        gc = glottocodes.new('a')
        self.assertIn(gc, glottocodes)
        # make sure it's also written to file:
        self.assertIn(gc, Glottocodes(gcjson))
        self.assertEqual(len(list(Glottocodes(gcjson))), 1)
Example #30
def get(dataset, resource, offset=0, limit=LIMIT, download_=False):
    fname = dataset.raw.joinpath("%(resource)s-%(limit)s-%(offset)s.json" %
                                 locals())
    if fname.exists() and not download_:
        return jsonlib.load(fname)
    if not download_:
        raise ValueError
    res = requests.get("{0}/api/v1/{1}/".format(BASE_URL, resource),
                       params=dict(format='json',
                                   limit='{0}'.format(limit),
                                   offset='{0}'.format(offset))).json()
    jsonlib.dump(res, fname)
    return res
Example #31
def store(details_):
    db = read_store()
    if not details_:
        return db
    db[details_['id']] = details_
    ordered = OrderedDict()
    for k in sorted(list(db.keys()), key=lambda lid: int(lid)):
        v = OrderedDict()
        for key in sorted(list(db[k].keys())):
            if key != 'id':
                v[key] = db[k][key]
        ordered[k] = v
    jsonlib.dump(ordered, STORE, indent=4)
    return db
Example #32
    def test_Glottocodes(self):
        from pyglottolog.languoids import Glottocodes

        languoids = self.tmp_path('languoids')
        languoids.mkdir()
        jsonlib.dump({}, languoids.joinpath('glottocodes.json'))

        glottocodes = Glottocodes(repos=self.tmp_path())
        gc = glottocodes.new('abcd', dry_run=True)
        self.assertNotIn(gc, glottocodes)
        gc = glottocodes.new('abcd')
        self.assertIn(gc, glottocodes)
        # make sure it's also written to file:
        self.assertIn(gc, Glottocodes(repos=self.tmp_path()))
Example #33
def store(details_, fname):  # pragma: no cover
    db = read_store(fname)
    if not details_:
        return db
    db[details_['id']] = details_
    ordered = OrderedDict()
    for k in sorted(list(db.keys()), key=lambda lid: int(lid)):
        v = OrderedDict()
        for key in sorted(list(db[k].keys())):
            if key != 'id':
                v[key] = db[k][key]
        ordered[k] = v
    jsonlib.dump(ordered, fname, indent=4)
    return db
Example #34
 def new(self, name, dry_run=False):
     alpha = slug(text_type(name))[:4]
     assert alpha
     while len(alpha) < 4:
         alpha += alpha[-1]
     num = self._store.get(alpha, 1233) + 1
     if not dry_run:
         self._store[alpha] = num
         # Store the updated dictionary of glottocodes back.
         ordered = OrderedDict()
         for k in sorted(self._store.keys()):
             ordered[k] = self._store[k]
         jsonlib.dump(ordered, self._fname, indent=4)
     return Glottocode('%s%s' % (alpha, num))
Example #35
 def __exit__(self, *args):
     ordered = collections.OrderedDict([
         (k, v.asdict()) for k, v in sorted(self.objects.items())
     ])
     if self.path.suffix.lower() == '.zip':
         with zipfile.ZipFile(str(self.path), 'w',
                              zipfile.ZIP_DEFLATED) as z:
             z.writestr(
                 self.path.stem,
                 json.dumps(ordered,
                            ensure_ascii=False,
                            indent=0,
                            separators=(',', ':')))
     else:
         dump(ordered, self.path, indent=0, separators=(',', ':'))
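
When the target path ends in .zip, the serialized objects are written as a single archive member named after the path's stem. A hedged sketch of reading such a file back, assuming an archive called objects.zip written by the method above:

import json
import zipfile
from pathlib import Path

path = Path('objects.zip')  # hypothetical archive produced by __exit__
with zipfile.ZipFile(str(path)) as z:
    ordered = json.loads(z.read(path.stem).decode('utf8'))
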
Example #36
def parse(soup, id_, outdir):
    props = {'id': id_, 'name': soup.find('h2').get_text(), 'tables': {}}
    for i, dl in enumerate(soup.find_all('dl')):
        props.update(dict(list(parse_dl(dl))))

    for frame in [
            'basic_frame',
            'flora_frame',
            'cult_frame',
            'grammar_frame',
            'ethno_frame',
    ]:
        div = soup.find('div', id=frame)
        if div:
            props['tables'][frame.split('_')[0]] = rows(div.find('table'))
    jsonlib.dump(props, outdir.joinpath('{0}.json'.format(id_)), indent=4)
Example #37
def concepticon_api(tmpdir):
    concepticon_repos = pathlib.Path(str(tmpdir.join('concepticon-data')))
    shutil.copytree(str(TEST_REPOS), str(concepticon_repos))
    md = jsonlib.load(TEST_REPOS / 'concepticondata' / 'conceptlists' /
                      'default-metadata.json')
    md['tables'][0]['url'] = 'Perrin-2010-110.tsv'
    md['tables'][0]['tableSchema']['columns'].extend(
        [dict(name='FRENCH'), dict(name='GERMAN')])
    jsonlib.dump(
        md, concepticon_repos / 'concepticondata' / 'conceptlists' /
        'Perrin-2010-110.tsv-metadata.json')
    mappings = concepticon_repos / 'mappings'
    mappings.joinpath('map-fr.tsv').write_text("""\
ID\tGLOSS\tPRIORITY
1\tG///the gloss\t2
2\tH///the gloss\t3
3\tH///the gloss\t3
""",
                                               encoding='utf8')
    return Concepticon(concepticon_repos)
Example #38
    def cached_metadata(self, sid, id=None, name=None, refresh=False):
        if data_file('external', self.name, repos=self.repos).is_dir():
            fname = data_file('external', self.name, sid + '.json', repos=self.repos)
            if not fname.exists() or refresh:
                try:
                    data = self.metadata(id or self.identify(name))
                except:  # pragma: no cover
                    data = None
                if not data:
                    return  # pragma: no cover
                jsonlib.dump(data, fname)
                return data
            return jsonlib.load(fname)

        if sid not in self.items or refresh:
            try:
                self.items[sid] = self.metadata(id or self.identify(name))
            except:
                return
        return self.items[sid]
Example #39
def create(versions, out=None):
    out = out or Path('archive')
    if not out.exists():
        out.mkdir()

    langs, identifiers = {}, {}
    for version in versions:
        aggregate(version, langs, identifiers)

    for version in versions:
        dump(
            out.joinpath('glottolog-{0}'.format(version)),
            version, 
            langs,
            {pk: list(c) for pk, c in groupby(identifiers[version], lambda i: i.lpk)})

    gc2v = {}
    for v in versions:
        for gc in sorted(langs[v].keys()):
            gc2v[gc] = v
    jsonlib.dump(gc2v, out.joinpath('glottocode2version.json'), indent=4)
Example #40
def create(versions, out=None):
    out = out or Path('archive')
    if not out.exists():
        out.mkdir()

    langs, identifiers = {}, {}
    for version in versions:
        aggregate(version, langs, identifiers)

    for version in versions:
        dump(
            out.joinpath('glottolog-{0}'.format(version)), version, langs, {
                pk: list(c)
                for pk, c in groupby(identifiers[version], lambda i: i.lpk)
            })

    gc2v = {}
    for v in versions:
        for gc in sorted(langs[v].keys()):
            gc2v[gc] = v
    jsonlib.dump(gc2v, out.joinpath('glottocode2version.json'), indent=4)
Example #41
def llod_func(args):  # pragma: no cover
    """Create an RDF dump and compute some statistics about it."""
    tmp = Path(mkdtemp())
    count_rsc = 0
    count_triples = 0

    tmp_dump = tmp.joinpath('rdf.n3')
    with open(as_posix(tmp_dump), 'w') as fp:
        for rsc in RESOURCES:
            args.log.info('Resource type %s ...' % rsc.name)
            try:
                q = DBSession.query(rsc.model)
            except InvalidRequestError:
                args.log.info('... skipping')
                continue
            for obj in page_query(q.order_by(rsc.model.pk),
                                  n=10000,
                                  verbose=True):
                graph = get_graph(obj, args.env['request'], rsc.name)
                count_triples += len(graph)
                count_rsc += 1
                fp.write(n3(graph, with_head=count_rsc == 1))
            args.log.info('... finished')

    # put in args.data_file('..', 'static', 'download')?
    md = {
        'path': as_posix(tmp),
        'resources': count_rsc,
        'triples': count_triples
    }
    md.update(count_links(as_posix(tmp_dump)))
    jsonlib.dump(md, args.data_file('rdf-metadata.json'))
    print(md)

    dataset = Dataset.first()
    rdf_dump = args.module_dir.joinpath('static', 'download',
                                        '%s-dataset.n3' % dataset.id)
    tmp_dump.copy(rdf_dump)
    check_call('gzip -f %s' % rdf_dump, shell=True)
    print(str(rdf_dump))
Example #42
def make():
    tables = {}
    columns = {}

    for e in read_terms().iter():
        if ns('rdf:about') in e.attrib:
            lname = e.attrib[ns('rdf:about')].split('#')[-1]
            if e.tag == ns('rdfs:Class') and lname.endswith('Table'):
                tables[lname] = e
            elif e.tag == ns('rdf:Property'):
                columns[lname] = e

    comps = {}
    for subdir, spec in COMPONENTS.items():
        table = make_table(tables.pop(spec['table']))
        for c, req in spec['columns']:
            table['tableSchema']['columns'].append(make_column(columns[c], req))
        comps[subdir] = table
        dump(
            table,
            REPO_DIR.joinpath(
                'components', subdir, '{0}-metadata.json'.format(spec['table'])),
            indent=4)

    for subdir, comprefs in MODULES.items():
        dump(
            OrderedDict([
                ("@context", ["http://www.w3.org/ns/csvw", {"@language": "en"}]),
                ("dc:conformsTo",
                 "http://cldf.clld.org/v1.0/terms.rdf#{0}".format(subdir)),
                ("dialect", {
                    "commentPrefix": None,
                }),
                ("tables", [comps[ref] for ref in comprefs]),
            ]),
            REPO_DIR.joinpath('modules', subdir, '{0}-metadata.json'.format(subdir)),
            indent=4)
Example #43
 def write(self):
     jsonlib.dump(self.items, self.path, **self._json_opts)
Example #44
def communities(args, neighbor_weight=None):
    graphname = args.graphname or 'network'
    edge_weights = args.weight
    vertex_weights = str('FamilyFrequency')
    normalize = args.normalize
    edgefilter = args.edgefilter
    threshold = args.threshold or 1
    neighbor_weight = neighbor_weight or 5

    _graph = args.api.load_graph(graphname, threshold, edgefilter)
    args.log.info('loaded graph')
    for n, d in tqdm(_graph.nodes(data=True), desc='vertex-weights', leave=False):
        d[vertex_weights] = int(d[vertex_weights])

    if normalize:
        for edgeA, edgeB, data in tqdm(_graph.edges(data=True), desc='normalizing', leave=False):
            data[str('weight')] = data[edge_weights] ** 2 / (
                _graph.node[edgeA][vertex_weights] +
                _graph.node[edgeB][vertex_weights] -
                data[edge_weights])
        vertex_weights = None
        edge_weights = 'weight'
        args.log.info('computed weights')

    graph = networkx2igraph(_graph)
    args.log.info('starting infomap')
    args.log.info('converted graph...')

    comps = graph.community_infomap(
        edge_weights=str(edge_weights), vertex_weights=vertex_weights)

    args.log.info('finished infomap')
    D, Com = {}, defaultdict(list)
    for i, comp in enumerate(sorted(comps.subgraphs(), key=lambda x: len(x.vs), reverse=True)):
        for vertex in [v['name'] for v in comp.vs]:
            D[graph.vs[vertex]['ConcepticonId']] = str(i + 1)
            Com[i + 1].append(graph.vs[vertex]['ConcepticonId'])

    for node, data in _graph.nodes(data=True):
        data['infomap'] = D[node]
        data['ClusterName'] = ''
        data['CentralConcept'] = ''

    # get the articulation points etc. immediately
    for idx, nodes in sorted(Com.items()):
        sg = _graph.subgraph(nodes)
        if len(sg) > 1:
            d_ = sorted(sg.degree(), key=lambda x: x[1], reverse=True)
            d = [_graph.node[a]['Gloss'] for a, b in d_][0]
            cluster_name = 'infomap_{0}_{1}'.format(idx, d)
        else:
            d = _graph.node[nodes[0]]['Gloss']
            cluster_name = 'infomap_{0}_{1}'.format(idx, _graph.node[nodes[0]]['Gloss'])
        args.log.debug(cluster_name, d)
        for node in nodes:
            _graph.node[node]['ClusterName'] = cluster_name
            _graph.node[node]['CentralConcept'] = d

    args.log.info('computed cluster names')

    cluster_dir = args.api.existing_dir('app', 'cluster', clean=True)
    cluster_names = {}
    removed = []
    for idx, nodes in tqdm(sorted(Com.items()), desc='export to app', leave=False):
        sg = _graph.subgraph(nodes)
        for node, data in sg.nodes(data=True):
            data['OutEdge'] = []
            neighbors = [
                n for n in _graph if
                n in _graph[node] and
                _graph[node][n]['FamilyWeight'] >= neighbor_weight and
                n not in sg]
            if neighbors:
                sg.node[node]['OutEdge'] = []
                for n in neighbors:
                    sg.node[node]['OutEdge'].append([
                        _graph.node[n]['ClusterName'],
                        _graph.node[n]['CentralConcept'],
                        _graph.node[n]['Gloss'],
                        _graph[node][n]['WordWeight'],
                        n
                    ])
        if len(sg) > 1:
            jsonlib.dump(
                json_graph.adjacency_data(sg),
                cluster_dir / (_graph.node[nodes[0]]['ClusterName'] + '.json'),
                sort_keys=True)
            for node in nodes:
                cluster_names[_graph.node[node]['Gloss']] = _graph.node[node]['ClusterName']
        else:
            removed += [list(nodes)[0]]
    _graph.remove_nodes_from(removed)
    for node, data in _graph.nodes(data=True):
        if 'OutEdge' in data:
            data['OutEdge'] = '//'.join(['/'.join([str(y) for y in x]) for x in data['OutEdge']])
    removed = []
    for nA, nB, data in tqdm(_graph.edges(data=True), desc='remove edges', leave=False):
        if _graph.node[nA]['infomap'] != _graph.node[nB]['infomap'] and data['FamilyWeight'] < 5:
            removed += [(nA, nB)]
    _graph.remove_edges_from(removed)

    args.api.save_graph(_graph, 'infomap', threshold, edgefilter)
    args.api.write_js_var('INFO', cluster_names, 'app', 'source', 'infomap-names.js')
Example #45
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == "cleanup":
        for fname in args.data_file("gbs").glob("*.json"):
            try:
                data = jsonlib.load(fname)
                if data.get("totalItems") == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source).order_by(common.Source.id).options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file("gbs", "source%s.json" % source.id)

        if command == "update":
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ["verify", "update"]:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn("no JSON object found in: %s" % filepath)
                    continue
                if not data["totalItems"]:
                    continue
                item = data["items"][0]
            else:
                continue

        if command == "verify":
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item["volumeInfo"].get("publishedDate", "").split("-")[0]
            if not year or year != slug(source.year or ""):
                needs_check = True
            twords = words(stitle)
            iwords = words(item["volumeInfo"]["title"] + " " + item["volumeInfo"].get("subtitle", ""))
            if (
                twords == iwords
                or (len(iwords) > 2 and iwords.issubset(twords))
                or (len(twords) > 2 and twords.issubset(iwords))
            ):
                needs_check = False
            if int(source.id) == 241:
                log.info("%s" % sorted(words(stitle)))
                log.info("%s" % sorted(iwords))
            if needs_check:
                log.info("------- %s -> %s" % (source.id, item["volumeInfo"].get("industryIdentifiers")))
                log.info("%s %s" % (item["volumeInfo"]["title"], item["volumeInfo"].get("subtitle", "")))
                log.info(stitle)
                log.info(item["volumeInfo"].get("publishedDate"))
                log.info(source.year)
                log.info(item["volumeInfo"].get("authors"))
                log.info(source.author)
                log.info(item["volumeInfo"].get("publisher"))
                log.info(source.publisher)
                if not confirm("Are the records the same?"):
                    log.warn("---- removing ----")
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == "update":
            source.google_book_search_id = item["id"]
            source.update_jsondata(gbs=item)
            count += 1
        elif command == "download":
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    "inauthor:" + quote_plus(source.author.encode("utf8")),
                    "intitle:" + quote_plus(title.encode("utf8")),
                ]
                if source.publisher:
                    q.append("inpublisher:" + quote_plus(source.publisher.encode("utf8")))
                url = api_url + "q=%s&key=%s" % ("+".join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={"accept": "application/json"})
                log.info("%s - %s" % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), "w") as fp:
                        fp.write(r.text.encode("utf8"))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == "update":
        log.info("assigned gbs ids for %s out of %s sources" % (count, i))
    elif command == "download":
        log.info("queried gbs for %s sources" % count)
Example #46
def subgraph(args, neighbor_weight=None):
    args.api._log = args.log
    graphname = args.graphname or 'network'
    threshold = args.threshold or 1
    edgefilter = args.edgefilter
    neighbor_weight = neighbor_weight or 5

    _graph = args.api.load_graph(graphname, threshold, edgefilter)
    for node, data in _graph.nodes(data=True):
        generations = [{node}]
        while generations[-1] and len(set.union(*generations)) < 30 and len(generations) < 3:
            nextgen = set.union(*[set(_graph[n].keys()) for n in generations[-1]])
            if len(nextgen) > 50:
                break  # pragma: no cover
            else:
                generations.append(set.union(*[set(_graph[n].keys()) for n in generations[-1]]))
        data['subgraph'] = list(set.union(*generations))

    args.api.save_graph(_graph, 'subgraph', threshold, edgefilter)

    outdir = args.api.existing_dir('app', 'subgraph', clean=True)
    cluster_names = {}
    nodes2cluster = {}
    nidx = 1
    for node, data in tqdm(
            sorted(_graph.nodes(data=True), key=lambda x: len(x[1]['subgraph']), reverse=True),
            leave=False):
        nodes = tuple(sorted(data['subgraph']))
        sg = _graph.subgraph(data['subgraph'])
        if nodes not in nodes2cluster:
            d_ = sorted(sg.degree(), key=lambda x: x[1], reverse=True)
            d = [_graph.node[a]['Gloss'] for a, b in d_][0]
            nodes2cluster[nodes] = 'subgraph_{0}_{1}'.format(nidx, d)
            nidx += 1
        cluster_name = nodes2cluster[nodes]
        data['ClusterName'] = cluster_name
        for n, d in sg.nodes(data=True):
            d['OutEdge'] = []
            neighbors = [
                n_ for n_ in _graph if
                n_ in _graph[node] and
                _graph[node][n_]['FamilyWeight'] >= neighbor_weight and
                n_ not in sg]
            if neighbors:
                sg.node[node]['OutEdge'] = []
                for n_ in neighbors:
                    sg.node[node]['OutEdge'].append([
                        'subgraph_' + n_ + '_' + _graph.node[n]['Gloss'],
                        _graph.node[n_]['Gloss'],
                        _graph.node[n_]['Gloss'],
                        _graph[node][n_]['FamilyWeight'],
                        n_
                    ])
                    sg.node[node]['OutEdge'].append([
                        _graph.node[n]['ClusterName'],
                        _graph.node[n]['CentralConcept'],
                        _graph.node[n]['Gloss'],
                        _graph[node][n]['WordWeight'],
                        n
                    ])
        if len(sg) > 1:
            jsonlib.dump(
                json_graph.adjacency_data(sg), outdir / (cluster_name + '.json'), sort_keys=True)
            cluster_names[data['Gloss']] = cluster_name

    for node, data in _graph.nodes(data=True):
        if 'OutEdge' in data:
            data['OutEdge'] = '//'.join([str(x) for x in data['OutEdge']])
    args.api.write_js_var('SUBG', cluster_names, 'app', 'source', 'subgraph-names.js')
Example #47
 def json_dump(self, obj, *path):
     p = self.existing_dir(*path[:-1]) / path[-1]
     jsonlib.dump(obj, p, indent=2)
     self.file_written(p)
Example #48
from collections import OrderedDict

from csvw.dsv import reader
from clldutils.jsonlib import dump
from sqlalchemy import create_engine


eth17 = OrderedDict()
for l in reader('LanguageCodes.tab', dicts=True, delimiter='\t'):
    eth17[l['LangID']] = l['Name']

db = create_engine('postgresql://robert@/asjp')
in_asjp = set(r[0] for r in db.execute('select code_iso from doculect where code_iso is not null'))

missing = [(k, v) for k, v in eth17.items() if k not in in_asjp]
dump(missing, 'missing.json', indent=4)
Example #49
File: util.py Project: clld/clld
def gbs_func(command, args, sources=None):  # pragma: no cover
    def words(s):
        return set(slug(s.strip(), remove_whitespace=False).split())

    log = args.log
    count = 0
    api_url = "https://www.googleapis.com/books/v1/volumes?"

    if command == 'cleanup':
        for fname in args.data_file('gbs').glob('*.json'):
            try:
                data = jsonlib.load(fname)
                if data.get('totalItems') == 0:
                    remove(fname)
            except ValueError:
                remove(fname)
        return

    if not sources:
        sources = DBSession.query(common.Source)\
            .order_by(common.Source.id)\
            .options(joinedload(common.Source.data))
    if callable(sources):
        sources = sources()

    for i, source in enumerate(page_query(sources, verbose=True, commit=True)):
        filepath = args.data_file('gbs', 'source%s.json' % source.id)

        if command == 'update':
            source.google_book_search_id = None
            source.update_jsondata(gbs={})

        if command in ['verify', 'update']:
            if filepath.exists():
                try:
                    data = jsonlib.load(filepath)
                except ValueError:
                    log.warn('no JSON object found in: %s' % filepath)
                    continue
                if not data['totalItems']:
                    continue
                item = data['items'][0]
            else:
                continue

        if command == 'verify':
            stitle = source.description or source.title or source.booktitle
            needs_check = False
            year = item['volumeInfo'].get('publishedDate', '').split('-')[0]
            if not year or year != slug(source.year or ''):
                needs_check = True
            twords = words(stitle)
            iwords = words(
                item['volumeInfo']['title'] + ' '
                + item['volumeInfo'].get('subtitle', ''))
            if twords == iwords \
                    or (len(iwords) > 2 and iwords.issubset(twords))\
                    or (len(twords) > 2 and twords.issubset(iwords)):
                needs_check = False
            if int(source.id) == 241:
                log.info('%s' % sorted(words(stitle)))
                log.info('%s' % sorted(iwords))
            if needs_check:
                log.info('------- %s -> %s' % (
                    source.id, item['volumeInfo'].get('industryIdentifiers')))
                log.info('%s %s' % (
                    item['volumeInfo']['title'], item['volumeInfo'].get('subtitle', '')))
                log.info(stitle)
                log.info(item['volumeInfo'].get('publishedDate'))
                log.info(source.year)
                log.info(item['volumeInfo'].get('authors'))
                log.info(source.author)
                log.info(item['volumeInfo'].get('publisher'))
                log.info(source.publisher)
                if not confirm('Are the records the same?'):
                    log.warn('---- removing ----')
                    jsonlib.dump({"totalItems": 0}, filepath)
        elif command == 'update':
            source.google_book_search_id = item['id']
            source.update_jsondata(gbs=item)
            count += 1
        elif command == 'download':
            if source.author and (source.title or source.booktitle):
                title = source.title or source.booktitle
                if filepath.exists():
                    continue
                q = [
                    'inauthor:' + quote_plus(source.author.encode('utf8')),
                    'intitle:' + quote_plus(title.encode('utf8')),
                ]
                if source.publisher:
                    q.append('inpublisher:' + quote_plus(
                        source.publisher.encode('utf8')))
                url = api_url + 'q=%s&key=%s' % ('+'.join(q), args.api_key)
                count += 1
                r = requests.get(url, headers={'accept': 'application/json'})
                log.info('%s - %s' % (r.status_code, url))
                if r.status_code == 200:
                    with open(as_posix(filepath), 'w') as fp:
                        fp.write(r.text.encode('utf8'))
                elif r.status_code == 403:
                    log.warn("limit reached")
                    break
    if command == 'update':
        log.info('assigned gbs ids for %s out of %s sources' % (count, i))
    elif command == 'download':
        log.info('queried gbs for %s sources' % count)