Example #1
File: download.py Project: gopyruby/clld
    def before(self, req, fp):
        self.writer = UnicodeWriter(fp)
        self.writer.__enter__()
        self.writer.writerow([
            f if isinstance(f, string_types) else f[1]
            for f in self.get_fields(req)
        ])
Example #2
File: test_dsv.py Project: Anaphory/csvw
def test_UnicodeWriter(tmpdir, row, expected):
    with UnicodeWriter() as writer:
        writer.writerows([row])
    assert writer.read() == expected

    filepath = tmpdir / 'test.csv'
    with UnicodeWriter(str(filepath)) as writer:
        writer.writerow(row)
    assert filepath.read_binary() == expected
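The test above pins down UnicodeWriter's two modes: with no target it buffers rows in memory, retrievable as bytes via read() after the context exits; given a path it writes to disk instead. A minimal sketch of both modes (the row values and file name are illustrative):

from csvw.dsv import UnicodeWriter

# In-memory mode: no target given, rows are buffered and can be
# retrieved as bytes via read() once the context has exited.
with UnicodeWriter() as writer:
    writer.writerow(['a', 'b'])
assert writer.read() == b'a,b\r\n'

# File mode: pass a path and the rows go straight to disk.
with UnicodeWriter('out.csv') as writer:  # 'out.csv' is an illustrative path
    writer.writerow(['a', 'b'])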
Example #3
def rename(args):  # pragma: no cover
    api = Concepticon(args.repos)

    from_, to_ = args.args
    assert CONCEPTLIST_ID_PATTERN.match(to_)
    cl = api.conceptlists[from_]

    # write the adapted concept list to the new path:
    with UnicodeWriter(cl.path.parent / cl.path.name.replace(from_, to_),
                       delimiter='\t') as writer:
        header = []
        for i, row in enumerate(reader(cl.path, delimiter='\t')):
            if i == 0:
                header = row
                writer.writerow(row)
                # Map column name to its index within a row:
                header = {v: k for k, v in enumerate(header)}
            else:
                oid = row[header['ID']]
                assert oid.startswith(from_)
                nid = oid.replace(from_, to_)
                api.add_retirement(
                    'Concept', dict(id=oid,
                                    comment='renaming',
                                    replacement=nid))
                row[header['ID']] = nid
                writer.writerow(row)

    # write adapted metadata to the new path:
    fname = cl.path.name.replace(from_, to_) + MD_SUFFIX
    md = jsonlib.load(cl.path.parent / (cl.path.name + MD_SUFFIX),
                      object_pairs_hook=OrderedDict)
    md['tables'][0]['url'] = fname
    jsonlib.dump(md, cl.path.parent / fname, indent=4)

    # remove obsolete concept list and metadata:
    cl.path.unlink()
    cl.path.parent.joinpath(cl.path.name + MD_SUFFIX).unlink()

    # adapt conceptlists.tsv
    rows = []
    for row in reader(api.data_path('conceptlists.tsv'), delimiter='\t'):
        rows.append([col.replace(from_, to_) if col else col for col in row])

    with UnicodeWriter(api.data_path('conceptlists.tsv'),
                       delimiter='\t') as writer:
        writer.writerows(rows)

    api.add_retirement('Conceptlist',
                       dict(id=from_, comment='renaming', replacement=to_))

    print("""Please run
grep -r "{0}" concepticondata/ | grep -v retired.json

to confirm the renaming was complete!""".format(from_))
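Example #3's core mechanic is a read-modify-write pass over a TSV: map header names to column indices on the first row, then rewrite the ID column on the remaining rows. A stripped-down sketch of just that pattern (paths and the 'OLD-'/'NEW-' prefixes are illustrative):

from csvw.dsv import reader, UnicodeWriter

with UnicodeWriter('new.tsv', delimiter='\t') as writer:
    header = {}
    for i, row in enumerate(reader('old.tsv', delimiter='\t')):
        if i == 0:
            # Map column name to its index within a row.
            header = {name: idx for idx, name in enumerate(row)}
        else:
            row[header['ID']] = row[header['ID']].replace('OLD-', 'NEW-')
        writer.writerow(row)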
Example #4
def run(args):
    report, counts = [], {}
    api = args.repos

    if args.filename:
        sheets = [Sheet(args.filename)]
    else:
        sheets = [(s, list(s.itervalues(api))) for s in api.iter_sheets()]
        sheets = (s[0] for s in iterunique(sheets, verbose=args.verbose))

    for sheet in sorted(sheets, key=lambda s: s.path):
        n = sheet.check(api, report=report)
        if (sheet.glottocode
                not in counts) or (n > counts[sheet.glottocode][0]):
            counts[sheet.glottocode] = (n, sheet.path.stem)

    selected = set(v[1] for v in counts.values())
    for row in report:
        row.insert(1, row[0] in selected)

    if report and args.report:
        with UnicodeWriter(args.report, delimiter='\t') as w:
            w.writerow(
                ['sheet', 'selected', 'level', 'line', 'feature', 'message'])
            w.writerows(report)
        args.log.info('Report written to {0}'.format(args.report))
    if report:
        args.log.error('Repos check found WARNINGs or ERRORs')
Example #5
File: api.py Project: D-PLACE/pydplace
    def _write_items(self, what, attr=None, items=None):
        items = items if items is not None else getattr(self, attr or what)
        if items:
            with UnicodeWriter(self._path(what)) as writer:
                writer.writerow(items[0].__class__.fields())
                for item in items:
                    writer.writerow(item.astuple())
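This helper assumes each item class exposes a fields() classmethod for the header row and an astuple() serializer per item. A minimal sketch of a class meeting that contract, using attrs (the Item class is hypothetical, not taken from pydplace):

import attr

@attr.s
class Item(object):
    id = attr.ib()
    name = attr.ib()

    @classmethod
    def fields(cls):
        # Header row: the attribute names in declaration order.
        return [f.name for f in attr.fields(cls)]

    def astuple(self):
        # One data row: the attribute values in the same order.
        return attr.astuple(self)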
Example #6
    def cmd_download(self, args):
        from docx import Document
        fname = self.raw_dir / "Table_S2_Supplementary_Mennecier_et_al..doc"

        self.raw_dir.download_and_unpack(
            "https://ndownloader.figshare.com/articles/3443090/versions/1",
            fname,
            log=args.log,
        )

        check_call(
            "libreoffice --headless --convert-to docx %s --outdir %s" %
            (fname, self.raw_dir),
            shell=True,
        )

        doc = Document(self.raw_dir /
                       "Table_S2_Supplementary_Mennecier_et_al..docx")
        for i, table in enumerate(doc.tables):
            with UnicodeWriter(
                    self.raw_dir.joinpath("%s.csv" %
                                          (i + 1, )).as_posix()) as writer:
                for row in table.rows:
                    # This code fixes a wrong gloss in the raw source,
                    # where the `itʲerʊ` set is glossed as "to pull"
                    # instead of the correct "to push". See discussion
                    # at https://github.com/lexibank/cals/pull/7
                    row_data = map(text_and_color, row.cells)
                    if i == 11:
                        row_data = [
                            cell if cell != "to pull" else "to push"
                            for cell in row_data
                        ]
                    writer.writerow(row_data)
Example #7
    def get_profile(self, clts=None, filename=None):
        """
        Compute an orthography profile with LingPy's function.

        :param clts: Optional transcription-system wrapper; its `bipa` attribute is used if given.
        :param filename: Write the computed profile to a file in addition to returning it.
        :return: `segments.Profile` instance.
        """
        clts = clts.bipa if clts else None

        D = {0: ['doculect', 'concept', 'ipa']}
        for i, key in enumerate(self._concordances.form, start=1):
            D[i] = ['dummy', key[1], key[0]]
        wordlist = lingpy.basic.wordlist.Wordlist(D)

        if not filename:
            # We only need a fresh file name here; keep the file around
            # (delete=False) so it can be written below and unlinked
            # explicitly at the end.
            with tempfile.NamedTemporaryFile(delete=False) as fp:
                pass
            p = pathlib.Path(fp.name)
        else:
            p = pathlib.Path(filename)

        with UnicodeWriter(p, delimiter='\t') as w:
            w.writerow(['Grapheme', 'IPA', 'Example', 'Count', 'Unicode'])
            for line in lingpy.sequence.profile.context_profile(wordlist, ref='ipa', clts=clts):
                w.writerow([line[0], line[1], line[2], line[4], line[5]])

        res = segments.Profile.from_file(p)
        if not filename:
            p.unlink()
        return res
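When no filename is passed, the code above only needs a fresh file name to hand to UnicodeWriter; the throwaway NamedTemporaryFile context exists solely to reserve one. A sketch of the same idea with tempfile.mkstemp, which makes the reserve-write-unlink sequence explicit (stdlib only, no project-specific assumptions):

import os
import pathlib
import tempfile

# Reserve a temporary file name and close the OS-level handle right away;
# UnicodeWriter reopens the path itself when writing.
fd, name = tempfile.mkstemp()
os.close(fd)
p = pathlib.Path(name)
# ... write the profile to p and load it back, then clean up:
p.unlink()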
Example #8
File: test_dsv.py Project: Anaphory/csvw
def test_roundtrip_with_keyword_dialect(tmpdir,
                                        rows=[['1', 'y'], ['  "1 ', '3\t4']],
                                        dialect='excel'):
    filename = str(tmpdir / 'test.csv')
    with UnicodeWriter(filename, dialect=dialect) as w:
        w.writerows(rows)
    assert list(iterrows(filename, dialect=dialect)) == rows
Example #9
def iso2codes(args):
    """
    Map ISO codes to the list of all Glottolog languages and dialects subsumed "under" it.
    """
    nodes = list(args.repos.languoids())

    res = {}
    for node in nodes:
        if node.iso:
            res[node.id] = (node.iso, set())

    for node in nodes:
        if node.level == args.repos.languoid_levels.family or node.id in res:
            continue
        for nid in res:
            matched = False
            for l in node.lineage:
                if l[1] == nid:
                    res[nid][1].add(node.id)
                    matched = True
                    break
            if matched:
                break

    outdir = Path('.') if not args.args else Path(args.args[0])
    with UnicodeWriter(outdir / 'iso2glottocodes.csv') as writer:
        writer.writerow(['iso', 'glottocodes'])
        for gc, (iso, gcs) in res.items():
            writer.writerow([iso, ';'.join([gc] + list(gcs))])
Example #10
    def create(self, table=None, dbase=None, ignore=None):
        """
        Upload triple-data to sqlite3-db. Thereby, delete the previous table
        if it is still in the database.
        """
        dbase = pathlib.Path(dbase or self.dbase)
        table = table or dbase.stem
        ignore = ignore or []

        # write a log for the blacklist
        with UnicodeWriter(
                dbase.parent.joinpath(
                    lingpy.rc('timestamp') + '-blacklist.log')) as w:
            w.writerow(['ID'] +
                       sorted(self.header, key=lambda x: self.header[x]))
            w.writerows([[str(k)] + [stringval(e) for e in self[k]]
                         for k in self.blacklist])

        with self.cursor(dbase) as cu:
            cu.execute(
                "CREATE TABLE IF NOT EXISTS backup "
                "(file TEXT, id INT, col TEXT, val TEXT, date TEXT, user TEXT)"
            )
            cu.execute(
                "CREATE TABLE IF NOT EXISTS {0} (id INT, col TEXT, val TEXT)".
                format(table))
            cu.execute("DELETE FROM {0}".format(table))

        self.update(table=table, dbase=dbase, ignore=ignore)
Example #11
def run(args):
    bipa = args.clts.api.bipa
    func = profile.simple_profile
    cols = ['Grapheme', 'IPA', 'Frequence', 'Codepoints']
    kw = {'ref': 'form', 'clts': bipa}
    if args.context:
        func = profile.context_profile
        cols = [
            'Grapheme', 'IPA', 'Examples', 'Languages', 'Frequence',
            'Codepoints'
        ]
        kw['col'] = 'language_id'

    ds = get_dataset(args)
    profile_path = ds.etc_dir / 'orthography.tsv'
    if profile_path.exists() and not args.force:
        raise ParserError(
            'Orthography profile exists already. To overwrite, pass "-f" flag')

    header, D = [], {}
    for i, row in enumerate(ds.cldf_reader()['FormTable'], start=1):
        if i == 1:
            header = [f for f in row.keys() if f != 'ID']
            D = {0: ['lid'] + [h.lower() for h in header]}

        row['Segments'] = ' '.join(row['Segments'])
        D[i] = [row['ID']] + [row[h] for h in header]

    with UnicodeWriter(profile_path, delimiter='\t') as writer:
        writer.writerow(cols)
        for row in func(Wordlist(D, row='parameter_id', col='language_id'),
                        **kw):
            writer.writerow(row)
    args.log.info('Orthography profile written to {0}'.format(profile_path))
Example #12
def run(args):
    if args.out == FNAME:  # pragma: no cover
        name = args.repos.path(args.out)
    else:
        name = pathlib.Path(args.out)

    rows = collections.OrderedDict([
        ('Feature_ID', lambda f: f.id),
        ('Feature', lambda f: f.wiki['title']),
        ('Possible Values', lambda f: f['Possible Values']),
        ('Language_ID', lambda f: ''),
        ('Value', lambda f: ''),
        ('Source', lambda f: ''),
        ('Comment', lambda f: ''),
        ('Contributed_Datapoints', lambda f: ''),
        ('Clarifying comments',
         lambda f: f.wiki['Summary'].replace('\n', ' ')),
        ('Relevant unit(s)', lambda f: f['Relevant unit(s)']),
        ('Function', lambda f: f['Function']),
        ('Form', lambda f: f['Form']),
        ('Patron', lambda f: f.wiki['Patron']),
    ])

    with UnicodeWriter(name, delimiter='\t') as w:
        w.writerow(rows.keys())
        for feature in args.repos.features.values():
            w.writerow([v(feature) for v in rows.values()])
Example #13
File: tob.py Project: liualg/pylexibank
    def cmd_download(self, args):
        self.raw_dir.write('sources.bib', "\n".join(self.tob_sources.values()))

        # download data
        all_records = []
        for i in pb(list(range(1, 20 * self.pages + 1, 20))):
            with self.raw_dir.temp_download(self._url(i),
                                            'file-{0}'.format(i),
                                            log=args.log) as fname:
                soup = BeautifulSoup(
                    fname.open(encoding='utf8').read(), 'html.parser')
                for record in soup.findAll(name='div',
                                           attrs={"class": "results_record"}):
                    if isinstance(record, bs4.element.Tag):
                        children = list(record.children)
                        number = children[0].findAll('span')[1].text.strip()
                        concept = children[1].findAll('span')[1].text
                        for child in children[2:]:
                            if isinstance(child, bs4.element.Tag):
                                dpoints = child.findAll('span')
                                if len(dpoints) >= 3:
                                    lname = dpoints[1].text
                                    glottolog = re.findall(
                                        'Glottolog: (........)',
                                        str(dpoints[1]))[0]
                                    entry = dpoints[2].text
                                    cogid = list(
                                        child.children)[4].text.strip()
                                    all_records.append(
                                        (number, concept, lname, glottolog,
                                         entry, cogid))
        with UnicodeWriter(self.raw_dir / 'output.csv') as f:
            f.writerows(all_records)
Example #14
def markconservative(m, trigs, ref, hht, outfn, verbose=True, rank=None):
    blamefield = "hhtype"
    mafter = markall(m, trigs, verbose=verbose, rank=rank)
    ls = lstat(ref, hht)
    lsafter = lstat_witness(mafter, hht)
    log = []
    no_status = defaultdict(set)
    for (lg, (stat, wits)) in lsafter.items():
        if not ls.get(lg):
            srctrickles = [mafter[k][1].get('srctrickle') for k in wits]
            for t in srctrickles:
                if t and not t.startswith('iso6393'):
                    no_status[lg].add(t)
            continue
        if hht[stat] > hht[ls[lg]]:
            log = log + [(lg, [(mafter[k][1].get(
                blamefield, "No %s" % blamefield), k, mafter[k][1].get(
                    'title', 'no title'), mafter[k][1].get(
                        'srctrickle', 'no srctrickle'))
                               for k in wits], ls[lg])]
            for k in wits:
                (t, f) = mafter[k]
                if blamefield in f:
                    del f[blamefield]
                mafter[k] = (t, f)
    for lg in no_status:
        print('{0} lacks status'.format(lg))
    with UnicodeWriter(outfn, dialect='excel-tab') as writer:
        writer.writerows(
            ((lg, was) + mis for (lg, miss, was) in log for mis in miss))
    return mafter
Example #15
def run(args):
    datasets = args.dataset.split(",") if args.dataset else None
    variables = args.variable.split(",") if args.variable else None
    societies = args.society.split(",") if args.society else None

    if args.tree:
        for tree in args.repos.phylogenies:
            if args.tree == tree.id:
                break
        else:
            raise SystemExit("Failed to find Tree %s" % args.tree)
        societies = [
            s for sublist in [t.soc_ids for t in tree.taxa] for s in sublist
        ]

    with UnicodeWriter(f=args.filename) as out:
        header = [
            'ID', 'XD_ID', 'Glottocode', 'Name', 'OriginalName', 'FocalYear',
            'Latitude', 'Longitude', 'Variable', 'Value'
        ]
        out.writerow(header)

        socs = args.repos.societies
        for record in args.repos.iter_data(datasets=datasets,
                                           variables=variables,
                                           societies=societies):
            s = socs.get(record.soc_id, None)
            row = [
                s.id, s.xd_id, s.glottocode, s.pref_name_for_society,
                s.ORIG_name_and_ID_in_this_dataset, s.main_focal_year, s.Lat,
                s.Long, record.var_id, record.code
            ]
            out.writerow(row)
Example #16
    def write_concepts(self, ctype, filename=None):
        def format_form(f):
            if self.monolingual:
                return f[0]
            return '{}: {}'.format(f[1], f[0])

        concepts = self.get_concepts(ctype)
        with UnicodeWriter(filename, delimiter='\t') as w:
            w.writerow(
                ['ID', 'ENGLISH', 'OCCURRENCE', 'CONCEPT_IN_SOURCE', 'FORMS', 'PHRASE', 'GLOSS'])
            for i, (concept, forms) in enumerate(
                    sorted(concepts.items(), key=lambda x: x[1][0][-1], reverse=True), start=1):
                # Get the IGT containing the first occurrence listed in the concordance as example:
                igt = self[
                    self._concordances[ctype][
                        forms[0][0],
                        concept,
                        forms[0][2],
                        forms[0][1],
                    ][0][0]
                ]
                w.writerow([
                    i,
                    concept,
                    sum([f[3] for f in forms]),
                    ' // '.join(sorted(set([f[2] for f in forms]))),
                    ' // '.join(sorted(set([format_form(f) for f in forms]))),
                    igt.phrase_text,
                    igt.gloss_text,
                ])
        if not filename:
            print(w.read().decode('utf8'))
Example #17
def run(args):
    nodes = list(args.repos.languoids())

    res = {}
    for node in nodes:
        if node.iso:
            res[node.id] = (node.iso, set())

    for node in nodes:
        if node.level == args.repos.languoid_levels.family or node.id in res:
            continue
        for nid in res:
            matched = False
            for lineage in node.lineage:
                if lineage[1] == nid:
                    res[nid][1].add(node.id)
                    matched = True
                    break
            if matched:
                break

    with UnicodeWriter(args.output / 'iso2glottocodes.csv') as writer:
        writer.writerow(['iso', 'glottocodes'])
        for gc, (iso, gcs) in res.items():
            writer.writerow([iso, ';'.join([gc] + list(gcs))])
Example #18
def languoids(api, langs, outdir):
    _Level = Level or api.languoid_levels
    with UnicodeWriter(outdir / 'csv' / 'glottolog.csv') as writer:
        writer.writerow([
            'id', 'name', 'family_id', 'family_name', 'iso_code',
            'language_id', 'macroarea', 'lineage', 'level'
        ])
        for lang in sorted(langs, key=lambda l: l.id):
            if lang.level == _Level.language:
                lid = lang.id
            elif lang.level == _Level.dialect:
                for _, lid, level in reversed(lang.lineage):
                    if level == _Level.language:
                        break
                else:  # pragma: no cover
                    raise ValueError
            else:
                lid = None
            writer.writerow([
                lang.id,
                lang.name,
                lang.lineage[0][1] if lang.lineage else '',
                lang.lineage[0][0] if lang.lineage else '',
                lang.iso or '',
                lid,
                getattr(lang.macroareas[0], 'value', lang.macroareas[0].name)
                if lang.macroareas else '',
                '/'.join(gc for _, gc, _ in lang.lineage),
                lang.level.name,
            ])
Example #19
def fetch_sheet(sheetname, keyfile=None, output=None, delimiter='\t'):
    spreadsheet = google_api_client(keyfile).open_by_key(DOCUMENT_ID)

    for i, worksheet in enumerate(spreadsheet.worksheets()):
        if worksheet.title == sheetname:
            output = output or '{0}.tsv'.format(worksheet.title)
            with UnicodeWriter(output, delimiter=delimiter) as writer:
                writer.writerows(worksheet.get_all_values())
Example #20
def locations(glottolog, fid, outpath):
    with UnicodeWriter(outpath) as writer:
        writer.writerow(['name', 'glottocode', 'latitude', 'longitude'])
        for lang in glottolog.languoids():
            if lang.level == glottolog.languoid_levels.language and lang.latitude is not None:
                if fid in [l[1] for l in lang.lineage]:
                    writer.writerow(
                        [lang.name, lang.id, lang.latitude, lang.longitude])
Example #21
def _write_linking_data(api, lang, args):
    out, freqs = collections.defaultdict(int), collections.defaultdict(int)
    # find those concept sets that are wrongly linked, they should not go into
    # the mapping, so we just make a re-linker here
    rep = {}
    for c in api.conceptsets.values():
        if c.replacement_id:
            rep[c.id] = c.replacement_id
            rep[c.gloss] = api.conceptsets[c.replacement_id].gloss
        else:
            rep[c.id] = c.id
            rep[c.gloss] = c.gloss

    for clist in api.conceptlists.values():
        args.log.info("checking {clist.id}".format(clist=clist))
        for row in clist.concepts.values():
            if row.concepticon_id:
                gls = None
                if lang.iso2 == "en":
                    if row.english:
                        gls = row.english.strip("*$-—+")
                else:
                    if lang.name in row.attributes and row.attributes[
                            lang.name]:
                        gls = row.attributes[lang.name].strip("*$-—+")

                if gls:
                    out[rep[row.concepticon_gloss] + "///" + gls,
                        rep[row.concepticon_id]] += 1
                    freqs[rep[row.concepticon_id]] += 1

    if lang.iso2 == "en":
        for cset in api.conceptsets.values():
            gloss = rep[cset.gloss]
            cid = rep[cset.id]
            if cset.ontological_category == "Person/Thing":
                out[gloss + "///the " + cset.gloss.lower(), cid] = freqs[cid]
                out[gloss + "///the " + cset.gloss.lower() + "s",
                    cid] = freqs[cid]
            elif cset.ontological_category == "Action/Process":
                out[gloss + "///to " + cset.gloss.lower(), cid] = freqs[cid]
            elif cset.ontological_category == "Property":
                out[gloss + "///" + cset.gloss.lower() + " (adjective)",
                    cid] = freqs[cid]
            elif cset.ontological_category == "Classifier":
                out[gloss + "///" + cset.gloss.lower() + " (classifier)",
                    cid] = freqs[cid]
            else:
                out[gloss + "///" + cset.gloss.lower(), cid] = freqs[cid]

    p = api.path("mappings", "map-{0}.tsv".format(lang.iso2))
    if not p.parent.exists():
        p.parent.mkdir()
    with UnicodeWriter(p, delimiter="\t") as f:
        f.writerow(["ID", "GLOSS", "PRIORITY"])
        for i, (gloss, cid) in enumerate(sorted(out)):
            f.writerow([cid, gloss, out[gloss, cid]])
Example #22
File: csv.py Project: gopyruby/clld
    def render(self, ctx, req):
        with UnicodeWriter() as writer:
            rows = iter(ctx.get_query(limit=QUERY_LIMIT))
            first = next(rows, None)
            if first is not None:
                cols = first.csv_head()
                writer.writerow(cols)
                for item in chain([first], rows):
                    writer.writerow(item.to_csv(ctx=ctx, req=req, cols=cols))
            return writer.read()
Example #23
File: test_dsv.py Project: Anaphory/csvw
def test_roundtrip_escapechar(tmpdir,
                              quoting,
                              escapechar='\\',
                              row=['\\spam', 'eggs']):
    filename = str(tmpdir / 'spam.csv')
    kwargs = {'escapechar': escapechar, 'quoting': quoting}
    with UnicodeWriter(filename, **kwargs) as writer:
        writer.writerow(row)
    with UnicodeReader(filename, **kwargs) as reader:
        result = next(reader)
    assert result == row
Example #24
File: test_dsv.py Project: Anaphory/csvw
def test_roundtrip_multibyte(tmpdir,
                             encoding,
                             row=['spam', 'eggs'],
                             expected='spam,eggs\r\n',
                             n=2):
    filepath = tmpdir / 'spam.csv'
    kwargs = {'encoding': encoding}
    with UnicodeWriter(str(filepath), **kwargs) as writer:
        writer.writerows([row] * n)
    with UnicodeReader(str(filepath), **kwargs) as reader:
        result = next(reader)
    assert result == row
    assert filepath.read_binary() == (expected * n).encode(encoding)
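Examples #23 and #24 lean on UnicodeWriter and UnicodeReader accepting the same csv formatting keywords; a roundtrip only works when both sides receive identical options. A minimal sketch of that symmetry (the delimiter and path are illustrative):

from csvw.dsv import UnicodeReader, UnicodeWriter

kwargs = {'delimiter': ';'}  # must match on the write and read side
with UnicodeWriter('roundtrip.csv', **kwargs) as writer:
    writer.writerow(['spam', 'eggs'])
with UnicodeReader('roundtrip.csv', **kwargs) as reader:
    assert next(reader) == ['spam', 'eggs']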
Example #25
def _write_csv_to_file(data, file_name, api, header=None, dir_name='cldf'):
    outdir = api.repos.joinpath(dir_name)
    if not outdir.exists():
        outdir.mkdir()
    if header is None:
        try:
            header = data.keys()
        except AttributeError:
            pass
    with UnicodeWriter(outdir.joinpath(file_name)) as w:
        if header is not None:
            w.writerow(header)
        for row in data:
            w.writerow(row)
Example #26
def write(p, rows, features):
    rows = collections.OrderedDict([(r['Feature_ID'], r) for r in rows])
    cols = 'Feature_ID Value Source Contributed_Datapoint Comment'.split()
    with UnicodeWriter(p, delimiter='\t') as writer:
        writer.writerow(cols)

        for feature in features.values():
            row = rows.pop(feature.id, None)
            if not row:
                row = {k: '' for k in cols}
                row['Feature_ID'] = feature.id
            writer.writerow([row.get(col, '') for col in cols])
        for row in rows.values():
            writer.writerow([row.get(col, '') for col in cols])
Example #27
    def render(self, ctx, req):
        fid = req.route_url('parameter', id='xxx').replace('xxx', '{0}')
        lid = req.route_url('language', id='xxx').replace('xxx', '{0}')
        with UnicodeWriter() as writer:
            writer.writerow(['Language_ID', 'Feature_ID', 'Value'])
            for _lid, _fid, v in DBSession.query(
                    Language.id, Parameter.id, DomainElement.name) \
                    .filter(Language.pk == ValueSet.language_pk) \
                    .filter(Parameter.pk == ValueSet.parameter_pk) \
                    .filter(Value.valueset_pk == ValueSet.pk) \
                    .filter(Value.domainelement_pk == DomainElement.pk) \
                    .order_by(Parameter.pk, Language.id):
                writer.writerow([lid.format(_lid), fid.format(_fid), v])
            return writer.read()
Example #28
def run(args):
    api = args.repos
    if not args.outdir.exists():
        args.outdir.mkdir()

    # Iterate over groups of sheets for the same glottocode:
    for gc, sheets in itertools.groupby(
            sorted(api.iter_sheets(), key=lambda s: s.glottocode),
            lambda s: s.glottocode):
        if gc not in ['sina1266', 'meke1243']:
            continue
        sheets = list(sheets)
        if len(sheets) == 1:
            continue

        with UnicodeWriter(args.outdir / '{}.tsv'.format(gc),
                           delimiter='\t') as w:
            w.writerow([
                'Feature_ID',
                'Value',
                'Conflict',
                'Classification of conflict',
                'Select',
                'Sheet',
                'Source',
                'Contributed_Datapoint',
                'Comment',
                'Warnings',
            ])
            # Iterate over rows grouped by feature ID:
            for fid, rows in grouped_rows(sheets):
                conflict = len(
                    set(r[0]['Value'] for r in rows if r[0]['Value'])) > 1
                for row, sheet in rows:
                    # See what "grambank check" would do with the row:
                    warnings = Warnings()
                    sheet.valid_row(row, api, log=warnings)
                    w.writerow([
                        fid,
                        row['Value'],
                        str(conflict),
                        '',
                        '',
                        sheet.path.stem,
                        row.get('Source', ''),
                        ' '.join(Row.from_dict(row).contributed_datapoint),
                        row.get('Comment', ''),
                        warnings.messages,
                    ])
Example #29
def _write_linking_data(api, l, args):
    out, freqs = defaultdict(int), defaultdict(int)

    for clist in api.conceptlists.values():
        args.log.info("checking {clist.id}".format(clist=clist))
        for row in clist.concepts.values():
            if row.concepticon_id:
                gls = None
                if l.iso2 == "en":
                    if row.english:
                        gls = row.english.strip("*$-—+")
                else:
                    if l.name in row.attributes and row.attributes[l.name]:
                        gls = row.attributes[l.name].strip("*$-—+")

                if gls:
                    out[row.concepticon_gloss + "///" + gls,
                        row.concepticon_id] += 1
                    freqs[row.concepticon_id] += 1

    if l.iso2 == "en":
        for cset in api.conceptsets.values():
            gloss = cset.gloss
            if cset.ontological_category == "Person/Thing":
                out[gloss + "///the " + cset.gloss.lower(),
                    cset.id] = freqs[cset.id]
                out[gloss + "///the " + cset.gloss.lower() + "s",
                    cset.id] = freqs[cset.id]
            elif cset.ontological_category == "Action/Process":
                out[gloss + "///to " + cset.gloss.lower(),
                    cset.id] = freqs[cset.id]
            elif cset.ontological_category == "Property":
                out[gloss + "///" + cset.gloss.lower() + " (adjective)",
                    cset.id] = freqs[cset.id]
            elif cset.ontological_category == "Classifier":
                out[gloss + "///" + cset.gloss.lower() + " (classifier)",
                    cset.id] = freqs[cset.id]
            else:
                out[gloss + "///" + cset.gloss.lower(),
                    cset.id] = freqs[cset.id]

    p = api.path("mappings", "map-{0}.tsv".format(l.iso2))
    if not p.parent.exists():
        p.parent.mkdir()
    with UnicodeWriter(p, delimiter="\t") as f:
        f.writerow(["ID", "GLOSS", "PRIORITY"])
        for i, (gloss, cid) in enumerate(sorted(out)):
            f.writerow([cid, gloss, out[gloss, cid]])
Example #30
def fix(p):
    if '_history' in p.stem:
        p.unlink()
        return

    rows = list(reader(p, dicts=True))
    if not rows:
        p.unlink()
        return

    remove = {'created', 'updated', 'active', 'polymorphic_type'}
    with UnicodeWriter(p) as w:
        for i, row in enumerate(rows):
            if i == 0:
                w.writerow([c for c in row.keys() if c not in remove])
            for c in remove:
                if c in row:
                    del row[c]
            w.writerow(row.values())
Example #31
File: download.py Project: clld/clld
class CsvDump(Download):

    """Download of a resource type as csv."""

    ext = 'csv'

    def __init__(self, model, pkg, fields=None, **kw):
        """Initialize.

        fields can be a list of column names or a dictionary mapping model attribute
        names to csv column names.
        """
        super(CsvDump, self).__init__(model, pkg, **kw)
        self.fields = fields
        self.writer = None

    def get_stream(self):
        return StringIO(newline='') if PY3 else BytesIO()

    def read_stream(self, fp):
        res = Download.read_stream(self, fp)
        if PY3:  # pragma: no cover
            res = res.encode('utf8')
        return res

    def get_fields(self, req):
        if not self.fields:
            self.fields = ['id', 'name']
        return self.fields

    def before(self, req, fp):
        self.writer = UnicodeWriter(fp)
        self.writer.__enter__()
        self.writer.writerow(
            [f if isinstance(f, string_types) else f[1] for f in self.get_fields(req)])

    def row(self, req, fp, item, index):
        return [getattr(item, f if isinstance(f, string_types) else f[0])
                for f in self.get_fields(req)]

    def dump(self, req, fp, item, index):
        self.writer.writerow(self.row(req, fp, item, index))
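Because before() enters the writer context by hand, the writer stays open across dump() calls and needs a matching teardown. A hedged sketch of such a hook; the method name after() is an assumption mirroring before(), not confirmed from the clld source:

    def after(self, req, fp):
        # Hypothetical counterpart to before(): close the manually
        # entered writer context once all rows have been dumped.
        self.writer.__exit__(None, None, None)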