Пример #1
0
    def ods2csv(self, fname, outdir=None):
        """
        Dump the data from an OpenDocument Spreadsheet (suffix .ODS) file to CSV.

        .. note::

            Requires `cldfbench` to be installed with extra "odf".
        """
        if not load_odf:  # pragma: no cover
            raise EnvironmentError(
                'ods2csv is only available when cldfbench is installed with odf support\n'
                'pip install cldfbench[odf]')

        fname = self._path(fname)
        doc = load_odf(fname)
        target = outdir or self

        res = {}
        for sheet in doc.spreadsheet.childNodes:
            # Only <table:table> elements are spreadsheet tables.
            if sheet.qname != (ODF_NS_TABLE, 'table'):
                continue
            name = sheet.attributes[ODF_NS_TABLE, 'name']
            csv_path = target / '{}.{}.csv'.format(fname.stem, slug(name, lowercase=False))
            with dsv.UnicodeWriter(csv_path) as writer:
                writer.writerows(_ods_to_list(sheet))
            res[name] = csv_path
        return res
Пример #2
0
    def xls2csv(self, fname, outdir=None):
        """
        Dump the data from an Excel XLS file to CSV.

        .. note::

            Requires `cldfbench` to be installed with extra "excel".
        """
        if not xlrd:  # pragma: no cover
            raise EnvironmentError(
                'xls2csv is only available when cldfbench is installed with excel support\n'
                'pip install cldfbench[excel]')
        fname = self._path(fname)
        target = outdir or self
        try:
            workbook = xlrd.open_workbook(str(fname))
        except xlrd.biffh.XLRDError as e:
            # xlrd only reads the legacy XLS format; point users at xlsx2csv.
            if 'xlsx' in str(e):
                raise ValueError('To read xlsx files, call xlsx2csv!')
            raise  # pragma: no cover

        res = {}
        for name in workbook.sheet_names():
            sheet = workbook.sheet_by_name(name)
            # Skip empty sheets entirely.
            if not sheet.nrows:
                continue
            csv_path = target.joinpath(
                fname.stem + '.' + slug(name, lowercase=False) + '.csv')
            with dsv.UnicodeWriter(csv_path) as writer:
                writer.writerows(
                    [cell.value for cell in sheet.row(idx)]
                    for idx in range(sheet.nrows))
            res[name] = csv_path
        return res
Пример #3
0
    def xlsx2csv(self, fname, outdir=None):
        """
        Dump the data from an Excel XLSX file to CSV.

        :param fname: Name of the XLSX file (resolved via `self._path`).
        :param outdir: Directory to write the CSV files to; defaults to `self`.
        :return: `dict` mapping sheet names to the paths of the written CSV files.

        .. note::

            Requires `cldfbench` to be installed with extra "excel".
        """
        if not openpyxl:  # pragma: no cover
            raise EnvironmentError(
                'xlsx2csv is only available when cldfbench is installed with excel support\n'
                'pip install cldfbench[excel]')

        def _excel_value(x):
            # Normalize a cell value for CSV output.
            if x is None:
                return ""
            if isinstance(x, float) and int(x) == x:
                # Excel has no integer type, so integers come back as "n.0";
                # normalize those to "n" — but only for whole numbers, so
                # genuine fractions like 1.5 are not truncated.
                return '{0}'.format(int(x))
            return '{0}'.format(x).strip()

        fname = self._path(fname)
        res = {}
        outdir = outdir or self
        wb = openpyxl.load_workbook(str(fname), data_only=True)
        for sname in wb.sheetnames:
            # Subscription access replaces the deprecated (and in current
            # openpyxl removed) Workbook.get_sheet_by_name().
            sheet = wb[sname]
            path = outdir.joinpath(fname.stem + '.' +
                                   slug(sname, lowercase=False) + '.csv')
            with dsv.UnicodeWriter(path) as writer:
                for row in sheet.rows:
                    writer.writerow([_excel_value(col.value) for col in row])
            res[sname] = path
        return res
Пример #4
0
    def rewrite(self, fname, v):
        """
        Rewrite the CSV file `fname` in `raw_dir` in place, filtering/mapping
        each row through the visitor `v`; rows for which `v` returns a falsy
        value are dropped.
        """
        path = self.raw_dir / fname
        rows = list(dsv.reader(path, dicts=True))

        with dsv.UnicodeWriter(path) as writer:
            for index, row in enumerate(rows):
                if not index:
                    # Header comes from the first row's keys.
                    writer.writerow(row.keys())
                transformed = v(row)
                if transformed:
                    writer.writerow(transformed.values())
Пример #5
0
def run(args):
    """
    De-duplicate rows of a conceptlist by the values of column `args.column`
    (later rows win) and write the result as TSV to `args.output`.
    """
    dicts = list(dsv.reader(get_conceptlist(args, path_only=True), delimiter="\t", dicts=True))
    out_dict = collections.OrderedDict()

    # Keyed by the chosen column; a later row with the same key overwrites
    # an earlier one, while insertion order of first occurrence is kept.
    for d in dicts:
        out_dict[d[args.column]] = list(d.values())

    with dsv.UnicodeWriter(args.output, delimiter='\t') as w:
        # Header from the first row read; raises IndexError on an empty list.
        w.writerow(dicts[0].keys())
        w.writerows(out_dict.values())
    if not args.output:
        # NOTE(review): with a falsy output path the writer presumably buffers
        # in memory and read() returns UTF-8 encoded bytes — confirm against
        # the dsv.UnicodeWriter implementation.
        print(w.read().decode('utf8'))
Пример #6
0
 def to_csvfile(self, filename, encoding='utf-8', dialect='excel'):
     """Write a CSV file with one row for each entry in each bibfile."""
     # One row per entry: containing file name, bib key, hash, and id (cast
     # to text so it serializes cleanly). Ordered case-insensitively by file
     # name and bib key, then by hash and id.
     select_rows = sa.select(
         [
             File.name.label('filename'), Entry.bibkey, Entry.hash,
             sa.cast(Entry.id, sa.Text).label('id'),
         ]).select_from(sa.join(File, Entry))\
         .order_by(sa.func.lower(File.name), sa.func.lower(Entry.bibkey), Entry.hash, Entry.id)
     with self.execute(select_rows) as cursor:
         with dsv.UnicodeWriter(filename,
                                encoding=encoding,
                                dialect=dialect) as writer:
             # Header row: the column names of the query result.
             writer.writerow(cursor.keys())
             for row in cursor:
                 writer.writerow(row)
Пример #7
0
def write_tsv(in_, out_, glottocode):
    """
    Convert a spreadsheet or delimited file to TSV, setting the Language_ID
    column of every row to `glottocode`.

    The reader is picked by the input file's suffix; returns the index of the
    last row written (0 if the input yields no rows).
    """
    readers = {
        '.xlsx': iter_xlsx,
        '.xls': iter_xls,
        '.csv': iter_csv,
        '.tsv': iter_tsv,
    }
    rows = list(readers[in_.suffix](in_))

    index = 0
    with dsv.UnicodeWriter(out_, delimiter='\t') as writer:
        for index, row in enumerate(rows):
            if index == 0:
                writer.writerow(list(row.keys()))
            row['Language_ID'] = glottocode
            writer.writerow(list(row.values()))
    return index
Пример #8
0
    def to_csvfile(self,
                   filename,
                   *,
                   dialect: str = 'excel',
                   encoding: str = ENCODING):
        """Write a CSV file with one row for each entry in each .bib file."""
        # One row per entry: containing file name, bib key, hash, and id
        # (cast to text so it serializes cleanly). Ordered case-insensitively
        # by file name and bib key, then by hash (referenced by its label
        # string) and id.
        select_rows = (sa.select(
            File.name.label('filename'), Entry.bibkey, Entry.hash,
            sa.cast(Entry.id, sa.Text).label('id')).join_from(
                File, Entry).order_by(sa.func.lower(File.name),
                                      sa.func.lower(Entry.bibkey), 'hash',
                                      Entry.id))

        with self.execute(select_rows) as result,\
             dsv.UnicodeWriter(filename, encoding=encoding, dialect=dialect) as writer:
            # Header row first, then all result rows.
            header = list(result.keys())
            writer.writerow(header)
            writer.writerows(result)
Пример #9
0
 def xls2csv(self, fname, outdir=None):
     """
     Dump each non-empty sheet of an Excel XLS file to a CSV file, returning
     a `dict` mapping sheet names to the written paths.
     """
     if not xlrd:  # pragma: no cover
         raise EnvironmentError(
             'xls2csv is only available when cldfbench is installed with excel support\n'
             'pip install cldfbench[excel]')
     fname = self._path(fname)
     target = outdir or self
     workbook = xlrd.open_workbook(str(fname))

     res = {}
     for name in workbook.sheet_names():
         sheet = workbook.sheet_by_name(name)
         # Empty sheets are skipped.
         if not sheet.nrows:
             continue
         csv_path = target.joinpath(
             fname.stem + '.' + slug(name, lowercase=False) + '.csv')
         with dsv.UnicodeWriter(csv_path) as writer:
             writer.writerows(
                 [cell.value for cell in sheet.row(idx)]
                 for idx in range(sheet.nrows))
         res[name] = csv_path
     return res
Пример #10
0
def run(args):
    """
    Rewrite each sheet file in `args.path` in place, keeping only the columns
    that carry a header label and dropping completely empty rows.

    :raises ValueError: if an unlabelled column contains data.
    """
    for p in args.path:
        # use reader rather than iterrows so we operate on raw file rather than a
        # grambank-ifyed version.
        rows = list(Sheet(p)._reader())
        not_empty = None
        with dsv.UnicodeWriter(p, delimiter='\t', encoding='utf8') as w:
            for lineno, row in enumerate(rows):
                if lineno == 0:
                    # Indices of columns with a non-empty header label.
                    not_empty = [col for col, label in enumerate(row) if label]
                if set(row) == {''}:
                    # Drop completely empty rows.
                    continue
                # Check that unlabelled columns carry no data. The inner loop
                # variable no longer shadows the row index, so the message
                # reports the actual line number (the original reported the
                # column index as "line").
                for col, cell in enumerate(row):
                    if col not in not_empty and cell:  # pragma: no cover
                        raise ValueError(
                            "Unlabelled column has value on line %d. Fix manually!"
                            % lineno)
                w.writerow([row[col] for col in not_empty])
    return
Пример #11
0
    def dump(self):
        """
        Dump the data from every sheet of COMBINED.xlsx to a per-sheet CSV
        file in `self.repos`.

        :return: `dict` mapping sheet names to the paths of the written CSV files.
        """
        def _excel_value(x):
            # Normalize a cell value for CSV output.
            if x is None:
                return ""
            if isinstance(x, float) and int(x) == x:
                # Excel has no integer type, so integers come back as "n.0";
                # normalize those to "n" — but only for whole numbers, so
                # genuine fractions like 1.5 are not truncated.
                return '{0}'.format(int(x))
            return '{0}'.format(x).strip()

        res = {}
        outdir = self.repos
        wb = openpyxl.load_workbook(str(self.path('COMBINED.xlsx')),
                                    data_only=True)
        for sname in wb.sheetnames:
            sheet = wb[sname]
            path = outdir.joinpath('data.' + slug(sname, lowercase=False) +
                                   '.csv')
            with dsv.UnicodeWriter(path) as writer:
                for row in sheet.rows:
                    writer.writerow([_excel_value(col.value) for col in row])
            res[sname] = path
        return res
Пример #12
0
    def visit(self, row_visitor=None):
        """
        Apply `row_visitor` to all rows in a sheet, rewriting the sheet file
        in place.

        :param row_visitor: Callable applied to each row; a falsy return value \
        drops the row from the rewritten sheet. Defaults to the identity.
        :return: Pair of `int`s specifying the number of rows read and written.
        """
        if row_visitor is None:
            row_visitor = lambda r: r  # noqa: E731
        rows = list(self.iterrows())
        count = 0
        with dsv.UnicodeWriter(self.path, delimiter='\t',
                               encoding='utf8') as w:
            for i, row in enumerate(rows):
                if i == 0:
                    # Header from the first row's keys.
                    w.writerow(list(row.keys()))
                res = row_visitor(row)
                if res:
                    # NOTE(review): `row` (not `res`) is written — presumably
                    # visitors mutate `row` in place and the return value is
                    # only a keep/drop flag; confirm before "fixing".
                    w.writerow(list(row.values()))
                    count += 1
        # Make sure calling iterrows again will re-read from disk:
        self._rows = None
        return (len(rows), count)
Пример #13
0
    def xlsx2csv(self, fname, outdir=None):
        """
        Dump the data from an Excel XLSX file to CSV.

        .. note::

            Requires `cldfbench` to be installed with extra "excel".
        """
        if not openpyxl:  # pragma: no cover
            raise EnvironmentError(
                'xlsx2csv is only available when cldfbench is installed with excel support\n'
                'pip install cldfbench[excel]')

        def _excel_value(cell_value):
            if cell_value is None:
                return ""
            if isinstance(cell_value, float) and int(cell_value) == cell_value:
                # Since Excel does not have an integer type, integers are rendered as "n.0",
                # which in turn confuses type detection of tools like csvkit. Thus, we normalize
                # numbers of the form "n.0" to "n".
                return '{0}'.format(int(cell_value))  # pragma: no cover
            return '{0}'.format(cell_value).strip()

        fname = self._path(fname)
        target = outdir or self
        workbook = openpyxl.load_workbook(str(fname), data_only=True)

        res = {}
        for name in workbook.sheetnames:
            csv_path = target.joinpath(
                fname.stem + '.' + slug(name, lowercase=False) + '.csv')
            with dsv.UnicodeWriter(csv_path) as writer:
                writer.writerows(
                    [_excel_value(cell.value) for cell in row]
                    for row in workbook[name].rows)
            res[name] = csv_path
        return res
Пример #14
0
    def cmd_create_ref_etc_files(self, args):
        """
        Helper command to generate raw/concepts.csv and raw/languages.csv out
        of the JSON data file, which can be used to detect changes for the
        files etc/concepts.csv and etc/languages.csv.
        """
        # Load JSON data
        json_data = self.raw_dir.read_json(self.data_file_name)

        # Map language index -> long name; used below only when it differs
        # from the short name.
        longnames = {
            rl['LanguageIx']:
            rl['RegionGpMemberLgNameLongInThisSubFamilyWebsite'].strip()
            for rl in json_data['regionLanguages']
        }

        # Create raw/languages.csv for usage as etc/languages.csv
        fname = self.raw_dir / 'languages.csv'
        # Cache Glottolog lookups per glottocode to avoid repeated queries.
        seen_codes = {}
        with dsv.UnicodeWriter(fname) as f:
            f.writerow([
                'ID', 'Name', 'LongName', 'Glottocode', 'Glottolog_Name',
                'ISO639P3code', 'Macroarea', 'Latitude', 'Longitude', 'Family',
                'IndexInSource'
            ])
            for language in sorted(json_data['languages'],
                                   key=lambda k: int(k['LanguageIx'])):

                # Build ID
                lang_id = slug(language['ShortName']).capitalize()

                language['GlottoCode'] = language['GlottoCode'].strip()\
                    if language['GlottoCode'] else ''
                # add to language map
                if language['GlottoCode'] in seen_codes:
                    gldata = seen_codes[language['GlottoCode']]
                else:
                    gldata = args.glottolog.api.languoid(
                        language['GlottoCode'])
                    seen_codes[language['GlottoCode']] = gldata

                f.writerow([
                    lang_id,
                    language['ShortName'].strip(),
                    # LongName only if it differs from the short name.
                    longnames[language['LanguageIx']]
                    if longnames[language['LanguageIx']] !=
                    language['ShortName'].strip() else '',
                    language['GlottoCode'],
                    gldata.name if gldata else '',
                    language['ISOCode'].strip(),
                    gldata.macroareas[0].name
                    if gldata and gldata.macroareas else '',
                    language['Latitude'].strip()
                    if language['Latitude'] else '',
                    # 'Longtitude' is the (misspelled) key used by the source
                    # JSON — do not "fix" it here.
                    language['Longtitude'].strip()
                    if language['Longtitude'] else '',
                    gldata.family.name if gldata and gldata.family else '',
                    language['LanguageIx'].strip(),
                ])

        # Create raw/concepts.csv to compare it against etc/concepts.csv
        fname = self.raw_dir / 'concepts.csv'
        with dsv.UnicodeWriter(fname) as f:
            # Header gains an extra gloss column if a second gloss language
            # is configured.
            if self.second_gloss_lang is None:
                f.writerow([
                    'ID', 'Name', 'Concepticon_ID', 'Concepticon_Gloss',
                    'IndexInSource'
                ])
            else:
                f.writerow([
                    'ID', 'Name', 'Concepticon_ID', 'Concepticon_Gloss',
                    '{0}_Gloss'.format(self.second_gloss_lang), 'IndexInSource'
                ])
            for c_idx, concept in enumerate(
                    sorted(json_data['words'],
                           key=lambda k: (int(k['IxElicitation']),
                                          int(k['IxMorphologicalInstance'])))):
                # Build ID
                concept_id = '%i_%s' % (c_idx,
                                        slug(concept['FullRfcModernLg01']))

                # Unmapped concepts are reported with int(ID)<1 in source
                if int(concept['StudyDefaultConcepticonID']) > 0:
                    concepticon_id = concept['StudyDefaultConcepticonID']
                    co_gloss = args.concepticon.api.conceptsets[
                        concepticon_id].gloss
                else:
                    concepticon_id = None
                    co_gloss = ''
                if self.second_gloss_lang is None:
                    f.writerow([
                        concept_id,
                        concept['FullRfcModernLg01'],
                        concepticon_id,
                        co_gloss,
                        '%s-%s' % (concept['IxElicitation'],
                                   concept['IxMorphologicalInstance']),
                    ])
                else:
                    f.writerow([
                        concept_id,
                        concept['FullRfcModernLg01'],
                        concepticon_id,
                        co_gloss,
                        concept['FullRfcModernLg02'],
                        '%s-%s' % (concept['IxElicitation'],
                                   concept['IxMorphologicalInstance']),
                    ])
Пример #15
0
 def write_csv(self, fname, rows, **kw):
     """Write `rows` to the CSV file `fname`, passing `kw` on to the writer."""
     writer = dsv.UnicodeWriter(self._path(fname), **kw)
     with writer:
         writer.writerows(rows)