Example #1
File: test_dsv.py Project: Anaphory/csvw
def test_rewrite(tmpdir,
                 tsvname=str(TESTDIR / 'tsv.txt'),
                 csvname=str(TESTDIR / 'csv.txt')):
    filename = str(tmpdir / 'test.txt')
    shutil.copy(tsvname, filename)
    rewrite(filename, lambda i, row: [len(row)], delimiter='\t')
    assert next(iterrows(filename)) == ['2']

    shutil.copy(csvname, filename)
    rewrite(filename, lambda i, row: row)
    assert list(iterrows(filename)) == list(iterrows(csvname))
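
The visitor passed to rewrite receives the row index and the parsed row, and whatever it returns is written back in place of that row (the first call above collapses each row to its cell count). A minimal self-contained sketch, assuming only the csvw.dsv names already used in these tests:

import pathlib
import tempfile

from csvw.dsv import iterrows, rewrite

def upper_cells(i, row):
    # Keep the header row, uppercase every data cell.
    return row if i == 0 else [cell.upper() for cell in row]

with tempfile.TemporaryDirectory() as tmp:
    path = str(pathlib.Path(tmp) / 'demo.csv')
    pathlib.Path(path).write_text('h1,h2\na,b\n', encoding='utf8')
    rewrite(path, upper_cells)
    assert list(iterrows(path)) == [['h1', 'h2'], ['A', 'B']]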
Example #2
File: test_dsv.py Project: Anaphory/csvw
def test_iterrows_dialect(
        lines=['1,x,y', ' #1,a,b', '#1,1,2', ',,', '1,3, 4\t ']):
    dialect = Dialect(trim=True, skipRows=1, skipColumns=1, skipBlankRows=True)
    r = list(iterrows(lines, dialect=dialect))
    # make sure comment lines are stripped:
    assert len(r) == 2
    # make sure cells are trimmed:
    assert r[1][1] == '4'

    r = list(
        iterrows(lines, dialect=dialect.updated(skipRows=0, skipColumns=0)))
    assert r[2][2] == '4'
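
How the Dialect options combine, as a sketch; it assumes Dialect is importable from the csvw package as in these tests, and that the CSVW default commentPrefix '#' applies:

from csvw import Dialect
from csvw.dsv import iterrows

dialect = Dialect(trim=True, skipRows=1, skipColumns=1, skipBlankRows=True)
lines = ['skipped header', '#a comment', 'x, padded ,2', ',,']
# skipRows drops the first line, commentPrefix drops the comment,
# skipBlankRows drops ',,', skipColumns drops 'x', and trim strips
# the whitespace around ' padded '.
assert list(iterrows(lines, dialect=dialect)) == [['padded', '2']]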
Example #3
File: test_dsv.py Project: Anaphory/csvw
def test_add_delete_rows(tmpdir):
    filename = str(tmpdir / 'test.csv')
    add_rows(filename, ['a', 'b'], [1, 2], [3, 4])
    assert len(list(iterrows(filename, dicts=True))) == 2

    filter_rows_as_dict(filename, lambda item: item['a'] == '1')
    assert len(list(iterrows(filename, dicts=True))) == 1

    add_rows(filename, [2, 2], [2, 4])
    assert len(list(iterrows(filename, dicts=True))) == 3

    nremoved = filter_rows_as_dict(filename, lambda item: item['a'] == '1')
    assert nremoved == 2
Example #4
File: test_dsv.py Project: Anaphory/csvw
def test_roundtrip_with_keyword_dialect(tmpdir,
                                        rows=[['1', 'y'], ['  "1 ', '3\t4']],
                                        dialect='excel'):
    filename = str(tmpdir / 'test.csv')
    with UnicodeWriter(filename, dialect=dialect) as w:
        w.writerows(rows)
    assert list(iterrows(filename, dialect=dialect)) == rows
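
Besides Dialect objects, the dialect keyword also accepts a dialect name registered with the stdlib csv module, as 'excel' above. A sketch with 'excel-tab', assuming string dialects are simply passed through to csv:

import csv
import pathlib
import tempfile

from csvw.dsv import UnicodeWriter, iterrows

assert 'excel-tab' in csv.list_dialects()

with tempfile.TemporaryDirectory() as tmp:
    path = str(pathlib.Path(tmp) / 'demo.tsv')
    with UnicodeWriter(path, dialect='excel-tab') as w:
        w.writerows([['a', 'b'], ['1', '2']])
    assert list(iterrows(path, dialect='excel-tab')) == [['a', 'b'], ['1', '2']]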
Example #5
def __init__(self, name_and_date, fp):
    parts = name_and_date.split('_')
    # The last underscore-separated part is a date stamp, e.g. '20200515'.
    digits = map(int, DATESTAMP_PATTERN.match(parts[-1]).groups())
    self.date = datetime.date(*digits)
    # Everything before the date stamp is the table name.
    name = '_'.join(parts[:-1])
    if name.startswith(('_', '-')):
        name = name[1:]
    if not name:
        name = 'Codes'
    self.name = name
    super(Table, self).__init__(
        iterrows(fp.splitlines(), dicts=True, delimiter='\t'))
Example #6
File: test_dsv.py Project: Anaphory/csvw
def test_iterrows_restkey(lines=['a,b', '1,2,3,4', '1']):
    result = iterrows(lines,
                      dicts=True,
                      restkey='x',
                      restval='y',
                      delimiter=',')
    assert list(result) == [{
        'a': '1',
        'b': '2',
        'x': ['3', '4']
    }, {
        'a': '1',
        'b': 'y'
    }]
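
restkey and restval follow the stdlib csv.DictReader convention; the equivalent stdlib call makes the two cases explicit:

import csv
import io

rows = list(csv.DictReader(io.StringIO('a,b\n1,2,3,4\n1\n'),
                           restkey='x', restval='y'))
assert rows[0]['x'] == ['3', '4']  # surplus cells are collected under restkey
assert rows[1]['b'] == 'y'         # missing cells are filled with restval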
Example #7
def __init__(self, name_and_date, date, fp):
    parts = name_and_date.split('_')
    # The ISO 639-3 code tables from 2020-05-15 contain a table with a
    # malformed name - having an excess "0" in the date stamp.
    if parts[-1] == '202000515':  # pragma: no cover
        date = '20200515'
    digits = map(int, DATESTAMP_PATTERN.match(date).groups())
    self.date = datetime.date(*digits)
    name = '_'.join([p for p in parts if not DATESTAMP_PATTERN.match(p)])
    if name.startswith(('_', '-')):
        name = name[1:]
    if not name:
        name = 'Codes'
    self.name = name
    super(Table, self).__init__(iterrows(
        [line for line in fp.splitlines() if line.strip()],  # skip blank (malformed) lines
        dicts=True,
        delimiter='\t'))
Example #8
    @classmethod
    def from_data(cls, fname):
        fname = Path(fname)
        colnames = next(iterrows(fname), [])
        if not colnames:
            raise ValueError('empty data file!')
        if cls is Dataset:
            try:
                cls = next(mod.cls for mod in get_modules() if mod.match(fname))
            except StopIteration:
                raise ValueError(fname)
            assert issubclass(cls, Dataset) and cls is not Dataset

        res = cls.from_metadata(fname.parent)
        required_cols = {
            c.name for c in res[res.primary_table].tableSchema.columns
            if c.required}
        if not required_cols.issubset(colnames):
            raise ValueError('missing columns: %r' % sorted(required_cols.difference(colnames)))
        return res
Example #9
    @classmethod
    def from_data(cls, fname):
        fname = Path(fname)
        colnames = next(iterrows(fname), [])
        if not colnames:
            raise ValueError('empty data file!')
        if cls is Dataset:
            try:
                cls = next(mod.cls for mod in get_modules() if mod.match(fname))
            except StopIteration:
                raise ValueError('{0} does not match a CLDF module spec'.format(fname))
            assert issubclass(cls, Dataset) and cls is not Dataset

        res = cls.from_metadata(fname.parent)
        required_cols = {
            c.name for c in res[res.primary_table].tableSchema.columns
            if c.required}
        if not required_cols.issubset(colnames):
            raise ValueError('missing columns: %r' % sorted(required_cols.difference(colnames)))
        return res
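
The cls resolution in from_data is a next()-over-generator dispatch: the first registered module whose match() accepts the file wins, and StopIteration is turned into a ValueError. A stripped-down sketch with hypothetical stand-ins for get_modules() and the module classes:

import pathlib

class Dataset:
    pass

class Wordlist(Dataset):
    @staticmethod
    def match(fname):
        return fname.name == 'forms.csv'

MODULES = [Wordlist]  # stand-in for get_modules()

def resolve(fname):
    try:
        return next(cls for cls in MODULES if cls.match(fname))
    except StopIteration:
        raise ValueError('{0} does not match a module spec'.format(fname))

assert resolve(pathlib.Path('forms.csv')) is Wordlist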
Example #10
File: test_dsv.py Project: Anaphory/csvw
def test_iterrows(rows=[['first', 'line'], ['s\u00fccond',
                                            'l\u00e4ne\u00df']]):
    assert list(iterrows(TESTDIR / 'csv.txt')) == rows

    lines = ['\t'.join(r) for r in rows]
    assert list(iterrows(lines, delimiter='\t')) == rows

    for lt in ['\n', '\r\n', '\r']:
        # Simulate file opened in text mode:
        fp = io.StringIO(lt.join(lines), newline='')
        assert list(iterrows(fp, delimiter='\t')) == rows

    assert list(iterrows(lines, dicts=True,
                         delimiter='\t')) == [OrderedDict(zip(*rows))]

    r = list(iterrows(lines, namedtuples=True, delimiter='\t'))
    assert len(r) == 1 and r[0].first == 's\u00fccond'

    r = list(iterrows([l.replace('\t', ',') for l in lines], namedtuples=True))
    assert len(r) == 1 and r[0].first == 's\u00fccond'
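
In namedtuples mode the header row supplies the attribute names (the a_name attribute in Example #11 below suggests non-identifier headers are normalized); a minimal sketch with plain identifiers:

from csvw.dsv import iterrows

rows = list(iterrows(['x,y', '1,2'], namedtuples=True))
assert rows[0].x == '1' and rows[0].y == '2'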
Example #11
File: test_dsv.py Project: Anaphory/csvw
def test_iterrows_tsv(filename=str(TESTDIR / 'tsv.txt')):
    res = list(iterrows(filename, namedtuples=True, delimiter='\t'))
    assert res[0].a_name == 'b'
    # Missing column values should be set to None:
    assert res[2].a_name is None
Example #12
File: test_dsv.py Project: Anaphory/csvw
def test_iterrows_empty():
    assert list(iterrows([], dicts=True, delimiter='\t')) == []
    assert list(iterrows([''], dicts=True, fieldnames=['a', 'b'], delimiter='\t')) == []
    assert list(iterrows(['a,b', ''], dicts=True, delimiter='\t')) == []
Example #13
File: test_dsv.py Project: Anaphory/csvw
# Parametrized test: the pytest.mark.parametrize decorator supplying
# dialect, lines, and expected is not shown in this excerpt.
def test_iterrows_quote_comment(dialect, lines, expected):
    assert list(iterrows(lines, dialect=dialect)) == expected
Example #14
def add_metadata(fname: Path, logger: cli.logging.Logger = cli.logger):
    if fname.name != "forms.csv":
        cli.Exit.CLI_ARGUMENT_ERROR(
            "A metadata-free Wordlist must be in a file called 'forms.csv'.")
    default_wordlist = TableGroup.from_file(
        pycldf.util.pkg_path("modules", "Wordlist-metadata.json"))
    default_wordlist._fname = fname.with_name("Wordlist-metadata.json")
    ds = pycldf.Wordlist(default_wordlist)

    # `from_data` checks that the required columns of the FormTable are present,
    # but it does not consolidate the columns further.

    colnames = next(iterrows(fname))

    understood_colnames = {
        c.name
        for c in ds[ds.primary_table].tableSchema.columns if c.name in colnames
    }
    more_columns = {
        c.propertyUrl.uri: c
        for c in ds[ds.primary_table].tableSchema.columns
        if c.name not in understood_colnames
    }
    logger.info(
        "CLDF freely understood the columns %s in your forms.csv.",
        sorted(understood_colnames),
    )

    # Consider the columns that were not understood.
    columns_without_metadata = set(colnames) - understood_colnames
    for column_name in columns_without_metadata:
        column: Column
        # Maybe they are known CLDF properties?
        if column_name in pycldf.terms.TERMS:
            column = pycldf.terms.TERMS[column_name].to_column()
        # Maybe they are CLDF default column names?
        elif column_name in DEFAULT_NAME_COLUMNS:
            column = DEFAULT_NAME_COLUMNS[column_name]
        # Maybe they are columns that Lexedata knows to handle?
        elif column_name in LEXEDATA_COLUMNS:
            column = LEXEDATA_COLUMNS[column_name]
        # Maybe they are columns inherited from LingPy?
        elif column_name.upper() in LINGPY_COLUMNS:
            column = LINGPY_COLUMNS[column_name.upper()]
        # Maybe they are some name we have seen before?
        elif column_name in OTHER_KNOWN_COLUMNS:
            column = OTHER_KNOWN_COLUMNS[column_name]
        else:
            # TODO: Maybe they look like they have a specific type?
            ...
            # Otherwise, they are probably just text to be kept.
            column = Column(
                datatype=Datatype(base="string"),
                default="",
                null=[""],
                name=column_name,
            )
        column.name = column_name

        ds[ds.primary_table].tableSchema.columns.append(column)
        summary = column.propertyUrl or column.datatype
        logger.info(f"Column {column_name} seems to be a {summary} column.")
        if column.propertyUrl:
            to_be_replaced = more_columns.pop(column.propertyUrl.uri, None)
            if to_be_replaced is not None:
                ds[ds.primary_table].tableSchema.columns.remove(to_be_replaced)

    for column in more_columns.values():
        logger.info(
            f"Also added column {column.name}, as expected for a FormTable.")

    ds[ds.primary_table].tableSchema.columns.sort(
        key=lambda k: colnames.index(k.name) if k.name in colnames else 1e10)

    # TODO: Once lexedata is properly published, we can give a better URL.
    ds.properties["dc:contributor"] = [
        "https://github.com/Anaphory/lexedata/blob/master/src/lexedata/edit/add_metadata.py"
    ]
    return ds
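
A sketch of driving add_metadata and persisting the result; it assumes pycldf's Dataset.write_metadata() honors the _fname set on the TableGroup above:

from pathlib import Path

ds = add_metadata(Path('forms.csv'))
ds.write_metadata()  # assumed to write Wordlist-metadata.json next to forms.csv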
Example #15
File: test_dsv.py Project: Anaphory/csvw
def test_iterrows_invalid():
    with pytest.raises(ValueError, match=r'either namedtuples or dicts'):
        next(iterrows([], namedtuples=True, dicts=True))
Example #16
def _read(self, what):
    return iterrows(self.raw_dir / "{0}.tsv".format(what),
                    dicts=True,
                    delimiter="\t")
Example #17
File: test_dsv.py Project: Anaphory/csvw
def test_iterrows_with_bom(tmpdir):
    filepath = tmpdir / 'spam.csv'
    filepath.write_text('\ufeffcol1,col2\nval1,val2', encoding='utf8')
    rows = list(iterrows(str(filepath)))
    assert rows[0] == ['col1', 'col2']
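
iterrows strips a UTF-8 byte order mark before parsing, which is what the assertion above relies on; with the bare stdlib you would need the utf-8-sig codec to get the same clean header:

import csv
import io

raw = '\ufeffcol1,col2\nval1,val2'
# Plain utf-8 leaks the BOM into the first header cell:
assert next(csv.reader(io.StringIO(raw)))[0] == '\ufeffcol1'
# utf-8-sig (or iterrows, as above) strips it:
cleaned = raw.encode('utf-8').decode('utf-8-sig')
assert next(csv.reader(io.StringIO(cleaned)))[0] == 'col1'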