예제 #1
0
def _dataconvert(args):
    """Convert the data at ``args.inpath`` and write it to ``args.outpath``.

    ``args`` is an object carrying the attributes read below: ``inpath``,
    ``outpath``, ``format``, ``guess_types``, ``sheet``, ``encoding`` and
    ``records``. Note that ``args.guess_types`` is overwritten with ``True``
    when the output type is ARFF.

    Raises:
        ValueError: if the input type cannot be read or the output type
            cannot be written.
    """
    # What is the type of input file? An explicit format wins over guessing.
    intype = args.format if args.format else guess_type(args.inpath)

    # What is the type of output file?
    outtype = guess_type(args.outpath)

    # If outtype is ARFF then we need to guess field-types.
    # Thus we overwrite the args.guess_types to True.
    if outtype == arff.MIMETYPE:
        args.guess_types = True

    if is_url_path(args.inpath):
        instream = urllib2.urlopen(args.inpath)
    else:
        instream = open(args.inpath)

    try:
        if intype in ['text/csv', 'csv']:
            records, metadata = dcsv.parse(
                instream, guess_types=args.guess_types)
        elif intype in [
                'application/vnd.ms-excel',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                'xls'
        ]:
            import dataconverters.xls
            # Only the OpenXML MIME type means xlsx; both the legacy MIME
            # type and the short 'xls' format name select the xls reader.
            is_xlsx = (
                intype ==
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
            excel_type = 'xlsx' if is_xlsx else 'xls'
            records, metadata = dataconverters.xls.parse(
                instream,
                excel_type=excel_type,
                sheet=args.sheet,
                guess_types=args.guess_types,
                encoding=args.encoding)
        else:
            raise ValueError(
                'No support for reading file type %s - support for csv or xls only at present'
                % intype)

        # Pick the writer before opening the output so that an unsupported
        # output type does not leave an empty/truncated file behind.
        if outtype == 'text/csv':
            write = dcsv.write
        elif outtype == 'application/json':
            import dataconverters.jsondata as js
            write = js.write
        elif outtype == arff.MIMETYPE:
            write = arff.write
        else:
            raise ValueError('Only support writing to csv and json at present')

        if args.records:
            records = itertools.islice(records, int(args.records))

        # An outpath such as "_.csv" means "write to stdout".
        to_stdout = args.outpath.startswith('_.')
        outstream = sys.stdout if to_stdout else open(args.outpath, 'w')
        try:
            write(outstream, records, metadata)
        finally:
            # Never close stdout; only close files opened here.
            if not to_stdout:
                outstream.close()
    finally:
        # The records are consumed by the writer above, so the input can
        # be released only once writing has finished (or failed).
        instream.close()
예제 #2
0
def _dataconvert(args):
    """Convert the data at ``args.inpath`` and write it to ``args.outpath``.

    ``args`` is an object carrying the attributes read below: ``inpath``,
    ``outpath``, ``format``, ``guess_types``, ``sheet``, ``encoding`` and
    ``records``. Note that ``args.guess_types`` is overwritten with ``True``
    when the output type is ARFF.

    Raises:
        ValueError: if the input type cannot be read or the output type
            cannot be written.
    """
    # What is the type of input file? An explicit format wins over guessing.
    intype = args.format if args.format else guess_type(args.inpath)

    # What is the type of output file?
    outtype = guess_type(args.outpath)

    # If outtype is ARFF then we need to guess field-types.
    # Thus we overwrite the args.guess_types to True.
    if outtype == arff.MIMETYPE:
        args.guess_types = True

    if is_url_path(args.inpath):
        instream = urllib2.urlopen(args.inpath)
    else:
        instream = open(args.inpath)

    try:
        # NOTE(review): TSV input is handed to the CSV parser without an
        # explicit delimiter - confirm dcsv.parse copes with tabs.
        tsv_types = ['tsv', 'text/tsv', 'text/tab-separated-values']
        if intype in ['text/csv', 'csv'] + tsv_types:
            records, metadata = dcsv.parse(
                instream, guess_types=args.guess_types)
        elif intype in [
                'application/vnd.ms-excel',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
                'xls'
        ]:
            import dataconverters.xls
            # Only the OpenXML MIME type means xlsx; both the legacy MIME
            # type and the short 'xls' format name select the xls reader.
            is_xlsx = (
                intype ==
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
            excel_type = 'xlsx' if is_xlsx else 'xls'
            records, metadata = dataconverters.xls.parse(
                instream,
                excel_type=excel_type,
                sheet=args.sheet,
                guess_types=args.guess_types,
                encoding=args.encoding)
        else:
            raise ValueError(
                'No support for reading file type %s - support for csv or xls only at present' % intype)

        # Pick the writer before opening the output so that an unsupported
        # output type does not leave an empty/truncated file behind.
        if outtype == 'text/csv':
            write = dcsv.write
        elif outtype == 'application/json':
            import dataconverters.jsondata as js
            write = js.write
        elif outtype == arff.MIMETYPE:
            write = arff.write
        else:
            raise ValueError('Only support writing to csv and json at present')

        if args.records:
            records = itertools.islice(records, int(args.records))

        # An outpath such as "_.csv" means "write to stdout".
        to_stdout = args.outpath.startswith('_.')
        outstream = sys.stdout if to_stdout else open(args.outpath, 'w')
        try:
            write(outstream, records, metadata)
        finally:
            # Never close stdout; only close files opened here.
            if not to_stdout:
                outstream.close()
    finally:
        # The records are consumed by the writer above, so the input can
        # be released only once writing has finished (or failed).
        instream.close()
예제 #3
0
def test_csv_from_ressource():
    """Download a CSV file from a remote resource and write it out as JSON.

    The CSV is parsed with type guessing enabled and the result is written
    to ``some_json.json`` in the current working directory. Requires
    network access to the URL below.
    """
    url = "https://ckannet-storage.commondatastorage.googleapis.com/2013-05-02T185247/Valeurs_ajoutees_par_branches_dactivites_aux_prix_constants_de_1999_en_milliards_FCFA).csv"
    instream = urllib.urlopen(url)
    try:
        records, metadata = dcsv.parse(instream, guess_types=True)
        # The context manager guarantees the JSON file is flushed and closed
        # even if writing fails part-way through.
        with open("some_json.json", 'w') as outstream:
            js.write(outstream, records, metadata)
    finally:
        # Release the HTTP response once the records have been written.
        instream.close()
예제 #4
0
def main():
    """Command-line entry point: convert a data file between formats.

    Parses ``inpath``, ``outpath``, ``--no-guess-types``, ``--sheet`` and
    ``--records`` from the command line, reads the input (local path or
    URL) and writes it to ``outpath`` (or to stdout when ``outpath`` starts
    with ``_.``).

    If the output type is unsupported a message is printed and the function
    returns without creating the output file.

    Raises:
        ValueError: if the input file type is not supported.
    """
    parser = argparse.ArgumentParser(
        description=\
'''Convert data between formats. Supported formats:

    Input:  csv, tsv, excel (xls, xlsx).
    Output: csv, json

Examples
========

dataconvert https://github.com/okfn/dataconverters/raw/master/testdata/xls/simple.xls out.csv

Help
====
''',
        epilog=\
'''Copyright Open Knowledge Foundation 2007-2013. Licensed under the MIT license.
Part of the DataConverters project: https://github.com/okfn/dataconverters''',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument('inpath', metavar='inpath', type=str,
                        help='in file path or url')
    parser.add_argument('outpath', metavar='outpath', type=str,
                        help='out file path to write to (use underscore "_" as filename to indicate stdout e.g. _.csv or _.json)')
    parser.add_argument('--no-guess-types', dest='guess_types',
                        action='store_false',
                        help='''Disable type-guessing (where it is used e.g. with CSVs). Type guessing may significantly affect performance''',
                        default=True)
    parser.add_argument('--sheet', metavar='NUM',
                        help='''Index of sheet in spreadsheet to convert (index starts at 1)''',
                        default=1)
    parser.add_argument('--records', metavar='NUM',
                        help='''Only convert a maximum of NUM records''')

    args = parser.parse_args()
    intype = guess_type(args.inpath)
    outtype = guess_type(args.outpath)

    if is_url_path(args.inpath):
        instream = urllib2.urlopen(args.inpath)
    else:
        instream = open(args.inpath)

    try:
        if intype == 'text/csv':
            records, metadata = dcsv.parse(
                instream, guess_types=args.guess_types)
        elif intype in [
                'application/vnd.ms-excel',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        ]:
            import dataconverters.xls
            excel_type = 'xls' if intype == 'application/vnd.ms-excel' else 'xlsx'
            records, metadata = dataconverters.xls.parse(
                instream,
                excel_type=excel_type,
                sheet=args.sheet,
                guess_types=args.guess_types)
        else:
            raise ValueError(
                'No support for reading file type %s - support for csv or xls only at present' % intype)

        # Pick the writer before opening the output so that an unsupported
        # output type does not leave an empty/truncated file behind.
        if outtype == 'text/csv':
            write = dcsv.write
        elif outtype == 'application/json':
            import dataconverters.jsondata as js
            write = js.write
        else:
            # Single-argument call form prints the same text on Python 2
            # and also parses on Python 3.
            print('Only support writing to csv and json at present')
            return

        if args.records:
            records = itertools.islice(records, int(args.records))

        # An outpath such as "_.csv" means "write to stdout".
        to_stdout = args.outpath.startswith('_.')
        outstream = sys.stdout if to_stdout else open(args.outpath, 'w')
        try:
            write(outstream, records, metadata)
        finally:
            # Never close stdout; only close files opened here.
            if not to_stdout:
                outstream.close()
    finally:
        # The records are consumed by the writer above, so the input can
        # be released only once writing has finished (or failed).
        instream.close()
예제 #5
0
def main():
    """Command-line entry point: convert a data file between formats.

    Parses ``inpath``, ``outpath``, ``--no-guess-types``, ``--sheet`` and
    ``--records`` from the command line, reads the input (local path or
    URL) and writes it to ``outpath`` (or to stdout when ``outpath`` starts
    with ``_.``). All records are converted unless ``--records NUM`` is
    given.

    If the output type is unsupported a message is printed and the function
    returns without creating the output file.

    Raises:
        ValueError: if the input file type is not supported.
    """
    parser = argparse.ArgumentParser(
        description=\
'''Convert data between formats. Supported formats:

    Input:  csv, tsv, excel (xls, xlsx).
    Output: csv, json

Examples
========

dataconvert https://github.com/okfn/dataconverters/raw/master/testdata/xls/simple.xls out.csv

Help
====
''',
        epilog=\
'''Copyright Open Knowledge Foundation 2007-2013. Licensed under the MIT license.
Part of the DataConverters project: https://github.com/okfn/dataconverters''',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument('inpath', metavar='inpath', type=str,
                        help='in file path or url')
    parser.add_argument('outpath', metavar='outpath', type=str,
                        help='out file path to write to (use underscore "_" as filename to indicate stdout e.g. _.csv or _.json)')
    parser.add_argument('--no-guess-types', dest='guess_types',
                        action='store_false',
                        help='''Disable type-guessing (where it is used e.g. with CSVs). Type guessing may significantly affect performance''',
                        default=True)
    parser.add_argument('--sheet', metavar='NUM',
                        help='''Index of sheet in spreadsheet to convert (index starts at 1)''',
                        default=1)
    # No default here: a default of 1 would silently truncate every
    # conversion to a single record when the flag is omitted.
    parser.add_argument('--records', metavar='NUM',
                        help='''Only convert a maximum of NUM records''')

    args = parser.parse_args()
    intype = guess_type(args.inpath)
    outtype = guess_type(args.outpath)

    if is_url_path(args.inpath):
        instream = urllib2.urlopen(args.inpath)
    else:
        instream = open(args.inpath)

    try:
        if intype == 'text/csv':
            records, metadata = dcsv.parse(
                instream, guess_types=args.guess_types)
        elif intype in [
                'application/vnd.ms-excel',
                'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        ]:
            import dataconverters.xls
            excel_type = 'xls' if intype == 'application/vnd.ms-excel' else 'xlsx'
            records, metadata = dataconverters.xls.parse(
                instream,
                excel_type=excel_type,
                sheet=args.sheet,
                guess_types=args.guess_types)
        else:
            raise ValueError(
                'No support for reading file type %s - support for csv or xls only at present' % intype)

        # Pick the writer before opening the output so that an unsupported
        # output type does not leave an empty/truncated file behind.
        if outtype == 'text/csv':
            write = dcsv.write
        elif outtype == 'application/json':
            import dataconverters.jsondata as js
            write = js.write
        else:
            # Single-argument call form prints the same text on Python 2
            # and also parses on Python 3.
            print('Only support writing to csv and json at present')
            return

        if args.records:
            records = itertools.islice(records, int(args.records))

        # An outpath such as "_.csv" means "write to stdout".
        to_stdout = args.outpath.startswith('_.')
        outstream = sys.stdout if to_stdout else open(args.outpath, 'w')
        try:
            write(outstream, records, metadata)
        finally:
            # Never close stdout; only close files opened here.
            if not to_stdout:
                outstream.close()
    finally:
        # The records are consumed by the writer above, so the input can
        # be released only once writing has finished (or failed).
        instream.close()