Exemplo n.º 1
0
 def __init__(self, file_path, filename):
     """Convert the file at ``file_path`` and enforce format and size limits.

     The format is guessed from ``filename``.  The converted text is kept
     on ``self.converted`` and its newline count on ``self.line_count``.
     Each outcome is also reported through ``client`` when it is truthy.

     :param file_path: path that is opened in binary mode for conversion.
     :param filename: name used to guess the format.
     :raises DedupeFileError: if the guessed format is not xls, csv or
         xlsx; if conversion raises ``UnicodeDecodeError``; or if the
         converted text contains more than 10,000 newlines.
     """
     now = datetime.now().isoformat()
     self.file_path = file_path
     self.filename = filename
     self.file_type = convert.guess_format(self.filename)
     if self.file_type not in ["xls", "csv", "xlsx"]:
         if client:
             client.captureMessage(" %s Unsupported Format: %s, (%s)" % (now, self.file_type, self.filename))
         raise DedupeFileError("%s is not a supported format" % self.file_type)
     try:
         # The context manager closes the handle as soon as conversion is
         # done instead of leaving it to the garbage collector.
         with open(self.file_path, "rb") as source:
             self.converted = convert.convert(source, self.file_type)
     except UnicodeDecodeError:
         if client:
             client.captureException()
         # Adjacent literals keep the source indentation out of the message
         # (a backslash continuation inside the string would embed it).
         raise DedupeFileError(
             "We had a problem with the file you uploaded. "
             "This might be related to encoding or the file name having the wrong file extension."
         )
     self.line_count = self.converted.count("\n")
     if self.line_count > 10000:
         if client:
             client.captureMessage(" %s File too big: %s, (%s)" % (now, self.line_count, self.filename))
         raise DedupeFileError("Your file has %s rows and we can only currently handle 10,000." % self.line_count)
     if client:
         client.captureMessage(" %s Format: %s, Line Count: %s" % (now, self.file_type, self.line_count))
Exemplo n.º 2
0
 def process_file(self):
     '''
     Pick the first entry of ``self.filelist`` that is CSV, or that is in
     another understood format and can be converted to CSV.

     On success ``self.filename`` names the CSV to use: the original file,
     or a temporary ``.csv`` file (also kept in ``self.temp_file``) holding
     the converted data.  When a conversion fails the temporary file is
     removed, ``self.filename`` is reset to None and the next entry is
     tried.

     NOTE(review): only the selection/conversion step is visible here;
     handing the file to the CSV processor presumably happens afterwards
     -- confirm.
     '''
     for this_filename in self.filelist:
         logger.debug('Filename processing is %s', this_filename)
         self.format = convert.guess_format(this_filename)
         logger.debug('Guessed format of %s', self.format)
         if self.format == 'csv':
             self.filename = this_filename
             break
         elif self.format:
             # If it is not a CSV file, but some other
             # understood format, we will convert it to a CSV and
             # write it out to a temporary file.
             fh, self.temp_file = tempfile.mkstemp(suffix='.csv')
             os.close(fh)
             self.filename = self.temp_file
             try:
                 logger.debug(
                     'Attempting to convert to format CSV (from %s)',
                     self.format)
                 # Both handles are scoped by ``with`` so neither the
                 # source nor the temporary file is left open.
                 with open(this_filename, 'rb') as source:
                     converted = convert.convert(source, self.format)
                 with open(self.temp_file, 'w') as target:
                     target.write(converted)
                 break
             except Exception as e:
                 # Log the source file: the temporary file is deleted on
                 # the next line, so its name would not help anyone.
                 logger.exception('Failed to process %s to CSV: %s',
                                  this_filename, e)
                 os.unlink(self.filename)
                 self.filename = None
Exemplo n.º 3
0
def upload():
    """Store an uploaded file as raw CSV and queue a dedupe session for it.

    The upload is read into memory and converted with ``convert.convert``
    unless its extension is ``csv``.  Slugified header names are stored in
    the Flask session, a ``DedupeSession`` row is committed, the CSV text
    is written to ``/tmp/<session_id>_raw.csv`` and ``initializeSession``
    is queued.

    :returns: JSON response carrying ``ready=True`` and the new session id.
    """
    session_id = unicode(uuid4())
    f = request.files['input_file']
    flask_session['session_name'] = f.filename
    # Split once from the right so only the final extension is compared:
    # "my.data.csv" gives "csv", where rsplit('.')[1] gave "data".  A name
    # without a dot no longer raises IndexError here; it is treated like
    # any other non-csv name and handed to convert.guess_format below.
    file_type = f.filename.rsplit('.', 1)[-1]
    u = StringIO(f.read())
    u.seek(0)
    if file_type != 'csv':  # pragma: no cover
        file_format = convert.guess_format(flask_session['session_name'])
        u = StringIO(convert.convert(u, file_format))
    # The first line of the CSV text is taken as the header row.
    fieldnames = [
        slugify(unicode(i)) for i in u.next().strip('\r\n').split(',')
    ]
    flask_session['fieldnames'] = fieldnames
    user_id = flask_session['user_id']
    user = db_session.query(User).get(user_id)
    group = user.groups[0]
    sess = DedupeSession(id=session_id,
                         name=request.form.get('name'),
                         description=request.form.get('description'),
                         filename=f.filename,
                         group=group,
                         status=STATUS_LIST[0]['machine_name'])
    db_session.add(sess)
    db_session.commit()
    # Rewind so the whole text, header included, is written out.
    u.seek(0)
    with open('/tmp/%s_raw.csv' % session_id, 'wb') as s:
        s.write(u.getvalue())
    del u
    initializeSession.delay(session_id)
    flask_session['session_id'] = session_id
    return jsonify(ready=True, session_id=session_id)
Exemplo n.º 4
0
 def process_file(self):
     '''
     Pick the first entry of ``self.filelist`` that is CSV, or that is in
     another understood format and can be converted to CSV.

     On success ``self.filename`` names the CSV to use: the original file,
     or a temporary ``.csv`` file (also kept in ``self.temp_file``) holding
     the converted data.  When a conversion fails the temporary file is
     removed, ``self.filename`` is reset to None and the next entry is
     tried.

     NOTE(review): only the selection/conversion step is visible here;
     handing the file to the CSV processor presumably happens afterwards
     -- confirm.
     '''
     for this_filename in self.filelist:
         logger.debug('Filename processing is %s', this_filename)
         self.format = convert.guess_format(this_filename)
         logger.debug('Guessed format of %s', self.format)
         if self.format == 'csv':
             self.filename = this_filename
             break
         elif self.format:
             # If it is not a CSV file, but some other
             # understood format, we will convert it to a CSV and
             # write it out to a temporary file.
             fh, self.temp_file = tempfile.mkstemp(suffix='.csv')
             os.close(fh)
             self.filename = self.temp_file
             try:
                 logger.debug('Attempting to convert to format CSV (from %s)',
                              self.format)
                 # Both handles are scoped by ``with`` so neither the
                 # source nor the temporary file is left open.
                 with open(this_filename, 'rb') as source:
                     converted = convert.convert(source, self.format)
                 with open(self.temp_file, 'w') as target:
                     target.write(converted)
                 break
             except Exception as e:
                 # Log the source file: the temporary file is deleted on
                 # the next line, so its name would not help anyone.
                 logger.exception('Failed to process %s to CSV: %s',
                                  this_filename, e)
                 os.unlink(self.filename)
                 self.filename = None
Exemplo n.º 5
0
def upload():
    """Accept an uploaded spreadsheet/CSV and stage a preview in the session.

    On a POST with an allowed file, the upload is converted with
    ``convert.convert``.  The header row, up to 100 sample rows arranged
    by column, a geotype guess per column, the converted text and the file
    name are stored in ``session`` before redirecting to
    ``views.select_geo``.  Problems are recorded in ``context['errors']``.
    """
    context = {}
    if request.method == 'POST':
        big_file = False
        try:
            files = request.files
        except RequestEntityTooLarge as e:
            files = None
            big_file = True
            current_app.logger.info(e)
        if files:
            f = files['input_file']
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                file_format = convert.guess_format(f.filename)
                try:
                    converted = convert.convert(inp, file_format)
                except UnicodeDecodeError:
                    # Adjacent literals keep source indentation out of the
                    # user-facing message.
                    context['errors'] = [
                        'We had a problem with reading your file. '
                        'This could have to do with the file encoding or format'
                    ]
                    converted = None
                f.seek(0)
                if converted:
                    outp = StringIO(converted)
                    reader = UnicodeCSVReader(outp)
                    header = reader.next()
                    session['header_row'] = header
                    # Collect at most 100 data rows for the preview.
                    rows = []
                    for _attempt in range(100):
                        try:
                            rows.append(reader.next())
                        except StopIteration:
                            break
                    # Transpose the sample into one list per header column.
                    # zip() stops at the shorter side, so a row with more
                    # cells than the header can no longer raise IndexError.
                    columns = [[] for header_val in header]
                    for row in rows:
                        for column, value in zip(columns, row):
                            column.append(value)
                    sample_data = []
                    guesses = {}
                    for index, header_val in enumerate(header):
                        guesses[index] = guess_geotype(header_val,
                                                       columns[index])
                        sample_data.append((index, header_val, columns[index]))
                    session['sample_data'] = sample_data
                    session['guesses'] = json.dumps(guesses)
                    outp.seek(0)
                    session['file'] = outp.getvalue()
                    session['filename'] = f.filename
                    return redirect(url_for('views.select_geo'))
            else:
                context['errors'] = [
                    'Only .xls or .xlsx and .csv files are allowed.'
                ]
        else:
            context['errors'] = ['You must provide a file to upload.']
            if big_file:
                context['errors'] = ['Uploaded file must be 10mb or less.']
    # NOTE(review): nothing is returned on the paths that reach this point;
    # presumably a template is rendered with `context` afterwards -- confirm.
Exemplo n.º 6
0
    def main(self):
        """Convert the input to CSV text and write it to ``self.output_file``.

        The input format is taken from ``args.filetype`` when given, is
        implied by ``args.schema`` ('fixed') or ``args.key`` ('json'), and is
        otherwise guessed from ``args.input_path``.  Problems with the
        format are reported through ``self.argparser.error``.
        """
        args = self.args

        # --- work out the input format --------------------------------
        if args.filetype:
            filetype = args.filetype
            if filetype not in convert.SUPPORTED_FORMATS:
                self.argparser.error(
                    '"%s" is not a supported format' % args.filetype)
        elif args.schema:
            filetype = 'fixed'
        elif args.key:
            filetype = 'json'
        else:
            has_real_path = args.input_path and args.input_path != '-'
            if not has_real_path:
                self.argparser.error(
                    'You must specify a format when providing data via STDIN (pipe).'
                )
            filetype = convert.guess_format(args.input_path)
            if not filetype:
                self.argparser.error(
                    'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
                )

        # --- open the input: Excel formats are read as raw bytes -------
        if filetype in ('xls', 'xlsx'):
            self.input_file = open(args.input_path, 'rb')
        else:
            self.input_file = self._open_input_file(args.input_path)

        # --- collect the reader options -------------------------------
        kwargs = self.reader_kwargs

        if args.schema:
            kwargs['schema'] = self._open_input_file(args.schema)

        # These options are forwarded unchanged whenever they are set.
        for option in ('key', 'snifflimit', 'sheet'):
            value = getattr(args, option)
            if value:
                kwargs[option] = value

        if args.no_inference:
            kwargs['type_inference'] = False

        if filetype == 'csv' and args.no_header_row:
            kwargs['no_header_row'] = True

        # Fixed width can be processed as a stream
        if filetype == 'fixed':
            kwargs['output'] = self.output_file

        csv_text = convert.convert(self.input_file, filetype, **kwargs)
        self.output_file.write(csv_text)
Exemplo n.º 7
0
    def main(self):
        """Convert the input to CSV text and write it to ``self.output_file``.

        The input format is taken from ``self.args.format`` when given, is
        implied by a schema ('fixed') or key ('json'), and is otherwise
        guessed from ``self.args.file``.  A file opened by this method is
        closed once conversion has finished; a file object supplied through
        ``self.args.file`` is left open for its owner.
        """
        if self.args.format:
            input_format = self.args.format

            if input_format not in convert.SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' %
                                     self.args.format)

        elif self.args.schema:
            input_format = 'fixed'
        elif self.args.key:
            input_format = 'json'
        else:
            if self.args.file == sys.stdin:
                self.argparser.error(
                    'You must specify a format when providing data via STDIN (pipe).'
                )

            input_format = convert.guess_format(self.args.file)

            if not input_format:
                self.argparser.error(
                    'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
                )

        # Remember whether the handle is ours, so that a stream handed in by
        # the caller is never closed behind its back.
        opened_here = not isinstance(self.args.file, file)
        if not opened_here:
            f = self.args.file
        elif input_format in ('xls', 'xlsx'):
            f = open(self.args.file, 'rb')
        else:
            f = open(self.args.file, 'rU')

        try:
            kwargs = self.reader_kwargs

            if self.args.schema:
                kwargs['schema'] = self.args.schema

            if self.args.key:
                kwargs['key'] = self.args.key

            if self.args.snifflimit:
                kwargs['snifflimit'] = self.args.snifflimit

            if self.args.sheet:
                kwargs['sheet'] = self.args.sheet

            if self.args.no_inference:
                kwargs['type_inference'] = False

            if input_format == 'csv' and self.args.no_header_row:
                kwargs['no_header_row'] = True

            # Fixed width can be processed as a stream
            if input_format == 'fixed':
                kwargs['output'] = self.output_file

            self.output_file.write(convert.convert(f, input_format, **kwargs))
        finally:
            if opened_here:
                f.close()
Exemplo n.º 8
0
    def main(self):
        """Convert the input to CSV text and write it to ``self.output_file``.

        The input format is taken from ``args.filetype`` when given, is
        implied by ``args.schema`` ('fixed') or ``args.key`` ('json'), and is
        otherwise guessed from ``args.input_path``.  Problems with the
        format are reported through ``self.argparser.error``.
        """
        args = self.args

        # --- work out the input format --------------------------------
        if args.filetype:
            filetype = args.filetype
            if filetype not in convert.SUPPORTED_FORMATS:
                self.argparser.error(
                    '"%s" is not a supported format' % args.filetype)
        elif args.schema:
            filetype = 'fixed'
        elif args.key:
            filetype = 'json'
        else:
            has_real_path = args.input_path and args.input_path != '-'
            if not has_real_path:
                self.argparser.error('You must specify a format when providing data via STDIN (pipe).')
            filetype = convert.guess_format(args.input_path)
            if not filetype:
                self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

        # --- open the input: Excel formats are read as raw bytes -------
        if filetype in ('xls', 'xlsx'):
            self.input_file = open(args.input_path, 'rb')
        else:
            self.input_file = self._open_input_file(args.input_path)

        # --- collect the reader options -------------------------------
        kwargs = self.reader_kwargs

        if args.schema:
            kwargs['schema'] = self._open_input_file(args.schema)

        # These options are forwarded unchanged whenever they are set.
        for option in ('key', 'snifflimit', 'export', 'sheet'):
            value = getattr(args, option)
            if value:
                kwargs[option] = value

        if args.no_inference:
            kwargs['type_inference'] = False

        if filetype == 'csv' and args.no_header_row:
            kwargs['no_header_row'] = True

        # Fixed width can be processed as a stream
        if filetype == 'fixed':
            kwargs['output'] = self.output_file

        csv_text = convert.convert(self.input_file, filetype, **kwargs)
        self.output_file.write(csv_text)
Exemplo n.º 9
0
    def main(self):
        if self.filetype:
            filetype = self.filetype

            if filetype not in convert.SUPPORTED_FORMATS:
                self.csvResult = (_('"%s" is not a supported format') % self.filetype)

#         elif self.args.schema:
#             filetype = 'fixed'
#         elif self.args.key:
#             filetype = 'json'
        else:
            if not self.file_name or self.file_name == '-':
                self.csvResult = _('You must specify a format.')

            filetype = convert.guess_format(self.file_name)

            if not filetype:
                self.csvResult = _('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

        if filetype in ('xls', 'xlsx'):
            self.input_file = open(self.file_name, 'rb')
        else:
            self.input_file = self._open_input_file(self.args.input_path)

        kwargs = self.reader_kwargs

#         if self.args.schema:
#             kwargs['schema'] = self._open_input_file(self.args.schema)
# 
#         if self.args.key:
#             kwargs['key'] = self.args.key
# 
#         if self.args.snifflimit:
#             kwargs['snifflimit'] = self.args.snifflimit
# 
#         if self.args.sheet:
#             kwargs['sheet'] = self.args.sheet
# 
#         if self.args.no_inference:
#             kwargs['type_inference'] = False
# 
#         if filetype == 'csv' and self.args.no_header_row:
#             kwargs['no_header_row'] = True
# 
#         # Fixed width can be processed as a stream
#         if filetype == 'fixed':
#             kwargs['output'] = self.output_file

        data = convert.convert(self.input_file, filetype, **kwargs)

        print 'out: '
        out_file_opened = open(self.output_file,'w')
        print out_file_opened
        out_file_opened.write(data)
Exemplo n.º 10
0
def upload():
    """Accept an uploaded spreadsheet/CSV and stage a preview in the session.

    On a POST with an allowed file, the upload is converted with
    ``convert.convert``.  The header row, up to 100 sample rows arranged
    by column, a geotype guess per column, the converted text and the file
    name are stored in ``session`` before redirecting to
    ``views.select_geo``.  Problems are recorded in ``context['errors']``.
    """
    context = {}
    if request.method == 'POST':
        big_file = False
        try:
            files = request.files
        except RequestEntityTooLarge as e:
            files = None
            big_file = True
            current_app.logger.info(e)
        if files:
            f = files['input_file']
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                file_format = convert.guess_format(f.filename)
                try:
                    converted = convert.convert(inp, file_format)
                except UnicodeDecodeError:
                    # Adjacent literals keep source indentation out of the
                    # user-facing message.
                    context['errors'] = [
                        'We had a problem with reading your file. '
                        'This could have to do with the file encoding or format'
                    ]
                    converted = None
                f.seek(0)
                if converted:
                    outp = StringIO(converted)
                    reader = UnicodeCSVReader(outp)
                    header = reader.next()
                    session['header_row'] = header
                    # Collect at most 100 data rows for the preview.
                    rows = []
                    for _attempt in range(100):
                        try:
                            rows.append(reader.next())
                        except StopIteration:
                            break
                    # Transpose the sample into one list per header column.
                    # zip() stops at the shorter side, so a row with more
                    # cells than the header can no longer raise IndexError.
                    columns = [[] for header_val in header]
                    for row in rows:
                        for column, value in zip(columns, row):
                            column.append(value)
                    sample_data = []
                    guesses = {}
                    for index, header_val in enumerate(header):
                        guesses[index] = guess_geotype(header_val, columns[index])
                        sample_data.append((index, header_val, columns[index]))
                    session['sample_data'] = sample_data
                    session['guesses'] = json.dumps(guesses)
                    outp.seek(0)
                    session['file'] = outp.getvalue()
                    session['filename'] = f.filename
                    return redirect(url_for('views.select_geo'))
            else:
                context['errors'] = ['Only .xls or .xlsx and .csv files are allowed.']
        else:
            context['errors'] = ['You must provide a file to upload.']
            if big_file:
                context['errors'] = ['Uploaded file must be 10mb or less.']
    # NOTE(review): nothing is returned on the paths that reach this point;
    # presumably a template is rendered with `context` afterwards -- confirm.
Exemplo n.º 11
0
def upload():
    """Accept an uploaded spreadsheet/CSV and stage a preview in the session.

    On a POST with an allowed file no larger than ``MAX_CONTENT_LENGTH``,
    the upload is converted with ``convert.convert``.  The header row, up
    to 10 sample values per column (joined with ', '), the converted text
    and the file name are stored in ``session`` before redirecting to
    ``views.select_geo``.  Otherwise ``upload.html`` is rendered with any
    errors collected in ``context``.
    """
    context = {}
    if request.method == 'POST':
        f = request.files['input_file']
        if f:
            if allowed_file(f.filename):
                contents = f.read()
                # len() is the size of the payload itself; sys.getsizeof()
                # also counts the interpreter's per-object overhead, so it
                # rejected files that were exactly at the limit.
                if len(contents) <= MAX_CONTENT_LENGTH:
                    inp = StringIO(contents)
                    file_format = convert.guess_format(f.filename)
                    try:
                        converted = convert.convert(inp, file_format)
                    except UnicodeDecodeError:
                        # Adjacent literals keep source indentation out of
                        # the user-facing message.
                        context['errors'] = [
                            'We had a problem with reading your file. '
                            'This could have to do with the file encoding or format'
                        ]
                        converted = None
                    f.seek(0)
                    if converted:
                        outp = StringIO(converted)
                        reader = UnicodeCSVReader(outp)
                        header = reader.next()
                        session['header_row'] = header
                        # Collect at most 10 data rows for the preview.
                        rows = []
                        for _attempt in range(10):
                            try:
                                rows.append(reader.next())
                            except StopIteration:
                                break
                        # Transpose the sample into one list per column.
                        # zip() stops at the shorter side, so a row wider
                        # than the header can no longer raise IndexError.
                        columns = [[] for header_val in header]
                        for row in rows:
                            for column, value in zip(columns, row):
                                column.append(value)
                        columns = [', '.join(c) for c in columns]
                        sample_data = []
                        for index, header_val in enumerate(header):
                            sample_data.append(
                                (index, header_val, columns[index]))
                        session['sample_data'] = sample_data
                        outp.seek(0)
                        session['file'] = outp.getvalue()
                        session['filename'] = f.filename
                        return redirect(url_for('views.select_geo'))
                else:
                    context['errors'] = ['Uploaded file must be 10mb or less.']
            else:
                context['errors'] = [
                    'Only .xls or .xlsx and .csv files are allowed.'
                ]
        else:
            context['errors'] = ['You must provide a file to upload.']
    return render_template('upload.html', **context)
Exemplo n.º 12
0
    def __init__(self, incoming_file):
        """Wrap ``incoming_file`` and derive its type, encoding and fields.

        The chunk iterator of the file is duplicated so the first chunk can
        be inspected while ``self.chunks`` still yields every chunk.
        """
        self.file = incoming_file

        # tee() hands back two independent iterators over the same chunks:
        # index 0 is kept whole, index 1 is only advanced by one chunk.
        # Presumably _file_encoding() and _field_names() work from that
        # first chunk -- their bodies are not visible here.
        streams = itertools.tee(self.file.chunks(), 2)
        self.first_chunk = next(streams[1])
        self.chunks = streams[0]

        lowered_name = self.file.name.lower()
        self.file_type = guess_format(lowered_name)
        self.file_encoding = self._file_encoding()
        self.field_names = self._field_names()
Exemplo n.º 13
0
def convert_to_csv_reader(filename, sheet=None, infer_types=True):
    """Convert ``filename`` to CSV text and return a reader over it.

    :param filename: path of the file to convert; its name is also used to
        guess the input format.
    :param sheet: forwarded to ``convert.convert`` only when not None.
    :param infer_types: forwarded to ``convert.convert`` as ``infer_types``.
    :returns: a ``UnicodeCSVReader`` over the converted text.
    """
    file_format = convert.guess_format(filename)
    convert_kwargs = {}
    if sheet is not None:
        # Only pass `sheet` to the `convert` function when its set to
        # a non-None value.  This is done to satisfy csvkit which checks
        # for the presence of `sheet`, not whether it's valid.
        convert_kwargs['sheet'] = sheet
    # The converted text is held in memory, so the source file can be
    # closed as soon as convert() returns.
    with open(filename, "rb") as source:
        csv_text = convert.convert(source, file_format, infer_types=infer_types, **convert_kwargs)
    return UnicodeCSVReader(StringIO(csv_text))
Exemplo n.º 14
0
    def main(self):
        """Convert the input to CSV text and write it to ``self.output_file``.

        The input format is taken from ``self.args.format`` when given, is
        implied by a schema ('fixed') or key ('json'), and is otherwise
        guessed from ``self.args.file``.  A file opened by this method is
        closed once conversion has finished; a file object supplied through
        ``self.args.file`` is left open for its owner.
        """
        if self.args.format:
            input_format = self.args.format

            if input_format not in convert.SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' % self.args.format)

        elif self.args.schema:
            input_format = 'fixed'
        elif self.args.key:
            input_format = 'json'
        else:
            if self.args.file == sys.stdin:
                self.argparser.error('You must specify a format when providing data via STDIN (pipe).')

            input_format = convert.guess_format(self.args.file)

            if not input_format:
                self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

        # Remember whether the handle is ours, so that a stream handed in by
        # the caller is never closed behind its back.
        opened_here = not isinstance(self.args.file, file)
        if not opened_here:
            f = self.args.file
        elif input_format in ('xls', 'xlsx'):
            f = open(self.args.file, 'rb')
        else:
            f = open(self.args.file, 'rU')

        try:
            kwargs = self.reader_kwargs

            if self.args.schema:
                kwargs['schema'] = self.args.schema

            if self.args.key:
                kwargs['key'] = self.args.key

            if self.args.snifflimit:
                kwargs['snifflimit'] = self.args.snifflimit

            if self.args.sheet:
                kwargs['sheet'] = self.args.sheet

            if self.args.no_inference:
                kwargs['type_inference'] = False

            if input_format == 'csv' and self.args.no_header_row:
                kwargs['no_header_row'] = True

            # Fixed width can be processed as a stream
            if input_format == 'fixed':
                kwargs['output'] = self.output_file

            self.output_file.write(convert.convert(f, input_format, **kwargs))
        finally:
            if opened_here:
                f.close()
Exemplo n.º 15
0
 def __init__(self, file_path, filename):
     """Convert the file at ``file_path`` and enforce format and size limits.

     The format is guessed from ``filename``.  The converted text is kept
     on ``self.converted`` and its newline count on ``self.line_count``.
     Each outcome is logged through ``logger.warning``.

     :param file_path: path that is opened in binary mode for conversion.
     :param filename: name used to guess the format.
     :raises DedupeFileError: if the guessed format is not xls, csv or
         xlsx, or if the converted text contains more than 10,000 newlines.
     """
     now = datetime.now().isoformat()
     self.file_path = file_path
     self.filename = filename
     self.file_type = convert.guess_format(self.filename)
     if self.file_type not in ['xls', 'csv', 'xlsx']:
         logger.warning(' %s Unsupported Format: %s, (%s)' % (now, self.file_type, self.filename))
         raise DedupeFileError('%s is not a supported format' % self.file_type)
     # The context manager closes the handle as soon as conversion is done
     # instead of leaving it to the garbage collector.
     with open(self.file_path, 'rb') as source:
         self.converted = convert.convert(source, self.file_type)
     self.line_count = self.converted.count('\n')
     if self.line_count > 10000:
         logger.warning(' %s File too big: %s, (%s)' % (now, self.line_count, self.filename))
         raise DedupeFileError('Your file has %s rows and we can only currently handle 10,000.' % self.line_count)
     logger.warning(' %s Format: %s, Line Count: %s' % (now, self.file_type, self.line_count))
Exemplo n.º 16
0
def upload():
    """Accept an uploaded spreadsheet/CSV and stage a preview in the session.

    On a POST with an allowed file no larger than ``MAX_CONTENT_LENGTH``,
    the upload is converted with ``convert.convert``.  The header row, up
    to 10 sample values per column (joined with ', '), the converted text
    and the file name are stored in ``session`` before redirecting to
    ``views.select_geo``.  Otherwise ``upload.html`` is rendered with any
    errors collected in ``context``.
    """
    context = {}
    if request.method == 'POST':
        f = request.files['input_file']
        if f:
            if allowed_file(f.filename):
                contents = f.read()
                # len() is the size of the payload itself; sys.getsizeof()
                # also counts the interpreter's per-object overhead, so it
                # rejected files that were exactly at the limit.
                if len(contents) <= MAX_CONTENT_LENGTH:
                    inp = StringIO(contents)
                    file_format = convert.guess_format(f.filename)
                    try:
                        converted = convert.convert(inp, file_format)
                    except UnicodeDecodeError:
                        # Adjacent literals keep source indentation out of
                        # the user-facing message.
                        context['errors'] = [
                            'We had a problem with reading your file. '
                            'This could have to do with the file encoding or format'
                        ]
                        converted = None
                    f.seek(0)
                    if converted:
                        outp = StringIO(converted)
                        reader = UnicodeCSVReader(outp)
                        header = reader.next()
                        session['header_row'] = header
                        # Collect at most 10 data rows for the preview.
                        rows = []
                        for _attempt in range(10):
                            try:
                                rows.append(reader.next())
                            except StopIteration:
                                break
                        # Transpose the sample into one list per column.
                        # zip() stops at the shorter side, so a row wider
                        # than the header can no longer raise IndexError.
                        columns = [[] for header_val in header]
                        for row in rows:
                            for column, value in zip(columns, row):
                                column.append(value)
                        columns = [', '.join(c) for c in columns]
                        sample_data = []
                        for index, header_val in enumerate(header):
                            sample_data.append((index, header_val, columns[index]))
                        session['sample_data'] = sample_data
                        outp.seek(0)
                        session['file'] = outp.getvalue()
                        session['filename'] = f.filename
                        return redirect(url_for('views.select_geo'))
                else:
                    context['errors'] = ['Uploaded file must be 10mb or less.']
            else:
                context['errors'] = ['Only .xls or .xlsx and .csv files are allowed.']
        else:
            context['errors'] = ['You must provide a file to upload.']
    return render_template('upload.html', **context)
Exemplo n.º 17
0
    def main(self):
        """Convert the input to CSV, writing straight to ``self.output_file``.

        The input format is taken from ``args.filetype`` when given, is
        implied by ``args.schema`` ('fixed') or ``args.key`` ('json'), and is
        otherwise guessed from ``args.input_path``.  Problems with the
        format are reported through ``self.argparser.error``.
        """
        args = self.args

        # --- work out the input format --------------------------------
        if args.filetype:
            filetype = args.filetype
            if filetype not in convert.SUPPORTED_FORMATS:
                self.argparser.error(
                    '"%s" is not a supported format' % args.filetype)
        elif args.schema:
            filetype = 'fixed'
        elif args.key:
            filetype = 'json'
        else:
            has_real_path = args.input_path and args.input_path != '-'
            if not has_real_path:
                self.argparser.error('You must specify a format when providing data via STDIN (pipe).')
            filetype = convert.guess_format(args.input_path)
            if not filetype:
                self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

        # --- open the input: Excel formats are read as raw bytes -------
        if filetype in ('xls', 'xlsx'):
            self.input_file = open(args.input_path, 'rb')
        else:
            self.input_file = self._open_input_file(args.input_path)

        # --- collect the reader options -------------------------------
        kwargs = self.reader_kwargs

        if args.schema:
            kwargs['schema'] = self._open_input_file(args.schema)

        # These options are forwarded unchanged whenever they are set.
        for option in ('key', 'sniff_limit', 'sheet'):
            value = getattr(args, option)
            if value:
                kwargs[option] = value

        if args.no_inference:
            kwargs['column_types'] = agate.TypeTester(limit=0)

        if filetype == 'csv' and args.no_header_row:
            kwargs['header'] = False

        convert.convert(self.input_file, filetype,
                        output=self.output_file, **kwargs)
Exemplo n.º 18
0
    def main(self):
        """Convert the input to CSV text and write it to ``self.output_file``.

        The input format is taken from ``self.args.format`` when given, is
        implied by a schema ("fixed") or key ("json"), and is otherwise
        guessed from ``self.args.file``.  A file opened by this method is
        closed once conversion has finished; a file object supplied through
        ``self.args.file`` is left open for its owner.
        """
        if self.args.format:
            input_format = self.args.format

            if input_format not in convert.SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' % self.args.format)

        elif self.args.schema:
            input_format = "fixed"
        elif self.args.key:
            input_format = "json"
        else:
            if self.args.file == sys.stdin:
                self.argparser.error("You must specify a format when providing data via STDIN (pipe).")

            input_format = convert.guess_format(self.args.file)

            if not input_format:
                self.argparser.error(
                    "Unable to automatically determine the format of the input file. Try specifying a format with --format."
                )

        # Remember whether the handle is ours, so that a stream handed in by
        # the caller is never closed behind its back.
        opened_here = not isinstance(self.args.file, file)
        if not opened_here:
            f = self.args.file
        elif input_format in ("xls", "xlsx"):
            # Both Excel formats are binary; text mode with universal
            # newlines would corrupt an xlsx archive.
            f = open(self.args.file, "rb")
        else:
            f = open(self.args.file, "rU")

        try:
            kwargs = self.reader_kwargs

            if self.args.schema:
                kwargs["schema"] = self.args.schema

            if self.args.key:
                kwargs["key"] = self.args.key

            if self.args.snifflimit:
                kwargs["snifflimit"] = self.args.snifflimit

            # Fixed width can be processed as a stream
            if input_format == "fixed":
                kwargs["output"] = self.output_file

            self.output_file.write(convert.convert(f, input_format, **kwargs))
        finally:
            if opened_here:
                f.close()
Exemplo n.º 19
0
    def main(self):
        """Convert the input to CSV text and write it to ``self.output_file``.

        The input format is taken from ``self.args.format`` when given, is
        implied by a schema ('fixed') or key ('json'), and is otherwise
        guessed from ``self.args.file``.  Format problems end the program
        through ``sys.exit`` with a message.  A file opened by this method
        is closed once conversion has finished; a file object supplied
        through ``self.args.file`` is left open for its owner.
        """
        if self.args.format:
            input_format = self.args.format

            if input_format not in convert.SUPPORTED_FORMATS:
                sys.exit('"%s" is not a supported format' % self.args.format)

        elif self.args.schema:
            input_format = 'fixed'
        elif self.args.key:
            input_format = 'json'
        else:
            if self.args.file == '<stdin>':
                sys.exit('You must specify a format when providing data via STDIN (pipe).')

            input_format = convert.guess_format(self.args.file)

            if not input_format:
                sys.exit('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

        # Remember whether the handle is ours, so that a stream handed in by
        # the caller is never closed behind its back.
        opened_here = not isinstance(self.args.file, file)
        if not opened_here:
            f = self.args.file
        elif input_format in ('xls', 'xlsx'):
            # Both Excel formats are binary; text mode with universal
            # newlines would corrupt an xlsx archive.
            f = open(self.args.file, 'rb')
        else:
            f = open(self.args.file, 'rU')

        try:
            kwargs = self.reader_kwargs

            if self.args.schema:
                kwargs['schema'] = self.args.schema

            if self.args.key:
                kwargs['key'] = self.args.key

            if self.args.snifflimit:
                kwargs['snifflimit'] = self.args.snifflimit

            # Fixed width can be processed as a stream
            if input_format == 'fixed':
                kwargs['output'] = self.output_file

            self.output_file.write(convert.convert(f, input_format, **kwargs))
        finally:
            if opened_here:
                f.close()
Exemplo n.º 20
0
 def __init__(self, file_path, filename):
     """Convert the file at ``file_path`` and enforce format and size limits.

     The format is guessed from ``filename``.  The converted text is kept
     on ``self.converted`` and its newline count on ``self.line_count``.
     Each outcome is also reported through ``client`` when it is truthy.

     :param file_path: path that is opened in binary mode for conversion.
     :param filename: name used to guess the format.
     :raises DedupeFileError: if the guessed format is not xls, csv or
         xlsx; if conversion raises ``UnicodeDecodeError``; or if the
         converted text contains more than 10,000 newlines.
     """
     now = datetime.now().isoformat()
     self.file_path = file_path
     self.filename = filename
     self.file_type = convert.guess_format(self.filename)
     if self.file_type not in ['xls', 'csv', 'xlsx']:
         # Reporting is best-effort: a missing client must not replace the
         # DedupeFileError below with an AttributeError.
         if client:
             client.captureMessage(' %s Unsupported Format: %s, (%s)' % (now, self.file_type, self.filename))
         raise DedupeFileError('%s is not a supported format' % self.file_type)
     try:
         # The context manager closes the handle as soon as conversion is
         # done instead of leaving it to the garbage collector.
         with open(self.file_path, 'rb') as source:
             self.converted = convert.convert(source, self.file_type)
     except UnicodeDecodeError:
         if client:
             client.captureException()
         # Adjacent literals keep the source indentation out of the message
         # (a backslash continuation inside the string would embed it).
         raise DedupeFileError(
             'We had a problem with the file you uploaded. '
             'This might be related to encoding or the file name having the wrong file extension.')
     self.line_count = self.converted.count('\n')
     if self.line_count > 10000:
         if client:
             client.captureMessage(' %s File too big: %s, (%s)' % (now, self.line_count, self.filename))
         raise DedupeFileError('Your file has %s rows and we can only currently handle 10,000.' % self.line_count)
     if client:
         client.captureMessage(' %s Format: %s, Line Count: %s' % (now, self.file_type, self.line_count))
Exemplo n.º 21
0
 def test_guess_xls_uppercase(self):
     # An upper-case ".XLS" extension is expected to be guessed as 'xls'.
     guessed = convert.guess_format('testdata.XLS')
     self.assertEqual('xls', guessed)
Exemplo n.º 22
0
 def test_guess_xlsx(self):
     # A ".xlsx" extension is expected to be guessed as 'xlsx'.
     guessed = convert.guess_format('testdata.xlsx')
     self.assertEqual('xlsx', guessed)
Exemplo n.º 23
0
    def main(self):
        """
        Convert the input named by ``self.args`` to CSV on ``self.output_file``.

        The file type comes from ``self.args.filetype`` when given; otherwise
        it is 'fixed' when ``self.args.schema`` is set, 'json' when
        ``self.args.key`` is set, and finally guessed from
        ``self.args.input_path``. Invalid or undeterminable formats are
        reported through ``self.argparser.error``.

        Raises:
            ValueError: if the format is 'fixed' but no schema was supplied,
                or if a DBF input object has no ``name`` attribute.
        """
        # Determine the file type.
        if self.args.filetype:
            filetype = self.args.filetype
            if filetype not in SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' %
                                     self.args.filetype)
        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            # Nothing to guess from when there is no path or the path is '-'.
            if not self.args.input_path or self.args.input_path == '-':
                self.argparser.error(
                    'You must specify a format when providing data via STDIN (pipe).'
                )
            filetype = convert.guess_format(self.args.input_path)
            if not filetype:
                self.argparser.error(
                    'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
                )

        # True for CSV input or whenever type inference is enabled; presumably
        # read by _open_input_file to decide on buffering -- TODO confirm.
        self.buffers_input = filetype == 'csv' or not self.args.no_inference

        # Set the input file.
        if filetype in ('xls', 'xlsx'):
            # Excel workbooks are opened directly in binary mode.
            self.input_file = open(self.args.input_path, 'rb')
        else:
            self.input_file = self._open_input_file(self.args.input_path)

        # Set the reader's arguments.
        kwargs = {}

        if self.args.schema:
            schema = self._open_input_file(self.args.schema)
        elif filetype == 'fixed':
            raise ValueError('schema must not be null when format is "fixed"')

        if self.args.sheet:
            kwargs['sheet'] = self.args.sheet

        if filetype == 'csv':
            kwargs.update(self.reader_kwargs)
            # Streaming CSV mustn't set sniff_limit, but non-streaming should.
            if not self.args.no_inference:
                kwargs['sniff_limit'] = self.args.sniff_limit
            if self.args.no_header_row:
                kwargs['header'] = False
        elif self.args.no_inference:
            # Streaming CSV mustn't set column_types, but other formats should.
            kwargs['column_types'] = agate.TypeTester(limit=0)

        # Convert the file.
        if filetype == 'csv' and self.args.no_inference:
            # Streaming path: rows are copied from reader to writer without
            # building a table; only self.reader_kwargs is used here.
            reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
            writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
            writer.writerows(reader)
        elif filetype == 'fixed':
            # fixed2csv receives the output file and its return value is
            # written to that same file as well.
            self.output_file.write(
                fixed2csv(self.input_file,
                          schema,
                          output=self.output_file,
                          **kwargs))
        elif filetype == 'geojson':
            self.output_file.write(geojson2csv(self.input_file, **kwargs))
        elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
            # Table-based path: load into an agate Table, then dump as CSV.
            if filetype == 'csv':
                table = agate.Table.from_csv(self.input_file, **kwargs)
            elif filetype == 'json':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              **kwargs)
            elif filetype == 'ndjson':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              newline=True,
                                              **kwargs)
            elif filetype == 'xls':
                # Only the sheet is forwarded; other kwargs entries are not.
                table = agate.Table.from_xls(self.input_file,
                                             sheet=kwargs.get('sheet'))
            elif filetype == 'xlsx':
                # Only the sheet is forwarded; other kwargs entries are not.
                table = agate.Table.from_xlsx(self.input_file,
                                              sheet=kwargs.get('sheet'))
            elif filetype == 'dbf':
                # from_dbf is given a file name rather than the file object.
                if not hasattr(self.input_file, 'name'):
                    raise ValueError(
                        'DBF files can not be converted from stdin. You must pass a filename.'
                    )
                table = agate.Table.from_dbf(self.input_file.name, **kwargs)
            table.to_csv(self.output_file)

        self.input_file.close()

        # schema is only bound when self.args.schema was set above.
        if self.args.schema:
            schema.close()
Exemplo n.º 24
0
 def test_guess_fixed(self):
     # A name without any extension is expected to be guessed as 'fixed'.
     guessed = convert.guess_format('testdata')
     self.assertEqual('fixed', guessed)
Exemplo n.º 25
0
    def main(self):
        """
        Convert the input named by ``self.args`` to CSV on ``self.output_file``.

        The file type comes from ``self.args.filetype`` when given; otherwise
        it is 'fixed' when ``self.args.schema`` is set, 'json' when
        ``self.args.key`` is set, and finally guessed from
        ``self.args.input_path``. When ``self.args.names_only`` is set, only
        the workbook's sheet names are written (one per line) and no
        conversion takes place.

        Raises:
            ValueError: if the format is 'fixed' but no schema was supplied,
                or if a DBF input object has no ``name`` attribute.
        """
        # Determine the file type.
        if self.args.filetype:
            filetype = self.args.filetype
            if filetype not in SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' %
                                     self.args.filetype)
        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            # Nothing to guess from when there is no path or the path is '-'.
            if not self.args.input_path or self.args.input_path == '-':
                self.argparser.error(
                    'You must specify a format when providing data via STDIN (pipe).'
                )
            filetype = convert.guess_format(self.args.input_path)
            if not filetype:
                self.argparser.error(
                    'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
                )

        # Buffer standard input if the input file is in CSV format or if performing type inference.
        self.buffers_input = filetype == 'csv' or not self.args.no_inference

        # Set the input file.
        if filetype in ('xls', 'xlsx'):
            # Excel workbooks are opened directly in binary mode.
            self.input_file = open(self.args.input_path, 'rb')
        else:
            self.input_file = self._open_input_file(self.args.input_path)

        # Sheet-listing mode: print the sheet names and return early.
        if self.args.names_only:
            sheet_names = None
            if filetype == 'xls':
                sheet_names = xlrd.open_workbook(
                    file_contents=self.input_file.read()).sheet_names()
            elif filetype == 'xlsx':
                sheet_names = openpyxl.load_workbook(self.input_file,
                                                     read_only=True,
                                                     data_only=True).sheetnames
            if sheet_names:
                for name in sheet_names:
                    self.output_file.write('%s\n' % name)
            else:
                # Reached for non-Excel types (sheet_names stays None) and
                # also when a workbook yields an empty list of names.
                self.argparser.error(
                    'You cannot use the -n or --names options with non-Excel files.'
                )
            self.input_file.close()
            return

        # Set the reader's arguments.
        kwargs = {}

        if self.args.schema:
            schema = self._open_input_file(self.args.schema)
        elif filetype == 'fixed':
            raise ValueError('schema must not be null when format is "fixed"')

        if self.args.sheet:
            kwargs['sheet'] = self.args.sheet

        if filetype == 'csv':
            kwargs.update(self.reader_kwargs)
            kwargs['sniff_limit'] = self.args.sniff_limit

        # skip_lines is passed for csv, fixed, xls and xlsx only.
        if filetype not in ('dbf', 'geojson', 'json', 'ndjson'):
            kwargs['skip_lines'] = self.args.skip_lines

        # Every format except dbf receives explicit column types.
        if filetype != 'dbf':
            kwargs['column_types'] = self.get_column_types()

        # Convert the file.
        if filetype == 'csv' and self.args.no_inference and not self.args.skip_lines:
            # Streaming path: rows are copied from reader to writer without
            # building a table; only self.reader_kwargs is used here.
            reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
            writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
            writer.writerows(reader)
        elif filetype == 'fixed':
            # fixed2csv receives the output file and its return value is
            # written to that same file as well.
            self.output_file.write(
                fixed2csv(self.input_file,
                          schema,
                          output=self.output_file,
                          **kwargs))
        elif filetype == 'geojson':
            self.output_file.write(geojson2csv(self.input_file, **kwargs))
        elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
            # Table-based path: load into an agate Table, then dump as CSV.
            if filetype == 'csv':
                table = agate.Table.from_csv(self.input_file, **kwargs)
            elif filetype == 'json':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              **kwargs)
            elif filetype == 'ndjson':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              newline=True,
                                              **kwargs)
            elif filetype == 'xls':
                table = agate.Table.from_xls(self.input_file, **kwargs)
            elif filetype == 'xlsx':
                table = agate.Table.from_xlsx(self.input_file, **kwargs)
            elif filetype == 'dbf':
                # from_dbf is given a file name rather than the file object.
                if not hasattr(self.input_file, 'name'):
                    raise ValueError(
                        'DBF files can not be converted from stdin. You must pass a filename.'
                    )
                table = agate.Table.from_dbf(self.input_file.name, **kwargs)
            table.to_csv(self.output_file)

        self.input_file.close()

        # schema is only bound when self.args.schema was set above.
        if self.args.schema:
            schema.close()
Exemplo n.º 26
0
    def main(self):
        """
        Convert the input named by ``self.args`` to CSV on ``self.output_file``.

        The file type comes from ``self.args.filetype`` when given; otherwise
        it is 'fixed' when ``self.args.schema`` is set, 'json' when
        ``self.args.key`` is set, and finally guessed from
        ``self.args.input_path``. Invalid or undeterminable formats are
        reported through ``self.argparser.error``. The input file and the
        schema file (if any) are closed once the conversion has finished.

        Raises:
            ValueError: if the format is 'fixed' but no schema was supplied,
                or if a DBF input object has no ``name`` attribute.
        """
        # Determine the file type.
        if self.args.filetype:
            filetype = self.args.filetype
            if filetype not in SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' % self.args.filetype)
        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            # Nothing to guess from when there is no path or the path is '-'.
            if not self.args.input_path or self.args.input_path == '-':
                self.argparser.error('You must specify a format when providing data via STDIN (pipe).')
            filetype = convert.guess_format(self.args.input_path)
            if not filetype:
                self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

        # Set the input file.
        if filetype in ('xls', 'xlsx'):
            # Excel workbooks are opened directly in binary mode.
            self.input_file = open(self.args.input_path, 'rb')
        else:
            self.input_file = self._open_input_file(self.args.input_path)

        # Set the reader's arguments.
        kwargs = {}

        if self.args.schema:
            schema = self._open_input_file(self.args.schema)
        elif filetype == 'fixed':
            raise ValueError('schema must not be null when format is "fixed"')

        if self.args.sheet:
            kwargs['sheet'] = self.args.sheet

        if filetype == 'csv':
            kwargs.update(self.reader_kwargs)
            # Streaming CSV mustn't set sniff_limit, but non-streaming should.
            if not self.args.no_inference:
                kwargs['sniff_limit'] = self.args.sniff_limit
            if self.args.no_header_row:
                kwargs['header'] = False
        elif self.args.no_inference:
            # Streaming CSV mustn't set column_types, but other formats should.
            kwargs['column_types'] = agate.TypeTester(limit=0)

        # Convert the file.
        if filetype == 'csv' and self.args.no_inference:
            # Streaming path: rows are copied from reader to writer without
            # building a table; only self.reader_kwargs is used here.
            reader = agate.reader(self.input_file, **self.reader_kwargs)
            writer = agate.writer(self.output_file, **self.writer_kwargs)
            writer.writerows(reader)
        elif filetype == 'fixed':
            self.output_file.write(fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
        elif filetype == 'geojson':
            self.output_file.write(geojson2csv(self.input_file, **kwargs))
        elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
            # Table-based path: load into an agate Table, then dump as CSV.
            if filetype == 'csv':
                table = agate.Table.from_csv(self.input_file, **kwargs)
            elif filetype == 'json':
                table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
            elif filetype == 'ndjson':
                table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
            elif filetype == 'xls':
                table = agate.Table.from_xls(self.input_file, sheet=kwargs.get('sheet', None))
            elif filetype == 'xlsx':
                table = agate.Table.from_xlsx(self.input_file, sheet=kwargs.get('sheet', None))
            elif filetype == 'dbf':
                # from_dbf is given a file name rather than the file object.
                if not hasattr(self.input_file, 'name'):
                    raise ValueError('DBF files can not be converted from stdin. You must pass a filename.')
                table = agate.Table.from_dbf(self.input_file.name, **kwargs)
            table.to_csv(self.output_file)

        # Release the handles opened above instead of leaving them to the
        # garbage collector.
        self.input_file.close()

        # schema is only bound when self.args.schema was set above.
        if self.args.schema:
            schema.close()
Exemplo n.º 27
0
 def test_guess_csv(self):
     # A ".csv" extension is expected to be guessed as 'csv'.
     guessed = convert.guess_format('testdata.csv')
     self.assertEqual('csv', guessed)
Exemplo n.º 28
0
 def test_guess_json(self):
     # A ".json" extension is expected to be guessed as 'json'.
     guessed = convert.guess_format('testdata.json')
     self.assertEqual('json', guessed)
Exemplo n.º 29
0
 def test_guess_invalid(self):
     # An unrecognised extension is expected to yield None.
     guessed = convert.guess_format('testdata.invalid')
     self.assertEqual(None, guessed)
Exemplo n.º 30
0
 def test_guess_dbf(self):
     # A ".dbf" extension is expected to be guessed as 'dbf'.
     guessed = convert.guess_format('testdata.dbf')
     self.assertEqual('dbf', guessed)
Exemplo n.º 31
0
def index():
    """
    Upload view that currently always renders the 'back_soon.html' template.

    The first statements return a 200 response for 'back_soon.html'
    unconditionally, so the upload-handling code that follows is never
    executed in this state.
    """
    status_code = 200
    error = None
    # NOTE(review): unconditional early return -- everything below this line
    # is unreachable. Presumably a temporary maintenance switch; confirm
    # before removing either the return or the dead code.
    return make_response(render_app_template('back_soon.html', error=error), status_code)
    

    # Reuse the '_ga' cookie as the analytics client id, or mint a UUID.
    if flask_session.get('ga_cid') is None:
        try:
            flask_session['ga_cid'] = request.cookies['_ga']
        except KeyError:
            flask_session['ga_cid'] = str(uuid4())
    if request.method == 'POST':
        f = request.files['input_file']
        if f and allowed_file(f.filename):
            # Prefix the stored name with a timestamp.
            fname = secure_filename(str(time.time()) + "_" + f.filename)
            file_path = os.path.abspath(os.path.join(UPLOAD_FOLDER, fname))
            f.save(file_path)
            file_type = convert.guess_format(f.filename)
            # Rewind after save() so convert() reads from the start.
            f.seek(0)
            try:
                file_contents = convert.convert(f, file_type)
            except UnicodeDecodeError:
                file_contents = None
                if sentry:
                    sentry.captureException()
                status_code = 500
                error = '''
                    We had a problem with the file you uploaded. 
                    This might be related to encoding or the file 
                    name having the wrong file extension.
                '''
            if file_contents:
                with open('{0}-converted.csv'.format(file_path), 'wb') as o:
                    o.write(file_contents)
                # Delete existing session keys
                sess_keys = ['training_data', 'counter']
                for k in sess_keys:
                    try:
                        del flask_session[k]
                    except KeyError:
                        pass
                flask_session['last_interaction'] = datetime.now()
                flask_session['raw_table'], \
                    flask_session['header'] = makeRawTable(file_contents)
                # NOTE(review): 'last_interaction' was set just above, so this
                # 30-minute staleness check looks like it can only fire if
                # makeRawTable itself takes that long -- confirm intent.
                old = datetime.now() - timedelta(seconds=60 * 30)
                if flask_session['last_interaction'] < old:
                    del flask_session['raw_table']
                flask_session['filename'] = f.filename
                flask_session['file_path'] = file_path
                flask_session['file_type'] = file_type
               #send_ga_log(
               #    'Row Count', 
               #    flask_session['ga_cid'], 
               #    value=inp_file.line_count
               #)
               #send_ga_log(
               #    'File Type', 
               #    flask_session['ga_cid'], 
               #    label=inp_file.file_type, 
               #)
                return redirect(url_for('select_fields'))
           #except DedupeFileError as e:
           #    send_ga_log('Upload Error', flask_session['ga_cid'], label=e.message)
           #    error = e.message
           #    status_code = 500
        else:
            error = 'Error uploading file. Did you forget to select one?'
            send_ga_log('Upload Error', flask_session['ga_cid'], label=error)
            status_code = 500
    return make_response(render_app_template('index.html', error=error), status_code)
Exemplo n.º 32
0
    def main(self):
        """
        Convert the input named by ``self.args`` to CSV on ``self.output_file``.

        The file type comes from ``self.args.filetype`` when given; otherwise
        it is 'fixed' when ``self.args.schema`` is set, 'json' when
        ``self.args.key`` is set, and finally guessed from the input path.

        Two Excel-only modes exist:

        * ``self.args.names_only``: write the sheet names, one per line, and
          return without converting anything.
        * ``self.args.write_sheets``: after the normal conversion, reopen the
          workbook and write each selected sheet to ``<base>_<index>.csv``,
          where ``<base>`` is the input file name without its extension.
          Requesting this for a non-Excel file type is reported through
          ``self.argparser.error`` before any conversion starts.

        Raises:
            ValueError: if the format is 'fixed' but no schema was supplied,
                or if a DBF input object has no ``name`` attribute.
        """
        path = self.args.input_path

        # Determine the file type.
        if self.args.filetype:
            filetype = self.args.filetype
        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            # Nothing to guess from when there is no path or the path is '-'.
            if not path or path == '-':
                self.argparser.error(
                    'You must specify a format when providing input as piped data via STDIN.'
                )
            filetype = convert.guess_format(path)
            if not filetype:
                self.argparser.error(
                    'Unable to automatically determine the format of the input file. Try specifying '
                    'a format with --format.')

        # Sheet-listing mode: print the sheet names and return early.
        if self.args.names_only:
            if filetype in ('xls', 'xlsx'):
                sheets = self.sheet_names(path, filetype)
                for sheet in sheets:
                    self.output_file.write('%s\n' % sheet)
            else:
                self.argparser.error(
                    'You cannot use the -n or --names options with non-Excel files.'
                )
            return

        # Fail fast: the sheet-writing step below only assigns `tables` for
        # xls/xlsx, so any other file type would otherwise crash there with an
        # unbound local after the main conversion had already been written.
        if self.args.write_sheets and filetype not in ('xls', 'xlsx'):
            self.argparser.error(
                'You cannot use the --write-sheets option with non-Excel files.'
            )

        # Set the input file.
        if filetype in ('xls', 'xlsx'):
            self.input_file = self.open_excel_input_file(path)
        else:
            self.input_file = self._open_input_file(path)

        # Set the reader's arguments.
        kwargs = {}

        if self.args.schema:
            schema = self._open_input_file(self.args.schema)
        elif filetype == 'fixed':
            raise ValueError('schema must not be null when format is "fixed"')

        if filetype == 'csv':
            kwargs.update(self.reader_kwargs)
            kwargs['sniff_limit'] = self.args.sniff_limit

        if filetype in ('xls', 'xlsx'):
            kwargs['header'] = not self.args.no_header_row

        if filetype not in ('dbf', 'geojson', 'json',
                            'ndjson'):  # csv, fixed, xls, xlsx
            kwargs['skip_lines'] = self.args.skip_lines

        # Every format except dbf receives explicit column types.
        if filetype != 'dbf':
            kwargs['column_types'] = self.get_column_types()

        # Convert the file.
        if (filetype == 'csv' and self.args.no_inference
                and not self.args.no_header_row and not self.args.skip_lines
                and self.args.sniff_limit == 0):
            # Streaming path: rows are copied from reader to writer without
            # building a table; only self.reader_kwargs is used here.
            reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
            writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
            writer.writerows(reader)
        elif filetype == 'fixed':
            self.output_file.write(
                fixed2csv(self.input_file,
                          schema,
                          output=self.output_file,
                          **kwargs))
        elif filetype == 'geojson':
            self.output_file.write(geojson2csv(self.input_file, **kwargs))
        elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
            # Table-based path: load into an agate Table, then dump as CSV.
            if filetype == 'csv':
                table = agate.Table.from_csv(self.input_file, **kwargs)
            elif filetype == 'json':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              **kwargs)
            elif filetype == 'ndjson':
                table = agate.Table.from_json(self.input_file,
                                              key=self.args.key,
                                              newline=True,
                                              **kwargs)
            elif filetype == 'xls':
                table = agate.Table.from_xls(
                    self.input_file,
                    sheet=self.args.sheet,
                    encoding_override=self.args.encoding_xls,
                    **kwargs)
            elif filetype == 'xlsx':
                table = agate.Table.from_xlsx(self.input_file,
                                              sheet=self.args.sheet,
                                              **kwargs)
            elif filetype == 'dbf':
                # from_dbf is given a file name rather than the file object.
                if not hasattr(self.input_file, 'name'):
                    raise ValueError(
                        'DBF files can not be converted from stdin. You must pass a filename.'
                    )
                table = agate.Table.from_dbf(self.input_file.name, **kwargs)
            table.to_csv(self.output_file, **self.writer_kwargs)

        if self.args.write_sheets:
            # Close and re-open the file, as the file object has been mutated or closed.
            self.input_file.close()

            self.input_file = self.open_excel_input_file(path)

            # '-' selects every sheet; otherwise a comma-separated list where
            # all-digit entries are treated as integer sheet indexes.
            if self.args.write_sheets == '-':
                sheets = self.sheet_names(path, filetype)
            else:
                sheets = [
                    int(sheet) if sheet.isdigit() else sheet
                    for sheet in self.args.write_sheets.split(',')
                ]

            if filetype == 'xls':
                tables = agate.Table.from_xls(
                    self.input_file,
                    sheet=sheets,
                    encoding_override=self.args.encoding_xls,
                    **kwargs)
            elif filetype == 'xlsx':
                tables = agate.Table.from_xlsx(self.input_file,
                                               sheet=sheets,
                                               **kwargs)

            # One CSV per sheet, numbered by position in `tables`.
            base = splitext(self.input_file.name)[0]
            for i, table in enumerate(tables.values()):
                with open('%s_%d.csv' % (base, i), 'w') as f:
                    table.to_csv(f, **self.writer_kwargs)

        self.input_file.close()

        # schema is only bound when self.args.schema was set above.
        if self.args.schema:
            schema.close()
Exemplo n.º 33
0
    def main(self):
        """
        Convert the input named by ``self.args`` to CSV on ``self.output_file``.

        The file type comes from ``self.args.filetype`` when given (and is
        checked against ``SUPPORTED_FORMATS``); otherwise it is 'fixed' when
        ``self.args.schema`` is set, 'json' when ``self.args.key`` is set, and
        finally guessed from the input path.

        Two Excel-only modes exist:

        * ``self.args.names_only``: write the sheet names, one per line, and
          return without converting anything.
        * ``self.args.write_sheets``: after the normal conversion, reopen the
          workbook and write each selected sheet to ``<base>_<index>.csv``,
          where ``<base>`` is the input file name without its extension.
          Requesting this for a non-Excel file type is reported through
          ``self.argparser.error`` before any conversion starts.

        Raises:
            ValueError: if the format is 'fixed' but no schema was supplied,
                or if a DBF input object has no ``name`` attribute.
        """
        path = self.args.input_path

        # Determine the file type.
        if self.args.filetype:
            filetype = self.args.filetype
            if filetype not in SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' % self.args.filetype)
        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            # Nothing to guess from when there is no path or the path is '-'.
            if not path or path == '-':
                self.argparser.error('You must specify a format when providing input as piped data via STDIN.')
            filetype = convert.guess_format(path)
            if not filetype:
                self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

        # Sheet-listing mode: print the sheet names and return early.
        if self.args.names_only:
            sheets = self.sheet_names(path, filetype)
            if sheets:
                for sheet in sheets:
                    self.output_file.write('%s\n' % sheet)
            else:
                self.argparser.error('You cannot use the -n or --names options with non-Excel files.')
            return

        # Fail fast: the sheet-writing step below only assigns `tables` for
        # xls/xlsx, so any other file type would otherwise crash there with an
        # unbound local after the main conversion had already been written.
        if self.args.write_sheets and filetype not in ('xls', 'xlsx'):
            self.argparser.error('You cannot use the --write-sheets option with non-Excel files.')

        # Set the input file.
        if filetype in ('xls', 'xlsx'):
            self.input_file = self.open_excel_input_file(path)
        else:
            self.input_file = self._open_input_file(path)

        # Set the reader's arguments.
        kwargs = {}

        if self.args.schema:
            schema = self._open_input_file(self.args.schema)
        elif filetype == 'fixed':
            raise ValueError('schema must not be null when format is "fixed"')

        if filetype == 'csv':
            kwargs.update(self.reader_kwargs)
            kwargs['sniff_limit'] = self.args.sniff_limit

        if filetype in ('xls', 'xlsx'):
            kwargs['header'] = not self.args.no_header_row

        if filetype not in ('dbf', 'geojson', 'json', 'ndjson'):  # csv, fixed, xls, xlsx
            kwargs['skip_lines'] = self.args.skip_lines

        # Every format except dbf receives explicit column types.
        if filetype != 'dbf':
            kwargs['column_types'] = self.get_column_types()

        # Convert the file.
        if filetype == 'csv' and self.args.no_inference and not self.args.no_header_row and not self.args.skip_lines and self.args.sniff_limit == 0:
            # Streaming path: rows are copied from reader to writer without
            # building a table; only self.reader_kwargs is used here.
            reader = agate.csv.reader(self.input_file, **self.reader_kwargs)
            writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
            writer.writerows(reader)
        elif filetype == 'fixed':
            self.output_file.write(fixed2csv(self.input_file, schema, output=self.output_file, **kwargs))
        elif filetype == 'geojson':
            self.output_file.write(geojson2csv(self.input_file, **kwargs))
        elif filetype in ('csv', 'dbf', 'json', 'ndjson', 'xls', 'xlsx'):
            # Table-based path: load into an agate Table, then dump as CSV.
            if filetype == 'csv':
                table = agate.Table.from_csv(self.input_file, **kwargs)
            elif filetype == 'json':
                table = agate.Table.from_json(self.input_file, key=self.args.key, **kwargs)
            elif filetype == 'ndjson':
                table = agate.Table.from_json(self.input_file, key=self.args.key, newline=True, **kwargs)
            elif filetype == 'xls':
                table = agate.Table.from_xls(self.input_file, sheet=self.args.sheet, encoding_override=self.args.encoding_xls, **kwargs)
            elif filetype == 'xlsx':
                table = agate.Table.from_xlsx(self.input_file, sheet=self.args.sheet, **kwargs)
            elif filetype == 'dbf':
                # from_dbf is given a file name rather than the file object.
                if not hasattr(self.input_file, 'name'):
                    raise ValueError('DBF files can not be converted from stdin. You must pass a filename.')
                table = agate.Table.from_dbf(self.input_file.name, **kwargs)
            table.to_csv(self.output_file, **self.writer_kwargs)

        if self.args.write_sheets:
            # Close and re-open the file, as the file object has been mutated or closed.
            self.input_file.close()

            self.input_file = self.open_excel_input_file(path)

            # '-' selects every sheet; otherwise a comma-separated list where
            # all-digit entries are treated as integer sheet indexes.
            if self.args.write_sheets == '-':
                sheets = self.sheet_names(path, filetype)
            else:
                sheets = [int(sheet) if sheet.isdigit() else sheet for sheet in self.args.write_sheets.split(',')]

            if filetype == 'xls':
                tables = agate.Table.from_xls(self.input_file, sheet=sheets, encoding_override=self.args.encoding_xls, **kwargs)
            elif filetype == 'xlsx':
                tables = agate.Table.from_xlsx(self.input_file, sheet=sheets, **kwargs)

            # One CSV per sheet, numbered by position in `tables`.
            base = splitext(self.input_file.name)[0]
            for i, table in enumerate(tables.values()):
                with open('%s_%d.csv' % (base, i), 'w') as f:
                    table.to_csv(f, **self.writer_kwargs)

        self.input_file.close()

        # schema is only bound when self.args.schema was set above.
        if self.args.schema:
            schema.close()