예제 #1
0
    def test_valid_file(self):
        """Converting the sample XLS file must reproduce the stored CSV."""
        output = six.StringIO()

        with open('examples/test.xls', 'rb') as f:
            convert.convert(f, 'xls', output=output)

        with open('examples/testxls_converted.csv', 'r') as f:
            # assertEqual is the supported spelling; assertEquals is a
            # deprecated alias.
            self.assertEqual(f.read(), output.getvalue())
예제 #2
0
 def __init__(self, file_path, filename):
     """Convert an uploaded file to CSV text and check its size.

     ``filename`` is only used to guess the format; ``file_path`` is the
     file that is actually read.  On success the converted text is kept in
     ``self.converted`` and its newline count in ``self.line_count``.
     When the module-level ``client`` is truthy, each outcome is also
     reported through it.

     Raises:
         DedupeFileError: if the guessed format is not xls, csv or xlsx,
             if conversion raises UnicodeDecodeError, or if the converted
             text contains more than 10,000 newlines.
     """
     now = datetime.now().isoformat()
     self.file_path = file_path
     self.filename = filename
     self.file_type = convert.guess_format(self.filename)
     if self.file_type not in ["xls", "csv", "xlsx"]:
         if client:
             client.captureMessage(" %s Unsupported Format: %s, (%s)" % (now, self.file_type, self.filename))
         raise DedupeFileError("%s is not a supported format" % self.file_type)
     try:
         # Close the upload as soon as conversion is done instead of leaving
         # the handle open until it happens to be garbage collected.
         with open(self.file_path, "rb") as upload:
             self.converted = convert.convert(upload, self.file_type)
     except UnicodeDecodeError:
         if client:
             client.captureException()
         # Adjacent literals are used so the message no longer carries the
         # source indentation that a backslash continuation embeds in it.
         raise DedupeFileError(
             "We had a problem with the file you uploaded. "
             "This might be related to encoding or the file name having the wrong file extension."
         )
     self.line_count = self.converted.count("\n")
     if self.line_count > 10000:
         if client:
             client.captureMessage(" %s File too big: %s, (%s)" % (now, self.line_count, self.filename))
         raise DedupeFileError("Your file has %s rows and we can only currently handle 10,000." % self.line_count)
     if client:
         client.captureMessage(" %s Format: %s, Line Count: %s" % (now, self.file_type, self.line_count))
예제 #3
0
 def process_file(self):
     '''
     Select the first entry of self.filelist that is CSV or can be
     converted to CSV.

     A file guessed as CSV is used as-is.  Any other recognised format is
     converted and written to a temporary ``.csv`` file whose path is kept
     in self.temp_file.  The chosen path is stored in self.filename.  If a
     conversion fails, the temporary file is removed, self.filename is
     reset to None and the next candidate is tried.  Files whose format
     cannot be guessed are skipped.
     '''
     for this_filename in self.filelist:
         logger.debug('Filename processing is %s', this_filename)
         self.format = convert.guess_format(this_filename)
         logger.debug('Guessed format of %s', self.format)
         if self.format == 'csv':
             self.filename = this_filename
             break
         elif self.format:
             # If it is not a CSV file, but some other
             # understood format, we will convert it to a CSV and
             # write it out to a temporary file.
             fh, self.temp_file = tempfile.mkstemp(suffix='.csv')
             os.close(fh)
             self.filename = self.temp_file
             try:
                 logger.debug(
                     'Attempting to convert to format CSV (from %s)',
                     self.format)
                 # Read the source inside a context manager so its handle
                 # is released even when the conversion raises.
                 with open(this_filename, 'rb') as source:
                     csv_text = convert.convert(source, self.format)
                 with open(self.temp_file, 'w') as fh:
                     fh.write(csv_text)
                 break
             except Exception as e:
                 # Deliberately broad: any failure means "try the next file".
                 logger.exception('Failed to process %s to CSV: %s',
                                  self.filename, e)
                 os.unlink(self.filename)
                 self.filename = None
예제 #4
0
def upload():
    """Store an uploaded file as raw CSV and start a dedupe session for it.

    Reads ``input_file`` from the request, converts it to CSV when its
    extension is not ``csv``, records the slugified header names in the
    Flask session, creates a ``DedupeSession`` row, writes the CSV to
    ``/tmp/<session_id>_raw.csv`` and queues ``initializeSession``.

    Returns a JSON response containing ``ready`` and ``session_id``.
    """
    session_id = unicode(uuid4())
    f = request.files['input_file']
    flask_session['session_name'] = f.filename
    # Split on the last dot only: "my.data.csv" must yield "csv", and a name
    # without any dot must not raise IndexError.
    file_type = f.filename.rsplit('.', 1)[-1]
    u = StringIO(f.read())
    u.seek(0)
    if file_type != 'csv':  # pragma: no cover
        file_format = convert.guess_format(flask_session['session_name'])
        u = StringIO(convert.convert(u, file_format))
    # NOTE(review): a plain split on ',' does not honour quoted header cells
    # that contain commas -- confirm whether such headers can occur.
    fieldnames = [
        slugify(unicode(i)) for i in u.next().strip('\r\n').split(',')
    ]
    flask_session['fieldnames'] = fieldnames
    user_id = flask_session['user_id']
    user = db_session.query(User).get(user_id)
    group = user.groups[0]
    sess = DedupeSession(id=session_id,
                         name=request.form.get('name'),
                         description=request.form.get('description'),
                         filename=f.filename,
                         group=group,
                         status=STATUS_LIST[0]['machine_name'])
    db_session.add(sess)
    db_session.commit()
    u.seek(0)
    with open('/tmp/%s_raw.csv' % session_id, 'wb') as s:
        s.write(u.getvalue())
    del u
    initializeSession.delay(session_id)
    flask_session['session_id'] = session_id
    return jsonify(ready=True, session_id=session_id)
예제 #5
0
파일: csv.py 프로젝트: bhargavasana/nmtk
 def process_file(self):
     '''
     Walk self.filelist and settle on the first file that is CSV or can be
     converted to CSV.

     A CSV file is used directly.  A file in another recognised format is
     converted and written to a temporary ``.csv`` file (path kept in
     self.temp_file).  The selected path is stored in self.filename.  When
     a conversion fails the temporary file is deleted, self.filename is
     set to None and the loop moves on to the next file.  Files with no
     guessable format are skipped.
     '''
     for this_filename in self.filelist:
         logger.debug('Filename processing is %s', this_filename)
         self.format = convert.guess_format(this_filename)
         logger.debug('Guessed format of %s', self.format)
         if self.format == 'csv':
             self.filename = this_filename
             break
         elif self.format:
             # If it is not a CSV file, but some other
             # understood format, we will convert it to a CSV and
             # write it out to a temporary file.
             fh, self.temp_file = tempfile.mkstemp(suffix='.csv')
             os.close(fh)
             self.filename = self.temp_file
             try:
                 logger.debug('Attempting to convert to format CSV (from %s)',
                              self.format)
                 # The context manager guarantees the source handle is
                 # closed whether or not the conversion succeeds.
                 with open(this_filename, 'rb') as source:
                     csv_text = convert.convert(source, self.format)
                 with open(self.temp_file, 'w') as fh:
                     fh.write(csv_text)
                 break
             except Exception as e:
                 # Deliberately broad: any failure means "try the next file".
                 logger.exception('Failed to process %s to CSV: %s',
                                  self.filename, e)
                 os.unlink(self.filename)
                 self.filename = None
예제 #6
0
def upload():
    """Handle the upload form: convert the posted file and stash a preview.

    On a POST with an allowed file, the file is converted to CSV, the
    header row, up to 100 sample rows per column and a geotype guess per
    column are stored in the session, and the client is redirected to
    ``views.select_geo``.  Problems are collected in ``context['errors']``.
    """
    context = {}
    if request.method == 'POST':
        big_file = False
        try:
            files = request.files
        except RequestEntityTooLarge as e:
            files = None
            big_file = True
            current_app.logger.info(e)
        if files:
            f = files['input_file']
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                file_format = convert.guess_format(f.filename)
                try:
                    converted = convert.convert(inp, file_format)
                except UnicodeDecodeError:
                    # Adjacent literals keep the source indentation out of
                    # the user-facing message.
                    context['errors'] = [
                        'We had a problem with reading your file. '
                        'This could have to do with the file encoding or format'
                    ]
                    converted = None
                f.seek(0)
                if converted:
                    outp = StringIO(converted)
                    reader = UnicodeCSVReader(outp)
                    header_row = reader.next()
                    session['header_row'] = header_row
                    # Collect at most 100 data rows for the preview.
                    rows = []
                    for _attempt in range(100):
                        try:
                            rows.append(reader.next())
                        except StopIteration:
                            break
                    columns = [[] for c in header_row]
                    for row in rows:
                        # zip stops at the shorter side, so a row with more
                        # cells than the header no longer raises IndexError.
                        for column, cell in zip(columns, row):
                            column.append(cell)
                    sample_data = []
                    guesses = {}
                    for index, header_val in enumerate(header_row):
                        guesses[index] = guess_geotype(header_val,
                                                       columns[index])
                        sample_data.append((index, header_val, columns[index]))
                    session['sample_data'] = sample_data
                    session['guesses'] = json.dumps(guesses)
                    outp.seek(0)
                    session['file'] = outp.getvalue()
                    session['filename'] = f.filename
                    return redirect(url_for('views.select_geo'))
            else:
                context['errors'] = [
                    'Only .xls or .xlsx and .csv files are allowed.'
                ]
        else:
            context['errors'] = ['You must provide a file to upload.']
            if big_file:
                context['errors'] = ['Uploaded file must be 10mb or less.']
    # NOTE(review): nothing is returned on the fall-through path, so
    # ``context`` is never rendered here -- confirm how errors reach the user.
예제 #7
0
파일: in2csv.py 프로젝트: sukic/csvkit
    def main(self):
        """Work out the input format, gather reader options and convert.

        The format is taken from ``self.args.filetype`` when given; otherwise
        it is ``'fixed'`` when a schema is supplied, ``'json'`` when a key is
        supplied, and failing that it is guessed from
        ``self.args.input_path``.  Problems are reported through
        ``self.argparser.error``.  The converted data is written to
        ``self.output_file`` by ``convert.convert``.
        """
        args = self.args

        if args.filetype:
            filetype = args.filetype
            if filetype not in convert.SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' % args.filetype)
        elif args.schema:
            filetype = 'fixed'
        elif args.key:
            filetype = 'json'
        else:
            path = args.input_path
            if not path or path == '-':
                self.argparser.error('You must specify a format when providing data via STDIN (pipe).')

            filetype = convert.guess_format(args.input_path)
            if not filetype:
                self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

        # Excel workbooks are opened directly in binary mode; every other
        # format goes through the shared opener.
        is_workbook = filetype in ('xls', 'xlsx')
        if is_workbook:
            self.input_file = open(args.input_path, 'rb')
        else:
            self.input_file = self._open_input_file(args.input_path)

        kwargs = self.reader_kwargs

        if args.schema:
            kwargs['schema'] = self._open_input_file(args.schema)

        # These options are forwarded under their own name when truthy.
        for option in ('key', 'sniff_limit', 'sheet'):
            value = getattr(args, option)
            if value:
                kwargs[option] = value

        if args.no_inference:
            kwargs['column_types'] = agate.TypeTester(limit=0)

        if filetype == 'csv' and args.no_header_row:
            kwargs['header'] = False

        convert.convert(self.input_file, filetype, output=self.output_file, **kwargs)
예제 #8
0
    def main(self):
        """Determine the input format, convert the input and write the CSV.

        The format comes from ``self.args.format`` when given, is
        ``'fixed'`` when a schema is supplied, ``'json'`` when a key is
        supplied, and is otherwise guessed from ``self.args.file``.
        Problems are reported through ``self.argparser.error``.  The result
        of ``convert.convert`` is written to ``self.output_file``.
        """
        # Named ``filetype`` so the ``format`` builtin is not shadowed.
        if self.args.format:
            filetype = self.args.format

            if filetype not in convert.SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' %
                                     self.args.format)

        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            if self.args.file == sys.stdin:
                self.argparser.error(
                    'You must specify a format when providing data via STDIN (pipe).'
                )

            filetype = convert.guess_format(self.args.file)

            if not filetype:
                self.argparser.error(
                    'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
                )

        # Only a handle opened here is closed here; a file object passed in
        # through the arguments belongs to the caller.
        opened_here = not isinstance(self.args.file, file)
        if not opened_here:
            f = self.args.file
        elif filetype in ('xls', 'xlsx'):
            f = open(self.args.file, 'rb')
        else:
            f = open(self.args.file, 'rU')

        try:
            kwargs = self.reader_kwargs

            if self.args.schema:
                kwargs['schema'] = self.args.schema

            if self.args.key:
                kwargs['key'] = self.args.key

            if self.args.snifflimit:
                kwargs['snifflimit'] = self.args.snifflimit

            if self.args.sheet:
                kwargs['sheet'] = self.args.sheet

            if self.args.no_inference:
                kwargs['type_inference'] = False

            if filetype == 'csv' and self.args.no_header_row:
                kwargs['no_header_row'] = True

            # Fixed width can be processed as a stream
            if filetype == 'fixed':
                kwargs['output'] = self.output_file

            self.output_file.write(convert.convert(f, filetype, **kwargs))
        finally:
            if opened_here:
                f.close()
예제 #9
0
    def main(self):
        """Pick the input format, assemble converter options and emit CSV.

        The format is ``self.args.filetype`` when given, ``'fixed'`` when a
        schema is supplied, ``'json'`` when a key is supplied, and otherwise
        guessed from ``self.args.input_path``.  Problems are reported through
        ``self.argparser.error``.  Whatever ``convert.convert`` returns is
        written to ``self.output_file``.
        """
        options = self.args

        if options.filetype:
            filetype = options.filetype
            if filetype not in convert.SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' %
                                     options.filetype)
        elif options.schema:
            filetype = 'fixed'
        elif options.key:
            filetype = 'json'
        else:
            source_path = options.input_path
            if not source_path or source_path == '-':
                self.argparser.error(
                    'You must specify a format when providing data via STDIN (pipe).'
                )

            filetype = convert.guess_format(options.input_path)
            if not filetype:
                self.argparser.error(
                    'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
                )

        # Excel workbooks are opened in binary mode; other formats use the
        # shared opener.
        if filetype not in ('xls', 'xlsx'):
            self.input_file = self._open_input_file(options.input_path)
        else:
            self.input_file = open(options.input_path, 'rb')

        kwargs = self.reader_kwargs

        if options.schema:
            kwargs['schema'] = self._open_input_file(options.schema)

        # Pass-through options: copied under the same name when truthy.
        for name in ('key', 'snifflimit', 'sheet'):
            setting = getattr(options, name)
            if setting:
                kwargs[name] = setting

        if options.no_inference:
            kwargs['type_inference'] = False

        if filetype == 'csv' and options.no_header_row:
            kwargs['no_header_row'] = True

        # Fixed width can be processed as a stream
        if filetype == 'fixed':
            kwargs['output'] = self.output_file

        self.output_file.write(
            convert.convert(self.input_file, filetype, **kwargs))
예제 #10
0
    def main(self):
        """Convert ``self.file_name`` to CSV and write it to ``self.output_file``.

        The format is ``self.filetype`` when set, otherwise it is guessed
        from ``self.file_name``.  When the format is unsupported, missing or
        cannot be guessed, a translated message is stored in
        ``self.csvResult`` and the method returns without converting.

        The schema, key, snifflimit, sheet, no-inference and no-header-row
        options are not forwarded here; only ``self.reader_kwargs`` is
        passed to ``convert.convert``.
        """
        if self.filetype:
            filetype = self.filetype

            if filetype not in convert.SUPPORTED_FORMATS:
                self.csvResult = (_('"%s" is not a supported format') % self.filetype)
                # Stop here: converting with an unsupported format cannot work.
                return
        else:
            if not self.file_name or self.file_name == '-':
                self.csvResult = _('You must specify a format.')
                return

            filetype = convert.guess_format(self.file_name)

            if not filetype:
                self.csvResult = _('Unable to automatically determine the format of the input file. Try specifying a format with --format.')
                return

        if filetype in ('xls', 'xlsx'):
            self.input_file = open(self.file_name, 'rb')
        else:
            # Use the same path attribute as the rest of this method.
            # NOTE(review): the original read self.args.input_path here --
            # confirm that self.file_name is the intended source.
            self.input_file = self._open_input_file(self.file_name)

        kwargs = self.reader_kwargs

        data = convert.convert(self.input_file, filetype, **kwargs)

        # The context manager flushes and closes the output file.
        with open(self.output_file, 'w') as out_file:
            out_file.write(data)
예제 #11
0
파일: in2csv.py 프로젝트: MinoGames/csvkit
    def main(self):
        """Resolve the input format, build converter options and write CSV.

        The format is ``self.args.filetype`` when given, ``'fixed'`` when a
        schema is supplied, ``'json'`` when a key is supplied, and otherwise
        guessed from ``self.args.input_path``.  Problems are reported through
        ``self.argparser.error``.  The data returned by ``convert.convert``
        is written to ``self.output_file``.
        """
        cli = self.args

        if cli.filetype:
            filetype = cli.filetype
            if filetype not in convert.SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' % cli.filetype)
        elif cli.schema:
            filetype = 'fixed'
        elif cli.key:
            filetype = 'json'
        else:
            location = cli.input_path
            if not location or location == '-':
                self.argparser.error('You must specify a format when providing data via STDIN (pipe).')

            filetype = convert.guess_format(cli.input_path)
            if not filetype:
                self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

        # Excel workbooks are opened in binary mode; other formats use the
        # shared opener.
        binary_input = filetype in ('xls', 'xlsx')
        if binary_input:
            self.input_file = open(cli.input_path, 'rb')
        else:
            self.input_file = self._open_input_file(cli.input_path)

        kwargs = self.reader_kwargs

        if cli.schema:
            kwargs['schema'] = self._open_input_file(cli.schema)

        # Options copied through under their own name when truthy.
        for key_name in ('key', 'snifflimit', 'export', 'sheet'):
            chosen = getattr(cli, key_name)
            if chosen:
                kwargs[key_name] = chosen

        if cli.no_inference:
            kwargs['type_inference'] = False

        if filetype == 'csv' and cli.no_header_row:
            kwargs['no_header_row'] = True

        # Fixed width can be processed as a stream
        if filetype == 'fixed':
            kwargs['output'] = self.output_file

        converted = convert.convert(self.input_file, filetype, **kwargs)
        self.output_file.write(converted)
예제 #12
0
def upload():
    """Handle the upload form and render it again when something is wrong.

    On a POST with an allowed file no larger than ``MAX_CONTENT_LENGTH``,
    the file is converted to CSV, the header row and a comma-joined sample
    of up to 10 values per column are stored in the session, and the client
    is redirected to ``views.select_geo``.  Otherwise ``upload.html`` is
    rendered with any collected ``errors``.
    """
    context = {}
    if request.method == 'POST':
        f = request.files['input_file']
        if f:
            if allowed_file(f.filename):
                raw = f.read()
                # len() is the payload size; sys.getsizeof also counts the
                # string object's own overhead, which wrongly rejected files
                # right at the limit.
                if len(raw) <= MAX_CONTENT_LENGTH:
                    inp = StringIO(raw)
                    file_format = convert.guess_format(f.filename)
                    try:
                        converted = convert.convert(inp, file_format)
                    except UnicodeDecodeError:
                        # Adjacent literals keep the source indentation out
                        # of the user-facing message.
                        context['errors'] = [
                            'We had a problem with reading your file. '
                            'This could have to do with the file encoding or format'
                        ]
                        converted = None
                    f.seek(0)
                    if converted:
                        outp = StringIO(converted)
                        reader = UnicodeCSVReader(outp)
                        header_row = reader.next()
                        session['header_row'] = header_row
                        # Collect at most 10 data rows for the preview.
                        rows = []
                        for _attempt in range(10):
                            try:
                                rows.append(reader.next())
                            except StopIteration:
                                break
                        columns = [[] for c in header_row]
                        for row in rows:
                            # zip stops at the shorter side, so a row wider
                            # than the header no longer raises IndexError.
                            for column, cell in zip(columns, row):
                                column.append(cell)
                        columns = [', '.join(c) for c in columns]
                        session['sample_data'] = [
                            (index, header_val, columns[index])
                            for index, header_val in enumerate(header_row)
                        ]
                        outp.seek(0)
                        session['file'] = outp.getvalue()
                        session['filename'] = f.filename
                        return redirect(url_for('views.select_geo'))
                else:
                    context['errors'] = ['Uploaded file must be 10mb or less.']
            else:
                context['errors'] = [
                    'Only .xls or .xlsx and .csv files are allowed.'
                ]
        else:
            context['errors'] = ['You must provide a file to upload.']
    return render_template('upload.html', **context)
예제 #13
0
def upload():
    """Process the upload form: convert the posted file and store a preview.

    On a POST with an allowed file, the file is converted to CSV, the
    header row, up to 100 sample values per column and a geotype guess per
    column are stored in the session, and the client is redirected to
    ``views.select_geo``.  Problems are collected in ``context['errors']``.
    """
    context = {}
    if request.method == 'POST':
        big_file = False
        try:
            files = request.files
        except RequestEntityTooLarge as e:
            files = None
            big_file = True
            current_app.logger.info(e)
        if files:
            f = files['input_file']
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                file_format = convert.guess_format(f.filename)
                try:
                    converted = convert.convert(inp, file_format)
                except UnicodeDecodeError:
                    # Adjacent literals keep the source indentation out of
                    # the user-facing message.
                    context['errors'] = [
                        'We had a problem with reading your file. '
                        'This could have to do with the file encoding or format'
                    ]
                    converted = None
                f.seek(0)
                if converted:
                    outp = StringIO(converted)
                    reader = UnicodeCSVReader(outp)
                    header_row = reader.next()
                    session['header_row'] = header_row
                    # Collect at most 100 data rows for the preview.
                    rows = []
                    for _attempt in range(100):
                        try:
                            rows.append(reader.next())
                        except StopIteration:
                            break
                    columns = [[] for c in header_row]
                    for row in rows:
                        # zip stops at the shorter side, so a row with more
                        # cells than the header no longer raises IndexError.
                        for column, cell in zip(columns, row):
                            column.append(cell)
                    sample_data = []
                    guesses = {}
                    for index, header_val in enumerate(header_row):
                        guesses[index] = guess_geotype(header_val, columns[index])
                        sample_data.append((index, header_val, columns[index]))
                    session['sample_data'] = sample_data
                    session['guesses'] = json.dumps(guesses)
                    outp.seek(0)
                    session['file'] = outp.getvalue()
                    session['filename'] = f.filename
                    return redirect(url_for('views.select_geo'))
            else:
                context['errors'] = ['Only .xls or .xlsx and .csv files are allowed.']
        else:
            context['errors'] = ['You must provide a file to upload.']
            if big_file:
                context['errors'] = ['Uploaded file must be 10mb or less.']
    # NOTE(review): nothing is returned on the fall-through path, so
    # ``context`` is never rendered here -- confirm how errors reach the user.
예제 #14
0
def convert_to_csv_reader(filename, sheet=None, infer_types=True):
    """Convert ``filename`` to CSV and return a reader over the result.

    The format is guessed from ``filename``.  ``infer_types`` is always
    forwarded to ``convert.convert``; ``sheet`` is forwarded only when it
    is not None.

    Returns a ``UnicodeCSVReader`` wrapping the converted text.
    """
    # Named ``file_format`` so the ``format`` builtin is not shadowed.
    file_format = convert.guess_format(filename)
    convert_kwargs = {}
    if sheet is not None:
        # Only pass `sheet` to the `convert` function when its set to
        # a non-None value.  This is done to satisfy csvkit which checks
        # for the presence of `sheet`, not whether it's valid.
        convert_kwargs['sheet'] = sheet
    # The converted text is fully materialised before the handle is closed.
    with open(filename, "rb") as f:
        csv_text = convert.convert(f, file_format, infer_types=infer_types, **convert_kwargs)
    return UnicodeCSVReader(StringIO(csv_text))
예제 #15
0
파일: in2csv.py 프로젝트: GMADIGITAL/csvkit
    def main(self):
        """Determine the input format, convert the input and write the CSV.

        The format comes from ``self.args.format`` when given, is
        ``'fixed'`` when a schema is supplied, ``'json'`` when a key is
        supplied, and is otherwise guessed from ``self.args.file``.
        Problems are reported through ``self.argparser.error``.  The result
        of ``convert.convert`` is written to ``self.output_file``.
        """
        # Named ``filetype`` so the ``format`` builtin is not shadowed.
        if self.args.format:
            filetype = self.args.format

            if filetype not in convert.SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' % self.args.format)

        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            if self.args.file == sys.stdin:
                self.argparser.error('You must specify a format when providing data via STDIN (pipe).')

            filetype = convert.guess_format(self.args.file)

            if not filetype:
                self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

        # Only a handle opened here is closed here; a file object passed in
        # through the arguments belongs to the caller.
        opened_here = not isinstance(self.args.file, file)
        if not opened_here:
            f = self.args.file
        elif filetype in ('xls', 'xlsx'):
            f = open(self.args.file, 'rb')
        else:
            f = open(self.args.file, 'rU')

        try:
            kwargs = self.reader_kwargs

            if self.args.schema:
                kwargs['schema'] = self.args.schema

            if self.args.key:
                kwargs['key'] = self.args.key

            if self.args.snifflimit:
                kwargs['snifflimit'] = self.args.snifflimit

            if self.args.sheet:
                kwargs['sheet'] = self.args.sheet

            if self.args.no_inference:
                kwargs['type_inference'] = False

            if filetype == 'csv' and self.args.no_header_row:
                kwargs['no_header_row'] = True

            # Fixed width can be processed as a stream
            if filetype == 'fixed':
                kwargs['output'] = self.output_file

            self.output_file.write(convert.convert(f, filetype, **kwargs))
        finally:
            if opened_here:
                f.close()
예제 #16
0
 def __init__(self, file_path, filename):
     """Convert an uploaded file to CSV text and check its size.

     ``filename`` is only used to guess the format; ``file_path`` is the
     file that is actually read.  On success the converted text is kept in
     ``self.converted`` and its newline count in ``self.line_count``.
     Each outcome is logged at warning level.

     Raises:
         DedupeFileError: if the guessed format is not xls, csv or xlsx, or
             if the converted text contains more than 10,000 newlines.
     """
     now = datetime.now().isoformat()
     self.file_path = file_path
     self.filename = filename
     self.file_type = convert.guess_format(self.filename)
     if self.file_type not in ['xls', 'csv', 'xlsx']:
         # Arguments are passed separately so the logger does the formatting.
         logger.warning(' %s Unsupported Format: %s, (%s)', now, self.file_type, self.filename)
         raise DedupeFileError('%s is not a supported format' % self.file_type)
     # Close the upload as soon as conversion is done instead of leaving the
     # handle open until it happens to be garbage collected.
     with open(self.file_path, 'rb') as upload:
         self.converted = convert.convert(upload, self.file_type)
     self.line_count = self.converted.count('\n')
     if self.line_count > 10000:
         logger.warning(' %s File too big: %s, (%s)', now, self.line_count, self.filename)
         raise DedupeFileError('Your file has %s rows and we can only currently handle 10,000.' % self.line_count)
     logger.warning(' %s Format: %s, Line Count: %s', now, self.file_type, self.line_count)
예제 #17
0
def upload():
    """Process the upload form, re-rendering it when validation fails.

    On a POST with an allowed file no larger than ``MAX_CONTENT_LENGTH``,
    the file is converted to CSV, the header row and a comma-joined sample
    of up to 10 values per column are stored in the session, and the client
    is redirected to ``views.select_geo``.  Otherwise ``upload.html`` is
    rendered with any collected ``errors``.
    """
    context = {}
    if request.method == 'POST':
        f = request.files['input_file']
        if f:
            if allowed_file(f.filename):
                raw = f.read()
                # len() is the payload size; sys.getsizeof also counts the
                # string object's own overhead, which wrongly rejected files
                # right at the limit.
                if len(raw) <= MAX_CONTENT_LENGTH:
                    inp = StringIO(raw)
                    file_format = convert.guess_format(f.filename)
                    try:
                        converted = convert.convert(inp, file_format)
                    except UnicodeDecodeError:
                        # Adjacent literals keep the source indentation out
                        # of the user-facing message.
                        context['errors'] = [
                            'We had a problem with reading your file. '
                            'This could have to do with the file encoding or format'
                        ]
                        converted = None
                    f.seek(0)
                    if converted:
                        outp = StringIO(converted)
                        reader = UnicodeCSVReader(outp)
                        header_row = reader.next()
                        session['header_row'] = header_row
                        # Collect at most 10 data rows for the preview.
                        rows = []
                        for _attempt in range(10):
                            try:
                                rows.append(reader.next())
                            except StopIteration:
                                break
                        columns = [[] for c in header_row]
                        for row in rows:
                            # zip stops at the shorter side, so a row wider
                            # than the header no longer raises IndexError.
                            for column, cell in zip(columns, row):
                                column.append(cell)
                        columns = [', '.join(c) for c in columns]
                        session['sample_data'] = [
                            (index, header_val, columns[index])
                            for index, header_val in enumerate(header_row)
                        ]
                        outp.seek(0)
                        session['file'] = outp.getvalue()
                        session['filename'] = f.filename
                        return redirect(url_for('views.select_geo'))
                else:
                    context['errors'] = ['Uploaded file must be 10mb or less.']
            else:
                context['errors'] = ['Only .xls or .xlsx and .csv files are allowed.']
        else:
            context['errors'] = ['You must provide a file to upload.']
    return render_template('upload.html', **context)
예제 #18
0
파일: app.py 프로젝트: liyanchang/XLStoCSV
def hello_world():
    """Fetch the file named by the ``url`` request value and return it converted.

    The input format comes from the ``ext`` request value when present;
    otherwise it is derived from the text after the last dot of the
    response's ``content-disposition`` header.

    Returns the result of ``convert.convert`` on success, or a short
    plain-text message when the URL or the extension cannot be determined.
    """
    url = request.values.get('url', None)
    extension = request.values.get('ext', None)

    if not url:
        return "No file to convert. url="

    # NOTE(review): this fetches a caller-supplied URL from the server side;
    # consider restricting the allowed schemes/hosts.
    u = urllib2.urlopen(url)
    try:
        if not extension:
            # A missing header (or one without a dot) leaves the extension
            # empty so the caller gets the explanatory message below instead
            # of a KeyError or a garbage format name.
            disposition = u.headers.dict.get('content-disposition', '')
            _, dot, tail = disposition.rpartition('.')
            # Drops the final character of the header value -- presumably the
            # closing quote of filename="..."; TODO confirm.
            extension = tail[:-1] if dot else ''

        if not extension:
            return "Couldn't figure out extension. specify with ext="

        return convert.convert(u, extension)
    finally:
        # Release the HTTP response on every exit path.
        u.close()
예제 #19
0
파일: in2csv.py 프로젝트: mikewaters/csvkit
    def main(self):
        """Convert the input file and write the result to ``self.output_file``.

        The input format is ``self.args.format`` when given, "fixed" when
        ``self.args.schema`` is set, "json" when ``self.args.key`` is set, and
        otherwise whatever ``convert.guess_format`` returns for the file.
        Problems are reported through ``self.argparser.error``.
        """
        if self.args.format:
            filetype = self.args.format

            if filetype not in convert.SUPPORTED_FORMATS:
                self.argparser.error('"%s" is not a supported format' % self.args.format)

        elif self.args.schema:
            filetype = "fixed"
        elif self.args.key:
            filetype = "json"
        else:
            if self.args.file == sys.stdin:
                self.argparser.error("You must specify a format when providing data via STDIN (pipe).")

            filetype = convert.guess_format(self.args.file)

            if not filetype:
                self.argparser.error(
                    "Unable to automatically determine the format of the input file. Try specifying a format with --format."
                )

        # Only handles opened here are closed afterwards; a file object that
        # was handed in stays open for its owner.
        opened_here = not isinstance(self.args.file, file)
        if not opened_here:
            f = self.args.file
        elif filetype == "xls":
            f = open(self.args.file, "rb")
        else:
            f = open(self.args.file, "rU")

        try:
            kwargs = self.reader_kwargs

            if self.args.schema:
                kwargs["schema"] = self.args.schema

            if self.args.key:
                kwargs["key"] = self.args.key

            if self.args.snifflimit:
                kwargs["snifflimit"] = self.args.snifflimit

            # Fixed width can be processed as a stream
            if filetype == "fixed":
                kwargs["output"] = self.output_file

            self.output_file.write(convert.convert(f, filetype, **kwargs))
        finally:
            if opened_here:
                f.close()
예제 #20
0
파일: in2csv.py 프로젝트: goldenboy/csvkit
    def main(self):
        """Convert the input file and write the result to ``self.output_file``.

        The input format is ``self.args.format`` when given, 'fixed' when
        ``self.args.schema`` is set, 'json' when ``self.args.key`` is set, and
        otherwise whatever ``convert.guess_format`` returns for the file.
        Unusable input terminates the program through ``sys.exit`` with a
        message.
        """
        if self.args.format:
            filetype = self.args.format

            if filetype not in convert.SUPPORTED_FORMATS:
                sys.exit('"%s" is not a supported format' % self.args.format)

        elif self.args.schema:
            filetype = 'fixed'
        elif self.args.key:
            filetype = 'json'
        else:
            if self.args.file == '<stdin>':
                sys.exit('You must specify a format when providing data via STDIN (pipe).')

            filetype = convert.guess_format(self.args.file)

            if not filetype:
                sys.exit('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

        # Only handles opened here are closed afterwards; a file object that
        # was handed in stays open for its owner.
        opened_here = not isinstance(self.args.file, file)
        if not opened_here:
            f = self.args.file
        elif filetype == 'xls':
            f = open(self.args.file, 'rb')
        else:
            f = open(self.args.file, 'rU')

        try:
            kwargs = self.reader_kwargs

            if self.args.schema:
                kwargs['schema'] = self.args.schema

            if self.args.key:
                kwargs['key'] = self.args.key

            if self.args.snifflimit:
                kwargs['snifflimit'] = self.args.snifflimit

            # Fixed width can be processed as a stream
            if filetype == 'fixed':
                kwargs['output'] = self.output_file

            self.output_file.write(convert.convert(f, filetype, **kwargs))
        finally:
            if opened_here:
                f.close()
예제 #21
0
 def __init__(self, file_path, filename):
     """Convert an uploaded file and record its type and line count.

     ``filename`` is used to guess the format; ``file_path`` is the file
     that is actually read.  Every outcome is reported through ``client``.

     Raises DedupeFileError when the guessed format is not xls/csv/xlsx,
     when conversion fails with UnicodeDecodeError, or when the converted
     text contains more than 10000 newlines.
     """
     now = datetime.now().isoformat()
     self.file_path = file_path
     self.filename = filename
     self.file_type = convert.guess_format(self.filename)
     if self.file_type not in ['xls', 'csv', 'xlsx']:
         client.captureMessage(' %s Unsupported Format: %s, (%s)' % (now, self.file_type, self.filename))
         raise DedupeFileError('%s is not a supported format' % self.file_type)
     try:
         # The context manager closes the upload even when conversion fails.
         with open(self.file_path, 'rb') as source:
             self.converted = convert.convert(source, self.file_type)
     except UnicodeDecodeError:
         client.captureException()
         raise DedupeFileError('We had a problem with the file you uploaded. \
                 This might be related to encoding or the file name having the wrong file extension.')
     self.line_count = self.converted.count('\n')
     if self.line_count > 10000:
         client.captureMessage(' %s File too big: %s, (%s)' % (now, self.line_count, self.filename))
         raise DedupeFileError('Your file has %s rows and we can only currently handle 10,000.' % self.line_count)
     client.captureMessage(' %s Format: %s, Line Count: %s' % (now, self.file_type, self.line_count))
예제 #22
0
    def test_valid_file(self):
        """Converting examples/test.xls must reproduce examples/testxls_converted.csv."""
        # .xls is a binary format, so it is read in binary mode; text mode can
        # translate or fail to decode the raw bytes.
        with open('examples/test.xls', 'rb') as f:
            output = convert.convert(f, 'xls')

        with open('examples/testxls_converted.csv', 'r') as f:
            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(f.read(), output)
예제 #23
0
    def scrape(self):
        """Yield a CanadianPerson for each non-empty, non-vacant row of the CSV.

        When ``self.csv_url`` ends in .xls or .xlsx the file is fetched with
        ``self.get`` and converted to CSV text before being handed to
        ``self.csv_reader``; for any other extension the reader gets
        ``data=None``.
        """
        seats_filled = defaultdict(lambda: defaultdict(int))

        suffix = os.path.splitext(self.csv_url)[1]
        data = None
        if suffix in ('.xls', '.xlsx'):
            spreadsheet = BytesIO(self.get(self.csv_url).content)
            data = StringIO(convert.convert(spreadsheet, suffix[1:]))

        reader = self.csv_reader(
            self.csv_url,
            header=True,
            encoding=self.encoding,
            skip_rows=self.skip_rows,
            data=data,
        )
        reader.fieldnames = [self.header_converter(heading) for heading in reader.fieldnames]

        for row in reader:
            # Skip rows with no values at all.
            if not any(row.values()):
                continue
            if row['last name'] == 'Vacant' or row['first name'] == 'Vacant':
                continue

            # Apply per-column value corrections in place.
            for field, fixes in self.corrections.items():
                current = row.get(field)
                if current and current in fixes:
                    row[field] = fixes[current]

            # Only the text before the first ';' is used as the role.
            role = row['primary role'].split(';', 1)[0]
            name = '{} {}'.format(row['first name'], row['last name'])
            province = row.get('province')

            if not self.district_name:
                district = row.get('district name') or self.jurisdiction.division_name
            elif row['district id']:
                district = self.district_name.format(**row)
            else:
                district = self.jurisdiction.division_name

            if self.many_posts_per_area and role not in ('Mayor', 'Regional Chair'):
                # Number the seats per (role, district) in row order.
                seats_for_role = seats_filled[role]
                seats_for_role[district] += 1
                district = '{} (seat {})'.format(district, seats_for_role[district])

            address = [row[field] for field in ('address line 1', 'address line 2') if row.get(field)]
            if row.get('locality'):
                locality_parts = [row['locality']]
                if province:
                    locality_parts.append(province)
                if row.get('postal code'):
                    # The empty element yields two spaces before the postal code.
                    locality_parts += ['', row['postal code']]
                address.append(' '.join(locality_parts))

            person = CanadianPerson(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(self.csv_url)
            if row.get('gender'):
                person.gender = row['gender']
            if row.get('photo url'):
                person.image = row['photo url']
            if row.get('source url'):
                person.add_source(row['source url'])
            if row.get('website'):
                person.add_link(row['website'])
            person.add_contact('email', row['email'])
            if address:
                person.add_contact('address', '\n'.join(address), 'legislature')
            if row.get('phone'):
                person.add_contact('voice', row['phone'].split(';', 1)[0], 'legislature')
            for kind in ('fax', 'cell'):
                if row.get(kind):
                    person.add_contact(kind, row[kind], 'legislature')
            if row.get('facebook'):
                # Strip everything from the first '#' or '?' onwards.
                person.add_link(re.sub(r'[#?].+', '', row['facebook']))
            if row.get('twitter'):
                person.add_link(row['twitter'])
            if name in self.other_names:
                for alias in self.other_names[name]:
                    person.add_name(alias)
            yield person
예제 #24
0
 def test_valid_file(self):
     """Converting examples/test.xls must reproduce examples/testxls_converted.csv."""
     # .xls is a binary format, so it is read in binary mode; text mode can
     # translate or fail to decode the raw bytes.
     with open('examples/test.xls', 'rb') as f:
         output = convert.convert(f, 'xls')

     with open('examples/testxls_converted.csv', 'r') as f:
         # assertEqual replaces the deprecated assertEquals alias.
         self.assertEqual(f.read(), output)
예제 #25
0
파일: app.py 프로젝트: copyfun/dedupe-web
def index():
    """Serve the landing/upload page; currently short-circuited to "back soon".

    The first ``return`` below unconditionally responds with
    ``back_soon.html`` and status 200, so every statement after it is
    unreachable.  NOTE(review): presumably a temporary maintenance switch --
    confirm before deleting either the early return or the code below it.

    The unreachable body handles the upload form: on POST it saves the
    uploaded file under UPLOAD_FOLDER, converts it, writes a
    ``<path>-converted.csv`` copy, resets session state and redirects to
    ``select_fields``; otherwise it renders ``index.html`` with any error.
    """
    status_code = 200
    error = None
    # Early exit: nothing below this line runs.
    return make_response(render_app_template('back_soon.html', error=error), status_code)
    

    # --- unreachable from here on (see docstring) ---
    # Use the '_ga' cookie as the analytics client id, or mint a random one.
    if flask_session.get('ga_cid') is None:
        try:
            flask_session['ga_cid'] = request.cookies['_ga']
        except KeyError:
            flask_session['ga_cid'] = str(uuid4())
    if request.method == 'POST':
        f = request.files['input_file']
        if f and allowed_file(f.filename):
            # Prefix the name with the current timestamp before sanitising it.
            fname = secure_filename(str(time.time()) + "_" + f.filename)
            file_path = os.path.abspath(os.path.join(UPLOAD_FOLDER, fname))
            f.save(file_path)
            file_type = convert.guess_format(f.filename)
            # Rewind after save() so convert() reads from the start.
            f.seek(0)
            try:
                file_contents = convert.convert(f, file_type)
            except UnicodeDecodeError:
                file_contents = None
                if sentry:
                    sentry.captureException()
                status_code = 500
                error = '''
                    We had a problem with the file you uploaded. 
                    This might be related to encoding or the file 
                    name having the wrong file extension.
                '''
            if file_contents:
                # Keep the converted CSV next to the saved upload.
                with open('{0}-converted.csv'.format(file_path), 'wb') as o:
                    o.write(file_contents)
                # Delete existing session keys
                sess_keys = ['training_data', 'counter']
                for k in sess_keys:
                    try:
                        del flask_session[k]
                    except KeyError:
                        pass
                flask_session['last_interaction'] = datetime.now()
                flask_session['raw_table'], \
                    flask_session['header'] = makeRawTable(file_contents)
                # NOTE(review): 'last_interaction' was set to datetime.now()
                # just above, so this 30-minute staleness check looks like it
                # can never be true -- confirm intent.
                old = datetime.now() - timedelta(seconds=60 * 30)
                if flask_session['last_interaction'] < old:
                    del flask_session['raw_table']
                flask_session['filename'] = f.filename
                flask_session['file_path'] = file_path
                flask_session['file_type'] = file_type
               #send_ga_log(
               #    'Row Count', 
               #    flask_session['ga_cid'], 
               #    value=inp_file.line_count
               #)
               #send_ga_log(
               #    'File Type', 
               #    flask_session['ga_cid'], 
               #    label=inp_file.file_type, 
               #)
                return redirect(url_for('select_fields'))
           #except DedupeFileError as e:
           #    send_ga_log('Upload Error', flask_session['ga_cid'], label=e.message)
           #    error = e.message
           #    status_code = 500
        else:
            # No file, or a filename rejected by allowed_file().
            error = 'Error uploading file. Did you forget to select one?'
            send_ga_log('Upload Error', flask_session['ga_cid'], label=error)
            status_code = 500
    return make_response(render_app_template('index.html', error=error), status_code)
예제 #26
0
    def scrape(self):
        """Yield a CanadianPerson for each non-empty, non-vacant row of the CSV.

        Source handling depends on the extension of ``self.csv_url``:

        * .xls / .xlsx -- fetched with ``self.get`` and converted to CSV text.
        * .zip -- downloaded to a temporary file in the working directory
          (named after the URL's basename), from which the member
          ``self.filename`` is read and decoded with ``self.encoding``
          (set to 'utf-8' on the instance when it was empty).  The temporary
          file is removed afterwards.
        * anything else -- ``self.csv_reader`` is called with ``data=None``.
        """
        seat_numbers = defaultdict(lambda: defaultdict(int))

        extension = os.path.splitext(self.csv_url)[1]
        if extension in ('.xls', '.xlsx'):
            data = StringIO(convert.convert(BytesIO(self.get(self.csv_url).content), extension[1:]))
        elif extension == '.zip':
            basename = os.path.basename(self.csv_url)
            if not self.encoding:
                self.encoding = 'utf-8'
            try:
                response = requests.get(self.csv_url, stream=True)
                with open(basename, 'wb') as f:
                    # An explicit chunk size avoids iter_content()'s
                    # one-byte-at-a-time default.
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                # Close the archive itself, not only the member, so the file
                # handle is released before the unlink below.
                with ZipFile(basename) as archive:
                    with archive.open(self.filename, 'r') as fp:
                        data = StringIO(fp.read().decode(self.encoding))
            finally:
                # The download may have failed before the file was created;
                # do not let cleanup mask that error.
                if os.path.exists(basename):
                    os.unlink(basename)
        else:
            data = None

        reader = self.csv_reader(self.csv_url, header=True, encoding=self.encoding, skip_rows=self.skip_rows, data=data)
        reader.fieldnames = [self.header_converter(field) for field in reader.fieldnames]
        for row in reader:
            if any(row.values()):
                if row['last name'] == 'Vacant' or row['first name'] == 'Vacant':
                    continue

                # Apply per-column value corrections in place.
                for key, corrections in self.corrections.items():
                    if row.get(key) and row[key] in corrections:
                        row[key] = corrections[row[key]]

                # Keep only the text before the first " and"/" et"/";".
                role = re.split(r'(?: (?:and|et)\b|;)', row['primary role'], 1)[0].strip()  # ca_on_newmarket, ca_qc_laval, ca_qc_montreal
                name = '{} {}'.format(row['first name'], row['last name'])
                province = row.get('province')

                if not re.search(r'[A-Z]', role):  # ca_qc_laval
                    role = role.capitalize()

                if self.district_name_format_string:
                    if row['district id']:
                        district = self.district_name_format_string.format(**row)
                    else:
                        district = self.jurisdiction.division_name
                else:
                    district = row.get('district name') or self.jurisdiction.division_name

                if self.many_posts_per_area and role not in ('Mayor', 'Regional Chair'):
                    # Number the seats per (role, district) in row order.
                    seat_numbers[role][district] += 1
                    district = '{} (seat {})'.format(district, seat_numbers[role][district])

                lines = []
                if row.get('address line 1'):
                    lines.append(row['address line 1'])
                if row.get('address line 2'):
                    lines.append(row['address line 2'])
                if row.get('locality'):
                    parts = [row['locality']]
                    if province:
                        parts.append(province)
                    if row.get('postal code'):
                        # The empty element yields two spaces before the postal code.
                        parts.extend(['', row['postal code']])
                    lines.append(' '.join(parts))

                p = CanadianPerson(primary_org='legislature', name=name, district=district, role=role)
                p.add_source(self.csv_url)
                if row.get('gender'):
                    p.gender = row['gender']
                if row.get('photo url'):
                    p.image = row['photo url']
                if row.get('source url'):
                    p.add_source(row['source url'])
                if row.get('website'):
                    p.add_link(row['website'], note='web site')
                p.add_contact('email', row['email'])
                if lines:
                    p.add_contact('address', '\n'.join(lines), 'legislature')
                if row.get('phone'):
                    p.add_contact('voice', row['phone'].split(';', 1)[0], 'legislature')  # ca_qc_montreal
                if row.get('fax'):
                    p.add_contact('fax', row['fax'], 'legislature')
                if row.get('cell'):
                    p.add_contact('cell', row['cell'], 'legislature')
                if row.get('facebook'):
                    # Strip everything from the first '#' or '?' onwards.
                    p.add_link(re.sub(r'[#?].+', '', row['facebook']))
                if row.get('twitter'):
                    p.add_link(row['twitter'])
                if name in self.other_names:
                    for other_name in self.other_names[name]:
                        p.add_name(other_name)
                yield p
예제 #27
0
파일: utils.py 프로젝트: ppival/scrapers-ca
    def scrape(self):
        """Yield a CanadianPerson for each non-empty, non-vacant row of the CSV.

        When ``self.csv_url`` ends in .xls or .xlsx the file is fetched with
        ``self.get`` and converted to CSV text before being handed to
        ``self.csv_reader``; for any other extension the reader gets
        ``data=None``.
        """
        seats_filled = defaultdict(lambda: defaultdict(int))

        suffix = os.path.splitext(self.csv_url)[1]
        data = None
        if suffix in ('.xls', '.xlsx'):
            spreadsheet = BytesIO(self.get(self.csv_url).content)
            data = StringIO(convert.convert(spreadsheet, suffix[1:]))

        reader = self.csv_reader(
            self.csv_url,
            header=True,
            encoding=self.encoding,
            skip_rows=self.skip_rows,
            data=data,
        )
        reader.fieldnames = [self.header_converter(heading) for heading in reader.fieldnames]

        for row in reader:
            # Skip rows with no values at all.
            if not any(row.values()):
                continue
            if row['last name'] == 'Vacant' or row['first name'] == 'Vacant':
                continue

            # Apply per-column value corrections in place.
            for field, fixes in self.corrections.items():
                current = row.get(field)
                if current and current in fixes:
                    row[field] = fixes[current]

            # Only the text before the first ';' is used as the role.
            role = row['primary role'].split(';', 1)[0]
            name = '{} {}'.format(row['first name'], row['last name'])
            province = row.get('province')

            if not self.district_name:
                district = row.get('district name') or self.jurisdiction.division_name
            elif row['district id']:
                district = self.district_name.format(**row)
            else:
                district = self.jurisdiction.division_name

            if self.many_posts_per_area and role not in ('Mayor', 'Regional Chair'):
                # Number the seats per (role, district) in row order.
                seats_for_role = seats_filled[role]
                seats_for_role[district] += 1
                district = '{} (seat {})'.format(district, seats_for_role[district])

            address = [row[field] for field in ('address line 1', 'address line 2') if row.get(field)]
            if row.get('locality'):
                locality_parts = [row['locality']]
                if province:
                    locality_parts.append(province)
                if row.get('postal code'):
                    # The empty element yields two spaces before the postal code.
                    locality_parts += ['', row['postal code']]
                address.append(' '.join(locality_parts))

            person = CanadianPerson(primary_org='legislature', name=name, district=district, role=role)
            person.add_source(self.csv_url)
            if row.get('gender'):
                person.gender = row['gender']
            if row.get('photo url'):
                person.image = row['photo url']
            if row.get('source url'):
                person.add_source(row['source url'])
            if row.get('website'):
                person.add_link(row['website'], note='web site')
            person.add_contact('email', row['email'])
            if address:
                person.add_contact('address', '\n'.join(address), 'legislature')
            if row.get('phone'):
                person.add_contact('voice', row['phone'].split(';', 1)[0], 'legislature')
            for kind in ('fax', 'cell'):
                if row.get(kind):
                    person.add_contact(kind, row[kind], 'legislature')
            if row.get('facebook'):
                # Strip everything from the first '#' or '?' onwards.
                person.add_link(re.sub(r'[#?].+', '', row['facebook']))
            if row.get('twitter'):
                person.add_link(row['twitter'])
            if name in self.other_names:
                for alias in self.other_names[name]:
                    person.add_name(alias)
            yield person