def test_valid_file(self):
    """Converting examples/test.xls should produce the expected CSV fixture."""
    buf = six.StringIO()
    with open('examples/test.xls', 'rb') as infile:
        convert.convert(infile, 'xls', output=buf)
    with open('examples/testxls_converted.csv', 'r') as expected:
        self.assertEquals(expected.read(), buf.getvalue())
def __init__(self, file_path, filename):
    """Validate, convert, and size-check an uploaded file.

    :param file_path: path to the uploaded file on disk
    :param filename: original client-supplied name (used to guess the format)
    :raises DedupeFileError: for unsupported formats, undecodable content,
        or files with more than 10,000 rows

    Failures are also reported to the Sentry ``client`` when one is configured.
    """
    now = datetime.now().isoformat()
    self.file_path = file_path
    self.filename = filename
    self.file_type = convert.guess_format(self.filename)
    if self.file_type not in ["xls", "csv", "xlsx"]:
        if client:
            client.captureMessage(" %s Unsupported Format: %s, (%s)" % (now, self.file_type, self.filename))
        raise DedupeFileError("%s is not a supported format" % self.file_type)
    try:
        # Context manager closes the upload even when conversion fails;
        # the original file handle was never closed.
        with open(self.file_path, "rb") as inp:
            self.converted = convert.convert(inp, self.file_type)
    except UnicodeDecodeError:
        if client:
            client.captureException()
        raise DedupeFileError(
            "We had a problem with the file you uploaded. "
            "This might be related to encoding or the file name having the wrong file extension.")
    self.line_count = self.converted.count("\n")
    if self.line_count > 10000:
        if client:
            client.captureMessage(" %s File too big: %s, (%s)" % (now, self.line_count, self.filename))
        raise DedupeFileError("Your file has %s rows and we can only currently handle 10,000." % self.line_count)
    if client:
        client.captureMessage(" %s Format: %s, Line Count: %s" % (now, self.file_type, self.line_count))
def process_file(self):
    '''
    Here we will see if the input file is CSV, or if it is an understood
    format that can be converted to CSV. Assuming it's one of those two,
    we will pass the resulting CSV file over to the csv processor.
    '''
    for this_filename in self.filelist:
        logger.debug('Filename processing is %s', this_filename)
        self.format = convert.guess_format(this_filename)
        logger.debug('Guessed format of %s', self.format)
        if self.format == 'csv':
            # Already CSV: use the file directly.
            self.filename = this_filename
            break
        elif self.format:
            # If it is not a CSV file, but some other
            # understood format, we will convert it to a CSV and
            # write it out to a temporary file.
            fh, self.temp_file = tempfile.mkstemp(suffix='.csv')
            os.close(fh)
            self.filename = self.temp_file
            try:
                logger.debug(
                    'Attempting to convert to format CSV (from %s)', self.format)
                # Context managers close both handles; the input handle
                # previously leaked (open() was passed inline and never closed).
                with open(self.temp_file, 'w') as out_fh:
                    with open(this_filename, 'rb') as in_fh:
                        out_fh.write(convert.convert(in_fh, self.format))
                break
            except Exception as e:
                logger.exception('Failed to process %s to CSV: %s',
                                 self.filename, e)
                os.unlink(self.filename)
                self.filename = None
def upload():
    """Handle an uploaded data file: convert it to CSV, record session
    metadata, persist the raw CSV to /tmp, and kick off initialization.

    Returns a JSON response containing the new session id.
    """
    session_id = unicode(uuid4())
    f = request.files['input_file']
    flask_session['session_name'] = f.filename
    # BUG FIX: rsplit with maxsplit=1 so names containing dots (e.g.
    # 'data.backup.xlsx') yield the real extension; rsplit('.') without a
    # limit returned the second token rather than the last.
    file_type = f.filename.rsplit('.', 1)[1]
    u = StringIO(f.read())
    u.seek(0)
    if file_type != 'csv': # pragma: no cover
        file_format = convert.guess_format(flask_session['session_name'])
        u = StringIO(convert.convert(u, file_format))
    # The first line of the CSV supplies the (slugified) field names.
    fieldnames = [
        slugify(unicode(i)) for i in u.next().strip('\r\n').split(',')
    ]
    flask_session['fieldnames'] = fieldnames
    user_id = flask_session['user_id']
    user = db_session.query(User).get(user_id)
    group = user.groups[0]
    sess = DedupeSession(
        id=session_id,
        name=request.form.get('name'),
        description=request.form.get('description'),
        filename=f.filename,
        group=group,
        status=STATUS_LIST[0]['machine_name'])
    db_session.add(sess)
    db_session.commit()
    u.seek(0)
    with open('/tmp/%s_raw.csv' % session_id, 'wb') as s:
        s.write(u.getvalue())
    del u
    initializeSession.delay(session_id)
    flask_session['session_id'] = session_id
    return jsonify(ready=True, session_id=session_id)
def process_file(self):
    '''
    Here we will see if the input file is CSV, or if it is an understood
    format that can be converted to CSV. Assuming it's one of those two,
    we will pass the resulting CSV file over to the csv processor.
    '''
    for this_filename in self.filelist:
        logger.debug('Filename processing is %s', this_filename)
        self.format = convert.guess_format(this_filename)
        logger.debug('Guessed format of %s', self.format)
        if self.format == 'csv':
            # Already CSV -- use it as-is.
            self.filename = this_filename
            break
        elif self.format:
            # If it is not a CSV file, but some other
            # understood format, we will convert it to a CSV and
            # write it out to a temporary file.
            fh, self.temp_file = tempfile.mkstemp(suffix='.csv')
            os.close(fh)
            self.filename = self.temp_file
            try:
                logger.debug('Attempting to convert to format CSV (from %s)',
                             self.format)
                # Context managers close both handles; the input handle
                # previously leaked when conversion raised.
                with open(this_filename, 'rb') as source, \
                        open(self.temp_file, 'w') as target:
                    target.write(convert.convert(source, self.format))
                break
            except Exception as e:
                logger.exception('Failed to process %s to CSV: %s',
                                 self.filename, e)
                os.unlink(self.filename)
                self.filename = None
def upload():
    # Handle a POSTed data file: convert it to CSV, sample up to 100 rows,
    # guess a geographic type for each column, and stash everything in the
    # session before redirecting to the geography-selection view.
    context = {}
    if request.method == 'POST':
        big_file = False
        try:
            # Accessing request.files triggers request parsing; an oversized
            # upload raises RequestEntityTooLarge here rather than later.
            files = request.files
        except RequestEntityTooLarge, e:
            files = None
            big_file = True
            current_app.logger.info(e)
        if files:
            f = files['input_file']
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                file_format = convert.guess_format(f.filename)
                try:
                    converted = convert.convert(inp, file_format)
                except UnicodeDecodeError:
                    context['errors'] = ['We had a problem with reading your file. \
This could have to do with the file encoding or format']
                    converted = None
                    # Rewind so the upload can be re-read if needed.
                    f.seek(0)
                if converted:
                    outp = StringIO(converted)
                    reader = UnicodeCSVReader(outp)
                    # First row is the header.
                    session['header_row'] = reader.next()
                    rows = []
                    columns = [[] for c in session['header_row']]
                    column_ids = range(len(session['header_row']))
                    # Sample at most the next 100 rows.
                    for row in range(100):
                        try:
                            rows.append(reader.next())
                        except StopIteration:
                            break
                    # Transpose the sampled rows into per-column value lists.
                    for i, row in enumerate(rows):
                        for j, d in enumerate(row):
                            columns[j].append(row[column_ids[j]])
                    sample_data = []
                    guesses = {}
                    # Guess a geotype per column from its header and values.
                    for index, header_val in enumerate(session['header_row']):
                        guesses[index] = guess_geotype(header_val, columns[index])
                        sample_data.append((index, header_val, columns[index]))
                    session['sample_data'] = sample_data
                    session['guesses'] = json.dumps(guesses)
                    outp.seek(0)
                    session['file'] = outp.getvalue()
                    session['filename'] = f.filename
                    return redirect(url_for('views.select_geo'))
            else:
                context['errors'] = ['Only .xls or .xlsx and .csv files are allowed.']
        else:
            context['errors'] = ['You must provide a file to upload.']
        # A too-large upload overrides the generic "no file" message.
        if big_file:
            context['errors'] = ['Uploaded file must be 10mb or less.']
def main(self):
    """Resolve the input format, open the input appropriately, collect
    reader options, and stream the conversion to the output file."""
    args = self.args

    # Format resolution: explicit flag first, then flags that imply a
    # format, then a guess from the input path.
    if args.filetype:
        filetype = args.filetype
        if filetype not in convert.SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % args.filetype)
    elif args.schema:
        filetype = 'fixed'
    elif args.key:
        filetype = 'json'
    else:
        if not args.input_path or args.input_path == '-':
            self.argparser.error('You must specify a format when providing data via STDIN (pipe).')
        filetype = convert.guess_format(args.input_path)
        if not filetype:
            self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

    # Excel formats are binary; everything else goes through the helper.
    if filetype in ('xls', 'xlsx'):
        self.input_file = open(args.input_path, 'rb')
    else:
        self.input_file = self._open_input_file(args.input_path)

    kwargs = self.reader_kwargs
    if args.schema:
        kwargs['schema'] = self._open_input_file(args.schema)
    if args.key:
        kwargs['key'] = args.key
    if args.sniff_limit:
        kwargs['sniff_limit'] = args.sniff_limit
    if args.sheet:
        kwargs['sheet'] = args.sheet
    if args.no_inference:
        # limit=0 disables agate's type inference entirely.
        kwargs['column_types'] = agate.TypeTester(limit=0)
    if filetype == 'csv' and args.no_header_row:
        kwargs['header'] = False

    convert.convert(self.input_file, filetype, output=self.output_file, **kwargs)
def main(self):
    """Resolve the input format and file handle, assemble reader options,
    then convert and write the result to the output file."""
    args = self.args

    if args.format:
        format = args.format
        if format not in convert.SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % args.format)
    elif args.schema:
        format = 'fixed'
    elif args.key:
        format = 'json'
    else:
        if args.file == sys.stdin:
            self.argparser.error(
                'You must specify a format when providing data via STDIN (pipe).'
            )
        format = convert.guess_format(args.file)
        if not format:
            self.argparser.error(
                'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
            )

    # args.file may already be an open file object (e.g. stdin); otherwise
    # pick the open mode based on whether the format is binary.
    if isinstance(args.file, file):
        f = args.file
    elif format in ('xls', 'xlsx'):
        f = open(args.file, 'rb')
    else:
        f = open(args.file, 'rU')

    kwargs = self.reader_kwargs
    # Copy each optional argument into the reader kwargs when set.
    for attr in ('schema', 'key', 'snifflimit', 'sheet'):
        value = getattr(args, attr)
        if value:
            kwargs[attr] = value
    if args.no_inference:
        kwargs['type_inference'] = False
    if format == 'csv' and args.no_header_row:
        kwargs['no_header_row'] = True

    # Fixed width can be processed as a stream
    if format == 'fixed':
        kwargs['output'] = self.output_file

    self.output_file.write(convert.convert(f, format, **kwargs))
def main(self):
    """Determine the input format, open the source file, build reader
    options, and write the converted data to the output file."""
    args = self.args

    if args.filetype:
        filetype = args.filetype
        if filetype not in convert.SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % args.filetype)
    elif args.schema:
        filetype = 'fixed'
    elif args.key:
        filetype = 'json'
    else:
        # No explicit format: it must be guessable from a real path.
        if not args.input_path or args.input_path == '-':
            self.argparser.error(
                'You must specify a format when providing data via STDIN (pipe).'
            )
        filetype = convert.guess_format(args.input_path)
        if not filetype:
            self.argparser.error(
                'Unable to automatically determine the format of the input file. Try specifying a format with --format.'
            )

    # Excel files are binary; everything else goes through the shared helper.
    self.input_file = (open(args.input_path, 'rb')
                       if filetype in ('xls', 'xlsx')
                       else self._open_input_file(args.input_path))

    kwargs = self.reader_kwargs
    if args.schema:
        kwargs['schema'] = self._open_input_file(args.schema)
    if args.key:
        kwargs['key'] = args.key
    if args.snifflimit:
        kwargs['snifflimit'] = args.snifflimit
    if args.sheet:
        kwargs['sheet'] = args.sheet
    if args.no_inference:
        kwargs['type_inference'] = False
    if filetype == 'csv' and args.no_header_row:
        kwargs['no_header_row'] = True

    # Fixed width can be processed as a stream
    if filetype == 'fixed':
        kwargs['output'] = self.output_file

    self.output_file.write(convert.convert(self.input_file, filetype, **kwargs))
def main(self):
    """Determine the input format, convert the file to CSV, and write the
    result to the path in ``self.output_file``.

    Error conditions are recorded in ``self.csvResult`` rather than raised.
    """
    if self.filetype:
        filetype = self.filetype
        if filetype not in convert.SUPPORTED_FORMATS:
            self.csvResult = (_('"%s" is not a supported format') % self.filetype)
    else:
        if not self.file_name or self.file_name == '-':
            self.csvResult = _('You must specify a format.')
        filetype = convert.guess_format(self.file_name)
        if not filetype:
            self.csvResult = _('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

    if filetype in ('xls', 'xlsx'):
        self.input_file = open(self.file_name, 'rb')
    else:
        # BUG FIX: this branch referenced self.args.input_path, but this
        # class tracks its input as self.file_name (self.args is not set
        # here -- every other reference to it was already commented out).
        self.input_file = self._open_input_file(self.file_name)

    kwargs = self.reader_kwargs

    data = convert.convert(self.input_file, filetype, **kwargs)
    # Write via a context manager so the output file is always closed
    # (it was previously opened and never closed); the leftover debug
    # print statements have been removed.
    with open(self.output_file, 'w') as out_file_opened:
        out_file_opened.write(data)
def main(self):
    """Resolve the input format, open the source, gather reader options
    (including export), and write the converted output."""
    args = self.args

    if args.filetype:
        filetype = args.filetype
        if filetype not in convert.SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % args.filetype)
    elif args.schema:
        filetype = 'fixed'
    elif args.key:
        filetype = 'json'
    else:
        if not args.input_path or args.input_path == '-':
            self.argparser.error('You must specify a format when providing data via STDIN (pipe).')
        filetype = convert.guess_format(args.input_path)
        if not filetype:
            self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

    # Excel formats are binary; everything else uses the shared helper.
    if filetype in ('xls', 'xlsx'):
        self.input_file = open(args.input_path, 'rb')
    else:
        self.input_file = self._open_input_file(args.input_path)

    kwargs = self.reader_kwargs
    if args.schema:
        kwargs['schema'] = self._open_input_file(args.schema)
    # Copy each optional flag into the reader kwargs when it is set.
    for attr in ('key', 'snifflimit', 'export', 'sheet'):
        value = getattr(args, attr)
        if value:
            kwargs[attr] = value
    if args.no_inference:
        kwargs['type_inference'] = False
    if filetype == 'csv' and args.no_header_row:
        kwargs['no_header_row'] = True

    # Fixed width can be processed as a stream
    if filetype == 'fixed':
        kwargs['output'] = self.output_file

    converted = convert.convert(self.input_file, filetype, **kwargs)
    self.output_file.write(converted)
def upload():
    # Handle a POSTed data file: enforce the size cap, convert to CSV,
    # sample up to 10 rows per column, and stash everything in the session
    # before redirecting to the geography-selection view.
    context = {}
    if request.method == 'POST':
        f = request.files['input_file']
        if f:
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                # Size check is done on the in-memory buffer, not the
                # request header.
                if sys.getsizeof(inp.getvalue()) <= MAX_CONTENT_LENGTH:
                    inp.seek(0)
                    file_format = convert.guess_format(f.filename)
                    try:
                        converted = convert.convert(inp, file_format)
                    except UnicodeDecodeError:
                        context['errors'] = ['We had a problem with reading your file. \
This could have to do with the file encoding or format']
                        converted = None
                        # Rewind so the upload can be re-read if needed.
                        f.seek(0)
                    if converted:
                        outp = StringIO(converted)
                        reader = UnicodeCSVReader(outp)
                        # First row is the header.
                        session['header_row'] = reader.next()
                        rows = []
                        columns = [[] for c in session['header_row']]
                        column_ids = range(len(session['header_row']))
                        # Sample at most the next 10 rows.
                        for row in range(10):
                            try:
                                rows.append(reader.next())
                            except StopIteration:
                                break
                        # Transpose the sampled rows into per-column lists.
                        for i, row in enumerate(rows):
                            for j, d in enumerate(row):
                                columns[j].append(row[column_ids[j]])
                        # Flatten each column's sample into a display string.
                        columns = [', '.join(c) for c in columns]
                        sample_data = []
                        for index, _ in enumerate(session['header_row']):
                            sample_data.append(
                                (index, session['header_row'][index], columns[index]))
                        session['sample_data'] = sample_data
                        outp.seek(0)
                        session['file'] = outp.getvalue()
                        session['filename'] = f.filename
                        return redirect(url_for('views.select_geo'))
                else:
                    context['errors'] = ['Uploaded file must be 10mb or less.']
            else:
                context['errors'] = [
                    'Only .xls or .xlsx and .csv files are allowed.'
                ]
        else:
            context['errors'] = ['You must provide a file to upload.']
    return render_template('upload.html', **context)
def upload():
    # Handle a POSTed data file: convert it to CSV, sample up to 100 rows,
    # guess a geotype per column, and stash everything in the session
    # before redirecting to the geography-selection view.
    context = {}
    if request.method == 'POST':
        big_file = False
        try:
            # Accessing request.files triggers request parsing; an
            # oversized upload raises RequestEntityTooLarge here.
            files = request.files
        except RequestEntityTooLarge, e:
            files = None
            big_file = True
            current_app.logger.info(e)
        if files:
            f = files['input_file']
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                file_format = convert.guess_format(f.filename)
                try:
                    converted = convert.convert(inp, file_format)
                except UnicodeDecodeError:
                    context['errors'] = ['We had a problem with reading your file. \
This could have to do with the file encoding or format']
                    converted = None
                    # Rewind so the upload can be re-read if needed.
                    f.seek(0)
                if converted:
                    outp = StringIO(converted)
                    reader = UnicodeCSVReader(outp)
                    # First row is the header.
                    session['header_row'] = reader.next()
                    rows = []
                    columns = [[] for c in session['header_row']]
                    column_ids = range(len(session['header_row']))
                    # Sample at most the next 100 rows.
                    for row in range(100):
                        try:
                            rows.append(reader.next())
                        except StopIteration:
                            break
                    # Transpose the sampled rows into per-column lists.
                    for i, row in enumerate(rows):
                        for j,d in enumerate(row):
                            columns[j].append(row[column_ids[j]])
                    sample_data = []
                    guesses = {}
                    # Guess a geotype per column from header and values.
                    for index, header_val in enumerate(session['header_row']):
                        guesses[index] = guess_geotype(header_val, columns[index])
                        sample_data.append((index, header_val, columns[index]))
                    session['sample_data'] = sample_data
                    session['guesses'] = json.dumps(guesses)
                    outp.seek(0)
                    session['file'] = outp.getvalue()
                    session['filename'] = f.filename
                    return redirect(url_for('views.select_geo'))
            else:
                context['errors'] = ['Only .xls or .xlsx and .csv files are allowed.']
        else:
            context['errors'] = ['You must provide a file to upload.']
        # A too-large upload overrides the generic "no file" message.
        if big_file:
            context['errors'] = ['Uploaded file must be 10mb or less.']
def convert_to_csv_reader(filename, sheet=None, infer_types=True):
    """Convert *filename* (any format ``convert`` understands) to CSV and
    return a ``UnicodeCSVReader`` over the converted data.

    :param filename: path to the input file
    :param sheet: optional sheet selector for spreadsheet formats
    :param infer_types: forwarded to ``convert.convert``
    :returns: a ``UnicodeCSVReader`` positioned at the first row
    """
    format = convert.guess_format(filename)
    convert_kwargs = {}
    if sheet is not None:
        # Only pass `sheet` to the `convert` function when its set to
        # a non-None value. This is done to satisfy csvkit which checks
        # for the presence of `sheet`, not whether it's valid.
        convert_kwargs['sheet'] = sheet
    # Close the input handle after conversion; it previously leaked.
    with open(filename, "rb") as f:
        converted = StringIO(convert.convert(f, format,
                                             infer_types=infer_types,
                                             **convert_kwargs))
    reader = UnicodeCSVReader(converted)
    return reader
def main(self):
    """Work out the input format, open the source, collect reader options,
    and write the converted output."""
    args = self.args

    if args.format:
        format = args.format
        if format not in convert.SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % args.format)
    elif args.schema:
        format = 'fixed'
    elif args.key:
        format = 'json'
    else:
        if args.file == sys.stdin:
            self.argparser.error('You must specify a format when providing data via STDIN (pipe).')
        format = convert.guess_format(args.file)
        if not format:
            self.argparser.error('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

    # args.file may already be an open handle (stdin); otherwise choose
    # the mode by whether the format is binary.
    if isinstance(args.file, file):
        f = args.file
    else:
        f = open(args.file, 'rb' if format in ('xls', 'xlsx') else 'rU')

    kwargs = self.reader_kwargs
    if args.schema:
        kwargs['schema'] = args.schema
    if args.key:
        kwargs['key'] = args.key
    if args.snifflimit:
        kwargs['snifflimit'] = args.snifflimit
    if args.sheet:
        kwargs['sheet'] = args.sheet
    if args.no_inference:
        kwargs['type_inference'] = False
    if format == 'csv' and args.no_header_row:
        kwargs['no_header_row'] = True

    # Fixed width can be processed as a stream
    if format == 'fixed':
        kwargs['output'] = self.output_file

    result = convert.convert(f, format, **kwargs)
    self.output_file.write(result)
def __init__(self, file_path, filename):
    """Validate, convert, and size-check an uploaded file.

    :param file_path: path to the uploaded file on disk
    :param filename: original client-supplied name (used to guess format)
    :raises DedupeFileError: for unsupported formats or files with more
        than 10,000 rows
    """
    now = datetime.now().isoformat()
    self.file_path = file_path
    self.filename = filename
    self.file_type = convert.guess_format(self.filename)
    if self.file_type not in ['xls', 'csv', 'xlsx']:
        logger.warning(' %s Unsupported Format: %s, (%s)' % (now, self.file_type, self.filename))
        raise DedupeFileError('%s is not a supported format' % self.file_type)
    # Context manager closes the upload after conversion; the original
    # file handle was never closed.
    with open(self.file_path, 'rb') as inp:
        self.converted = convert.convert(inp, self.file_type)
    self.line_count = self.converted.count('\n')
    if self.line_count > 10000:
        logger.warning(' %s File too big: %s, (%s)' % (now, self.line_count, self.filename))
        raise DedupeFileError('Your file has %s rows and we can only currently handle 10,000.' % self.line_count)
    logger.warning(' %s Format: %s, Line Count: %s' % (now, self.file_type, self.line_count))
def upload():
    # Handle a POSTed data file: enforce the size cap, convert to CSV,
    # sample up to 10 rows per column, and stash everything in the session
    # before redirecting to the geography-selection view.
    context = {}
    if request.method == 'POST':
        f = request.files['input_file']
        if f:
            if allowed_file(f.filename):
                inp = StringIO(f.read())
                # Size check is done on the in-memory buffer, not the
                # request header.
                if sys.getsizeof(inp.getvalue()) <= MAX_CONTENT_LENGTH:
                    inp.seek(0)
                    file_format = convert.guess_format(f.filename)
                    try:
                        converted = convert.convert(inp, file_format)
                    except UnicodeDecodeError:
                        context['errors'] = ['We had a problem with reading your file. \
This could have to do with the file encoding or format']
                        converted = None
                        # Rewind so the upload can be re-read if needed.
                        f.seek(0)
                    if converted:
                        outp = StringIO(converted)
                        reader = UnicodeCSVReader(outp)
                        # First row is the header.
                        session['header_row'] = reader.next()
                        rows = []
                        columns = [[] for c in session['header_row']]
                        column_ids = range(len(session['header_row']))
                        # Sample at most the next 10 rows.
                        for row in range(10):
                            try:
                                rows.append(reader.next())
                            except StopIteration:
                                break
                        # Transpose the sampled rows into per-column lists.
                        for i, row in enumerate(rows):
                            for j,d in enumerate(row):
                                columns[j].append(row[column_ids[j]])
                        # Flatten each column's sample into a display string.
                        columns = [', '.join(c) for c in columns]
                        sample_data = []
                        for index,_ in enumerate(session['header_row']):
                            sample_data.append((index, session['header_row'][index], columns[index]))
                        session['sample_data'] = sample_data
                        outp.seek(0)
                        session['file'] = outp.getvalue()
                        session['filename'] = f.filename
                        return redirect(url_for('views.select_geo'))
                else:
                    context['errors'] = ['Uploaded file must be 10mb or less.']
            else:
                context['errors'] = ['Only .xls or .xlsx and .csv files are allowed.']
        else:
            context['errors'] = ['You must provide a file to upload.']
    return render_template('upload.html', **context)
def hello_world():
    """Fetch a remote file and return it converted to CSV.

    Query parameters: ``url`` (required) and ``ext`` (optional; when
    omitted, inferred from the Content-Disposition header).
    """
    url = request.values.get('url', None)
    extension = request.values.get('ext', None)
    if not url:
        return "No file to convert. url="
    u = urllib2.urlopen(url)
    if not extension:
        # Fall back to the extension embedded in the Content-Disposition
        # header, e.g. 'attachment; filename="data.xls"' -> 'xls'
        # (split on dots, take the last piece, drop the trailing quote).
        extension = u.headers.dict['content-disposition'].split('.')[-1][:-1]
    if not extension:
        return "Couldn't figure out extension. specify with ext="
    return convert.convert(u, extension)
def main(self):
    """Work out the input format, open the source file, and write the
    converted output."""
    args = self.args

    if args.format:
        format = args.format
        if format not in convert.SUPPORTED_FORMATS:
            self.argparser.error('"%s" is not a supported format' % args.format)
    elif args.schema:
        format = "fixed"
    elif args.key:
        format = "json"
    else:
        if args.file == sys.stdin:
            self.argparser.error("You must specify a format when providing data via STDIN (pipe).")
        format = convert.guess_format(args.file)
        if not format:
            self.argparser.error(
                "Unable to automatically determine the format of the input file. Try specifying a format with --format."
            )

    # args.file may already be an open handle (stdin); otherwise pick the
    # open mode based on whether the format is binary.
    if isinstance(args.file, file):
        f = args.file
    else:
        f = open(args.file, "rb" if format == "xls" else "rU")

    kwargs = self.reader_kwargs
    if args.schema:
        kwargs["schema"] = args.schema
    if args.key:
        kwargs["key"] = args.key
    if args.snifflimit:
        kwargs["snifflimit"] = args.snifflimit

    # Fixed width can be processed as a stream
    if format == "fixed":
        kwargs["output"] = self.output_file

    self.output_file.write(convert.convert(f, format, **kwargs))
def main(self):
    """Work out the input format, open the source file, and write the
    converted output; fatal problems terminate via sys.exit."""
    args = self.args

    if args.format:
        format = args.format
        if format not in convert.SUPPORTED_FORMATS:
            sys.exit('"%s" is not a supported format' % args.format)
    elif args.schema:
        format = 'fixed'
    elif args.key:
        format = 'json'
    else:
        if args.file == '<stdin>':
            sys.exit('You must specify a format when providing data via STDIN (pipe).')
        format = convert.guess_format(args.file)
        if not format:
            sys.exit('Unable to automatically determine the format of the input file. Try specifying a format with --format.')

    # args.file may already be an open handle; otherwise pick the open
    # mode based on whether the format is binary.
    if isinstance(args.file, file):
        f = args.file
    else:
        f = open(args.file, 'rb' if format == 'xls' else 'rU')

    kwargs = self.reader_kwargs
    if args.schema:
        kwargs['schema'] = args.schema
    if args.key:
        kwargs['key'] = args.key
    if args.snifflimit:
        kwargs['snifflimit'] = args.snifflimit

    # Fixed width can be processed as a stream
    if format == 'fixed':
        kwargs['output'] = self.output_file

    self.output_file.write(convert.convert(f, format, **kwargs))
def __init__(self, file_path, filename):
    """Validate, convert, and size-check an uploaded file, reporting
    failures to the Sentry ``client``.

    :param file_path: path to the uploaded file on disk
    :param filename: original client-supplied name (used to guess format)
    :raises DedupeFileError: for unsupported formats, undecodable content,
        or files with more than 10,000 rows
    """
    now = datetime.now().isoformat()
    self.file_path = file_path
    self.filename = filename
    self.file_type = convert.guess_format(self.filename)
    if self.file_type not in ['xls', 'csv', 'xlsx']:
        client.captureMessage(' %s Unsupported Format: %s, (%s)' % (now, self.file_type, self.filename))
        raise DedupeFileError('%s is not a supported format' % self.file_type)
    try:
        # Context manager closes the upload even when conversion fails;
        # the original file handle was never closed.
        with open(self.file_path, 'rb') as inp:
            self.converted = convert.convert(inp, self.file_type)
    except UnicodeDecodeError:
        client.captureException()
        raise DedupeFileError(
            'We had a problem with the file you uploaded. '
            'This might be related to encoding or the file name having the wrong file extension.')
    self.line_count = self.converted.count('\n')
    if self.line_count > 10000:
        client.captureMessage(' %s File too big: %s, (%s)' % (now, self.line_count, self.filename))
        raise DedupeFileError('Your file has %s rows and we can only currently handle 10,000.' % self.line_count)
    client.captureMessage(' %s Format: %s, Line Count: %s' % (now, self.file_type, self.line_count))
def test_valid_file(self):
    """Converting examples/test.xls should match the expected CSV fixture."""
    # BUG FIX: xls is a binary format -- open in 'rb'. Text mode can
    # corrupt the bytes on platforms that translate line endings (and the
    # sibling test that uses the output= keyword already opens with 'rb').
    with open('examples/test.xls', 'rb') as f:
        output = convert.convert(f, 'xls')
    with open('examples/testxls_converted.csv', 'r') as f:
        self.assertEquals(f.read(), output)
def scrape(self):
    # Yields one CanadianPerson per non-vacant row of the CSV at
    # self.csv_url (spreadsheets are converted to CSV in memory first).
    # Tracks seats assigned per (role, district) so shared districts get
    # "(seat N)" suffixes.
    seat_numbers = defaultdict(lambda: defaultdict(int))
    extension = os.path.splitext(self.csv_url)[1]
    if extension in ('.xls', '.xlsx'):
        # Download the spreadsheet and convert it to CSV in memory.
        data = StringIO(
            convert.convert(BytesIO(self.get(self.csv_url).content), extension[1:]))
    else:
        data = None
    reader = self.csv_reader(self.csv_url, header=True, encoding=self.encoding, skip_rows=self.skip_rows, data=data)
    # Normalize the header names via the scraper's converter.
    reader.fieldnames = [
        self.header_converter(field) for field in reader.fieldnames
    ]
    for row in reader:
        if any(row.values()):
            # Skip vacant seats.
            if row['last name'] == 'Vacant' or row[
                    'first name'] == 'Vacant':
                continue
            # Apply per-column value corrections configured on the scraper.
            for key, corrections in self.corrections.items():
                if row.get(key) and row[key] in corrections:
                    row[key] = corrections[row[key]]
            # Keep only the first of any semicolon-separated roles.
            role = row['primary role'].split(';', 1)[0]
            name = '{} {}'.format(row['first name'], row['last name'])
            province = row.get('province')
            # Resolve the district: a format string over the row, the
            # jurisdiction's division name, or the 'district name' column.
            if self.district_name:
                if row['district id']:
                    district = self.district_name.format(**row)
                else:
                    district = self.jurisdiction.division_name
            else:
                district = row.get(
                    'district name') or self.jurisdiction.division_name
            # Number seats when several people share a district.
            if self.many_posts_per_area and role not in ('Mayor', 'Regional Chair'):
                seat_numbers[role][district] += 1
                district = '{} (seat {})'.format(
                    district, seat_numbers[role][district])
            # Assemble a postal address from the optional address columns.
            lines = []
            if row.get('address line 1'):
                lines.append(row['address line 1'])
            if row.get('address line 2'):
                lines.append(row['address line 2'])
            if row.get('locality'):
                parts = [row['locality']]
                if province:
                    parts.append(province)
                if row.get('postal code'):
                    parts.extend(['', row['postal code']])
                lines.append(' '.join(parts))
            p = CanadianPerson(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(self.csv_url)
            # Optional attributes, links, and contact details.
            if row.get('gender'):
                p.gender = row['gender']
            if row.get('photo url'):
                p.image = row['photo url']
            if row.get('source url'):
                p.add_source(row['source url'])
            if row.get('website'):
                p.add_link(row['website'])
            p.add_contact('email', row['email'])
            if lines:
                p.add_contact('address', '\n'.join(lines), 'legislature')
            if row.get('phone'):
                p.add_contact('voice', row['phone'].split(';', 1)[0], 'legislature')
            if row.get('fax'):
                p.add_contact('fax', row['fax'], 'legislature')
            if row.get('cell'):
                p.add_contact('cell', row['cell'], 'legislature')
            if row.get('facebook'):
                # Strip tracking fragments/query strings from Facebook URLs.
                p.add_link(re.sub(r'[#?].+', '', row['facebook']))
            if row.get('twitter'):
                p.add_link(row['twitter'])
            # Register any known aliases.
            if name in self.other_names:
                for other_name in self.other_names[name]:
                    p.add_name(other_name)
            yield p
def index():
    # Maintenance switch: this early return short-circuits the whole view
    # and serves the "back soon" page; everything below is intentionally
    # unreachable until this return is removed.
    status_code = 200
    error = None
    return make_response(render_app_template('back_soon.html', error=error), status_code)
    # Assign a Google Analytics client id for the session (cookie or UUID).
    if flask_session.get('ga_cid') is None:
        try:
            flask_session['ga_cid'] = request.cookies['_ga']
        except KeyError:
            flask_session['ga_cid'] = str(uuid4())
    if request.method == 'POST':
        f = request.files['input_file']
        if f and allowed_file(f.filename):
            # Save the upload under a timestamped, sanitized name.
            fname = secure_filename(str(time.time()) + "_" + f.filename)
            file_path = os.path.abspath(os.path.join(UPLOAD_FOLDER, fname))
            f.save(file_path)
            file_type = convert.guess_format(f.filename)
            f.seek(0)
            try:
                file_contents = convert.convert(f, file_type)
            except UnicodeDecodeError:
                file_contents = None
                if sentry:
                    sentry.captureException()
                status_code = 500
                error = '''
                    We had a problem with the file you uploaded.
                    This might be related to encoding or the file name having the wrong file extension.
                '''
            if file_contents:
                # Persist the converted CSV alongside the upload.
                with open('{0}-converted.csv'.format(file_path), 'wb') as o:
                    o.write(file_contents)
                # Delete existing session keys
                sess_keys = ['training_data', 'counter']
                for k in sess_keys:
                    try:
                        del flask_session[k]
                    except KeyError:
                        pass
                flask_session['last_interaction'] = datetime.now()
                flask_session['raw_table'], \
                    flask_session['header'] = makeRawTable(file_contents)
                # Expire the cached raw table after 30 minutes of inactivity.
                old = datetime.now() - timedelta(seconds=60 * 30)
                if flask_session['last_interaction'] < old:
                    del flask_session['raw_table']
                flask_session['filename'] = f.filename
                flask_session['file_path'] = file_path
                flask_session['file_type'] = file_type
                #send_ga_log(
                #    'Row Count',
                #    flask_session['ga_cid'],
                #    value=inp_file.line_count
                #)
                #send_ga_log(
                #    'File Type',
                #    flask_session['ga_cid'],
                #    label=inp_file.file_type,
                #)
                return redirect(url_for('select_fields'))
            #except DedupeFileError as e:
            #    send_ga_log('Upload Error', flask_session['ga_cid'], label=e.message)
            #    error = e.message
            #    status_code = 500
        else:
            error = 'Error uploading file. Did you forget to select one?'
            send_ga_log('Upload Error', flask_session['ga_cid'], label=error)
            status_code = 500
    return make_response(render_app_template('index.html', error=error), status_code)
def scrape(self):
    # Yields one CanadianPerson per non-vacant row of the CSV at
    # self.csv_url. Spreadsheets are converted to CSV in memory; zip
    # archives are downloaded to disk and the configured member extracted.
    seat_numbers = defaultdict(lambda: defaultdict(int))
    extension = os.path.splitext(self.csv_url)[1]
    if extension in ('.xls', '.xlsx'):
        data = StringIO(convert.convert(BytesIO(self.get(self.csv_url).content), extension[1:]))
    elif extension == '.zip':
        basename = os.path.basename(self.csv_url)
        if not self.encoding:
            self.encoding = 'utf-8'
        try:
            # Stream the archive to a local file, then pull out the
            # configured member; the temp file is removed either way.
            response = requests.get(self.csv_url, stream=True)
            with open(basename, 'wb') as f:
                for chunk in response.iter_content():
                    f.write(chunk)
            with ZipFile(basename).open(self.filename, 'r') as fp:
                data = StringIO(fp.read().decode(self.encoding))
        finally:
            os.unlink(basename)
    else:
        data = None
    reader = self.csv_reader(self.csv_url, header=True, encoding=self.encoding, skip_rows=self.skip_rows, data=data)
    # Normalize the header names via the scraper's converter.
    reader.fieldnames = [self.header_converter(field) for field in reader.fieldnames]
    for row in reader:
        if any(row.values()):
            # Skip vacant seats.
            if row['last name'] == 'Vacant' or row['first name'] == 'Vacant':
                continue
            # Apply per-column value corrections configured on the scraper.
            for key, corrections in self.corrections.items():
                if row.get(key) and row[key] in corrections:
                    row[key] = corrections[row[key]]
            # Keep only the first role when several are joined by
            # ';', ' and ', or ' et '.
            role = re.split(r'(?: (?:and|et)\b|;)', row['primary role'], 1)[0].strip()  # ca_on_newmarket, ca_qc_laval, ca_qc_montreal
            name = '{} {}'.format(row['first name'], row['last name'])
            province = row.get('province')
            # All-lowercase roles get capitalized.
            if not re.search(r'[A-Z]', role):  # ca_qc_laval
                role = role.capitalize()
            # Resolve the district: format string over the row, the
            # jurisdiction's division name, or the 'district name' column.
            if self.district_name_format_string:
                if row['district id']:
                    district = self.district_name_format_string.format(**row)
                else:
                    district = self.jurisdiction.division_name
            else:
                district = row.get('district name') or self.jurisdiction.division_name
            # Number seats when several people share a district.
            if self.many_posts_per_area and role not in ('Mayor', 'Regional Chair'):
                seat_numbers[role][district] += 1
                district = '{} (seat {})'.format(district, seat_numbers[role][district])
            # Assemble a postal address from the optional address columns.
            lines = []
            if row.get('address line 1'):
                lines.append(row['address line 1'])
            if row.get('address line 2'):
                lines.append(row['address line 2'])
            if row.get('locality'):
                parts = [row['locality']]
                if province:
                    parts.append(province)
                if row.get('postal code'):
                    parts.extend(['', row['postal code']])
                lines.append(' '.join(parts))
            p = CanadianPerson(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(self.csv_url)
            # Optional attributes, links, and contact details.
            if row.get('gender'):
                p.gender = row['gender']
            if row.get('photo url'):
                p.image = row['photo url']
            if row.get('source url'):
                p.add_source(row['source url'])
            if row.get('website'):
                p.add_link(row['website'], note='web site')
            p.add_contact('email', row['email'])
            if lines:
                p.add_contact('address', '\n'.join(lines), 'legislature')
            if row.get('phone'):
                p.add_contact('voice', row['phone'].split(';', 1)[0], 'legislature')  # ca_qc_montreal
            if row.get('fax'):
                p.add_contact('fax', row['fax'], 'legislature')
            if row.get('cell'):
                p.add_contact('cell', row['cell'], 'legislature')
            if row.get('facebook'):
                # Strip tracking fragments/query strings from Facebook URLs.
                p.add_link(re.sub(r'[#?].+', '', row['facebook']))
            if row.get('twitter'):
                p.add_link(row['twitter'])
            # Register any known aliases.
            if name in self.other_names:
                for other_name in self.other_names[name]:
                    p.add_name(other_name)
            yield p
def scrape(self):
    # Yields one CanadianPerson per non-vacant row of the CSV at
    # self.csv_url (spreadsheets are converted to CSV in memory first).
    seat_numbers = defaultdict(lambda: defaultdict(int))
    extension = os.path.splitext(self.csv_url)[1]
    if extension in ('.xls', '.xlsx'):
        # Download the spreadsheet and convert it to CSV in memory.
        data = StringIO(convert.convert(BytesIO(self.get(self.csv_url).content), extension[1:]))
    else:
        data = None
    reader = self.csv_reader(self.csv_url, header=True, encoding=self.encoding, skip_rows=self.skip_rows, data=data)
    # Normalize the header names via the scraper's converter.
    reader.fieldnames = [self.header_converter(field) for field in reader.fieldnames]
    for row in reader:
        if any(row.values()):
            # Skip vacant seats.
            if row['last name'] == 'Vacant' or row['first name'] == 'Vacant':
                continue
            # Apply per-column value corrections configured on the scraper.
            for key, corrections in self.corrections.items():
                if row.get(key) and row[key] in corrections:
                    row[key] = corrections[row[key]]
            # Keep only the first of any semicolon-separated roles.
            role = row['primary role'].split(';', 1)[0]
            name = '{} {}'.format(row['first name'], row['last name'])
            province = row.get('province')
            # Resolve the district: a format string over the row, the
            # jurisdiction's division name, or the 'district name' column.
            if self.district_name:
                if row['district id']:
                    district = self.district_name.format(**row)
                else:
                    district = self.jurisdiction.division_name
            else:
                district = row.get('district name') or self.jurisdiction.division_name
            # Number seats when several people share a district.
            if self.many_posts_per_area and role not in ('Mayor', 'Regional Chair'):
                seat_numbers[role][district] += 1
                district = '{} (seat {})'.format(district, seat_numbers[role][district])
            # Assemble a postal address from the optional address columns.
            lines = []
            if row.get('address line 1'):
                lines.append(row['address line 1'])
            if row.get('address line 2'):
                lines.append(row['address line 2'])
            if row.get('locality'):
                parts = [row['locality']]
                if province:
                    parts.append(province)
                if row.get('postal code'):
                    parts.extend(['', row['postal code']])
                lines.append(' '.join(parts))
            p = CanadianPerson(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(self.csv_url)
            # Optional attributes, links, and contact details.
            if row.get('gender'):
                p.gender = row['gender']
            if row.get('photo url'):
                p.image = row['photo url']
            if row.get('source url'):
                p.add_source(row['source url'])
            if row.get('website'):
                p.add_link(row['website'], note='web site')
            p.add_contact('email', row['email'])
            if lines:
                p.add_contact('address', '\n'.join(lines), 'legislature')
            if row.get('phone'):
                p.add_contact('voice', row['phone'].split(';', 1)[0], 'legislature')
            if row.get('fax'):
                p.add_contact('fax', row['fax'], 'legislature')
            if row.get('cell'):
                p.add_contact('cell', row['cell'], 'legislature')
            if row.get('facebook'):
                # Strip tracking fragments/query strings from Facebook URLs.
                p.add_link(re.sub(r'[#?].+', '', row['facebook']))
            if row.get('twitter'):
                p.add_link(row['twitter'])
            # Register any known aliases.
            if name in self.other_names:
                for other_name in self.other_names[name]:
                    p.add_name(other_name)
            yield p