def test_delimiters(self): sniffer = csv.Sniffer() dialect = sniffer.sniff(self.sample3) # given that all three lines in sample3 are equal, # I think that any character could have been 'guessed' as the # delimiter, depending on dictionary order self.assertIn(dialect.delimiter, self.sample3) dialect = sniffer.sniff(self.sample3, delimiters="?,") self.assertEqual(dialect.delimiter, "?") dialect = sniffer.sniff(self.sample3, delimiters="/,") self.assertEqual(dialect.delimiter, "/") dialect = sniffer.sniff(self.sample4) self.assertEqual(dialect.delimiter, ";") dialect = sniffer.sniff(self.sample5) self.assertEqual(dialect.delimiter, "\t") dialect = sniffer.sniff(self.sample6) self.assertEqual(dialect.delimiter, "|") dialect = sniffer.sniff(self.sample7) self.assertEqual(dialect.delimiter, "|") self.assertEqual(dialect.quotechar, "'") dialect = sniffer.sniff(self.sample8) self.assertEqual(dialect.delimiter, '+') dialect = sniffer.sniff(self.sample9) self.assertEqual(dialect.delimiter, '+') self.assertEqual(dialect.quotechar, "'")
def load_data(year):
    """Load the CSV data for *year* into the in-memory cache.

    Returns True when the year is (now) cached, False when no data file
    exists for that year.
    """
    key = str(year)
    if key in CACHE:
        return True

    data_file = os.path.join(
        os.path.dirname(__file__), 'data', '{}.csv'.format(key))
    if not os.path.isfile(data_file):
        return False

    CACHE[key] = {}
    with io.open(data_file, encoding='utf-8') as rf:
        # Sniff a sample to decide whether the first line is a header row.
        has_header = csv.Sniffer().has_header(rf.read(1024))
        rf.seek(0)
        reader = csv.DictReader(rf, DATA_FIELDS)
        if has_header:
            next(reader)
        for data_line in reader:
            day = clean_up_dict(data_line)
            # Pre-parse the date into int fields so lookups later do not
            # need to parse strings again.
            dt = datetime.strptime(day['date'], '%Y-%m-%d')
            day['year'] = dt.year
            day['month'] = dt.month
            day['day'] = dt.day
            day['isholiday'] = bool(int(day['isholiday']))
            day['isworkday'] = bool(int(day['isworkday']))
            CACHE[key][day.pop('date')] = day
    return True
def parse(self, csvsrc): text, encoding = self.detect_encoding(csvsrc, default_encodings=['utf-8', 'utf-16']) #FIXME: raise parse error if encoding detection fails? self.encoding = encoding or 'utf-8' sniffer = csv.Sniffer() sample = text[:1024] try: self.dialect = sniffer.sniff(sample) if self.dialect.quoting == csv.QUOTE_MINIMAL: #HACKISH: most probably a default, not real detection self.dialect.quoting = csv.QUOTE_ALL self.dialect.doublequote = True except csv.Error: self.dialect = 'default' inputfile = csv.StringIO(text) try: fieldnames = detect_header(inputfile, self.dialect, self.fieldnames) self.fieldnames = fieldnames except csv.Error: pass inputfile.seek(0) reader = try_dialects(inputfile, self.fieldnames, self.dialect) first_row = True for row in reader: newce = self.UnitClass() newce.fromdict(row) if not first_row or not newce.match_header(): self.addunit(newce) first_row = False
def ingest(self, file_path):
    """Ingest a CSV file: detect its encoding and dialect, then emit
    its rows into self.result as tabular data."""
    # First pass over raw bytes to detect the character encoding.
    with io.open(file_path, 'rb') as fh:
        encoding = self.detect_stream_encoding(fh)
        log.debug("Detected encoding [%s]: %s", self.result, encoding)

    # Second pass in text mode with the detected encoding.
    with io.open(file_path, 'r', newline='', encoding=encoding) as fh:
        sample = fh.read(4096 * 10)
        fh.seek(0)
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(sample)
        # dialect.delimiter = dialect.delimiter[0]
        has_header = sniffer.has_header(sample)
        reader = csv.reader(fh, dialect=dialect)
        rows = self.generate_rows(reader, has_header=has_header)
        self.result.flag(self.result.FLAG_TABULAR)
        self.result.emit_rows(rows)
def test_sniff(self): sniffer = csv.Sniffer() dialect = sniffer.sniff(self.sample1) self.assertEqual(dialect.delimiter, ",") self.assertEqual(dialect.quotechar, '"') self.assertEqual(dialect.skipinitialspace, True) dialect = sniffer.sniff(self.sample2) self.assertEqual(dialect.delimiter, ":") self.assertEqual(dialect.quotechar, "'") self.assertEqual(dialect.skipinitialspace, False)
def test_doublequote(self): sniffer = csv.Sniffer() dialect = sniffer.sniff(self.header1) self.assertFalse(dialect.doublequote) dialect = sniffer.sniff(self.header2) self.assertFalse(dialect.doublequote) dialect = sniffer.sniff(self.sample2) self.assertTrue(dialect.doublequote) dialect = sniffer.sniff(self.sample8) self.assertFalse(dialect.doublequote) dialect = sniffer.sniff(self.sample9) self.assertTrue(dialect.doublequote)
def read_csv(inputFilePath, outFilePath, headerFields, fileEncoding, detect):
    """Scan a delimited text file, identify its file type from the header
    fields, and write the detected header (plus, in non-detect mode, data)
    to *outFilePath*.

    :param fileEncoding: input encoding; defaults to UTF-8 when None.
        The output file is always written as UTF-8 (matching the original
        behavior of opening the output before the override was applied).
    :param detect: the string "true" selects detection-only mode.
    :returns: (filetype, numlines) tuple.
    """
    encoding = "utf-8"
    if fileEncoding is not None:
        encoding = fileEncoding
    filetype = ""
    numlines = 0
    # BUGFIX: context manager ensures the output file is closed even when
    # dialect sniffing raises csv.Error (previously it leaked).
    with io.open(outFilePath, 'w', encoding="utf-8") as outfile:
        # 'r' already gives universal-newline text mode in Python 3; the
        # legacy 'rU' flag is deprecated and removed in 3.11.
        with io.open(inputFilePath, "r", encoding=encoding) as file:
            # Re-sniff successive lines until an expected delimiter appears.
            # NOTE(review): csv.Error propagates if EOF is hit first
            # (sniffing '' raises) -- same as the original behavior.
            sniffed = csv.Sniffer().sniff(file.readline())
            while sniffed.delimiter not in ";,\t":
                sniffed = csv.Sniffer().sniff(file.readline())
            file.seek(0)
            outData = ""
            reader = csv.reader(file, delimiter=sniffed.delimiter)
            if detect == "true":
                # Detection-only mode: stop at the first row after the
                # file type is known.
                for fields in reader:
                    row = ';'.join(fields) + '\n'
                    if filetype == "":
                        filetype, header = getFileType(row, headerFields,
                                                       encoding)
                        if header:
                            outData = header
                    else:
                        break
            else:
                for fields in reader:
                    row = ';'.join(fields) + '\n'
                    if filetype == "":
                        filetype, header = getFileType(row, headerFields,
                                                       encoding)
                        if header:
                            outData = header
                    else:
                        # NOTE(review): each row overwrites outData, so only
                        # the final row is written below -- looks suspicious
                        # but preserved as-is.
                        outData = row
                    numlines = numlines + 1
            outfile.write(outData)
    return filetype, numlines
def __init__(self, file_path, encoding=None, delimiter=None):
    """Open *file_path* as CSV, sniff its dialect, and read the header row.

    :param encoding: text encoding; guessed from a raw byte sample when
        not given.
    :param delimiter: overrides the sniffed delimiter when given.
    """
    if encoding is None:
        # BUGFIX: sample raw bytes ('rb') for encoding detection.
        # Opening in text mode would already have decoded the data with
        # the locale default, defeating guess_encoding() -- the sibling
        # open_csv() helper correctly reads 'rb' here.
        with open(file_path, 'rb') as fh:
            data = fh.read(SAMPLE_SIZE)
        encoding = guess_encoding(data)
    self.fh = io.open(file_path, 'r', encoding=encoding)
    data = self.fh.read(SAMPLE_SIZE)
    dialect = csv.Sniffer().sniff(data)
    if delimiter is not None:
        dialect.delimiter = delimiter
    self.fh.seek(0)
    self.reader = iter(csv.reader(self.fh, dialect=dialect))
    # First row is taken as the header.
    self.headers = next(self.reader)
    self.count = 0
def get_campaign_events_from_csv(file_name):
    """Yield CampaignEvent objects parsed from the CSV at *file_name*.

    Expected columns: relative_to, offset, unit, delivery_hour, lang_code,
    message. Translations may follow as optional (lang_code, message)
    column pairs at the end of each row; when present, the event message
    becomes a JSON object mapping lang_code -> message.

    :raises ValueError: when the header has too few columns or an odd
        number of translation columns.
    """
    BASE_COLUMNS = [
        'relative_to', 'offset', 'unit', 'delivery_hour', 'lang_code',
        'message'
    ]
    with io.open(file_name, encoding='utf-8') as csv_file:
        has_header = csv.Sniffer().has_header(csv_file.read(2048))
        csv_file.seek(0)
        csv_reader = csv.reader(csv_file)
        base_col_len = len(BASE_COLUMNS)
        # BUGFIX: previously only assigned inside the has_header branch,
        # raising NameError in the row loop for headerless files.
        has_translations = False
        if has_header:
            header = next(csv_reader)
            length = len(header)
            if length < base_col_len:
                raise ValueError(
                    "Invalid CSV format: A minimum of {0} columns expected".
                    format(base_col_len))
            # Translations are given as optional (lang_code, message)
            # column pairs at the end of each row.
            if length > base_col_len:
                # Translations need to be provided as lang_code, message pairs
                if (length - base_col_len) % 2 != 0:
                    raise ValueError(
                        "Invalid CSV format: Pairs of lang_code, message columns expected"
                    )
                has_translations = True
        for row in csv_reader:
            event = CampaignEvent(relative_to=row[0].strip(),
                                  offset=row[1].strip(),
                                  unit=row[2].strip(),
                                  delivery_hour=row[3].strip(),
                                  message=row[5].strip())
            lang_code = row[4].strip()
            if has_translations:
                translations = row[base_col_len:]
                full_message = {lang_code: event.message}
                # Pair up the trailing columns: (code, msg), (code, msg), ...
                iterator = iter(translations)
                translations = zip(iterator, iterator)
                for code, msg in translations:
                    full_message[code.strip()] = msg.strip()
                event.message = json.dumps(full_message, sort_keys=True)
            yield event
def open_csv(file_path, encoding=None, delimiter=None):
    """Open a CSV file and return (file_handle, delimiter, headers).

    The handle is positioned at the start of the file; the caller owns it
    and must close it.

    :param encoding: text encoding; guessed from a raw byte sample when
        not given.
    :param delimiter: field delimiter; sniffed from a sample when not given.
    """
    if encoding is None:
        with io.open(file_path, 'rb') as fh:
            data = fh.read(SAMPLE_SIZE)
        encoding = guess_encoding(data)
    fh = io.open(file_path, 'r', encoding=encoding)
    try:
        if delimiter is None:
            data = fh.read(SAMPLE_SIZE)
            delimiter = csv.Sniffer().sniff(data).delimiter
            fh.seek(0)
        reader = csv.reader(fh, delimiter=delimiter)
        # First row (if any) is the header; empty file yields [].
        headers = next(reader, [])
        fh.seek(0)
    except Exception:
        # BUGFIX: don't leak the handle when sniffing/reading raises
        # (e.g. csv.Error on an undetectable dialect).
        fh.close()
        raise
    return fh, delimiter, headers
def read_excel_xlrd(inputFilePath, outFilePath, headerFields, detect): wb = xlrd.open_workbook( inputFilePath) #on_demand = True, encoding='cp1252' outfile = open(outFilePath, "w") filetype = "" numlines = 0 for sheet_name in wb.sheet_names(): sh = wb.sheet_by_name(sheet_name) if sh.ncols == 1: sniffed = csv.Sniffer().sniff(''.join(sh.row_values(0))) for i in range(sh.nrows): row = ''.join( map(lambda e: unicode(e).strip(), ';'.join( sh.row_values(i)).split(sniffed.delimiter))) + '\n' if filetype == "": filetype, header = getFileType(row, headerFields) if header: outfile.write(header) else: if detect == "true": break outfile.write(row.encode("utf8")) numlines = numlines + 1 else: for i in range(sh.nrows): row = ';'.join( map(lambda e: unicode(e).strip(), sh.row_values(i))) + '\n' if filetype == "": filetype, header = getFileType(row, headerFields) if header: outfile.write(header) else: if detect == "true": break outfile.write(row.encode("utf8")) numlines = numlines + 1 outfile.close() return filetype, numlines
def test_has_header_regex_special_delimiter(self): sniffer = csv.Sniffer() self.assertEqual(sniffer.has_header(self.sample8), False) self.assertEqual(sniffer.has_header(self.header2 + self.sample8), True)
def test_has_header(self): sniffer = csv.Sniffer() self.assertEqual(sniffer.has_header(self.sample1), False) self.assertEqual(sniffer.has_header(self.header1 + self.sample1), True)