def try_dialects(inputfile, fieldnames, dialect):
    # FIXME: does it verify at all if we don't actually step through the file?
    try:
        inputfile.seek(0)
        reader = csv.DictReader(inputfile, fieldnames=fieldnames,
                                dialect=dialect, restkey=EXTRA_KEY)
    except csv.Error:
        try:
            inputfile.seek(0)
            reader = csv.DictReader(inputfile, fieldnames=fieldnames,
                                    dialect='default', restkey=EXTRA_KEY)
        except csv.Error:
            inputfile.seek(0)
            reader = csv.DictReader(inputfile, fieldnames=fieldnames,
                                    dialect='excel', restkey=EXTRA_KEY)
    return reader

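# The FIXME above is right to be suspicious: csv.DictReader does no parsing at
# construction time, so the except branches can only ever fire while iterating,
# never on the line that builds the reader. A hedged sketch of a variant
# (hypothetical helper, not from the code above) that forces one row through
# each candidate dialect so errors actually surface:
def try_dialects_eagerly(inputfile, fieldnames, dialects=('excel', 'excel-tab')):
    for dialect in dialects:
        inputfile.seek(0)
        probe = csv.DictReader(inputfile, fieldnames=fieldnames,
                               dialect=dialect, strict=True)
        try:
            next(probe)  # parsing happens here, so csv.Error surfaces here
        except csv.Error:
            continue
        except StopIteration:
            pass  # an empty file parses under any dialect
        inputfile.seek(0)  # rewind and hand back a fresh reader
        return csv.DictReader(inputfile, fieldnames=fieldnames, dialect=dialect)
    raise csv.Error("no candidate dialect could parse the file")
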
def readData(input_file, field_names, delimiter=',', prefix=None, configuration=None):
    """
    Read in our data from a CSV file and create a dictionary of records,
    where the key is a unique record ID and each value is a dict of the row fields.

    **Currently, dedupe depends upon records' unique ids being integers with
    no integers skipped. The smallest valued unique id must be 0 or 1. Expect
    this requirement will likely be relaxed in the future.**
    """
    if not configuration:
        raise Exception("configuration argument is not really optional")

    data = {}

    reader = csv.DictReader(StringIO(input_file), delimiter=delimiter)
    for i, row in enumerate(reader):
        clean_row = {
            k: preProcess(k, v, configuration)
            for (k, v) in row.items()
            if k is not None
        }
        if prefix:
            row_id = u"%s|%s" % (prefix, i)
        else:
            row_id = i
        data[row_id] = clean_row

    return data

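# A note on the `if k is not None` guard above: when a data row is longer than
# the header, DictReader collects the overflow in a list under the key None
# (the default restkey), which preProcess could not handle. For example:
#
#   next(csv.DictReader(StringIO("a,b\n1,2,3\n")))
#   # -> {'a': '1', 'b': '2', None: ['3']}
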
def load_data(year):
    ''' Load data into memory cache '''
    year = str(year)
    if year in CACHE:
        return True
    data_file = os.path.join(os.path.dirname(__file__), 'data',
                             '{}.csv'.format(year))
    if not os.path.isfile(data_file):
        return False
    CACHE[year] = {}
    with io.open(data_file, encoding='utf-8') as rf:
        # Detect CSV header line
        has_header = csv.Sniffer().has_header(rf.read(1024))
        rf.seek(0)
        reader = csv.DictReader(rf, DATA_FIELDS)
        if has_header:
            next(reader)
        for data_line in reader:
            day = clean_up_dict(data_line)
            # Convert into `int` type so we don't need to parse it afterwards
            dt = datetime.strptime(day['date'], '%Y-%m-%d')
            day['year'] = dt.year
            day['month'] = dt.month
            day['day'] = dt.day
            day['isholiday'] = bool(int(day['isholiday']))
            day['isworkday'] = bool(int(day['isworkday']))
            CACHE[year][day.pop('date')] = day
    return True

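# A hedged usage sketch of the cache shape built above (the field names come
# from the loop; the concrete values are made up for illustration):
#
#   if load_data(2018):
#       day = CACHE['2018'].get('2018-01-01')
#       # e.g. {'year': 2018, 'month': 1, 'day': 1,
#       #       'isholiday': True, 'isworkday': False}
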
def readData(input_file, field_names, prefix=None):
    """
    Read in our data from a CSV file and create a dictionary of records,
    where the key is a unique record ID and each value is a dict of the row fields.

    **Currently, dedupe depends upon records' unique ids being integers with
    no integers skipped. The smallest valued unique id must be 0 or 1. Expect
    this requirement will likely be relaxed in the future.**
    """
    data = {}

    reader = csv.DictReader(StringIO(input_file))
    timer.elapsed('Opened file')
    for i, row in enumerate(reader):
        clean_row = {
            k: preProcess(v)
            for (k, v) in row.items()
            if k is not None
        }
        if prefix:
            row_id = u"%s|%s" % (prefix, i)
        else:
            row_id = i
        data[row_id] = clean_row
        if i % 100000 == 0:
            timer.elapsed(str(i))

    return data

def delete(api, start, date, r):
    with io.open("tweets.csv", encoding='utf-8') as file:
        count = 0

        for row in csv.DictReader(file):
            tweet_id = int(row["tweet_id"])
            tweet_date = parse(row["timestamp"], ignoretz=True).date()
            if start != "" and tweet_date < parse(start).date():
                continue
            if date != "" and tweet_date >= parse(date).date():
                continue
            if (r == "retweet" and row["retweeted_status_id"] == "" or
                    r == "reply" and row["in_reply_to_status_id"] == ""):
                continue
            try:
                print("Deleting tweet #{0} ({1})".format(tweet_id, tweet_date))
                api.DestroyStatus(tweet_id)
                count += 1
                time.sleep(0.2)
            except twitter.TwitterError as err:
                print("Exception: %s\n" % err.message)

        print("Number of deleted tweets: %s\n" % count)

def iter_records(self):
    info_path = os.path.join(self.path, 'data.csv')
    with io.open(info_path, encoding='utf8') as f:
        for row in csv.DictReader(f):
            if row['failed']:
                continue
            yield row

def parse(self, input):
    """Parse the given file or file source string."""
    if hasattr(input, 'name'):
        self.filename = input.name
    elif not getattr(self, 'filename', ''):
        self.filename = ''
    if hasattr(input, "read"):
        tmsrc = input.read()
        input.close()
        input = tmsrc
    if TAB_UTF16 in input.split(b"\n")[0]:
        self.encoding = 'utf-16'
    else:
        self.encoding = 'iso-8859-1'
    try:
        input = input.decode(self.encoding)
    except Exception:
        raise ValueError("Wordfast files are either UTF-16 (UCS2) or ISO-8859-1 encoded")
    reader = csv.DictReader(input.split("\n"), fieldnames=WF_FIELDNAMES,
                            dialect="wordfast")
    for idx, line in enumerate(reader):
        if idx == 0:
            header = dict(zip(WF_FIELDNAMES_HEADER,
                              [line[key] for key in WF_FIELDNAMES]))
            self.header = WordfastHeader(header)
            continue
        newunit = WordfastUnit()
        newunit.dict = line
        self.addunit(newunit)

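# A dialect name like "wordfast" only works because csv.register_dialect was
# called with it before parse() runs (in translate-toolkit the registration
# lives alongside this class). A minimal sketch of such a registration; the
# exact settings here are assumptions, not the toolkit's real ones:
class _WordfastDialectSketch(csv.Dialect):
    delimiter = "\t"          # Wordfast TM columns are tab-separated
    quotechar = None
    quoting = csv.QUOTE_NONE  # fields are never quoted
    lineterminator = "\r\n"
    doublequote = False
    skipinitialspace = False
    escapechar = None

csv.register_dialect("wordfast-sketch", _WordfastDialectSketch)
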
def test_read_dict_no_fieldnames(self):
    with TemporaryFile("w+") as fileobj:
        fileobj.write("f1,f2,f3\r\n1,2,abc\r\n")
        fileobj.seek(0)
        reader = csv.DictReader(fileobj)
        self.assertEqual(next(reader), {"f1": '1', "f2": '2', "f3": 'abc'})
        self.assertEqual(reader.fieldnames, ["f1", "f2", "f3"])

def _parseCSV(self, file):
    """read CSV from IO using DictReader"""
    reader = csv.DictReader(file)
    self._headers = list(reader.fieldnames)
    for row in reader:
        self._rows.append(row.copy())

def trac_fetcher(ticket):
    url = '%s/ticket/%s' % (base_url, ticket)
    response = requests.get(url + '?format=csv')
    if response.status_code == 200:
        reader = csv.DictReader(response.text.split('\n'))
        row = next(reader)
        return url, row.get('summary', None)
    else:
        return url, None

def parse_csv(data):
    f = io.StringIO(data)
    reader = csv.DictReader(f, dialect="excel-tab", strict=True)
    result = [dict(row) for row in reader]
    for row in result:
        for k in row.keys():
            if not k.isalnum():
                # Otherwise the reader would parse just about anything...
                raise csv.Error
    return result

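# strict=True is what gives the checks above teeth: by default the csv reader
# accepts quite malformed input silently. A standalone illustration (not part
# of the function above):
#
#   next(csv.reader(['"a"b\tc\n'], dialect="excel-tab"))
#   # -> ['ab', 'c']  (stray text after a closing quote is swallowed)
#   next(csv.reader(['"a"b\tc\n'], dialect="excel-tab", strict=True))
#   # raises csv.Error (delimiter expected after the closing quote)
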
def test_read_dict_fieldnames_chain(self):
    import itertools
    with TemporaryFile("w+") as fileobj:
        fileobj.write("f1,f2,f3\r\n1,2,abc\r\n")
        fileobj.seek(0)
        reader = csv.DictReader(fileobj)
        first = next(reader)
        for row in itertools.chain([first], reader):
            self.assertEqual(reader.fieldnames, ["f1", "f2", "f3"])
            self.assertEqual(row, {"f1": '1', "f2": '2', "f3": 'abc'})

def test_read_long(self):
    with TemporaryFile("w+") as fileobj:
        fileobj.write("1,2,abc,4,5,6\r\n")
        fileobj.seek(0)
        reader = csv.DictReader(fileobj, fieldnames=["f1", "f2"])
        self.assertEqual(next(reader), {"f1": '1', "f2": '2',
                                        None: ["abc", "4", "5", "6"]})

def test_export(self):
    self.store.export()
    # Reach the private __csv_location attribute through its name-mangled form.
    csv_fpath = self.store._CountDataStorage__csv_location
    with open(csv_fpath, newline='', encoding='utf-8') as rf:
        reader = csv.DictReader(rf)
        for row in reader:
            day = datetime.strptime(row['Date'], self.format)
            self.assertEqual(
                self.store.get(day), int(row['Count']),
                'Count of date {} should be {}'.format(
                    row['Date'], row['Count']))

def test_read_long_with_rest_no_fieldnames(self):
    with TemporaryFile("w+") as fileobj:
        fileobj.write("f1,f2\r\n1,2,abc,4,5,6\r\n")
        fileobj.seek(0)
        reader = csv.DictReader(fileobj, restkey="_rest")
        self.assertEqual(reader.fieldnames, ["f1", "f2"])
        self.assertEqual(next(reader), {"f1": '1', "f2": '2',
                                        "_rest": ["abc", "4", "5", "6"]})

def test_read_semi_sep(self):
    reader = csv.DictReader(["1;2;abc;4;5;6\r\n"],
                            fieldnames="1 2 3 4 5 6".split(),
                            delimiter=';')
    self.assertEqual(next(reader), {"1": '1', "2": '2', "3": 'abc',
                                    "4": '4', "5": '5', "6": '6'})

def delete(csv_file, date, r):
    with io.open(csv_file, encoding='utf-8') as tweets_file:
        count = 0

        api = twitter.Api(consumer_key=os.environ['TWITTER_CONSUMER_KEY'],
                          consumer_secret=os.environ['TWITTER_CONSUMER_SECRET'],
                          access_token_key=os.environ['TWITTER_ACCESS_TOKEN'],
                          access_token_secret=os.environ['TWITTER_ACCESS_TOKEN_SECRET'])
        destroyer = TweetDestroyer(api)

        for row in TweetReader(csv.DictReader(tweets_file), date, r).read():
            destroyer.destroy(row["tweet_id"])
            count += 1

        print("Number of deleted tweets: %s\n" % count)

def test_read_multi(self):
    sample = [
        '2147483648,43.0e12,17,abc,def\r\n',
        '147483648,43.0e2,17,abc,def\r\n',
        '47483648,43.0,170,abc,def\r\n'
    ]
    reader = csv.DictReader(sample, fieldnames="i1 float i2 s1 s2".split())
    self.assertEqual(next(reader), {"i1": '2147483648', "float": '43.0e12',
                                    "i2": '17', "s1": 'abc', "s2": 'def'})

def load_csv(location):
    """
    Read CSV at `location`, return a list of ordered dictionaries, one
    for each row.
    """
    results = []
    # FIXME: why ignore encoding errors here?
    with codecs.open(location, mode='rb', encoding='utf-8-sig',
                     errors='ignore') as csvfile:
        for row in csv.DictReader(csvfile):
            # convert all the column keys to lower case
            updated_row = OrderedDict([(key.lower(), value)
                                       for key, value in row.items()])
            results.append(updated_row)
    return results

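# The 'utf-8-sig' codec above is worth copying: CSV files exported from Excel
# often begin with a UTF-8 byte order mark, and under plain 'utf-8' that BOM
# ends up glued to the first header name. A quick illustration:
#
#   raw = b'\xef\xbb\xbfname,age\r\nalice,3\r\n'
#   next(csv.DictReader(io.StringIO(raw.decode('utf-8'))))
#   # -> {'\ufeffname': 'alice', 'age': '3'}   (note the stray '\ufeff')
#   next(csv.DictReader(io.StringIO(raw.decode('utf-8-sig'))))
#   # -> {'name': 'alice', 'age': '3'}
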
def main():
    parser = argparse.ArgumentParser()
    org_files = os.environ.get("ORG", "~/org")
    parser.add_argument(
        "--contacts", "-c",
        default=os.path.join(org_files, "contacts.csv"),
        help="""The CSV file containing the contact details.""")
    parser.add_argument("--flag", "-f",
                        action='store_true',
                        help="""Search by flag.""")
    parser.add_argument("names", nargs='*',
                        help="""The names to look for.""")
    args = parser.parse_args()

    by_name = {}
    by_id = {}

    with io.open(args.contacts, 'r', encoding='utf-8') as input:
        contacts_reader = csv.DictReader(input)
        for row in contacts_reader:
            for multi in multi_fields:
                row[multi] = (row.get(multi, "") or "").split()
            n = make_name(row)
            row['_name_'] = n
            by_name[n] = row
            # Index the row by its ID column (the original had the key and
            # value reversed, using the builtin `id` as the key).
            by_id[row.get('ID', "")] = row

    # --flag is a store_true switch, so test its truth value rather than
    # comparing it with "".
    if args.flag:
        flag = args.names[0]
        by_address = {}
        for n in sorted(by_name.keys()):
            who = by_name[n]
            if flag in who['Flags']:
                addr = assemble_postal_address(who, "\n ")
                if addr not in by_address:
                    by_address[addr] = []
                by_address[addr].append(who)
        for addr in sorted(by_address.keys()):
            sys.stdout.write(
                make_name_list(by_address[addr]) + "\n " + addr + "\n\n")
    else:
        name = " ".join(args.names)
        if name in by_name:
            show_person(sys.stdout, by_name[name])
        else:
            for n in sorted(by_name.keys()):
                if name in n:
                    show_person(sys.stdout, by_name[n])

def load_districts(csv_file):
    vbd = {}
    g = {}
    with io.open(csv_file, encoding='utf-8') as c:
        reader = csv.DictReader(c)
        for row in reader:
            vil = row[VILLAGE]
            dis = row[DISTRICT][6:]
            if dis in vbd:
                vbd[dis] += [vil]
            else:
                vbd[dis] = [vil]
            g[vil + dis] = (row[LAT], row[LON])
    return vbd, g

def all(self, country, from_csv=None):
    file_handle = None
    # check for environment variable
    if not from_csv and 'OCD_DIVISION_CSV' in os.environ:
        from_csv = os.environ.get('OCD_DIVISION_CSV').format(country)
        try:
            file_handle = io.open(from_csv, encoding='utf8')
        except FileNotFoundError:
            raise ValueError("Unknown country in OCD ID")
    # going to the remote URL
    if not file_handle:
        file_handle = io.StringIO(
            urlopen(OCD_REMOTE_URL.format(country)).read().decode('utf-8'))
    for row in csv.DictReader(file_handle):
        yield Division(**row)

def all(self, country, from_csv=None):
    file_handle = None
    # Load from CSV if `from_csv` or `OCD_DIVISION_CSV` are set.
    if from_csv or 'OCD_DIVISION_CSV' in os.environ:
        if not from_csv:
            from_csv = os.environ.get('OCD_DIVISION_CSV').format(country)
        try:
            file_handle = io.open(from_csv, encoding='utf8')
        except FileNotFoundError:
            raise ValueError("Couldn't open CSV file {}".format(from_csv))
    # Load from URL otherwise.
    if not file_handle:
        file_handle = io.StringIO(
            urlopen(OCD_REMOTE_URL.format(country)).read().decode('utf-8'))
    for row in csv.DictReader(file_handle):
        yield Division(**row)

def anonymize_file(source, dest, csvheaderformatdict=None, ignorementions=False):
    print('Reading from [{0}] and writing anonymized data to [{1}]...'.format(
        source, dest))
    with io.open(source, 'r', encoding='utf8') as f:
        with io.open(dest, 'w', encoding='utf8') as o:
            reader = csv.DictReader(f)
            if not csvheaderformatdict:
                fieldnames = consts.defaultHeader
            else:
                fieldnames = reader.fieldnames
            writer = csv.DictWriter(o, fieldnames)
            writer.writeheader()
            for row in reader:
                anonymize_row(row, fieldnames, csvheaderformatdict, ignorementions)
                writer.writerow(row)

def test_read_with_blanks(self):
    reader = csv.DictReader(["1,2,abc,4,5,6\r\n", "\r\n", "1,2,abc,4,5,6\r\n"],
                            fieldnames="1 2 3 4 5 6".split())
    self.assertEqual(next(reader), {"1": '1', "2": '2', "3": 'abc',
                                    "4": '4', "5": '5', "6": '6'})
    self.assertEqual(next(reader), {"1": '1', "2": '2', "3": 'abc',
                                    "4": '4', "5": '5', "6": '6'})

def read_csv_database(database_path):
    """Read database CSV file, providing one line at a time.

    We'll use a class to modify the csv library's default dialect ('excel')
    to enable strict syntax checking.  This will trigger errors for things
    like unclosed quotes.
    """
    class StrictExcel(csv.excel):
        # Our helper class is really simple
        # pylint: disable=too-few-public-methods, missing-class-docstring
        strict = True

    with database_path.open(mode="r", encoding="utf-8") as database_file:
        reader = csv.DictReader(database_file, dialect=StrictExcel)
        try:
            for row in reader:
                yield row
        except csv.Error as err:
            raise MailmergeError("{}:{}: {}".format(
                database_path, reader.line_num, err))

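# A quick check of the strict behaviour the docstring describes (standalone
# sketch, not part of the function above): an unclosed quote only raises
# under strict mode.
#
#   list(csv.reader(['"oops']))               # -> [['oops']], silently
#   list(csv.reader(['"oops'], strict=True))  # raises csv.Error:
#                                             #   unexpected end of data
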
def get_job_results(results_file, encoding="utf8"):
    """
    Read in results from a results file (results.csv.gz)

    Generate: dict
    """
    if not results_file.endswith(".csv.gz"):
        raise NotImplementedError(
            "Expecting results file to be a .csv.gz file.")
    mv_fields = None
    with gzip.open(results_file, "rt", encoding=encoding) as stream:
        for row in csv.DictReader(stream):
            # Remove __mv_ fields, replacing the original fields with lists
            # of values where necessary.
            if mv_fields is None:
                mv_fields = [(f, f[5:]) for f in row if f.startswith("__mv_")]
            if mv_fields:
                for mv_field, field in mv_fields:
                    if row[mv_field]:
                        row[field] = _decode_mv_field(row[mv_field])
                    del row[mv_field]
            yield row

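# _decode_mv_field is defined elsewhere; a hedged sketch of what it likely
# does, assuming the Splunk export convention where a multivalue cell looks
# like "$v1$;$v2$" and a literal "$" inside a value is doubled as "$$":
def _decode_mv_field_sketch(mv_value):
    # Strip the outer "$...$" wrapper, split on the "$;$" separators between
    # values, and un-escape doubled dollar signs.
    return [part.replace("$$", "$")
            for part in mv_value[1:-1].split("$;$")]
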
def people_to_qualtrics_csv(hub, repo_tools_data, frequency, update):
    """
    Print out a formatted file as expected by Qualtrics import.
    """
    if update is not None:
        with open(update, newline='', encoding='utf-8') as update_data:
            reader = csv.DictReader(update_data)
            initial = {row[EMAIL]: row for row in reader}
            fields = [field for field in reader.fieldnames if field]
    else:
        initial = {}
        fields = [NAME, EMAIL, WEEK, ASSOCIATED_WITH, UNSUBSCRIBED]

    csv_writer = csv.DictWriter(click.get_text_stream('stdout'),
                                fieldnames=fields, extrasaction='ignore')
    csv_writer.writeheader()

    for username, person in repo_tools_data.people.items():
        if person.email is None:
            continue
        # hashlib.md5 needs bytes, so encode the email before hashing.
        hashdigest = hashlib.md5(person.email.lower().encode('utf-8')).hexdigest()
        row = initial.get(person.email, {})
        row.update({
            NAME: person.name,
            EMAIL: person.email,
            WEEK: int(hashdigest, 16) % frequency + 1,
            ASSOCIATED_WITH: 'edX' if person.associated_with('edX', 'ArbiSoft') else 'other',
        })
        if not person.email_ok:
            row[UNSUBSCRIBED] = 'true'
        csv_writer.writerow(row)

def parse(self, input):
    """Parse the given file or file source string."""
    if hasattr(input, 'name'):
        self.filename = input.name
    elif not getattr(self, 'filename', ''):
        self.filename = ''
    if hasattr(input, "read"):
        tmsrc = input.read()
        input.close()
        input = tmsrc
    input = input.decode(self.encoding)
    try:
        header_length = self._read_header(input)
    except Exception:
        raise base.ParseError("Cannot parse header")
    lines = csv.DictReader(
        input.split(UtxDialect.lineterminator)[header_length:],
        fieldnames=self._fieldnames,
        dialect="utx")
    for line in lines:
        newunit = UtxUnit()
        newunit.dict = line
        self.addunit(newunit)

def parse(self, input):
    """Parse the given file or file source string."""
    if hasattr(input, 'name'):
        self.filename = input.name
    elif not getattr(self, 'filename', ''):
        self.filename = ''
    if hasattr(input, "read"):
        tmsrc = input.read()
        input.close()
        input = tmsrc
    try:
        input = input.decode(self.encoding)
    except Exception:
        raise ValueError(
            "OmegaT files are either UTF-8 encoded or use the default system encoding"
        )
    lines = csv.DictReader(input.split("\n"),
                           fieldnames=OMEGAT_FIELDNAMES,
                           dialect="omegat")
    for line in lines:
        newunit = OmegaTUnit()
        newunit.dict = line
        self.addunit(newunit)