def load_data(path):
    """Load a FER-style face-emotion CSV into train/test arrays.

    :param path: path to a CSV with columns emotion, pixels, Usage
    :returns: (train_data, test_data, train_labels, test_labels); data are
        float32 arrays of shape (n, 1, 48, 48) scaled to [0, 1], labels are
        one-hot encoded over 7 emotion classes.
    """
    with open(path, 'r') as face_emotion_data:
        field_names = ['emotion', 'pixels', 'Usage']  # was misspelled 'filed_names'
        reader = DictReader(face_emotion_data, fieldnames=field_names)
        next(reader)  # skip the header row; reader.next() is Python 2 only
        train_data = []
        train_labels = []
        test_data = []
        test_labels = []
        for row in reader:
            # Pixels are space-separated grayscale values in [0, 255].
            pixels = [float(pixel) / 255 for pixel in row['pixels'].split()]
            image = np.array(pixels, dtype='float32').reshape((1, 48, 48))
            if row['Usage'] == 'Training':
                train_data.append(image)
                train_labels.append(int(row['emotion']))
            else:
                test_data.append(image)
                test_labels.append(int(row['emotion']))
    train_data = np.array(train_data)
    train_labels = np_utils.to_categorical(train_labels, 7)
    test_data = np.array(test_data)
    test_labels = np_utils.to_categorical(test_labels, 7)
    return train_data, test_data, train_labels, test_labels
def upload_resources(filename, skip=0, limit=None):
    """Upload facility resources from a CSV file.

    :param filename: path to the CSV file to load
    :param skip: number of data rows to skip before importing
    :param limit: stop after this many rows (None means no limit)
    """
    # Use sys.stdout.write so resources can be printed nicely and succinctly
    import sys

    date_converter = lambda s: datetime.strptime(s, '%Y-%m-%d')
    bool_converter = lambda s: s == "true"
    resource_schema = facility_schema['fields']

    convert_map = {
        'integer': int,
        'float': float,
        'datetime': date_converter,
        'boolean': bool_converter
    }

    # Build a per-field converter map from the schema's declared types.
    convert = {}
    for k, v in resource_schema.items():
        field_type = v.get('type')
        if field_type in convert_map:  # dict.has_key() was removed in Python 3
            convert[k] = convert_map[field_type]

    def print_flush(msg):
        sys.stdout.write(msg)
        sys.stdout.flush()

    facility_code = facility_schema['facility_code']
    print_every = 1000
    print_flush("Adding resources. Please be patient.")

    with open(filename) as f:
        reader = DictReader(f)
        for i in range(skip):
            next(reader)  # reader.next() is Python 2 only
        for i, d in enumerate(reader):
            actual_index = i + skip + 2  # +2: header row plus 1-based numbering
            do_print = actual_index % print_every == 0
            try:
                d = dict((k, convert.get(k, str)(v)) for k, v in d.items() if v)
                coords = [d.pop('longitude', None), d.pop('latitude', None)]
                if coords[0] and coords[1]:
                    d['location'] = {'type': 'Point', 'coordinates': coords}
                d['facility_code'] = facility_code
                if not check(add_document(facility_schema['endpoint'], d), 201, False):
                    raise Exception()
                if do_print:
                    print_flush(".")
            except Exception as e:
                print("Error adding resource", e)
                pprint(d)
                exit()
            if limit and i >= limit:
                break

    # Create a 2dsphere index on the location field for geospatial queries
    app.data.driver.db['resources'].create_index([('location', '2dsphere')])
    print("Resources uploaded!")
def upload_waterpoints(filename, skip=0, limit=None):
    """Upload waterpoints from a CSV file.

    :param filename: path to the CSV file to load
    :param skip: number of data rows to skip before importing
    :param limit: stop after this many rows (None means no limit)
    """
    date_converter = lambda s: datetime.strptime(s, '%Y-%m-%d')
    bool_converter = lambda s: s == "true"

    # Normalise free-text status values onto the canonical vocabulary.
    status_map = {
        "non functional": "not functional",
        "functional needs repair": "needs repair"
    }
    status_converter = lambda s: status_map.get(s.lower(), s.lower())

    convert = {
        'gid': int,
        'object_id': int,
        'valid_from': date_converter,
        'valid_to': date_converter,
        'amount_tsh': float,
        'breakdown_year': int,
        'date_recorded': date_converter,
        'gps_height': float,
        'latitude': float,
        'longitude': float,
        'num_private': int,
        'region_code': int,
        'district_code': int,
        'population': int,
        'public_meeting': bool_converter,
        'construction_year': int,
        'status_group': status_converter
    }

    facility_code = "wpf001"

    with open(filename) as f:
        reader = DictReader(f)
        for i in range(skip):
            next(reader)  # reader.next() is Python 2 only
        for i, d in enumerate(reader):
            print("Adding line", i + skip + 2)
            try:
                d = dict((k, convert.get(k, str)(v)) for k, v in d.items() if v)
                coords = [d.pop('longitude'), d.pop('latitude')]
                d['location'] = {'type': 'Point', 'coordinates': coords}
                d['facility_code'] = facility_code
                if not check(add_document('waterpoints', d)):
                    raise Exception()
            except Exception as e:
                print("Error adding waterpoint", e)
                pprint(d)
                exit()
            if limit and i >= limit:
                break

    # Create a 2dsphere index on the location field for geospatial queries
    # NOTE(review): index is created on 'facilities' although documents go to
    # 'waterpoints' -- confirm this is intentional.
    app.data.driver.db['facilities'].create_index([('location', '2dsphere')])
def __init__(self, csv):
    """Populate the tile bag from a CSV manifest.

    :param csv: path to the manifest (note: shadows the stdlib ``csv`` module
        locally; kept for interface compatibility)

    Sets ``self.first_tile`` to the last tile whose Features column
    contains "B".
    """
    self.bag = Counter()
    with open(csv, 'r') as f:  # the original leaked this file handle
        reader = DictReader(f, fieldnames=[
            "TileFile", "Borders", "Quantity", "Features", "Notes"])
        next(reader)  # skip header, we've defined our own; .next() is Python 2 only
        for tile_dict in reader:
            tile = Tile.from_csv(tile_dict)
            quantity = int(tile_dict["Quantity"].strip())
            self.bag[tile] = quantity
            if "B" in tile_dict["Features"]:
                self.first_tile = tile
def __init__(self, csv):
    """Populate the tile bag from a CSV manifest.

    :param csv: path to the manifest (note: shadows the stdlib ``csv`` module
        locally; kept for interface compatibility)

    Sets ``self.first_tile`` to the last tile whose Features column
    contains "B".
    """
    self.bag = Counter()
    with open(csv, 'r') as f:  # the original leaked this file handle
        reader = DictReader(f, fieldnames=[
            "TileFile", "Borders", "Quantity", "Features", "Notes"
        ])
        next(reader)  # skip header, we've defined our own; .next() is Python 2 only
        for tile_dict in reader:
            tile = Tile.from_csv(tile_dict)
            quantity = int(tile_dict["Quantity"].strip())
            self.bag[tile] = quantity
            if "B" in tile_dict["Features"]:
                self.first_tile = tile
def test_collectl_file_iterator(self):
    '''test for CollectlFileIterator class'''
    comments = [
        '#COMMENT LINE 1\n',
        '# COMMENT 2\n',
        '############ COMMENT 3 ######\n'
    ]
    header = ['#Date,Time,Number\n']
    data = ['20130102,01:23:45.56,5\n', '20120807,15:30:00.001,7']
    mystring = StringIO.StringIO(''.join(comments) + ''.join(header) +
                                 ''.join(data))
    myfile = CollectlFileIterator(mystring)
    reader = DictReader(myfile)
    # The iterator should strip the leading '#' from the header line.
    self.assertEqual(reader.fieldnames, ["Date", "Time", "Number"])
    entry1 = next(reader)  # reader.next() is Python 2 only
    entry2 = next(reader)
    self.assertEqual(entry1["Date"], "20130102")
    self.assertEqual(entry1["Time"], "01:23:45.56")
    self.assertEqual(entry1["Number"], "5")
    self.assertEqual(entry2["Date"], "20120807")
    self.assertEqual(entry2["Time"], "15:30:00.001")
    self.assertEqual(entry2["Number"], "7")
def test_subset_with_shapefile_no_ugid(self):
    """Test a subset operation using a shapefile without a UGID attribute."""
    output_format = [constants.OUTPUT_FORMAT_NUMPY,
                     constants.OUTPUT_FORMAT_CSV_SHAPEFILE]
    geom = self.get_shapefile_path_with_no_ugid()
    geom_select_uid = [8, 11]
    geom_uid = 'ID'
    rd = self.test_data.get_rd('cancm4_tas')
    for of in output_format:
        ops = OcgOperations(dataset=rd, geom=geom,
                            geom_select_uid=geom_select_uid,
                            geom_uid=geom_uid, snippet=True,
                            output_format=of)
        self.assertEqual(len(ops.geom), 2)
        ret = ops.execute()
        if of == constants.OUTPUT_FORMAT_NUMPY:
            for element in geom_select_uid:
                self.assertIn(element, ret)
            self.assertEqual(ret.properties[8].dtype.names,
                             ('STATE_FIPS', 'ID', 'STATE_NAME', 'STATE_ABBR'))
        else:
            with open(ret) as f:
                reader = DictReader(f)
                row = next(reader)  # reader.next() is Python 2 only
                self.assertIn(geom_uid, row.keys())
                self.assertNotIn(env.DEFAULT_GEOM_UID, row.keys())
            shp_path = os.path.split(ret)[0]
            shp_path = os.path.join(shp_path, 'shp',
                                    '{0}_gid.shp'.format(ops.prefix))
            with fiona.open(shp_path) as source:
                record = next(source)  # source.next() likewise Python 2 only
                self.assertIn(geom_uid, record['properties'])
                self.assertNotIn(env.DEFAULT_GEOM_UID, record['properties'])
def extractThresholdValues(fname):
    """Parse a model-evaluation CSV and return threshold values as a dict.

    This method might be called multiple times for one item.

    There are various formats:
      combined.modelEvaluation: Threshold Name, Testing.data, Cutoff,
          Sensitivity, Specificity
      biomod2.modelEvaluation: Threshold Name, Testing.data, Cutoff.*,
          Sensitivity.*, Specificity.*
      maxentResults.csv: Species,<various columns with interesting values>
          <threshold name><space><cumulative threshold, logistic threshold,
          area,training omission>

    FIXME: this is really ugly and csv format detection should be done
    differently
    """
    thresholds = {}
    if fname.endswith("maxentResults.csv"):
        with open(fname, "r") as csvfile:  # the original leaked this handle
            dictreader = DictReader(csvfile)
            row = next(dictreader)  # there is only one row in maxentResults
        namelist = (
            "Fixed cumulative value 1", "Fixed cumulative value 5",
            "Fixed cumulative value 10", "Minimum training presence",
            "10 percentile training presence",
            "10 percentile training presence",
            "Equal training sensitivity and specificity",
            "Maximum training sensitivity plus specificity",
            "Balance training omission, predicted area and threshold value",
            "Equate entropy of thresholded and original distributions",
        )
        for name in namelist:
            # We extract only 'cumulative threshold' values
            threshold = "{} cumulative threshold".format(name)
            thresholds[threshold] = Decimal(row[threshold])
    else:
        # assume it's one of our biomod/dismo results
        with open(fname, "r") as csvfile:
            dictreader = DictReader(csvfile)
            # search the field with Cutoff
            name = "Cutoff"
            for fieldname in dictreader.fieldnames:
                if fieldname.startswith("Cutoff."):
                    name = fieldname
                    break
            try:
                for row in dictreader:
                    try:
                        thresholds[row[""]] = Decimal(row[name])
                    except (TypeError, InvalidOperation) as e:
                        # was "from""file" -> "fromfile" in the logged message
                        LOG.warn(
                            "Couldn't parse threshold value '%s' (%s) from "
                            "file '%s': %s", name, row[name], fname, repr(e)
                        )
            except KeyError:
                LOG.warn("Couldn't extract Threshold '%s' from file '%s'",
                         name, fname)
    return thresholds
class CSVUnicodeReader(object):
    """Python 2 DictReader wrapper that decodes row keys/values to unicode.

    NOTE(review): relies on the Python 2 ``unicode`` builtin; UTF8Encoder
    presumably re-encodes the stream as UTF-8 -- confirm.
    """

    def __init__(self, stream):
        self.reader = DictReader(UTF8Encoder(stream))

    def __iter__(self):
        return self

    def next(self):
        """Return the next row with all keys and values decoded to unicode."""
        entry = next(self.reader)  # self.reader.next() is Python 2 only
        return dict([(unicode(k, "utf-8"), unicode(v, "utf-8"))
                     for (k, v) in entry.items()])
def extractThresholdValues(fname):
    """Parse a model-evaluation CSV and return threshold values as a dict.

    This method might be called multiple times for one item.

    There are various formats:
      combined.modelEvaluation: Threshold Name, Testing.data, Cutoff,
          Sensitivity, Specificity
      biomod2.modelEvaluation: Threshold Name, Testing.data, Cutoff.*,
          Sensitivity.*, Specificity.*
      maxentResults.csv: Species,<various columns with interesting values>
          <threshold name><space><cumulative threshold, logistic threshold,
          area,training omission>

    FIXME: this is really ugly and csv format detection should be done
    differently
    """
    thresholds = {}
    if fname.endswith('maxentResults.csv'):
        with open(fname, 'r') as csvfile:  # the original leaked this handle
            dictreader = DictReader(csvfile)
            row = next(dictreader)  # there is only one row in maxentResults
        namelist = (
            'Fixed cumulative value 1', 'Fixed cumulative value 5',
            'Fixed cumulative value 10', 'Minimum training presence',
            '10 percentile training presence',
            '10 percentile training presence',
            'Equal training sensitivity and specificity',
            'Maximum training sensitivity plus specificity',
            'Balance training omission, predicted area and threshold value',
            'Equate entropy of thresholded and original distributions')
        for name in namelist:
            # We extract only 'cumulative threshold' values
            threshold = '{} cumulative threshold'.format(name)
            thresholds[threshold] = Decimal(row[threshold])
    else:
        # assume it's one of our biomod/dismo results
        with open(fname, 'r') as csvfile:
            dictreader = DictReader(csvfile)
            # search the field with Cutoff
            name = 'Cutoff'
            for fieldname in dictreader.fieldnames:
                if fieldname.startswith('Cutoff.'):
                    name = fieldname
                    break
            try:
                for row in dictreader:
                    try:
                        thresholds[row['']] = Decimal(row[name])
                    except (TypeError, InvalidOperation) as e:
                        # was "from""file" -> "fromfile" in the logged message
                        LOG.warn("Couldn't parse threshold value '%s' (%s) from "
                                 "file '%s': %s", name, row[name], fname,
                                 repr(e))
            except KeyError:
                LOG.warn("Couldn't extract Threshold '%s' from file '%s'",
                         name, fname)
    return thresholds
class UnicodeDictReader(object):
    '''A variant of the :class:`csv.DictReader` class that handles Unicode

    :param file f: The CSV file to process.
    :param list cols: The column-names of the CSV, as strings in a list.
    :param string dialect: The CSV dialect. If ``None`` then the dialect will
        be guessed.
    :param string encoding: The encoding of the file. If ``None`` the encoding
        will be guessed. If guessing fails then UTF-8 will be assumed.'''

    def __init__(self, f, cols, dialect=None, encoding=None, **kwds):
        e = self.guess_encoding(f) if encoding is None else encoding
        d = self.guess_dialect(f) if dialect is None else dialect
        f = UTF8Recoder(f, e)
        self.reader = DictReader(f, cols, dialect=d, **kwds)

    @staticmethod
    def guess_encoding(f):
        '''Detect the file's encoding, defaulting to UTF-8 when unknown.'''
        detector = UniversalDetector()
        for line in f:
            detector.feed(line)
            if detector.done:
                break
        f.seek(0)  # The above read moves the file-cursor in the CSV file.
        detector.close()
        retval = detector.result['encoding'] if detector.result[
            'encoding'] else 'utf-8'
        return retval

    @staticmethod
    def guess_dialect(f):
        '''Sniff the CSV dialect, falling back to 'excel' on failure.'''
        # Taken from the Python standard docs, with thanks to Piers Goodhew
        # <https://docs.python.org/2/library/csv.html#csv.Sniffer>
        s = Sniffer()
        try:
            retval = s.sniff(f.read(1024), [
                ',',
                '\t',
            ])  # 1024 taken from the Python docs
        except CSVError:
            retval = 'excel'
        finally:
            f.seek(0)  # The above f.read moves the file-cursor in the CSV file.
        return retval

    def next(self):
        '''Return the next row with keys and values decoded to unicode.'''
        row = next(self.reader)  # self.reader.next() breaks on Python 3
        retval = {
            to_unicode_or_bust(k): to_unicode_or_bust(v)
            for k, v in row.items()
        }
        return retval

    __next__ = next  # Python 3 iterator protocol

    def __iter__(self):
        return self
def upload_waterpoints(filename, skip=0, limit=None):
    """Upload waterpoints from a gzipped CSV file.

    :param filename: path to the gzipped CSV file
    :param skip: number of data rows to skip before importing
    :param limit: stop after this many rows (None means no limit)
    """
    convert = {
        'date_recorded': lambda s: datetime.strptime(s, '%m/%d/%Y'),
        'population': int,
        'construction_year': lambda s: datetime.strptime(s, '%Y'),
        'breakdown_year': lambda s: datetime.strptime(s, '%Y'),
        'amount_tsh': float,
        'gps_height': float,
        'latitude': float,
        'longitude': float,
    }

    # NOTE(review): on Python 3 gzip.open defaults to binary mode; csv needs
    # text ('rt') -- confirm when migrating.
    with gzip.open(filename) as f:
        reader = DictReader(f)
        for i in range(skip):
            next(reader)  # reader.next() is Python 2 only
        for i, d in enumerate(reader):
            d = dict((k, convert.get(k, str)(v)) for k, v in d.items() if v)
            d['facility_code'] = 'wpf001'
            check(add_document('waterpoints', d))
            if limit and i >= limit:
                break
def next(self):
    """Return the next CSV row with each value converted per self.field_types.

    Conversion failures are appended to self.errors and re-raised unless
    self.silent.  NOTE(review): when silent and a conversion fails, this
    returns None (the return sits in the try/else branch) -- confirm intended.
    """
    # Python 2 idiom: DictReader is an old-style class, so the raw row is
    # fetched via an unbound-method call instead of super().
    row = DictReader.next(self)
    try:
        processed_row = dict(
            (key, convert(value, self.field_types[key], self.allow_json))
            for key, value in row.iteritems())
    except ValueError as e:
        self.errors.append((e, row))
        if not self.silent:
            raise e
    else:
        self.rows_imported += 1
        return processed_row
def next(self):
    """Return the next CSV row with each value converted per self.field_types.

    Conversion failures are appended to self.errors and re-raised unless
    self.silent.  NOTE(review): when silent and a conversion fails, this
    returns None (the return sits in the try/else branch) -- confirm intended.
    """
    # Python 2 idiom: DictReader is an old-style class, so the raw row is
    # fetched via an unbound-method call instead of super().
    row = DictReader.next(self)
    try:
        processed_row = dict(
            (key, convert(value, self.field_types[key]))
            for key, value in row.iteritems()
        )
    except ValueError as e:
        self.errors.append((e, row))
        if not self.silent:
            raise e
    else:
        self.rows_imported += 1
        return processed_row
class StructuredReader(object):
    """Iterate CSV rows, hydrating each through the supplied container.

    ObjectContainer input uses a DictReader; Tuple/List containers use a
    plain csv.reader.  Any other container raises.
    """

    def __init__(self, filename, container=None, dialect='simplecsv'):
        self._container = None
        if isinstance(container, ObjectContainer):
            self._container = container
            self._reader = DictReader(filename, fieldnames=None,
                                      restkey="restkey", restval="restval",
                                      dialect=dialect)
        elif isinstance(container, TupleContainer) or isinstance(container, ListContainer):
            self._container = container
            self._reader = csv.reader(filename, dialect=dialect)
        else:
            raise Exception("Given container is not valid")

    def next(self):
        """Return the next row, fetched through the container."""
        # do not treat the header row
        if self._reader.line_num == 0:
            next(self._reader)  # ._reader.next() is Python 2 only
        row = next(self._reader)
        return self._container.fetch(row)

    __next__ = next  # Python 3 iterator protocol

    def __iter__(self):
        return self
def number1():
    """Import people flagged for the directory from the BSLC members CSV."""
    filename = '/home/apt9online/src/bslcks/jtest.csv'
    with open(filename) as fh:  # the original leaked this file handle
        cong = DictReader(fh)
        # The old 'while True: cong.next()' loop died with StopIteration at EOF.
        for p in cong:
            print(cong.line_num)
            if p['Include on directory'] == 'Yes':
                if p['Family relation'] != 'Duplicate':  # '<>' is a SyntaxError in Python 3
                    try:
                        Person.objects.get(bslc_individual=p['Indiv #'])
                        print("%s %s already exists in the DB" % (p['First name'], p['Last name']))
                    except:  # TODO(review): catch Person.DoesNotExist instead of bare except
                        record_person(p)
def test_success(field, expected, log_path, response_for, form_data, sm_mock):
    """Successful submissions are sent, logged, and produce distinct rows."""
    key, value = field
    form_data[key] = value
    assert response_for(form_data, log=False) == expected
    assert sm_mock.call_count == 1

    params = sm_mock.call_args[0][1]['fields']
    assert set(params.keys()) == set(form_data.keys())
    for key, value in form_data.items():
        # NOTE(review): .decode() implies byte-string form values (Python 2)
        assert params[key] == value.decode('utf8')

    assert response_for(form_data, log=True) == expected
    assert sm_mock.call_count == 2
    assert response_for(form_data, log=True) == expected
    assert sm_mock.call_count == 3

    with open(log_path) as log_file:
        reader = DictReader(log_file)
        row = next(reader)  # reader.next() is Python 2 only
        # rows should not be equal because the time field
        # is added by the logging function.
        assert row != next(reader)
def next(self):
    """Return the next row with keys/values stripped of whitespace and
    '|'-separated values split into lists (Python 2 iterator protocol)."""
    # Note: cannot use super because DictReader is an old-style class
    row = DictReader.next(self)
    d = {}
    for key, val in row.items():
        if isinstance(key, str):
            key = key.strip()
        if isinstance(val, str):
            val = val.strip()
            # Try to split by '|' to get a list
            split = val.split("|")
            if len(split) > 1:
                # NOTE(review): map() returns a list on Python 2 only; on
                # Python 3 this would leave a lazy map object in the dict.
                val = map(str.strip, split)
        d[key] = val
    return d
class BaseCSVHandler(object):
    """Read a 3-column CSV (date, customer, money), process it, and write the
    result to an output CSV with the same columns.

    Subclasses override :meth:`calc_result` (and optionally
    :meth:`one_string_handler`) to implement actual processing.
    """

    defined_input_field_names = ['date', 'customer', 'money']
    defined_output_field_names = ['date', 'customer', 'money']
    result = []  # kept for backward compatibility; instances use self.result

    def __init__(self, fnm_in='input.csv', fnm_out='output.csv',
                 restkey=None, restval=None,
                 dialect_in="excel", dialect_out="excel"):
        # Shared mutable class attribute bug: each instance now gets its own
        # result list instead of appending to a class-wide one.
        self.result = []
        self.f_in = open(fnm_in)
        self.csv_dict_reader = DictReader(self.f_in, restkey=restkey,
                                          restval=restval, dialect=dialect_in)
        field_names = self.csv_dict_reader.fieldnames
        # '<>' and 'raise E, msg' are SyntaxErrors in Python 3.
        if len(field_names) != len(self.defined_input_field_names):
            raise ValueError(
                "incorrect number of columns in the file %s, it should have %d columns"
                % (fnm_in, len(self.defined_input_field_names)))
        if [1 for x in zip(field_names, self.defined_input_field_names) if x[0] != x[1]]:
            raise ValueError(
                "incorrect names of columns in the file %s, they should be %s"
                % (fnm_in, '"{0}"'.format('","'.join(x for x in self.defined_input_field_names))))
        self.f_out = open(fnm_out, 'w')
        self.csv_dict_writer = DictWriter(self.f_out,
                                          self.defined_output_field_names,
                                          dialect=dialect_out)

    def __iter__(self):
        return self

    def one_string_handler(self, s):
        """Collect one input row; falsy rows are dropped."""
        if s:
            self.result.append(s)

    def next(self):
        return next(self.csv_dict_reader)  # .next() is Python 2 only

    __next__ = next  # required for 'for i in self' on Python 3

    def calc_result(self):
        """Hook for subclasses; the base class passes rows through."""
        pass

    def write_result(self):
        self.csv_dict_writer.writeheader()
        self.csv_dict_writer.writerows(self.result)

    def close_all_files(self):
        self.f_in.close()
        self.f_out.close()
        self.csv_dict_writer = None
        self.csv_dict_reader = None

    def process_all(self):
        """Run the full read -> calculate -> write -> close pipeline."""
        for i in self:
            self.one_string_handler(i)
        self.calc_result()
        self.write_result()
        self.close_all_files()
def loadcsv():
    """Load saved PID gains and the recorded error from simAnnPIDprogress.csv.

    :returns: (params, error) where params comes from dict2params and error
        is the raw 'error' column value of the single data row.
    """
    fieldnames = ['left_s0_kp', 'left_s0_ki', 'left_s0_kd',
                  'left_s1_kp', 'left_s1_ki', 'left_s1_kd',
                  'left_e0_kp', 'left_e0_ki', 'left_e0_kd',
                  'left_e1_kp', 'left_e1_ki', 'left_e1_kd',
                  'left_w0_kp', 'left_w0_ki', 'left_w0_kd',
                  'left_w1_kp', 'left_w1_ki', 'left_w1_kd',
                  'left_w2_kp', 'left_w2_ki', 'left_w2_kd',
                  'error']
    # csv needs text mode on Python 3; "rb" was the Python 2 idiom.
    with open("simAnnPIDprogress.csv", "r") as f:
        reader = DictReader(f, fieldnames)
        dictionary = next(reader)  # single-row file; .next() is Python 2 only
    params = dict2params(dictionary)
    error = dictionary['error']
    return params, error
def test_write_csv(self):
    """TestBase: Base::write_csv() creates a valid csv"""
    from csv import DictReader

    fname = "thermal.csv"
    trappy.Run().thermal.write_csv(fname)

    with open(fname) as fin:
        csv_reader = DictReader(fin)
        self.assertTrue("Time" in csv_reader.fieldnames)
        self.assertTrue("temp" in csv_reader.fieldnames)

        first_data = next(csv_reader)  # csv_reader.next() is Python 2 only
        self.assertEquals(first_data["Time"], "0.0")
        self.assertEquals(first_data["temp"], "68786")
def test_write_csv(self):
    """TestBase: Base::write_csv() creates a valid csv"""
    from csv import DictReader

    fname = "thermal.csv"
    trappy.FTrace().thermal.write_csv(fname)

    with open(fname) as fin:
        csv_reader = DictReader(fin)
        self.assertTrue("Time" in csv_reader.fieldnames)
        self.assertTrue("temp" in csv_reader.fieldnames)

        first_data = next(csv_reader)  # csv_reader.next() is Python 2 only
        self.assertEquals(first_data["Time"], "0.0")
        self.assertEquals(first_data["temp"], "68786")
class UnicodeDictReader(object):
    '''A variant of the :class:`csv.DictReader` class that handles Unicode

    :param file f: The CSV file to process.
    :param list cols: The column-names of the CSV, as strings in a list.
    :param string dialect: The CSV dialect. If ``None`` then the dialect will
        be guessed.
    :param string encoding: The encoding of the file. If ``None`` the encoding
        will be guessed. If guessing fails then UTF-8 will be assumed.'''

    def __init__(self, f, cols, dialect=None, encoding=None, **kwds):
        e = self.guess_encoding(f) if encoding is None else encoding
        d = self.guess_dialect(f) if dialect is None else dialect
        f = UTF8Recoder(f, e)
        self.reader = DictReader(f, cols, dialect=d, **kwds)

    @staticmethod
    def guess_encoding(f):
        '''Detect the file's encoding, defaulting to UTF-8 when unknown.'''
        detector = UniversalDetector()
        for line in f:
            detector.feed(line)
            if detector.done:
                break
        f.seek(0)  # The above read moves the file-cursor in the CSV file.
        detector.close()
        retval = detector.result['encoding'] if detector.result['encoding'] else 'utf-8'
        return retval

    @staticmethod
    def guess_dialect(f):
        '''Sniff the CSV dialect, falling back to 'excel' on failure.'''
        # Taken from the Python standard docs, with thanks to Piers Goodhew
        # <https://docs.python.org/2/library/csv.html#csv.Sniffer>
        s = Sniffer()
        try:
            retval = s.sniff(f.read(1024), [',', '\t', ])  # 1024 taken from the Python docs
        except CSVError:
            retval = 'excel'
        finally:
            f.seek(0)  # The above f.read moves the file-cursor in the CSV file.
        return retval

    def next(self):
        '''Return the next row with keys and values decoded to unicode.'''
        row = next(self.reader)  # self.reader.next() breaks on Python 3
        retval = {to_unicode_or_bust(k): to_unicode_or_bust(v)
                  for k, v in row.items()}
        return retval

    __next__ = next  # Python 3 iterator protocol

    def __iter__(self):
        return self
def graceful_read_csv(filename):
    """Read a CSV file into a list of row dicts, exiting on open errors.

    :param filename: path to the CSV file
    :returns: list of row dictionaries
    """
    from csv import DictReader

    data = []
    try:
        # csv needs text mode on Python 3; 'rb' was the Python 2 idiom.
        f = open(filename, 'r')
    except IOError as e:
        print("ERROR:", e.strerror)
        exit()
    # The old while/try/bare-except loop silently swallowed csv errors as
    # well as EOF; iterate normally and close the handle deterministically.
    with f:
        for row in DictReader(f):
            data.append(row)
    return data
class UnicodeDictReader(object):
    """
    A CSV reader which will iterate over lines in the CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, cols, dialect="excel", encoding="utf-8", **kwds):
        f = UTF8Recoder(f, encoding)
        self.reader = DictReader(f, cols, dialect=dialect, **kwds)

    def next(self):
        """Return the next row with keys and values decoded to unicode."""
        row = next(self.reader)  # self.reader.next() breaks on Python 3
        retval = {to_unicode_or_bust(k): to_unicode_or_bust(v)
                  for k, v in row.items()}
        return retval

    __next__ = next  # Python 3 iterator protocol

    def __iter__(self):
        return self
class BLASRm4Reader:
    """
    BLASR -m 4 -header should generate header:

    qname tname score pctsimilarity qstrand qstart qend qseqseqlength
    tstrand tstart tend tseqseqlength mapqv ncells clusterScore
    probscore numSigClusters
    """

    def __init__(self, filename, flipQS=False):
        self.filename = filename
        self.f = open(filename)
        self.reader = DictReader(self.f, delimiter=' ')
        self.flipQS = flipQS  # if True, swap query <-> subject/target

    def __iter__(self):
        return self

    def next(self):
        """Return the next alignment as a BLAST9Record."""
        d = next(self.reader)  # self.reader.next() is Python 2 only
        if len(d) == 0:
            # 'raise StopIteration, "msg"' is a SyntaxError in Python 3
            raise StopIteration("EOF reached!")
        rec = BLAST9Record(None)
        # trim qID of the last /0_len which is added by BLASR
        #d['qname'] = d['qname'][:d['qname'].rfind('/')]
        if self.flipQS:
            rec.qID = d['tname']
            rec.qStart = int(d['tstart'])  # already 0-based
            rec.qEnd = int(d['tend'])
            rec.qLen = int(d['tseqlength'])
            rec.sID = d['qname']
            rec.sStart = int(d['qstart'])
            rec.sEnd = int(d['qend'])
            rec.sLen = int(d['qseqlength'])
        else:  # query is Q, target is S
            rec.qID = d['qname']
            rec.qStart = int(d['qstart'])  # already 0-based
            rec.qEnd = int(d['qend'])
            rec.qLen = int(d['qseqlength'])
            rec.sID = d['tname']
            rec.sStart = int(d['tstart'])
            rec.sEnd = int(d['tend'])
            rec.sLen = int(d['tseqlength'])
        rec.strand = '+' if d['qstrand'] == d['tstrand'] else '-'
        rec.identity = float(d['pctsimilarity'])
        return rec

    __next__ = next  # Python 3 iterator protocol
def read_junction_report(filename):
    """
    tab-delimited with header:
    chr left right strand num_transcript num_sample genome annotation label

    Yield (label, records) pairs, grouping consecutive rows sharing a label.
    """
    reader = DictReader(open(filename), delimiter='\t')
    # reader.next() was Python 2 only, and an empty file raised StopIteration
    # inside the generator (a RuntimeError under PEP 479).
    r = next(reader, None)
    if r is None:
        return
    cur_label, cur = r['label'], [r]
    for r in reader:
        if r['label'] != cur_label:
            yield cur_label, cur
            cur_label, cur = r['label'], [r]
        else:
            cur.append(r)
    yield cur_label, cur
class UnicodeReader:
    """
    a csv reader which will iterate over lines in the csv file 'f',
    which is encoded in the given encoding; stolen and adapted from
    http://docs.python.org/lib/csv-examples.html
    """

    def __init__(self, f, dialect=excel, encoding='utf-8', **kwargs):
        f = UTF8Recoder(f, encoding)
        self.reader = DictReader(f, dialect=dialect, **kwargs)

    def next(self):
        """Return the next row with string values decoded to unicode.

        NOTE(review): 'basestring'/'unicode' exist on Python 2 only.
        """
        data = next(self.reader)  # self.reader.next() is Python 2 only
        for key, value in data.items():
            if isinstance(value, basestring):
                data[key] = unicode(value, 'utf-8')
        return data

    def __iter__(self):
        return self
def test_append_field_err(form_config, form_data, log_path):
    """ Checks that error logs are correctly written and appended

    Submits three forms, the second two have different fields to the first
    and should be added to the same log file as each other, and be identical
    """
    formmail.log_formdata(form_data, log_path)
    del form_data['email']

    # submit two forms with fields that dont match the config
    # this should append the second form to the error log file
    with pytest.raises(Exception):
        formmail.log_formdata(form_data, log_path)
    with pytest.raises(Exception):
        formmail.log_formdata(form_data, log_path)

    with open(log_path + '_error') as error_log:
        reader = DictReader(error_log)
        assert next(reader) == form_data  # reader.next() is Python 2 only
        assert next(reader) == form_data
def test_append_field_err(form_config, form_data, log_path):
    """ Checks that error logs are correctly written and appended

    Submits three forms, the second two have different fields to the first
    and should be added to the same log file as each other, and be identical
    """
    formmail2.log_formdata(form_data, log_path)
    del form_data['email']

    # submit two forms with fields that dont match the config
    # this should append the second form to the error log file
    with pytest.raises(Exception):
        formmail2.log_formdata(form_data, log_path)
    with pytest.raises(Exception):
        formmail2.log_formdata(form_data, log_path)

    with open(log_path + '_error') as error_log:
        reader = DictReader(error_log)
        assert next(reader) == form_data  # reader.next() is Python 2 only
        assert next(reader) == form_data
def clean_csv(self):
    """Validate that the uploaded file is a usable CSV of orders.

    :returns: the list of order rows on success, the raw upload when the
        content type is wrong, or [] when the file has no data rows.
    """
    # First check: is the content type valid?
    csv = self.cleaned_data['csv']
    if csv.content_type != 'text/csv':
        self._errors['csv'] = self.error_class(['Nur CSV-Dateien sind als Eingabe erlaubt!'])
        return csv
    # Second check: does the file have the right number of columns?
    reader = DictReader(csv)
    try:
        entry = next(reader)  # reader.next() is Python 2 only
        if len(entry) != 12:
            msg = 'Ungültiges Format der CSV-Datei (falsche Anzahl Spalten)!'
            self._errors['csv'] = self.error_class([msg])
    except StopIteration:
        msg = 'Ungültiges Format der CSV-Datei (keine Bestellungen vorhanden)!'
        self._errors['csv'] = self.error_class([msg])
        # 'entry' is unbound here; the original crashed with a NameError when
        # building the orders list from an empty file.
        return []
    orders = [entry] + [row for row in reader]
    return orders
def next(self):
    """Build an instance of self.csvAbleClass from the next CSV row.

    CSV column names are mapped to attribute names through self.fieldsDict;
    unmapped, non-None values are collected under the class's READ_REST_KEY
    attribute.  Python 2 only (str.decode / iteritems).
    """
    ret = self.csvAbleClass()
    for (csvField, value) in DictReader.next(self).iteritems():
        if isinstance(value, str):
            # Decode with the configured encoding, then fold back to ASCII
            # when possible so plain values stay byte strings.
            value = value.decode(self.encoding)
            try:
                value = value.encode('ascii')
            except UnicodeError:
                pass
        # With a field map, unknown columns map to None unless the column IS
        # the rest-key; without a map the CSV name is used directly.
        field = self.fieldsDict.get(csvField) if self.fieldsDict and csvField != self.csvAbleClass.READ_REST_KEY else csvField
        if field:
            setattr(ret, field, value)
        elif value is not None:
            rest = getattr(ret, self.csvAbleClass.READ_REST_KEY, None)
            if rest is None:
                # First rest value: store the value itself for the rest-key
                # column, otherwise start a {column: value} dict.
                # NOTE(review): 'field' is falsy on this path, so the
                # 'field == READ_REST_KEY' comparisons look unreachable
                # unless READ_REST_KEY is itself falsy -- confirm.
                rest = value if field == self.csvAbleClass.READ_REST_KEY else {csvField: value}
                setattr(ret, self.csvAbleClass.READ_REST_KEY, rest)
            elif field == self.csvAbleClass.READ_REST_KEY:
                rest.update(value)
            else:
                rest[csvField] = value  # pylint: disable=E1137
    return ret
def dictcsv(csvname, fieldnames=None, arrays=False):
    """Reading csv files into a dictionary.

    Arguments:
        csvname: string filename

    Keyword Arguments:
        fieldnames: list of csv column names. If None, the first row of the
            file being read will be used.
        arrays: Whether or not to return csv contents as a dict of arrays

    Returns:
        dictionary of columns (lists, or numpy arrays when ``arrays``),
        keys are fieldnames.  Values parseable as floats are converted.
    """
    # 'rU' mode was removed in Python 3.11, and the handle was never closed.
    with open(csvname, 'r') as fileobj:
        DR = DictReader(fileobj, fieldnames=fieldnames)
        fields = DR.fieldnames
        l = next(DR)  # DR.next() is Python 2 only
        dicty = {}
        for f in fields:
            try:
                dicty[f] = [float(l[f])]
            except (TypeError, ValueError):
                dicty[f] = [l[f]]
        for row in DR:
            for f in fields:
                try:
                    dicty[f].append(float(row[f]))
                except (TypeError, ValueError):
                    dicty[f].append(row[f])
    if arrays:
        for key in dicty:
            dicty[key] = np.array(dicty[key])
    return dicty
def boundary_values(csvFile, header):
    """
    csvFile - directory of csvFile to be read
    header - one of the numerical headers in the original csv file, assume valid
    returns [min value, max value] in the csv file given the header
    """
    with open(csvFile) as f:  # the original leaked this file handle
        reader = DictReader(f)
        firstLine = next(reader)  # reader.next() is Python 2 only
        minValue = maxValue = value_of(firstLine, header)
        for row in reader:
            value = value_of(row, header)
            # elif is safe: a value cannot be below the min and above the max
            # at the same time since minValue <= maxValue always holds.
            if value < minValue:  # new minimum
                minValue = value
            elif value > maxValue:  # new maximum
                maxValue = value
    return [minValue, maxValue]
class Cincinnati311CSVDataParser(object):
    """ Class that parses and cleans a Cincinnati 311 Comma Seperated Value
    (CSV) file record

    Data set description:
    --------------------
    https://data.cincinnati-oh.gov/Thriving-Healthy-Neighborhoods/
    Cincinnati-311-Non-Emergency-Service-Requests/4cjh-bm8b/about"""

    def __init__(self, h_file):
        """ Cincinnati311CSVDataParser class constructor

        Args:
            self: Cincinnati311CSVDataParser class object handle

            h_file: Cincinnati 311 csv file handle

        Returns:
            None"""
        fieldnames = [
            'jurisdictionid', 'servicerequestid', 'status', 'statusnotes',
            'servicename', 'servicecode', 'description', 'agencyresponsible',
            'servicenotice', 'requesteddatetime', 'updateddatetime',
            'expecteddatetime', 'address', 'addressid', 'zipcode', 'latitude',
            'longitude', 'requesteddate', 'updateddate', 'lasttableupdate'
        ]
        # Split fields into date-like vs. plain-string columns by name.
        # NOTE(review): filter() returns a lazy object on Python 3 that would
        # be exhausted after the first record -- Python 2 only as written.
        matchobj = re.compile('.*date.*')
        self.date_fields = filter(lambda elem: matchobj.match(elem) != None,
                                  fieldnames)
        self.string_fields = filter(lambda elem: matchobj.match(elem) == None,
                                    fieldnames)
        # http://stackoverflow.com/questions/265960/
        # best-way-to-strip-punctuation-from-a-string-in-python
        # Identity table for str.translate; Python 2 two-argument translate.
        self.punctuation_table = table = string.maketrans("", "")
        self.readerobj = DictReader(h_file, fieldnames)

    def __iter__(self):
        """ Iterator

        :return: None
        """
        return self

    def next(self):
        """ Parses a Cincinnati 311 CSV file record

        http://stackoverflow.com/questions/19151/how-to-make-class-iterable

        Args:
            self: Cincinnati311CSVDataParser class object handle

        Returns:
            record: Dictionary that stores a Cincinnati 311 data CSV file
            record (None for the embedded header row)"""
        record = self.readerobj.next()
        if record['jurisdictionid'] == 'JURISDICTION_ID':
            # Header row read as data (fieldnames were supplied explicitly).
            record = None
        else:
            # Parse non-empty date columns into datetime objects.
            for key in self.date_fields:
                if len(record[key]) > 0:
                    record[key] = parser.parse(record[key])
            # Lower-case string columns and strip embedded double quotes.
            for key in self.string_fields:
                record[key] = re.sub("\"", "", record[key].lower())
            try:
                record['zipcode'] = int(record['zipcode'])
            except ValueError:
                record['zipcode'] = None
            for key in ['latitude', 'longitude']:
                try:
                    record[key] = float(record[key])
                except ValueError:
                    record[key] = None
            # Collapse whitespace and strip punctuation from service fields.
            if len(record['servicecode']) > 0:
                record['servicecode'] =\
                    re.sub("\s+", "", record['servicecode'])
                record['servicecode'] =\
                    record['servicecode'].translate(self.punctuation_table,
                                                    string.punctuation)
                record['servicename'] =\
                    record['servicename'].translate(self.punctuation_table,
                                                    string.punctuation)
        return record
class KnownGeneFile(SmartFileIter):
    '''An iterable that parses UCSC's KnownGene gene annotation files.

    Field names are::

        FIELD_NAMES = [ 'name', 'chrom', 'strand', 'txStart', 'txEnd',
                        'cdsStart', 'cdsEnd', 'exonCount', 'exonStarts',
                        'exonEnds', 'proteinID', 'alignID', ]
    '''

    FIELD_NAMES = [
        'name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart', 'cdsEnd',
        'exonCount', 'exonStarts', 'exonEnds', 'proteinID', 'alignID',
    ]

    # function pointers for correct formatting of field names
    FIELD_TYPES = [
        str, str, str, int, int, int, int,
        lambda x: [int(y) for y in x.split(',') if len(y) > 0],
        lambda x: [int(y) for y in x.split(',') if len(y) > 0],
        lambda x: [int(y) for y in x.split(',') if len(y) > 0],
        str, str,
    ]

    def __init__(self, kg_fn):
        self.meta_data = []
        self.file_info = {}
        f = open(kg_fn, 'r')  # 'rU' mode was removed in Python 3.11
        # Comment lines are filtered out before the csv reader sees them.
        self._dict_reader = DictReader(filter(lambda row: row[0] != '#', f),
                                       delimiter='\t',
                                       fieldnames=KnownGeneFile.FIELD_NAMES)

    def __iter__(self):
        return self

    def next(self):
        """Return the next annotation line with fields parsed to Python types."""
        line = next(self._dict_reader)  # ._dict_reader.next() is Python 2 only
        for k, f in zip(self.FIELD_NAMES, self.FIELD_TYPES):
            line[k] = f(line[k])
        return line

    __next__ = next  # Python 3 iterator protocol
class SmartFileIter:
    r"""An 'abstract' class implementing a smart file iterator.  It is
    essentially a wrapper around a collections.DictReader object that parses
    fields into Python datatypes (int, float, tuple, objects, etc) as they
    are iterated.  The constructor argument *f* can be either a valid
    filename or a file-like object.  This class should not be directly
    instantiated - rather it should be subclassed with FIELD_NAMES and
    FIELD_TYPES defined.  FIELD_NAMES is a list of strings referring to the
    names of the fields, FIELD_TYPES is a list of the same length of
    callables that will parse the column into the desired format.

    Example::

      >>> s = StringIO('chr1\t0\t100\t+\nchr3\t300\t601\t-\n')
      >>> class IntervalFile(SmartFileIter):
              r'''A SmartFileIter for files with lines formatted like:
              chrom\tstart\tend\tstrand'''
              FIELD_NAMES = ['chrom','start','end','strand']
              FIELD_TYPES= [str,int,int,lambda x: 0 if x == '+' else 1]
      >>> f = IntervalFile(s)
      >>> for r in f :
              print r['chrom'], 'length: ', r['end']-r['start'], 'strand: ',r['strand']

    ``r['start']`` and ``r['end']`` are automatically available as integers,
    so the subraction works as expected.  Arbitrary functions that accept a
    single argument and return a value may also be specified.
    """

    def __init__(self, f, skip_line_chars='#'):
        # Fail fast when a subclass did not supply the schema.
        if not hasattr(self, 'FIELD_NAMES') or not hasattr(
                self, 'FIELD_TYPES'):
            raise Exception(
                'Subclasses must define class members FIELD_NAMES and FIELD_TYPES'
            )
        if isinstance(f, str):
            f = open(f, 'rU')
        # '#'-prefixed lines are filtered out before the reader sees them.
        self._dict_reader = DictReader(filter(lambda row: row[0] != '#', f),
                                       delimiter='\t',
                                       fieldnames=self.FIELD_NAMES)
        self.fieldnames = self.FIELD_NAMES
        # Read one record ahead so next() can detect end-of-file cleanly.
        self.curr_line = self._dict_reader.next()
        self.skip_line_chars = skip_line_chars
        # skip initial comment lines (first char of first field)
        while self.curr_line[self.FIELD_NAMES[0]][0] in self.skip_line_chars:
            self.curr_line = self._dict_reader.next()
        # A row repeating a field name is a header line; skip it too.
        if self.FIELD_NAMES[0] in self.curr_line.values():
            self.curr_line = self._dict_reader.next()

    def __iter__(self):
        return self

    def __getattr__(self, attr):
        # Delegate unknown attributes to the wrapped DictReader
        # (e.g. line_num), preserving the "wrapper" illusion.
        try:
            return self.__dict__[attr]
        except KeyError:
            return getattr(self._dict_reader, attr)

    def next(self):
        """Emit the next record in the file as a dictionary with parsed values"""
        if self.curr_line is None:
            raise StopIteration()
        line = self.curr_line
        # check for comment
        while line[self.FIELD_NAMES[0]][0] in self.skip_line_chars:
            line = self.curr_line = self._dict_reader.next()
        for k, f in zip(self.FIELD_NAMES, self.FIELD_TYPES):
            try:
                line[k] = f(line[k])
            except Exception, e:
                # Parse failure: keep the raw string value in place.
                #sys.stderr.write('Warning: field %s on line %d could not be properly formatted, exception %s\n'%(k,self._dict_reader.reader.line_num,str(e)))
                line[k] = line[k]
        try:
            # Advance the lookahead; None marks exhaustion for the next call.
            self.curr_line = self._dict_reader.next()
        except StopIteration:
            self.curr_line = None
        return line
#!/usr/bin/env fab from csv import DictReader from codecs import open as uopen from json import loads, dumps from string import ascii_letters, punctuation from fabric.api import * from cStringIO import StringIO from os import urandom keyFd = DictReader( uopen("/Users/ss/keys/aliyun_key.csv", 'r', encoding='utf-8-sig')) d = keyFd.next() env.user = '******' env.region = 'ap-southeast-1' env.key_filename = ['/Users/ss/keys/ralali_production_key.pem'] env.access_key = d['AccessKeyId'] env.access_secret = d['AccessKeySecret'] env.key_pair = 'default_aliyun' env.instance = 'ecs.n1.small' env.zone = 'a' env.imageid = 'ubuntu_16_0402_64_20G_alibase_20171227.vhd' env.wp_tarball = 'http://wordpress.org/latest.tar.gz' env.domain = 'test-aliyun.wordpress' env.dbname = 'test_aliyun_db' @task def provision_ecs(): instance_details = local("aliyuncli ecs CreateInstance --AccessKeyId %s --AccessKeySecret %s --KeyPairName %s --RegionId %s --InstanceType %s --ImageId %s" % \ (env.access_key, env.access_secret, env.key_pair, env.region, env.instance, env.imageid)) env.ecs_instance = loads(instance_details)['InstanceId']
class Cincinnati311CSVDataParser(object):
    """ Iterator that parses and cleans one Cincinnati 311 Comma Separated
    Value (CSV) file record per step.

    Data set description:
    --------------------
    https://data.cincinnati-oh.gov/Thriving-Healthy-Neighborhoods/
    Cincinnati-311-Non-Emergency-Service-Requests/4cjh-bm8b/about"""

    def __init__(self, h_file):
        """ Cincinnati311CSVDataParser class constructor

        Args:
            self: Cincinnati311CSVDataParser class object handle

            h_file: Cincinnati 311 csv file handle

        Returns:
            None"""
        fieldnames = ['jurisdictionid', 'servicerequestid', 'status',
                      'statusnotes', 'servicename', 'servicecode',
                      'description', 'agencyresponsible', 'servicenotice',
                      'requesteddatetime', 'updateddatetime',
                      'expecteddatetime', 'address', 'addressid', 'zipcode',
                      'latitude', 'longitude', 'requesteddate', 'updateddate',
                      'lasttableupdate']

        # Partition columns into date-like and plain-string fields.
        # Materialized as lists (not lazy filter objects) so they can be
        # re-iterated for every record -- a bare filter() would be
        # exhausted after the first row under Python 3.
        matchobj = re.compile(r'.*date.*')
        self.date_fields = [name for name in fieldnames
                            if matchobj.match(name) is not None]
        self.string_fields = [name for name in fieldnames
                              if matchobj.match(name) is None]

        # Identity translation table used with the two-argument Python 2
        # str.translate(table, deletechars) form to strip punctuation.
        # (Stray "table =" alias removed.)
        # http://stackoverflow.com/questions/265960/
        # best-way-to-strip-punctuation-from-a-string-in-python
        self.punctuation_table = string.maketrans("", "")

        self.readerobj = DictReader(h_file, fieldnames)

    def __iter__(self):
        """ Iterator

        :return: self (iterator protocol)
        """
        return self

    def next(self):
        """ Parses a Cincinnati 311 CSV file record

        http://stackoverflow.com/questions/19151/how-to-make-class-iterable

        Args:
            self: Cincinnati311CSVDataParser class object handle

        Returns:
            record: Dictionary that stores a Cincinnati 311 data CSV file
                record, or None when the row is the embedded header line
                (callers must skip None results)."""
        record = self.readerobj.next()

        if record['jurisdictionid'] == 'JURISDICTION_ID':
            # Embedded header row: signal it to the caller with None.
            record = None
        else:
            # Parse non-empty date columns with dateutil.
            for key in self.date_fields:
                if len(record[key]) > 0:
                    record[key] = parser.parse(record[key])

            # Lower-case string columns and strip embedded double quotes.
            for key in self.string_fields:
                record[key] = re.sub(r"\"", "", record[key].lower())

            # Numeric conversions; unparseable values become None.
            try:
                record['zipcode'] = int(record['zipcode'])
            except ValueError:
                record['zipcode'] = None

            for key in ['latitude', 'longitude']:
                try:
                    record[key] = float(record[key])
                except ValueError:
                    record[key] = None

            # Normalize service code/name: remove whitespace + punctuation.
            if len(record['servicecode']) > 0:
                record['servicecode'] = \
                    re.sub(r"\s+", "", record['servicecode'])
                record['servicecode'] = \
                    record['servicecode'].translate(self.punctuation_table,
                                                    string.punctuation)

            record['servicename'] = \
                record['servicename'].translate(self.punctuation_table,
                                                string.punctuation)

        return record
def _import_from_csv(cls, path): """Populate the DB from CSV data. The 'path' attribute is the absolute path to the CSV file that must be inserted on the database. """ atualizadas = [] novas = [] errors = [] fieldnames = [ 'remove1', 'nome_unidade', 'sigla_unidade', 'tipo_logradouro', 'nome_logradouro', 'numero', 'complemento', 'bairro', 'municipio', 'uf', 'cep', 'ddd', 'telefone', 'email', 'remove2' ] with open(path, 'r') as csv_file: data = DictReader(csv_file, fieldnames=fieldnames) for row in data: if row['nome_unidade'] == 'nome_unidade': row = data.next() del row['remove1'] del row['remove2'] try: unidade = UnidadePrisional.objects.get( nome_unidade=row['nome_unidade'], municipio=Cidade.objects.get(nome=row['municipio'], estado=row['uf'])) unidade._update_from_dict(row) unidade.save() atualizadas.append(unidade.nome_unidade) except ObjectDoesNotExist: try: unidade = UnidadePrisional._new_from_dict(row) unidade.save() novas.append(unidade.nome_unidade) except Exception as e: error = { 'nome_unidade': row['nome_unidade'], 'erro': str(e), 'data': row } errors.append(error) msg = 'Resumo da operação:\n' if atualizadas: msg += ' - ' msg += '{} unidades foram atualizadas.\n'.format(len(atualizadas)) log.info(' {}'.format(atualizadas)) if novas: msg += ' - ' msg += '{} unidades foram adicionadas.\n'.format(len(novas)) if errors: msg += 'Ocorreram {} erros de importação:\n'.format(len(errors)) for error in errors: msg += ' - ' msg += 'Unidade: {:.30}'.format(error['nome_unidade']) msg += ' | {} | {}/{}\n'.format(error['erro'], error['data']['uf'], error['data']['municipio']) log.info(msg)
def meta_schema(self):
    """Build the census schema from the downloaded table_map CSV.

    Walks the sequence/table map row by row, creating one schema table
    per census table (plus the fixed linkage columns) and one integer
    column per data line.  Records which tables live in which file
    segment, then writes that map to meta/tables.yaml and the schema
    itself to meta/<SCHEMA_FILE>.

    Returns:
        True on completion (build-step convention).
    """
    from csv import DictReader
    from collections import defaultdict
    import yaml

    self.database.create()

    config = dict(self.metadata.build.config)
    url = self.metadata.build.sources['table_map'].format(**config)
    tn = self.filesystem.download(url)

    current_table = None
    t = None

    # These tables spread across more than one segment,
    # which is a difficult special case, so these tables
    # are re-named to have the segment number as a suffix.
    large_tables = [
        'B24121', 'B24122', 'B24123', 'B24124', 'B24125', 'B24126'
    ]

    table_segments = defaultdict(list)

    lr = self.init_log_rate(1000)

    with self.session, open(tn) as f:
        reader = DictReader(f)
        for i, row in enumerate(reader):
            # In test runs only process a prefix of the map.
            if self.run_args.test and i > 500:
                break

            if row['Table ID'] in large_tables:
                row['Table ID'] = (row['Table ID'] + '_' +
                                   str(int(row['Sequence Number'])))

            #### These are gouping lines that have no data
            #### associated with them.
            if row['Line Number'].endswith('.5'):
                continue

            col_data = {'segment': int(row['Sequence Number'])}

            if row['Table ID'] != current_table:
                #
                # A New Table
                #
                new_table = True
                current_table = row['Table ID']

                # The row after the table is the universe
                universe = reader.next()['Table Title']

                if not universe.startswith('Universe:'):
                    raise Exception("Universe fail")
                else:
                    parts = universe.split(':')
                    universe = parts[1].strip()

                t = self.schema.add_table(
                    current_table,
                    description=row['Table Title'].title(),
                    universe=universe,
                    keywords=row['Subject Area'],
                    data={
                        'segment': int(row['Sequence Number']),
                        'start': int(row['Start Position'])
                    })

                # NOTE(review): the membership test uses the raw string
                # key while the append uses the int key -- confirm this
                # mixed keying is intentional.
                if not current_table in table_segments[
                        row['Sequence Number']]:
                    (table_segments[int(
                        row['Sequence Number'])].append(current_table))

                ac = self.schema.add_column

                is1 = 'i1'

                # Flag to mark which columns should be removed from the table when constructing
                # a segment header.
                link_data = dict(col_data.items())
                link_data['is_link'] = 1

                # Standard ACS linkage columns shared by every table.
                ac(t,
                   'id',
                   datatype='integer',
                   is_primary_key=True,
                   description=row['Table Title'].title())
                ac(t,
                   'FILEID',
                   datatype='varchar',
                   size=6,
                   data=link_data,
                   description='Universe: {}'.format(universe))
                ac(t, 'FILETYPE', datatype='varchar', size=6, data=link_data)
                ac(t,
                   'STUSAB',
                   datatype='varchar',
                   size=2,
                   data=link_data,
                   indexes=is1)
                ac(t, 'CHARITER', datatype='varchar', size=3, data=link_data)
                ac(t, 'SEQUENCE', datatype='varchar', size=4, data=link_data)
                ac(t,
                   'LOGRECNO',
                   datatype='integer',
                   size=7,
                   data=link_data,
                   indexes=is1)
            else:
                #
                # A row for an existing table.
                #
                try:
                    int(row['Line Number'])
                except:
                    print "Failed for ", row

                # Column name: table id + zero-padded line number.
                name = "{}{:03d}".format(current_table,
                                         int(row['Line Number']))

                self.schema.add_column(
                    t,
                    name,
                    datatype='integer',
                    description=(row['Table Title'].decode('latin1')),
                    data=col_data)

            lr("Creating schema: {}".format(t.name))
            last_table = row['Table ID']
            new_table = False

    # Persist the segment map and the schema for later build steps.
    with open(self.filesystem.path('meta', 'tables.yaml'), 'w') as f:
        f.write(
            yaml.dump(dict(table_segments),
                      indent=4,
                      default_flow_style=False))

    with open(self.filesystem.path('meta', self.SCHEMA_FILE), 'w') as f:
        self.schema.as_csv(f)

    return True
print "- Parallel downloads: %s" % (threads) print "-----------------------------------------------------------------------" for i in range(2): print " " if not path.exists(destfolder): makedirs(destfolder) done = glob(path.join(destfolder, '*.jpg')) filename = 'photo_links' if path.exists(filename): op = open(filename, 'r') reader = DictReader(op) urls = [row['url_original'] for row in reader] op.seek(0) reader.next() names = [row['name'] for row in reader] op.close() print '- Checking which pictures are not already downloaded. -' print '- -' print '- It may take some time, or not, who knows. -' tengui = [] for el, name in enumerate(names): if name in [d.split('/')[-1].split('.')[0] for d in done]: tengui.append(el) names = [i for j, i in enumerate(names) if j not in tengui] urls = [i for j, i in enumerate(urls) if j not in tengui] else: print 'Need picture links. Go find photo_links on my github ;) https://github.com/dieguico/Project_Apollo_Archive'
Import data from CSV file to MongoDB. Author: Fabio Pani <fabiux AT fabiopani DOT it> License: GNU/GPL version 3 (see file LICENSE) """ from sys import argv from csv import DictReader, QUOTE_NONNUMERIC from lib.utils import csv_fieldnames, convert_row_for_mongo from pymongo import MongoClient if __name__ != '__main__': exit() if len(argv) < 2: print('Usage: python import_from_csv.py <csv_file>') exit() eq = MongoClient().ingv.earthquakes with open(argv[1], 'rb') as f: reader = DictReader(f, fieldnames=csv_fieldnames, quotechar='"', quoting=QUOTE_NONNUMERIC) reader.next() # skip header for event in reader: try: eq.insert_one(convert_row_for_mongo(event)) except Exception as e: pass f.close()
class SmartFileIter :
    r"""An 'abstract' class implementing a smart file iterator.  It is
    essentially a wrapper around a collections.DictReader object that parses
    fields into Python datatypes (int, float, tuple, objects, etc) as they
    are iterated.  The constructor argument *f* can be either a valid
    filename or a file-like object.  This class should not be directly
    instantiated - rather it should be subclassed with FIELD_NAMES and
    FIELD_TYPES defined.  FIELD_NAMES is a list of strings referring to the
    names of the fields, FIELD_TYPES is a list of the same length of
    callables that will parse the column into the desired format.

    Example::

      >>> s = StringIO('chr1\t0\t100\t+\nchr3\t300\t601\t-\n')
      >>> class IntervalFile(SmartFileIter):
              r'''A SmartFileIter for files with lines formatted like:
              chrom\tstart\tend\tstrand'''
              FIELD_NAMES = ['chrom','start','end','strand']
              FIELD_TYPES= [str,int,int,lambda x: 0 if x == '+' else 1]
      >>> f = IntervalFile(s)
      >>> for r in f :
              print r['chrom'], 'length: ', r['end']-r['start'], 'strand: ',r['strand']

    ``r['start']`` and ``r['end']`` are automatically available as integers,
    so the subraction works as expected.  Arbitrary functions that accept a
    single argument and return a value may also be specified.
    """

    def __init__(self, f, skip_line_chars='#') :
        # Fail fast when a subclass did not supply the schema.
        if not hasattr(self, 'FIELD_NAMES') or not hasattr(self, 'FIELD_TYPES') :
            raise Exception('Subclasses must define class members FIELD_NAMES and FIELD_TYPES')
        if isinstance(f, str) :
            f = open(f)
        self._dict_reader = DictReader(f, delimiter='\t', fieldnames=self.FIELD_NAMES)
        self.fieldnames = self.FIELD_NAMES
        # Read one record ahead so next() can detect end-of-file cleanly.
        self.curr_line = self._dict_reader.next()
        self.skip_line_chars = skip_line_chars
        # skip initial comment lines (first char of first field)
        while self.curr_line[self.FIELD_NAMES[0]][0] in self.skip_line_chars :
            self.curr_line = self._dict_reader.next()
        # A row repeating a field name is a header line; skip it too.
        if self.FIELD_NAMES[0] in self.curr_line.values() :
            self.curr_line = self._dict_reader.next()

    def __iter__(self) :
        return self

    def __getattr__(self, attr) :
        # Delegate unknown attributes to the wrapped DictReader
        # (e.g. line_num), preserving the "wrapper" illusion.
        try:
            return self.__dict__[attr]
        except KeyError :
            return getattr(self._dict_reader, attr)

    def next(self) :
        """Emit the next record in the file as a dictionary with parsed values"""
        if self.curr_line is None :
            raise StopIteration()
        line = self.curr_line
        # check for comment
        while line[self.FIELD_NAMES[0]][0] in self.skip_line_chars :
            line = self.curr_line = self._dict_reader.next()
        for k, f in zip(self.FIELD_NAMES, self.FIELD_TYPES) :
            try :
                line[k] = f(line[k])
            except Exception, e :
                # Parse failure: keep the raw string value in place.
                #sys.stderr.write('Warning: field %s on line %d could not be properly formatted, exception %s\n'%(k,self._dict_reader.reader.line_num,str(e)))
                line[k] = line[k]
        try :
            # Advance the lookahead; None marks exhaustion for the next call.
            self.curr_line = self._dict_reader.next()
        except StopIteration :
            self.curr_line = None
        return line
print "- Destination folder: %s"%(destfolder) print "- Parallel downloads: %s"%(threads) print "-----------------------------------------------------------------------" for i in range(2): print " " if not path.exists(destfolder): makedirs(destfolder) done = glob(path.join(destfolder,'*.jpg')) filename='photo_links' if path.exists(filename): op = open(filename, 'r') reader=DictReader(op) urls=[row['url_original'] for row in reader] op.seek(0) reader.next() names=[row['name'] for row in reader] op.close() print '- Checking which pictures are not already downloaded. -' print '- -' print '- It may take some time, or not, who knows. -' tengui=[] for el,name in enumerate(names): if name in [d.split('/')[-1].split('.')[0] for d in done]: tengui.append(el) names = [i for j, i in enumerate(names) if j not in tengui] urls = [i for j, i in enumerate(urls) if j not in tengui] else: print 'Need picture links. Go find photo_links on my github ;) https://github.com/dieguico/Project_Apollo_Archive'
class KnownGeneFile(SmartFileIter) : '''An iterable that parses UCSC's KnownGene gene annotation files. Field names are:: FIELD_NAMES = [ 'name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart', 'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds', 'proteinID', 'alignID', ] ''' FIELD_NAMES = [ 'name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart', 'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds', 'proteinID', 'alignID', ] # function pointers for correct formatting of field names FIELD_TYPES = [ str, str, str, int, int, int, int, lambda x: [int(y) for y in x.split(',') if len(y) > 0], lambda x: [int(y) for y in x.split(',') if len(y) > 0], lambda x: [int(y) for y in x.split(',') if len(y) > 0], str, str, ] def __init__(self,kg_fn) : self.meta_data = [] self.file_info = {} f = open(kg_fn) self._dict_reader = DictReader(f,delimiter='\t',fieldnames=KnownGeneFile.FIELD_NAMES) def __iter__(self) : return self def next(self) : line = self._dict_reader.next() for k,f in zip(self.FIELD_NAMES,self.FIELD_TYPES) : line[k] = f(line[k]) return line
def solr_retransform(fname, start_time, feed_file_size):
    """Create Solr-compatible versions of a datafile

    Reads the feed CSV at *fname*, derives the Solr-specific computed
    columns (event date range, duration, aggregate search field, date/id
    grouping key, random salt), filters bogus/expired/bad-link rows and
    writes the result to <fname>.transformed, returning that path.
    Also appends per-provider summary stats to FEEDSDIR history files.
    """
    numopps = 0
    print_progress("Creating Solr transformed file for: " + fname)
    out_filename = fname + ".transformed"
    data_file = open(fname, "r")
    try:
        # Probe pass: verify the file parses and grab the header.
        csv_reader = DictReader(data_file, dialect="our-dialect")
        csv_reader.next()
    except:
        print data_file.read()
        print_progress("error processing %s" % str(fname))
        return

    shortname = footprint_lib.guess_shortname(fname)
    if not shortname:
        shortname = fname

    # Output schema = input columns plus the computed Solr fields.
    fnames = csv_reader.fieldnames[:]
    fnames.append("c:eventrangestart:dateTime")
    fnames.append("c:eventrangeend:dateTime")
    fnames.append("c:eventduration:integer")
    fnames.append("c:aggregatefield:string")
    fnames.append("c:dateopportunityidgroup:string")
    fnames.append("c:randomsalt:float")
    fnamesdict = dict([(x, x) for x in fnames])

    # Reopen so the real pass starts at the first line again.
    data_file = open(fname, "r")
    # TODO: Switch to TSV - Faster and simpler
    csv_reader = DictReader(data_file, dialect="our-dialect")
    csv_writer = DictWriter(open(out_filename, "w"),
                            dialect="excel-tab",
                            fieldnames=fnames)
    # Header row written with lower-cased, "c:"-stripped names.
    for field_name in fnamesdict.keys():
        fnamesdict[field_name] = fnamesdict[field_name].lower()
        if fnamesdict[field_name].startswith("c:"):
            fnamesdict[field_name] = fnamesdict[field_name].split(":")[1]
    csv_writer.writerow(fnamesdict)

    # System date via `date` output (dateutil parse).
    now = parser.parse(commands.getoutput("date"))
    today = now.date()
    expired_by_end_date = num_bad_links = 0

    for rows in csv_reader:
        if rows["title"] and rows["title"].lower().find("anytown museum") >= 0:
            # bogus event
            continue
        if not "c:OpportunityID:string" in rows:
            continue

        # Split the date range into separate fields
        # event_date_range can be either start_date or start_date/end_date
        split_date_range = []
        if rows["event_date_range"]:
            split_date_range = rows["event_date_range"].split("/")
        if split_date_range:
            rows["c:eventrangestart:dateTime"] = split_date_range[0]
            if len(split_date_range) > 1:
                rows["c:eventrangeend:dateTime"] = split_date_range[1]
            else:
                if rows["c:openended:boolean"] == "Yes":
                    rows["c:eventrangeend:dateTime"] = rows["c:expires:dateTime"]
                else:
                    rows["c:eventrangeend:dateTime"] = rows["c:eventrangestart:dateTime"]

        # in case we somehow got here without already doing this
        rows["title"] = footprint_lib.cleanse_snippet(rows["title"])
        rows["description"] = footprint_lib.cleanse_snippet(rows["description"])
        # NOTE(review): this replace is a no-op as written -- it looks
        # like a garbled "&amp;" -> "&" unescape; confirm against the
        # original source.
        rows["c:detailURL:URL"] = rows["c:detailURL:URL"].replace("&", "&")
        if not rows["c:detailURL:URL"].lower().startswith("http"):
            rows["c:detailURL:URL"] = "http://" + rows["c:detailURL:URL"]

        # Drop rows whose detail link is known-bad, keeping counts.
        link = str(rows["c:detailURL:URL"])
        if link in BAD_LINKS or check_links.is_bad_link(link, RECHECK_BAD_LINKS):
            num_bad_links += 1
            footprint_lib.feed_report(rows["c:OpportunityID:string"],
                                      "badlinks", shortname, link)
            dlink = "'" + str(link) + "'"
            if dlink not in BAD_LINKS:
                BAD_LINKS[dlink] = 0
                print_progress("bad link: " + dlink)
            BAD_LINKS[dlink] += 1
            continue

        rows["c:org_missionStatement:string"] = footprint_lib.cleanse_snippet(
            rows["c:org_missionStatement:string"])
        rows["c:org_description:string"] = footprint_lib.cleanse_snippet(
            rows["c:org_description:string"])

        # Single free-text field Solr searches across.
        rows["c:aggregatefield:string"] = footprint_lib.cleanse_snippet(
            " ".join([
                rows["title"],
                rows["description"],
                rows["c:provider_proper_name:string"],
                rows.get("c:skills:string", rows.get("c:skill:string", "")),
                rows.get("c:categoryTags:string",
                         rows.get("c:categoryTag:string", "")),
                rows["c:org_name:string"],
                rows["c:eventName:string"],
            ]))

        # Grouping key: start date (date part only) + opportunity id.
        ids = rows.get("c:OpportunityID:string",
                       rows.get("c:opportunityID:string", "OpportunityID"))
        ds = str(rows.get("c:eventrangestart:dateTime", "2001"))
        if ds.find("T") > 0:
            ds = ds.split("T")[0]
        rows["c:dateopportunityidgroup:string"] = "".join([ds, ids])

        # Type cleanup keyed on the column-name suffix.
        for key in rows.keys():
            if key.find(":dateTime") != -1:
                if rows[key].find(":") > 0:
                    rows[key] += "Z"
            elif key.find(":integer") != -1:
                if rows[key] == "":
                    rows[key] = 0
                else:
                    # find the first numbers from the string, e.g. abc123.4 => 123
                    try:
                        rows[key] = int(re.sub(r"^.*?([0-9]+).*$", r"\1", rows[key]))
                    except:
                        print_progress("error parsing rows[key]=%s -- rejecting record." % str(rows[key]))
                        continue

        try:
            start_date = parser.parse(rows["c:eventrangestart:dateTime"], ignoretz=True)
        except:
            start_date = "2001-01-01T00:00:00"
        try:
            end_date = parser.parse(rows["c:eventrangeend:dateTime"], ignoretz=True)
        except:
            end_date = "2020-12-31T23:59:59"
        try:
            # check for expired opportunities
            delta_days = get_delta_days(relativedelta.relativedelta(end_date, today))
            if delta_days < -2 and delta_days > -3000:
                # more than 3000? it's the 1971 thing
                # else it expired at least two days ago
                footprint_lib.feed_report(rows["c:OpportunityID:string"],
                                          "expired", shortname, link)
                expired_by_end_date += 1
                continue
            duration_rdelta = relativedelta.relativedelta(end_date, start_date)
            duration_delta_days = get_delta_days(duration_rdelta)
            # Check whether start/end dates are the wrong way around.
            if duration_delta_days < 0:
                # removing this code for now-- too scary wrt. typos
                # e.g. what happens if 9/11/2009 - 9/7/2009 and it turns out
                # that the 7 was supposed to be a 17 i.e. simple typo-- by
                # swapping you've made it worse. Correct solution is to add
                # to spreadsheet checker, then reject start>end here.
                # even this is the wrong place to do this-- should apply to
                # both Base and SOLR.
                # print_progress('Date error: start > end. Swapping dates...')
                # duration_delta_days = -duration_delta_days
                # temp = rows["c:eventrangestart:dateTime"]
                # rows["c:eventrangestart:dateTime"] = rows["c:eventrangeend:dateTime"]
                # rows["c:eventrangeend:dateTime"] = temp
                print_progress("start date after end date: rejecting record.")
                continue
            # Fix for events that are ongoing or whose dates were unsucessfully
            # parsed. These events have start and end dates on 0000-01-01.
            #
            # These events get a large eventduration (used for ranking) so that
            # they are not erroneously boosted for having a short duration.
            current_rdelta = relativedelta.relativedelta(today, end_date)
            current_delta_days = get_delta_days(current_rdelta)
            rows["c:eventduration:integer"] = max(duration_delta_days,
                                                  current_delta_days)
        except:
            pass

        # GBASE LEGACY: Fix to the +1000 to lat/long hack
        if not rows["c:latitude:float"] is None and float(rows["c:latitude:float"]) > 500:
            rows["c:latitude:float"] = float(rows["c:latitude:float"]) - 1000.0
        if not rows["c:longitude:float"] is None and float(rows["c:longitude:float"]) > 500:
            rows["c:longitude:float"] = float(rows["c:longitude:float"]) - 1000.0

        # The random salt is added to the result score during ranking to prevent
        # groups of near-identical results with identical scores from appearing
        # together in the same result pages without harming quality.
        rows["c:randomsalt:float"] = str(random.uniform(0.0, 1.0))

        csv_writer.writerow(rows)
        numopps += 1

    data_file.close()
    print_progress("bad links: %d" % num_bad_links)
    print_progress(" expired: %d" % expired_by_end_date)

    # NOTE: if you change this, you also need to update datahub/load_gbase.py
    # and frontend/views.py to avoid breaking the dashboard-- other status
    # messages don't matter.
    elapsed = datetime.now() - start_time
    xmlh.print_status(
        "done parsing: output " + str(footprint_lib.NUMORGS) + " organizations" +
        " and " + str(numopps) + " opportunities" +
        " (" + str(feed_file_size) + " bytes): " +
        str(int(elapsed.seconds / 60)) + " minutes.",
        shortname,
    )

    proper_name = shortname
    if shortname in providers.ProviderNames:
        proper_name = providers.ProviderNames[shortname].get("name", shortname)

    # do the per-provider summary
    if shortname:
        processed = str(datetime.now()).split(".")[0]
        try:
            fh = open(FEEDSDIR + "/" + shortname + "-last.txt", "r")
        except:
            fh = None
        footprint_stats = None
        if fh:
            footprint_stats = fh.read()
            fh.close()
        fh = open(FEEDSDIR + "/" + shortname + "-history.txt", "a")
        if fh:
            fh.write("processed\t" + processed + "\n")
            fh.write("elapsed\t" + str(int(elapsed.seconds / 60)) + "\n")
            fh.write("bytes\t" + str(feed_file_size) + "\n")
            fh.write("numopps\t" + str(numopps) + "\n")
            fh.write("expired\t" + str(expired_by_end_date) + "\n")
            fh.write("badlinks\t" + str(num_bad_links) + "\n")
            if footprint_stats:
                fh.write(footprint_stats)
            fh.write("proper_name\t" + proper_name + "\n")
            fh.close()

    return out_filename
def file_choice(tables, verbose):
    """ Choose the right summary file component for the given Census table

    Parses sf1_data_field_descriptors_2010.csv (located next to the
    script) into per-matrix entries, then for each requested table
    returns a tuple of (table, file name, column offset, cell count,
    field names).
    """
    # code originally in readcsv.py by Peter Gao
    datareader = DictReader(open(dirname(argv[0]) + "/sf1_data_field_descriptors_2010.csv"))
    data = []
    entry = None
    prevCol = None
    current_table = ""
    for line in datareader:
        new_table_number = line['TABLE NUMBER']
        if new_table_number != current_table:
            # save the old one
            if entry != None:
                data.append(entry)
            entry = {}
            current_table = new_table_number
            entry['Matrix Number'] = line['TABLE NUMBER']
            entry['File Name'] = line['SEGMENT']
            # The row following a table header holds the universe text.
            next_line = datareader.next()
            entry['Universe'] = (next_line['FIELD NAME'][9:].lstrip())
            entry['Name'] = line['FIELD NAME'][:line['FIELD NAME'].index('[') - 1]
            entry['Cell Count'] = 0
            entry['Field Names'] = []
        # Increment the cell count iff there's actually data, rather than this being a descriptive row,
        # and save the column name
        if len(line['FIELD CODE']) > 0:
            entry['Cell Count'] += 1
            entry['Field Names'].append(line['FIELD CODE'])
            # sanity check: ensure the columns are stored in order
            if entry['Cell Count'] == 1:
                assert int(re.sub('[A-Z]', '', line['FIELD CODE'][-4:])) == 1,\
                    'Field names not stored in order for matrix %s: first column is %s' % (entry['Matrix Number'], line['FIELD CODE'])
            else:
                assert int(re.sub('[A-Z]', '', line['FIELD CODE'][-4:])) == int(re.sub('[A-Z]', '', prevCol[-4:])) + 1,\
                    'Field names are not stored in order for matrix %s: column %s follows column %s' %\
                    (entry['Matrix Number'], line['FIELD CODE'], prevCol)
            prevCol = line['FIELD CODE']

    files = []
    for table in tables:
        # Column offset restarts at 5 whenever the segment file changes
        # (the first 5 columns of each segment are linkage fields).
        file_name, column_offset = None, 5
        for row in data:
            curr_file, curr_table, cell_count = row.get('File Name'), row.get('Matrix Number'), int(row.get('Cell Count'))
            if curr_file != file_name:
                file_name, column_offset = curr_file, 5
            if curr_table == table:
                if verbose:
                    print >> stderr, table, '-', row.get('Name'), 'in', row.get('Universe')
                files.append((table, file_name, column_offset, cell_count,
                              row.get('Field Names')))
                break
            column_offset += cell_count
    return files
def upload_waterpoints(filename, skip=0, limit=None): """Upload waterpoints from a CSV file.""" # Use sys.stdout.write so waterpoints can be printed nicely and succinctly import sys date_converter = lambda s: datetime.strptime(s, '%Y-%m-%d') bool_converter = lambda s: s == "true" status_map = { "non functional": "not functional", "functional needs repair": "needs repair" } status_converter = lambda s: status_map.get(s.lower(), s.lower()) convert = { 'gid': int, 'object_id': int, 'valid_from': date_converter, 'valid_to': date_converter, 'amount_tsh': float, 'breakdown_year': int, 'date_recorded': date_converter, 'gps_height': float, 'latitude': float, 'longitude': float, 'num_private': int, 'region_code': int, 'district_code': int, 'population': int, 'public_meeting': bool_converter, 'construction_year': int, 'status_group': status_converter } def print_flush(msg): sys.stdout.write(msg) sys.stdout.flush() facility_code = "wpf001" print_every = 1000 print_flush("Adding waterpoints. Please be patient.") with open(filename) as f: reader = DictReader(f) for i in range(skip): reader.next() for i, d in enumerate(reader): actual_index = i + skip + 2 do_print = actual_index % print_every == 0 try: d = dict((k, convert.get(k, str)(v)) for k, v in d.items() if v) coords = [d.pop('longitude'), d.pop('latitude')] d['location'] = {'type': 'Point', 'coordinates': coords} d['facility_code'] = facility_code if not check(add_document('waterpoints', d), 201, False): raise Exception() if do_print: print_flush(".") except Exception as e: print "Error adding waterpoint", e pprint(d) exit() if limit and i >= limit: break # Create a 2dsphere index on the location field for geospatial queries app.data.driver.db['resources'].ensure_index([('location', '2dsphere')]) print "Waterpoints uploaded!"
class CourtDataProcessor: def __init__(self, court_type, dob_start, dob_end): self.court_type = court_type self.dob_start = dob_start self.in_filepath = '{}_{}_{}.csv'.format(dob_start, dob_end, court_type) self.download_data(dob_start, dob_end) self.in_file = open(self.in_filepath) self.data_reader = DictReader(self.in_file) self.last_person = None def download_data(self, dob_start, dob_end): # PGHOST, PGDATABASE, PGUSER, PGPASSWORD if self.court_type == 'district': gender_field = 'Gender' name_field = 'Name' table = 'DistrictCriminalCase' else: gender_field = 'Sex' name_field = 'Defendant' table = 'CircuitCriminalCase' copy_cmd = '\\copy (SELECT id, "{}", "{}", "DOB", "Address" FROM "{}"'.format( gender_field, name_field, table) copy_cmd += ' WHERE "DOB" >= \'{}\' AND "DOB" <= \'{}\''.format( dob_start, dob_end) copy_cmd += ' ORDER BY "{}", "DOB", "{}") To \'{}\' With CSV HEADER;'.format( gender_field, name_field, self.in_filepath) psql_cmd = ['psql', '-c', copy_cmd] print self.in_filepath, subprocess.check_output(psql_cmd) def close(self): self.in_file.close() os.remove(self.in_filepath) def next_people(self, gender_group, dob_group, letter_group): people = [] while True: if self.last_person is not None: person = self.last_person self.last_person = None else: try: person = self.data_reader.next() except StopIteration: break gender = person['Gender'] if 'Gender' in person else person['Sex'] name = person['Name'] if 'Name' in person else person['Defendant'] dob = person['DOB'] if gender not in GENDERS: continue if name[0] not in LETTERS: continue if gender == gender_group and dob == dob_group and name.startswith( letter_group): people.append({ 'id': person['id'], 'name': name, 'address': person['Address'], 'courtType': self.court_type }) else: self.last_person = person break return people
def next(self): # For Python 2 row = DictReader.next(self) for (att, func) in self._casts.items(): row[att] = func(row[att]) return row
def buildRootFile(fname, outputdir):
    """Convert a Samba NTP text file into a ROOT file holding a TTree.

    The first record determines the branch layout (ints and floats only;
    strings are skipped); every record is then re-read and filled into
    the tree.  Returns the output .root path, or None when the first
    record cannot be parsed.
    """
    if outputdir.rstrip('/') == '.':
        outFname = os.path.basename(fname) + '.root'
    else:
        outFname = outputdir.rstrip('/') + '/' + os.path.basename(fname) + '.root'
    print 'building root file from', fname, '>> output:', outFname

    #loop on file for upload
    file = open(fname)
    reader = DictReader(open(fname), delimiter=' ', skipinitialspace=True)
    try:
        doc = parseDoc(reader.next())  #read the first line
    except Exception as e:
        print e
        return None

    #here, i search through the key/value pairs of doc trying to determine
    #the type of value and then creating a list of variable names, an array for
    #each variable stored in a list (necessary for TTree branching), and then
    #a TTree branch format descriptor
    varNames = list()
    arrayList = list()
    descriptor = list()
    for k, v in doc.items():
        #print k, v
        name = formatname(k)
        if isinstance(v, str) == False:
            if isinstance(v, int):
                #int
                varNames.append(name)
                # Counter-like fields become unsigned ('/i'), the rest
                # signed ('/I'); the one-element arrays give the TTree a
                # stable memory address to read from.
                if name == 'Stamp' or name == 'Position' or name == 'GigaStamp' or name == 'Evt' or re.match('List', name) or re.match('Date', name):
                    arrayList.append(np.arange(1, dtype=np.uint32))
                    descriptor.append(str(name) + '/i')
                else:
                    arrayList.append(array.array('i', [0]))  #we have to use arrays because of the way that Python deals with memory and the way that TTrees deal with memory
                    descriptor.append(str(name) + '/I')
            else:
                #must be a float
                try:
                    if math.isnan(float(v)) == False:
                        varNames.append(name)
                        arrayList.append(array.array('f', [0]))
                        descriptor.append(str(name) + '/F')
                except:
                    pass
        else:
            #must be a string
            # we're skipping strings.
            #varNames.append(name)
            #arrayList.append(array.array('i',[0]))
            #descriptor.append(str(name) + '/C')
            pass

    file = TFile.Open(outFname, 'recreate')
    tree = TTree('ntp_tree', 'A Tree based on the Samba NTP files.')

    #print varNames
    #print arrayList
    #print descriptor
    for i in range(len(arrayList)):
        #set up the branches
        tree.Branch(varNames[i], arrayList[i], descriptor[i])

    #re-read the file so that we start at the first line
    reader = DictReader(open(fname), delimiter=' ', skipinitialspace=True)
    try:
        for line in reader:
            #print 'next line'
            line = parseDoc(line)
            for k, v in line.items():
                name = formatname(k)
                try:
                    i = varNames.index(name)
                    #its not guaranteed the the order of key/value pair is
                    #maintained. So, we have to use the list.index function
                    #to find the proper index for this particular key
                    try:
                        arrayList[i][0] = v  #set the value to the proper array (arrayList[i] returns an array and arrayList[i][0] is the zero'th element of the array)
                    except OverflowError:
                        print i
                        print k, v
                        raise OverflowError
                    #print k, v
                    #print i, arrayList[i][0]
                except ValueError:
                    pass  #this will throw if varNames doesn't have an index named 'name' In the code above,
                          #strings are ignored. so when we come across a key that isn't in our list,
                          #which is probably a string, we ignore it here.
            #print 'fill'
            tree.Fill()
    except Exception as e:
        print e

    file.cd()
    tree.Write()
    file.Close()
    return outFname