def analyze_csv(url, sample=1000):
    """Inspect the CSV at ``url`` and propose a column mapping.

    The first row of the sampled window is taken as the header row; the
    remaining sampled rows are used to guess a type per column.

    :param url: location passed to ``urlopen`` to fetch the CSV.
    :param sample: number of rows handed to ``CSVRowSet`` as its
        ``window`` (i.e. how many rows are sampled).
    :returns: on success, a dict with ``'columns'`` (the header values in
        order) and ``'mapping'`` (slugified, lower-cased header name ->
        metadata dict with ``label``, ``column``, ``datatype`` and
        ``type``).  On any failure the exception is logged and a dict of
        the form ``{'error': <message>}`` is returned instead; nothing is
        raised to the caller.
    """
    try:
        fileobj = urlopen(url)
        try:
            row_set = CSVRowSet('data', fileobj, window=sample)
            # Keep the ``sample`` argument intact; the sampled rows get
            # their own name.
            rows = list(row_set.sample)
            headers, rows = rows[0], rows[1:]
            # NOTE(review): only rows from index 500 onwards are used for
            # type guessing, so inputs with fewer sampled rows pass an
            # empty list here -- confirm this offset is intentional.
            types = type_guess(rows[500:], types=LIMITED_TYPES)
        finally:
            # Everything needed is in memory now; release the connection
            # whether or not sampling succeeded.
            fileobj.close()

        mapping = {}
        for header, type_ in zip(headers, types):
            type_ = repr(type_).lower()
            name = slugify(header.value).lower()
            meta = {
                'label': header.value,
                'column': header.value,
                'datatype': type_
            }
            if type_ in ['decimal', 'integer', 'float']:
                # All numeric guesses are normalised to a float measure.
                meta['type'] = 'measure'
                meta['datatype'] = 'float'
            elif type_.startswith('date'):
                meta['type'] = 'date'
                meta['datatype'] = 'date'
            else:
                meta['type'] = 'attribute'
            # Headers that slugify to the same name overwrite each other.
            mapping[name] = meta
        return {'columns': [h.value for h in headers],
                'mapping': mapping}
    except Exception as e:
        log.exception(e)
        return {'error': unicode(e)}
def lines(self):
    """Yield one dict per CSV row from ``self.source.url``.

    Each yielded dict maps column name to cell value, with these
    adjustments applied:

    * ``id`` is renamed to ``row_id`` (a ``KeyError`` is raised if the
      row has no ``id`` column);
    * ``time`` is set to the empty string so the default value is used;
    * ``cofog`` is removed and, when non-empty, expanded into
      ``cofog<N>code`` / ``cofog<N>label`` for levels 1-3, both holding
      the result of ``self.cofog_code``;
    * ``gfsmexpense`` and ``gfsmrevenue`` are removed and, when
      non-empty, expanded into ``<field>1`` .. ``<field>3`` via
      ``self.gfsm_code``.

    The remote handle is closed once iteration finishes or the generator
    is closed.
    """
    fh = urlopen(self.source.url)
    try:
        row_set = CSVRowSet('data', fh, window=3)
        # The first sampled row supplies the column names.
        headers = [c.value for c in list(row_set.sample)[0]]
        row_set.register_processor(headers_processor(headers))
        # presumably skips the header row itself -- verify against
        # offset_processor's definition
        row_set.register_processor(offset_processor(1))
        levels = (1, 2, 3)
        for row in row_set:
            row_dict = dict((c.column, c.value) for c in row)

            # Rename id to row_id
            row_dict['row_id'] = row_dict.pop('id')
            # Set time as empty string to use the default value
            row_dict['time'] = ''

            # Transform COFOG field into six fields with code and label
            # as the same value
            cofog = row_dict.pop('cofog', None)
            if cofog:
                for level in levels:
                    row_dict['cofog%dcode' % level] = \
                        self.cofog_code(cofog, level=level)
                    row_dict['cofog%dlabel' % level] = \
                        self.cofog_code(cofog, level=level)

            # Transform the gfsm expense and revenue fields into three
            # fields each (expense first, then revenue)
            for field in ('gfsmexpense', 'gfsmrevenue'):
                value = row_dict.pop(field, None)
                if value:
                    for level in levels:
                        row_dict['%s%d' % (field, level)] = \
                            self.gfsm_code(value, level=level)

            yield row_dict
    finally:
        # Release the connection even if the consumer stops early or a
        # row fails to process.
        fh.close()
def lines(self):
    """Yield one ``{column name: cell value}`` dict per CSV row.

    The CSV is fetched from ``self.source.url``; the first sampled row
    supplies the column names.  The remote handle is closed once
    iteration finishes or the generator is closed.
    """
    fh = urlopen(self.source.url)
    try:
        row_set = CSVRowSet('data', fh, window=3)
        headers = [c.value for c in list(row_set.sample)[0]]
        row_set.register_processor(headers_processor(headers))
        # presumably skips the header row itself -- verify against
        # offset_processor's definition
        row_set.register_processor(offset_processor(1))
        for row in row_set:
            yield dict((c.column, c.value) for c in row)
    finally:
        # Release the connection even if the consumer stops early.
        fh.close()