Пример #1
0
def analyze_csv(url, sample=1000):
    """Guess a column mapping for the CSV file found at ``url``.

    The first row of the sampled window is taken as the header row; the
    rows after it are used to guess one type per column.

    :param url: location of the CSV file, handed to ``urlopen``.
    :param sample: value passed to ``CSVRowSet`` as its ``window``.
    :returns: on success, a dict with two keys: ``columns`` (the header
        values in file order) and ``mapping`` (lower-cased slug of each
        header -> dict with ``label``, ``column``, ``datatype`` and
        ``type``). Numeric guesses (decimal, integer, float) become
        ``measure``/``float``, guesses whose name starts with ``date``
        become ``date``/``date``, and everything else is an ``attribute``
        keeping the lower-cased repr of the guessed type as its datatype.
        On any failure the exception is logged and
        ``{'error': <message>}`` is returned instead of raising.
    """
    try:
        fileobj = urlopen(url)
        try:
            row_set = CSVRowSet('data', fileobj, window=sample)
            rows = list(row_set.sample)
        finally:
            # The sample is fully materialised above, so the remote handle
            # can be released before any further processing.
            fileobj.close()

        headers, rows = rows[0], rows[1:]
        # Types are guessed from the rows after the first 500. When the file
        # is too short for that slice to contain anything, fall back to all
        # data rows rather than guessing from an empty list.
        guess_rows = rows[500:] or rows
        types = type_guess(guess_rows, types=LIMITED_TYPES)

        mapping = {}
        for header, guessed in zip(headers, types):
            datatype = repr(guessed).lower()
            meta = {
                'label': header.value,
                'column': header.value,
                'datatype': datatype,
            }
            if datatype in ('decimal', 'integer', 'float'):
                # All numeric guesses are collapsed into a float measure.
                meta['type'] = 'measure'
                meta['datatype'] = 'float'
            elif datatype.startswith('date'):
                meta['type'] = 'date'
                meta['datatype'] = 'date'
            else:
                meta['type'] = 'attribute'
            mapping[slugify(header.value).lower()] = meta
        return {'columns': [h.value for h in headers], 'mapping': mapping}
    except Exception as e:
        log.exception(e)
        return {'error': unicode(e)}
Пример #2
0
    def lines(self):
        """Yield one dict per data row of the CSV at ``self.source.url``.

        The first sampled row supplies the column names; it is registered
        with ``headers_processor`` and ``offset_processor(1)`` is applied
        (presumably so the header row itself is not yielded as data --
        confirm against the messytables documentation).

        Each yielded dict maps column name to cell value, with these
        adjustments:

        * ``id`` is renamed to ``row_id`` (a missing ``id`` column raises
          ``KeyError``).
        * ``time`` is set to the empty string so the default value is used.
        * ``cofog`` is replaced by ``cofog{1,2,3}code`` and
          ``cofog{1,2,3}label``, code and label carrying the same value.
        * ``gfsmexpense`` / ``gfsmrevenue`` are each replaced by three
          level fields (``gfsmexpense1`` ... ``gfsmrevenue3``).

        The three source fields are always removed; their derived fields
        are only added when the source value is truthy.
        """
        fh = urlopen(self.source.url)
        try:
            row_set = CSVRowSet('data', fh, window=3)
            headers = [c.value for c in list(row_set.sample)[0]]
            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(1))

            levels = (1, 2, 3)
            for row in row_set:
                row_dict = dict((c.column, c.value) for c in row)
                # Rename id to row_id
                row_dict['row_id'] = row_dict.pop('id')
                # Set time as empty string to use the default value
                row_dict['time'] = ''

                # Transform COFOG field into six fields with code and label
                # as the same value; the lookup is done once per level and
                # shared by both fields.
                cofog = row_dict.pop('cofog', None)
                if cofog:
                    for level in levels:
                        code = self.cofog_code(cofog, level=level)
                        row_dict['cofog%dcode' % level] = code
                        row_dict['cofog%dlabel' % level] = code

                # Transform the gfsm expense and revenue fields into three
                # fields each (expense first, then revenue).
                for field in ('gfsmexpense', 'gfsmrevenue'):
                    value = row_dict.pop(field, None)
                    if value:
                        for level in levels:
                            row_dict['%s%d' % (field, level)] = \
                                self.gfsm_code(value, level=level)
                yield row_dict
        finally:
            # Runs when the generator is exhausted, closed or collected, so
            # the remote handle is not left open.
            fh.close()
Пример #3
0
 def lines(self):
     """Yield one ``{column: value}`` dict per data row of the CSV at
     ``self.source.url``.

     The first sampled row supplies the column names; it is registered
     with ``headers_processor`` and ``offset_processor(1)`` is applied
     (presumably so the header row itself is not yielded as data --
     confirm against the messytables documentation).
     """
     fh = urlopen(self.source.url)
     try:
         row_set = CSVRowSet('data', fh, window=3)
         headers = [c.value for c in list(row_set.sample)[0]]
         row_set.register_processor(headers_processor(headers))
         row_set.register_processor(offset_processor(1))
         for row in row_set:
             yield dict((c.column, c.value) for c in row)
     finally:
         # Runs when the generator is exhausted, closed or collected, so
         # the remote handle is not left open.
         fh.close()
Пример #4
0
    def lines(self):
        """Yield one dict per data row of the CSV at ``self.source.url``.

        The first sampled row supplies the column names; it is registered
        with ``headers_processor`` and ``offset_processor(1)`` is applied
        (presumably so the header row itself is not yielded as data --
        confirm against the messytables documentation).

        Each yielded dict maps column name to cell value, with these
        adjustments:

        * ``id`` is renamed to ``row_id`` (a missing ``id`` column raises
          ``KeyError``).
        * ``time`` is set to the empty string so the default value is used.
        * ``cofog`` is replaced by ``cofog{1,2,3}code`` and
          ``cofog{1,2,3}label``, code and label carrying the same value.
        * ``gfsmexpense`` / ``gfsmrevenue`` are each replaced by three
          level fields (``gfsmexpense1`` ... ``gfsmrevenue3``).

        The three source fields are always removed; their derived fields
        are only added when the source value is truthy.
        """
        fh = urlopen(self.source.url)
        try:
            row_set = CSVRowSet('data', fh, window=3)
            headers = [c.value for c in list(row_set.sample)[0]]
            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(1))

            levels = (1, 2, 3)
            for row in row_set:
                row_dict = dict((c.column, c.value) for c in row)
                # Rename id to row_id
                row_dict['row_id'] = row_dict.pop('id')
                # Set time as empty string to use the default value
                row_dict['time'] = ''

                # Transform COFOG field into six fields with code and label
                # as the same value; the lookup is done once per level and
                # shared by both fields.
                cofog = row_dict.pop('cofog', None)
                if cofog:
                    for level in levels:
                        code = self.cofog_code(cofog, level=level)
                        row_dict['cofog%dcode' % level] = code
                        row_dict['cofog%dlabel' % level] = code

                # Transform the gfsm expense and revenue fields into three
                # fields each (expense first, then revenue).
                for field in ('gfsmexpense', 'gfsmrevenue'):
                    value = row_dict.pop(field, None)
                    if value:
                        for level in levels:
                            row_dict['%s%d' % (field, level)] = \
                                self.gfsm_code(value, level=level)
                yield row_dict
        finally:
            # Runs when the generator is exhausted, closed or collected, so
            # the remote handle is not left open.
            fh.close()