def read_metadata(cls, source):
    """
    Reads metadata from a scraper for a specified source.

    One ``Dataset`` is built and saved for every element returned by
    ``cls.get_metadata_of_scraper``. The keys looked up in each element
    come from ``cls.BOUNDED``: indexes 0-4 are required, 5-7 are optional.
    An element that fails is logged and reported; it does not abort the
    remaining elements.

    :param source: object providing ``scraper_name``, ``scraperwiki_url``
        and ``scraper_api_key``; it is also stored as ``Dataset.source``.
    :returns: ``None`` when the source has no scraper name or ScraperWiki
        URL; otherwise a dict with ``total`` (elements processed),
        ``errors`` (number of failed elements) and ``report`` (the
        ``repr`` of each failure).
    :raises Exception: if ``cls.get_metadata_of_scraper`` returns a dict
        instead of an iterable of elements.
    """
    if not source.scraper_name or not source.scraperwiki_url:
        return

    scraper_name = source.scraper_name
    api_key = source.scraper_api_key.strip() if source.scraper_api_key else ''
    scraperwiki_url = source.scraperwiki_url.rstrip('/')

    data_list = cls.get_metadata_of_scraper(
        scraperwiki_url, scraper_name, api_key
    )
    if isinstance(data_list, dict):
        raise Exception('Obtained error', repr(data_list))

    bounded = cls.BOUNDED

    def required(elem, index):
        # A missing key raises KeyError; the loop below reports it.
        return elem[bounded[index]].strip()

    def optional(elem, index):
        # Absent keys and explicit null values both mean "not provided".
        key = bounded[index]
        if key not in elem or elem[key] is None:
            return None
        return elem[key].strip()

    errors = []
    total = 0
    for list_elem in data_list:
        total += 1
        try:
            dataset = Dataset()
            dataset.source = source
            dataset.url = required(list_elem, 0)

            # Looked up as optional so that a missing download reaches the
            # explicit check below instead of failing with a bare KeyError.
            download = optional(list_elem, 1)
            if download is None:
                raise Exception(
                    'Dataset %s does not define a valid download URL'
                    % dataset.url
                )
            # NOTE(review): 'https:' URLs fall through to the scraper-table
            # branch - confirm whether that is intended.
            if download.startswith('http:'):
                dataset.download = download
            else:
                dataset.download = '{}{}:{}'.format(
                    SCRAPER_PROTOCOL, scraper_name,
                    get_table_name_from_scraper(download)
                )

            dataset.name = required(list_elem, 2)
            dataset.curator = required(list_elem, 3)
            dataset.license = required(list_elem, 4)
            dataset.description = optional(list_elem, 5)
            dataset.tags = optional(list_elem, 6)
            dataset.bounding_box = optional(list_elem, 7)
            dataset.other_meta = json.dumps(cls._get_unbounded(list_elem))
            dataset.save()
        except Exception as e:
            # Best-effort import: record the failure and keep going.
            logger.exception('Invalid dataset')
            errors.append(repr(e))

    return {'total': total, 'errors': len(errors), 'report': errors}
def read_csv(source, csv_stream):
    """
    Reads metadata from a CSV for a specified source name.

    The stream is parsed as tab-delimited text. The first row supplies the
    ``Dataset`` keyword names; every following row becomes one saved
    ``Dataset`` attached to ``source``. A ``tags`` column, when present and
    non-empty, is parsed with ``parse_tags`` and added after the save.
    A row that fails is logged and reported; it does not abort the rest.

    :param source: a ``Source`` instance, or the name of one to look up.
    :param csv_stream: stream handed to ``CSVKitReader``.
    :returns: dict with ``total`` (data rows processed), ``errors``
        (number of failed rows) and ``report`` (the ``repr`` of each
        failure).
    """
    if not isinstance(source, Source):
        source = Source.objects.get(name=source)

    from csvkit import CSVKitReader

    rows = list(CSVKitReader(csv_stream, delimiter='\t'))
    if not rows:
        # An empty stream has no header row, so there is nothing to import.
        return {'total': 0, 'errors': 0, 'report': []}

    # Column position -> field name, taken from the header row.
    fields = dict(enumerate(rows[0]))
    data_rows = rows[1:]

    errors = []
    for row in data_rows:
        try:
            data = {fields[idx]: value for idx, value in enumerate(row)}
            # Tags are added through dataset.tags after the save, so they
            # must not be passed to the Dataset constructor.
            tags = data.pop('tags', None)
            dataset = Dataset(**data)
            dataset.source = source
            dataset.save()
            if tags:
                dataset.tags.add(*parse_tags(tags))
        except Exception as e:
            # Best-effort import: record the failure and keep going.
            logger.exception('Cannot import a dataset from CSV')
            errors.append(repr(e))

    return {'total': len(data_rows), 'errors': len(errors), 'report': errors}
def read_metadata(cls, source):
    """
    Reads metadata from a scraper for a specified source.

    One ``Dataset`` is built and saved for every element returned by
    ``cls.get_metadata_of_scraper``. The keys looked up in each element
    come from ``cls.BOUNDED``: indexes 0-4 are required, 5-7 are optional.
    An element that fails is logged and reported; it does not abort the
    remaining elements.

    :param source: object providing ``scraper_name``, ``scraperwiki_url``
        and ``scraper_api_key``; it is also stored as ``Dataset.source``.
    :returns: ``None`` when the source has no scraper name or ScraperWiki
        URL; otherwise a dict with ``total`` (elements processed),
        ``errors`` (number of failed elements) and ``report`` (the
        ``repr`` of each failure).
    :raises Exception: if ``cls.get_metadata_of_scraper`` returns a dict
        instead of an iterable of elements.
    """
    if not source.scraper_name or not source.scraperwiki_url:
        return

    scraper_name = source.scraper_name
    api_key = source.scraper_api_key.strip() if source.scraper_api_key else ''
    scraperwiki_url = source.scraperwiki_url.rstrip('/')

    data_list = cls.get_metadata_of_scraper(scraperwiki_url, scraper_name,
                                            api_key)
    if isinstance(data_list, dict):
        raise Exception('Obtained error', repr(data_list))

    bounded = cls.BOUNDED

    def required(elem, index):
        # A missing key raises KeyError; the loop below reports it.
        return elem[bounded[index]].strip()

    def optional(elem, index):
        # Absent keys and explicit null values both mean "not provided".
        key = bounded[index]
        if key not in elem or elem[key] is None:
            return None
        return elem[key].strip()

    errors = []
    total = 0
    for list_elem in data_list:
        total += 1
        try:
            dataset = Dataset()
            dataset.source = source
            dataset.url = required(list_elem, 0)

            # Looked up as optional so that a missing download reaches the
            # explicit check below instead of failing with a bare KeyError.
            download = optional(list_elem, 1)
            if download is None:
                raise Exception(
                    'Dataset %s does not define a valid download URL'
                    % dataset.url)
            # NOTE(review): 'https:' URLs fall through to the scraper-table
            # branch - confirm whether that is intended.
            if download.startswith('http:'):
                dataset.download = download
            else:
                dataset.download = '{}{}:{}'.format(
                    SCRAPER_PROTOCOL, scraper_name,
                    get_table_name_from_scraper(download))

            dataset.name = required(list_elem, 2)
            dataset.curator = required(list_elem, 3)
            dataset.license = required(list_elem, 4)
            dataset.description = optional(list_elem, 5)
            dataset.tags = optional(list_elem, 6)
            dataset.bounding_box = optional(list_elem, 7)
            dataset.other_meta = json.dumps(cls._get_unbounded(list_elem))
            dataset.save()
        except Exception as e:
            # Best-effort import: record the failure and keep going.
            logger.exception('Invalid dataset')
            errors.append(repr(e))

    return {'total': total, 'errors': len(errors), 'report': errors}