import datetime


def from_url(url):
    # Allow the year to go +1 in case we're on New Year's Eve
    # and dealing with timezones.
    max_year = datetime.date.today().year + 1

    def parse_segments(segments):
        for segment in segments:
            try:
                val = int(segment)
                # years
                if 1980 <= val <= max_year:
                    yield val
                    continue
                # months / days
                if 1 <= val <= 31:
                    yield val
                    continue
            except ValueError:
                continue
            except GeneratorExit:
                break

    segments = url.split('/')
    year, month, day = None, None, None
    parser = parse_segments(segments)
    try:
        # Try to extract in year, month, day order.
        year = parser.next()
        month = parser.next()
        day = parser.next()
    except StopIteration:
        pass

    if year and (year < 1980 or year > max_year):
        year = None
    if month and (month < 1 or month > 12):
        month = None
    if day and (day < 1 or day > 31):
        day = None
    if not day:
        day = 1

    if year and month and day:
        try:
            return datetime.datetime(year, month, day)
        except ValueError:
            return None
    return None

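# Illustrative usage sketch (not part of the original module): the example
# URLs are hypothetical, and the assertions assume `from_url` behaves as
# defined above when run with a current-date max_year.
def _demo_from_url():
    # The first three integer segments are taken as year, month, day.
    assert from_url('/archive/2014/07/21/some-slug') == datetime.datetime(2014, 7, 21)
    # A URL with no parseable date segments falls through to None.
    assert from_url('/about/team') is None
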
def _save_raw_data(file_pk, *args, **kwargs):
    """Chunk up the CSV and save data into the DB raw."""
    import_file = ImportFile.objects.get(pk=file_pk)
    if import_file.raw_save_done:
        return {'status': 'warning', 'message': 'raw data already saved'}

    if import_file.source_type == "Green Button Raw":
        return _save_raw_green_button_data(file_pk, *args, **kwargs)

    parser = reader.MCMParser(import_file.local_file)
    cache_first_rows(import_file, parser)
    rows = parser.next()
    import_file.num_rows = 0

    prog_key = get_prog_key('save_raw_data', file_pk)
    tasks = []
    for chunk in batch(rows, 100):
        import_file.num_rows += len(chunk)
        tasks.append(_save_raw_data_chunk.subtask((chunk, file_pk, prog_key)))
    tasks = add_cache_increment_parameter(tasks)

    import_file.num_columns = parser.num_columns()
    import_file.save()

    if tasks:
        chord(tasks, interval=15)(finish_raw_save.subtask([file_pk]))
    else:
        finish_raw_save.task(file_pk)

    return {'status': 'success'}

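# `batch` is a helper defined elsewhere in this codebase; the loop above only
# relies on it yielding fixed-size chunks. A minimal sketch of that behaviour
# (an assumption, not the project's actual implementation):
from itertools import islice


def _batch_sketch(iterable, num):
    """Yield successive lists of at most `num` items from `iterable`."""
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, num))
        if not chunk:
            break
        yield chunk
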
def cache_first_rows(import_file, parser):
    """Cache headers, and rows 2-6 for validation/viewing.

    :param import_file: ImportFile inst.
    :param parser: unicode-csv.Reader instance.

    Unfortunately, this is duplicated logic from data_importer,
    but since data_importer makes many faulty assumptions we
    need to do it differently.

    """
    parser.seek_to_beginning()
    rows = parser.next()

    validation_rows = []
    for i in range(5):
        row = rows.next()
        if row:
            validation_rows.append(row)

    import_file.cached_second_to_fifth_row = "\n".join([
        ROW_DELIMITER.join(map(lambda x: str(x), r.values()))
        for r in validation_rows
    ])

    first_row = rows.next().keys()
    if first_row:
        first_row = ROW_DELIMITER.join(first_row)
    import_file.cached_first_row = first_row or ''

    import_file.save()
    # Reset our file pointer for mapping.
    parser.seek_to_beginning()

def cache_first_rows(import_record, parser):
    """Cache headers, and rows 2-6 for validation/viewing.

    :param import_record: ImportRecord inst.
    :param parser: unicode-csv.Reader instance.

    Unfortunately, this is duplicated logic from data_importer,
    but since data_importer makes many faulty assumptions we
    need to do it differently.

    """
    parser.csvfile.seek(0)
    rows = parser.next()

    first_row = rows.next().values()
    if first_row:
        first_row = ROW_DELIMITER.join(first_row)
    import_record.cached_first_row = first_row or ''

    validation_rows = []
    for i in range(5):
        row = rows.next()
        if row:
            validation_rows.append(row)

    import_record.cached_second_to_fifth_row = "\n".join(
        [ROW_DELIMITER.join(r.values()) for r in validation_rows]
    )

    import_record.save()
    # Reset our file pointer for mapping.
    parser.csvfile.seek(0)

def _save_raw_data(file_pk, *args, **kwargs):
    """Chunk up the CSV or XLSX file and save the raw data into the DB
    BuildingSnapshot table."""
    result = {'status': 'success', 'progress': 100}
    prog_key = get_prog_key('save_raw_data', file_pk)
    try:
        import_file = ImportFile.objects.get(pk=file_pk)
        if import_file.raw_save_done:
            result['status'] = 'warning'
            result['message'] = 'Raw data already saved'
            set_cache(prog_key, result['status'], result)
            return result

        if import_file.source_type == "Green Button Raw":
            return _save_raw_green_button_data(file_pk, *args, **kwargs)

        parser = reader.MCMParser(import_file.local_file)
        cache_first_rows(import_file, parser)
        rows = parser.next()
        import_file.num_rows = 0
        import_file.num_columns = parser.num_columns()

        # Why are we setting the num_rows to the number of chunks?
        tasks = []
        for chunk in batch(rows, 100):
            import_file.num_rows += len(chunk)
            tasks.append(_save_raw_data_chunk.s(chunk, file_pk, prog_key))

        import_file.save()

        # need to rework how the progress keys are implemented here
        tasks = add_cache_increment_parameter(tasks)
        if tasks:
            chord(tasks, interval=15)(finish_raw_save.s(file_pk))
        else:
            finish_raw_save.s(file_pk)
    except StopIteration:
        result['status'] = 'error'
        result['message'] = 'StopIteration Exception'
        result['stacktrace'] = traceback.format_exc()
    except Error as e:
        result['status'] = 'error'
        result['message'] = 'File Content Error: ' + e.message
        result['stacktrace'] = traceback.format_exc()
    except KeyError as e:
        result['status'] = 'error'
        result['message'] = 'Invalid Column Name: "' + e.message + '"'
        result['stacktrace'] = traceback.format_exc()
    except Exception as e:
        result['status'] = 'error'
        result['message'] = 'Unhandled Error: ' + str(e.message)
        result['stacktrace'] = traceback.format_exc()

    set_cache(prog_key, result['status'], result)
    return result

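# The chord above fans the chunk tasks out to Celery workers and runs
# `finish_raw_save` once every chunk task has completed. A self-contained
# sketch of the same fan-out/fan-in pattern; the app name, broker URL, and
# task names here are placeholders, not this project's:
from celery import Celery, chord

sketch_app = Celery('chord_sketch', broker='redis://localhost:6379/0')


@sketch_app.task
def save_chunk_sketch(chunk):
    # Stand-in for per-chunk work; returns how many rows it "saved".
    return len(chunk)


@sketch_app.task
def finalize_sketch(counts):
    # Celery passes the list of all header-task results to the chord callback.
    return sum(counts)


def run_chord_sketch(rows, chunk_size=100):
    chunks = [rows[i:i + chunk_size] for i in range(0, len(rows), chunk_size)]
    return chord(save_chunk_sketch.s(c) for c in chunks)(finalize_sketch.s())
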
def _save_raw_data(file_pk, *args, **kwargs):
    """Chunk up the CSV and save data into the DB raw."""
    result = {'status': 'success', 'progress': 100}
    prog_key = get_prog_key('save_raw_data', file_pk)
    try:
        import_file = ImportFile.objects.get(pk=file_pk)
        if import_file.raw_save_done:
            result['status'] = 'warning'
            result['message'] = 'Raw data already saved'
            cache.set(prog_key, result)
            return result

        if import_file.source_type == "Green Button Raw":
            return _save_raw_green_button_data(file_pk, *args, **kwargs)

        parser = reader.MCMParser(import_file.local_file)
        cache_first_rows(import_file, parser)
        rows = parser.next()
        import_file.num_rows = 0

        tasks = []
        for chunk in batch(rows, 100):
            import_file.num_rows += len(chunk)
            tasks.append(_save_raw_data_chunk.subtask((chunk, file_pk, prog_key)))
        tasks = add_cache_increment_parameter(tasks)

        import_file.num_columns = parser.num_columns()
        import_file.save()

        if tasks:
            chord(tasks, interval=15)(finish_raw_save.subtask([file_pk]))
        else:
            finish_raw_save.task(file_pk)
    except StopIteration:
        result['status'] = 'error'
        result['message'] = 'StopIteration Exception'
        result['stacktrace'] = traceback.format_exc()
    except Error as e:
        result['status'] = 'error'
        result['message'] = 'File Content Error: ' + e.message
        result['stacktrace'] = traceback.format_exc()
    except KeyError as e:
        result['status'] = 'error'
        result['message'] = 'Invalid Column Name: "' + e.message + '"'
        result['stacktrace'] = traceback.format_exc()
    except Exception as e:
        result['status'] = 'error'
        result['message'] = 'Unhandled Error: ' + e.message
        result['stacktrace'] = traceback.format_exc()

    cache.set(prog_key, result)
    return result

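# A sketch of how a caller might poll the progress entry written under
# `prog_key` above. The cache API is Django's low-level cache framework
# (matching the bare `cache.set` calls in this version); the placeholder
# returned when nothing has been cached yet is an assumption:
from django.core.cache import cache as _django_cache


def _poll_progress_sketch(prog_key):
    """Return the cached progress dict, or a pending placeholder if unset."""
    return _django_cache.get(prog_key) or {'status': 'pending', 'progress': 0}
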
def cache_first_rows(import_file, parser):
    """Cache headers, and rows 2-6 for validation/viewing.

    :param import_file: ImportFile inst.
    :param parser: unicode-csv.Reader instance.

    Unfortunately, this is duplicated logic from data_importer,
    but since data_importer makes many faulty assumptions we
    need to do it differently.

    """
    parser.seek_to_beginning()
    rows = parser.next()

    validation_rows = []
    for i in range(5):
        try:
            row = rows.next()
            if row:
                validation_rows.append(row)
        except StopIteration:
            # Fewer than 5 rows in the file.
            break

    # This is a fix for issue #24 to use the original field order when importing.
    # This is ultimately not the correct place for this fix. The correct fix
    # is to update the mcm code to a newer version where the readers in
    # mcm/reader.py have a headers() function defined, and then just do
    #     first_row = parser.headers()
    # But until we can patch the mcm code, this should fix the issue.
    local_reader = parser.reader
    if isinstance(local_reader, reader.ExcelParser):
        first_row = local_reader.sheet.row_values(local_reader.header_row)
    elif isinstance(local_reader, reader.CSVParser):
        first_row = local_reader.csvreader.fieldnames
        first_row = [local_reader._clean_super(x) for x in first_row]
    else:
        # Default to the original behavior for an unknown parser type,
        # for lack of anything better.
        first_row = rows.next().keys()

    tmp = []
    for r in validation_rows:
        tmp.append(ROW_DELIMITER.join([str(r[x]) for x in first_row]))
    import_file.cached_second_to_fifth_row = "\n".join(tmp)

    if first_row:
        first_row = ROW_DELIMITER.join(first_row)
    import_file.cached_first_row = first_row or ''

    import_file.save()
    # Reset our file pointer for mapping.
    parser.seek_to_beginning()

def cache_first_rows(import_file, parser):
    """Cache headers, and rows 2-6 for validation/viewing.

    :param import_file: ImportFile inst.
    :param parser: unicode-csv.Reader instance.

    Unfortunately, this is duplicated logic from data_importer,
    but since data_importer makes many faulty assumptions we
    need to do it differently.

    """
    parser.seek_to_beginning()
    rows = parser.next()

    validation_rows = []
    for i in range(5):
        try:
            row = rows.next()
            if row:
                validation_rows.append(row)
        except StopIteration:
            # Fewer than 5 rows in the file.
            break

    # Use the first row of headers, which the parser has already cleaned.
    first_row = parser.headers()

    tmp = []
    for r in validation_rows:
        tmp.append(ROW_DELIMITER.join([str(r[x]) for x in first_row]))
    import_file.cached_second_to_fifth_row = "\n".join(tmp)

    if first_row:
        first_row = ROW_DELIMITER.join(first_row)
    import_file.cached_first_row = first_row or ''

    import_file.save()
    # Reset our file pointer for mapping.
    parser.seek_to_beginning()

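# The cached values written above are flat, delimiter-joined strings. A sketch
# of how a consumer could unpack them for display; it assumes the same
# ROW_DELIMITER constant that was used when the cache was written:
def _unpack_cached_rows_sketch(import_file, row_delimiter):
    """Return (header_list, row_value_lists) recovered from the cached strings."""
    headers = import_file.cached_first_row.split(row_delimiter)
    rows = [
        line.split(row_delimiter)
        for line in import_file.cached_second_to_fifth_row.splitlines()
        if line
    ]
    return headers, rows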