def handle(self, *args, **options): data = os.path.join(settings.BASE_DIR, "data") files = list(all_files(data, "*.csv")) for filepath in files: log("Opening file {}\n".format(filepath), "cyan") log(" Loading data ...\n") copy = CopyMapping( Incident, filepath, dict( date="IncidentDate", year="Year", raw_location="Location", address="Edited Street Address", city_and_state="City and State", full_address="Edited Full Address", latitude="Latitude", longitude="Longitude", accuracy_score="Accuracy Score", accuracy_type="Accuracy Type", number="Number", street="Street", city="City", state="State", county="County", zipcode="Zip", ), ) copy.save() log(" Data loaded!\n", "green")
def handle(self, *args, **kwargs):
    ccpath = kwargs['crimeCatPath']
    tppath = kwargs['targetPlacePath']

    # ccpath = '.../ctype2cc_170627.csv'
    print('loadAncil: crimeCat from %s...' % (ccpath))
    c1 = CopyMapping(
        CrimeCat,
        ccpath,
        # And a dict mapping the model fields to CSV headers
        # CSV header = CType, CC
        dict(ctypeDesc='CType', crimeCat='CC'))
    c1.save()
    print('loadAncil: NCrimeCat=%d' % (CrimeCat.objects.all().count()))

    # tppath = '.../targetPlace.csv'
    print('loadAncil: targetPlace from %s ...' % (tppath))
    c2 = CopyMapping(
        TargetPlace,
        tppath,
        # And a dict mapping the model fields to CSV headers
        # CSV header = placeType, ylat, xlng, name, desc
        dict(placeType='placeType', ylat='ylat', xlng='xlng',
             name='name', desc='desc'))
    c2.save()
    print('loadAncil: NTargetPlace=%d' % (TargetPlace.objects.all().count()))
def test_limited_save(self):
    c = CopyMapping(LimitedMockObject,
                    self.name_path,
                    dict(name='NAME', dt='DATE'))
    c.save()
    self.assertEqual(LimitedMockObject.objects.count(), 3)
    self.assertEqual(
        LimitedMockObject.objects.get(name='BEN').dt, date(2012, 1, 1))
def insertData(apps, schema_editor):
    lang = Lang(code='es', name='Español')
    lang.save()

    curdir = os.path.dirname(os.path.realpath(__file__))
    path = os.path.normpath(os.path.join(curdir, os.pardir, os.pardir,
                                         'data', 'mercado_continuo.csv'))
    c = CopyMapping(
        Symbol,
        path,
        dict(name='nombre', ticker='ticker'),
        static_mapping={'market': 'Mercado Continuo', 'type': 'stock'}
    )
    c.save()

    Symbol(ticker='IBEX35', name='IBEX 35', market='Mercado Continuo',
           type=Symbol.INDEX).save()
    Symbol(ticker='IBEXTR', name='IBEX Total Return', market='Mercado Continuo',
           type=Symbol.INDEX).save()

    invertia_quote_tpl = 'https://www.invertia.com/es/mercados/bolsa/empresas/historico?p_p_id=cotizacioneshistoricas_WAR_ivfrontmarketsportlet&p_p_lifecycle=2&p_p_state=normal&p_p_mode=view&p_p_resource_id=exportExcel&p_p_cacheability=cacheLevelPage&p_p_col_id=column-1&p_p_col_pos=1&p_p_col_count=2&_cotizacioneshistoricas_WAR_ivfrontmarketsportlet_startDate={{startDate}}&_cotizacioneshistoricas_WAR_ivfrontmarketsportlet_endDate={{endDate}}&_cotizacioneshistoricas_WAR_ivfrontmarketsportlet_idtel={invertia_key}'
    invertia_dividend_tpl = 'https://www.invertia.com/es/mercados/bolsa/empresas/dividendos/-/empresa/{empresa}/{invertia_key}'

    path = os.path.normpath(os.path.join(curdir, os.pardir, os.pardir,
                                         'data', 'invertia_claves.csv'))
    invertia_keys = csv.reader(open(path))
    next(invertia_keys, None)  # skip the headers
    for row in invertia_keys:
        quote_url = invertia_quote_tpl.format(invertia_key=row[2])
        dividend_url = invertia_dividend_tpl.format(empresa=row[1],
                                                    invertia_key=row[2])
        try:
            symbol_id = Symbol.objects.get(ticker=row[0]).id
            SymbolSource(name='invertia', symbol_id=symbol_id, key=quote_url,
                         type=SymbolSource.QUOTE).save()
            SymbolSource(name='invertia', symbol_id=symbol_id, key=dividend_url,
                         type=SymbolSource.DIVIDEND).save()
        except Symbol.DoesNotExist:
            print('Symbol with ticker', row[0], "(%s)" % row[1],
                  'not found. Continuing...')
            continue

    mcs = Symbol.objects.filter(market='Mercado Continuo').exclude(
        ticker__in=('IBEX35', 'IBEXTR'))
    for symbol in mcs:
        SymbolSource(name='quantmod', symbol_id=symbol.id,
                     key='%s.mc' % symbol.ticker,
                     type=SymbolSource.SPLIT).save()
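# insertData above has the (apps, schema_editor) signature of a Django data
# migration. A sketch of how such a function is typically wired in; the app
# label and dependency are hypothetical:
from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [('symbols', '0001_initial')]
    operations = [migrations.RunPython(insertData)]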
def test_field_value_mapping_and_types(self):
    c = CopyMapping(
        BasicMockObject,
        self.mapping_path,
        dict(name='NAME', number='NUMBER', dt='DATE'),
        field_value_mapping={
            'name': {
                'ben': 'Master Ben',
                'joe': 'Padawan Joe',
                'jane': 'Jedi Jane'
            },
            'number': {
                'seven': 7,
                'three': 3,
                'five': 5
            }
        },
        field_copy_types={'number': 'text'}
    )
    c.save()
    self.assertEqual(BasicMockObject.objects.count(), 3)
    self.assertEqual(
        list(BasicMockObject.objects.order_by('name')
             .values_list('name', 'number')),
        [('Jedi Jane', 5), ('Master Ben', 7), ('Padawan Joe', 3)]
    )
    self.assertEqual(
        BasicMockObject.objects.get(name='Master Ben').dt,
        date(2012, 1, 1)
    )
def test_foreignkey_mapping(self):
    c = CopyMapping(
        BasicMockObject,
        self.name_path,
        dict(name='NAME', number='NUMBER', dt='DATE')
    )
    c.save()
    self.assertEqual(BasicMockObject.objects.count(), 3)
    c = CopyMapping(
        BasicMockObject,
        self.fk_mapping_path,
        dict(name='NAME', number='NUMBER', dt='DATE', parent='PARENT'),
        field_value_mapping={
            'parent': {
                'ben': BasicMockObject.objects.get(name='ben').pk,
                'joe': BasicMockObject.objects.get(name='joe').pk,
                'jane': BasicMockObject.objects.get(name='jane').pk
            },
        },
        field_copy_types={'parent': 'text'}
    )
    c.save()
    self.assertEqual(BasicMockObject.objects.count(), 6)
    self.assertEqual(
        list(BasicMockObject.objects.order_by('name')
             .values_list('name', 'parent__name')),
        [('ben', None), ('ben junior', 'ben'),
         ('jane', None), ('jane junior', 'jane'),
         ('joe', None), ('joe junior', 'joe')]
    )
def test_static_mapping_ignore_non_mapped_headers(self):
    c = CopyMapping(
        BasicMockObject,
        self.name_path,
        dict(name='NAME', number='NUMBER', dt='DATE'),
    )
    c.save()
    self.assertEqual(BasicMockObject.objects.count(), 3)
    c = CopyMapping(
        BasicMockObject,
        self.fk_mapping_path,
        dict(name='NAME', number='NUMBER', dt='DATE'),
        static_mapping={
            'parent': BasicMockObject.objects.get(name='ben').pk
        },
        ignore_non_mapped_headers=True
    )
    c.save()
    self.assertEqual(BasicMockObject.objects.count(), 6)
    self.assertEqual(
        list(BasicMockObject.objects.order_by('name')
             .values_list('name', 'parent__name')),
        [('ben', None), ('ben junior', 'ben'),
         ('jane', None), ('jane junior', 'ben'),
         ('joe', None), ('joe junior', 'ben')]
    )
def test_bad_static_values(self):
    with self.assertRaises(ValueError):
        c = CopyMapping(ExtendedMockObject,
                        self.name_path,
                        dict(name='NAME', number='NUMBER', dt='DATE'),
                        encoding='UTF-8',
                        static_mapping=dict(static_bad=1))
        c.save()
def test_encoding_save(self):
    c = CopyMapping(MockObject,
                    self.null_path,
                    dict(name='NAME', number='NUMBER', dt='DATE'),
                    encoding='UTF-8')
    c.save()
    self.assertEqual(MockObject.objects.count(), 5)
    self.assertEqual(MockObject.objects.get(name='BADBOY').number, None)
    self.assertEqual(
        MockObject.objects.get(name='BEN').dt, date(2012, 1, 1))
def handle(self, *args, **kwargs):
    c = CopyMapping(
        # Give it the model
        Forum,
        # The path to your CSV
        args[1],
        # And a dict mapping the model fields to CSV headers
        dict(article_title='article_title'))
    # Then save it.
    c.save()
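# The snippet above reads the CSV path from *args, which relies on Django's
# pre-1.8 positional-argument handling. A sketch of the modern equivalent;
# the argument name is illustrative:
from django.core.management.base import BaseCommand


class Command(BaseCommand):
    def add_arguments(self, parser):
        parser.add_argument('csv_path')

    def handle(self, *args, **options):
        c = CopyMapping(Forum, options['csv_path'],
                        dict(article_title='article_title'))
        c.save()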
def test_save_foreign_key(self):
    c = CopyMapping(
        MockObject,
        self.foreign_path,
        dict(name='NAME', number='NUMBER', dt='DATE', parent='PARENT'))
    c.save()
    self.assertEqual(MockObject.objects.count(), 3)
    self.assertEqual(MockObject.objects.get(name='BEN').parent_id, 4)
    self.assertEqual(
        MockObject.objects.get(name='BEN').dt, date(2012, 1, 1))
def test_silent_save(self):
    c = CopyMapping(
        MockObject,
        self.name_path,
        dict(name='NAME', number='NUMBER', dt='DATE'),
    )
    c.save(silent=True)
    self.assertEqual(MockObject.objects.count(), 3)
    self.assertEqual(MockObject.objects.get(name='BEN').number, 1)
    self.assertEqual(
        MockObject.objects.get(name='BEN').dt, date(2012, 1, 1))
def test_static_values(self):
    c = CopyMapping(ExtendedMockObject,
                    self.name_path,
                    dict(name='NAME', number='NUMBER', dt='DATE'),
                    static_mapping=dict(static_val=1, static_string='test'))
    c.save()
    self.assertEqual(
        ExtendedMockObject.objects.filter(static_val=1).count(), 3)
    self.assertEqual(
        ExtendedMockObject.objects.filter(static_string='test').count(), 3)
def handle(self, *args, **kwargs):
    c = CopyMapping(
        # Give it the model
        Person,
        # The path to your CSV
        '/path/to/my/data.csv',
        # And a dict mapping the model fields to CSV headers
        dict(name='NAME', number='NUMBER', dt='DATE')
    )
    # Then save it.
    c.save()
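# A minimal sketch of the kind of model the mapping above assumes; the field
# types are guesses from the CSV headers, not taken from an original project:
from django.db import models


class Person(models.Model):
    name = models.CharField(max_length=500)
    number = models.IntegerField(null=True)
    dt = models.DateField(null=True)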
def test_pipe_save(self):
    c = CopyMapping(
        MockObject,
        self.pipe_path,
        dict(name='NAME', number='NUMBER', dt='DATE'),
        delimiter="|",
    )
    c.save()
    self.assertEqual(MockObject.objects.count(), 3)
    self.assertEqual(MockObject.objects.get(name='BEN').number, 1)
    self.assertEqual(
        MockObject.objects.get(name='BEN').dt, date(2012, 1, 1))
def handle(self, *args, **options):
    SalaryRecord.objects.all().delete()
    clean_file_path = os.path.join(settings.DATA_DIR, 'berkeley_faculty.csv')
    fields = [
        field.name for field in SalaryRecord._meta.get_fields()
        if field.name != 'id'
    ]
    mapping = {field: field for field in fields}
    c = CopyMapping(SalaryRecord, clean_file_path, mapping)
    c.save()
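# The identity-mapping pattern above generalizes to any model whose CSV
# headers match its field names. A hedged helper sketch (the function is
# ours, not part of django-postgres-copy); it skips the auto primary key,
# which the COPY should not try to fill:
def identity_mapping(model):
    return {f.name: f.name for f in model._meta.fields if f.name != 'id'}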
def test_field_override_save(self):
    c = CopyMapping(
        MockObject,
        self.null_path,
        dict(name='NAME', number='NUMBER', dt='DATE'),
    )
    c.save()
    self.assertEqual(MockObject.objects.count(), 5)
    self.assertEqual(MockObject.objects.get(name='BADBOY').number, None)
    self.assertEqual(
        MockObject.objects.get(name='BEN').dt, date(2012, 1, 1))
def test_overload_save(self):
    c = CopyMapping(
        OverloadMockObject,
        self.name_path,
        dict(name='NAME', number='NUMBER', dt='DATE'),
        overloaded_mapping=dict(lower_name='NAME')
    )
    c.save()
    self.assertEqual(OverloadMockObject.objects.count(), 3)
    self.assertEqual(OverloadMockObject.objects.get(name='BEN').number, 1)
    self.assertEqual(OverloadMockObject.objects.get(lower_name='ben').number, 1)
    self.assertEqual(
        OverloadMockObject.objects.get(name='BEN').dt, date(2012, 1, 1))
    omo = OverloadMockObject.objects.first()
    self.assertEqual(omo.name.lower(), omo.lower_name)
def load_postgresql(self): """ Load the file into a PostgreSQL database using COPY """ # Drop all the records from the target model's real table self.cursor.execute('TRUNCATE TABLE "%s" CASCADE' % (self.model._meta.db_table)) c = CopyMapping( self.model, self.csv, dict((f.name, f.db_column) for f in self.model._meta.fields), using=self.database ) c.save(silent=True) # Print out the results if self.verbosity > 2: csv_count = self.get_row_count() model_count = self.model.objects.count() self.finish_load_message(model_count, csv_count)
def load_postgresql(self, model, csv_path): """ Takes a model and a csv_path and loads it into postgresql """ # Drop all the records from the target model's real table self.cursor.execute('TRUNCATE TABLE "%s" CASCADE' % (model._meta.db_table)) c = CopyMapping( model, csv_path, dict((f.name, f.db_column) for f in model._meta.fields), ) c.save(silent=True) # Print out the results csv_count = self.get_row_count(csv_path) model_count = model.objects.count() self.finish_load_message(model_count, csv_count)
def load_postgresql(self, model, csv_path): """ Takes a model and a csv_path and loads it into postgresql """ # Drop all the records from the target model's real table self.cursor.execute('TRUNCATE TABLE "%s" CASCADE' % ( model._meta.db_table )) c = CopyMapping( model, csv_path, dict((f.name, f.db_column) for f in model._meta.fields), ) c.save(silent=True) # Print out the results csv_count = self.get_row_count(csv_path) model_count = model.objects.count() self.finish_load_message(model_count, csv_count)
def test_overload_save(self):
    c = CopyMapping(
        OverloadMockObject,
        self.name_path,
        dict(name='NAME', lower_name='NAME', upper_name='NAME',
             number='NUMBER', dt='DATE'),
    )
    c.save()
    self.assertEqual(OverloadMockObject.objects.count(), 3)
    self.assertEqual(OverloadMockObject.objects.get(name='ben').number, 1)
    self.assertEqual(
        OverloadMockObject.objects.get(lower_name='ben').number, 1)
    self.assertEqual(
        OverloadMockObject.objects.get(upper_name='BEN').number, 1)
    self.assertEqual(
        OverloadMockObject.objects.get(name='ben').dt, date(2012, 1, 1))
    omo = OverloadMockObject.objects.first()
    self.assertEqual(omo.name.lower(), omo.lower_name)
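# The test above maps the same NAME column onto several fields. In
# django-postgres-copy this works because the model can supply a per-field
# SQL transform. A hedged sketch of that hook (the method naming follows the
# library's copy_<field>_template convention; field sizes are guesses):
from django.db import models


class OverloadMockObject(models.Model):
    name = models.CharField(max_length=500)
    lower_name = models.CharField(max_length=500)
    upper_name = models.CharField(max_length=500)

    def copy_lower_name_template(self):
        return 'lower("%(name)s")'

    def copy_upper_name_template(self):
        return 'upper("%(name)s")'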
def load_postgresql(self): """ Load the file into a PostgreSQL database using COPY """ # Drop all the records from the target model's real table self.cursor.execute('TRUNCATE TABLE "%s" CASCADE' % (self.model._meta.db_table)) c = CopyMapping( self.model, self.csv, dict((f.name, f.db_column) for f in self.model._meta.fields), using=self.database, ) c.save(silent=True) # Print out the results if self.verbosity > 2: csv_count = self.get_row_count() model_count = self.model.objects.count() self.finish_load_message(model_count, csv_count)
def import_csv(self, request):
    if 'csv' not in request.FILES:
        return JsonResponse(
            status=status.HTTP_400_BAD_REQUEST,
            data={
                'success': False,
                'errors': ["Missing 'csv' parameter"]
            }
        )
    csv = request.FILES['csv']
    file_path = "/tmp/{0}.csv".format(str(uuid.uuid4()))
    with open(file_path, 'wb+') as destination:
        for chunk in csv.chunks():
            destination.write(chunk)
    try:
        copy_mapping = CopyMapping(
            models.Concept,
            file_path,
            ConceptViewSet.CSV_COLUMN_MAPPING
        )
        copy_mapping.save()
    except (ValueError, DatabaseError):
        expected_columns = ', '.join(ConceptViewSet.CSV_COLUMN_MAPPING.keys())
        return JsonResponse(
            status=status.HTTP_400_BAD_REQUEST,
            data={
                'success': False,
                'errors': ['CSV import error. Expected columns are {0}'.format(
                    expected_columns)]
            }
        )
    finally:
        os.remove(file_path)
    return JsonResponse({
        'success': True,
    })
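# CSV_COLUMN_MAPPING above is a viewset-level constant mapping Concept model
# fields to the CSV headers the upload must contain. A hypothetical sketch of
# its shape (keys and headers are illustrative, not from the original project):
CSV_COLUMN_MAPPING = {
    'name': 'name',
    'definition': 'definition',
}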
def insertRecords(request):
    # Save the data into the Django model
    c = CopyMapping(
        # Give it the model
        Product,
        # The path to your CSV
        'file.csv',
        # And a dict mapping the model fields to CSV headers
        dict(category='category', name='name', image='image', price='price')
    )
    # Then save it.
    c.save()

    # Save the records into the PostgreSQL database directly
    conn = psycopg2.connect("dbname='testdb' user='******' host='localhost' password='******'")
    cur = conn.cursor()
    filesource = 'file.csv'
    reader = csv.reader(open(filesource, 'r'))
    next(reader, None)  # skip the header row
    for row in reader:
        cur.execute(
            "INSERT INTO test_table (category, image, price, name) "
            "VALUES (%s, %s, %s, %s)",
            (row[0], row[1], row[2], row[3]))
    conn.commit()
    conn.close()

    # Used for accessing data from the PostgreSQL database
    conn = psycopg2.connect("dbname='testdb' user='******' host='localhost' password='******'")
    cur = conn.cursor()
    cur.execute("select * from test_table")
    results = []
    rows = cur.fetchall()
    for row in rows:
        results.append(row)
    return render(request, 'insert.html', {})
def handle(self, *args, **options):
    if options['ticker']:
        symbols = [Symbol.objects.get(ticker=options['ticker'])]
    else:
        symbols = Symbol.objects.all().order_by('ticker')

    curdir = os.path.dirname(os.path.realpath(__file__))
    for symbol in symbols:
        path = os.path.normpath(os.path.join(
            curdir, os.pardir, os.pardir, os.pardir,
            'data', 'ohlcv', '%s.csv' % symbol.ticker))
        print('Importing %s' % path)
        try:
            c = CopyMapping(
                # Give it the model
                SymbolQuote,
                # The path to your CSV
                path,
                # And a dict mapping the model fields to CSV headers
                dict(date='date', open='open', close='close',
                     high='high', low='low', volume='volume'),
                static_mapping={'symbol_id': symbol.id}
            )
            c.save()
            self.stdout.write(self.style.SUCCESS(
                'Successfully imported OHLCV rows for %s' % symbol.name))
        except ValueError:
            self.stdout.write(self.style.ERROR(
                'Could not find CSV file for %s in %s' % (symbol.name, path)))
def handle(self, *args, **options): """ For each of the filing types we support, loops through the downloaded CSVs of that type, using the CopyMapping library to dump the records into the raw database, preserving all fields from the original data. """ self.header("Load raw FEC filings") name_to_model = { "F3X": RawF3XFiling, "F3P": RawF3PFiling, "F24": RawF24Filing, "sa": RawContribution, "se": RawIndependentExpenditure, } if options['flush']: self.log('Flushing the DB') for model in name_to_model.values(): model.objects.all().delete() # Characters to strip from filenames to get filing number ignore = ascii_letters + punctuation loaded = 0 for name, model in name_to_model.items(): data_dir = os.path.join(self.DATA_DIR, name) # If, for some reason, we have no filings of this type if not os.path.exists(data_dir): continue # We're making a mapping of CSV to model fields for CopyMapping # In this case, they're identical. mapping = { field: field for field in model._meta.get_all_field_names() } for file_name in os.listdir(data_dir): # Hidden files will mess with us if not file_name.startswith('.'): # Remove our list of ignore characters from the file name # to give us just the filing number. filing_no = int(file_name.translate(None, ignore)) if model.objects.filter(filing_no=filing_no).exists(): self.log( '{} CSV for filing {} is already in the DB'.format( name, filing_no)) else: self.log('Loading {} CSV for filing {}'.format( name, filing_no)) try: c = CopyMapping(model, os.path.join(data_dir, file_name), mapping) c.save(silent=True) loaded += 1 except: self.failure( 'Error on {} CSV for filing {}'.format( name, filing_no)) self.success("Loaded {} CSVs".format(loaded))
def handle(self, *args, **options):
    d = datetime.datetime
    start_time = d.now()
    self.stdout.write('\n')
    initial_message = ('About to loop through all files saved in %s directory '
                       'and update Result Stage table for %s. \n' % (
                           save_date_string, election_date_string))
    self.stdout.write(initial_message)
    for file in os.listdir(file_directory):
        full_file_path = file_directory + '/' + file
        delete_all_resultstage_objects(self)
        start_message = ('Started loading file at ' + str(start_time) +
                         ' for ' + election_date_string)
        c = CopyMapping(
            # Give it the model
            ResultStage,
            # The path to your CSV
            full_file_path,
            # And a dict mapping the model fields to CSV headers
            dict(id='id',
                 unique_id='unique_id',
                 raceid='raceid',
                 racetype='racetype',
                 racetypeid='racetypeid',
                 ballotorder='ballotorder',
                 candidateid='candidateid',
                 description='description',
                 electiondate='electiondate',
                 fipscode='fipscode',
                 first='first',
                 incumbent='incumbent',
                 initialization_data='initialization_data',
                 is_ballot_measure='is_ballot_measure',
                 last='last',
                 lastupdated='lastupdated',
                 level='level',
                 national='national',
                 officeid='officeid',
                 officename='officename',
                 party='party',
                 polid='polid',
                 polnum='polnum',
                 precinctsreporting='precinctsreporting',
                 precinctsreportingpct='precinctsreportingpct',
                 precinctstotal='precinctstotal',
                 reportingunitid='reportingunitid',
                 reportingunitname='reportingunitname',
                 runoff='runoff',
                 seatname='seatname',
                 seatnum='seatnum',
                 statename='statename',
                 statepostal='statepostal',
                 test='test',
                 uncontested='uncontested',
                 votecount='votecount',
                 votepct='votepct',
                 winner='winner'))
        loading_message = 'Loading %s.' % (full_file_path)
        self.stdout.write(loading_message)
        self.stdout.write('\n')
        c.save()
        self.stdout.write('Saved')
        end_time = d.now()
        import_length = str(end_time - start_time)
        end_message = 'Finished copying snapshot at ' + str(end_time)
        self.stdout.write('\n')
        self.stdout.write(end_message)
        self.stdout.write('\n')
        self.stdout.write('Loading this snapshot took ' + import_length)
        self.stdout.write('Sleeping for %s' % sleep_time)
        time.sleep(sleep_time)
        self.stdout.write('--------------------------------------')
def import_report(self, instance, report_id):
    report_base = os.path.join(instance.report_dir, report_id)
    logging.info("[%s] Processing %s", instance.id, report_base)

    # First load the metadata
    with open(report_base + '.json', 'r') as handle:
        meta = json.load(handle)

    try:
        validate(meta)
    except Exception as e:
        logging.exception(e)
        return False

    # Next we'll extract files.
    extracted_files = []
    data_map = {}
    with tarfile.open(report_base + '.tar.gz', 'r') as tar:
        for member in tar:
            guessed = self.fix_name(member)
            if guessed == 'unknown':
                continue
            # fancy safe name.
            tmpname = uuid.uuid4().hex + '.tsv'
            extracted_to = os.path.join(TMPDIR, tmpname)
            logging.info("[%s] Extracting %s to %s, guessed:%s",
                         instance.id, member.name, extracted_to, guessed)
            # Record where the 'params' file is or the 'metrics' file.
            data_map[guessed] = extracted_to
            # Change the archive member's name in order to ensure that it
            # is extracted to somewhere with a safe name.
            member.name = tmpname
            # Extract into TMPDIR.
            tar.extract(member, TMPDIR)
            # Track where we put it for cleanup later.
            extracted_files.append(extracted_to)

    if 'jobs' in data_map:
        c = CopyMapping(
            Job,
            data_map['jobs'],
            dict(external_job_id='id',
                 tool_id='tool_id',
                 tool_version='tool_version',
                 state='state',
                 create_time='create_time'),
            quote_character="\b",
            static_mapping={
                'instance_id': instance.id,
            },
            delimiter='\t'
        )
        c.save()

    if 'params' in data_map:
        c = CopyMapping(
            JobParam,
            data_map['params'],
            dict(external_job_id='job_id', name='name', value='value'),
            quote_character="\b",
            static_mapping={
                'instance_id': instance.id,
            },
            delimiter='\t'
        )
        c.save()

    if 'metric_num' in data_map:
        c = CopyMapping(
            MetricNumeric,
            data_map['metric_num'],
            dict(external_job_id='job_id', plugin='plugin',
                 name='name', value='value'),
            quote_character="\b",
            static_mapping={
                'instance_id': instance.id,
            },
            delimiter='\t'
        )
        c.save()

    if 'datasets' in data_map:
        c = CopyMapping(
            Dataset,
            data_map['datasets'],
            dict(
                external_job_id='job_id',
                external_dataset_id='dataset_id',
                extension='extension',
                file_size='file_size',
                param_name='param_name',
                file_type='type',
            ),
            quote_character="\b",
            static_mapping={
                'instance_id': instance.id,
            },
            delimiter='\t'
        )
        c.save()

    for f in extracted_files:
        try:
            logging.info("[%s] Cleaning up %s", instance.id, f)
            os.unlink(f)
        except Exception as e:
            logging.exception(e)

    if 'users' in meta:
        if 'active' in meta['users']:
            instance.users_recent = meta['users']['active']
        if 'total' in meta['users']:
            instance.users_total = meta['users']['total']
    if 'jobs' in meta:
        if 'ok' in meta['jobs']:
            instance.jobs_run = meta['jobs']['ok']
    instance.last_import = report_id
    instance.save()
    return True
def handle(self, *args, **kwargs):
    # Check if data source is from a directory or a url and assign
    # input_file to selected option
    if kwargs['typesource'] == 'web':
        # Write content from url into csv file
        url = str(kwargs['inputfile'])
        logger.info('URL: {0}'.format(url))
        req = requests.get(url)
        url_content = req.content
        csv_path = str(Path(kwargs['directory'] + '/' + kwargs['inputfile']))
        csv_file = open(csv_path, 'wb')
        csv_file.write(url_content)
        csv_file.close()
        input_file = csv_path
    elif kwargs['typesource'] == 'directory':
        input_file = Path(kwargs['inputfile'])
        logger.info('INPUT FILE: {0}'.format(input_file))
    else:
        logger.info('WARNING (lwf_csv_import.py) non-valid value entered for '
                    '"typesource": {0}'.format(kwargs['typesource']))
        return

    # Get the parent class, assumes parent class is in module within
    # lwf/models directory
    parent_name = kwargs['parentclass'].rsplit('.', 1)[-1]
    package = importlib.import_module("lwf.models." + parent_name)
    parent_class = getattr(package, parent_name)

    # Assign variables used to write csv_temporary
    csv_temporary = Path(kwargs['directory'] + '/' + kwargs['model'] +
                         '_temporary.csv')
    input_fields = parent_class.input_fields
    database_fields = [field.name for field in parent_class._meta.fields]
    date_format = parent_class.date_format
    model_class = None
    written_timestamps = []
    rows_before = 24
    rows_after = 0
    rows_buffer = []
    nead_header = []

    # Write data in input_file into csv_temporary with additional
    # computed fields
    try:
        with open(csv_temporary, 'w', newline='') as sink, \
                open(input_file, 'r') as source:
            sink.write(','.join(database_fields) + '\n')
            records_written = 0
            # Skip number of header lines designated in parent class
            # header line count
            for i in range(parent_class.header_line_count):
                first_lines = source.readline()
                nead_header.append(first_lines)
            next(source, None)
            while True:
                line = source.readline()
                if not line:
                    break
                line_array = [
                    v for v in line.strip().split(parent_class.delimiter)
                    if len(v) > 0
                ]

                # Skip header lines that start with designated parent class
                # header symbol. For example: the '#' character
                if line.startswith(parent_class.header_symbol):
                    nead_header.append(line)
                    continue

                if len(line_array) != len(input_fields):
                    error_msg = "Line has {0} values, header {1} columns ".format(
                        len(line_array), len(input_fields))
                    logger.error(error_msg)
                    raise ValueError(error_msg)

                row = {
                    input_fields[i]: line_array[i]
                    for i in range(len(line_array))
                }

                # Process row and add new computed fields.
                # Check which kind of cleaner should be applied
                if kwargs['parentclass'] == 'LWFMeteo':
                    line_clean = get_lwf_meteo_line_clean(row, date_format)
                elif kwargs['parentclass'] == 'LWFStation':
                    line_clean = get_lwf_station_line_clean(row, date_format)
                else:
                    logger.info('WARNING (lwf_csv_import.py) {0} parentclass '
                                'does not exist'.format(kwargs['parentclass']))
                    return

                # Get the model
                class_name = kwargs['model'].rsplit('.', 1)[-1]
                package = importlib.import_module("lwf.models")
                model_class = getattr(package, class_name)

                # Make timestamp_iso value a UTC timezone aware datetime object
                dt_obj = line_clean['timestamp_iso']
                aware_dt = make_aware(dt_obj)

                # Check if record with identical timestamp already exists in
                # table, otherwise write record to temporary csv file after
                # checking for record with duplicate timestamp
                try:
                    model_class.objects.get(timestamp_iso=aware_dt)
                except model_class.DoesNotExist:
                    if line_clean['timestamp_iso'] not in written_timestamps:
                        # keep timestamps length small
                        written_timestamps = written_timestamps[
                            (-1) * min(len(written_timestamps), 1000):]
                        written_timestamps += [line_clean['timestamp_iso']]

                        # slide the row buffer window
                        rows_buffer = rows_buffer[
                            (-1) * min(len(rows_buffer),
                                       rows_before + rows_after):
                        ] + [line_clean]

                        # check values before and after
                        if len(rows_buffer) > rows_after:
                            sink.write(','.join([
                                "{0}".format(v)
                                for v in rows_buffer[-(1 + rows_after)].values()
                            ]) + '\n')
                            records_written += 1

            # Write nead header configuration file if applicable
            if len(nead_header) > parent_class.header_line_count:
                header_symbol = parent_class.header_symbol
                write_nead_config('lwf', nead_header, kwargs['model'],
                                  kwargs['parentclass'], header_symbol)
    except FileNotFoundError as e:
        logger.info('WARNING (lwf_csv_import.py) file not found {0}, '
                    'exception {1}'.format(input_file, e))
        return

    if model_class is None:
        logger.info('WARNING (lwf_csv_import.py) no data found for {0}'.format(
            kwargs['model']))
        return

    # Assign copy_dictionary from database_fields
    copy_dictionary = {
        database_fields[i]: database_fields[i]
        for i in range(0, len(database_fields))
    }

    # Import processed and cleaned data into Postgres database
    c = CopyMapping(
        # Assign model
        model_class,
        # Temporary CSV with input data and computed fields
        csv_temporary,
        # Dictionary mapping the model fields to CSV fields
        copy_dictionary,
    )
    # Then save it.
    c.save()

    # Log import message
    logger.info('{0} successfully imported, {1} new record(s) written in '
                '{2}'.format(kwargs['inputfile'], records_written,
                             kwargs['model']))

    # Delete csv_temporary
    os.remove(csv_temporary)
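# A hedged example of invoking the importer above as a management command.
# The command name is inferred from the 'lwf_csv_import.py' log prefix and
# the option names from the kwargs it reads; the exact flag syntax may differ:
#   python manage.py lwf_csv_import --typesource directory \
#       --inputfile data/meteo.csv --directory /tmp/lwf \
#       --parentclass LWFMeteo --model lwf_meteo_site_a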
def handle(self, *args, **options): """ For each of the filing types we support, loops through the downloaded CSVs of that type, using the CopyMapping library to dump the records into the raw database, preserving all fields from the original data. """ self.header("Load raw FEC filings") name_to_model = { "F3X": RawF3XFiling, "F3P": RawF3PFiling, "F24": RawF24Filing, "sa": RawContribution, "se": RawIndependentExpenditure, } if options['flush']: self.log('Flushing the DB') for model in name_to_model.values(): model.objects.all().delete() # Characters to strip from filenames to get filing number ignore = ascii_letters + punctuation loaded = 0 for name, model in name_to_model.items(): data_dir = os.path.join(self.DATA_DIR, name) # If, for some reason, we have no filings of this type if not os.path.exists(data_dir): continue # We're making a mapping of CSV to model fields for CopyMapping # In this case, they're identical. mapping = { field: field for field in model._meta.get_all_field_names() } for file_name in os.listdir(data_dir): # Hidden files will mess with us if not file_name.startswith('.'): # Remove our list of ignore characters from the file name # to give us just the filing number. filing_no = int(file_name.translate(None, ignore)) if model.objects.filter(filing_no=filing_no).exists(): self.log( '{} CSV for filing {} is already in the DB'.format( name, filing_no ) ) else: self.log( 'Loading {} CSV for filing {}'.format( name, filing_no ) ) try: c = CopyMapping( model, os.path.join(data_dir, file_name), mapping ) c.save(silent=True) loaded += 1 except: self.failure('Error on {} CSV for filing {}'.format(name, filing_no)) self.success("Loaded {} CSVs".format(loaded))
def handle(self, *args, **kwargs):
    # Assign kwargs from url to variables
    inputfile = kwargs['inputfile']
    typesource = kwargs['typesource']
    directory = kwargs['directory']
    app = kwargs['app']
    model = kwargs['model']

    # Validate app
    if not apps.is_installed(app):
        logger.error(f' ERROR app {app} not found')
        return

    # Validate model
    try:
        model_class = get_model_cl(app, model)
    except AttributeError as e:
        logger.error(f' ERROR model {model} not found, exception {e}')
        return

    # Check if data source is from a directory or a url and assign
    # input_file to selected option
    if typesource == 'web':
        # Assign url to inputfile
        url = str(inputfile)
        # Check if sizelimit kwarg passed
        size_limit = kwargs['sizelimit']
        if size_limit:
            # Get size of file to download in megabytes
            info_url = requests.head(url)
            size_bytes_url = info_url.headers['Content-Length']
            size_mb_url = int(size_bytes_url) / (1024 * 1024)
            # If file to download is over limit then log error and
            # stop processing
            if size_mb_url > int(size_limit):
                logger.error(f' ERROR {url} is larger than maximum size '
                             f'allowed: {size_limit} MB')
                return
        # logger.info(f' Started importing input URL: {url}')
        req = requests.get(url)
        url_content = req.content
        input_file = Path(f'{directory}/{model}_downloaded.csv')
        csv_file = open(input_file, 'wb')
        csv_file.write(url_content)
        csv_file.close()
    elif typesource == 'directory':
        input_file = Path(inputfile)
        logger.info(f' Started importing input file: {input_file}')
    else:
        logger.error(f' ERROR non-valid value entered for "typesource": '
                     f'{typesource}')
        return

    # Get parent class name
    parent_class_name = model_class.__base__.__name__

    # Get line cleaner function
    try:
        line_cleaner = self.get_line_cleaner(parent_class_name)
    except Exception as e:
        logger.error(e)
        return

    # Assign other variables used to write csv_temporary
    csv_temporary = Path(f'{directory}/{model}_temporary.csv')
    input_fields = model_class.input_fields
    database_fields = [
        field.name for field in model_class._meta.fields
        if field.name != 'id'
    ]
    date_format = model_class.date_format
    written_timestamps = []
    rows_before = 24
    rows_after = 0
    rows_buffer = []
    nead_header = []

    # Write data in input_file into csv_temporary with additional
    # computed fields
    try:
        with open(csv_temporary, 'w', newline='') as sink, \
                open(input_file, 'r') as source:
            sink.write(','.join(database_fields) + '\n')
            records_written = 0
            # Skip number of header lines designated in parent class
            # header line count
            for i in range(model_class.header_line_count):
                first_lines = source.readline()
                nead_header.append(first_lines)
            next(source, None)
            while True:
                line = source.readline()
                if not line:
                    break
                line_array = [
                    v for v in line.strip().split(model_class.delimiter)
                    if len(v) > 0
                ]

                # Skip header lines that start with designated parent class
                # header symbol. For example: the '#' character
                if line.startswith(model_class.header_symbol):
                    nead_header.append(line)
                    continue

                if len(line_array) != len(input_fields):
                    error_msg = (f' ERROR: line has {len(line_array)} values, '
                                 f'header has {len(input_fields)} columns')
                    logger.error(error_msg)
                    raise ValueError(error_msg)

                row = {
                    input_fields[i]: line_array[i]
                    for i in range(len(line_array))
                }

                # Process row and add new computed fields
                line_clean = line_cleaner(row, date_format)

                # Make timestamp_iso value a UTC timezone aware datetime object
                dt_obj = line_clean['timestamp_iso']
                aware_dt = make_aware(dt_obj)

                # Check if record with identical timestamp already exists in
                # table, otherwise write record to temporary csv file after
                # checking for record with duplicate timestamp
                try:
                    model_class.objects.get(timestamp_iso=aware_dt)
                except model_class.DoesNotExist:
                    if line_clean['timestamp_iso'] not in written_timestamps:
                        # keep timestamps length small
                        written_timestamps = written_timestamps[
                            (-1) * min(len(written_timestamps), 1000):]
                        written_timestamps += [line_clean['timestamp_iso']]

                        # slide the row buffer window
                        rows_buffer = rows_buffer[
                            (-1) * min(len(rows_buffer),
                                       rows_before + rows_after):
                        ] + [line_clean]

                        # check values before and after
                        if len(rows_buffer) > rows_after:
                            sink.write(','.join([
                                "{0}".format(v)
                                for v in rows_buffer[-(1 + rows_after)].values()
                            ]) + '\n')
                            records_written += 1

            # Write nead header configuration file if applicable
            if nead_header:
                header_symbol = model_class.header_symbol
                write_nead_config(app, nead_header, model, parent_class_name,
                                  header_symbol)
    except FileNotFoundError as e:
        logger.error(f' ERROR file not found {input_file}, exception {e}')
        return

    # Assign copy_dictionary from database_fields
    copy_dictionary = {
        database_fields[i]: database_fields[i]
        for i in range(0, len(database_fields))
    }

    # Import processed and cleaned data into Postgres database
    c = CopyMapping(
        # Assign model
        model_class,
        # Temporary CSV with input data and computed fields
        csv_temporary,
        # Dictionary mapping the model fields to CSV fields
        copy_dictionary,
    )
    # Then save it.
    c.save()

    # Log import message
    logger.info(f' Finished import: {records_written} new records written '
                f'in {model}')

    # Delete csv_temporary
    os.remove(csv_temporary)

    # If file downloaded from web delete it
    if os.path.isfile(f'{directory}/{model}_downloaded.csv'):
        os.remove(f'{directory}/{model}_downloaded.csv')