forked from nicolewhite/neo4j-fleets
/
clean.py
52 lines (41 loc) · 1.65 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from csvthings import import_csv_to_list
from csvthings import export_list_to_csv
# Load the raw airline and aircraft CSVs with identical reader options.
# NOTE(review): astuple=True presumably yields each row as a tuple -- confirm
# in csvthings.
airlines, aircraft = (
    import_csv_to_list(raw_path, headers=True, astuple=True)
    for raw_path in ('airlines_raw.csv', 'aircraft_raw.csv')
)
# Remove duplicate rows.
def remove_duplicates(data):
    """Return a list of the distinct rows in *data*, keeping first-seen order.

    Rows must be hashable (e.g. tuples). The previous ``set()`` round-trip
    dropped duplicates but left the surviving rows in arbitrary order, so the
    exported files could differ from run to run. ``dict.fromkeys`` removes
    duplicates while keeping each row at the position of its first
    occurrence (dicts preserve insertion order on Python 3.7+).

    Args:
        data: Iterable of hashable rows.

    Returns:
        A new list containing each distinct row exactly once.
    """
    return list(dict.fromkeys(data))
# De-duplicate both tables (airlines first, then aircraft).
airlines, aircraft = remove_duplicates(airlines), remove_duplicates(aircraft)
# Remove Regionnair observations. Not unique by name.
def remove_regionnair(data):
    """Return the rows of *data* that do not contain the value 'Regionnair'.

    The test is ``'Regionnair' not in row``. For tuple or list rows that means
    no field is exactly equal to 'Regionnair'; a field that merely contains
    the word as a substring does not cause the row to be dropped.

    Args:
        data: Iterable of rows.

    Returns:
        A new list with the matching rows removed; *data* is not modified.
    """
    return [row for row in data if 'Regionnair' not in row]
# Drop the Regionnair rows from both tables (airlines first, then aircraft).
airlines, aircraft = map(remove_regionnair, (airlines, aircraft))
# Convert to list of lists from list of tuples for validate_status function.
def convert_to_list(data):
    """Return a new list in which every row of *data* is copied into a list.

    Gives each row a mutable ``list`` form so that individual fields can be
    reassigned afterwards.
    """
    return list(map(list, data))
# Turn every row of both tables into a mutable list.
airlines, aircraft = (
    convert_to_list(table) for table in (airlines, aircraft)
)
# Fix bad statuses in aircraft.csv.
def validate_status(data):
    """Overwrite unrecognised status values with "Unknown", in place.

    The status is read from index 8 of each row, which is the position of the
    "status" header in the aircraft export. Any value that is not one of the
    recognised statuses (exact, case-sensitive match) is replaced.

    Args:
        data: Iterable of mutable rows (lists) with at least nine fields.

    Returns:
        The same *data* object, after its rows have been updated in place.

    Raises:
        IndexError: If a row has fewer than nine fields.
    """
    status_column = 8
    valid_status = ("Active", "Scrapped", "Written off", "Stored", "On order")
    for row in data:
        if row[status_column] not in valid_status:
            row[status_column] = "Unknown"
    return data
# Normalise the aircraft status column before writing anything out.
aircraft = validate_status(aircraft)

# Export cleaned data to CSV, airlines first, with explicit column headers.
export_list_to_csv(
    'airlines_clean.csv',
    airlines,
    headers=["airline", "country", "status"],
)
export_list_to_csv(
    'aircraft_clean.csv',
    aircraft,
    headers=[
        "msn",
        "model",
        "series",
        "airline",
        "ff_day",
        "ff_month",
        "ff_year",
        "registration",
        "status",
    ],
)
# Remove raw datasets (currently disabled; re-enabling these lines also requires `import os`).
# os.remove('airlines_raw.csv')
# os.remove('aircraft_raw.csv')