import unicodedata newname = unicodedata.normalize('NFKD', weirdname).encode('ASCII', 'ignore').decode() return newname for y in range(len(olydata)): name = olydata[y]['Athlete'] weirdname = extract_usable_name(name) finalname = no_weird_characters(weirdname) olydata[y]['usable_name'] = finalname #namecaller={} #for x in range(len(olydata)): # call=olydata[x]['usable_name'] # namecaller[call]=olydata[x] from gender import detect_gender for x in range(len(olydata)): result = detect_gender(olydata[x]['usable_name']) ratio = result['ratio'] gender = result['gender'] olydata[x]['ratio'] = ratio olydata[x]['gender'] = gender olympicpath = join(datadirect, 'olympic_athletes_classified.json') writejson = open(olympicpath, 'w') json.dump(olydata, writejson, indent=2) print("Writing to olympic_athletes_classified.json file.")
from csv import DictReader, DictWriter

CLASSIFIED_DATA_HEADERS = [
    'firstname', 'lastname', 'born_year', 'died_year', 'categories',
    'years', 'gender', 'usable_name', 'ratio',
]


def extract_usable_name(namestr):
    """Return namestr unchanged, or '' when it contains a period.

    A period is taken as a sign that the value is not a plain first name
    (e.g. an initial or a title), so no usable name is returned for it.
    """
    if '.' not in namestr:
        return namestr
    return ''


# Open the non-gender classified data file
with open(WRANGLED_DATA_FILENAME) as r:
    datarows = list(DictReader(r))

# Set up the new data file. The with-block makes sure the rows are flushed and
# the handle is closed; newline='' is what the csv module expects for writers.
with open(CLASSIFIED_DATA_FILENAME, 'w', newline='') as w:
    dw = DictWriter(w, fieldnames=CLASSIFIED_DATA_HEADERS)
    dw.writeheader()

    # read each row
    ct = 0
    for row in datarows:
        usablename = extract_usable_name(row['firstname'])
        ct += 1
        print("Row:", ct, "extracting --", usablename, "-- from:", row['firstname'])
        gender_result = detect_gender(usablename)
        # now add usable_name and gender data to each row
        row['usable_name'] = usablename
        row['gender'] = gender_result['gender']
        row['ratio'] = gender_result['ratio']
        dw.writerow(row)
# NOTE(review): `f` is a file handle opened outside this excerpt; if this
# statement belongs to a with-block there, re-indent it accordingly.
datarows = list(DictReader(f))


def extract_usable_name(name_string):
    """Return the first word of the text after the last ', ' in name_string.

    For a "Last, First Middle" style value this yields "First"; a value with
    no ', ' yields its own first space-separated word.
    """
    given_names = name_string.split(', ')[-1]
    return given_names.split(' ')[0]


# The with-block makes sure every classified row is flushed and the handle is
# closed; newline='' is what the csv module expects for writers.
with open(CLASSIFIED_DATA_FILENAME, 'w', newline='') as wf:
    cwf = DictWriter(wf, fieldnames=CLASSIFIED_HEADERS)
    cwf.writeheader()

    linecount = 0
    for row in datarows:
        linecount += 1
        the_name = row['cited_name']
        usable_name = extract_usable_name(the_name)
        print(linecount, " -- Extracted", usable_name, "from", the_name)
        genderdict = detect_gender(usable_name)
        # now write the row...
        row['gender'] = genderdict['gender']
        row['ratio'] = genderdict['ratio']
        row['usable_name'] = usable_name
        cwf.writerow(row)
DATA_DIR = 'tempdata'
WRANGLED_DATA_FILENAME = join(DATA_DIR, 'wrangled_data.csv')
WRANGLED_HEADERS = [
    'Last Name', 'Firstish Name', 'Position Title', 'Department',
    'Employee Annual Salary', 'Gender', 'Ratio', 'Usable Name',
]
CLASSIFIED_DATA_FILENAME = join(DATA_DIR, 'classified_data.csv')


def extract_usable_name(firstishName):
    """Return the part of firstishName that precedes its first space."""
    return firstishName.split(' ')[0]


# Read every wrangled row up front. The with-block closes the input even if a
# later step (e.g. float() on a malformed salary) raises.
with open(WRANGLED_DATA_FILENAME, 'r') as wrangledFile:
    datarows = list(DictReader(wrangledFile))

# Classify each employee and convert the salary text to a number.
dicList = []
for employee in datarows:
    usable_name = extract_usable_name(employee['Firstish Name'])
    result = detect_gender(usable_name)
    employee['Gender'] = result['gender']
    employee['Ratio'] = result['ratio']
    employee['Usable Name'] = usable_name
    employee['Employee Annual Salary'] = float(employee['Employee Annual Salary'])
    dicList.append(employee)

# The output handle is deliberately left open: the rows are written by code
# that follows this excerpt. newline='' is what the csv module expects.
wfile = open(CLASSIFIED_DATA_FILENAME, 'w', newline='')
# turn it into a DictWriter object, and tell it what the fieldnames are
wcsv = DictWriter(wfile, fieldnames=WRANGLED_HEADERS)
# write the headers row
wcsv.writeheader()
from gender import detect_gender

DATA_DIR = 'tempdata'
WRANGLED_DIR = 'tempdata/wrangled'
WRANGLED_DATA_PATH = join(WRANGLED_DIR, 'wranglednames.csv')
CLASSIFIED_DATA_FILENAME = join(DATA_DIR, 'classified_data.csv')


def extractable_usable_name(name):
    """Return the part of name that precedes its first space."""
    return name.split(' ')[0]


classified_headers = [
    'year', 'name', 'description', 'usable_name', 'gender', 'ratio',
]

# Load all wrangled rows before any output is produced.
with open(WRANGLED_DATA_PATH) as r:
    datarows = list(DictReader(r))

# The with-block makes sure every classified row is flushed and the handle is
# closed; newline='' is what the csv module expects for writers.
with open(CLASSIFIED_DATA_FILENAME, 'w', newline='') as w:
    dw = DictWriter(w, fieldnames=classified_headers)
    dw.writeheader()

    ct = 0
    for row in datarows:
        usable_name = extractable_usable_name(row['name'])
        ct += 1
        print("Row:", ct, "extracting --", usable_name, "--from", row['name'])
        gender_result = detect_gender(usable_name)
        row['usable_name'] = usable_name
        row['gender'] = gender_result['gender']
        row['ratio'] = gender_result['ratio']
        dw.writerow(row)
result+=name[n] return result.strip(" ") #Actually goes through CSV file and reads it wrangled_file = open(DATA_PATH, 'r', encoding="latin1") my_reader = csv.reader(wrangled_file, skipinitialspace=True) next(my_reader, None) #Skips headers for line in my_reader: year = line[0] category = line[1] name = line[2] country = line[3] field = line[4] motivation = line[5] usable_name = extract_usable_name(name) gender = detect_gender(usable_name)['gender'] ratio = detect_gender(usable_name)['ratio'] namesdict = {'year': year, 'category': category, 'name': name, 'country': country, 'field': field, 'motivation': motivation, 'gender': gender, 'ratio': ratio,'usable_name': usable_name} all_names.append(namesdict) #Creates new file HEADERS = ['year', 'category', 'name' , 'country' , 'field', 'motivation', 'gender', 'ratio', 'usable_name'] wfile = open(CLASSIFIED_DATA_PATH, 'w') wcsv = csv.DictWriter(wfile, fieldnames=HEADERS) # write the headers row wcsv.writeheader() for entry in all_names:
from os.path import exists, join
from gender import extract_usable_name, detect_gender
import csv

DATA_DIR = 'tempdata'
SJ_DATA_FILENAME = join(DATA_DIR, 'wrangled_data.csv')
C_FILENAME = join(DATA_DIR, 'classified_data.csv')
C_HEADERS = [
    'name', 'gender', 'ratio', 'females', 'males', 'total', 'compensation',
]

# Read the wrangled rows; the with-block closes the input handle, which the
# bare open() call passed straight to DictReader never did.
with open(SJ_DATA_FILENAME) as rfile:
    sj_data = list(csv.DictReader(rfile))

# Classify each employee and attach their total compensation to the result.
classified_data = []
for person in sj_data:
    use_name = extract_usable_name(person['Employee Name'])
    result = detect_gender(use_name)
    result['compensation'] = float(person['Total Pay & Benefits'])
    classified_data.append(result)

# Write header plus all rows; the handle is closed even if a write fails.
# newline='' is what the csv module expects for writers.
with open(C_FILENAME, 'w', newline='') as wfile:
    wcsv = csv.DictWriter(wfile, fieldnames=C_HEADERS)
    wcsv.writeheader()
    wcsv.writerows(classified_data)
from os.path import exists, join
from gender import extract_usable_name, detect_gender
import csv

DATA_DIR = 'tempdata'
SJ_DATA_FILENAME = join(DATA_DIR, 'wrangled_data.csv')
C_FILENAME = join(DATA_DIR, 'classified_data.csv')
C_HEADERS = ['name', 'gender', 'ratio', 'females', 'males', 'total',
             'compensation']

# Load the wrangled salary rows. Using a with-block releases the input handle
# as soon as the rows are in memory instead of leaving it open.
with open(SJ_DATA_FILENAME) as infile:
    sj_data = list(csv.DictReader(infile))

# Each classified row is the detect_gender() result for the employee's usable
# name, extended with the employee's compensation as a float.
classified_data = []
for person in sj_data:
    use_name = extract_usable_name(person['Employee Name'])
    result = detect_gender(use_name)
    result['compensation'] = float(person['Total Pay & Benefits'])
    classified_data.append(result)

# The with-block guarantees the output is flushed and closed on any exit path;
# newline='' is what the csv module expects for writers.
with open(C_FILENAME, 'w', newline='') as wfile:
    wcsv = csv.DictWriter(wfile, fieldnames=C_HEADERS)
    wcsv.writeheader()
    for row in classified_data:
        wcsv.writerow(row)
filename = 'wrangled_data.csv'
foldername = os.path.join(directory, subdirectory)
pathname = os.path.join(foldername, filename)
wrangledHeaders = ['title', 'first_name', 'last_name', 'party', 'age',
                   'gender', 'ratio', 'usable_name']

print('Reading data from', pathname)
lineCount = 0
legislatorList = []
# The with-block closes the input once all lines are read.
with open(pathname, 'r') as openFile:
    for line in openFile:
        # Line 0 is skipped (presumably the header row -- confirm); every
        # later line is split on commas into its five columns.
        if lineCount > 0:
            legislatorInfo = line.strip().split(',')
            firstname = extract_usable_name(legislatorInfo[1])
            genderInfo = detect_gender(firstname)
            infoDict = {
                'title': legislatorInfo[0],
                'first_name': legislatorInfo[1],
                'last_name': legislatorInfo[2],
                'party': legislatorInfo[3],
                'age': legislatorInfo[4],
                'gender': genderInfo['gender'],
                'ratio': genderInfo['ratio'],
                'usable_name': firstname,
            }
            legislatorList.append(infoDict)
        lineCount += 1

newFileName = 'classified_data.csv'
newFilePath = os.path.join(foldername, newFileName)
# Left open on purpose: it is used by code that follows this excerpt.
csvFile = open(newFilePath, 'w')
import unicodedata newname=unicodedata.normalize('NFKD', weirdname).encode('ASCII', 'ignore').decode() return newname for y in range(len(olydata)): name=olydata[y]['Athlete'] weirdname=extract_usable_name(name) finalname=no_weird_characters(weirdname) olydata[y]['usable_name']=finalname #namecaller={} #for x in range(len(olydata)): # call=olydata[x]['usable_name'] # namecaller[call]=olydata[x] from gender import detect_gender for x in range(len(olydata)): result=detect_gender(olydata[x]['usable_name']) ratio=result['ratio'] gender=result['gender'] olydata[x]['ratio']=ratio olydata[x]['gender']=gender olympicpath=join(datadirect, 'olympic_athletes_classified.json') writejson=open(olympicpath, 'w') json.dump(olydata, writejson, indent=2) print("Writing to olympic_athletes_classified.json file.")
thefile = join(folder, 'presidents.txt')
with open(thefile, 'r') as infile:
    pres_dict = {}
    with open('data/classified.csv', 'w') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['name', 'term', 'university', 'gender'])
        # Each input line must hold exactly three comma-separated fields:
        # president name, term and university.
        for line in infile:
            temp = line.strip().split(',')
            pres, term, univ = temp
            univ = univ.strip()
            # The gender lookup uses the text before the first space.
            first_name = pres.split(' ')[0]
            res = detect_gender(first_name)
            # Output row = the three original fields plus the detected gender.
            temp.append(res['gender'])
            writer.writerow(temp)
# # This part is just to check if we have got all the names
# # in the file.
# # count = 0
# # for school in pres_dict:
# # print(school, 'has had', len(pres_dict[school]), 'presidents')
# # print('\n')
# # count += 1
# # print('Looked at', count, 'schools')
# schools_analyzed = {}
full_filename = join(DATA_DIR, fname)
# Read all rows up front; the with-block closes the input on any exit path.
with open(full_filename, 'r') as joe:
    First_rows = list(DictReader(joe))

# Output columns = the input columns plus the three classification fields.
classified_headers = list(First_rows[0]) + ['gender', 'ratio', 'usable_name']
classified_filename = join(CLASSIFIED_DIR, fname)
print("About to classify", len(First_rows), 'rows into the file:',
      classified_filename)

# The with-block guarantees the output is flushed and closed even if a row
# raises (e.g. a missing 'First' column).
with open(classified_filename, 'w') as outfile:
    output_csv = DictWriter(outfile, fieldnames=classified_headers)
    output_csv.writeheader()
    xc = 0
    for row in First_rows:
        xc += 1
        first_name = row['First']
        print("On row", xc, first_name)
        # Rows whose first name contains "N/A" are not classified.
        if "N/A" not in first_name:
            usablename = extract_usable_name(first_name)
            get = detect_gender(usablename)
            row['gender'] = get['gender']
            row['ratio'] = get['ratio']
            row['usable_name'] = usablename
            # NOTE(review): in the collapsed source this call directly follows
            # a bare '#', so it is unclear whether it was commented out and
            # how deeply it was nested. It is kept live here, since otherwise
            # only the header row would ever be written -- confirm.
            output_csv.writerow(row)
#Dan: I used California Colleges as a template. Thank you for sharing it.
# Output columns = the input columns followed by the classification fields.
classified_headers = [*salary_rows[0].keys(), 'gender', 'ratio', 'usable_name']
classified_filename = join(DATA_DIR, fname)
print("About to classify", len(salary_rows), 'rows into the file:',
      classified_filename)
outfile = open(classified_filename, 'w')
output_csv = DictWriter(outfile, fieldnames=classified_headers)
output_csv.writeheader()

xc = 0
for row in salary_rows:
    xc = xc + 1
    first_name = row['First Name']
    print("On row", xc, first_name)
    # Rows whose first name contains "Not provided" are neither classified
    # nor written.
    if "Not provided" in first_name:
        continue
    usablename = extract_usable_name(first_name)
    xresult = detect_gender(usablename)
    row.update(
        gender=xresult['gender'],
        ratio=xresult['ratio'],
        usable_name=usablename,
    )
    # Drop the first character of the salary text (presumably a currency
    # symbol -- confirm against the input data).
    base_salary = row['Base Salary']
    row['Base Salary'] = base_salary[1:]
    # write to the csv file
    output_csv.writerow(row)
outfile.close()
'usable_name', 'detected_gender', 'ratio' ] def extract_usable_name(namestr): nameparts = namestr.split(' ') for n in nameparts: if '[' not in n: if '.' not in n: return n return "" f = open(classifyfilename, 'w', newline='') fwrite = csv.DictWriter(f, fieldnames=classifyheaders) fwrite.writeheader() with open(wranglefilename) as r: datarows = list(csv.DictReader(r)) ct = 0 for row in datarows: usablename = extract_usable_name(row['firstname']) ct += 1 print("Row:", ct, "extracting --", usablename, "-- from:", row['firstname']) genderresults = detect_gender(usablename) row['usable_name'] = usablename row['detected_gender'] = genderresults['gender'] row['ratio'] = genderresults['ratio'] fwrite.writerow(row)
return result.strip(" ") #Actually goes through CSV file and reads it wrangled_file = open(DATA_PATH, 'r', encoding="latin1") my_reader = csv.reader(wrangled_file, skipinitialspace=True) next(my_reader, None) #Skips headers for line in my_reader: year = line[0] category = line[1] name = line[2] country = line[3] field = line[4] motivation = line[5] usable_name = extract_usable_name(name) gender = detect_gender(usable_name)['gender'] ratio = detect_gender(usable_name)['ratio'] namesdict = { 'year': year, 'category': category, 'name': name, 'country': country, 'field': field, 'motivation': motivation, 'gender': gender, 'ratio': ratio, 'usable_name': usable_name } all_names.append(namesdict) #Creates new file
"detected_gender", "ratio", ] def extract_usable_name(namestr): nameparts = namestr.split(" ") for n in nameparts: if "[" not in n: if "." not in n: return n return "" f = open(classifyfilename, "w", newline="") fwrite = csv.DictWriter(f, fieldnames=classifyheaders) fwrite.writeheader() with open(wranglefilename) as r: datarows = list(csv.DictReader(r)) ct = 0 for row in datarows: usablename = extract_usable_name(row["firstname"]) ct += 1 print("Row:", ct, "extracting --", usablename, "-- from:", row["firstname"]) genderresults = detect_gender(usablename) row["usable_name"] = usablename row["detected_gender"] = genderresults["gender"] row["ratio"] = genderresults["ratio"] fwrite.writerow(row)