def genderize(args):
    """Bulk-genderize a CSV of names via the Genderize.io API.

    Finds the first column of args.input whose header matches "name"
    (case-insensitive), queries the API in chunks of 10 names, and writes
    the input rows plus the response columns to a timestamped copy of
    args.output.  Flags read from args: key, auto, catch, override.
    Exits via sys.exit() on a missing input file, an invalid output
    directory, or a "no" answer at either confirmation prompt.

    NOTE(review): this module defines genderize() several times; only the
    last definition in the file is the one callers actually get.
    """
    print(args)
    #File initialization
    dir_path = os.path.dirname(os.path.realpath(__file__))
    # DEBUG-level log of every API error, written next to this script.
    logging.basicConfig(
        filename=dir_path + os.sep + "log.txt",
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)s %(name)s %(message)s')
    logger = logging.getLogger(__name__)
    # Timestamp the output name so repeated runs never clobber each other.
    ofilename, ofile_extension = os.path.splitext(args.output)
    ofile = ofilename + "_" + time.strftime("%Y%m%d-%H%M%S") + ".csv"
    ifile = args.input
    if os.path.isabs(ifile):
        print("\n--- Input file: " + ifile)
    else:
        print("\n--- Input file: " + dir_path + os.sep + ifile)
    if os.path.isabs(ofile):
        print("--- Output file: " + ofile)
    else:
        print("--- Output file: " + dir_path + os.sep + ofile + "\n")
    #File integrity checking
    if not os.path.exists(ifile):
        print("--- Input file does not exist. Exiting.\n")
        sys.exit()
    if not os.path.exists(os.path.dirname(ofile)):
        print("--- Error! Invalid output file path. Exiting.\n")
        sys.exit()
    #Some set up stuff
    ##csv.field_size_limit(sys.maxsize)
    #Initialize API key
    if not args.key == "NO_API":
        print("--- API key: " + args.key + "\n")
        genderize = Genderize(user_agent='GenderizeDocs/0.0', api_key=args.key)
        key_present = True
    else:
        print("--- No API key provided.\n")
        key_present = False
    #Open ifile
    with open(ifile, 'r', encoding="utf8") as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',', skipinitialspace=True)
        headers = next(readCSV)  # take the first row
        # [Chen]: use re.search to find the specific column name, match string "name"
        specific_header_index = 0
        for header in headers:
            if re.search("name", header, re.I) != None:
                break
            specific_header_index += 1
        # [Chen]: store rows without headers as a list
        rest = list(readCSV)
        # [Chen]: extract the first_name column
        first_name = [row[specific_header_index].strip() for row in rest]
        o_first_name = list()
        # NOTE(review): first_name holds strings, so this inner loop appends
        # individual CHARACTERS, not names; o_first_name feeds the auto
        # branches below — looks unintended, confirm.
        for l in first_name:
            for b in l:
                o_first_name.append(b)
        if args.auto == True:
            uniq_first_name = list(set(o_first_name))
            chunks = list(jpyh.splitlist(uniq_first_name, 10))
            print("--- Read CSV with " + str(len(first_name)) +
                  " first_name. " + str(len(uniq_first_name)) + " unique.")
        else:
            # presumably 10 per chunk because of an API batch limit — confirm
            chunks = list(jpyh.splitlist(first_name, 10))
            print("--- Read CSV with " + str(len(first_name)) + " first_name")
        print("--- Processed into " + str(len(chunks)) + " chunks")
        if jpyh.query_yes_no(
                "\n---! Ready to send to Genderdize. Proceed?") == False:
            print("Exiting...\n")
            sys.exit()
        if os.path.isfile(ofile):
            if jpyh.query_yes_no(
                    "---! Output file exists, overwrite?") == False:
                print("Exiting...\n")
                sys.exit()
            print("\n")
        if args.auto == True:
            # Auto mode writes to a temp file; the final file is produced in
            # the autocomplete pass below.
            ofile = ofile + ".tmp"
        response_time = []
        gender_responses = list()
        with open(ofile, 'w', newline='', encoding="utf8") as f:
            writer = csv.writer(f)
            # [Chen]: generate different headers by '-ORV' argument
            if args.override == True:
                headers.extend(["female", "male"])
            else:
                headers.extend(["gender", "probability", "count"])
            writer.writerow(headers)
            chunks_len = len(chunks)
            stopped = False
            for index, chunk in enumerate(chunks):
                if stopped:
                    break
                success = False
                while not success:
                    try:
                        start = time.time()
                        if key_present:
                            dataset = genderize.get(chunk)
                        else:
                            dataset = Genderize().get(chunk)
                        gender_responses.append(dataset)
                        success = True
                    except GenderizeException as e:
                        #print("\n" + str(e))
                        logger.error(e)
                        #Error handling
                        if "response not in JSON format" in str(
                                e) and args.catch == True:
                            # Treated as a transient 502 — offer a retry.
                            if jpyh.query_yes_no(
                                    "\n---!! 502 detected, try again?"
                            ) == True:
                                success = False
                                continue
                        elif "Invalid API key" in str(
                                e) and args.catch == True:
                            print(
                                "\n---!! Error, invalid API key! Check log file for details.\n"
                            )
                        else:
                            print(
                                "\n---!! GenderizeException - You probably exceeded the request limit, please add or purchase a API key. Check log file for details.\n"
                            )
                        stopped = True
                        break
                    response_time.append(time.time() - start)
                    # ETA = mean response time so far * chunks remaining.
                    print("Processed chunk " + str(index + 1) + " of " +
                          str(chunks_len) + " -- Time remaining (est.): " +
                          str(round((sum(response_time) / len(response_time) *
                                     (chunks_len - index - 1)), 3)) + "s")
                    # [Chen]: Combine original data and the response: if go with '-OVR', convert gender to 0/1; if go without '-OVR', pick up gender, probability, count
                    if args.override == True:
                        for data in dataset:
                            append_response = []
                            if data.get('gender'):  # null check
                                female = 1 if data.get(
                                    'gender') == 'female' else 0
                                append_response = [female, female ^ 1
                                                   ]  # xor for 0 <=> 1
                            else:
                                append_response = [0, 0]
                            # rest is consumed from the front so input row N
                            # pairs with response N.
                            writer.writerow([*rest[0], *append_response])
                            rest.pop(0)
                    else:
                        for data in dataset:
                            writer.writerow(
                                [*rest[0], *list(data.values())[1:]])
                            rest.pop(0)
                    break  # redundant: success is already True here
        if args.auto == True:
            print("\nCompleting identical first_name...\n")
            #AUTOCOMPLETE first_name
            #Create master dict
            gender_dict = dict()
            print(gender_dict)
            for response in gender_responses:
                for d in response:
                    gender_dict[d.get("name")] = [
                        d.get("gender"),
                        d.get("probability"),
                        d.get("count")
                    ]
            # Strip the ".tmp" suffix added above to get the final file name.
            filename, file_extension = os.path.splitext(ofile)
            with open(filename, 'w', newline='', encoding="utf8") as f:
                writer = csv.writer(f)
                writer.writerow(
                    list(["first_name", "gender", "probability", "count"]))
                for name in o_first_name:
                    data = gender_dict.get(name)
                    writer.writerow([name, data[0], data[1], data[2]])
        print("Done!\n")
def genderize(args):
    """Bulk-genderize a CSV via the Genderize.io API (id/name variant).

    Expects args.input to have either a single name column or an
    (id, name) column pair; queries the API in chunks of 10 and writes the
    original rows plus the response columns (or a "0"/"1" gender one-hot in
    override mode) to a timestamped copy of args.output.  Flags read from
    args: key, noheader, auto, catch, override.

    NOTE(review): this module defines genderize() several times; only the
    last definition wins at import time.
    """
    print(args)
    #File initialization
    dir_path = os.path.dirname(os.path.realpath(__file__))
    logging.basicConfig(
        filename=dir_path + os.sep + "log.txt",
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)s %(name)s %(message)s')
    logger = logging.getLogger(__name__)
    ofilename, ofile_extension = os.path.splitext(args.output)
    ofile = ofilename + "_" + time.strftime("%Y%m%d-%H%M%S") + ".csv"
    ifile = args.input
    if os.path.isabs(ifile):
        print("\n--- Input file: " + ifile)
    else:
        print("\n--- Input file: " + dir_path + os.sep + ifile)
    if os.path.isabs(ofile):
        print("--- Output file: " + ofile)
    else:
        print("--- Output file: " + dir_path + os.sep + ofile + "\n")
    #File integrity checking
    if not os.path.exists(ifile):
        print("--- Input file does not exist. Exiting.\n")
        sys.exit()
    if not os.path.exists(os.path.dirname(ofile)):
        print("--- Error! Invalid output file path. Exiting.\n")
        sys.exit()
    #Some set up stuff
    csv.field_size_limit(sys.maxsize)
    #Initialize API key
    if not args.key == "NO_API":
        print("--- API key: " + args.key + "\n")
        # SECURITY NOTE(review): args.key is printed, but a hard-coded key is
        # what is actually sent — a credential committed to source.  Confirm
        # this should read api_key=args.key.
        genderize = Genderize(user_agent='GenderizeDocs/0.0',
                              api_key='169f6a8e933dcec15a57235d7fde49d6')
        key_present = True
    else:
        print("--- No API key provided.\n")
        key_present = False
    #Open ifile
    with open(ifile, 'r', encoding="utf8") as csvfile:
        readCSV = csv.reader(csvfile,
                             delimiter=',',
                             quotechar='"',
                             skipinitialspace=True)
        line_count = 0
        first_name = []
        users_id = []
        input_fields = []
        rows = []
        input_fields = next(readCSV)  # header row
        for row in readCSV:
            #Read CSV into first_name list
            rows.append(row)
            line_count += 1
        # Multi-column input: column 0 is an id, column 1 the name.
        if len(input_fields) > 1:
            for row in rows:
                first_name.append(row[1])
                users_id.append(row[0])
        else:
            for row in rows:
                first_name.append(row[0])
        if args.noheader == False and first_name[0] in input_fields:
            first_name.pop(0)  #Remove header
        o_first_name = list()
        # NOTE(review): first_name holds strings, so this appends individual
        # CHARACTERS, not names; o_first_name is read only by the auto branch
        # below, which therefore looks broken — confirm.
        for l in first_name:
            for b in l:
                o_first_name.append(b)
        if args.auto == True:
            # uniq_first_name = list(set(o_first_name))
            chunks = list(jpyh.splitlist(first_name, 10))
            # NOTE(review): prints len(first_name) twice — no dedup happens.
            print("--- Read CSV with " + str(len(first_name)) +
                  " first_name. " + str(len(first_name)) + " unique.")
        else:
            chunks = list(jpyh.splitlist(first_name, 10))
            print("--- Read CSV with " + str(len(first_name)) + " first_name")
        print("--- Processed into " + str(len(chunks)) + " chunks")
        if jpyh.query_yes_no(
                "\n---! Ready to send to Genderdize. Proceed?") == False:
            print("Exiting...\n")
            sys.exit()
        if os.path.isfile(ofile):
            if jpyh.query_yes_no(
                    "---! Output file exists, overwrite?") == False:
                print("Exiting...\n")
                sys.exit()
            print("\n")
        if args.auto == True:
            ofile = ofile + ".tmp"
        response_time = []
        gender_responses = list()
        output_fields = [*input_fields, "gender", "probability", "count"]
        if args.override == False:
            with open(ofile, 'w', newline='', encoding="utf8") as f:
                writer = csv.writer(f)
                writer.writerow(output_fields)
                chunks_len = len(chunks)
                stopped = False
                for index, chunk in enumerate(chunks):
                    if stopped:
                        break
                    success = False
                    while not success:
                        try:
                            start = time.time()
                            if key_present:
                                dataset = genderize.get(chunk)
                            else:
                                dataset = Genderize(
                                    user_agent='GenderizeDocs/0.0',
                                    api_key='169f6a8e933dcec15a57235d7fde49d6'
                                ).get(chunk)
                            gender_responses.append(dataset)
                            success = True
                        except GenderizeException as e:
                            print("\n" + str(e))
                            logger.error(e)
                            #Error handling
                            if "response not in JSON format" in str(
                                    e) and args.catch == True:
                                if jpyh.query_yes_no(
                                        "\n---!! 502 detected, try again?"
                                ) == True:
                                    success = False
                                    continue
                            elif "Invalid API key" in str(
                                    e) and args.catch == True:
                                print(
                                    "\n---!! Error, invalid API key! Check log file for details.\n"
                                )
                            else:
                                print(
                                    "\n---!! GenderizeException - You probably exceeded the request limit, please add or purchase a API key. Check log file for details.\n"
                                )
                            stopped = True
                            break
                        response_time.append(time.time() - start)
                        # ETA = mean response time so far * chunks remaining.
                        print("Processed chunk " + str(index + 1) + " of " +
                              str(chunks_len) + " -- Time remaining (est.): " +
                              str(round((sum(response_time) /
                                         len(response_time) *
                                         (chunks_len - index - 1)), 3)) + "s")
                        break  # redundant: success is already True here
                # Flatten per-chunk responses and glue each onto its input
                # row; relies on responses coming back in request order.
                j = 0
                response_arr = list()
                for data in gender_responses:
                    for el in data:
                        response_arr.append(el)
                for row in rows:
                    row.append(response_arr[j]['gender'])
                    row.append(response_arr[j]['probability'])
                    row.append(response_arr[j]['count'])
                    j += 1
                for row in rows:
                    writer.writerow(row)
            if args.auto == True:
                print("\nCompleting identical first_name...\n")
                #AUTOCOMPLETE first_namey
                #Create master dict
                gender_dict = dict()
                for response in gender_responses:
                    for d in response:
                        gender_dict[d.get("name")] = [
                            d.get("count"),
                            d.get("gender"),
                            d.get("probability")
                        ]
                # Strip ".tmp" to get the final output file name.
                filename, file_extension = os.path.splitext(ofile)
                with open(filename, 'w', newline='', encoding="utf8") as f:
                    # NOTE(review): '-' delimiter makes this not a normal CSV,
                    # the loop iterates single characters (see o_first_name
                    # above), and data[1] is written twice while data[0]
                    # (count) is never written — all look like bugs; confirm.
                    writer = csv.writer(f, delimiter='-')
                    writer.writerow(
                        list(["name", "count", "gender", "probability"]))
                    for name in o_first_name:
                        data = gender_dict.get(name)
                        writer.writerow([name, data[1], data[1], data[2]])
        if args.override == True:
            # Override mode: keep the original header and overwrite the last
            # two columns of every row with a gender one-hot ("0"/"1").
            output_fields = [*input_fields]
            with open(ofile, 'w', newline='', encoding="utf8") as f:
                writer = csv.writer(f)
                writer.writerow(output_fields)
                chunks_len = len(chunks)
                stopped = False
                for index, chunk in enumerate(chunks):
                    if stopped:
                        break
                    success = False
                    while not success:
                        try:
                            start = time.time()
                            if key_present:
                                dataset = genderize.get(chunk)
                            else:
                                dataset = Genderize(
                                    user_agent='GenderizeDocs/0.0',
                                    api_key='169f6a8e933dcec15a57235d7fde49d6'
                                ).get(chunk)
                            gender_responses.append(dataset)
                            success = True
                        except GenderizeException as e:
                            print("\n" + str(e))
                            logger.error(e)
                            #Error handling
                            if "response not in JSON format" in str(
                                    e) and args.catch == True:
                                if jpyh.query_yes_no(
                                        "\n---!! 502 detected, try again?"
                                ) == True:
                                    success = False
                                    continue
                            elif "Invalid API key" in str(
                                    e) and args.catch == True:
                                print(
                                    "\n---!! Error, invalid API key! Check log file for details.\n"
                                )
                            else:
                                print(
                                    "\n---!! GenderizeException - You probably exceeded the request limit, please add or purchase a API key. Check log file for details.\n"
                                )
                            stopped = True
                            break
                        response_time.append(time.time() - start)
                        print("Processed chunk " + str(index + 1) + " of " +
                              str(chunks_len) + " -- Time remaining (est.): " +
                              str(round((sum(response_time) /
                                         len(response_time) *
                                         (chunks_len - index - 1)), 3)) + "s")
                        break  # redundant: success is already True here
                i = 0
                response_arr = list()
                for data in gender_responses:
                    for el in data:
                        response_arr.append(el)
                index = len(rows)
                # NOTE(review): assumes every input row already has two
                # trailing columns available to overwrite — confirm schema.
                for row in rows:
                    # print(rows[-1][-2])
                    if response_arr[i]['gender'] == 'male':
                        rows[i][-1] = "1"
                        rows[i][-2] = "0"
                    else:
                        rows[i][-1] = "0"
                        rows[i][-2] = "1"
                    i += 1
                for row in rows:
                    writer.writerow(row)
        print("Done!\n")
def genderize(args):
    """Bulk-genderize a CSV and then offer an interactive name lookup.

    Reads args.input (name in column 1 of multi-column rows, else column
    0), queries the Genderize.io API in chunks of 10, appends
    gender/probability/count columns to each input row in args.output, and
    finally loops on stdin letting the user look names up in the collected
    responses.  Flags read from args: key, noheader, auto, catch, override.

    NOTE(review): this module defines genderize() several times; only the
    last definition wins at import time.
    """
    print(args)
    #File initialization
    dir_path = os.path.dirname(os.path.realpath(__file__))
    logging.basicConfig(
        filename=dir_path + os.sep + "log.txt",
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)s %(name)s %(message)s')
    logger = logging.getLogger(__name__)
    ofilename, ofile_extension = os.path.splitext(args.output)
    #ofile = ofilename + "_" + time.strftime("%Y%m%d-%H%M%S") + ".csv"
    # Unlike the other variants, the output name is NOT timestamped here.
    ofile = ofilename + ofile_extension
    ifile = args.input
    if os.path.isabs(ifile):
        print("\n--- Input file: " + ifile)
    else:
        print("\n--- Input file: " + dir_path + os.sep + ifile)
    if os.path.isabs(ofile):
        print("--- Output file: " + ofile)
    else:
        print("--- Output file: " + dir_path + os.sep + ofile + "\n")
    #File integrity checking
    if not os.path.exists(ifile):
        print("--- Input file does not exist. Exiting.\n")
        sys.exit()
    #if not os.path.exists(os.path.dirname(ofile)):
    # NOTE(review): this exits unless the OUTPUT FILE itself already exists —
    # inverted relative to the parent-directory check on the commented line
    # (which is what the other variants do); confirm intent.
    if not os.path.exists(ofile):
        print("--- Error! Invalid output file path. Exiting.\n")
        sys.exit()
    #Some set up stuff
    ##csv.field_size_limit(sys.maxsize)
    #Initialize API key
    if not args.key == "NO_API":
        print("--- API key: " + args.key + "\n")
        genderize = Genderize(user_agent='GenderizeDocs/0.0',
                              api_key=args.key)
        key_present = True
    else:
        print("--- No API key provided.\n")
        key_present = False
    #Open ifile
    with open(ifile, 'r', encoding="utf8") as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',', skipinitialspace=True)
        first_name = []
        inputData = []
        for row in readCSV:
            #Read CSV into first_name list
            # Name lives in column 1 for multi-column rows, else column 0;
            # strip in place so the API receives clean names.
            if len(row) > 1:
                row[1] = row[1].strip()
                first_name.append(row[1])
            else:
                row[0] = row[0].strip()
                first_name.append(row[0])
            inputData.append(row)
        headers = inputData[0]
        inputData = inputData[1:]  # drop the header row from the data
        data_iterator = iter(inputData)  # consumed in request order below
        if args.noheader == False:
            first_name.pop(0)  #Remove header
        o_first_name = list()
        # NOTE(review): appends individual characters, not names — though
        # o_first_name is never read again in this variant.
        for l in first_name:
            for b in l:
                o_first_name.append(b)
        if args.auto == True:
            uniq_first_name = list(set(first_name))
            chunks = list(jpyh.splitlist(uniq_first_name, 10));
            print("--- Read CSV with " + str(len(first_name)) +
                  " first_name. " + str(len(uniq_first_name)) + " unique.")
        else:
            chunks = list(jpyh.splitlist(first_name, 10));
            print("--- Read CSV with " + str(len(first_name)) + " first_name")
        print("--- Processed into " + str(len(chunks)) + " chunks")
        if jpyh.query_yes_no(
                "\n---! Ready to send to Genderdize. Proceed?") == False:
            print("Exiting...\n")
            sys.exit()
        if os.path.isfile(ofile):
            if jpyh.query_yes_no(
                    "---! Output file exists, overwrite?") == False:
                print("Exiting...\n")
                sys.exit()
            print("\n")
        if args.auto == True:
            ofile = ofile + ".tmp"
        # Extend the original header with the response columns (skipping any
        # already present) and remember each column's index for later use.
        if "gender" not in headers:
            headers.append("gender")
        gender_index = headers.index("gender")
        if "probability" not in headers:
            headers.append("probability")
        prob_index = headers.index("probability")
        if "count" not in headers:
            headers.append("count")
        count_index = headers.index("count")
        response_time = [];
        gender_responses = list()
        with open(ofile, 'w', newline='', encoding="utf8") as f:
            writer = csv.writer(f)
            #writer.writerow(list(["first_name", "gender", "probability", "count"]))
            chunks_len = len(chunks)
            stopped = False
            #print(inputData)
            writer.writerow(headers)
            for index, chunk in enumerate(chunks):
                if stopped:
                    break
                success = False
                while not success:
                    try:
                        start = time.time()
                        if key_present:
                            dataset = genderize.get(chunk)
                        else:
                            dataset = Genderize().get(chunk)
                        gender_responses.append(dataset)
                        success = True
                    except GenderizeException as e:
                        #print("\n" + str(e))
                        logger.error(e)
                        #Error handling
                        if "response not in JSON format" in str(
                                e) and args.catch == True:
                            if jpyh.query_yes_no(
                                    "\n---!! 502 detected, try again?"
                            ) == True:
                                success = False
                                continue
                        elif "Invalid API key" in str(
                                e) and args.catch == True:
                            print(
                                "\n---!! Error, invalid API key! Check log file for details.\n"
                            )
                        else:
                            print(
                                "\n---!! GenderizeException - You probably exceeded the request limit, please add or purchase a API key. Check log file for details.\n"
                            )
                        stopped = True
                        break
                    response_time.append(time.time() - start)
                    # ETA = mean response time so far * chunks remaining.
                    print("Processed chunk " + str(index + 1) + " of " +
                          str(chunks_len) + " -- Time remaining (est.): " +
                          str(round((sum(response_time) / len(response_time) *
                                     (chunks_len - index - 1)), 3)) + "s")
                    #print(dataset)
                    # Pair each response with the next input row, in order.
                    # NOTE(review): in auto mode the chunks are de-duplicated
                    # but data_iterator is not — pairing may drift; confirm.
                    for data in dataset:
                        next_row = next(data_iterator)
                        next_row_index = inputData.index(next_row)
                        entry_row = inputData[next_row_index]
                        entry = [
                            data["gender"], data["probability"], data["count"]
                        ]
                        entry_row += entry  # mutates the row inside inputData
                        writer.writerow(entry_row)
                    break  # redundant: success is already True here
        if args.auto == True:
            print("\nCompleting identical first_name...\n")
            #AUTOCOMPLETE first_name
            #Create master dict
            gender_dict = dict()
            for response in gender_responses:
                for d in response:
                    gender_dict[d.get("name")] = [
                        d.get("gender"),
                        d.get("probability"),
                        d.get("count")
                    ]
            #names seen
            seen_names = []
            # Write one row per distinct name into a "_auto" companion file.
            with open(ofilename + "_auto" + ofile_extension, 'w', newline='',
                      encoding="utf8") as f:
                writer = csv.writer(f)
                writer.writerow(headers)
                for row in inputData:
                    if len(row) > 4:
                        name = row[1]
                        if name not in seen_names:
                            writer.writerow(row)
                            seen_names.append(name)
                    else:
                        name = row[0]
                        if name not in seen_names:
                            writer.writerow(row)
                            seen_names.append(name)
        if args.override == True:
            print("\nExercising override \n")
            # Re-emit rows into a "_override" file with a one-hot
            # female/male pair appended, keyed off the gender column.
            with open(ofilename + "_override" + ofile_extension, 'w',
                      newline='', encoding="utf8") as f:
                writer = csv.writer(f)
                #add headers
                headers += ["female", "male"]
                writer.writerow(headers)
                for row in inputData:
                    gender = row[gender_index]
                    if gender == "male":
                        row += [0, 1]
                        writer.writerow(row)
                    else:
                        row += [1, 0]
                        writer.writerow(row)
        # Interactive lookup over everything fetched during this run.
        master_dict = dict()
        for response in gender_responses:
            for line in response:
                master_dict[line.get("name")] = [
                    line.get("gender"),
                    line.get("probability"),
                    line.get("count")
                ]
        userResponse = True
        while userResponse == True:
            if jpyh.query_yes_no(
                    "Would you like to search the output file for the gender of a name"
            ) == False:
                userResponse = False
                sys.exit()
            sys.stdout.write("Enter a name you would like to search. \n")
            user_input = input()
            if user_input == '':
                sys.stdout.write("You MUST enter a name \n")
            elif user_input not in master_dict:
                sys.stdout.write(
                    "The name you entered is not in the output file \n")
                # Offer a one-off API lookup for names we never fetched.
                if jpyh.query_yes_no("Would you like to genderize {}?".format(
                        user_input)) == False:
                    userResponse = False
                else:
                    if key_present:
                        api_response = genderize.get([user_input])
                    else:
                        api_response = Genderize().get([user_input])
                    for line in api_response:
                        name, gender, prob, count = line.get("name"), line.get(
                            "gender"), line.get("probability"), line.get(
                                "count")
                        print(
                            "Name: {}, Gender: {}, Probability: {}, Count: {}".
                            format(name, gender, prob, count))
            else:
                response = master_dict[user_input]
                sys.stdout.write(
                    "Name: {}, Gender: {}, Probability: {}, Count: {}".format(
                        user_input, response[0], response[1], response[2]))
                if jpyh.query_yes_no(
                        "\nWould you like to search for another name?"
                ) == False:
                    userResponse = False
        print("Done!\n")
def genderize(args):
    """Bulk-genderize a CSV with an optional user-chosen name column.

    When args.override names a header column (anything other than
    'NO_OVERRIDE'), names are pulled from that column and the full input
    rows are echoed into the output alongside the API columns; otherwise
    behaves like the basic variant.  Flags read from args: key, noheader,
    auto, catch, override.

    NOTE(review): relies on write_headers() and remove_dupes(), which must
    be defined elsewhere in this module (not visible in this block).
    NOTE(review): this module defines genderize() several times; only the
    last definition wins at import time.
    """
    print(args)
    #File initialization
    dir_path = os.path.dirname(os.path.realpath(__file__))
    logging.basicConfig(
        filename=dir_path + os.sep + "log.txt",
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)s %(name)s %(message)s')
    logger = logging.getLogger(__name__)
    ofilename, ofile_extension = os.path.splitext(args.output)
    ofile = ofilename + "_" + time.strftime("%Y%m%d-%H%M%S") + ".csv"
    ifile = args.input
    if os.path.isabs(ifile):
        print("\n--- Input file: " + ifile)
    else:
        print("\n--- Input file: " + dir_path + os.sep + ifile)
    if os.path.isabs(ofile):
        print("--- Output file: " + ofile)
    else:
        print("--- Output file: " + dir_path + os.sep + ofile + "\n")
    #File integrity checking
    if not os.path.exists(ifile):
        print("--- Input file does not exist. Exiting.\n")
        sys.exit()
    if not os.path.exists(os.path.dirname(ofile)):
        print("--- Error! Invalid output file path. Exiting.\n")
        sys.exit()
    #Some set up stuff
    ##csv.field_size_limit(sys.maxsize)
    #Initialize API key
    if not args.key == "NO_API":
        print("--- API key: " + args.key + "\n")
        genderize = Genderize(user_agent='GenderizeDocs/0.0',
                              api_key=args.key)
        key_present = True
    else:
        print("--- No API key provided.\n")
        key_present = False
    # Modify this section to take into account what the user wants to use through the command line
    #Open ifile
    with open(ifile, 'r', encoding="utf8") as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',', skipinitialspace=True)
        first_name = []
        raw = []
        original_headers = [];
        is_override = False
        column_number = -1
        # we are attempting to override the column that the names are stored in
        # Easier to check a boolean than to constantly check if args.override is equal to 'NO_OVERRIDE'
        if args.override != 'NO_OVERRIDE':
            is_override = True
        for row in readCSV:
            #Read CSV into first_name list
            # if we are overriding the search column
            if is_override:
                # ugly nested mess but it works.
                # if we have not found the list position of the desired override column
                if column_number == -1:
                    # get the first row from the reader (assumed to be the first row)
                    first_name.append(row)
                    # also save this to the raw list for later use
                    raw.append(row)
                    # iterate through each item in the row we just saved and keep track of the for loop index
                    for index, column in enumerate(first_name[0]):
                        # if our column name is equal to the override name, we found the index number we need to proceed. Break from the loop
                        if column == args.override:
                            column_number = index
                            break
                        # error detection if the user override is not found in the header of the input csv.
                        if index == len(first_name[0]) - 1:
                            print("User Override '" + args.override +
                                  "' not found in input CSV file, Exiting...")
                            sys.exit()
                # Our column number should be found by now, so continue to import the specific data that we want.
                else:
                    # IMPORTANT: we need to remove all leading and trailing whitespaces to ensure that the genderizer responds with correct information
                    stripped = row[column_number].strip()
                    # append our stripped string onto the first_name list
                    first_name.append(stripped)
                    # save the entire row to the raw list
                    raw.append(row)
            # if no override, continue like normal
            else:
                first_name.append(row)
        # if we have a header, we need to remove it so it is not included in the submission
        if args.noheader == False:
            if is_override:
                # Before we pop the first list item in first_name, save it to be our original headers so we can write them later
                original_headers = first_name[0]
                # We also need to pop the for item in the raw list or we will end up with extra data
                raw.pop(0)
            first_name.pop(0)  #Remove header
        o_first_name = list()
        # We dont need to strip the first name list if we are overriding because it has already been taken care of
        if is_override:
            o_first_name = first_name
        # Removes the [''] on each list item so we just end up with names when iterating through the list
        else:
            for l in first_name:
                for b in l:
                    o_first_name.append(b)
        # moved uniq_first_name outside of the if statement for later use.
        uniq_first_name = []
        if args.auto == True:
            # dict.fromkeys de-duplicates while preserving first-seen order.
            uniq_first_name = list(dict.fromkeys(o_first_name))
            chunks = list(jpyh.splitlist(uniq_first_name, 10));
            print("--- Read CSV with " + str(len(first_name)) +
                  " first_name. " + str(len(uniq_first_name)) + " unique.")
        else:
            # splitting the name list into chunks of 10 due to api restrictions
            chunks = list(jpyh.splitlist(first_name, 10));
            print("--- Read CSV with " + str(len(first_name)) + " first_name")
        print("--- Processed into " + str(len(chunks)) + " chunks")
        if jpyh.query_yes_no(
                "\n---! Ready to send to Genderdize. Proceed?") == False:
            print("Exiting...\n")
            sys.exit()
        if os.path.isfile(ofile):
            if jpyh.query_yes_no(
                    "---! Output file exists, overwrite?") == False:
                print("Exiting...\n")
                sys.exit()
            print("\n")
        if args.auto == True:
            ofile = ofile + ".tmp"
        response_time = [];
        gender_responses = list()
        with open(ofile, 'w', newline='', encoding="utf8") as f:
            writer = csv.writer(f)
            ## TODO Add new system to write all rows of the original file. Done
            # If we are overriding, we need to write different headers into the output csv file. We call the write_headers function to keep the code clean
            if is_override:
                write_headers(writer, original_headers)
            # else, continue as expected
            else:
                writer.writerow(
                    list(["first_name", "gender", "probability", "count"]))
            chunks_len = len(chunks)
            stopped = False
            for index, chunk in enumerate(chunks):
                if stopped:
                    break
                success = False
                while not success:
                    try:
                        start = time.time()
                        if key_present:
                            dataset = genderize.get(chunk)
                        else:
                            dataset = Genderize().get(chunk)
                        gender_responses.append(dataset)
                        success = True
                    except GenderizeException as e:
                        #print("\n" + str(e))
                        logger.error(e)
                        #Error handling
                        if "response not in JSON format" in str(
                                e) and args.catch == True:
                            if jpyh.query_yes_no(
                                    "\n---!! 502 detected, try again?"
                            ) == True:
                                success = False
                                continue
                        elif "Invalid API key" in str(
                                e) and args.catch == True:
                            print(
                                "\n---!! Error, invalid API key! Check log file for details.\n"
                            )
                        else:
                            print(
                                "\n---!! GenderizeException - You probably exceeded the request limit, please add or purchase a API key. Check log file for details.\n"
                            )
                        stopped = True
                        break
                response_time.append(time.time() - start)
                # ETA = mean response time so far * chunks remaining.
                print("Processed chunk " + str(index + 1) + " of " +
                      str(chunks_len) + " -- Time remaining (est.): " +
                      str(round((sum(response_time) / len(response_time) *
                                 (chunks_len - index - 1)), 3)) + "s")
            gender_dict = dict()
            # Moved this function out of the autocomplete function to allow us to use it for the non-autocomplete writing as well
            for response in gender_responses:
                for d in response:
                    gender_dict[d.get("name")] = [
                        d.get("gender"),
                        d.get("probability"),
                        d.get("count")
                    ]
            # we need to iterate over all of our "cleaned" first names
            for index, name in enumerate(o_first_name):
                data = gender_dict.get(name)
                # If we are overriding, we need to print our raw data plus our genderize information.
                if is_override:
                    data_list = [data[0], data[1], data[2]]
                    writer.writerow(raw[index] + data_list)
                # If we are not overriding, we print the standard information
                else:
                    writer.writerow([name, data[0], data[1], data[2]])
        # if we have the autocomplete enabled, we need to allow overriding in this mode as well.
        if args.auto == True:
            print("\nCompleting identical first_name...\n")
            # Strip ".tmp" to get the final output file name.
            filename, file_extension = os.path.splitext(ofile)
            with open(filename, 'w', newline='', encoding="utf8") as f:
                writer = csv.writer(f)
                # Before we enter the for loop, we need to print the correct headers into the output csv file.
                # If we are overriding, we need to print out saved original headers as well as the new headers. We call our write_headers function to keep the code clean
                if is_override:
                    write_headers(writer, original_headers)
                    # we need to remove duplicate items in our raw file for proper file writing.
                    raw_cleaned = remove_dupes(raw, column_number)
                # If we are not overriding, we can print the standard headers.
                else:
                    writer.writerow(
                        list(["first_name", "gender", "probability", "count"]))
                # We need to iterate over our uniq_first_name list inorder to write the correct names
                for index, name in enumerate(uniq_first_name):
                    # If we are overriding, we need to combine the data recieved from the genderize api and combine it with our clean raw list inorder to write the correct information
                    if is_override:
                        data = gender_dict.get(name)
                        data_list = [data[0], data[1], data[2]]
                        writer.writerow(raw_cleaned[index] + data_list)
                    # If we are not overriding, we can perform everything as expected.
                    else:
                        data = gender_dict.get(name)
                        writer.writerow([name, data[0], data[1], data[2]])
    print("Done!\n")
def genderize():
    """Command-line entry point: genderize a CSV of names given as argv.

    Usage: python genderize.py [input file path] [output file path]

    Reads every row of the input CSV (header row dropped), queries the
    Genderize.io API in chunks of 10 using the module-level API_KEY, and
    writes name/gender/probability/count rows to a timestamped output
    file.  Exits via sys.exit() on bad arguments, a declined prompt, or
    a GenderizeException.
    """
    if len(sys.argv) != 3:
        # argv[0] is the script name, so exactly two user arguments needed.
        print(
            "Please specify input and output files: python genderize.py [input file path] [output file path]"
        )
        sys.exit()
    dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    filename, file_extension = os.path.splitext(sys.argv[2])
    # Timestamp the output name so repeated runs never clobber each other.
    ofile = filename + "_" + time.strftime("%Y%m%d-%H%M%S") + ".csv"
    print("\n--- Input file: " + dir_path + os.sep + sys.argv[1])
    print("--- Output file: " + dir_path + os.sep + ofile + "\n")
    csv.field_size_limit(sys.maxsize)
    #Initialize API key
    genderize = Genderize(user_agent='GenderizeDocs/0.0', api_key=API_KEY)
    with open(sys.argv[1], encoding="utf8") as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        # NOTE(review): each element of names is a whole row (a list), and
        # rows — not name strings — are what get sent to the API below;
        # confirm the input is expected to be a single-column CSV.
        names = []
        for row in readCSV:
            names.append(row)
        names.pop(0)  # drop the header row
        print("--- Read CSV with " + str(len(names)) + " names")
        # Chunks of 10 to respect the API's batch size.
        chunks = list(jpyh.splitlist(names, 10))
        print("--- Processed into " + str(len(chunks)) + " chunks")
        if jpyh.query_yes_no(
                "\n---! Ready to send to Genderdize. Proceed?") == False:
            print("Exiting...\n")
            sys.exit()
        if os.path.isfile(ofile):
            if jpyh.query_yes_no(
                    "---! Output file exists, overwrite?") == False:
                print("Exiting...\n")
                sys.exit()
            print("\n")
        response_time = []
        with open(ofile, 'w', newline='', encoding="utf8") as f:
            writer = csv.writer(f)
            writer.writerow(list(["names", "gender", "probability", "count"]))
            chunks_len = len(chunks)
            for index, chunk in enumerate(chunks):
                start = time.time()
                try:
                    dataset = genderize.get(chunk)
                except GenderizeException as e:
                    # No retry here: report and bail out of the whole run.
                    print(e)
                    print(
                        "\n---!! GenderizeException - You probably exceeded the request limit, please add or purchase a API key\n"
                    )
                    sys.exit()
                # ETA = mean response time so far * chunks remaining.
                response_time.append(time.time() - start)
                print("Processed chunk " + str(index + 1) + " of " +
                      str(chunks_len) + " -- Time remaining (est.): " +
                      str(round((sum(response_time) / len(response_time) *
                                 (chunks_len - index - 1)), 3)) + "s")
                for data in dataset:
                    writer.writerow(data.values())
def genderize(args):
    """Bulk-genderize a CSV, merging API data into per-row dicts.

    Reads names from column 1 of args.input, queries the Genderize.io API
    in chunks of 10, and joins each input row with its API response by
    name.  Without --auto the output gets binary male/female columns;
    with --auto it gets gender/probability/count columns.  Flags read
    from args: key, noheader, auto, catch, override (unused here).

    NOTE(review): this module defines genderize() several times; only the
    last definition wins at import time.
    """
    print(args)
    # File initialization
    dir_path = os.path.dirname(os.path.realpath(__file__))
    logging.basicConfig(
        filename=dir_path + os.sep + "log.txt",
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)s %(name)s %(message)s')
    logger = logging.getLogger(__name__)
    ofilename, ofile_extension = os.path.splitext(args.output)
    ofile = ofilename + "_" + time.strftime("%Y%m%d-%H%M%S") + ".csv"
    ifile = args.input
    if os.path.isabs(ifile):
        print("\n--- Input file: " + ifile)
    else:
        print("\n--- Input file: " + dir_path + os.sep + ifile)
    if os.path.isabs(ofile):
        print("--- Output file: " + ofile)
    else:
        print("--- Output file: " + dir_path + os.sep + ofile + "\n")
    # File integrity checking
    if not os.path.exists(ifile):
        print("--- Input file does not exist. Exiting.\n")
        sys.exit()
    if not os.path.exists(os.path.dirname(ofile)):
        print("--- Error! Invalid output file path. Exiting.\n")
        sys.exit()
    # Some set up stuff
    # csv.field_size_limit(sys.maxsize)
    # Initialize API key
    if not args.key == "NO_API":
        print("--- API key: " + args.key + "\n")
        genderize = Genderize(user_agent='GenderizeDocs/0.0',
                              api_key=args.key)
        key_present = True
    else:
        print("--- No API key provided.\n")
        key_present = False
    # Open ifile
    with open(ifile, 'r', encoding="utf8") as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',', skipinitialspace=True)
        i = next(readCSV)  # set i to the headers as an array
        first_name = []
        allData = []
        if args.auto == False:
            # add columns based off auto
            i.append('male')
            i.append('female')
        if args.auto == True:
            i.append('gender')
            i.append('probability')
            i.append('count')
        for row in readCSV:
            # Read CSV into first_name list
            allData.append(row)
            first_name.append(row[1])  # specify the name column index here
        if args.noheader == False:
            first_name.pop(0)  # Remove header
        o_first_name = list()
        # Deleted this part of the code because it substrings the name column
        # for l in first_name:
        #     for b in l:
        #         o_first_name.append(b)
        # print(o_first_name)
        if args.auto == True:
            # NOTE(review): no de-duplication actually happens in this
            # variant — both branches chunk the raw first_name list.
            chunks = list(jpyh.splitlist(first_name, 10))
            print("--- Read CSV with " + str(len(first_name)) +
                  " first_name. " + str(len(first_name)) + " unique.")
        else:
            chunks = list(jpyh.splitlist(first_name, 10))
            print("--- Read CSV with " + str(len(first_name)) + " first_name")
        print("--- Processed into " + str(len(chunks)) + " chunks")
        if jpyh.query_yes_no(
                "\n---! Ready to send to Genderdize. Proceed?") == False:
            print("Exiting...\n")
            sys.exit()
        if os.path.isfile(ofile):
            if jpyh.query_yes_no(
                    "---! Output file exists, overwrite?") == False:
                print("Exiting...\n")
                sys.exit()
            print("\n")
        if args.auto == True:
            ofile = ofile + ".tmp"
        response_time = []
        gender_responses = list()
        with open(ofile, 'w', newline='', encoding="utf8") as f:
            writer = csv.writer(f)
            writer.writerow(i)  # write row header based off input file header
            chunks_len = len(chunks)
            stopped = False
            for index, chunk in enumerate(chunks):
                if stopped:
                    break
                success = False
                while not success:
                    try:
                        start = time.time()
                        if key_present:
                            dataset = genderize.get(chunk)
                        else:
                            dataset = Genderize().get(chunk)
                        gender_responses.append(dataset)
                        success = True
                    except GenderizeException as e:
                        #print("\n" + str(e))
                        logger.error(e)
                        # zzError handling
                        if "response not in JSON format" in str(
                                e) and args.catch == True:
                            if jpyh.query_yes_no(
                                    "\n---!! 502 detected, try again?"
                            ) == True:
                                success = False
                                continue
                        elif "Invalid API key" in str(
                                e) and args.catch == True:
                            print(
                                "\n---!! Error, invalid API key! Check log file for details.\n"
                            )
                        else:
                            print(
                                "\n---!! GenderizeException - You probably exceeded the request limit, please add or purchase a API key. Check log file for details.\n"
                            )
                        stopped = True
                        break
                    response_time.append(time.time() - start)
                    # ETA = mean response time so far * chunks remaining.
                    print("Processed chunk " + str(index + 1) + " of " +
                          str(chunks_len) + " -- Time remaining (est.): " +
                          str(round((sum(response_time) / len(response_time) *
                                     (chunks_len - index - 1)), 3)) + "s")
                    singleDataDict1 = dict()
                    newSingleDataDict = dict()
                    singleDataDictGen = dict()
                    singleDataDictData = dict()
                    newDataListWithGen = list()
                    allDataList = list()
                    addDataDict = {}
                    newDataList = list()
                    newArry = list()
                    # get each row and make a new dictionatry
                    # NOTE(review): singleDataDict1 is reused across rows, so
                    # a row shorter than its predecessor inherits stale keys
                    # via .copy() — confirm all rows have equal length.
                    for l in allData:
                        for n in range(0, len(l)):
                            singleDataDict1[n] = l[n]
                        dictionary_copy = singleDataDict1.copy()
                        allDataList.append(dictionary_copy)
                    # Combined the input file data with the gender api data
                    # NOTE(review): only matches rows against the CURRENT
                    # chunk's dataset, so each pass writes just the rows whose
                    # names are in this chunk.
                    for m in allDataList:
                        for n in dataset:
                            if n['name'] == m[1]:  # specify name column here
                                singleDataDictGen = n
                                singleDataDictData = m
                                singleDataDictData.update(singleDataDictGen)
                                # delete the name key because of duplication
                                del singleDataDictData['name']
                                newDataListWithGen.append(singleDataDictData)
                    for newData in newDataListWithGen:
                        # add gender binary at the end of the data dictionary
                        # NOTE(review): when probability <= 0.5 neither
                        # male/female key is added, yet the three API keys are
                        # still deleted — that row is written short; confirm.
                        if newData['gender'] == 'female' and newData[
                                'probability'] > 0.5:
                            newData.update({'male': 0, 'female': 1})
                        elif newData['gender'] == 'male' and newData[
                                'probability'] > 0.5:
                            newData.update({'male': 1, 'female': 0})
                        # since this is without argument auto delete gender, probability, and count
                        del newData['gender']
                        del newData['probability']
                        del newData['count']
                        # write the data column
                        writer.writerow(newData.values())
                    break  # redundant: success is already True here
        if args.auto == True:
            print("\nCompleting identical first_name...\n")
            # AUTOCOMPLETE first_name
            # Create master dict
            gender_dict = dict()
            singleDataDict = dict()
            singleDataDict1 = dict()
            newSingleDataDict = dict()
            singleDataDictGen = dict()
            singleDataDictData = dict()
            newDataListWithGen = list()
            allDataList = list()
            addDataDict = {}
            newDataList = list()
            newArry = list()
            # get each row and make a new dictionatry
            for l in allData:
                for n in range(0, len(l)):
                    singleDataDict1[n] = l[n]
                dictionary_copy = singleDataDict1.copy()
                allDataList.append(dictionary_copy)
            # Combined the input file data rows with the gender api data rows
            # NOTE(review): dataset here is whatever the LAST chunk returned,
            # not the accumulated gender_responses — rows from earlier chunks
            # will not match; confirm.
            for m in allDataList:
                for n in dataset:
                    if n['name'] == m[1]:  # specify name column here
                        singleDataDictGen = n
                        singleDataDictData = m
                        singleDataDictData.update(singleDataDictGen)
                        del singleDataDictData['name']
                        newDataListWithGen.append(singleDataDictData)
            # Strip ".tmp" to get the final output file name.
            filename, file_extension = os.path.splitext(ofile)
            with open(filename, 'w', newline='', encoding="utf8") as f:
                writer = csv.writer(f)
                writer.writerow(i)  # write the header column
                for newData in newDataListWithGen:
                    # write the data column
                    writer.writerow(newData.values())
        print("Done!\n")