def get_genderize(names, outfile):
    """Look up ``names`` on genderize.io and dump the results to ``outfile``.

    The output is a JSON object mapping each returned name to a dict with
    the keys ``gender``, ``probability`` and ``count``; the last two are
    ``None`` whenever the API reports no gender for the name.

    :param names: sequence (sliceable) of first names to look up
    :param outfile: path of the JSON file to (over)write
    """
    # NOTE(review): the previous body sliced a module-level ``fixed_names``
    # and never read ``names`` -- confirm callers pass the list they want.
    genderize = Genderize(user_agent="Kevin_Bonham", api_key=genderize_key)

    # Accumulate across all batches so the file holds every name, not just
    # the last request's worth.
    names_dict = {}
    for i in range(0, len(names), 10):  # requests are sent 10 names at a time
        for entry in genderize.get(names[i:i + 10]):
            known = entry["gender"] is not None
            names_dict[entry["name"]] = {
                "gender": entry["gender"],
                "probability": entry["probability"] if known else None,
                "count": entry["count"] if known else None,
            }

    with open(outfile, "w") as f:
        f.write(json.dumps(names_dict, indent=4))
def prob(self, name, binary=False):
    """Return the probability genderize.io reports for ``name``.

    ``self.config['DEFAULT']['genderize']`` selects the client: ``'no'``
    uses an anonymous client, ``'yes'`` reads the API key from the first
    line of the file named by ``self.config['DEFAULT']['genderizefile']``.

    :param name: first name to look up
    :param binary: unused here; kept for signature compatibility
    :raises ValueError: if the config value is neither ``'yes'`` nor ``'no'``
    """
    mode = self.config['DEFAULT']['genderize']
    if mode == 'no':
        client = Genderize()
    elif mode == 'yes':
        # Read-only access is enough, and the handle is closed right away.
        with open(self.config['DEFAULT']['genderizefile'], "r") as fichero:
            apikey = fichero.readline().rstrip()
        client = Genderize(user_agent='GenderizeDocs/0.0', api_key=apikey)
    else:
        raise ValueError(
            "config['DEFAULT']['genderize'] must be 'yes' or 'no', got %r"
            % (mode,))
    return client.get([name])[0]['probability']
def gender(json_dict, first_name):
    """
    Work out the patient's gender, trying three sources in order.

    1. A gender printed in the document (``get_gender_if_printed``).
    2. A title-based guess (``get_gender_using_title``).
    3. A genderize.io lookup of the first name, accepted only when
       ``check_for_gender`` confirms it against the document.

    :param dict json_dict: full response of the vision api in dict
    :param str first_name: first name of the patient
    :return: ``'M'``, ``'F'``, the title-based result, or ``None``
    """
    male_guess_list = ['M', 'Male', 'm']
    female_guess_list = ['F', 'Female', 'f']

    gender_printed = get_gender_if_printed(json_dict)
    if gender_printed:
        if gender_printed in ('m', 'male'):
            return 'M'
        if gender_printed in ('f', 'female'):
            return 'F'

    title_gender = get_gender_using_title(json_dict, first_name)
    if title_gender:
        return title_gender

    try:
        guessed = Genderize().get([first_name])[0]['gender']
        if guessed == 'male' and check_for_gender(json_dict, male_guess_list):
            return "M"
        if guessed == 'female' and check_for_gender(json_dict,
                                                    female_guess_list):
            return "F"
        return None
    except Exception as e:
        # Best effort: a failed lookup simply means "no answer".
        print("Genderize not responding {}".format(e))
        return None
def test_more_than_10_names():
    """
    Look up 20 names in one call and check that every name comes back,
    in the order it was requested.
    """
    girls = [
        "Emma", "Olivia", "Ava", "Isabella", "Sophia",
        "Mia", "Charlotte", "Amelia", "Evelyn", "Abigail",
    ]
    boys = [
        "Liam", "Noah", "William", "James", "Logan",
        "Benjamin", "Mason", "Elijah", "Oliver", "Jacob",
    ]
    names = girls + boys

    response = Genderize().get(names)

    assert len(names) == len(response)
    for expected, namedata in zip(names, response):
        assert expected == namedata['name'],\
            'Expected names to be returned in same order'
def genderize(genderize_api_key):
    """
    Build a Genderize client that authenticates with the given API key.

    :param genderize_api_key: key passed to the client as ``api_key``
    :return: the configured ``Genderize`` instance
    """
    return Genderize(
        user_agent='GenderizeCommunities/0.0',
        api_key=genderize_api_key,
    )
def __get_gender(names):
    """
    Label a group of names as predominantly female or male.

    The names are looked up on genderize.io with ``country_id="IT"``. The
    probabilities are averaged separately for the names reported female
    and for all other gendered names, and the two averages are compared.

    :param names: list of names to look up
    :return: ``" Donna "`` if the female average is strictly higher,
        ``" Uomo "`` otherwise, ``" PERSONA "`` if no name got a gender,
        ``" PERSONARICHIESTAFALLITA "`` if the lookup raised
    """
    try:
        genders = Genderize().get(names, country_id="IT")
        male = 0
        male_count = 0
        female = 0
        female_count = 0
        for gender in genders:
            if gender['gender'] is None:
                continue
            if gender['gender'] == "female":
                female_count += 1
                female += gender['probability']
            else:
                male_count += 1
                male += gender['probability']

        # A positive sum implies a positive count, so no division by zero.
        if male > 0:
            male = male / male_count
        if female > 0:
            female = female / female_count

        if male > 0 or female > 0:
            return " Donna " if female > male else " Uomo "
        return " PERSONA "
    except Exception:
        # Best effort, but let KeyboardInterrupt / SystemExit propagate.
        print(names)
        return " PERSONARICHIESTAFALLITA "
def get_author_genders():
    """
    Return gender information for the author names in the request body.

    Authors already in the ``authors`` table are answered from the
    database. The rest are looked up on genderize.io by the first word of
    their name, added to the result and inserted into the table.

    :return: dict mapping author name to ``(gender, gender_source)``
    """
    app.logger.info(request.json)
    authors = tuple(request.json)
    if not authors:
        # An empty tuple would render as ``IN ()``, which is invalid SQL.
        return {}

    # Fetch authors from DB
    query = "select authors.name, authors.gender, authors.gender_source from authors where authors.name IN %s"
    conn = getConnection()
    try:
        cur = conn.cursor()
        cur.execute(query, (authors, ))
        genders = {row[0]: (row[1], row[2]) for row in cur.fetchall()}

        # Find missing authors
        toFetch = list({x for x in authors if x not in genders})
        if toFetch:
            # NOTE(review): the API key is hard-coded in source; it should
            # come from configuration or the environment.
            response = Genderize(
                api_key="a619730661a7ce6b4f8e8e6b047046a2").get(
                    [x.split(" ")[0] for x in toFetch])
            fetched = [(author, resp["gender"], "genderize")
                       for author, resp in zip(toFetch, response)]

            # Add fetched to response
            for author, gender, source in fetched:
                genders[author] = (gender, source)

            # Add fetched to DB
            from psycopg2.extras import execute_values
            execute_values(
                cur,
                "INSERT INTO authors (name, gender, gender_source) VALUES %s",
                fetched)
            # Closing without a commit would discard the insert.
            conn.commit()
    finally:
        conn.close()
    return genders
def test_integration_single():
    """
    Look up one name through ``get1`` and check the reported gender.
    """
    result = Genderize().get1('Peter')
    actual = result['gender']
    expected = 'male'
    assert expected == actual,\
        "Expected {0}, got {1}".format(expected, actual)
def test_with_headers():
    """
    Retrieve a single name with ``retheader=True`` and check that the
    response headers are returned alongside the data.
    """
    # Names go in as a list; a bare string would be iterated
    # character by character.
    result = Genderize().get(['Peter'], retheader=True)
    headers = result['headers']
    assert headers, "Expected response headers to be returned"
def main():
    """
    Fill in unknown character genders in INFILE and write OUTFILE.

    Each input line is tab-separated; field 1 is the character name and
    field 4 the gender. Rows whose gender is ``'?'`` are looked up with
    ``gender_genderizer``. When the returned probability exceeds 0.6 the
    gender field is replaced by the first letter of the returned gender.
    Every row, changed or not, is written to OUTFILE, followed by a
    summary printed to stdout.
    """
    with open(INFILE, mode='r', encoding='ISO-8859-1') as inLines:
        lines = inLines.readlines()

    # names of the characters whose gender is unknown, in file order
    unknown = []
    for line in lines:
        fields = line.split('\t')
        if fields[4] == '?':
            unknown.append(fields[1])

    # creates Genderize and gets genders
    genderize = Genderize(
        user_agent='GenderizeDocs/0.0', api_key=config.api_key, timeout=60)
    results = [gender_genderizer(genderize, name) for name in unknown]

    # counts
    changed = 0
    unchanged = 0
    # results are in the same order as the '?' rows, so consume in step
    pending = iter(results)
    with open(OUTFILE, mode='w', encoding='ISO-8859-1') as outLines:
        for line in lines:
            fields = line.split('\t')
            name = fields[1]
            if fields[4] == '?':
                result = next(pending)
                prob = result['probability']
                if prob > 0.6:
                    gender = result['gender'][0:1]
                    print('CHANGED:\t%s -> %s, %.2f' % (name, gender, prob))
                    fields[4] = gender
                    changed += 1
                else:
                    print('UNCHANGED:\t%s' % (name))
                    unchanged += 1
            # print updated character metadata
            outLines.write('\t'.join(fields))

    print('----------------------------------------')
    print('TOTAL UNKNOWN:\t\t%d' % (len(unknown)))
    print('TOTAL CHANGED:\t\t%d' % (changed))
    print('TOTAL UNCHANGED:\t%d' % (unchanged))
def guess(self, name, binary=False):
    """Return the gender genderize.io reports for ``name``.

    ``self.config['DEFAULT']['genderize']`` selects the client: ``'no'``
    uses an anonymous client, ``'yes'`` reads the API key from the first
    line of the file named by ``self.config['DEFAULT']['genderizefile']``.

    :param name: first name to look up
    :param binary: when true return a numeric code (0 female, 1 male,
        2 anything else) instead of the raw API value
    :raises ValueError: if the config value is neither ``'yes'`` nor ``'no'``
    """
    mode = self.config['DEFAULT']['genderize']
    if mode == 'no':
        client = Genderize()
    elif mode == 'yes':
        # Read-only access is enough, and the handle is closed right away.
        with open(self.config['DEFAULT']['genderizefile'], "r") as fichero:
            apikey = fichero.readline().rstrip()
        client = Genderize(user_agent='GenderizeDocs/0.0', api_key=apikey)
    else:
        raise ValueError(
            "config['DEFAULT']['genderize'] must be 'yes' or 'no', got %r"
            % (mode,))

    g = client.get([name])[0]['gender']
    if not binary:
        return g
    if g == 'female':
        return 0
    if g == 'male':
        return 1
    # Unknown gender in binary mode used to leave the result unbound.
    return 2
def main():
    """
    Gender the characters of every movie from 1975 through 2015.

    For each year the destination folder is reset with ``clear_dir`` and
    the movies are read with ``read_folder_dict``. Each movie's text is
    split into lines; from the third line on, every tab-separated line
    with at least two fields is rewritten as ``name<TAB>gender<TAB>rest``,
    the gender coming from ``getGender``. Totals of the values in the
    per-movie ``characters`` dict are printed at the end.
    """
    # Creates year folders if they don't exist; clears them if they do
    for year in range(1975, 2016):
        clear_dir('%s/%d' % (DESTFOL, year))

    genderize = Genderize(user_agent='GenderizeDocs/0.0',
                          api_key=config.api_key,
                          timeout=60)
    femCount = 0
    maleCount = 0
    unkCount = 0

    for year in range(1975, 2016):
        print('Gendering movies in %d...' % (year))
        movies = read_folder_dict('%s/%d' % (SRCFOL, year), year)
        for movie in movies:
            # presumably filled in by getGender as names are resolved
            characters = {}
            title = movie['title']
            lines = movie['text'].split('\n')

            with open('%s/%d/%s' % (DESTFOL, year, title),
                      mode='w', encoding='ISO-8859-1') as outFile:
                for line in lines[2:]:  # the first two lines are skipped
                    fields = line.split('\t')
                    if len(fields) < 2:
                        continue
                    name = fields.pop(0)
                    gender = getGender(name, characters, genderize)
                    outFile.write('%s\t%s\t%s\n'
                                  % (name, gender, ' '.join(fields)))

            for gender in characters.values():
                if gender == '?':
                    unkCount += 1
                elif gender == 'f':
                    femCount += 1
                elif gender == 'm':
                    maleCount += 1
            print('Finished %s...' % (title))

    print('----------------------------------------')
    print('NUMBER OF CHARACTERS')
    print('\tMale:\t\t%d' % (maleCount))
    print('\tFemale:\t\t%d' % (femCount))
    print('\tUnknown:\t%d' % (unkCount))
def get_gender_using_genderize_api(name):
    """
    Look up the likely gender of ``name`` through the genderize API.

    :param name: a string, ideally the first name
    :returns: the list returned by ``Genderize().get`` for that single
        name, or ``None`` when ``name`` is empty or ``None``
    """
    if not name:
        return None
    # Wrap the name in a list: a bare string would be iterated
    # character by character.
    return Genderize().get([name])
def test_invalid_api_key():
    """
    A lookup made with an invalid API key must raise GenderizeException.
    """
    try:
        Genderize(api_key='invalid_api_key').get1('Peter')
    except GenderizeException:
        caught = True
    else:
        caught = False
    assert caught, "Expected a GenderizeException to be thrown"
async def gender(self, ctx, *, name: str):
    """Send an embed with genderize.io's name, gender and probability
    for ``name``.

    :param ctx: invocation context; the embed is sent with ``ctx.send``
    :param name: the name to look up (everything after the command)
    """
    import asyncio

    # Genderize().get performs a synchronous HTTP request; run it in the
    # default executor so the event loop is not blocked while it waits.
    loop = asyncio.get_event_loop()
    results = await loop.run_in_executor(None, Genderize().get, [name])
    result = results[0]

    em = discord.Embed(title="Gender")
    em.add_field(name="Name", value=f'{result["name"]}', inline=True)
    em.add_field(name='Gender', value=f'{result["gender"]}', inline=False)
    em.add_field(name='Probability', value=f'{result["probability"]}',
                 inline=True)
    await ctx.send(embed=em)
def test_integration():
    """
    Integration test from the readme: query the real Genderize.io server
    for three names and compare the genders it reports.
    """
    expected_genders = {
        'James': 'male',
        'Eva': 'female',
        'Thunderhorse': None,
    }
    response = Genderize().get(expected_genders.keys())
    actual_genders = {elem['name']: elem['gender'] for elem in response}
    assert expected_genders == actual_genders,\
        "Expected {0}, got {1}".format(expected_genders, actual_genders)
def gender():
    """Guess the gender of the CV owner from the first word of ``txt``.

    The first name is taken to be the leading run of letters (including
    the listed accented ones) and hyphens that is followed by a space.

    :return: a one-element list holding the gender reported by
        genderize.io, or ``[None]`` when no first name could be extracted
    """
    gende = []
    # The first name is the first word of the CV. The class holds only
    # letters and '-': the old one also contained space, comma and an
    # accidental ' -}' range, so it could swallow several words.
    match = re.match(r'[a-zA-ZËÏÖŒïöéœâëçô-]+ ', txt)
    if match is None:
        # nothing to look up; avoid sending an empty name to the API
        gende.append(None)
        return gende

    # drop the trailing space
    prenom = match.group().strip()
    # derive the gender from the first name
    sexe = Genderize().get1(prenom)
    gende.append(sexe['gender'])
    return gende
def test_with_headers():
    """
    Look up one name with ``retheader=True`` and check both the data and
    the rate-limit headers in the response.
    """
    response = Genderize().get(['Peter'], retheader=True)

    first = response['data'][0]
    assert first['name'] == 'Peter', "Expected name data to be returned"

    assert response['headers'], "Expected response headers to be returned"

    expected_headers = (
        'X-Rate-Limit-Limit',
        'X-Rate-Limit-Remaining',
        'X-Rate-Reset',
    )
    for header in expected_headers:
        assert header in response['headers'],\
            "Expected {0} header to be returned".format(header)
def guess_list(self, path='files/names/partial.csv', binary=False):
    """Guess the gender of every name in the first column of a CSV file.

    The header row is skipped, names are title-cased and stripped of
    double quotes, then looked up on genderize.io ten at a time.

    :param path: CSV file whose first column holds the names
    :param binary: when true return codes (0 female, 1 male, 2 unknown)
        instead of the strings ``"female"``, ``"male"``, ``"unknown"``
    :return: list of guesses in file order; a result whose gender is
        none of male/female/None contributes no entry
    :raises ValueError: if ``self.config['DEFAULT']['genderize']`` is
        neither ``'yes'`` nor ``'no'``
    """
    with open(path) as csvfile:
        sexreader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(sexreader, None)  # skip the header row
        listnames = [row[0].title().replace('"', '') for row in sexreader]

    if not listnames:
        return []

    # Build the client once instead of re-reading the key per batch.
    mode = self.config['DEFAULT']['genderize']
    if mode == 'no':
        client = Genderize()
    elif mode == 'yes':
        with open("files/apikeys/genderizepass.txt", "r") as fichero:
            apikey = fichero.readline().rstrip()
        client = Genderize(user_agent='GenderizeDocs/0.0', api_key=apikey)
    else:
        raise ValueError(
            "config['DEFAULT']['genderize'] must be 'yes' or 'no', got %r"
            % (mode,))

    if binary:
        labels = {None: 2, "male": 1, "female": 0}
    else:
        labels = {None: "unknown", "male": "male", "female": "female"}

    slist = []
    # The list is split into chunks of 10 names per request.
    for start in range(0, len(listnames), 10):
        for item in client.get(listnames[start:start + 10]):
            if item['gender'] in labels:
                slist.append(labels[item['gender']])
    return slist
def __init__(self, data, key=None, gender_file=None):
    """
    Main constructor of the class where the original dataframe is
    provided.

    When ``gender_file`` is given, each of its lines is split on tabs and
    column 1 (name) / column 2 (gender) pre-populate ``self.gender``.

    :param data: original dataframe
    :param key: genderize key (optional)
    :param gender_file: file with gender info, used as cache
    :type data: pandas.DataFrame
    :type key: string
    :type gender_file: string (as filepath)
    """
    from genderize import Genderize

    self.data = data
    self.gender = {}  # init the name-gender dictionary
    self.key = key
    self.gender_file = gender_file

    # Init the genderize connection (authenticated only if a key is given)
    self.connection = Genderize(api_key=self.key) if self.key else Genderize()

    if self.gender_file:
        # This file is used as cache for the gender info
        # This helps to avoid calling once and again to the API
        # TODO: fix hardcoded code when reading columns and using
        # separators
        with open(gender_file, "r") as fd:
            for line in fd:
                # Drop the line terminator so it cannot end up in a value.
                gender_data = line.rstrip("\n").split("\t")
                if len(gender_data) < 3:
                    continue  # blank or malformed line
                self.gender[gender_data[1]] = {
                    "gender_analyzed_name": gender_data[1],
                    "gender": gender_data[2]
                }
def get_gender_from_given_name(ungendered_imdbs):
    """Fill in ``crew`` gender fields from given names.

    Names found in the local cache file ``names_to_gender.csv`` are
    answered from it. The rest are sent to genderize.io ten at a time,
    with a 5 second pause after each request, and every answer is
    appended to the cache file as ``name,gender,probability,count``.

    :param ungendered_imdbs: iterable of ``(given_name, imdb_id)`` pairs
    """
    # given name -> {'imdbs': [ids of crew members with that name]}
    names = {}
    for imdb in ungendered_imdbs:
        names.setdefault(imdb[0], {'imdbs': []})['imdbs'].append(imdb[1])

    # trying to get gender from name file
    try:
        with open('names_to_gender.csv', 'r', encoding="utf-8") as file:
            for line in file:
                info = line.split(',')
                if len(info) < 3 or info[0] not in names:
                    continue  # malformed line, or a name we don't need
                for imdb in names[info[0]]['imdbs']:
                    crew[imdb]['gender'] = info[1]
                    crew[imdb]['gender_probability'] = info[2]
                    crew[imdb]['source'] = 'Genderize.io'
                del names[info[0]]
    except FileNotFoundError:
        print('names_to_gender file not found. No problem. Proceeding.')

    names_list = list(names)
    with open("names_to_gender.csv", 'a', encoding="utf-8") as namesfile:
        for start in range(0, len(names_list), 10):
            batch = names_list[start:start + 10]
            gendered_names = Genderize().get(batch)
            # Pair answers with requested names by position, so a name the
            # API echoes back differently cannot cause a KeyError.
            # NOTE(review): assumes answers come back in request order.
            for requested, name in zip(batch, gendered_names):
                namesfile.write(
                    str(name['name']) + ',' + str(name['gender']) + ',' +
                    str(name['probability']) + ',' + str(name['count']) +
                    '\n')
                for imdb in names[requested]['imdbs']:
                    crew[imdb]['gender'] = name['gender']
                    crew[imdb]['gender_probability'] = name['probability']
                    crew[imdb]['source'] = 'Genderize.io'
            time.sleep(5)
def find_gender():
    """Print a guessed gender for every user pickled in kiev_users.txt.

    Each user's ``full_name`` is transliterated in place with
    ``translit``, split into words, and the words are looked up on
    genderize.io. The gender of the last word that comes back as male or
    female is printed next to the name (``None`` if there was none).
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load
    # files from a trusted source.
    # Pickles are binary data, so open in 'rb'; the handle is closed
    # as soon as loading finishes.
    with open('kiev_users.txt', 'rb') as f:
        users = pickle.load(f)

    for user in users:
        gender = None
        user.full_name = translit(user.full_name, 'ru', reversed=True)
        result = Genderize().get(user.full_name.split())
        for item in result:
            if item['gender'] in ('female', 'male'):
                gender = item['gender']
        # single-argument form behaves the same on Python 2 and 3
        print('%s %s' % (user.full_name, gender))
class GenderizeIO(object): __name__ = 'genderizeio' def __init__(self, api_key=None): self._api = Genderize(api_key=api_key) def gender(self, artist): first_name = artist.split(' ')[0] response = self._api.get([first_name]) if response[0]['gender'] is not None \ and response[0]['probability'] > 0.75 \ and response[0]['count'] > 50: return response[0]['gender'] else: return None
def __init__(self, data, key=None, gender_file=None):
    """
    Main constructor of the class where the original dataframe is
    provided.

    When ``gender_file`` is given, each of its lines is split on tabs and
    column 1 (name) / column 2 (gender) pre-populate ``self.gender``.

    :param data: original dataframe
    :param key: genderize key (optional)
    :param gender_file: file with gender info, used as cache
    :type data: pandas.DataFrame
    :type key: string
    :type gender_file: string (as filepath)
    """
    from genderize import Genderize

    self.data = data
    self.gender = {}  # init the name-gender dictionary
    self.key = key
    self.gender_file = gender_file

    # Init the genderize connection (authenticated only if a key is given)
    self.connection = Genderize(api_key=self.key) if self.key else Genderize()

    if self.gender_file:
        # This file is used as cache for the gender info
        # This helps to avoid calling once and again to the API
        # TODO: fix hardcoded code when reading columns and using
        # separators
        with open(gender_file, "r") as fd:
            for line in fd:
                # Drop the line terminator so it cannot end up in a value.
                gender_data = line.rstrip("\n").split("\t")
                if len(gender_data) < 3:
                    continue  # blank or malformed line
                self.gender[gender_data[1]] = {
                    "gender_analyzed_name": gender_data[1],
                    "gender": gender_data[2]
                }
def dumpintoexcel(fullname, salutation, designation, namelist, sociallinks,
                  countries, imagelinks, last_updated):
    """Append one leader's record to ScrappedData/PoliticalLeaders.csv.

    The gender is looked up on genderize.io from ``namelist.first``; any
    failure, or an empty or missing answer, is recorded as ``'Unknown'``.
    No header row is written.

    :param fullname: full name as scraped
    :param salutation: written to the ``Title`` column
    :param designation: written to the ``Designation`` column
    :param namelist: object with ``first``/``middle``/``last`` attributes,
        or a falsy value when the name could not be parsed
    :param sociallinks: iterable joined with commas into ``Contact``
    :param countries: sequence whose first item is the ``Country``
    :param imagelinks: iterable joined with commas into ``Images``
    :param last_updated: written to the ``Last Updated`` column
    """
    Gender = 'Unknown'
    if namelist:
        try:
            from genderize import Genderize
            for y in Genderize().get([namelist.first]):
                # None and '' both count as unknown (the old
                # ``!= None or != ''`` test was always true).
                Gender = y['gender'] if y['gender'] else 'Unknown'
        except Exception:
            # best effort: a failed lookup must not lose the record
            Gender = 'Unknown'

    links = ','.join([str(elem) for elem in sociallinks])
    images = ','.join([str(elem) for elem in imagelinks])

    if namelist:
        first_name = namelist.first
        middle_name = namelist.middle
        last_name = namelist.last
    else:
        first_name = ""
        middle_name = ""
        last_name = ""

    infotext = [{
        'Country': countries[0],
        'Full Name': fullname,
        'First Name': first_name,
        'Middle Name': middle_name,
        'Last Name': last_name,
        'Gender': Gender,
        'Title': salutation,
        'Designation': designation,
        'Contact': links,
        'Images': images,
        'Last Updated': last_updated,
    }]
    keys = [
        'Country', 'Full Name', 'First Name', 'Middle Name', 'Last Name',
        'Gender', 'Title', 'Designation', 'Contact', 'Images', 'Last Updated'
    ]
    # NOTE(review): on Python 3 the csv docs recommend ``newline=''`` here.
    with open('ScrappedData/PoliticalLeaders.csv', 'a') as dept:
        writer = csv.DictWriter(dept, fieldnames=keys)
        writer.writerows(infotext)
def genderize_function(name):
    """Return ``(gender, count, probability)`` for the first word of
    ``name`` that genderize.io can gender.

    The name is lower-cased and stripped of everything except ``a``-``z``
    and spaces, then each word is looked up in turn.

    :param name: free-form name string
    :return: the tuple for the first gendered word,
        ``('unknown', 'unknown', 'unknown')`` when no word is gendered,
        or ``None`` when the API raises ``GenderizeException``
    """
    # lowercase, then remove all non-letters
    names = re.sub('[^a-z ]', '', name.lower()).split()
    print(names)
    if not names:
        return ('unknown', 'unknown', 'unknown')

    try:
        # one client for all words instead of one per word
        client = Genderize(api_key=GENDERIZE_API_KEY)
        for x in names:
            gen = client.get([x])
            print(gen)
            if gen[0]['gender'] is not None:
                return (gen[0]['gender'], gen[0]['count'],
                        gen[0]['probability'])
        return ('unknown', 'unknown', 'unknown')
    except genderize.GenderizeException:
        return None
def get_author_gender():
    """
    Return the gender of the author given in the ``name`` query argument.

    The ``authors`` table is consulted first. On a miss the first word of
    the name is looked up on genderize.io and the answer is inserted into
    the table with source ``"genderize"``.

    :return: dict with the keys ``name``, ``gender`` and ``source``
    """
    name = request.args.get('name')
    query = "select authors.gender, authors.gender_source from authors where authors.name = %s"
    conn = getConnection()
    try:
        cur = conn.cursor()
        cur.execute(query, (name, ))
        rows = cur.fetchall()
        if not rows:
            # NOTE(review): the API key is hard-coded in source; it should
            # come from configuration or the environment.
            gender = Genderize(
                api_key="a619730661a7ce6b4f8e8e6b047046a2").get(
                    [name.split(" ")[0]])[0]["gender"]
            source = "genderize"
            insert = "insert into authors (name, gender, gender_source) values (%s, %s, %s)"
            cur.execute(insert, (name, gender, source))
            # Closing without a commit would discard the insert.
            conn.commit()
        else:
            gender = rows[0][0]
            source = rows[0][1]
        print(query, rows)
    finally:
        conn.close()
    return {"name": name, "gender": gender, "source": source}
def get_gender(df, column):
    """Return the genderize.io gender for every value in ``df[column]``.

    The names are sent in batches of ten, each through a fresh
    ``Genderize`` client.

    :param df: mapping-like object indexable by ``column``
    :param column: key of the column holding the names
    :return: list of the ``gender`` values from the API answers
    """
    names = list(df[column])
    batches = [names[start:start + 10] for start in range(0, len(names), 10)]
    responses = [Genderize().get(batch) for batch in batches]
    return [entry['gender'] for response in responses for entry in response]
def get_gender(unique_name):
    """
    Request the genderize API for the gender of each name in a list.

    A successful answer is saved to ``extern_data/gender.json`` and
    returned. If the API raises ``GenderizeException``, the last saved
    answer is returned instead.

    :param unique_name: list of names to look up
    :return: the API answer, the cached answer, or ``None`` when the
        request failed and no cache file exists
    """
    fp = os.path.join('extern_data', 'gender.json')
    try:
        genders = Genderize().get(unique_name)
    except GenderizeException:
        print("Request limit")
        if os.path.exists(fp):
            with open(fp) as data_file:
                return json.load(data_file)
        return None

    # ``with`` closes the handle even if serialisation fails.
    with open(fp, "w") as f:
        json.dump(genders, f)
    return genders
def call_genderizeio(auth_df):
    """Resolve still-unknown author genders through genderize.io.

    Rows whose ``clean_gender`` is ``'unknown'`` and whose ``forename``
    is not ``'NAN'`` and longer than two characters are candidates. If
    the API's remaining rate limit covers the distinct forenames, they
    are looked up, the answers are saved to ``unknown_names.csv``, and
    male/female answers are merged back into ``clean_gender``.

    :param auth_df: dataframe with ``clean_gender`` and ``forename``
    :return: the dataframe, with a ``genderize_return`` column added
        when the lookup ran
    """
    def _unresolved(frame):
        # rows that still have no usable gender
        return frame[(frame['clean_gender'] == 'unknown') &
                     (frame['forename'] != 'NAN') &
                     (frame['forename'].str.len() > 2)]

    unk_df = _unresolved(auth_df)
    print('There are ' + str(len(unk_df)) +
          ' names we dont know about before calling genderize...')
    unk_df = pd.DataFrame(unk_df['forename'].drop_duplicates())

    # Probe request, made only to read the rate-limit header.
    r = requests.get("https://api.genderize.io?name=test")
    remaining = r.headers['X-Rate-Limit-Remaining']
    print(remaining + ' genderize calls remaining...!')

    if int(remaining) >= len(unk_df):
        genderize_return = Genderize().get(unk_df['forename'].to_list())
        unk_df['genderize_return'] = ''
        # answers are matched to the rows by position
        for position, index in enumerate(unk_df.index):
            unk_df.at[index, 'genderize_return'] = \
                genderize_return[position]['gender']
        unk_df.to_csv('unknown_names.csv')

        resolved = (unk_df['genderize_return'] == 'male') | \
                   (unk_df['genderize_return'] == 'female')
        unk_df = unk_df[resolved]
        auth_df = pd.merge(auth_df, unk_df, how='left',
                           left_on='forename', right_on='forename')
        for label in ('male', 'female'):
            auth_df['clean_gender'] = np.where(
                auth_df['genderize_return'] == label,
                auth_df['genderize_return'],
                auth_df['clean_gender'])

        unk_df = _unresolved(auth_df)
        print('After calling the genderize API, there are still ' +
              str(len(unk_df)) + ' names which we dont know about!...')
    return auth_df
def trouver_sexe_prof(prenom):
    """Return a French gender label for the first name ``prenom``.

    :param prenom: first name to look up on genderize.io
    :return: ``"femme"`` or ``"homme"`` when the API reports that gender
        with probability of at least 0.75,
        ``"impossible à déterminer"`` when it reports no gender,
        and ``None`` when the probability is below 0.75
    """
    labels = {"female": "femme", "male": "homme"}
    for sex in Genderize().get([prenom]):
        # Read fields by key: positions in ``dict.values()`` depend on
        # the order in which the response happens to be built.
        gender = sex.get('gender')
        if gender is None:
            return "impossible à déterminer"
        probability = sex.get('probability')
        if (gender in labels and probability is not None
                and probability >= 0.75):
            return labels[gender]
    # low-confidence answers keep yielding None, as before
    return None
def get_genderize(api_key):
    """
    Build and return a Genderize client for ``api_key``.

    Useful when this script is used as a module.
    """
    return Genderize(user_agent='GenderizeDocs/0.0', api_key=api_key)
class Gender(Enrich):
    """ This class creates four new columns with the gender of the name
    provided
    """

    def __init__(self, data, key=None, gender_file=None):
        """
        Main constructor of the class where the original dataframe is
        provided.

        When ``gender_file`` is given, each of its lines is split on tabs
        and column 1 (name) / column 2 (gender) pre-populate
        ``self.gender``.

        :param data: original dataframe
        :param key: genderize key (optional)
        :param gender_file: file with gender info, used as cache
        :type data: pandas.DataFrame
        :type key: string
        :type gender_file: string (as filepath)
        """
        from genderize import Genderize

        self.data = data
        self.gender = {}  # init the name-gender dictionary
        self.key = key
        self.gender_file = gender_file

        # Init the genderize connection (authenticated only with a key)
        self.connection = (Genderize(api_key=self.key) if self.key
                           else Genderize())

        if self.gender_file:
            # This file is used as cache for the gender info
            # This helps to avoid calling once and again to the API
            # TODO: fix hardcoded code when reading columns and using
            # separators
            with open(gender_file, "r") as fd:
                for line in fd:
                    # Drop the line terminator so it cannot end up in
                    # a value.
                    gender_data = line.rstrip("\n").split("\t")
                    if len(gender_data) < 3:
                        continue  # blank or malformed line
                    self.gender[gender_data[1]] = {
                        "gender_analyzed_name": gender_data[1],
                        "gender": gender_data[2]
                    }

    def enrich(self, column):
        """ This method calculates thanks to the genderize.io API the
        gender of a given name.

        This method initially assumes that for the given string, only
        the first word is the one containing the name,
        eg: Daniel Izquierdo <*****@*****.**>, Daniel would be the name.

        If the same class instance is used in later gender searches, this
        stores in memory a list of names and associated gender and
        probability. This is intended to have faster identifications of
        the gender and less number of API accesses.

        :param column: column where the name is found
        :type column: string

        :return: original dataframe with four new columns:
         * gender: the API value, "NotKnown" when the API has no answer,
           "Unknown" when the name could not be looked up
         * gender_probability: value between 0 and 1
         * gender_count: number of names found in the Genderized DB
         * gender_analyzed_name: name that was sent to the API
           for analysis
        :rtype: pandas.DataFrame
        """
        if column not in self.data.columns:
            return self.data

        # The first space-separated word is taken as the name.
        first_words = self.data[column].str.split(" ").str[0]
        self.data["gender_analyzed_name"] = first_words.fillna("noname")
        # Float default: probabilities are floats, and assigning a float
        # into an integer column is deprecated in recent pandas.
        self.data["gender_probability"] = 0.0
        self.data["gender"] = "Unknown"
        self.data["gender_count"] = 0

        for name in self.data["gender_analyzed_name"].unique():
            if name in self.gender:
                gender_result = self.gender[name]
            else:
                try:
                    # TODO: some errors found due to encode utf-8 issues.
                    # Adding a try-except in the meantime.
                    gender_result = self.connection.get([name])[0]
                except Exception:
                    continue

            # Store info in the list of users
            self.gender[name] = gender_result

            # Update current dataset
            if gender_result["gender"] is None:
                gender_result["gender"] = "NotKnown"
            rows = self.data["gender_analyzed_name"] == name
            self.data.loc[rows, 'gender'] = gender_result["gender"]
            # Entries loaded from the cache file carry no
            # probability/count.
            if "probability" in gender_result:
                self.data.loc[rows, 'gender_probability'] = \
                    gender_result["probability"]
                self.data.loc[rows, 'gender_count'] = gender_result["count"]

        return self.data