def find_out_genders(twitterUserData):
    """Guess a gender for every Twitter user from the first token of their
    display name, back the data up to 'backup2.json', and print a summary.

    Fixes over the previous version:
      * a user whose name was exactly '.' got no 'gender' key at all, which
        made the summary loop below raise KeyError — now always 'unknown';
      * detector.guess raises KeyError for names outside its dataset (other
        scripts in this file guard it the same way) — now caught.

    Parameters:
        twitterUserData: dict mapping screen name -> user dict with at least
            'name' and 'screen_name' keys. Mutated in place.

    Returns:
        The same dict, with a 'gender' entry added for every user.
    """
    detector = GenderDetector('us')
    for screenName in twitterUserData:
        print("Getting gender for: {0} ".format(screenName))
        nam = twitterUserData[screenName]['name']
        # Use only the first token when the display name has several words.
        toCheck = nam.split(" ")[0] if " " in nam else nam
        if nam == '.':
            gender = 'unknown'
        else:
            try:
                gender = detector.guess(toCheck)
            except KeyError:
                # Name not present in the detector's dataset.
                gender = 'unknown'
        twitterUserData[screenName]['gender'] = gender
    print("Creating backup file:")
    with open('backup2.json', 'w') as f:
        json.dump(twitterUserData, f)
    print("Results")
    for result in twitterUserData:
        print(result)
        print("\tid: {0}".format(result))
        print("\tscreenName: {0}".format(twitterUserData[result]['screen_name']))
        print("\tgender: {0}".format(twitterUserData[result]['gender']))
    return twitterUserData
def determineGenders(twitterUserData):
    """Annotate every user dict with a guessed gender, dump a backup JSON
    file, and print a per-user summary. Returns the mutated dict.

    NOTE(review): when a user's name is exactly '.', no 'gender' key is
    written, so the summary loop below raises KeyError for that user —
    preserved here as-is; confirm whether that is intended.
    """
    detector = GenderDetector('us')
    for handle, info in twitterUserData.items():
        print("Getting gender for: {0} ".format(handle))
        display_name = info['name']
        #print(display_name)
        if display_name != '.':
            if " " in display_name:
                # Multi-token name: guess from the first token only.
                info['gender'] = detector.guess(display_name.split(" ")[0])
            else:
                print(display_name)
                info['gender'] = detector.guess(display_name)
    print("Creating backup file:")
    with open('backup2.json', 'w') as backup:
        json.dump(twitterUserData, backup)
    #print(twitterUserData)
    print("Results")
    for handle in twitterUserData:
        print(handle)
        print("\tid: {0}".format(handle))
        print("\tscreenName: {0}".format(twitterUserData[handle]['screen_name']))
        print("\tgender: {0}".format(twitterUserData[handle]['gender']))
    return twitterUserData
def gender_identify(name):
    """Extract a name with the module-level `pattern` regex and guess its
    gender using the UK dataset.

    Returns the detector's guess ('male'/'female'/'unknown'), or None when
    the regex finds nothing or the detector has no entry for the name.

    Fix: the old bare `except:` swallowed every exception (including bugs
    like NameError); narrowed to the two failure modes this code actually
    produces — IndexError (no regex match) and KeyError (name missing from
    the gender_detector dataset).
    """
    matches = re.findall(pattern, name, re.M)
    detector = GenderDetector('uk')
    try:
        candidate = matches[0]
        # Keep only the first token of a multi-word match.
        if ' ' in candidate:
            candidate = candidate.split(" ")[0]
        return detector.guess(candidate)
    except (IndexError, KeyError):
        return None
def user(request, user_id):
    """Render the detail page for one recommendation user, re-creating each
    shelve that also appears in the user's most common shelves with
    best=True so the template can highlight it."""
    connect()
    user = dict(models.Recommendation.get(user_gid=user_id))
    detector = GenderDetector('us')
    tokens = user['username'].replace('[', '').split(" ")
    gender = ""
    try:
        gender = detector.guess(tokens[0])
    except Exception:
        # Detector raises for names outside its dataset; fall back.
        gender = "unknown"
    user['gender'] = gender
    for j, book in enumerate(user['books_details_recommended']):
        for i, shelve in enumerate(book['list_shelves']):
            for best_shelve in user['most_common_shelves']:
                if best_shelve.shelve == shelve.shelve:
                    old = user['books_details_recommended'][j]['list_shelves'][i]
                    # Rebuild the shelve with best=True (models.shelve is
                    # presumably immutable -- TODO confirm).
                    replacement = models.shelve(
                        count=old.count,
                        votes=old.votes,
                        gid=old.gid,
                        best=True,
                        shelve=old.shelve,
                    )
                    user['books_details_recommended'][j]['list_shelves'][i] = replacement
    return render(request, 'recom/user.html', {'user': user})
def predict_gender(name_phrase):
    """Guess a gender from all tokens of a name phrase.

    Returns 'male' only when every recognized token is male, 'female' when
    any recognized token is female, and '' when nothing is recognized.
    """
    detector = GenderDetector(country='us', unknown_value='')
    tokens = _strip_punctuation(name_phrase).split(' ')
    guesses = []
    for token in tokens:
        guess = detector.guess(token)
        if guess:
            guesses.append(guess)
    if guesses and all(g == 'male' for g in guesses):
        return 'male'
    if 'female' in guesses:
        return 'female'
    return ''
def index(request):
    """List every recommendation whose common shelves were retrieved,
    sorted by precision (descending), each annotated with a guessed
    gender inferred from the first token of the username."""
    connect()
    cursor = connection.cursor()
    rows = cursor.execute("SELECT id, user_gid, precision, recall, username, gender FROM prs.recommendation where common_shelves_retrieved=true")
    users = sorted(rows, key=lambda row: row['precision'], reverse=True)
    detector = GenderDetector('us')
    for idx, entry in enumerate(users):
        tokens = entry['username'].replace('[', '').split(" ")
        guessed = ""
        try:
            guessed = detector.guess(tokens[0])
        except Exception:
            # Detector raises for names it does not know; fall back.
            guessed = "unknown"
        users[idx] = dict(models.Recommendation(
            user_gid=entry['user_gid'],
            gender=guessed,
            precision=entry['precision'],
            recall=entry['recall'],
            username=entry['username'],
        ))
    return render(request, 'recom/index.html', {'users': users})
def user(request, user_id):
    # Detail view for one recommendation user (near-duplicate of the view
    # defined earlier in this file): guess the user's gender from the first
    # token of the username, then mark each shelve that also appears in the
    # user's most common shelves as best=True for the template.
    connect()
    user=dict(models.Recommendation.get(user_gid=user_id))
    # Username may look like "[First Last ...]"; strip the bracket, split.
    name=user['username'].replace('[', '').split(" ")
    detector = GenderDetector('us')
    gender=""
    try:
        gender=detector.guess(name[0])
    except Exception as e:
        # The detector raises for names outside its dataset; fall back.
        gender="unknown"
    user['gender']=gender
    for j, book in enumerate(user['books_details_recommended']):
        for i, shelve in enumerate(book['list_shelves']):
            for best_shelve in user['most_common_shelves']:
                if(best_shelve.shelve==shelve.shelve):
                    # Re-create the shelve with best=True (presumably
                    # models.shelve is immutable -- TODO confirm).
                    last_shelve=user['books_details_recommended'][j]['list_shelves'][i]
                    new_shelve=models.shelve(count=last_shelve.count,votes=last_shelve.votes, gid=last_shelve.gid, best=True,shelve=last_shelve.shelve)
                    user['books_details_recommended'][j]['list_shelves'][i]=new_shelve
    return render(request, 'recom/user.html', {'user': user})
def index(request):
    # List view (near-duplicate of the earlier index in this file): every
    # recommendation whose common shelves were retrieved, sorted by
    # precision, each annotated with a guessed gender.
    connect()
    cursor = connection.cursor()
    rows = cursor.execute(
        "SELECT id, user_gid, precision, recall, username, gender FROM prs.recommendation where common_shelves_retrieved=true"
    )
    users = sorted(rows, key=lambda k: k['precision'], reverse=True)
    detector = GenderDetector('us')
    for i, user in enumerate(users):
        # Username may look like "[First Last ...]"; guess from first token.
        name = user['username'].replace('[', '').split(" ")
        gender = ""
        try:
            gender = detector.guess(name[0])
        except Exception as e:
            # The detector raises for names it does not know; fall back.
            gender = "unknown"
        new_user = dict(
            models.Recommendation(user_gid=user['user_gid'],
                                  gender=gender,
                                  precision=user['precision'],
                                  recall=user['recall'],
                                  username=user['username']))
        users[i] = new_user
    return render(request, 'recom/index.html', {'users': users})
def get_nominees(year):
    '''Nominees is a dictionary with the hard coded award names as keys, and each entry a list of strings. Do NOT change the name of this function or what it returns.'''
    # Bail out with an empty dict when the tweet corpus for this year
    # cannot be loaded.
    if get_tweets(year) == False:
        return {}
    award_tweet_dict = get_award_tweet_dict(year)
    # Per-award result lists and raw candidate-name lists.
    nominees = {award:[] for award in OFFICIAL_AWARDS}
    nominee_names = {award:[] for award in OFFICIAL_AWARDS}
    # Words that belong to award titles, never to people's names.
    stoplist = ['best','-','award','for','or','made', 'in', 'a', 'by', 'performance', 'an','golden','globes','role','the']
    clean_award_names = {award:[a for a in award.split() if not a in stoplist] for award in OFFICIAL_AWARDS}
    cfd = {}
    # Awards whose nominees are people rather than works.
    person_award_identifiers = ["director","actor","actress","demille"]
    # Two capitalized words in a row, e.g. "Jane Smith".
    name_pattern = re.compile(r'[A-Z][a-z]+\s[A-Z][a-z]+')
    # Pass 1: collect candidate person names for each person-award.
    for award in OFFICIAL_AWARDS:
        if any(identifier in award for identifier in person_award_identifiers):
            for tweet in award_tweet_dict[award]:
                names = re.findall(name_pattern, tweet)
                for name in names:
                    # Reject "names" containing any award-title/stoplist word.
                    award_not_in_name = True
                    for word in clean_award_names[award]+stoplist:
                        award_not_in_name = award_not_in_name and not word in name.lower().split()
                    if award_not_in_name:
                        nominee_names[award].append(name)
    detector = GenderDetector('us')
    # Pass 2: turn candidates into nominees. Actor/actress awards filter the
    # 50 most frequent candidates by inferred gender; director/demille keep
    # any recognized name; other awards fall back to bigram frequency.
    for award in OFFICIAL_AWARDS:
        if 'actor' in award:
            cfd[award] = nltk.FreqDist(nominee_names[award])
            most_common = cfd[award].most_common(50)
            for name in most_common:
                # `name` is a (string, count) pair; guess from its first word.
                gender = detector.guess(name[0].split()[0])
                if gender == 'male':
                    nominees[award].append(name[0])
        elif 'actress' in award:
            cfd[award] = nltk.FreqDist(nominee_names[award])
            most_common = cfd[award].most_common(50)
            for name in most_common:
                gender = detector.guess(name[0].split()[0])
                if gender == 'female':
                    nominees[award].append(name[0])
        elif any(identifier in award for identifier in ['director','demille']):
            cfd[award] = nltk.FreqDist(nominee_names[award])
            most_common = cfd[award].most_common(50)
            for name in most_common:
                gender = detector.guess(name[0].split()[0])
                # Either gender qualifies, as long as the detector knows it.
                if gender != 'unknown':
                    nominees[award].append(name[0])
        else:
            # Non-person award: rank capitalized, non-mention/non-hashtag
            # bigrams from non-retweet tweets and keep the top five.
            winner_stoplist = ["musical","comedy","motion", "picture","golden","globe","movie","television","best","or","tv","original","series","animated","feature","film","song","drama","-","rt","to","goes","foreign",'the']
            bigrams = []
            for tweet in award_tweet_dict[award]:
                if tweet[:2] == "RT":
                    continue
                tweet_bigrams = nltk.bigrams(tweet.split())
                trimmed = [b for b in tweet_bigrams if b[0].lower() not in winner_stoplist and b[1].lower() not in winner_stoplist and b[0][0] == b[0][0].upper()]
                bigrams += [b for b in trimmed if b[0][0] != "@" and b[1][0] != "@" and b[0][0] != "#" and b[1][0] != "#"]
            cfd[award] = nltk.FreqDist([' '.join(b) for b in bigrams])
            nominees[award] = [n[0] for n in cfd[award].most_common(5)]
    # print "\n"
    # nominees = {award: [a[0] for a in cfd[award].most_common(5)] for award in OFFICIAL_AWARDS}
    return nominees
class TwitterTransformer():
    # ETL step (Python 2): pull un-analyzed tweets out of the data-lake
    # `records` table, enrich them (sentiment, geo coordinates, guessed
    # gender, tokens/ngrams) and insert the result into the `tweets` table,
    # marking each source record as analyzed.

    def __init__(self):
        self.gender_detector = GenderDetector()
        self.googlemaps_api = GoogleMaps(
            api_key=app_settings.SERVICES_CREDENTIALS['google_api_key'])

    def process(self, ds, **kwargs):
        # Entry point. `ds`/`kwargs` are unused — presumably an
        # Airflow-style callable signature; TODO confirm with the caller.
        raw_records = self.__fetch_tweets()
        print "{} new tweets have been analyzed".format(len(raw_records))
        conn_db_lake = db_handler.get_connection('data_lake')
        cur_db_lake = conn_db_lake.cursor()
        for record in raw_records:
            # Record layout inferred from the uses below:
            # record[0]=id, record[2]=tweet JSON, record[3]=created_at.
            # NOTE(review): verify against the `records` schema.
            tweet = record[2]
            clean_tweet = self.__tweet_cleaner(tweet['text'])
            print clean_tweet
            polarity, sentiment = self.__get_sentiment(clean_tweet)
            coordinates = self.__get_go_points(tweet['user']['location'])
            gender = self.__guess_gender(tweet['user']['name'].split()[0])
            tweet_tokens = self.__tokenizer(clean_tweet)
            processed_tweet = {
                "author": tweet["user"]["screen_name"],
                "tweet_geo": tweet['geo'],
                "tweet_lang": tweet['lang'],
                "tweet_place": tweet['place'],
                "user_description": tweet['user']['description'],
                "user_followers_count": tweet['user']['followers_count'],
                "user_friends_count": tweet['user']['friends_count'],
                "user_lang": tweet['user']['lang'],
                "user_name": tweet['user']['name'],
                "user_location_name": tweet['user']['location'],
                "user_location_coordinate": {
                    "lat": coordinates[0],
                    "lon": coordinates[1]
                } if coordinates else None,
                "user_status_count": tweet['user']['statuses_count'],
                "tweet_created_at": str(parser.parse(tweet['created_at'])),
                "user_created_at": str(parser.parse(tweet['user']['created_at'])),
                "tweet_tokens": tweet_tokens,
                'bigrams': ["_".join(x) for x in bigrams(tweet_tokens)],
                'trigrams': ["_".join(x) for x in trigrams(tweet_tokens)],
                "polarity": polarity,
                "sentiment": sentiment,
                "gender": gender,
            }
            try:
                update_query = """
                    UPDATE records SET is_analyzed=TRUE WHERE id={};
                """.format(record[0])
                # Single quotes in the JSON payload are doubled for SQL.
                # NOTE(review): string-built SQL — if record contents are
                # untrusted this is injectable; parameterized queries would
                # be safer.
                query = """INSERT INTO tweets (data, created_at) VALUES ('{}', '{}')""".format(
                    json.dumps(processed_tweet).replace("'", "''"), record[3])
                cur_db_lake.execute(query)
                cur_db_lake.execute(update_query)
                conn_db_lake.commit()
            except Exception as ex:
                # Keep insert + mark-analyzed atomic: roll back both.
                conn_db_lake.rollback()
                raise ex

    def __fetch_tweets(self):
        # Return all rows from `records` not yet analyzed.
        try:
            conn_db_lake = db_handler.get_connection('data_lake')
            cur_db_lake = conn_db_lake.cursor()
            query = """
                SELECT * FROM records WHERE type='tweet' AND is_analyzed = false
            """
            cur_db_lake.execute(query)
            return cur_db_lake.fetchall()
        except Exception as ex:
            conn_db_lake.rollback()
            raise ex

    def __guess_gender(self, name):
        # Guess a gender for a first name; returns None (implicitly) when
        # the detector raises.
        gender = None
        try:
            gender = self.gender_detector.guess(name)
            return gender
        except Exception as e:
            print('error in gender detector')

    def __get_go_points(self, address):
        # Geocode a free-text user location; returns [lat, lng] or None.
        if not address:
            return None
        coordinate = None
        try:
            res = self.googlemaps_api.search(
                address.strip(string.punctuation + ' ')).first()
            if res:
                coordinate = [res.lat, res.lng]
        except Exception as ex:
            print("Err in geo location convertor")
        return coordinate

    def __tweet_cleaner(self, tweet):
        # Normalize tweet text before sentiment analysis / tokenization.
        # Convert to lower case
        tweet = tweet.lower()
        # Convert www.* or https?://* to empty string
        tweet = re.sub('((www\.[\s]+)|(https?://[^\s]+))', '', tweet)
        # Convert @username to empty string
        tweet = re.sub('@[^\s]+', '', tweet)
        # Remove additional white spaces
        tweet = re.sub('[\s]+', ' ', tweet)
        # Replace #word with word
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
        # trim
        tweet = tweet.strip('\'"')
        return tweet

    def __get_sentiment(self, tweet):
        # TextBlob polarity in [-1, 1], mapped to a coarse label.
        res = TextBlob(tweet)
        polarity = res.sentiment.polarity
        if polarity < 0:
            sentiment = 'negative'
        elif polarity == 0:
            sentiment = 'neutral'
        else:
            sentiment = 'positive'
        return (polarity, sentiment)

    def __tokenizer(self, tweet):
        # Keep distinct dictionary words longer than 3 characters that are
        # not English stopwords.
        tokens = []
        for word in tweet.split():
            if len(word) > 3 and word not in stopwords.words(
                    'english') and wordnet.synsets(word):
                tokens.append(word)
        return list(set(tokens))
class BylineGender():
    # Resolve article byline strings to genders (Python 2), combining a
    # first-name gender detector with a manually curated (org, name) ->
    # gender spreadsheet downloaded from Google Sheets.

    def __init__(self):
        self.detector = GenderDetector('us')
        self.load_name_org_online()

    def byline_gender(self,byline):
        # Count guessed genders over every first name found in the byline.
        gender_result = {"female":0, "male":0,"unknown":0}
        for name in self.get_first_names(byline):
            if(name is None):
                gender_result["unknown"] += 1
            else:
                # str() folds the detector's result into a dict key.
                gender_result[str(self.detector.guess(name))] += 1
        return gender_result

    def single_name_gender(self,name):
        # Gender for a single byline; "unknown" when no usable first name.
        if(name is None or len(name.strip()) == 0):
            return "unknown"
        n = self.get_first_names(name.strip())
        if len(n) > 0 and n[0] is not None:
            return self.detector.guess(n[0])
        return "unknown"

    def single_name_gender_ascii(self,name):
        # Same as single_name_gender but ASCII-folds the name first.
        name = unicodedata.normalize('NFKD', name).encode('ascii','ignore')
        if(name is None or len(name.strip()) == 0):
            return "unknown"
        n = self.get_first_names(name.strip())
        if len(n) > 0 and n[0] is not None:
            return self.detector.guess(n[0])
        return "unknown"

    # needs error handling
    def load_name_org_online(self):
        # Download the curated spreadsheet into
        # self.online_names[org][name] = {'gender': ..., 'count': ...}.
        self.online_names = {}
        url = "https://docs.google.com/spreadsheets/d/1TTX5ymLPjefIrHep2QmHZNb76VunFwfq0x6FNAXjUZk/export?format=csv"
        response = requests.get(url)
        csv_string = response.content
        f = StringIO.StringIO(csv_string)
        eval_base = csv.reader(f, delimiter=',')
        # Skip the header row.
        eval_base.next()
        try:
            for row in eval_base:
                # '+' is used as a space placeholder in the sheet.
                org = row[0].decode("iso-8859-1").replace(u"+",u" ")
                name = row[1].decode("iso-8859-1").replace(u"+",u" ")
                gender = row[4]
                if(not org in self.online_names.keys()):
                    self.online_names[org]= {}
                if gender in ['male','female','unknown','ignore']:
                    self.online_names[org][name] = {}
                    self.online_names[org][name]['gender'] = gender
                    self.online_names[org][name]['count'] = row[3]
                    #print "{0},{1},{2}".format(org,name,gender)
        except:
            print "download unsuccessful", sys.exc_info()[0]

    def org_name_gender(self,org,name):
        # org, name, manual_gender, single_name_gender
        # Prefer the manually curated gender for (org, name); otherwise fall
        # back to inference. Every decision is appended to byline_gender.log.
        #name = self.to_ascii(name)
        #org = self.to_ascii(org)
        org = org.replace(u",",u" ")
        f = codecs.open("byline_gender.log","a", "utf8")
        f.write(','.join([org,name])+ "\n")
        manual_gender=None
        #exclude = set(string.punctuation)
        try:
            inferred_gender = self.single_name_gender(name)
        except KeyError:
            # Detector KeyError on non-ASCII names: retry with ASCII folding.
            asciiname = unicodedata.normalize('NFKD', name).encode('ascii','ignore')
            inferred_gender = self.single_name_gender(asciiname)
        if org in self.online_names.keys():
            if name in self.online_names[org].keys():
                manual_gender = self.online_names[org][name]['gender']
                f.write(','.join([u'MANUAL',org,name,manual_gender,u""])+ "\n")
            else:
                f.write(','.join([u'SEMI',org,name,inferred_gender,u"",str(self.online_names[org].keys())])+ "\n")
        else:
            f.write(','.join([u"INFERENCE",org,name,u"",inferred_gender])+ "\n")
        if manual_gender in ['male','female','unknown','ignore']:
            f.close()
            return manual_gender
        f.close()
        return inferred_gender

    def online_org_name_gender(self,org,name):
        # Lookup-only variant: curated gender, or "" when absent.
        #name = self.to_ascii(name)
        #org = self.to_ascii(org)
        try:
            #print u"{0},{1}".format(org,name)
            if org in self.online_names.keys():
                if name in self.online_names[org].keys():
                    return self.online_names[org][name]['gender']
            return ""
        except UnicodeError:
            # Debug hook left in by the author.
            import pdb;pdb.set_trace()

    #org names is a dictionary in the format
    #org_names[org][name] = number of articles
    def export_org_names(self,org_names,f):
        # Write one CSV line per (org, name): locally observed names first
        # (with article counts), then curated names not seen locally.
        o_names = self.online_names.copy()
        for org in org_names.keys():
            for name in org_names[org].keys():
                if org in o_names.keys() and name in o_names[org].keys():
                    o_names[org].pop(name, None)
                f.write(','.join([org.replace(u",",u"+").replace(u" ",u"+"),name.replace(u",",u"+").replace(u" ",u"+"),self.single_name_gender(name),str(org_names[org][name]),self.online_org_name_gender(org,name)])+ "\n")
        for org in o_names.keys():
            for name in o_names[org].keys():
                f.write(','.join([org.replace(u",",u"+").replace(u" ",u"+"),name.replace(u",",u"+").replace(u" ",u"+"),self.single_name_gender(name),str(self.online_names[org][name]['count']),self.online_org_name_gender(org,name)])+ "\n")

    def strip_extras(self, byline):
        # Strip titles, honorifics and trailing role descriptions from a
        # (lower-cased) byline so only the bare name remains.
        byline = re.sub(r'general sir ','',byline)
        byline = re.sub(r'american way: ','',byline)
        byline = re.sub(r'president','',byline)
        byline = re.sub(r'sir','',byline)
        byline = re.sub(r'gov(\.)?','',byline)
        byline = re.sub(r'rep(\.)?','',byline)
        byline = re.sub(r'prof','',byline)
        byline = re.sub(r'professor','',byline)
        byline = re.sub(r'.*?rt rev(d)?','',byline)
        byline = re.sub(r'\n.*','',byline)
        #telegraph cleaning
        #byline = re.sub(r'london-based.*','',byline)
        #byline = re.sub(r'london researcher.*','',byline)
        #byline = re.sub(r' of the.*','',byline)
        #byline = re.sub(r'telegraph tv critic','',byline)
        #byline = re.sub(r'broadcaster','',byline)
        #byline = re.sub(r'interview: ','',byline)
        #byline = re.sub(r'commentary: ','',byline)
        #byline = re.sub(r'telegraph travel writer','',byline)
        #byline = re.sub(r'on gigolo','',byline)
        byline = re.sub(r'more stories by ','',byline)
        byline = re.sub(r'view author.*','',byline)
        byline = re.sub(r'founder of.*','',byline)
        byline = re.sub(r' is (a)?.*','',byline)
        byline = re.sub(r' covers.*','',byline)
        byline = re.sub(r' in .*','',byline)
        byline = re.sub(r' info.*','',byline)
        byline = re.sub(r' writes .*','',byline)
        byline = re.sub(r'graphic(s)? by(:)?','',byline)
        byline = re.sub(r'compiled ','',byline)
        byline = re.sub(r'exclusive ','',byline)
        byline = re.sub(r'special dispatch' ,'',byline)
        byline = re.sub(r'as told to ','',byline)
        byline = re.sub(r' for .*','',byline)
        # NOTE(review): r' .*' truncates everything after the first space,
        # which would make the later substitutions unreachable — possibly a
        # garbled pattern; confirm against the original source.
        byline = re.sub(r' .*','',byline)
        byline = re.sub(r'interview(ed)?(s)? ','',byline)
        byline = re.sub(r' at.*','',byline)
        byline = re.sub(r'^\| ','',byline)
        #cleaning Telegraph "by"
        byline = re.sub(r'^(by|By|BY) ','',byline)
        byline = re.sub(r'.*? by ','',byline)
        #remove multiple spaces in the middle of a name
        byline = re.sub(r'\s\s',' ',byline)
        byline = re.sub(r'\s\s',' ',byline)
        byline = re.sub(r'^dr ','',byline)
        byline = byline.strip().encode("utf-8")
        return byline

    # TODO: deal with commas
    def get_full_names(self, byline):
        # Split a possibly multi-author byline (";", ",", " and ") into
        # cleaned full names; recursing while separators remain.
        if byline is None:
            return []
        byline = byline.strip().lower()
        if byline is None or re.search('[0-9]',byline) is not None:
            return []
        spaces = byline.count(' ')
        commas = byline.count(',')
        conjunctions = byline.count(' and ')
        semicolons = byline.count(';')
        bylines_result = []
        if(semicolons > 0):
            for name in byline.split(";"):
                if(name.count(";") > 0 or name.count(",") > 0 or name.count(" and ") > 0):
                    bylines_result = bylines_result + self.get_full_names(name)
                else:
                    bylines_result.append(self.strip_extras(name.strip()))
        elif(conjunctions >0):
            for name in byline.split(' and '):
                if(name.count(";") > 0 or name.count(",") > 0 or name.count(" and ") > 0):
                    bylines_result = bylines_result + self.get_full_names(name)
                else:
                    bylines_result.append(self.strip_extras(name.strip()))
        elif(commas == 0 and conjunctions == 0 and semicolons == 0):
            bylines_result.append(self.strip_extras(byline))
        elif(spaces >=2 and commas >=1):
            for name in byline.split(','):
                if(name.count(";") > 0 or name.count(",") > 0 or name.count(" and ") > 0):
                    bylines_result = bylines_result + self.get_full_names(name)
                else:
                    bylines_result.append(self.strip_extras(name.strip()))
        # Drop leftovers produced by stripping.
        for junk in ['','based']:
            if junk in bylines_result:
                bylines_result.remove(junk)
        return bylines_result

    def get_first_names(self, byline):
        # Extract first names from a byline; entries may be None when no
        # plausible first name is found (initials only, etc.).
        if byline is None or re.search('[0-9]',byline) is not None:
            return []
        byline = byline.strip()
        spaces = byline.count(' ')
        commas = byline.count(',')
        conjunctions = byline.count(' and ')
        bylines_result = []
        if(commas == 0 and conjunctions == 0):
            bylines_result.append(self.get_first_name_from_fullname(byline))
        if(conjunctions >0):
            for name in byline.split(' and '):
                bylines_result.append(self.get_first_name_from_fullname(name.strip()))
        if(spaces < 3 and commas == 1):
            # "Surname, First ..." form.
            bylines_result.append(self.get_first_name_from_reversename(byline))
        return bylines_result

    # assumes there's a single comma
    def get_first_name_from_reversename(self, byline):
        split_byline = [x.strip() for x in byline.split(',')]
        # set offset to 0 since the surname has already been stripped
        return self.get_first_name_from_fullname(split_byline[1], 0)

    def get_first_name_from_fullname(self, byline, offset=None):
        # Return the first token that is not an initial; offset=-1 by
        # default so the final token (the surname) is never considered.
        if(offset == None):
            offset = -1
        tokens = nltk.word_tokenize(byline)
        first_name = ""
        for i in range(0, (len(tokens)+offset)):
            # Skip initials like "J." or single letters.
            if(tokens[i].count(".") > 0 or len(tokens[i]) == 1):
                continue
            return tokens[i]
        return None

    def test_first_names(self):
        # Ad-hoc self-test for get_first_names (prints only, no assertions).
        test_strings = [
            ["J. Nathan Matias", ["Nathan"]],
            ["J. Matias", [None]],
            ["J Matias", [None]],
            ["J N Matias", [None]],
            ["J. N. Matias", [None]],
            ["JN Matias", ["JN"]],
            ["Matias, J. Nathan",["Nathan"]],
            ["Mishkin, Pamela", ["Pamela"]],
            ["Pamela Mishkin", ["Pamela"]],
            ["Nathan Matias and Pamela Mishkin", ["Nathan", "Pamela"]],
            ["J. Nathan Matias and Pamela Mishkin", ["Nathan", "Pamela"]]
        ]
        for test_string in test_strings:
            names = self.get_first_names(test_string[0])
            print 'Got: [%s]' % ', '.join(map(str, names))
            print 'Expected: [%s]' % ', '.join(map(str, test_string[1]))
            print '--------'
print "neutral" if (polarity > 0): print "positive" print "user location:",tweet['user']['location'] print "screen name:",tweet['user']['screen_name'] print "user real name:",tweet['user']['name'] #guess gender user_gender = [] get_name = tweet['user']['name'] try: index = get_name.find(' ') name_for_gender_check = get_name[0:index] #debug --- print "found space in name",index #debug ---print "grabbed first name",name_for_gender_check user_gender = detector.guess(name_for_gender_check) except: user_gender = 'unknown' print "user gender is:",user_gender print "user language:",tweet['user']['lang'] print "user time zone:",tweet['user']['time_zone'] print "user utc_offset:",tweet['user']['utc_offset'] #find location coordinates try user location if it's available if it doesnt work then check by utc ofset print "user description:",tweet['user']['description'] print "user create date:",tweet['user']['created_at'] hashtags = [] for hashtag in tweet['entities']['hashtags']: hashtags.append(hashtag['text'])
class UpworkDataFormatter:
    # Python 2. Pull Upwork worker profiles from Postgres and write one CSV
    # row per worker with the gender assigned by five different methods
    # (gender_guesser, gender_detector, sexmachine, GenderComputer, and
    # pronoun analysis of client feedback).

    def __init__(self):
        # Settings
        self.all_data_file_name = './csv_files/altgender4_2017_12_12_upwork_analysis_unitedstates_allskills.csv'  # Filename for all data
        self.data_log_file_name = './log_files/alt_gender4_log_upwork_data_analysis_2017_12_12_unitedstates_allskills.txt'
        # Write a log
        self.log = open(self.data_log_file_name, 'a')
        self.log.write("We have started analyzing data!" + "\n")
        self.log.flush()
        # Connect to the database
        self.conn = psycopg2.connect("dbname=eureka01")
        self.cur = self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
        # Get detailed_info from workers in our database
        self.cur.execute(
            "SELECT detailed_info FROM upwork_unitedstates_allskills_2017_12_12;"
        )
        # Initialize arrays for Causal Analysis
        self.user_count = 1
        # Initialize gender detectors
        self.d = gender.Detector()
        self.gc = GenderComputer('./nameLists')
        # NOTE(review): only the US detector is used below; ar/uk/uy appear
        # unused — confirm before removing.
        self.us_detector = GenderDetector('us')
        self.ar_detector = GenderDetector('ar')
        self.uk_detector = GenderDetector('uk')
        self.uy_detector = GenderDetector('uy')
        self.gender_guesser = gender_guesser.Detector()

    def save_all_to_csv(self):
        # Iterate the open cursor and write one CSV row per worker; rows
        # that raise are written as "error" placeholders so counting stays
        # aligned.
        with open(self.all_data_file_name, 'w') as csvfile:
            fieldnames = [
                'user_count', 'worker_id', 'first_name', 'profile_desc',
                'gender_guesser', 'gender_detector', 'sex_machine',
                'gender_computer', 'gender_pronoun'
            ]
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for user in self.cur:
                try:
                    # NOTE(review): local user_count is assigned but the row
                    # uses self.user_count directly.
                    user_count = self.user_count
                    worker_id = user[0]["ciphertext"]
                    first_name = user[0]["dev_first_name"].encode(
                        'utf-8').strip()
                    print "Successfully done first_name"
                    profile_desc = user[0]['dev_blurb'].encode('utf-8').strip()
                    gender_guesser = self.gender_by_guesser(
                        first_name)  # Output of gender guesser
                    gender_detector = self.gender_by_detector(
                        first_name)  # Output of gender detector
                    sex_machine = self.gender_by_sex_machine(
                        first_name)  # Output of sex machine
                    gender_computer = self.gender_by_computer(
                        first_name)  # Output of gender computer
                    gender_pronoun = self.gender_by_pronoun(
                        user)  # Output of pronoun in reviews
                    writer.writerow({
                        'user_count': self.user_count,
                        'worker_id': worker_id,
                        'first_name': first_name,
                        'profile_desc': profile_desc,
                        'gender_guesser': gender_guesser,
                        'gender_detector': gender_detector,
                        'sex_machine': sex_machine,
                        'gender_computer': gender_computer,
                        'gender_pronoun': gender_pronoun
                    })
                except KeyboardInterrupt:
                    print "We got interrupted"
                    break
                except Exception as error:
                    print "Ran into some error at user {0}".format(
                        self.user_count)
                    print error
                    writer.writerow({
                        'user_count': self.user_count,
                        'worker_id': "error",
                        'first_name': "error",
                        'profile_desc': "error",
                        'gender_guesser': "error",
                        'gender_detector': "error",
                        'sex_machine': "error",
                        'gender_computer': "error",
                        'gender_pronoun': "error"
                    })
                print "Finished writing data for {0}".format(self.user_count)
                self.user_count += 1

    def gender_by_detector(self, name):
        # gender_detector (US dataset); returns None on failure.
        try:
            us_gender = self.us_detector.guess(
                name)  #Check against US database
            return us_gender
        except Exception as error:
            print "Something wrong at gender_by_detector: {0}".format(error)

    def gender_by_computer(self, name):
        # GenderComputer with country context 'USA'; returns None on failure.
        try:
            unicode_name = unicode(name, "utf-8")
            gender = self.gc.resolveGender(unicode_name, 'USA')
            return gender
        except Exception as error:
            print "Something wrong at gender_by_computer: {0}".format(error)

    def gender_by_sex_machine(self, name):
        # sexmachine detector; returns None on failure.
        try:
            unicode_name = unicode(name, "utf-8")
            gender = self.d.get_gender(unicode_name)
            return gender
        except Exception as error:
            print "Something wrong at gender_by_sex_machine: {0}".format(error)

    def gender_by_guesser(self, name):
        # gender_guesser detector; returns None on failure.
        try:
            unicode_name = unicode(name, "utf-8")
            gender = self.gender_guesser.get_gender(unicode_name)
            return gender
        except Exception as error:
            print "Something wrong at gender_by_guesser: {0}".format(error)

    def gender_by_pronoun(self, user):
        # Infer gender from she/her vs he/him pronouns in client feedback on
        # the worker's fixed-price ("fp") and hourly ("hr") assignments.
        # Returns 'female' / 'male' / 'ambiguous' / 'unknown'.
        is_female = False
        is_male = False
        try:
            all_assignments = user[0]["assignments"]
            all_fp_assignments = all_assignments["fp"]["job"]
            all_hr_assignments = all_assignments["hr"]["job"]
            if (type(all_fp_assignments) == list):
                for job in all_fp_assignments:
                    try:
                        try:
                            feedback = job["feedback"]["comment"]
                            print feedback
                        except:
                            feedback = job["feedback"][0]["comment"]
                            print feedback
                        is_female = re.search(" her |Her |She | she | her/.",
                                              feedback)
                        is_male = re.search(" his |His |He | he | him | him/.",
                                            feedback)
                        break
                    except:
                        continue
            # NOTE(review): `type(all_fp_assignments == dict)` compares first
            # and then takes type() of the bool — always truthy; probably
            # meant type(all_fp_assignments) == dict.
            elif (type(all_fp_assignments == dict)):
                try:
                    feedback = all_fp_assignments["feedback"]["comment"]
                    print feedback
                except:
                    # NOTE(review): `job` here is a leftover loop variable —
                    # likely meant all_fp_assignments["feedback"][0][...].
                    feedback = job["feedback"][0]["comment"]
                    print feedback
                is_female = re.search(" her |Her |She | she | her/.", feedback)
                is_male = re.search(" his |His |He | he | him | him/.", feedback)
            if not is_female and not is_male:
                if (type(all_hr_assignments) == list):
                    for job in all_hr_assignments:
                        try:
                            try:
                                feedback = job["feedback"]["comment"]
                                print feedback
                            except:
                                # NOTE(review): result is discarded — missing
                                # `feedback =` assignment?
                                job["feedback"][0]["comment"]
                                print feedback
                            is_female = re.search(
                                " her |Her |She | she | her/.", feedback)
                            is_male = re.search(
                                " his |His |He | he | him | him/.", feedback)
                            break
                        except:
                            continue
                elif (type(all_hr_assignments) == dict):
                    try:
                        feedback = all_hr_assignments["feedback"]["comment"]
                        print feedback
                    except:
                        feedback = all_hr_assignments["feedback"][0]["comment"]
                        print feedback
                    is_female = re.search(" her |Her |She | she | her/.",
                                          feedback)
                    is_male = re.search(" his |His |He | he | him | him/.",
                                        feedback)
        except Exception as error:
            print "Could not find assignments for gender_by_pronoun: {0}".format(
                error)
            return "unknown"
        if is_female and not is_male:
            return "female"
        elif is_male and not is_female:
            return "male"
        elif is_male and is_female:
            return "ambiguous"
        else:
            return "unknown"
def run_pipeline_unlabelled_data():
    # Python 2 / scikit-learn. Two-stage classification of unlabelled author
    # documents: (1) individual vs. institution with an SVM; (2) gender for
    # the individuals, trying a name-based detector first and falling back
    # to a second SVM trained on document text.
    vec_mf = CountVectorizer(decode_error='replace')
    vec_ii = CountVectorizer(decode_error='replace')
    trans_mf = TfidfTransformer(use_idf=True)
    trans_ii = TfidfTransformer(use_idf=True)
    # Load Datasets
    train_data_mf = joblib.load(os.path.join(PROJECT_PATH, 'resources/datasets/gender/text/male_vs_female/docs_raw'))
    y_train_mf = joblib.load(os.path.join(PROJECT_PATH, 'resources/datasets/gender/text/male_vs_female/labels'))
    train_data_ii = joblib.load(os.path.join(PROJECT_PATH, 'resources/datasets/gender/text/individual_vs_institution/docs_raw'))
    y_train_ii = joblib.load(os.path.join(PROJECT_PATH, 'resources/datasets/gender/text/individual_vs_institution/labels'))
    test_data = joblib.load(os.path.join(PROJECT_PATH, 'resources/datasets/gender/text/docs_unlabelled_raw'))
    test_names = joblib.load(os.path.join(PROJECT_PATH, 'resources/datasets/gender/text/names_unlabelled_raw'))
    clf_mf = LinearSVC()
    clf_ii = LinearSVC()
    # Bag-of-words -> tf-idf for both training sets.
    X_mf = trans_mf.fit_transform(vec_mf.fit_transform(train_data_mf))
    X_ii = trans_ii.fit_transform(vec_ii.fit_transform(train_data_ii))
    print 'SHAPE X_mf TRAIN DATA:', X_mf.shape
    print 'SHAPE X_ii TRAIN DATA:', X_ii.shape
    clf_mf.fit(X_mf, y_train_mf)
    clf_ii.fit(X_ii, y_train_ii)
    # Individual vs. Institution
    X_test_ii = trans_ii.transform(vec_ii.transform(test_data))
    prediction = clf_ii.predict(X_test_ii)
    print 'X_test_ii SHAPE:', X_test_ii.shape
    # NOTE(review): Python 2 integer division — these ratios will both print
    # as 0.00 unless from __future__ import division is in effect.
    print 'PREDICTION DISTRIBUTION: %.2f / %.2f' % ((np.count_nonzero(prediction) / prediction.shape[0]), ((prediction.shape[0] - np.count_nonzero(prediction)) / prediction.shape[0]))
    print 'INDIVIDUALS:', np.where(prediction == 1)
    # Keep only documents predicted to be individuals (label 1).
    test_data_mf = []
    test_names_mf = []
    for idx in np.where(prediction == 1)[0]:
        #print 'IDX=%d; DATA=%r; NAME=%r' % (idx, test_data[idx], test_names[idx])
        test_data_mf.append(test_data[idx])
        test_names_mf.append(test_names[idx])
    print 'LEN TEST DATA:', len(test_data_mf)
    print 'LEN NAMES:', len(test_names_mf)
    # Gender classification
    X_test_mf = trans_mf.transform(vec_mf.transform(test_data_mf))
    print 'SHAPE X_test_mf:', X_test_mf.shape
    detector = GenderDetector('uk')
    # 0 = female, 1 = male.
    names_pred = []
    for idx, n in enumerate(test_names_mf):
        try:
            cands = extract_name(n)
            for c in cands:
                gender = detector.guess(c)
                if (gender != 'unknown'):
                    names_pred.append(0 if gender == 'female' else 1)
                    break
        except:
            #names_pred.append(1)
            # Use SVM here
            names_pred.append(clf_mf.predict(X_test_mf[idx])[0])
            pass
        else:
            # NOTE(review): this is the try-statement's else (runs whenever
            # no exception occurred), so a name resolved by the detector is
            # appended twice — a for/else was probably intended; confirm.
            #names_pred.append(1)
            # Use SVM here
            names_pred.append(clf_mf.predict(X_test_mf[idx])[0])
    idx_female = np.where(np.array(names_pred) == 0)[0]
    idx_male = np.where(np.array(names_pred) == 1)[0]
    print 'FEMALE PREDICTION...'
    for i in idx_female:
        print test_data_mf[i]
    print 'MALE PREDICTION...'
    for i in idx_male:
        print test_data_mf[i]
""" Created on Fri Nov 11 08:37:50 2016 @author: varshith """ print(''' us - USA ar - Argentina uy - Uruguay uk - United Kingdom ''') cclist = ['us', 'ar', 'uy', 'uk'] while True: country_code = input('Enter country code :') if country_code in cclist: break else: print('Entered country code not valid') while True: name = input('Enter first name :') if name.find(' ') != -1: print('Entered name not valid, Enter first name only') else: break from gender_detector import GenderDetector detector = GenderDetector(country_code) print(detector.guess(name))
#to correct the format we receive the words and take as a chain in list word_final =list(itertools.chain(*word_final)) #print word_final rec=open("Names_processed.txt", "w") for item in word_final: rec.write("%s\n" % item) rec.close() for item in word_final: c= segment(item) d= max(c, key=len) #print d detector = GenderDetector('us') # It can also be uk. e= detector.guess(d) # => 'male' print d,e """ rec=open("Names_processed.txt", "w") for item in word_final: rec.write("%s\n" % item) rec.close() with open("Names_processed.txt", 'r') as word: for i in word:
from gender_detector import GenderDetector detector = GenderDetector('us') # It can also be ar, uk, uy. count = 0 ### guesses gender based on firstname.good for mostly english names with open('/Users/ankitkumar/Downloads/firstname.csv') as f: for line in f: try: if detector.guess(line) == 'unknown': print count count = count + 1 except (KeyError, NameError): print "skip" print count
def test_guessing(self):
    """The US dataset should classify 'Marcos' as male."""
    gd = GenderDetector('us')
    self.assertEqual(gd.guess('Marcos'), 'male')
# infer the gender of each author according to the 'gs_name.txt' from datetime import datetime from gender_detector import GenderDetector detector = GenderDetector('us') file_name = open("gs_name.txt", "r") lines_name = file_name.read().split("\n") all_name = [] all_gender = [] for x in lines_name: if (len(x) > 0): all_name.append(x) yy = x.split(" ") # print yy[0], author_gender = 'unknown' try: author_gender = detector.guess(yy[0]) except: pass print author_gender all_gender.append(author_gender)
# Guess a gender for every node in 9ulovesu.json (from the first token of
# its 'name') and write "<id>, <first name>, <gender>" lines to
# 9ulovesu.data, timing the whole run.
import os
import sys
import json
from gender_detector import GenderDetector
from time import strftime, localtime, time

# record running time
start = time()
print('Starting Time: %s' % strftime("%a, %b %d, %Y at %H:%M:%S", localtime()))

detector = GenderDetector('us')

# Renamed from `file` to avoid shadowing the builtin.
with open("9ulovesu.json", 'rb') as src:
    data = json.load(src)

counter = 0
unknown = []
with open("9ulovesu.data", 'w') as out:
    for node in data['nodes']:
        counter += 1
        first = node['name'].split()[0]
        try:
            # guess() raises KeyError for names outside the dataset;
            # encode('ascii') raises UnicodeEncodeError for non-ASCII names.
            gender = detector.guess(first)
            print(counter, first.encode('ascii'), gender, node['id'])
            out.write('%s, %s, %s\n' % (str(node['id']), first.encode('ascii'), gender))
        except Exception:
            print(counter, first, 'unknown')
            # BUG FIX: the old fallback used encode('ascii-8'), a codec that
            # does not exist, so this branch itself raised LookupError and
            # killed the run on the first unknown name.
            out.write('%s, %s, unknown\n' % (str(node['id']), first.encode('ascii', 'ignore')))

print('\nEnd Time: %s' % strftime("%a, %b %d, %Y at %H:%M:%S", localtime()))
print('Execution Time: %.2f seconds' % (time() - start))
def are_opposite(first, second):
    """Return True when one of the two names is guessed male and the other
    female (checked in both orders), False otherwise."""
    detector = GenderDetector('us')
    for male_name, female_name in ((first, second), (second, first)):
        if detector.guess(male_name) == 'male' and detector.guess(female_name) == 'female':
            return True
    return False