def is_male(s):
    """Best-effort binary gender guess for a name string.

    Returns 1 for male, 0 for anything else (female, unknown, ambiguous).
    NOTE(review): token [1] of the lowercased, space-split name is used as
    the "first name", which assumes a "Last First"-style ordering in the
    input -- confirm against the data source.
    """
    try:
        s = s.lower().split(' ')[1]
    except IndexError:
        # Fall through with the original string if there is no second token.
        print("Failed to get first name. %s" % s)
    d1 = gender_detector.GenderDetector('us')
    # BUGFIX: g was previously left undefined when d1.guess() raised
    # KeyError, causing a NameError on the comparison below.
    g = None
    try:
        g = d1.guess(s)
    except KeyError:
        print("Failed to get gender of name %s" % s)
    if g == 'male':
        return 1
    elif g == 'female':
        return 0
    # First (faster) detector could not decide -- try the second detector.
    d2 = gender.Detector()
    g = d2.get_gender(s)
    if g == 'male':
        return 1
    # Any non-'male' answer from the fallback detector is treated as 0.
    return 0
def __init__(self):
    """Open the run log, connect to Postgres, and build the gender detectors.

    Side effects on construction: opens a file handle (self.log), a DB
    connection/cursor (self.conn / self.cur), and issues a SELECT that is
    presumably consumed later -- TODO confirm callers close these.
    """
    # Settings
    self.all_data_file_name = './csv_files/altgender4_2017_12_12_upwork_analysis_unitedstates_allskills.csv'  # Filename for all data
    self.data_log_file_name = './log_files/alt_gender4_log_upwork_data_analysis_2017_12_12_unitedstates_allskills.txt'  # Write a log
    # Append mode keeps log output from earlier runs.
    self.log = open(self.data_log_file_name, 'a')
    self.log.write("We have started analyzing data!" + "\n")
    self.log.flush()
    # Connect to the database
    self.conn = psycopg2.connect("dbname=eureka01")
    self.cur = self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    # Let dict values be passed as query parameters (adapted to JSON).
    psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
    # Get detailed_info from workers in our database
    self.cur.execute(
        "SELECT detailed_info FROM upwork_unitedstates_allskills_2017_12_12;"
    )
    # Initialize arrays for Causal Analysis
    self.user_count = 1  # running count of processed users
    # Initialize gender detectors -- several libraries and per-country
    # name lists are consulted by later analysis code.
    self.d = gender.Detector()
    self.gc = GenderComputer('./nameLists')
    self.us_detector = GenderDetector('us')
    self.ar_detector = GenderDetector('ar')
    self.uk_detector = GenderDetector('uk')
    self.uy_detector = GenderDetector('uy')
    self.gender_guesser = gender_guesser.Detector()
def predict_sex(name):
    """Map a pandas Series of full names to integer sex codes.

    Codes: female=-2, mostly_female=-1, unknown=0, mostly_male=1, male=2.
    The first whitespace-separated token is treated as the given name.
    """
    code_for = {
        'female': -2,
        'mostly_female': -1,
        'unknown': 0,
        'mostly_male': 1,
        'male': 2,
    }
    detector = gender.Detector(unknown_value=u"unknown", case_sensitive=False)
    given_names = name.str.split(' ').str.get(0)
    guesses = given_names.apply(detector.get_gender)
    return guesses.map(code_for).astype(int)
def gender(dataframe):
    """Label each entry of dataframe["HostName"] with a gender string.

    A host name containing "&", "And" or "/" is assumed to name two people
    and is labelled "couple"; otherwise the first token is classified with
    sexmachine and the (utf-8 encoded) label is returned.

    Fix: the original bound BOTH the imported module alias and the per-row
    label to the name ``gender`` -- which is also this function's own name --
    so the module alias was clobbered on the first loop iteration. Internal
    names are disambiguated; behavior and interface are unchanged.
    """
    import sexmachine.detector as sm_detector
    detector = sm_detector.Detector()
    labels = []
    # read in host names
    host_name = dataframe["HostName"]
    # classify every host name
    for hostname in host_name:
        if "&" in hostname or "And" in hostname or "/" in hostname:
            label = "couple"
        else:
            first_name = hostname.split(" ")[0]
            label = detector.get_gender(first_name).encode('utf8')
        labels.append(label)
    return labels
def createMovieGraph():
    """Build the movie graph, back-fill missing person genders, and save it."""
    movieMap, actorMap, directorMap = parseMovieFile(datasetFilename,
                                                     fetchRaceAndGender=True)
    graph, graphDict = createGraphForMovieInfo(movieMap, actorMap, directorMap)

    # Fill in unknown genders with SexMachine
    detector = gender.Detector(unknown_value=None)
    for nodeId in graph.nodes():
        node = graph.node[nodeId]
        # Only people (non-movie nodes) with a name but no known gender.
        if node["type"] == "MOVIE" or node["gender"] or node["name"] == "":
            continue
        guess = detector.get_gender(node["name"].split()[0])
        # Fill in only genders we're certain about
        if guess in ("male", "female"):
            node["gender"] = guess.capitalize()

    # Save to files
    nx.write_gpickle(graph, graphFilename)
    saveDictToFile(graphDict, graphDictFilename, firstRow=["Name", "NodeID"])
import pandas as pd
import re, sqlite3
from os import path
from sexmachine import detector
from getgender import getGenders
from ethnicolr import pred_census_ln
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Base URL for NNDB biography searches (actor name is appended as the query).
nndb_base_link = "https://search.nndb.com/search/nndb.cgi?query="
# Shared sexmachine detector instance, built once at import time.
gender_detector = detector.Detector()
# Collapse sexmachine's 4-way answers into a binary Male/Female label
# ("andy"/unknown values are deliberately absent and will miss lookups).
gender_table = {
    "male": "Male",
    "mostly_male": "Male",
    "female": "Female",
    "mostly_female": "Female"
}
# ethnicolr race codes -> human-readable labels.
race_table = {
    "api": "Asian",
    "black": "Black",
    "hispanic": "Hispanic",
    "white": "White"
}
# Labels -> counter column names used elsewhere -- TODO confirm consumer.
gender_num_table = {"Male": "num_males", "Female": "num_females"}
race_num_table = {"White": "num_whites", "Black": "num_blacks",
                  "Asian": "num_asians", "Hispanic": "num_hispanics"}


def guessActorInfo(actor_name):
    # If NNDB bio page not found then use SexMachine or ethnicolr module to guess gender/race
    # NOTE(review): function appears truncated in this view -- only the
    # first-name extraction is visible.
    print("\t\tNNDB bio page not found...Using Python modules to guess")
    first_name = actor_name.split(" ")[0]
# Load fallback/sample counts and the location encoding produced by an
# earlier scraper run (pickle format).
favourites_list = pickle.load(open('favourites_count.pkl', 'rb'))
statuses_list = pickle.load(open('statuses_count.pkl', 'rb'))
location_dict = pickle.load(open('location_dict_scraper.pkl', 'rb'))

# In[6]:

#['created_at','statuses_count','followers_count','favourites_count','sex_code','lang_code']
#1.scraping username section
# Class names below are Facebook's obfuscated CSS classes -- they break
# whenever Facebook redeploys; verify before relying on them.
#gmql0nx0.l94mrbxd.p1ri9a11.lzcic4wl.bp9cbjyn.j83agx80
elems = driver.find_elements_by_class_name(
    "gmql0nx0.l94mrbxd.p1ri9a11.lzcic4wl.bp9cbjyn.j83agx80")
username = elems[0].text
# Wrap in a Series so the vectorized .str accessors below work.
username = pd.Series(username)
#predicting sex
sex_predictor = gender.Detector(unknown_value=u"unknown", case_sensitive=False)
first_name = username.str.split(' ').str.get(0)
sex = first_name.apply(sex_predictor.get_gender)
# Integer encoding of the detector's five answers.
sex_dict = {
    'female': -2,
    'mostly_female': -1,
    'unknown': 0,
    'mostly_male': 1,
    'male': 2
}
sex_code = sex.map(sex_dict).astype(int)
print sex_code[0]
#2.scraping bio section
# NOTE(review): chunk is truncated mid-call below.
#d2edcug0 hpfvmrgz qv66sw1b c1et5uql lr9zc1uh a8c37x1j keod5gw0 nxhoafnm aigsh9s9 d3f4x2em fe6kdd0r mau55g9w c8b282yb mdeji52x a5q79mjw g1cxx5fr knj5qynh m9osqain oqcyycmt
elems = driver.find_elements_by_class_name(
def __init__(self):
    # Build the sexmachine gender detector once and reuse it via self.d.
    self.d = gender.Detector()
import authors
import sexmachine.detector as gender
import getauthors
import os
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

#Create file that matches author names to gender
d = gender.Detector(unknown_value='unknown', case_sensitive=False)
fileNameAuthors = 'C:/Users/jstwa/Desktop/Names/allAuthors.txt'
authorDict = authors.authorsToNum(fileNameAuthors)
count = 1
gendersList = []
exceptionsList = []
for author, nums in authorDict.iteritems():
    # First whitespace-separated token is taken as the given name.
    firstName = author.split(' ')[0]
    currGender = d.get_gender(firstName)
    try:
        gendersList.append(author + ", " + currGender)
    except UnicodeDecodeError:
        # Non-ASCII names go to a separate exceptions list.
        exceptionsList.append(firstName + ", " + str(nums[0]) + ", " + currGender)
    # Flush every 50 authors. NOTE(review): the lists are never cleared
    # after writing, so each batch appears to rewrite all prior entries --
    # confirm getauthors.writeList semantics (overwrite vs append).
    if count % 50 == 0:
        getauthors.writeList(gendersList)
        getauthors.writeList(exceptionsList, "exceptions.txt")
    count += 1
#After creating gender file, creates a list of unknown gender authors
# NOTE(review): chunk is truncated mid-statement below.
os.chdir("C:/Users/jstwa/Desktop")
with open("genders.txt", 'r') as f:
import os
import math
import pandas as pd
import sexmachine.detector as genderdetector
import matplotlib.pyplot as plt
plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt

data_file_path = '../data/reviews.csv'
detector = genderdetector.Detector()
# Sanity check that the detector loaded.
print 'test gender of Sam: ' + detector.get_gender('Sam')
df = pd.read_csv(data_file_path)
print df.head(1)
# Histogram of detector answers over review authors.
gender_count = {}
andy = []  # sample of names the detector marked ambiguous ("andy")
for name in df['author']:
    if isinstance(name, str) == False:
        continue  # we had some nan values, this solves that issue
    first_name = name.split(' ')[0]
    gender = detector.get_gender(first_name)
    gender_count[gender] = gender_count.get(gender, 0) + 1
    if gender == 'andy':
        andy.append(first_name)
print gender_count
# print list(set(andy))[0:30]
# x positions for the (presumably) bar chart that follows this chunk.
idx = np.arange(len(gender_count.keys()))
import numpy as np
import sqlalchemy as sa
from sqlalchemy_utils import database_exists, create_database
import requests
from bs4 import BeautifulSoup
import seaborn as sns
sns.set_context('talk')
sns.set_style('darkgrid')
pd.set_option('display.max_columns', 500)
# Local Postgres engine for the weave_pair database.
local_weave_pair = sa.create_engine("postgres://%s@localhost/%s" %
                                    ('jiongz', 'weave_pair'))
import sexmachine.detector as gender
identify_gender = gender.Detector()
from sklearn.externals import joblib


def matches(user_name, user_degree, user_start_yr, looking_for):
    # Score candidate matches for a new user against the first 500 stored
    # user profiles, ranked by predicted 5-star probability.
    # NOTE(review): function appears truncated in this view (no return
    # statement visible for match_result).
    new_user_feature = get_new_user_info(user_name, user_degree,
                                         user_start_yr, looking_for)
    user_features = read_user_features()[:500]
    # Drop profiles with no start year.
    user_features = user_features[user_features['start_yr'].notnull()]
    meeting_features = meeting_feature_for_newuser(new_user_feature,
                                                   user_features)
    match_result = predic_5star(meeting_features).sort('star_prob',
                                                       ascending=False)
import sqlite3
import sexmachine.detector as gender
import nltk
import string

db = sqlite3.connect('../commentsData.db')
d = gender.Detector(case_sensitive=False)
c = db.cursor()
#c.execute("create table if not exists maleComments(userID int,username text,comment text)")
#c.execute("insert into maleComments select A.userID, A.username, A.commentBody from comments A join commenterGender B where B.gender="male" and A.userID = B.userID and A.username = B.username")
c.execute("select userID, username, comment from femaleComments")
words_count = 0
comments_count = 0
for result in c.fetchall():
    comment = result[2]
    tokens = nltk.word_tokenize(result[2].lower())
    # Count tokens that are not bare punctuation.
    token_count = 0
    for t in tokens:
        if t not in string.punctuation:
            token_count += 1
    words_count += token_count
    comments_count += 1
    # Progress marker for large tables.
    if (comments_count % 100000 == 0):
        print comments_count, " records processed"
# NOTE(review): Python 2 integer division here truncates the average --
# confirm whether a float average was intended.
print "Avg word_count per Female_Comment :" + str(words_count / comments_count)
def __init__(self):
    # Case-insensitive sexmachine detector, built once and reused.
    self.detector = gender.Detector(case_sensitive=False)
def setUp(self):
    # Fresh detectors per test: default (case-sensitive) and a
    # case-insensitive variant.
    self.case = d.Detector()
    self.incase = d.Detector(case_sensitive=False)
def gender_guess(name):
    """Guess the gender for a first name via sexmachine.

    An empty string short-circuits to "unknown" without building a detector.
    """
    if name == "":
        return "unknown"
    detector = gender.Detector()
    return detector.get_gender(name)
def predict():
    '''
    ['username','location','statuses_count','followers_count','friends_count','favourites_count','sex_code','lang_code','created_at']
    {'fr': 3, 'en': 1, 'nl': 6, 'de': 0, 'tr': 7, 'it': 5, 'gl': 4, 'es': 2}
    '''
    # Form fields arrive positionally; the index mapping is the list in the
    # docstring above -- TODO confirm against the HTML form field order.
    #request.form.values()
    data = request.form
    int_features = list(data.values())
    #lang -- language label -> integer code used at training time.
    lang_dict = {
        'fr': 3,
        'en': 1,
        'nl': 6,
        'de': 0,
        'tr': 7,
        'it': 5,
        'gl': 4,
        'es': 2,
        'hi': 8,
        'other': 9
    }
    '''
    #location
    users = pd.read_csv(r'/home/vamsi82674/Desktop/fake profile detection fb/app/data/processed_data.csv')
    location_list = list(enumerate(np.unique(users['location'])))
    location_dict = { name : i for i, name in location_list }
    location_dict['other']=1679
    '''
    #created_at -- account age in days (strptime round-trip strips the time
    # component so the subtraction is date-only).
    created_date = datetime.datetime.strptime(
        datetime.datetime.strptime(int_features[7],
                                   '%Y-%m-%d').strftime('%m %d %Y'),
        '%m %d %Y')
    today = datetime.datetime.strptime(
        datetime.datetime.now().strftime('%m %d %Y'), '%m %d %Y')
    days_count = today - created_date
    days_count = days_count.days
    #for local host
    # NOTE(review): location_dict is only defined inside the commented-out
    # block above, so this lookup relies on a module-level location_dict
    # existing elsewhere -- confirm.
    df = pd.DataFrame(
        {
            'bio': int_features[0],
            'statuses_count': int_features[1],
            'followers_count': int_features[5],
            'friends_count': int_features[2],
            'favourites_count': int_features[8],
            'created_at': int_features[7],
            'location': location_dict[int_features[6]],
            'username': int_features[9],
            'lang': lang_dict[int_features[3]]
        },
        index=[0])
    '''
    #for heroku
    #[u'1', u'4', u'2', u'sai', u'other', u'3', u'en', u'2021-04-04', u'bio\r\n', u'']
    df=pd.DataFrame({'bio':, 'statuses_count':, 'followers_count':, 'friends_count':, 'favourites_count':, 'created_at':, 'location':location_dict[], 'username':, 'lang':lang_dict[]}, index=[0])
    '''
    #created_at -- recomputed from the DataFrame copy and written back as a
    # day count (the earlier days_count above is effectively discarded).
    created_date = datetime.datetime.strptime(
        datetime.datetime.strptime(df.loc[0, 'created_at'],
                                   '%Y-%m-%d').strftime('%m %d %Y'),
        '%m %d %Y')
    today = datetime.datetime.strptime(
        datetime.datetime.now().strftime('%m %d %Y'), '%m %d %Y')
    days_count = today - created_date
    days_count = days_count.days
    df.loc[0, 'created_at'] = days_count
    #predicting sex
    sex_predictor = gender.Detector(unknown_value=u"unknown",
                                    case_sensitive=False)
    first_name = df['username'].str.split(' ').str.get(0)
    sex = first_name.apply(sex_predictor.get_gender)
    sex_dict = {
        'female': -2,
        'mostly_female': -1,
        'unknown': 0,
        'mostly_male': 1,
        'male': 2
    }
    sex_code = sex.map(sex_dict).astype(int)
    #['created_at','location','statuses_count','followers_count','favourites_count','friends_count','sex_code','lang_code']
    params = pd.Series([
        df['created_at'], df['location'], df['statuses_count'],
        df['followers_count'], df['favourites_count'], df['friends_count'],
        sex_code, df['lang']
    ])
    #Random forest prediction
    rfr_prediction = random_forest.predict(params)
    #support vector machine prediction
    svm_prediction = support_vector.predict(params)
    #Naive Bayes prediction
    nvb_prediction = naive_bayes.predict(params)
    #Decision Tree Prediction
    dtc_prediction = decision_tree.predict(params)
    #neural network prediction (pybrain: 8 inputs, 2 classes)
    ds2 = ClassificationDataSet(8, 1, nb_classes=2)
    lst = [
        df['created_at'], df['location'], df['statuses_count'],
        df['followers_count'], df['favourites_count'], df['friends_count'],
        sex_code, df['lang'].astype(int)
    ]
    ds2.addSample(lst, 1)
    ds2._convertToOneOfMany()
    fnn_prediction = neural_network.testOnClassData(dataset=ds2)
    # NOTE(review): the network's output is overwritten with a constant 1
    # here, so fnn never actually influences the result -- confirm intent.
    fnn_prediction[0] = 1
    #percent = ( dtc_prediction[0] + nvb_prediction[0] + rfr_prediction[0] + svm_prediction[0] + fnn_prediction[0] )
    # Only 3 of the 5 models are averaged (hence the *33 scale).
    percent = (dtc_prediction[0] + rfr_prediction[0] + fnn_prediction[0])
    percent = round(percent * 33)
    #return render_template('result.html',username = int_features[9],dtc_prediction = dtc_prediction[0] , nvb_prediction = nvb_prediction[0] ,rfr_prediction = rfr_prediction[0],svm_prediction = svm_prediction[0],fnn_prediction = fnn_prediction[0],percentage=percent,features=int_features)
    return render_template('result.html',
                           username=int_features[9],
                           dtc_prediction=dtc_prediction[0],
                           rfr_prediction=rfr_prediction[0],
                           fnn_prediction=fnn_prediction[0],
                           percentage=percent,
                           features=int_features)
def predict():
    '''
    ['statuses_count','followers_count','friends_count','favourites_count','listed_count','sex_code','lang_code']
    {'fr': 3, 'en': 1, 'nl': 6, 'de': 0, 'tr': 7, 'it': 5, 'gl': 4, 'es': 2}
    '''
    # Form fields arrive positionally (see index mapping above).
    int_features = request.form.values()
    # Language label -> integer code used at training time.
    lang_dict = {
        'fr': 3,
        'en': 1,
        'nl': 6,
        'de': 0,
        'tr': 7,
        'it': 5,
        'gl': 4,
        'es': 2
    }
    df = pd.DataFrame(
        {
            'bio': int_features[0],
            'statuses_count': int_features[1],
            'followers_count': int_features[5],
            'friends_count': int_features[2],
            'favourites_count': int_features[6],
            'listed_count': int_features[8],
            'username': int_features[7],
            'lang': lang_dict[int_features[3]]
        },
        index=[0])
    # Encode the username's first token as an integer sex code.
    sex_predictor = gender.Detector(unknown_value=u"unknown",
                                    case_sensitive=False)
    first_name = df['username'].str.split(' ').str.get(0)
    sex = first_name.apply(sex_predictor.get_gender)
    sex_dict = {
        'female': -2,
        'mostly_female': -1,
        'unknown': 0,
        'mostly_male': 1,
        'male': 2
    }
    sex_code = sex.map(sex_dict).astype(int)
    print type(df['lang'])
    # Feature vector echoed back to the template below.
    params = [
        df['statuses_count'], df['followers_count'], df['friends_count'],
        df['favourites_count'], df['listed_count'], sex_code,
        df['lang'].astype(int)
    ]
    #Random forest prediction
    rfr_prediction = random_forest.predict([
        df['statuses_count'], df['followers_count'], df['friends_count'],
        df['favourites_count'], df['listed_count'], sex_code, df['lang']
    ])
    #support vector machine prediction
    svm_prediction = support_vector.predict([
        df['statuses_count'], df['followers_count'], df['friends_count'],
        df['favourites_count'], df['listed_count'], sex_code, df['lang']
    ])
    #naive_bayes prediction
    nvb_prediction = naive_bayes.predict([
        df['statuses_count'], df['followers_count'], df['friends_count'],
        df['favourites_count'], df['listed_count'], sex_code, df['lang']
    ])
    # NOTE(review): fnn_prediction is never assigned in this function, so
    # the branches below raise NameError at runtime -- it looks like
    # nvb_prediction (computed above but unused) was intended. Left as-is
    # because the fix is a behavioral guess; confirm with the author.
    # Vote of three models: 0 == genuine, 1 == fake; percent is the share
    # of models voting genuine.
    if rfr_prediction[0] == 0 and svm_prediction[0] == 0 and fnn_prediction[0] == 0:
        percent = 100
    elif (rfr_prediction[0] == 0 and svm_prediction[0] == 0
          and fnn_prediction[0] == 1) or (rfr_prediction[0] == 0
                                          and svm_prediction[0] == 1
                                          and fnn_prediction[0] == 0) or (
                                              rfr_prediction[0] == 1
                                              and svm_prediction[0] == 0
                                              and fnn_prediction[0] == 0):
        percent = 67
    elif (rfr_prediction[0] == 1 and svm_prediction[0] == 1
          and fnn_prediction[0] == 0) or (rfr_prediction[0] == 0
                                          and svm_prediction[0] == 1
                                          and fnn_prediction[0] == 1) or (
                                              rfr_prediction[0] == 1
                                              and svm_prediction[0] == 0
                                              and fnn_prediction[0] == 1):
        percent = 33
    else:
        percent = 0
    return render_template('result.html',
                           rfr_prediction=rfr_prediction[0],
                           svm_prediction=svm_prediction[0],
                           fnn_prediction=fnn_prediction[0],
                           percentage=percent,
                           features=params)
def scrape_prediction():
    """Log into Facebook with Selenium, scrape a profile, and score it.

    NOTE(review): SECURITY -- login credentials are hard-coded below; they
    should come from environment/config, not source. Several scraped values
    (friend/status/follower/favourite counts, location, created_at) are
    replaced by random.choice placeholders or constants -- confirm whether
    that is intentional stubbing.
    """
    #request.form.values()
    data = request.form
    int_features = list(data.values())
    chrome_options = webdriver.ChromeOptions()
    # Suppress browser notification prompts.
    prefs = {"profile.default_content_setting_values.notifications": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome('C:/Users/vamsi/chromedriver.exe',
                              chrome_options=chrome_options)
    #for heroku
    #driver = webdriver.Chrome(executable_path=os.environ.get("CHROME_DRIVER_PATH"), chrome_options=chrome_options)
    #open the webpage
    driver.get("http://www.facebook.com")
    #target username
    username = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='email']")))
    password = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='pass']")))
    #enter username and password
    username.clear()
    username.send_keys("9490461737")
    password.clear()
    password.send_keys("Facebook@62892")
    time.sleep(15)
    #target the login button and click it
    button = WebDriverWait(driver, 2).until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, "button[type='submit']"))).click()
    time.sleep(15)
    #We are logged in!
    # First form field is the profile URL to scrape.
    url = int_features[0]
    driver.get(url)
    time.sleep(15)
    html = driver.page_source
    #['created_at','statuses_count','followers_count','favourites_count','sex_code','lang_code']
    #1.scraping username section
    # Class names are Facebook's obfuscated CSS classes; they break when
    # Facebook redeploys.
    #gmql0nx0.l94mrbxd.p1ri9a11.lzcic4wl.bp9cbjyn.j83agx80
    elems = driver.find_elements_by_class_name(
        "gmql0nx0.l94mrbxd.p1ri9a11.lzcic4wl.bp9cbjyn.j83agx80")
    # NOTE(review): list indexing raises IndexError, not KeyError -- these
    # fallbacks likely never trigger; confirm intended exception type.
    try:
        username = elems[0].text
    except KeyError:
        username = '******'
    username = pd.Series(username)
    #predicting sex
    sex_predictor = gender.Detector(unknown_value=u"unknown",
                                    case_sensitive=False)
    first_name = username.str.split(' ').str.get(0)
    sex = first_name.apply(sex_predictor.get_gender)
    sex_dict = {
        'female': -2,
        'mostly_female': -1,
        'unknown': 0,
        'mostly_male': 1,
        'male': 2
    }
    sex_code = sex.map(sex_dict).astype(int)
    print username
    print sex_code[0]
    #2.scraping bio section
    #d2edcug0 hpfvmrgz qv66sw1b c1et5uql lr9zc1uh a8c37x1j keod5gw0 nxhoafnm aigsh9s9 d3f4x2em fe6kdd0r mau55g9w c8b282yb mdeji52x a5q79mjw g1cxx5fr knj5qynh m9osqain oqcyycmt
    elems = driver.find_elements_by_class_name(
        "d2edcug0.hpfvmrgz.qv66sw1b.c1et5uql.lr9zc1uh.a8c37x1j.keod5gw0.nxhoafnm.aigsh9s9.d3f4x2em.fe6kdd0r.mau55g9w.c8b282yb.mdeji52x.a5q79mjw.g1cxx5fr.knj5qynh.m9osqain.oqcyycmt"
    )
    try:
        bio = elems[0].text
    except KeyError:
        bio = ''
    print bio
    #3.scraping friends count,statuses_count,followers_count,favourites_count
    #d2edcug0 hpfvmrgz qv66sw1b c1et5uql lr9zc1uh e9vueds3 j5wam9gi knj5qynh m9osqain
    #d2edcug0 hpfvmrgz qv66sw1b c1et5uql lr9zc1uh a8c37x1j keod5gw0 nxhoafnm aigsh9s9 d3f4x2em fe6kdd0r mau55g9w c8b282yb iv3no6db jq4qci2q a3bd9o3v lrazzd5p m9osqain
    elems = driver.find_elements_by_class_name(
        "d2edcug0.hpfvmrgz.qv66sw1b.c1et5uql.lr9zc1uh.a8c37x1j.keod5gw0.nxhoafnm.aigsh9s9.d3f4x2em.fe6kdd0r.mau55g9w.c8b282yb.iv3no6db.jq4qci2q.a3bd9o3v.lrazzd5p.m9osqain"
    )
    friend_count = elems[2].text
    # Scraped value immediately overwritten by a sampled placeholder.
    friend_count = random.choice(friends_list)
    print friend_count
    #statuses_count
    statuses_count = random.choice(statuses_list)
    print statuses_count
    #followers_count
    followers_count = random.choice(followers_list)
    print followers_count
    #favourites_count
    favourites_count = random.choice(favourites_list)
    print favourites_count
    #4.scraping location
    #oajrlxb2 g5ia77u1 qu0x051f esr5mh6w e9989ue4 r7d6kgcz rq0escxv nhd2j8a9 nc684nl6 p7hjln8o kvgmc6g5 cxmmr5t8 oygrvhab hcukyx3x jb3vyjys rz4wbd8a qt6c0cv9 a8nywdso i1ao9s8h esuyzwwr f1sip0of lzcic4wl oo9gr5id gpro0wi8 lrazzd5p
    elems = driver.find_elements_by_class_name(
        "oajrlxb2.g5ia77u1.qu0x051f.esr5mh6w.e9989ue4.r7d6kgcz.rq0escxv.nhd2j8a9.nc684nl6.p7hjln8o.kvgmc6g5.cxmmr5t8.oygrvhab.hcukyx3x.jb3vyjys.rz4wbd8a.qt6c0cv9.a8nywdso.i1ao9s8h.esuyzwwr.f1sip0of.lzcic4wl.oo9gr5id.gpro0wi8.lrazzd5p"
    )
    # Location is stubbed to 'other', then mapped through the persisted
    # encoding; unseen locations extend the mapping and re-save the pickle.
    location = 'other'
    if location in location_dict:
        location = location_dict[location]
    else:
        location_dict[location] = len(location_dict) + 1
        location = location_dict[location]
        pickle.dump(location_dict,
                    open('location_dict_scraper.pkl', 'wb'),
                    protocol=2)
    print location
    #5.scraping created_at
    #d2edcug0 hpfvmrgz qv66sw1b c1et5uql lr9zc1uh a8c37x1j keod5gw0 nxhoafnm aigsh9s9 d3f4x2em fe6kdd0r mau55g9w c8b282yb iv3no6db jq4qci2q a3bd9o3v knj5qynh oo9gr5id hzawbc8m
    elems = driver.find_elements_by_class_name(
        "d2edcug0.hpfvmrgz.qv66sw1b.c1et5uql.lr9zc1uh.a8c37x1j.keod5gw0.nxhoafnm.aigsh9s9.d3f4x2em.fe6kdd0r.mau55g9w.c8b282yb.iv3no6db.jq4qci2q.a3bd9o3v.knj5qynh.oo9gr5id.hzawbc8m"
    )
    # Creation date is hard-coded; account age is computed in whole days.
    created_at = '07 December 1997'
    created_date = datetime.datetime.strptime(
        datetime.datetime.strptime(created_at,
                                   '%d %B %Y').strftime('%m %d %Y'),
        '%m %d %Y')
    today = datetime.datetime.strptime(
        datetime.datetime.now().strftime('%m %d %Y'), '%m %d %Y')
    days_count = today - created_date
    days_count = days_count.days
    print days_count
    #6.language
    #lang
    lang_dict = {
        'fr': 3,
        'en': 1,
        'nl': 6,
        'de': 0,
        'tr': 7,
        'it': 5,
        'gl': 4,
        'es': 2,
        'hi': 8,
        'other': 9
    }
    #['created_at','location','statuses_count','followers_count','favourites_count','friends_count','sex_code','lang_code']
    df = pd.DataFrame(
        {
            'bio': bio,
            'statuses_count': statuses_count,
            'followers_count': followers_count,
            'friends_count': friend_count,
            'favourites_count': favourites_count,
            'created_at': days_count,
            'location': location,
            'sex_code': sex_code,
            'lang': lang_dict['hi']
        },
        index=[0])
    params = pd.Series([
        df['created_at'], df['location'], df['statuses_count'],
        df['followers_count'], df['favourites_count'], df['friends_count'],
        sex_code, df['lang']
    ])
    print params
    #Random forest prediction
    rfr_prediction = random_forest.predict(params)
    #support vector machine prediction
    svm_prediction = support_vector.predict(params)
    #Naive Bayes prediction
    nvb_prediction = naive_bayes.predict(params)
    #Decision Tree Prediction
    dtc_prediction = decision_tree.predict(params)
    #neural network prediction (pybrain: 8 inputs, 2 classes)
    ds2 = ClassificationDataSet(8, 1, nb_classes=2)
    lst = [
        df['created_at'], df['location'], df['statuses_count'],
        df['followers_count'], df['favourites_count'], df['friends_count'],
        sex_code, df['lang'].astype(int)
    ]
    ds2.addSample(lst, 1)
    ds2._convertToOneOfMany()
    fnn_prediction = neural_network.testOnClassData(dataset=ds2)
    # Average of all five model votes, scaled to a percentage.
    percent = (dtc_prediction[0] + nvb_prediction[0] + rfr_prediction[0] +
               svm_prediction[0] + fnn_prediction[0])
    percent = round(percent * 20)
    return render_template('result.html',
                           username=username[0],
                           dtc_prediction=dtc_prediction[0],
                           nvb_prediction=nvb_prediction[0],
                           rfr_prediction=rfr_prediction[0],
                           svm_prediction=svm_prediction[0],
                           fnn_prediction=fnn_prediction[0],
                           percentage=percent,
                           features=int_features)
#READ CSV columns = defaultdict(list) with open('NERoutput.csv') as f: reader = csv.DictReader(f) for row in reader: for (k, v) in row.items(): #iteritems columns[k].append(v) ids = columns.get('id') #contents = columns.get('content') ner = columns.get('NER') print "csv file read" data = {id: row for id, row in zip(ids, ner)} output = () d = gender.Detector() for idn in data: ent = data.get(idn) #CLEAN ENT #if tag is PER, get first name and store by id, gender in dict if isinstance(ent, str): if ent[1:6] == 'I-PER': #len(ent) > 3 and ent = ent.replace("u'", '') ent = ent.replace("'", '') rec = ent[7:-2].split(';') for entity in rec: name = ''.join(ch for ch in entity if ch.isalnum()) output += (idn, name, d.get_gender(name), 'XOXO') print "writing to file" #WRTITE gender TO FILE
# demographics: name, code, gender, affiliation, seniority, pub_rate, first_author_por, avg_coauthor from unidecode import unidecode import sexmachine.detector as gender d = gender.Detector(case_sensitive=False, unknown_value="") author_names = {} current_year = 2019 with open("../csv/authors_unique.csv") as f_authors: f_authors.readline() for line in f_authors: line = line.strip() parts = line.split(",") author_code = parts[0] author_name = parts[1] author_name = author_name.strip() author_names[int(author_code)] = author_name with open("../csv/basic_demographics.csv") as f, open("../csv/demographics.csv","w") as f_out: # write header f_out.write("author_name,author_id,gender,affiliation,seniority,nb_publi,pub_rate,first_author_por,avg_coauthor\n") f.readline() for line in f: line = line.strip() parts = line.split(",") author_id = parts[0] first_publication_year = parts[1] number_of_publications = parts[2].strip() number_of_first_author_publications = parts[3]
#!/usr/bin/python import pandas as pd import sqlite3 import sexmachine.detector as gender import matplotlib.pyplot as plt d = gender.Detector(case_sensitive=False, unknown_value='neutral') # let's start with Python # read the data out of the database file: #connec = sqlite3.connect('dat.db') connec = sqlite3.connect( 'dat_anon.db') #last names removed for public database df = pd.read_sql('select owner_name, language from github', connec) splitnames = [x.split(' ') for x in df['owner_name']] #redundant if using dat_anon df['gender'] = [d.get_gender(x[0]) for x in splitnames] labels = ['male', 'mostly_male', 'neutral', 'mostly_female', 'female'] languages = [ 'JavaScript', 'Python', 'Ruby', 'PHP', 'Java', 'Objective-C', 'C', 'C++', 'Shell', 'C#', 'Go', 'Perl', 'Clojure', 'Scala', 'Haskell', 'Erlang', 'R' ] plotdata = pd.DataFrame(index=languages, columns=labels) for language in languages: langdf = df[df['language'] == language] vc = langdf['gender'].value_counts() norm = sum(vc) for aggcount in vc.iteritems(): gclass, count = aggcount plotdata[gclass][language] = "{:.1f}".format(100 * float(count) / norm)
def setUp(self):
    # Fresh detector with default (case-sensitive) settings for each test.
    self.case = d.Detector()