示例#1
0
def main():
    """Run the "Mathematician" search once per credential and pickle the results.

    For every name in ``cred`` this authenticates with
    ``credentials_<name>.json`` and runs ``search`` with the keyword
    "Mathematician". Whatever each search returns is collected (one entry
    per credential) and written to a pickle under ``./data`` whose name
    embeds the module-level ``month``, ``day`` and ``year`` values.
    """
    cred = ['amanuel']

    # Placeholder for the per-user connection lookup, which is switched off:
    #   connections_dict[name] = retrieve_connections(application, name)
    connections_dict = {}

    def profiles_for(name):
        # Authenticate with this user's credential file, then search.
        application = authenticate("credentials_" + name + ".json")
        return search(application, name, "Mathematician")

    total_profiles_list = [profiles_for(name) for name in cred]

    # Persist everything that was retrieved.
    total_out_file = "".join(
        ["./data/total_profiles_math", month, day, year, ".pkl"])
    utils.savepickle(total_profiles_list, total_out_file)
示例#2
0
def main():
	"""Run the "Mathematician" search once per credential and pickle the results.

	For every name in ``cred`` this authenticates with
	``credentials_<name>.json`` and runs ``search`` with the keyword
	"Mathematician". Whatever each search returns is collected (one entry
	per credential) and written to a pickle under ``./data`` whose name
	embeds the module-level ``month``, ``day`` and ``year`` values.
	"""
	cred = ['amanuel']

	# Placeholder for the per-user connection lookup, which is switched off:
	#   connections_dict[name] = retrieve_connections(application, name)
	connections_dict = {}

	def profiles_for(name):
		# Authenticate with this user's credential file, then search.
		application = authenticate("credentials_" + name + ".json")
		return search(application, name, "Mathematician")

	total_profiles_list = [profiles_for(name) for name in cred]

	# Persist everything that was retrieved.
	total_out_file = "".join(
		["./data/total_profiles_math", month, day, year, ".pkl"])
	utils.savepickle(total_profiles_list, total_out_file)
示例#3
0
def build_skills_ds(db, collection):
	"""Build a profile-by-skill indicator matrix and pickle it.

	Args:
		db: Unused; kept so existing callers keep working.
		collection: Object whose ``find(filter, projection)`` yields
			dict-like profile documents holding an ``id`` and, optionally,
			an iterable ``skills`` (presumably a MongoDB collection --
			confirm against the caller).

	Returns:
		A ``(df_skills, skill_full_list)`` tuple. ``df_skills`` is a
		DataFrame indexed by profile id with one column per distinct
		skill, holding 1 where the profile lists the skill and 0
		elsewhere. ``skill_full_list`` contains every skill occurrence,
		duplicates included, in cursor order.

	Raises:
		KeyError: If a document has no ``id``.
	"""
	print("Building skills Df")

	# NOTE: an earlier attempt based on collection.distinct('skills') was
	# abandoned because naming the columns from its result failed.

	skill_set = set()
	skill_full_list = []
	index = []
	# (id, skills) pairs, remembered so the matrix can be filled without
	# querying the collection a second time.
	skills_by_profile = []

	# Only the id and the skills are needed, so project just those fields.
	cursor = collection.find({}, {"_id": 0, "skills": 1, "id": 1})
	for results in cursor:
		if "skills" in results:
			skills = list(results['skills'])
			skill_set.update(skills)
			skill_full_list.extend(skills)
			skills_by_profile.append((results['id'], skills))
		# Profiles without skills still get an (all-zero) row.
		index.append(results['id'])
	print(len(skill_set))

	# Column labels are passed as a list: a set has no defined order and
	# recent pandas versions refuse it.
	df_skills = pd.DataFrame(index=index, columns=list(skill_set)).fillna(0)

	for user_id, skills in skills_by_profile:
		for skill in skills:
			# .loc is the label-based replacement for .ix, which has been
			# removed from pandas.
			df_skills.loc[user_id, skill] = 1

	# Save the matrix in a pickle
	date_string = utils.get_date_string()
	out_file_matrix = './results/skills_matrix_' + date_string + '.pkl'
	utils.savepickle(df_skills, out_file_matrix)
	return df_skills, skill_full_list
示例#4
0
def retrieve_connections(applicaiton, name):
	"""Fetch a user's connections and keep those mentioning "data scientist".

	The raw API response is pickled to ``./data/<name>_connections_list.pkl``
	before any processing. A connection is kept when "data scientist"
	appears (case-insensitively) in its headline, in the title of any of its
	positions, or in its summary; every match is reported on stdout. The
	kept connections are pickled to a second file whose name embeds the
	module-level ``month``, ``day`` and ``year`` values.

	Args:
		applicaiton: API client exposing ``get_connections(selectors=...)``
			(presumably a python-linkedin application object -- confirm
			against ``authenticate``).
		name: User label, used only to build the output file names.

	Returns:
		List of the matching connection dicts.
	"""
	selectors = [
		'id', 'first-name', 'last-name', 'headline', 'summary',
		'location', 'distance', 'num-connections', 'skills',
		'public-profile-url', 'date-of-birth', 'courses', 'specialties',
		'educations', 'positions',
	]
	outfile = "./data/" + name + "_connections_list.pkl"
	connections = applicaiton.get_connections(selectors=selectors)
	print(connections.keys())
	# Save the raw response first so it can be re-processed later without
	# another API call. It is used directly below; reading the pickle back
	# immediately would only reproduce the same data.
	utils.savepickle(connections, outfile)

	needle = 'data scientist'

	def report(connection, reason):
		# Print who matched and which field triggered the match.
		print("%s %s" % (connection.get('firstName', ''),
			connection.get('lastName', '')))
		print(reason)

	data_scienctist_connections = []
	# 'values' may be missing when there are no connections.
	for connection in connections.get('values', []):
		found = False

		if needle in (connection.get('headline') or '').lower():
			found = True
			report(connection, "Data Scientist in Headline")

		# Walk the positions actually present instead of trusting '_total',
		# and tolerate positions that carry no title.
		positions = connection.get('positions') or {}
		for position in positions.get('values', []):
			if needle in (position.get('title') or '').lower():
				found = True
				report(connection, "Data Scientist in a position")

		if needle in (connection.get('summary') or '').lower():
			found = True
			report(connection, "Data Scientist in Summary")

		if found:
			data_scienctist_connections.append(connection)

	# Save the data scientist connections
	outfile_conn = "./data/" + name + "_data_science_connections" + month + day + year + ".pkl"
	utils.savepickle(data_scienctist_connections, outfile_conn)
	return data_scienctist_connections
示例#5
0
def build_skills_ds(db, collection):
    """Build a profile-by-skill indicator matrix and pickle it.

    Args:
        db: Unused; kept so existing callers keep working.
        collection: Object whose ``find(filter, projection)`` yields
            dict-like profile documents holding an ``id`` and, optionally,
            an iterable ``skills`` (presumably a MongoDB collection --
            confirm against the caller).

    Returns:
        A ``(df_skills, skill_full_list)`` tuple. ``df_skills`` is a
        DataFrame indexed by profile id with one column per distinct
        skill, holding 1 where the profile lists the skill and 0
        elsewhere. ``skill_full_list`` contains every skill occurrence,
        duplicates included, in cursor order.

    Raises:
        KeyError: If a document has no ``id``.
    """
    print("Building skills Df")

    # NOTE: an earlier attempt based on collection.distinct('skills') was
    # abandoned because naming the columns from its result failed.

    skill_set = set()
    skill_full_list = []
    index = []
    # (id, skills) pairs, remembered so the matrix can be filled without
    # querying the collection a second time.
    skills_by_profile = []

    # Only the id and the skills are needed, so project just those fields.
    cursor = collection.find({}, {"_id": 0, "skills": 1, "id": 1})
    for results in cursor:
        if "skills" in results:
            skills = list(results['skills'])
            skill_set.update(skills)
            skill_full_list.extend(skills)
            skills_by_profile.append((results['id'], skills))
        # Profiles without skills still get an (all-zero) row.
        index.append(results['id'])
    print(len(skill_set))

    # Column labels are passed as a list: a set has no defined order and
    # recent pandas versions refuse it.
    df_skills = pd.DataFrame(index=index, columns=list(skill_set)).fillna(0)

    for user_id, skills in skills_by_profile:
        for skill in skills:
            # .loc is the label-based replacement for .ix, which has been
            # removed from pandas.
            df_skills.loc[user_id, skill] = 1

    # Save the matrix in a pickle
    date_string = utils.get_date_string()
    out_file_matrix = './results/skills_matrix_' + date_string + '.pkl'
    utils.savepickle(df_skills, out_file_matrix)
    return df_skills, skill_full_list
示例#6
0
def retrieve_connections(applicaiton, name):
    """Fetch a user's connections and keep those mentioning "data scientist".

    The raw API response is pickled to ``./data/<name>_connections_list.pkl``
    before any processing. A connection is kept when "data scientist"
    appears (case-insensitively) in its headline, in the title of any of its
    positions, or in its summary; every match is reported on stdout. The
    kept connections are pickled to a second file whose name embeds the
    module-level ``month``, ``day`` and ``year`` values.

    Args:
        applicaiton: API client exposing ``get_connections(selectors=...)``
            (presumably a python-linkedin application object -- confirm
            against ``authenticate``).
        name: User label, used only to build the output file names.

    Returns:
        List of the matching connection dicts.
    """
    selectors = [
        'id', 'first-name', 'last-name', 'headline', 'summary',
        'location', 'distance', 'num-connections', 'skills',
        'public-profile-url', 'date-of-birth', 'courses', 'specialties',
        'educations', 'positions',
    ]
    outfile = "./data/" + name + "_connections_list.pkl"
    connections = applicaiton.get_connections(selectors=selectors)
    print(connections.keys())
    # Save the raw response first so it can be re-processed later without
    # another API call. It is used directly below; reading the pickle back
    # immediately would only reproduce the same data.
    utils.savepickle(connections, outfile)

    needle = 'data scientist'

    def report(connection, reason):
        # Print who matched and which field triggered the match.
        print("%s %s" % (connection.get('firstName', ''),
                         connection.get('lastName', '')))
        print(reason)

    data_scienctist_connections = []
    # 'values' may be missing when there are no connections.
    for connection in connections.get('values', []):
        found = False

        if needle in (connection.get('headline') or '').lower():
            found = True
            report(connection, "Data Scientist in Headline")

        # Walk the positions actually present instead of trusting '_total',
        # and tolerate positions that carry no title.
        positions = connection.get('positions') or {}
        for position in positions.get('values', []):
            if needle in (position.get('title') or '').lower():
                found = True
                report(connection, "Data Scientist in a position")

        if needle in (connection.get('summary') or '').lower():
            found = True
            report(connection, "Data Scientist in Summary")

        if found:
            data_scienctist_connections.append(connection)

    # Save the data scientist connections
    outfile_conn = "./data/" + name + "_data_science_connections" + month + day + year + ".pkl"
    utils.savepickle(data_scienctist_connections, outfile_conn)
    return data_scienctist_connections
示例#7
0
def search(application, name, keywords):
	"""Retrieve the full profiles matching ``keywords`` with one user's credentials.

	A first ``search_profile`` call reads the total hit count and the page
	size. The result set is then walked page by page and ``get_profile`` is
	called for every hit whose id is not ``'private'``. The raw first
	response, the per-page profile lists and the complete list are pickled
	under ``./data``; the file names embed ``name`` and the module-level
	``month``, ``day`` and ``year`` values.

	Args:
		application: API client exposing ``search_profile`` and
			``get_profile`` (presumably a python-linkedin application
			object -- confirm against ``authenticate``).
		name: Credential/user label, used only in output file names.
		keywords: Search keywords passed through to the API.

	Returns:
		List with the ``get_profile`` result of every non-private hit.
	"""
	log = True
	people_selectors = [{'people': ['first-name', 'last-name', 'id']}]
	profile_selectors = [
		'id', 'first-name', 'last-name', 'headline', 'summary',
		'location', 'distance', 'num-connections', 'skills',
		'public-profile-url', 'date-of-birth', 'courses', 'specialties',
		'educations', 'positions',
	]
	date_tag = month + day + year

	search_results = application.search_profile(
		selectors=people_selectors,
		params={'keywords': keywords, 'start': 0, 'count': 25})

	# Saves the results in pickle file
	if log:
		outfile = "./data/search_results_" + name + "_" + date_tag + ".pkl"
		utils.savepickle(search_results, outfile)

	people = search_results['people']
	total_people_count = int(people['_total'])
	# An empty result set may come back without paging information, so
	# default these to 0 instead of raising KeyError.
	pagination = int(people.get('_count', 0))
	start = int(people.get('_start', 0))
	print("Found %d results %s" % (total_people_count, pagination))

	# Number of pages = ceil(total / page size). Floor division keeps this
	# an int under true division too, and a page size of 0 means there is
	# nothing to fetch (rather than a ZeroDivisionError).
	if pagination > 0:
		calls = (total_people_count + pagination - 1) // pagination
	else:
		calls = 0
	print(calls)

	full_results = []
	for _ in range(calls):
		# Slow down the requests, just in case.
		sleep(0.5)
		results = application.search_profile(
			selectors=people_selectors,
			params={'keywords': keywords, 'start': start, 'count': pagination})
		profile_list = parse_search_results(results)
		profile_details_list = []
		for profile in profile_list:
			profile_id = profile['id']
			if profile_id != 'private':
				print(profile_id)
				# TO DO: add all the possible fields
				profile_details = application.get_profile(
					member_id=profile_id, selectors=profile_selectors)
				print(profile_details)
				profile_details_list.append(profile_details)
				full_results.append(profile_details)
			# The page file is rewritten after every hit, so a failure in
			# the middle of a page still leaves the profiles fetched so far
			# on disk (presumably intentional checkpointing -- confirm).
			outfile = "./data/" + name + "_profiles_" + str(start) + "_" + date_tag + ".pkl"
			utils.savepickle(profile_details_list, outfile)

		# Increase the start point
		start += pagination

	# Save the full_list of results
	outfile = "./data/" + name + "_full_profile_list" + date_tag + ".pkl"
	utils.savepickle(full_results, outfile)
	return full_results
示例#8
0
def enhance_profiles(profile_file, search_label, data_scientist_label):
    """Label API profiles and back-fill missing fields from saved public profiles.

    Every profile read from ``profile_file`` gets ``search_label`` and
    ``label`` entries. If ``./data/full_profiles/<id>_profile.json`` can be
    loaded, any of ``educations``, ``skills`` and ``specialties`` missing
    from the profile is copied from it (``educations`` comes from the
    public profile's ``education`` key) and a matching ``added_*`` flag is
    set. The resulting list is pickled under ``./data/enhanced_profiles``
    with a name embedding the module-level ``day``, ``month`` and ``year``
    values. Progress is reported on stdout.

    Args:
        profile_file: Path of the pickle holding the list of profile dicts.
        search_label: Value stored under ``'search_label'`` in each profile.
        data_scientist_label: Value stored under ``'label'`` in each profile.

    Returns:
        None. The profiles are updated in place and written to disk.
    """
    # (key in the API profile, key in the public profile, flag set on copy)
    backfill_fields = (
        ('educations', 'education', 'added_education'),
        ('skills', 'skills', 'added_skills'),
        ('specialties', 'specialties', 'added_specialties'),
    )

    enhanced_profiles = []
    for profile in utils.readpickle(profile_file):
        user_id = profile['id']
        pub_profile_file = './data/full_profiles/' + user_id + "_profile.json"
        # Add the additional labels information
        profile['search_label'] = search_label
        profile['label'] = data_scientist_label

        # Only the load is guarded, so unrelated errors are no longer
        # reported as a missing file; the file is opened once and closed
        # by the context manager.
        try:
            with open(pub_profile_file) as handle:
                pub_profile = json.load(handle)
        except IOError:
            print("Public profile file not found")
        except ValueError:
            print("Public profile file is not valid JSON")
        else:
            if isinstance(pub_profile, dict):
                for profile_key, pub_key, flag_key in backfill_fields:
                    if profile_key in profile:
                        continue
                    print("Missing %s" % pub_key)
                    if pub_key in pub_profile:
                        print("Found %s in pub profile file" % pub_key)
                        profile[profile_key] = pub_profile[pub_key]
                        profile[flag_key] = True
                    else:
                        print("%s not found in public profile"
                              % pub_key.capitalize())
                print(profile)
            else:
                print("Public profile file does not contain a JSON object")

        # The profile is kept whether or not anything could be back-filled.
        enhanced_profiles.append(profile)

    # Save the pickle with new profile
    out_file_enh_profiles = './data/enhanced_profiles/math_enchanced_total_unique_profiles_' + day + month + year + '.pkl'
    utils.savepickle(enhanced_profiles, out_file_enh_profiles)
示例#9
0
# Merge the per-credential profile lists into one list, keeping only the
# first profile seen for each (firstName, lastName) pair.

cred = ['mine', 'motoki', 'henry']
total_unique_profiles = []
unique_users = set()
total_profiles = 0

for name in cred:
    profile_filename = "./data/%s_full_profile_list1192013.pkl" % name
    profile_list = utils.readpickle(profile_filename)
    for profile in profile_list:
        total_profiles += 1
        user_id = profile['id']
        firstName = profile['firstName']
        lastName = profile['lastName']
        # Uniqueness is decided by the name pair, not by user_id.
        user = (firstName, lastName)
        if user in unique_users:
            # Already collected: report the duplicate and move on.
            print("user  exists")
            print(user)
            continue
        unique_users.add(user)
        total_unique_profiles.append(profile)

# Persist the merged list and print a summary of the counts.
out_tot_profiles = 'data/total_unique_profile_math_list.pkl'
utils.savepickle(total_unique_profiles, out_tot_profiles)
print("Total Profiles: %d, Unique profiles %d, %d" % (
    total_profiles, len(unique_users), len(total_unique_profiles)))
def enhance_profiles(profile_file, search_label, data_scientist_label):
	"""Label API profiles and back-fill missing fields from saved public profiles.

	Every profile read from ``profile_file`` gets ``search_label`` and
	``label`` entries. If ``./data/full_profiles/<id>_profile.json`` can be
	loaded, any of ``educations``, ``skills`` and ``specialties`` missing
	from the profile is copied from it (``educations`` comes from the
	public profile's ``education`` key) and a matching ``added_*`` flag is
	set. The resulting list is pickled under ``./data/enhanced_profiles``
	with a name embedding the module-level ``day``, ``month`` and ``year``
	values. Progress is reported on stdout.

	Args:
		profile_file: Path of the pickle holding the list of profile dicts.
		search_label: Value stored under ``'search_label'`` in each profile.
		data_scientist_label: Value stored under ``'label'`` in each profile.

	Returns:
		None. The profiles are updated in place and written to disk.
	"""
	# (key in the API profile, key in the public profile, flag set on copy)
	backfill_fields = (
		('educations', 'education', 'added_education'),
		('skills', 'skills', 'added_skills'),
		('specialties', 'specialties', 'added_specialties'),
	)

	enhanced_profiles = []
	for profile in utils.readpickle(profile_file):
		user_id = profile['id']
		pub_profile_file = './data/full_profiles/' + user_id + "_profile.json"
		# Add the additional labels information
		profile['search_label'] = search_label
		profile['label'] = data_scientist_label

		# Only the load is guarded, so unrelated errors are no longer
		# reported as a missing file; the file is opened once and closed
		# by the context manager.
		try:
			with open(pub_profile_file) as handle:
				pub_profile = json.load(handle)
		except IOError:
			print("Public profile file not found")
		except ValueError:
			print("Public profile file is not valid JSON")
		else:
			if isinstance(pub_profile, dict):
				for profile_key, pub_key, flag_key in backfill_fields:
					if profile_key in profile:
						continue
					print("Missing %s" % pub_key)
					if pub_key in pub_profile:
						print("Found %s in pub profile file" % pub_key)
						profile[profile_key] = pub_profile[pub_key]
						profile[flag_key] = True
					else:
						print("%s not found in public profile"
							% pub_key.capitalize())
				print(profile)
			else:
				print("Public profile file does not contain a JSON object")

		# The profile is kept whether or not anything could be back-filled.
		enhanced_profiles.append(profile)

	# Save the pickle with new profile
	out_file_enh_profiles = './data/enhanced_profiles/math_enchanced_total_unique_profiles_' + day + month + year + '.pkl'
	utils.savepickle(enhanced_profiles, out_file_enh_profiles)
示例#11
0
def search(application, name, keywords):
    """Retrieve the full profiles matching ``keywords`` with one user's credentials.

    A first ``search_profile`` call reads the total hit count and the page
    size. The result set is then walked page by page and ``get_profile`` is
    called for every hit whose id is not ``'private'``. The raw first
    response, the per-page profile lists and the complete list are pickled
    under ``./data``; the file names embed ``name`` and the module-level
    ``month``, ``day`` and ``year`` values.

    Args:
        application: API client exposing ``search_profile`` and
            ``get_profile`` (presumably a python-linkedin application
            object -- confirm against ``authenticate``).
        name: Credential/user label, used only in output file names.
        keywords: Search keywords passed through to the API.

    Returns:
        List with the ``get_profile`` result of every non-private hit.
    """
    log = True
    people_selectors = [{'people': ['first-name', 'last-name', 'id']}]
    profile_selectors = [
        'id', 'first-name', 'last-name', 'headline', 'summary',
        'location', 'distance', 'num-connections', 'skills',
        'public-profile-url', 'date-of-birth', 'courses', 'specialties',
        'educations', 'positions',
    ]
    date_tag = month + day + year

    search_results = application.search_profile(
        selectors=people_selectors,
        params={'keywords': keywords, 'start': 0, 'count': 25})

    # Saves the results in pickle file
    if log:
        outfile = "./data/search_results_" + name + "_" + date_tag + ".pkl"
        utils.savepickle(search_results, outfile)

    people = search_results['people']
    total_people_count = int(people['_total'])
    # An empty result set may come back without paging information, so
    # default these to 0 instead of raising KeyError.
    pagination = int(people.get('_count', 0))
    start = int(people.get('_start', 0))
    print("Found %d results %s" % (total_people_count, pagination))

    # Number of pages = ceil(total / page size). Floor division keeps this
    # an int under true division too, and a page size of 0 means there is
    # nothing to fetch (rather than a ZeroDivisionError).
    if pagination > 0:
        calls = (total_people_count + pagination - 1) // pagination
    else:
        calls = 0
    print(calls)

    full_results = []
    for _ in range(calls):
        # Slow down the requests, just in case.
        sleep(0.5)
        results = application.search_profile(
            selectors=people_selectors,
            params={'keywords': keywords, 'start': start, 'count': pagination})
        profile_list = parse_search_results(results)
        profile_details_list = []
        for profile in profile_list:
            profile_id = profile['id']
            if profile_id != 'private':
                print(profile_id)
                # TO DO: add all the possible fields
                profile_details = application.get_profile(
                    member_id=profile_id, selectors=profile_selectors)
                print(profile_details)
                profile_details_list.append(profile_details)
                full_results.append(profile_details)
            # The page file is rewritten after every hit, so a failure in
            # the middle of a page still leaves the profiles fetched so far
            # on disk (presumably intentional checkpointing -- confirm).
            outfile = "./data/" + name + "_profiles_" + str(start) + "_" + date_tag + ".pkl"
            utils.savepickle(profile_details_list, outfile)

        # Increase the start point
        start += pagination

    # Save the full_list of results
    outfile = "./data/" + name + "_full_profile_list" + date_tag + ".pkl"
    utils.savepickle(full_results, outfile)
    return full_results
示例#12
0
# Merge the per-credential profile lists into one list, keeping only the
# first profile seen for each (firstName, lastName) pair.


cred = ['mine', 'motoki', 'henry']
total_unique_profiles = []
unique_users = set()
total_profiles = 0

for name in cred:
	profile_filename = "./data/%s_full_profile_list1192013.pkl" % name
	profile_list = utils.readpickle(profile_filename)
	for profile in profile_list:
		total_profiles += 1
		user_id = profile['id']
		firstName = profile['firstName']
		lastName = profile['lastName']
		# Uniqueness is decided by the name pair, not by user_id.
		user = (firstName, lastName)
		if user in unique_users:
			# Already collected: report the duplicate and move on.
			print("user  exists")
			print(user)
			continue
		unique_users.add(user)
		total_unique_profiles.append(profile)

# Persist the merged list and print a summary of the counts.
out_tot_profiles = 'data/total_unique_profile_math_list.pkl'
utils.savepickle(total_unique_profiles, out_tot_profiles)
print("Total Profiles: %d, Unique profiles %d, %d" % (
	total_profiles, len(unique_users), len(total_unique_profiles)))