示例#1
0
def computeSkillIndexes(profile, skills):
    """Compute a per-category skill index for a profile.

    Every entry of ``profile['skills']`` carries an equal share of 100
    points (``100.0 / len(profile['skills'])``).  For each category in
    ``skills`` whose topic list contains the entry, that share is added to
    the category's index.  An entry found in no topic list is handed to
    ``categorizer.categorize``; the entry is appended to the topic list of
    every distinct category returned, and each of those categories receives
    the share as well.

    Args:
        profile: mapping with a ``'skills'`` key holding a sized collection
            of skill names.
        skills: dict mapping category name -> list of topics.  It is
            mutated in place when an unknown skill gets categorized.

    Returns:
        dict mapping category name -> accumulated index (float).
        Categories that received no share are absent; an empty profile
        yields an empty dict.
    """
    owned = profile['skills']
    if len(owned) == 0:
        # Nothing to distribute; also keeps the division below safe.
        return {}

    # Each skill contributes the same weight, so compute it once.
    share = 100.0 / len(owned)
    skillindex = {}

    for item in owned:
        # Categories whose topic list already knows this skill.
        matched = [category for category, topics in skills.items()
                   if item in topics]
        if matched:
            for category in matched:
                skillindex[category] = skillindex.get(category, 0) + share
            continue

        # Unknown skill: ask the categorizer and remember the answer so the
        # next lookup of the same skill is served from ``skills`` directly.
        for cat in set(categorizer.categorize(item)):
            # setdefault: the categorizer may name a category that has no
            # topic list yet; create it instead of raising KeyError.
            skills.setdefault(cat, []).append(item)
            skillindex[cat] = skillindex.get(cat, 0) + share

    # NOTE: an inactive (commented-out) experiment that also categorized
    # related skills via scraper.extractRelatedSkills used to live here.
    return skillindex
示例#2
0
def doc_categorize(fsort):
    cat_prob = [0,0,0,0]
    for term in fsort:
        print term[0]
    for kw in fsort:
        term_corr = [kw[1]*term for term in categorizer.categorize(kw[0])]
        cat_prob = map(operator.add, cat_prob, term_corr)
        print 'cumulative probability :', cat_prob
    return cat_prob
示例#3
0
def run_server(kw_list,log):
	global source_probs
	cat_list = {}
	cat_list['general'] = wiki_list = [("Wikipedia", "site:wikipedia.org"), ("Citizendium", "site:citizendium.org"), ("Britannica", "site:britannica.com")]
	cat_list['technology'] = [("gizmodo","gizmodo"), ("theverge","theverge"), ("engadget","engadget")]

	source_probs['general'] = phase_one.phase1_update(source_probs['general'], cat_list['general'], [item[0] for item in kw_list], 1, True, log)

	pickle.dump( source_probs, open( "source_probabilities.sp", "wb" ) )

	for kw in kw_list:
		kw_category = string.lower(cat_mod.categorize(kw[0]))
		print kw, kw_category
		get_uri(kw,cat_list['general'], source_probs['general'], 2)
		if kw_category in cat_list.keys():
			get_uri(kw,cat_list[kw_category], source_probs['technology'], 2)
	print_uri()