Example #1
def search_candidates_from_e(sentence, key, can_num=10):
    """
    Given a sentence and a key, conceptualize the key in context and find
    candidate distractors for it.
    :param sentence: a complete sentence with the key filled into the original gap
    :param key: entity to be searched in Probase
    :param can_num: maximum number of candidates to be generated
    :return: a dict {'candidate_name': frequency, ...}, built by candidate_prob
             from per-concept pairs ([['candidate_name', frequency], ...], concept_probability)
    """
    sentence = sentence.replace("**blank**", key)
    probabilities_of_concepts = conceptualizer.conceptualize(sentence,
                                                             key,
                                                             0,
                                                             debug,
                                                             eval=True)
    if probabilities_of_concepts is None:
        print("probabilities_of_concepts is none!")
        return None
    cnt = 0
    candidates = []
    syn_key = normalize_instance(key, mode=1)
    for concept, prob in probabilities_of_concepts:
        # keep a candidate only if its normalized form is not in syn_key,
        # i.e. it is not merely another surface form of the key
        # (`c` is assumed to be a module-level Probase connection/cursor)
        tmp = [
            x for x in search_e_from_c(c, concept, can_num)
            if normalize_instance(x[0]) not in syn_key
        ]
        cnt += len(tmp)
        candidates.append((tmp, prob))
        if cnt > can_num:
            return candidate_prob(candidates)
    return candidate_prob(candidates)
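The helper candidate_prob, which collapses the per-concept lists into a single candidate-to-score dict, is not shown on this page. A minimal sketch of plausible behavior, assuming each raw Probase frequency is weighted by its concept probability (a hypothetical reconstruction, not the project's actual code):

# Hypothetical sketch of candidate_prob: collapse
# ([['candidate', freq], ...], concept_prob) pairs into {'candidate': score}.
def candidate_prob(candidates):
    scores = {}
    for pairs, concept_prob in candidates:
        for name, freq in pairs:
            # weight each raw frequency by its concept probability
            scores[name] = scores.get(name, 0.0) + freq * concept_prob
    return scores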
Example #2
def search_candidates_from_e(sentence, key, can_num=10):
    """
    Given a sentence and a key, conceptualize the key in context and find
    candidate distractors for it (variant with progress printing).
    :param sentence: a complete sentence with the key filled into the original gap
    :param key: entity to be searched in Probase
    :param can_num: maximum number of candidates to be generated
    :return: a dict {'candidate_name': frequency, ...}, built by candidate_prob
             from per-concept pairs ([['candidate_name', frequency], ...], concept_probability)
    """
    sentence = sentence.replace('**blank**', key)
    print("Probase sentence: ", sentence)
    probabilities_of_concepts = conceptualizer.conceptualize(sentence,
                                                             key,
                                                             debug,
                                                             eval=True)
    print("Probability of concepts done!")
    if probabilities_of_concepts is None:
        return None
    cnt = 0
    candidates = []
    syn_key = normalize_instance(key, mode=1)
    for concept, prob in probabilities_of_concepts:
        # keep a candidate only if its normalized form is not in syn_key,
        # i.e. it is not merely another surface form of the key
        # (`c` is assumed to be a module-level Probase connection/cursor)
        tmp = [
            x for x in search_e_from_c(c, concept, can_num)
            if normalize_instance(x[0]) not in syn_key
        ]
        cnt += len(tmp)
        candidates.append((tmp, prob))
        if cnt > can_num:
            return candidate_prob(candidates)
    return candidate_prob(candidates)


# debug = True
# search_candidates_from_e("apple and iPad are useful products", "apple")
# search_candidates_from_e("He likes to eat apple", "apple")

# search_candidates_from_e("Earth's core is primarily composed of magma of the following materials", "magma")
# search_candidates_from_e("the ba4ic unit of life is cell",'cell')
#print(search_candidates_from_e("human have been on the earth for the shortest amount of time",'human',100)) # "Insects","Fish","Reptiles"

# The following shows 100 candidates
# candidates = search_candidates_from_e("the most basic unit of living things is Cells",'Cells',100) # "Bones","Tissues","Organs"
# cd = candidate_prob(candidates)
# print(sorted(cd.items(), key=lambda d: -d[1]))
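Several snippets on this page also depend on normalize_instance, which is not shown here. The call sites imply that the default mode returns one normalized string, while mode=1 returns a set of normalized forms so that membership tests like `normalize_instance(x[0]) not in syn_key` work. A rough sketch under those assumptions (guessed from usage, not the actual implementation):

# Hypothetical sketch of normalize_instance, inferred from its call sites.
def normalize_instance(text, mode=0):
    normalized = " ".join(text.lower().split())
    if mode == 1:
        # mode=1: return a set of forms for membership tests
        return {normalized, normalized.replace(" ", "")}
    return normalized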
Example #3
	def hit_rate(self, topk, distractors, k=10):
		"""
		Calculate the hit rate within topk for one item in the dataset.
		"""
		total = len(distractors)
		valid = 0
		# try several surface forms of each distractor against topk;
		# `k` is unused here (topk is assumed to be pre-truncated)
		for d_orig in distractors:
			d = normalize_instance(d_orig)
			forms = [d_orig, d, d.capitalize(),
					' '.join(x.capitalize() for x in d.split()),
					''.join(x.capitalize() for x in d.split())]
			for form in forms:
				if form in topk:
					valid += 1
					break
		return float(valid) / total, topk
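A quick standalone sanity check (a sketch: the enclosing class is not shown on this page, so `self` is passed as None, and normalize_instance must be importable):

# Sketch: exercising hit_rate outside its (unshown) class.
topk = ['tissues', 'organs', 'bones', 'cells']
rate, _ = hit_rate(None, topk, ['Bones', 'Tissues', 'Organs'])
print(rate)  # 1.0 when every distractor matches topk in some surface form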
Example #4
import numpy as np  # needed below; lives at module scope in the source file


def calculate_features(item, mode=1, can_num=100, source="Probase"):
    """
    Given an item, generate can_num candidates and test the hit rate.
    :param can_num: number of candidates to be generated
    :param mode: 1 means single-processing; 2 means multi-processing, which
                 additionally returns len(distractors)
    :return: features: probability from Probase + embedding similarities for
             each candidate
    """

    # select a candidate generation source
    print("calculating features...")
    if source == 'Probase':
        candidates = search_candidates_from_e(item['sentence'], item['answer'],
                                              can_num)
    elif source == "WordNet":
        candidates = wordnet_predict(item['sentence'], item['answer'], can_num)
    else:
        # the word2vec path is disabled; without it there are no candidates
        # candidates = word2vec_predict(item['sentence'], can_num)
        candidates = None
    # print(candidates)
    # candidates is None when the answer is not found in the source
    if candidates is None:
        return np.array([]), np.array([])

    cnt = 1
    rankings = {}  # from candidate to its ranking
    features = []  # concept probability + embedding features per candidate
    Y = []  # labels: 1 if the candidate is a gold distractor, else 0
    res = []  # rankings at which gold distractors were recovered
    dic = {}  # from distractor surface form to distractor index
    visit = [0] * len(item['distractors'])

    # different forms of distractors to be tested
    distractors = []
    for i in range(len(item['distractors'])):
        d = normalize_instance(item['distractors'][i])
        for k in [
                item['distractors'][i], d,
                d.capitalize(), ' '.join([x.capitalize() for x in d.split()]),
                ''.join([x.capitalize() for x in d.split()])
        ]:
            distractors.append(k)
            dic[k] = i

    item['answer'] = normalize_instance(item['answer'])
    item['sentence'] = " ".join(
        [normalize_instance(x) for x in item['sentence'].split()])
    scores = []  # raw candidate scores from the generation source
    LMProb = []  # unused in this snippet
    pairs = []  # (sentence, candidate) pairs

    for c, v in sorted(candidates.items(), key=lambda d: -d[1]):
        # print("feature for ,", c)
        y = 0
        rankings[c] = cnt
        if c in distractors:
            if visit[dic[c]] == 1:
                cnt += 1
                continue
            res.append(rankings[c])
            visit[dic[c]] = 1
            y = 1
        cnt += 1
        try:
            features.append(
                cal_26_feature_vec([item['sentence'], item['answer'], c]))
            Y.append(y)
            scores.append([v])
            pairs.append([item['sentence'], c])
        except Exception as e:
            print("feature extraction failed for candidate:", c, e)

    for i in range(len(item['distractors'])):
        if visit[i] == 0:
            try:
                features.append(
                    cal_26_feature_vec([
                        item['sentence'], item['answer'],
                        item['distractors'][i]
                    ]))
                Y.append(1)
                scores.append([0])

            except Exception as e:
                print("feature extraction failed for distractor:",
                      item['distractors'][i], e)

    features = np.array(features, dtype=np.float32)
    scores = np.array(scores, dtype=np.float64)
    # normalize scores column-wise; guard against an all-zero column
    scores_normed = scores / np.maximum(scores.max(axis=0), 1e-12)
    features = np.hstack((features, scores_normed))
    print(features.shape)
    if mode == 1:
        return features, Y, res
    else:
        return features, Y, res, len(item['distractors'])
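A minimal invocation sketch, reusing the sample sentence from the Example #1 comments; it assumes search_candidates_from_e, cal_26_feature_vec, and normalize_instance are importable, and the dict keys are exactly the ones calculate_features reads:

# Sketch of a call; the keys mirror what calculate_features reads above.
item = {
    'sentence': 'the most basic unit of living things is Cells',
    'answer': 'Cells',
    'distractors': ['Bones', 'Tissues', 'Organs'],
}
features, Y, res = calculate_features(item, mode=1, can_num=100)
print(features.shape, sum(Y), res)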