def get_matches(evaluation_references, sample_N):
    """Run the fuzzy matcher over a sample of positive and negative references.

    A reproducible sample of ``2 * sample_N`` rows is drawn from
    ``evaluation_references``. The first ``sample_N`` rows are "positive"
    queries: they stay in the set of references the matcher searches. The
    remaining rows are "negative" queries: every reference sharing their
    ``uber_id`` is removed from that set before the matcher is built.

    Args:
        evaluation_references: DataFrame with at least ``title`` and
            ``uber_id`` columns.
        sample_N: number of positive queries and number of negative queries.

    Returns:
        DataFrame built from the matcher's result for each query (positives
        first, then negatives), with two added columns: ``Title Length``
        (length of each ``Extracted title`` value) and ``Match Type``
        (``"Positive"`` or ``"Negative"``).
    """
    # Fixed seed so that the same queries are drawn on every run.
    sampled = evaluation_references.sample(
        n=2 * sample_N, random_state=0
    ).reset_index()
    # The matcher is handed rows carrying a capitalised 'Title' field.
    sampled['Title'] = sampled['title']

    positives = sampled.iloc[:sample_N]
    negatives = sampled.iloc[sample_N:]

    # Hide the negative queries from the matcher's search set.
    held_out = evaluation_references['uber_id'].isin(negatives['uber_id'])
    searchable_references = evaluation_references.loc[~held_out]

    # NOTE(review): -1 is passed where the matcher takes its threshold;
    # presumably this disables score filtering -- confirm against FuzzyMatcher.
    fuzzy_matcher = FuzzyMatcher(searchable_references, -1)

    queries = pd.concat([positives, negatives])
    matched = pd.DataFrame(
        [fuzzy_matcher.match(query) for _, query in queries.iterrows()]
    )

    matched["Title Length"] = list(map(len, matched["Extracted title"]))

    # Row order follows ``queries``: positives first, then negatives.
    labels = ["Positive"] * sample_N + ["Negative"] * sample_N
    matched["Match Type"] = labels

    return matched
# Example #2
# 0
	def test_close_match(self):
		"""A title that is only a fragment of a publication title must not match.

		Builds a matcher over a single publication using the configured
		similarity threshold, queries it with the one-word title 'Malaria',
		and expects ``match`` to return ``None``.
		"""
		real_publications = pd.DataFrame({
			'title': ['Malaria is caused by mosquitoes'],
			'uber_id': [1],
		})
		threshold = settings.FUZZYMATCH_SIMILARITY_THRESHOLD
		fuzzy_matcher = FuzzyMatcher(real_publications, threshold)
		reference = {
			'Document id': 10,
			'Reference id': 11,
			'Title': 'Malaria',
		}
		matched_publication = fuzzy_matcher.match(reference)
		# Identity check: an equality comparison against None could be
		# satisfied (or made to raise) by a custom __eq__ on the result.
		self.assertIsNone(matched_publication)