def get_matches(evaluation_references, sample_N):
    """Fuzzy-match a fixed random sample of references and label each result.

    ``2 * sample_N`` rows are sampled from ``evaluation_references``. The
    first ``sample_N`` are the "Positive" queries; the remaining ``sample_N``
    are the "Negative" queries, whose ``uber_id`` values are removed from the
    pool handed to the matcher.

    Args:
        evaluation_references: DataFrame with at least ``title`` and
            ``uber_id`` columns.
        sample_N: Number of positive queries (and of negative queries).

    Returns:
        DataFrame built from the per-query results of ``FuzzyMatcher.match``,
        with two extra columns: ``Title Length`` (length of each
        ``Extracted title``) and ``Match Type`` ("Positive"/"Negative").
    """
    # Fixed seed: the same rows are drawn on every call.
    sampled = evaluation_references.sample(
        n=sample_N * 2, random_state=0
    ).reset_index()
    # Expose the lowercase 'title' column under the name 'Title' as well.
    sampled['Title'] = sampled['title']

    positives = sampled.iloc[:sample_N]
    negatives = sampled.iloc[sample_N:]

    # Drop the negative queries from the pool the matcher searches.
    is_negative = evaluation_references['uber_id'].isin(negatives['uber_id'])
    searchable = evaluation_references.loc[~is_negative]

    # NOTE(review): -1 is passed as the threshold argument; presumably this
    # keeps any candidate from being rejected -- confirm against FuzzyMatcher.
    fuzzy_matcher = FuzzyMatcher(searchable, -1)

    queries = pd.concat([positives, negatives])
    matched = [fuzzy_matcher.match(row) for _, row in queries.iterrows()]

    eval_references = pd.DataFrame(matched)
    eval_references["Title Length"] = [
        len(extracted) for extracted in eval_references["Extracted title"]
    ]

    # Queries were issued positives-first, so the labels follow that order.
    labels = ["Positive"] * sample_N
    labels.extend(["Negative"] * sample_N)
    eval_references["Match Type"] = labels

    return eval_references
예제 #2
0
	def test_close_match(self):
		"""A title that is only a fragment of a publication title is not matched.

		The single known publication is 'Malaria is caused by mosquitoes';
		the reference title is just 'Malaria'. With the project's configured
		similarity threshold, ``match`` is expected to return ``None``.
		"""
		real_publications = pd.DataFrame({
			'title': ['Malaria is caused by mosquitoes'],
			'uber_id': [1],
		})
		threshold = settings.FUZZYMATCH_SIMILARITY_THRESHOLD
		fuzzy_matcher = FuzzyMatcher(real_publications, threshold)
		reference = {
			'Document id': 10,
			'Reference id': 11,
			'Title': 'Malaria',
		}
		matched_publication = fuzzy_matcher.match(reference)
		# Identity check: `== None` would go through the result's own __eq__,
		# which is unreliable if a pandas object is ever returned.
		self.assertIsNone(matched_publication)
예제 #3
0
	def test_no_string_titles(self):
		"""Constructing a matcher from integer titles raises AttributeError."""
		numeric_titles = pd.DataFrame({'title': [1, 2]})
		# Callable form: assertRaises invokes FuzzyMatcher(numeric_titles, 75).
		self.assertRaises(AttributeError, FuzzyMatcher, numeric_titles, 75)
예제 #4
0
	def test_empty_title(self):
		"""Constructing a matcher from an empty title column raises ValueError."""
		no_rows = pd.DataFrame({'title': []})
		# Callable form: assertRaises invokes FuzzyMatcher(no_rows, 75).
		self.assertRaises(ValueError, FuzzyMatcher, no_rows, 75)
예제 #5
0
def evaluate_match_references(evaluation_references, match_threshold,
                              length_threshold, sample_N):
    """Score the fuzzy matcher on a fixed random sample of references.

    ``2 * sample_N`` rows are sampled from ``evaluation_references``. The
    first ``sample_N`` are positive queries, whose expected answer is their
    own ``Reference id``. The remaining ``sample_N`` are negative queries:
    their ``uber_id`` values are removed from the matcher's pool and their
    expected answer is ``None``.

    Args:
        evaluation_references: DataFrame with at least ``title``,
            ``uber_id`` and ``Reference id`` columns.
        match_threshold: Forwarded to ``FuzzyMatcher`` as its second argument.
        length_threshold: Forwarded to ``FuzzyMatcher`` as its third argument.
        sample_N: Number of positive queries (and of negative queries).

    Returns:
        Whatever ``evaluate_metric(actual, predictions)`` returns.
    """
    # Fixed seed: the same rows are drawn on every call.
    sampled = evaluation_references.sample(
        n=sample_N * 2, random_state=0
    ).reset_index()
    # Expose the lowercase 'title' column under the name 'Title' as well.
    sampled['Title'] = sampled['title']

    positives = sampled.iloc[:sample_N]
    negatives = sampled.iloc[sample_N:]

    # Drop the negative queries from the pool the matcher searches.
    is_negative = evaluation_references['uber_id'].isin(negatives['uber_id'])
    searchable = evaluation_references.loc[~is_negative]

    fuzzy_matcher = FuzzyMatcher(searchable, match_threshold,
                                 length_threshold)

    # Positives first, then negatives -- `actual` below uses the same order.
    queries = positives.to_dict('records') + negatives.to_dict('records')
    predictions = predict_match_data(match_data=queries,
                                     matcher=fuzzy_matcher)

    expected_positive = positives['Reference id'].to_list()
    expected_negative = [None] * sample_N
    actual = expected_positive + expected_negative

    return evaluate_metric(actual, predictions)
예제 #6
0
	def init_fuzzy_matcher(self):
		"""Return a FuzzyMatcher over a two-publication fixture.

		The fixture holds the titles 'Malaria' and 'Zika' (uber_ids 1 and 2);
		the threshold comes from settings.FUZZYMATCH_SIMILARITY_THRESHOLD.
		"""
		fixture_columns = {
			'title': ['Malaria', 'Zika'],
			'uber_id': [1, 2],
		}
		fixture = pd.DataFrame(fixture_columns)
		configured_threshold = settings.FUZZYMATCH_SIMILARITY_THRESHOLD
		return FuzzyMatcher(fixture, configured_threshold)
예제 #7
0
	def test_init_variables(self):
		"""The constructor stores its inputs and builds a non-empty TF-IDF matrix."""
		publications = pd.DataFrame({'title': ['Malaria', 'Zika']})
		similarity_threshold = 0
		matcher = FuzzyMatcher(publications, similarity_threshold)

		# The stored frame must equal the one passed in.
		assert_frame_equal(matcher.publications, publications)
		# The threshold is kept as given.
		self.assertEqual(matcher.similarity_threshold, similarity_threshold)
		# A matrix was built from the titles.
		matrix_is_populated = matcher.tfidf_matrix.size != 0
		self.assertTrue(matrix_is_populated)
예제 #8
0
	def test_no_title(self):
		"""Constructing a matcher from a frame without a 'title' column raises KeyError."""
		columnless = pd.DataFrame({})
		# Callable form: assertRaises invokes FuzzyMatcher(columnless, 75).
		self.assertRaises(KeyError, FuzzyMatcher, columnless, 75)