def cosine_test(train_text, test_text, num1, num2):
    """Score the TF-IDF similarity between two texts.

    Both texts are lower-cased and vectorised together by a fresh
    ``TfidfVectorizer``.  The raw score is entry ``[0, 1]`` of the product of
    the resulting two-row matrix with its own transpose, i.e. the dot product
    of the two document vectors.

    Args:
        train_text: Text of the training column.
        test_text: Text of the test column.
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        ``0.0`` when either text is empty/falsy, otherwise the value returned
        by ``balance_result(num1, num2, False, score)``.
    """
    if not train_text or not test_text:
        return 0.0

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([train_text.lower(), test_text.lower()])

    # ``.dot`` is a matrix product for every sparse container type (``*`` is
    # element-wise on sparse arrays), and ``.toarray()`` replaces the
    # deprecated ``.A`` shortcut.
    similarity = tfidf_matrix.dot(tfidf_matrix.T).toarray()[0, 1]
    return balance_result(num1, num2, False, similarity)
def abbr_test(train_examples, test_examples, num1, num2):
    """Score how well test abbreviations match the training examples.

    The mode is chosen by the type of ``test_examples``:

    * ``str`` (metadata / label mode): ``train_examples`` is matched as one
      value against every pattern from ``get_abbr_patterns(test_examples)``.
      Returns ``1`` if any pattern matches and ``0`` otherwise; this value is
      NOT passed through ``balance_result``.
    * anything else (value mode): both inputs are de-duplicated.  Every
      all-uppercase test example counts once if any of its patterns matches
      any training example.  The raw score is
      ``2 * matches / (len(train_set) + len(test_set))``.

    Args:
        train_examples: Training label (string mode) or iterable of values.
        test_examples: Test label (``str``) or iterable of string values.
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        ``0``/``1`` in string mode.  In value mode, ``0.0`` when either set
        holds more than 50 distinct values or both are empty, otherwise the
        result of ``balance_result(num1, num2, False, score)``.
    """
    if isinstance(test_examples, str):
        patterns = get_abbr_patterns(test_examples)
        return 1 if any(pattern.match(train_examples) for pattern in patterns) else 0

    train_example_set = set(train_examples)
    test_example_set = set(test_examples)

    # Large value sets are skipped outright (the pairwise matching below is
    # quadratic in the set sizes).
    if len(test_example_set) > 50 or len(train_example_set) > 50:
        return 0.0

    total = len(train_example_set) + len(test_example_set)
    if total == 0:
        # Nothing to compare; avoid dividing by zero below.
        return 0.0

    count_matches = 0
    for test_example in test_example_set:
        # Only all-uppercase values are treated as abbreviations.
        if not test_example.isupper():
            continue
        patterns = get_abbr_patterns(test_example)
        # Count each test example at most once, however many of its patterns
        # (or training values) match.
        if any(pattern.match(train_example)
               for pattern in patterns
               for train_example in train_example_set):
            count_matches += 1

    result = count_matches * 2.0 / total
    return balance_result(num1, num2, False, result)
def abbr_test(train_examples, test_examples, num1, num2):
    """Measure abbreviation overlap between training and test examples.

    If ``test_examples`` is a string, the metadata (label) variant runs:
    ``train_examples`` is matched as a single value against the patterns
    produced by ``get_abbr_patterns(test_examples)`` and the function returns
    ``1`` on the first match, ``0`` if none match (without calling
    ``balance_result``).

    Otherwise the value variant runs on the de-duplicated inputs: each
    all-uppercase test example contributes one match when at least one of its
    patterns matches at least one training example, and the raw score is
    ``2 * matches / (len(train_set) + len(test_set))``.

    Args:
        train_examples: Training label (string mode) or iterable of values.
        test_examples: Test label (``str``) or iterable of string values.
        num1: Passed through to ``balance_result``.
        num2: Passed through to ``balance_result``.

    Returns:
        ``0`` or ``1`` in string mode; in value mode ``0.0`` if either set
        has more than 50 distinct values or both sets are empty, else
        ``balance_result(num1, num2, False, score)``.
    """
    if isinstance(test_examples, str):
        for pattern in get_abbr_patterns(test_examples):
            if pattern.match(train_examples):
                return 1
        return 0

    train_example_set = set(train_examples)
    test_example_set = set(test_examples)

    if len(test_example_set) > 50 or len(train_example_set) > 50:
        return 0.0

    denominator = len(train_example_set) + len(test_example_set)
    if not denominator:
        # Both inputs empty: no evidence either way, and nothing to divide by.
        return 0.0

    count_matches = 0
    for test_example in test_example_set:
        # Only all-uppercase values are considered abbreviations.
        if not test_example.isupper():
            continue
        for pattern in get_abbr_patterns(test_example):
            if any(pattern.match(train_example)
                   for train_example in train_example_set):
                count_matches += 1
                # Stop at the first matching pattern so one test example is
                # never counted more than once.
                break

    result = count_matches * 2.0 / denominator
    return balance_result(num1, num2, False, result)
def cosine_test(train_text, test_text, num1, num2):
    """Compare two texts through their TF-IDF vectors.

    The lower-cased texts are fitted and transformed together by a new
    ``TfidfVectorizer``; the raw score is the ``[0, 1]`` entry of that
    two-row matrix multiplied by its transpose (the dot product of the two
    document vectors).

    Args:
        train_text: Training-side text.
        test_text: Test-side text.
        num1: Passed through to ``balance_result``.
        num2: Passed through to ``balance_result``.

    Returns:
        ``0.0`` if either text is empty/falsy; otherwise
        ``balance_result(num1, num2, False, score)``.
    """
    if not train_text or not test_text:
        return 0.0
    documents = [train_text.lower(), test_text.lower()]
    tfidf_matrix = TfidfVectorizer().fit_transform(documents)
    # Explicit matrix product plus ``toarray()`` instead of ``*`` and the
    # deprecated ``.A`` attribute, so the result does not depend on which
    # sparse container type the vectorizer returns.
    pairwise = tfidf_matrix.dot(tfidf_matrix.T).toarray()
    return balance_result(num1, num2, False, pairwise[0, 1])
def mann_whitney_u_test(train_examples, test_examples, num1, num2):
    """Score two samples with ``mannwhitneyu``.

    Element ``[1]`` of the ``mannwhitneyu`` result (presumably the p-value of
    ``scipy.stats.mannwhitneyu`` -- confirm against the file's imports) is
    passed to ``balance_result``.

    Args:
        train_examples: Training sample (sized sequence of numbers).
        test_examples: Test sample (sized sequence of numbers).
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        ``0`` when either sample has fewer than two values or when a
        ``ValueError`` is raised while scoring (it is logged and ignored);
        otherwise ``balance_result(num1, num2, True, result)``.
    """
    if len(train_examples) <= 1 or len(test_examples) <= 1:
        return 0
    try:
        result = mannwhitneyu(train_examples, test_examples)[1]
        return balance_result(num1, num2, True, result)
    except ValueError as e:
        # ``logging.warn`` is a deprecated alias that newer Python versions
        # no longer provide; ``logging.warning`` is the supported spelling.
        logging.warning("IGNORE EXCEPTION: %s", str(e))
        return 0
# Example #6
# 0
def mann_whitney_u_test(train_examples, test_examples, num1, num2):
    """Compare two samples using ``mannwhitneyu``.

    The value at index 1 of the ``mannwhitneyu`` result (presumably the
    p-value, assuming this is ``scipy.stats.mannwhitneyu`` -- TODO confirm)
    is handed to ``balance_result``.

    Args:
        train_examples: Training sample (sized sequence of numbers).
        test_examples: Test sample (sized sequence of numbers).
        num1: Passed through to ``balance_result``.
        num2: Passed through to ``balance_result``.

    Returns:
        ``0`` if either sample has at most one value, or if scoring raises
        ``ValueError`` (logged, then ignored); otherwise the value returned
        by ``balance_result(num1, num2, True, result)``.
    """
    enough_data = len(train_examples) > 1 and len(test_examples) > 1
    if not enough_data:
        return 0
    try:
        result = mannwhitneyu(train_examples, test_examples)[1]
        return balance_result(num1, num2, True, result)
    except ValueError as e:
        # Use ``logging.warning``: the ``logging.warn`` alias is deprecated
        # and absent from recent Python releases.
        logging.warning("IGNORE EXCEPTION: %s", str(e))
        return 0
# Example #7
# 0
def coverage_test(train_examples, test_examples, num1, num2):
    """Score the overlap of the two samples' interquartile ranges.

    Each sample is reduced to its [25th, 75th] percentile interval.  The raw
    score is the length of the intersection of the two intervals divided by
    the length of their combined span.

    Args:
        train_examples: Training sample (sized sequence of numbers).
        test_examples: Test sample (sized sequence of numbers).
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        ``0`` when either sample has fewer than two values, when the
        intervals do not overlap, or when the combined span has zero length;
        otherwise ``balance_result(num1, num2, True, ratio)``.
    """
    if len(train_examples) <= 1 or len(test_examples) <= 1:
        return 0

    train_hi = percentile(train_examples, 75)
    train_lo = percentile(train_examples, 25)
    test_hi = percentile(test_examples, 75)
    test_lo = percentile(test_examples, 25)

    # Disjoint intervals share no coverage at all.
    if test_lo > train_hi or train_lo > test_hi:
        return 0

    span_hi = max(train_hi, test_hi)
    span_lo = min(train_lo, test_lo)
    # Degenerate case: both intervals collapse onto one point.
    if span_hi == span_lo:
        return 0

    overlap_hi = min(train_hi, test_hi)
    overlap_lo = max(train_lo, test_lo)
    ratio = (overlap_hi - overlap_lo) * 1.0 / (span_hi - span_lo)
    return balance_result(num1, num2, True, ratio)
def coverage_test(train_examples, test_examples, num1, num2):
    """Score the overlap of the two samples' full value ranges.

    Each sample is reduced to its [0th, 100th] percentile interval.  The raw
    score is the length of the intersection of the two intervals divided by
    the length of their combined span.

    Args:
        train_examples: Training sample (sized sequence of numbers).
        test_examples: Test sample (sized sequence of numbers).
        num1: Passed through to ``balance_result``.
        num2: Passed through to ``balance_result``.

    Returns:
        ``0`` if either sample has at most one value, if the ranges are
        disjoint, or if the combined span is a single point; otherwise
        ``balance_result(num1, num2, True, ratio)``.
    """
    has_enough = len(train_examples) > 1 and len(test_examples) > 1
    if not has_enough:
        return 0

    train_top = percentile(train_examples, 100)
    train_bottom = percentile(train_examples, 0)
    test_top = percentile(test_examples, 100)
    test_bottom = percentile(test_examples, 0)

    if test_bottom > train_top or train_bottom > test_top:
        # The ranges do not touch.
        return 0

    union_top = max(train_top, test_top)
    union_bottom = min(train_bottom, test_bottom)
    if union_top == union_bottom:
        # Zero-length span: the ratio below would divide by zero.
        return 0

    shared_top = min(train_top, test_top)
    shared_bottom = max(train_bottom, test_bottom)
    ratio = (shared_top - shared_bottom) * 1.0 / (union_top - union_bottom)
    return balance_result(num1, num2, True, ratio)
def welch_test(train_examples, test_examples, num1, num2):
    """Score two samples with Welch's unequal-variance t-test.

    Element ``[1]`` of the ``ttest_ind`` result (presumably the p-value of
    ``scipy.stats.ttest_ind`` -- confirm against the file's imports) is
    passed to ``balance_result``.

    Args:
        train_examples: Training sample (sized sequence of numbers).
        test_examples: Test sample (sized sequence of numbers).
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        ``0`` when either sample has fewer than two values, otherwise
        ``balance_result(num1, num2, True, result)``.
    """
    if len(train_examples) > 1 and len(test_examples) > 1:
        # ``equal_var`` must be given by keyword: the third positional
        # parameter of ``ttest_ind`` is ``axis``, so a bare ``False`` left
        # the test in its default equal-variance (Student) mode.
        result = ttest_ind(train_examples, test_examples, equal_var=False)[1]
        return balance_result(num1, num2, True, result)
    return 0
def kolmogorov_smirnov_test(train_examples, test_examples, num1, num2):
    """Score two samples with ``ks_2samp``.

    Element ``[1]`` of the ``ks_2samp`` result (presumably the p-value of
    ``scipy.stats.ks_2samp`` -- confirm against the file's imports) is passed
    to ``balance_result``.

    Args:
        train_examples: Training sample (sized sequence of numbers).
        test_examples: Test sample (sized sequence of numbers).
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        ``0`` when either sample has fewer than two values, otherwise
        ``balance_result(num1, num2, True, score)``.
    """
    if len(train_examples) <= 1 or len(test_examples) <= 1:
        return 0
    score = ks_2samp(train_examples, test_examples)[1]
    return balance_result(num1, num2, True, score)
def jaccard_test(train_examples, test_examples, num1, num2):
    """Score two example collections with ``jaccard_similarity``.

    Args:
        train_examples: Training examples, passed to ``jaccard_similarity``.
        test_examples: Test examples, passed to ``jaccard_similarity``.
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        The value of ``balance_result(num1, num2, False, similarity)``.
    """
    return balance_result(
        num1,
        num2,
        False,
        jaccard_similarity(train_examples, test_examples),
    )
# Example #12
# 0
def welch_test(train_examples, test_examples, num1, num2):
    """Compare two samples using Welch's t-test (unequal variances).

    The value at index 1 of the ``ttest_ind`` result (presumably the
    p-value, assuming this is ``scipy.stats.ttest_ind`` -- TODO confirm) is
    handed to ``balance_result``.

    Args:
        train_examples: Training sample (sized sequence of numbers).
        test_examples: Test sample (sized sequence of numbers).
        num1: Passed through to ``balance_result``.
        num2: Passed through to ``balance_result``.

    Returns:
        ``0`` if either sample has at most one value, otherwise the value
        returned by ``balance_result(num1, num2, True, result)``.
    """
    if len(train_examples) <= 1 or len(test_examples) <= 1:
        return 0
    # Pass ``equal_var`` by keyword; positionally, the third argument of
    # ``ttest_ind`` is ``axis``, which silently kept the equal-variance test.
    result = ttest_ind(train_examples, test_examples, equal_var=False)[1]
    return balance_result(num1, num2, True, result)
# Example #13
# 0
def kolmogorov_smirnov_test(train_examples, test_examples, num1, num2):
    """Compare two samples using ``ks_2samp``.

    The value at index 1 of the ``ks_2samp`` result (presumably the p-value,
    assuming this is ``scipy.stats.ks_2samp`` -- TODO confirm) is handed to
    ``balance_result``.

    Args:
        train_examples: Training sample (sized sequence of numbers).
        test_examples: Test sample (sized sequence of numbers).
        num1: Passed through to ``balance_result``.
        num2: Passed through to ``balance_result``.

    Returns:
        ``0`` if either sample has at most one value, otherwise the value
        returned by ``balance_result(num1, num2, True, outcome)``.
    """
    comparable = len(train_examples) > 1 and len(test_examples) > 1
    if not comparable:
        return 0
    outcome = ks_2samp(train_examples, test_examples)[1]
    return balance_result(num1, num2, True, outcome)
# Example #14
# 0
def mann_whitney_u_test(train_examples, test_examples, num1, num2):
    """Score two samples with ``mannwhitneyu`` (no exception handling).

    Element ``[1]`` of the ``mannwhitneyu`` result (presumably the p-value of
    ``scipy.stats.mannwhitneyu`` -- confirm against the file's imports) is
    passed to ``balance_result``.  Any exception raised while scoring
    propagates to the caller.

    Args:
        train_examples: Training sample (sized sequence of numbers).
        test_examples: Test sample (sized sequence of numbers).
        num1: Forwarded unchanged to ``balance_result``.
        num2: Forwarded unchanged to ``balance_result``.

    Returns:
        ``0`` when either sample has fewer than two values, otherwise
        ``balance_result(num1, num2, True, score)``.
    """
    if len(train_examples) <= 1 or len(test_examples) <= 1:
        return 0
    score = mannwhitneyu(train_examples, test_examples)[1]
    return balance_result(num1, num2, True, score)
def jaccard_test(train_examples, test_examples, num1, num2):
    """Compare two example collections using ``jaccard_similarity``.

    Args:
        train_examples: Training examples, handed to ``jaccard_similarity``.
        test_examples: Test examples, handed to ``jaccard_similarity``.
        num1: Passed through to ``balance_result``.
        num2: Passed through to ``balance_result``.

    Returns:
        Whatever ``balance_result(num1, num2, False, overlap)`` returns.
    """
    overlap = jaccard_similarity(train_examples, test_examples)
    balanced = balance_result(num1, num2, False, overlap)
    return balanced