예제 #1
0
    def testComplex(self):
        # Regression guard: the PMI table below is pinned for this sentence,
        # so any substantial drift in the computation breaks the test.  The
        # white-box counterpart lives in
        # tests.test_translationese.Utilities.test_pmi.
        analysis = Analysis(
            "Just because you're a Bad Guy doesn't mean you're a "
            "bad guy."
        )

        # Only two distinct PMI values occur among the expected bigrams.
        lower = 2.1439800628174073
        upper = 2.8371272433773522
        expected_pmi = {
            ('guy', '.'): lower,
            ("n't", 'mean'): upper,
            ('does', "n't"): upper,
            ('guy', 'does'): lower,
            ('a', 'bad'): lower,
            ('mean', 'you'): lower,
            ('bad', 'guy'): lower,
            ('just', 'because'): upper,
            ("'re", 'a'): lower,
            ('because', 'you'): lower,
            ('you', "'re"): lower,
        }
        self.assertDictEqual(expected_pmi, analysis.pmi())

        # The mean of the pinned table must match both the hard-coded figure
        # and the value reported by the average_pmi feature.
        mean_pmi = sum(expected_pmi.values()) / len(expected_pmi)
        self.assertAlmostEqual(2.33302, mean_pmi, places=5)
        reported = average_pmi.quantify(analysis)
        self.assertAlmostEqual(mean_pmi, reported["average_pmi"], places=5)
예제 #2
0
    def testComplex(self):
        # Regression check: fails as soon as the PMI figures move noticeably.
        # A white-box test of the same computation is available at
        # tests.test_translationese.Utilities.test_pmi.
        sentence = ("Just because you're a Bad Guy doesn't mean you're a "
                    "bad guy.")
        analysis = Analysis(sentence)

        # (bigram, PMI) pairs expected for the sentence above.
        bigram_scores = [
            (('guy', '.'), 2.1439800628174073),
            (("n't", 'mean'), 2.8371272433773522),
            (('does', "n't"), 2.8371272433773522),
            (('guy', 'does'), 2.1439800628174073),
            (('a', 'bad'), 2.1439800628174073),
            (('mean', 'you'), 2.1439800628174073),
            (('bad', 'guy'), 2.1439800628174073),
            (('just', 'because'), 2.8371272433773522),
            (("'re", 'a'), 2.1439800628174073),
            (('because', 'you'), 2.1439800628174073),
            (('you', "'re"), 2.1439800628174073),
        ]
        expected_pmi = dict(bigram_scores)
        self.assertDictEqual(expected_pmi, analysis.pmi())

        # Average of the expected table, checked against the literal figure
        # first and against the feature's own output second.
        average_expected_pmi = sum(expected_pmi.values()) / len(expected_pmi)
        self.assertAlmostEqual(2.33302, average_expected_pmi, places=5)

        feature_output = average_pmi.quantify(analysis)
        self.assertAlmostEqual(
            average_expected_pmi, feature_output["average_pmi"], places=5)
예제 #3
0
    def testMeanMultipleNaming(self):
        # Two unrelated sentences; each is expected to yield a
        # "mean_multiple_naming" value of 2.5.
        sentences = (
            "John Joseph Doe easily defeated Bob Robertson.",
            "Dr James Brown, specialist in physics, visited in "
            "Haifa University.",
        )
        for sentence in sentences:
            outcome = mean_multiple_naming.quantify(Analysis(sentence))
            self.assertAlmostEqual(2.5, outcome["mean_multiple_naming"])
예제 #4
0
 def testContractions(self):
     # The text has one "I'm" against two "I am", and one "let's" with no
     # expanded form; "didn't" does not occur at all.
     # NOTE(review): the 0.5 / 1.0 figures presumably relate contracted to
     # expanded occurrences -- confirm against contractions.quantify.
     analysis = Analysis("I'm certain he said I am later than I am, "
                         "let's go.")
     ratios = contractions.quantify(analysis)

     self.assertAlmostEqual(0.5, ratios["i'm"])
     self.assertAlmostEqual(1.0, ratios["let's"])
     self.assertEqual(0.0, ratios["didn't"])
    def testPositionalTokenFrequency(self):
        # Four short sentences; the expected counts are keyed by
        # "<position> <token>" with lower-cased tokens.
        text = """
        Humpty dumpty sat on a wall.
        Humpty dumpty had a great fall.
        Strange occurrence, quite frankly.
        What an egg.
        """

        wanted = {
            # Sentence openers shared by the first two sentences.
            "first humpty": 2,
            "second dumpty": 2,
            # Tail of "... sat on a wall."
            "antepenultimate on": 1,
            "penultimate a": 1,
            "last wall": 1,
            # Tail of "... had a great fall."
            "antepenultimate a": 1,
            "penultimate great": 1,
            "last fall": 1,
            # "Strange occurrence, quite frankly."
            "first strange": 1,
            "second occurrence": 1,
            "antepenultimate ,": 1,
            "penultimate quite": 1,
            "last frankly": 1,
        }

        observed = quantify(Analysis(text))
        self.assertSparseDictEqual(wanted, observed)
 def testVariant0(self):
     # Variant 0 of most_frequent_words on the fixture sentence: only 'the'
     # and 'and' are expected to carry non-zero weight.
     analysis = Analysis(self.sentence)
     observed = most_frequent_words.quantify_variant(analysis, 0)

     wanted = {'the': 2 / 11.0, 'and': 1 / 11.0}
     self.assertSparseDictEqual(wanted, observed)
예제 #7
0
    def testMeanWordRank(self):
        analysis = Analysis(
            "Have you not heard? The velociraptor is the word.")

        # Expected mean rank per variant.  Variant 1 differs from variant 0
        # only by leaving out the 6000 entry (and so divides by 8, not 9).
        expected = (
            sum([18, 30, 20, 372, 1, 6000, 8, 1, 250]) / 9.0,
            sum([18, 30, 20, 372, 1, 8, 1, 250]) / 8.0,
        )

        # Compute both variants before asserting on either of them.
        observed = [
            mean_word_rank.quantify_variant(analysis, variant)["mean_word_rank"]
            for variant in (0, 1)
        ]

        self.assertAlmostEqual(observed[0], expected[0])
        self.assertAlmostEqual(observed[1], expected[1])
    def testContextualFunctionWords(self):
        # Expected trigrams mix literal words with POS tags (VBP, JJ); each
        # one is expected with weight 1/14.
        sentence = ("He better be back, lest a thousand "
                    "similarly angry birds show up.")
        weight = 1 / 14.0

        wanted = {
            'he better be': weight,
            'VBP a thousand': weight,
            'better be back': weight,
            'a thousand similarly': weight,
            'thousand similarly JJ': weight,
        }

        observed = contextual_function_words.quantify(Analysis(sentence))
        self.assertSparseDictEqual(wanted, observed)
예제 #9
0
    def testUnigrams(self):
        # Variant 0: single-character frequencies, all over a common
        # denominator of 15.
        observed = quantify_variant(Analysis("Hello, a world!"), 0)

        total = 15.0
        wanted = {
            "a": 1 / total,
            "h": 1 / total,
            "e": 1 / total,
            "l": 3 / total,
            "o": 2 / total,
            "w": 1 / total,
            "r": 1 / total,
            "d": 1 / total,
        }

        self.assertSparseDictEqual(wanted, observed)
예제 #10
0
    def testRepetitions(self):
        text = """
        This is a very, very impressive thing. It is more impressive than
        anything I have ever seen. Very good. You have done a fine job.
        """

        # Words that repeat in the text:
        #   very       x 3
        #   impressive x 2
        #   is         x 2  (not counted)
        #   have       x 2  (not counted)
        # which gives 5 counted repetitions over 30 tokens.
        repeated = 5
        token_count = 30
        wanted = 3 * float(repeated) / token_count

        observed = repetitions.quantify(Analysis(text))
        self.assertAlmostEqual(wanted, observed["repetitions"])
예제 #11
0
    def testBigrams(self):
        # Variant 1: character bigrams, with '<' and '>' marking the start
        # and end of each word.  Every expected bigram has weight 1/15.
        observed = quantify_variant(Analysis("Hello, a world!"), 1)

        weight = 1 / 15.0
        wanted = {
            # "hello"
            "<h": weight,
            "he": weight,
            "el": weight,
            "ll": weight,
            "lo": weight,
            "o>": weight,
            # "a"
            "<a": weight,
            "a>": weight,
            # "world"
            "<w": weight,
            "wo": weight,
            "or": weight,
            "rl": weight,
            "ld": weight,
            "d>": weight,
        }

        self.assertSparseDictEqual(wanted, observed)
예제 #12
0
 def testSingleNaming(self):
     # Expected value is 3/14 for this text.
     # NOTE(review): presumably three single-token names (Jim, George, Bob)
     # over 14 tokens, with "John Doe" excluded -- confirm against
     # single_naming.quantify.
     text = "Jim, George and Bob are my friends. John Doe is not."
     outcome = single_naming.quantify(Analysis(text))
     self.assertAlmostEqual(3.0 / 14.0, outcome["single_naming"])
 def setUp(self):
     # Shared fixture: keep both the raw sentence and its Analysis around.
     text = "I came.  I saw. I conquered. I didn't elaborate."
     self.sentence = text
     self.analysis = Analysis(text)
예제 #14
0
 def setUp(self):
     # Shared fixture: one Analysis of a fixed sentence for every test.
     text = "We like dogs, and we dislike cats."
     self.analysis = Analysis(text)
예제 #15
0
 def testSimple(self):
     # In "a b a b b" the bigram ("b", "b") has negative PMI while the other
     # two bigrams are positive, so two bigrams are expected to count.
     sentence = "a b a b b"
     result = threshold_pmi.quantify(Analysis(sentence))
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(2 / 2000.0, result["threshold_pmi"])
예제 #16
0
 def testWithHyphen(self):
     # Neither the apostrophe in "I'm" nor the hyphen in "self-made" counts
     # as a letter, yet "self-made" is still a single word: 14 letters over
     # 4 words.
     # NOTE(review): no assertion is made here; presumably a hook outside
     # this method compares self.analysis with self.expected -- confirm.
     text = "I'm a self-made man."
     self.analysis = Analysis(text)
     self.expected = dict(mean_word_length=14.0 / 4.0)
예제 #17
0
 def testWithContraction(self):
     # The apostrophe in "Don't" is not counted as a letter: 36 letters over
     # 8 words.
     # NOTE(review): no assertion is made here; presumably a hook outside
     # this method compares self.analysis with self.expected -- confirm.
     text = "A little bit overcautious. Don't you think so?"
     self.analysis = Analysis(text)
     self.expected = dict(mean_word_length=36.0 / 8.0)
예제 #18
0
def getFeats():
    """Compute entropy and type/token ratios of cohesive markers per text.

    Steps, in order:

    1. Read the cohesive-marker table from
       ``../ckxx_arff3/ckxx_cohesive_markers:1.csv`` (first column is the
       row index).
    2. For every file under ``../ckxx_T`` and ``../ckxx_O_XIN`` whose name
       does not end in ``analysis``, build an ``Analysis``, attach a
       ``StanfordCoreNLP`` client pointed at ``http://localhost:9000``, load
       its cache and record ``len(case_tokens())`` under the file's base
       name.
    3. Multiply every marker column by the token count (the marker values
       are presumably per-token frequencies, so this yields raw counts --
       confirm against the producer of the CSV).
    4. Normalise each row to proportions and compute its base-2 entropy,
       then add ``TTR1``, ``TTR2`` and a ``class`` label (``'O'`` when the
       row index contains ``XIN``, ``'T'`` otherwise).
    5. Write ``ckxx_cohesive_markers_entropy_TTR.csv`` (with index, without
       ``class``) and ``ckxx_cohesive_markers_entropy_TTR_weka.csv``
       (without index, with ``class``) to the current directory.

    Intermediate frames are printed along the way.  Returns ``None``.
    """
    # get cohesive markers
    df = pd.read_csv('../ckxx_arff3/ckxx_cohesive_markers:1.csv', index_col=0)
    print(df.head())

    # get num of tokens for each text
    mynlp = StanfordCoreNLP('http://localhost', port=9000, lang='en')

    totalTokens = {}

    for fn in sorted(glob.glob('../ckxx_T/*') + glob.glob('../ckxx_O_XIN/*')):
        # Files ending in 'analysis' sit next to the texts but are not texts
        # themselves (presumably cached analyses -- confirm).
        if fn.endswith('analysis'):
            continue
        myAna1 = Analysis(filename=fn)
        myAna1.stanfordnlp = mynlp
        myAna1.loadcache()

        totalTokens[os.path.basename(myAna1.filename)] = len(
            myAna1.case_tokens())

    df2 = pd.DataFrame.from_dict(data=totalTokens, orient='index')
    df2.columns = ['totalTokens']
    df2.sort_index(axis=0, inplace=True)
    print(df2.head())

    # concat two df's; 'totalTokens' ends up as the last column
    df = pd.concat([df, df2], axis=1)

    # get raw counts
    mycols = list(df.columns)
    mycols.remove('totalTokens')
    print(mycols[-5:])
    df[mycols] = df[mycols].multiply(df['totalTokens'], axis="index")
    print(df.head())

    ##############
    # get entropy
    # Drop the last column (totalTokens).  The explicit .copy() makes this a
    # real copy: without it the column assignments below raise
    # SettingWithCopyWarning and may write through to `df`, which is read
    # again for the TTR columns.
    df_new = df.iloc[:, :-1].copy()
    df_new['numCohMkrs'] = df_new.sum(axis=1)
    print(df_new.head())

    # Turn counts into per-row proportions (every column but numCohMkrs).
    df_new.iloc[:, :-1] = df_new.iloc[:, :-1].div(df_new['numCohMkrs'], axis=0)
    print(df_new.head(10))

    # Zero proportions give 0 * log2(0) = NaN, which sum() skips by default,
    # so they contribute nothing to the entropy.
    df_new['entropy'] = \
        - df_new.iloc[:, :-1].multiply(
            np.log2(df_new.iloc[:, :-1])
        ).sum(axis=1)
    print(df_new.head(10))  # first row 3.217797 in the original author's run

    # sanity check first row entropy (skips numCohMkrs and entropy columns):
    ps = list(df_new.iloc[0, :-2])
    en = -sum([x * math.log2(x) for x in ps if x != 0])
    print(en)  # 3.217796811598595 in the original author's run

    # get TTR: type / token
    # number of non-zeros in each row of df: type
    # NOTE(review): df still contains 'totalTokens', so that column is
    # included in the non-zero count; the factor 6 is unexplained -- confirm
    # both are intended.
    df_new['TTR1'] = 6 * df.astype(bool).sum(axis=1) / df['totalTokens']
    df_new['TTR2'] = 6 * np.log(df.astype(bool).sum(axis=1)) / np.log(
        df['totalTokens'])

    df_new['class'] = 'T'
    df_new.loc[df.index.str.contains('XIN'), 'class'] = 'O'

    print(df_new.head(10))

    # save: the last five columns are numCohMkrs, entropy, TTR1, TTR2, class
    df_new.iloc[:, -5:-1].to_csv('ckxx_cohesive_markers_entropy_TTR.csv')
    df_new.iloc[:, -5:].to_csv('ckxx_cohesive_markers_entropy_TTR_weka.csv',
                               index=False)
예제 #19
0
 def testMultiContractions(self):
     # One "What's" against "what is" and "what has" later in the sentence;
     # the expected value for "what's" is 0.5.
     # NOTE(review): presumably "what's" is weighed against both expansions
     # -- confirm against contractions.quantify.
     analysis = Analysis("What's the difference between what is shown "
                         "now and what has been shown before?")
     ratios = contractions.quantify(analysis)
     self.assertAlmostEqual(0.5, ratios["what's"])
예제 #20
0
 def testExplicitNaming(self):
     # NOTE(review): the sentence has two pronouns (She, he) and three names
     # (John, Marie, Jim); the 2/3 factor presumably reflects that ratio --
     # confirm against explicit_naming.quantify.
     text = "She and he are better than John, Marie and Jim."
     outcome = explicit_naming.quantify(Analysis(text))
     wanted = 3.0 * (2.0 / 3.0)
     self.assertAlmostEqual(wanted, outcome["explicit_naming"])
예제 #21
0
 def testSyllableRatio(self):
     # Six words, with 'vowels' counted as two syllables: 7 syllables
     # in total.
     analysis = Analysis("A test for vowels and such")
     observed = syllable_ratio.quantify(analysis)["syllable_ratio"]
     self.assertAlmostEqual(7 / 6.0, observed)
예제 #22
0
 def testWouldNot(self):
     # One "wouldn't" against two "would not"; expected value is 0.5.
     text = "what's wouldn't would not would not"
     ratios = contractions.quantify(Analysis(text))
     self.assertAlmostEqual(0.5, ratios["wouldn't"])