def testComplex(self):
    """Regression guard: exact PMI values for one fixed sentence.

    If anything wildly varies, this will break.  A white-box test of
    the same math lives in tests.test_translationese.Utilities.test_pmi.
    """
    text = "Just because you're a Bad Guy doesn't mean you're a bad guy."
    analysis = Analysis(text)
    # Only two distinct PMI values occur in this sentence.
    hi = 2.8371272433773522
    lo = 2.1439800628174073
    want = {
        ('just', 'because'): hi,
        ('because', 'you'): lo,
        ('you', "'re"): lo,
        ("'re", 'a'): lo,
        ('a', 'bad'): lo,
        ('bad', 'guy'): lo,
        ('guy', 'does'): lo,
        ('does', "n't"): hi,
        ("n't", 'mean'): hi,
        ('mean', 'you'): lo,
        ('guy', '.'): lo,
    }
    self.assertDictEqual(want, analysis.pmi())
    mean_pmi = sum(want.values()) / len(want)
    self.assertAlmostEqual(2.33302, mean_pmi, 5)
    got = average_pmi.quantify(analysis)
    self.assertAlmostEqual(mean_pmi, got["average_pmi"], 5)
def testMeanMultipleNaming(self):
    """Both fixtures average 2.5 tokens per multi-token proper name."""
    fixtures = (
        "John Joseph Doe easily defeated Bob Robertson.",
        "Dr James Brown, specialist in physics, visited in Haifa University.",
    )
    for text in fixtures:
        got = mean_multiple_naming.quantify(Analysis(text))
        self.assertAlmostEqual(2.5, got["mean_multiple_naming"])
def testContractions(self):
    """Contraction ratios for a sentence mixing contracted and full forms."""
    analysis = Analysis("I'm certain he said I am later than I am, let's go.")
    got = contractions.quantify(analysis)
    self.assertAlmostEqual(0.5, got["i'm"])
    self.assertAlmostEqual(1.0, got["let's"])
    # A contraction absent from the text must score exactly zero.
    self.assertEqual(0.0, got["didn't"])
def testPositionalTokenFrequency(self):
    """Token counts keyed by sentence position over a four-sentence text."""
    corpus = """ Humpty dumpty sat on a wall. Humpty dumpty had a great fall. Strange occurrence, quite frankly. What an egg. """
    # Grouped by position rather than by sentence for readability.
    want = {
        "first humpty": 2,
        "first strange": 1,
        "second dumpty": 2,
        "second occurrence": 1,
        "antepenultimate on": 1,
        "antepenultimate a": 1,
        "antepenultimate ,": 1,
        "penultimate a": 1,
        "penultimate great": 1,
        "penultimate quite": 1,
        "last wall": 1,
        "last fall": 1,
        "last frankly": 1,
    }
    got = quantify(Analysis(corpus))
    self.assertSparseDictEqual(want, got)
def testVariant0(self):
    """Variant 0 reports normalized frequencies of the most frequent words."""
    got = most_frequent_words.quantify_variant(Analysis(self.sentence), 0)
    self.assertSparseDictEqual({'the': 2 / 11.0, 'and': 1 / 11.0}, got)
def testMeanWordRank(self):
    """Variant 1 drops the rank-6000 entry (presumably the unranked
    'velociraptor' — confirm against the feature's word list); variant 0
    keeps it."""
    analysis = Analysis("Have you not heard? The velociraptor is the word.")
    ranks = [18, 30, 20, 372, 1, 6000, 8, 1, 250]
    with_unknown = sum(ranks) / 9.0
    without_unknown = sum(r for r in ranks if r != 6000) / 8.0
    got_0 = mean_word_rank.quantify_variant(analysis, 0)["mean_word_rank"]
    got_1 = mean_word_rank.quantify_variant(analysis, 1)["mean_word_rank"]
    self.assertAlmostEqual(got_0, with_unknown)
    self.assertAlmostEqual(got_1, without_unknown)
def testContextualFunctionWords(self):
    """Each expected trigram occurs once out of 14 trigrams total."""
    analysis = Analysis(
        "He better be back, lest a thousand similarly angry birds show up.")
    trigrams = (
        'he better be',
        'better be back',
        'VBP a thousand',
        'a thousand similarly',
        'thousand similarly JJ',
    )
    want = {key: 1 / 14.0 for key in trigrams}
    got = contextual_function_words.quantify(analysis)
    self.assertSparseDictEqual(want, got)
def testUnigrams(self):
    """Variant 0: character unigram frequencies, normalized by 15."""
    got = quantify_variant(Analysis("Hello, a world!"), 0)
    counts = {"a": 1, "h": 1, "e": 1, "l": 3, "o": 2,
              "w": 1, "r": 1, "d": 1}
    want = {ch: c / 15.0 for ch, c in counts.items()}
    self.assertSparseDictEqual(want, got)
def testRepetitions(self):
    """Content-word repetitions, scaled by 3 over the token count."""
    corpus = """ This is a very, very impressive thing. It is more impressive than anything I have ever seen. Very good. You have done a fine job. """
    # Counted repetitions: "very" x3 and "impressive" x2 -> 5.
    # "is" x2 and "have" x2 also repeat but are excluded by the feature.
    repeats = 5
    tokens = 30
    want = 3 * float(repeats) / tokens
    got = repetitions.quantify(Analysis(corpus))
    self.assertAlmostEqual(want, got["repetitions"])
def testBigrams(self):
    """Variant 1: character bigrams (with <> word boundaries), each 1/15."""
    got = quantify_variant(Analysis("Hello, a world!"), 1)
    bigrams = ("<h", "he", "el", "ll", "lo", "o>",
               "<a", "a>",
               "<w", "wo", "or", "rl", "ld", "d>")
    want = {bg: 1 / 15.0 for bg in bigrams}
    self.assertSparseDictEqual(want, got)
def testSingleNaming(self):
    """Three names stand alone; "John Doe" is a multi-token name."""
    got = single_naming.quantify(
        Analysis("Jim, George and Bob are my friends. John Doe is not."))
    self.assertAlmostEqual(3.0 / 14.0, got["single_naming"])
def setUp(self):
    """Build the shared sentence fixture and its analysis."""
    text = "I came. I saw. I conquered. I didn't elaborate."
    self.sentence = text
    self.analysis = Analysis(text)
def setUp(self):
    """Analysis fixture shared by this class's tests."""
    fixture = "We like dogs, and we dislike cats."
    self.analysis = Analysis(fixture)
def testSimple(self):
    """Threshold PMI keeps only bigrams whose PMI clears the threshold.

    In "a b a b b", ("b", "b") has negative PMI; the other two bigram
    types are positive, giving 2 qualifying bigrams over the 2000.0
    normalizer.
    """
    sentence = "a b a b b"
    result = threshold_pmi.quantify(Analysis(sentence))
    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use the canonical assertEqual.
    self.assertEqual(2 / 2000.0, result["threshold_pmi"])
def testWithHyphen(self):
    """A hyphenated word counts once; apostrophes/hyphens add no length."""
    self.analysis = Analysis("I'm a self-made man.")
    # "Im"(2) + "a"(1) + "selfmade"(8) + "man"(3) = 14 letters / 4 words;
    # the apostrophe in "I'm" and the hyphen in "self-made" don't count,
    # but "self-made" is still a single word.
    self.expected = {"mean_word_length": 14.0 / 4.0}
def testWithContraction(self):
    """The apostrophe in a contraction contributes no length."""
    self.analysis = Analysis("A little bit overcautious. Don't you think so?")
    # 36 letters across 8 words; the apostrophe in "Don't" doesn't count.
    self.expected = {"mean_word_length": 36.0 / 8.0}
def getFeats():
    """Build entropy and type/token-ratio features over cohesive markers.

    Reads per-text cohesive-marker frequencies from a CSV, rescales them
    back to raw counts using each text's token total (obtained via a local
    Stanford CoreNLP server), then derives an entropy column and two TTR
    variants, and writes the results to two CSV files.

    NOTE(review): relies on hardcoded relative paths and a CoreNLP server
    on localhost:9000 — this is a one-off analysis script, not library code.
    """
    # get cohesive markers (rows = texts, columns = marker frequencies —
    # presumably normalized, since they are multiplied back by totalTokens
    # below; confirm against the file that produced the CSV)
    df = pd.read_csv('../ckxx_arff3/ckxx_cohesive_markers:1.csv',
                     index_col=0)
    # df = df.iloc[:20]
    print(df.head())
    # get num of tokens for each text
    mynlp = StanfordCoreNLP('http://localhost', port=9000, lang='en')
    totalTokens = {}
    for fn in sorted(glob.glob('../ckxx_T/*') + glob.glob('../ckxx_O_XIN/*')):
        # skip cached analysis side-files
        if fn.endswith('analysis'):
            continue
        # if fn == '../ckxx_O_XIN/XIN_0020': break
        myAna1 = Analysis(filename=fn)
        myAna1.stanfordnlp = mynlp
        myAna1.loadcache()
        # print(os.path.basename(myAna1.filename))
        # print(len(myAna1.case_tokens()))
        totalTokens[os.path.basename(myAna1.filename)] = len(
            myAna1.case_tokens())
    df2 = pd.DataFrame.from_dict(data=totalTokens, orient='index')
    df2.columns = ['totalTokens']
    # align with df, which pd.concat joins on the (sorted) index
    df2.sort_index(axis=0, inplace=True)
    print(df2.head())
    # concat two df's
    df = pd.concat([df, df2], axis=1)
    # get raw counts: undo the per-text normalization by multiplying each
    # marker column by that text's token total
    mycols = list(df.columns)
    mycols.remove('totalTokens')
    print(mycols[-5:])
    df[mycols] = df[mycols].multiply(df['totalTokens'], axis="index")
    print(df.head())
    ##############
    # get entropy
    # remove 'totalTokens'
    df_new = df.iloc[:, :-1]  # work on a copy, exclude last col = totalTokens
    # df_new.drop(['totalTokens'], axis=1, inplace=True)
    df_new['numCohMkrs'] = df_new.sum(axis=1)
    print(df_new.head())
    # convert counts to a per-text probability distribution over markers
    df_new.iloc[:, :-1] = df_new.iloc[:, :-1].div(df_new['numCohMkrs'],
                                                  axis=0)
    print(df_new.head(10))
    # Shannon entropy per row: -sum(p * log2(p))
    df_new['entropy'] = \
        - df_new.iloc[:, :-1].multiply(
            np.log2(df_new.iloc[:, :-1])
        ).sum(axis=1)
    print(df_new.head(10))
    # first row 3.217797
    # sanity check first row entropy (skip the derived cols at the end):
    ps = list(df_new.iloc[0, :-2])
    en = -sum([x * math.log2(x) for x in ps if x != 0])
    print(en)
    # 3.217796811598595
    # CORRECT!
    # get TTR: type / token
    # number of non-zeros in each row: type
    # NOTE(review): the 6* scaling appears to be a fixed weight — confirm
    # against the downstream consumer of these features.
    df_new['TTR1'] = 6 * df.astype(bool).sum(axis=1) / df['totalTokens']
    df_new['TTR2'] = 6 * np.log(df.astype(bool).sum(axis=1)) / np.log(
        df['totalTokens'])
    # class label: 'O' (original) for XIN texts, 'T' (translated) otherwise
    df_new['class'] = 'T'
    df_new.loc[df.index.str.contains('XIN'), 'class'] = 'O'
    print(df_new.head(10))
    # save: last four feature cols without the class label, and the same
    # plus 'class' for the Weka-bound file
    df_new.iloc[:, -5:-1].to_csv('ckxx_cohesive_markers_entropy_TTR.csv')
    df_new.iloc[:, -5:].to_csv('ckxx_cohesive_markers_entropy_TTR_weka.csv',
                               index=0)
def testMultiContractions(self):
    """A contraction with several possible expansions still scores."""
    analysis = Analysis("What's the difference between what is shown "
                        "now and what has been shown before?")
    got = contractions.quantify(analysis)
    self.assertAlmostEqual(0.5, got["what's"])
def testExplicitNaming(self):
    """Explicit naming score for two pronouns against three proper names."""
    got = explicit_naming.quantify(
        Analysis("She and he are better than John, Marie and Jim."))
    # presumably 2 pronouns / 3 names, times the feature's x3 scaling —
    # TODO confirm against the explicit_naming module
    want = 3.0 * (2.0 / 3.0)
    self.assertAlmostEqual(want, got["explicit_naming"])
def testSyllableRatio(self):
    """Syllables per word; 'vowels' counts as having two syllables."""
    analysis = Analysis("A test for vowels and such")
    got = syllable_ratio.quantify(analysis)["syllable_ratio"]
    self.assertAlmostEqual(7 / 6.0, got)
def testWouldNot(self):
    """One "wouldn't" against two expanded "would not" occurrences."""
    got = contractions.quantify(
        Analysis("what's wouldn't would not would not"))
    self.assertAlmostEqual(0.5, got["wouldn't"])