def test_doc_freq_and_token2id_for_several_docs_with_one_word(self):
    """Repeating a one-word document raises its document frequency, while
    token2id keeps exactly one entry for that word.

    The original spelled out the 2-, 3- and 4-document cases as three
    copy-pasted sections; a single loop checks the same invariant.
    """
    expected_token2id = {'human': 31002}
    for num_docs in (2, 3, 4):
        texts = [['human']] * num_docs
        d = HashDictionary(texts, myhash=zlib.adler32)
        # dfs counts every document containing the token
        self.assertEqual(d.dfs, {31002: num_docs})
        # only one token (human) should exist
        self.assertEqual(d.token2id['human'], expected_token2id['human'])
        self.assertEqual(d.token2id.keys(), expected_token2id.keys())
def test_saveAsText(self):
    """`HashDictionary` can be saved as textfile."""
    tmpf = get_tmpfile('dict_test.txt')
    # use some utf8 strings, to test encoding serialization
    documents = [
        'žloťoučký koníček'.split(),
        'Малйж обльйквюэ ат эжт'.split(),
    ]
    dictionary = HashDictionary(documents)
    dictionary.save_as_text(tmpf)
    self.assertTrue(os.path.exists(tmpf))
def testDocFreqAndToken2IdForSeveralDocsWithOneWord(self):
    """Repeating a one-word document raises its document frequency, while
    token2id keeps exactly one entry for that word.

    The original spelled out the 2-, 3- and 4-document cases as three
    copy-pasted sections; a single loop checks the same invariant.
    """
    expected_token2id = {'human': 31002}
    for num_docs in (2, 3, 4):
        texts = [['human']] * num_docs
        d = HashDictionary(texts, myhash=zlib.adler32)
        # dfs counts every document containing the token
        self.assertEqual(d.dfs, {31002: num_docs})
        # only one token (human) should exist
        self.assertEqual(d.token2id['human'], expected_token2id['human'])
        self.assertEqual(d.token2id.keys(), expected_token2id.keys())
def test_saveAsText(self):
    """`HashDictionary` can be saved as textfile."""
    tmpf = get_tmpfile('dict_test.txt')
    # use some utf8 strings, to test encoding serialization
    utf8_docs = ['žloťoučký koníček'.split(), 'Малйж обльйквюэ ат эжт'.split()]
    d = HashDictionary(utf8_docs)
    d.save_as_text(tmpf)
    # the text file must now exist on disk
    self.assertTrue(os.path.exists(tmpf))
def testRange(self):
    """`id_range` and a custom hash control how tokens collapse onto ids.

    Rewritten with set literals and dict comprehensions instead of
    ``set([...])`` calls and twelve-entry hand-written dicts; the expected
    values are unchanged.
    """
    all_tokens = {
        'minors', 'graph', 'system', 'trees', 'eps', 'computer',
        'survey', 'user', 'human', 'time', 'interface', 'response',
    }

    # all words map to the same id
    d = HashDictionary(self.texts, id_range=1, debug=True)
    self.assertEqual(d.dfs, {0: 9})
    self.assertEqual(d.id2token, {0: all_tokens})
    self.assertEqual(d.token2id, {token: 0 for token in all_tokens})

    # 2 ids: 0/1 for even/odd number of bytes in the word
    d = HashDictionary(self.texts, id_range=2, myhash=lambda key: len(key))
    even_tokens = {'minors', 'system', 'computer', 'survey', 'user', 'time', 'response'}
    odd_tokens = {'interface', 'graph', 'trees', 'eps', 'human'}
    self.assertEqual(d.dfs, {0: 7, 1: 7})
    self.assertEqual(d.id2token, {0: even_tokens, 1: odd_tokens})
    # id is len(token) % id_range, i.e. word-length parity
    self.assertEqual(d.token2id, {token: len(token) % 2 for token in all_tokens})
def test_saveAsTextBz2(self):
    """`HashDictionary` can be saved & loaded as compressed pickle."""
    tmpf = get_tmpfile('dict_test.txt.bz2')
    # use some utf8 strings, to test encoding serialization
    documents = [
        'žloťoučký koníček'.split(),
        'Малйж обльйквюэ ат эжт'.split(),
    ]
    original = HashDictionary(documents)
    original.save(tmpf)
    self.assertTrue(os.path.exists(tmpf))
    # round-trip: the restored dictionary has the same number of entries
    restored = original.load(tmpf)
    self.assertEqual(len(original), len(restored))
def test_saveAsTextBz2(self):
    """`HashDictionary` can be saved & loaded as compressed pickle."""
    tmpf = get_tmpfile('dict_test.txt.bz2')
    # use some utf8 strings, to test encoding serialization
    utf8_docs = ['žloťoučký koníček'.split(), 'Малйж обльйквюэ ат эжт'.split()]
    d = HashDictionary(utf8_docs)
    d.save(tmpf)
    self.assertTrue(os.path.exists(tmpf))
    # load back and compare sizes to confirm the round trip
    d2 = d.load(tmpf)
    self.assertEqual(len(d), len(d2))
def test_doc_freq_for_one_doc_with_several_word(self):
    """Each distinct word of a single document gets document frequency 1."""
    cases = [
        # two words
        (['human', 'cat'], {9273: 1, 31002: 1}),
        # three words
        (['human', 'cat', 'minors'], {9273: 1, 15001: 1, 31002: 1}),
    ]
    for words, expected_dfs in cases:
        d = HashDictionary([words], myhash=zlib.adler32)
        self.assertEqual(d.dfs, expected_dfs)
def testDocFreqForOneDocWithSeveralWord(self):
    """Each distinct word of a single document gets document frequency 1."""
    # two words
    two_word_doc = [['human', 'cat']]
    dictionary = HashDictionary(two_word_doc, myhash=zlib.adler32)
    self.assertEqual(dictionary.dfs, {9273: 1, 31002: 1})
    # three words
    three_word_doc = [['human', 'cat', 'minors']]
    dictionary = HashDictionary(three_word_doc, myhash=zlib.adler32)
    self.assertEqual(dictionary.dfs, {9273: 1, 15001: 1, 31002: 1})
def testDebugMode(self):
    """id2token is populated when debug=True and left empty otherwise."""
    corpus = [['human', 'cat']]
    # two words, debug on: reverse mapping is recorded
    with_debug = HashDictionary(corpus, debug=True, myhash=zlib.adler32)
    self.assertEqual(with_debug.id2token, {9273: {'cat'}, 31002: {'human'}})
    # now the same thing, with debug off: no reverse mapping at all
    without_debug = HashDictionary(corpus, debug=False, myhash=zlib.adler32)
    self.assertEqual(without_debug.id2token, {})
def testBuild(self):
    """Constructing from self.texts yields the known adler32 dfs and ids."""
    d = HashDictionary(self.texts, myhash=zlib.adler32)
    expected_dfs = {
        5232: 2, 5798: 3, 10608: 2, 12466: 2, 12736: 3, 15001: 2,
        18451: 3, 23844: 3, 28591: 2, 29104: 2, 31002: 2, 31049: 2,
    }
    self.assertEqual(d.dfs, expected_dfs)
    expected_ids = {
        'minors': 15001, 'graph': 18451, 'system': 5798, 'trees': 23844,
        'eps': 31049, 'computer': 10608, 'survey': 28591, 'user': 12736,
        'human': 31002, 'time': 29104, 'interface': 12466, 'response': 5232,
    }
    # every token must hash to its known adler32-derived id
    for token in expected_ids:
        self.assertEqual(d.token2id[token], expected_ids[token])
def setUp(self):
    """Build the shared texts/dictionary/corpus/segmented-topics fixtures."""
    self.texts = [
        ['human', 'interface', 'computer'],
        ['eps', 'user', 'interface', 'system'],
        ['system', 'human', 'system', 'eps'],
        ['user', 'response', 'time'],
        ['trees'],
        ['graph', 'trees'],
    ]
    self.dictionary = HashDictionary(self.texts)
    # Token ids produced by HashDictionary for these texts:
    # computer=10608, eps=31049, graph=18451, human=31002, interface=12466,
    # response=5232, system=5798, time=29104, trees=23844, user=12736
    self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
    # Suppose the segmented topics from s_one_pre are:
    self.segmented_topics = [
        [(5798, 18451), (10608, 18451), (10608, 5798)],
        [(10608, 18451), (12736, 18451), (12736, 10608)],
    ]
def testFilter(self):
    """filter_extremes prunes dfs according to no_below/no_above/keep_n."""
    # default thresholds empty out this small corpus entirely
    d = HashDictionary(self.texts, myhash=zlib.adler32)
    d.filter_extremes()
    self.assertEqual(d.dfs, {})

    # relax no_below, tighten no_above: only the df==2 tokens survive
    d = HashDictionary(self.texts, myhash=zlib.adler32)
    d.filter_extremes(no_below=0, no_above=0.3)
    surviving = {
        29104: 2, 31049: 2, 28591: 2, 5232: 2,
        10608: 2, 12466: 2, 15001: 2, 31002: 2,
    }
    self.assertEqual(d.dfs, surviving)

    # require df >= 3, cap the vocabulary at 4 tokens
    d = HashDictionary(self.texts, myhash=zlib.adler32)
    d.filter_extremes(no_below=3, no_above=1.0, keep_n=4)
    self.assertEqual(d.dfs, {5798: 3, 12736: 3, 18451: 3, 23844: 3})
def setUp(self):
    """Prepare texts, dictionary, bow corpus and segmented topics."""
    self.texts = [
        ['human', 'interface', 'computer'],
        ['eps', 'user', 'interface', 'system'],
        ['system', 'human', 'system', 'eps'],
        ['user', 'response', 'time'],
        ['trees'],
        ['graph', 'trees'],
    ]
    self.dictionary = HashDictionary(self.texts)
    # HashDictionary maps the tokens above to:
    # computer=10608, eps=31049, graph=18451, human=31002, interface=12466,
    # response=5232, system=5798, time=29104, trees=23844, user=12736
    self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
    # Suppose the segmented topics from s_one_pre are:
    self.segmented_topics = [
        [(5798, 18451), (10608, 18451), (10608, 5798)],
        [(10608, 18451), (12736, 18451), (12736, 10608)],
    ]
class TestProbabilityEstimation(unittest.TestCase):
    """Tests for topic_coherence.probability_estimation helpers."""

    def setUp(self):
        """Prepare texts, dictionary, bow corpus and segmented topics."""
        self.texts = [
            ['human', 'interface', 'computer'],
            ['eps', 'user', 'interface', 'system'],
            ['system', 'human', 'system', 'eps'],
            ['user', 'response', 'time'],
            ['trees'],
            ['graph', 'trees'],
        ]
        self.dictionary = HashDictionary(self.texts)
        # HashDictionary maps the tokens above to:
        # computer=10608, eps=31049, graph=18451, human=31002, interface=12466,
        # response=5232, system=5798, time=29104, trees=23844, user=12736
        self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
        # Suppose the segmented topics from s_one_pre are:
        self.segmented_topics = [
            [(5798, 18451), (10608, 18451), (10608, 5798)],
            [(10608, 18451), (12736, 18451), (12736, 10608)],
        ]

    def testPBooleanDocument(self):
        """Test p_boolean_document()"""
        # Unique topic ids are 5798, 10608, 12736 and 18451
        obtained, _ = probability_estimation.p_boolean_document(
            self.corpus, self.segmented_topics)
        expected = {18451: {5}, 12736: {1, 3}, 5798: {1, 2}, 10608: {0}}
        self.assertTrue(obtained == expected)

    def testPBooleanSlidingWindow(self):
        """Test p_boolean_sliding_window()"""
        # Test with window size as 2. window_id is zero indexed.
        obtained, _ = probability_estimation.p_boolean_sliding_window(
            self.texts, self.segmented_topics, self.dictionary, 2)
        expected = {10608: {1}, 12736: {8, 2, 3}, 18451: {11}, 5798: {4, 5, 6, 7}}
        self.assertTrue(obtained == expected)
class TestProbabilityEstimation(unittest.TestCase):
    """Tests for topic_coherence.probability_estimation helpers."""

    def setUp(self):
        """Build the shared texts/dictionary/corpus/segmented-topics fixtures."""
        self.texts = [
            ['human', 'interface', 'computer'],
            ['eps', 'user', 'interface', 'system'],
            ['system', 'human', 'system', 'eps'],
            ['user', 'response', 'time'],
            ['trees'],
            ['graph', 'trees'],
        ]
        self.dictionary = HashDictionary(self.texts)
        # Token ids produced by HashDictionary for these texts:
        # computer=10608, eps=31049, graph=18451, human=31002, interface=12466,
        # response=5232, system=5798, time=29104, trees=23844, user=12736
        self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
        # Suppose the segmented topics from s_one_pre are:
        self.segmented_topics = [
            [(5798, 18451), (10608, 18451), (10608, 5798)],
            [(10608, 18451), (12736, 18451), (12736, 10608)],
        ]

    def testPBooleanDocument(self):
        """Test p_boolean_document()"""
        # Unique topic ids are 5798, 10608, 12736 and 18451
        obtained, _ = probability_estimation.p_boolean_document(
            self.corpus, self.segmented_topics)
        expected = {18451: {5}, 12736: {1, 3}, 5798: {1, 2}, 10608: {0}}
        self.assertTrue(obtained == expected)

    def testPBooleanSlidingWindow(self):
        """Test p_boolean_sliding_window()"""
        # Test with window size as 2. window_id is zero indexed.
        obtained, _ = probability_estimation.p_boolean_sliding_window(
            self.texts, self.segmented_topics, self.dictionary, 2)
        expected = {10608: {1}, 12736: {8, 2, 3}, 18451: {11}, 5798: {4, 5, 6, 7}}
        self.assertTrue(obtained == expected)
from gensim.topic_coherence import probability_estimation
from gensim.corpora.hashdictionary import HashDictionary
from gensim.models import word2vec

# Small fixture corpus for probability estimation experiments.
texts = [
    ['human', 'interface', 'computer'],
    ['eps', 'user', 'interface', 'system'],
    ['system', 'human', 'system', 'eps'],
    ['user', 'response', 'time'],
    ['trees'],
    ['graph', 'trees'],
]

dictionary = HashDictionary(texts)
w2id = dictionary.token2id

# Segmented topics expressed as pairs of hashed token ids.
segmented_topics = [
    [
        (w2id['system'], w2id['graph']),
        (w2id['computer'], w2id['graph']),
        (w2id['computer'], w2id['system']),
    ],
    [
        (w2id['computer'], w2id['graph']),
        (w2id['user'], w2id['graph']),
        (w2id['user'], w2id['computer']),
    ],
]

# create corpus
corpus = [dictionary.doc2bow(text) for text in texts]
# HACK: Python 2-only global default-encoding override; reload(sys) is
# required to re-expose setdefaultencoding. Leave in place for legacy runs,
# but note this script will not run under Python 3 as written.
reload(sys)
sys.setdefaultencoding('utf8')

# Get the command line arguments
inputFile = sys.argv[1]

# One document per line: split each line into its tokens.
texts = []
with codecs.open(inputFile, encoding='utf-8', mode='r', errors='ignore') as inptFile:
    for line in inptFile:
        texts.append(line.split())

dictionary = HashDictionary(texts)
w2id = dictionary.token2id
corpus = [dictionary.doc2bow(doc) for doc in texts]

# Persist the artifacts. The original passed open(...) directly to
# pickle.dump and leaked the file handles; `with` closes them deterministically.
with open("wiki_dictionary.p", "wb") as fout:
    pickle.dump(dictionary, fout)
with open("wiki_w2id.p", "wb") as fout:
    pickle.dump(w2id, fout)
with open("wiki_corpus.p", "wb") as fout:
    pickle.dump(corpus, fout)

'''
dic = pickle.load( open( "wiki_dictionary.p", "rb" ) )
w2id = pickle.load( open( "wiki_w2id.p", "rb" ) )
data = pickle.load( open( "wiki_corpus.p", "rb" ) )
print w2id
print data
'''
def setup_dictionary(self):
    """Build the HashDictionary for this test case from ``self.texts``."""
    self.dictionary = HashDictionary(self.texts)
def testDocFreqOneDoc(self):
    """A single document gives each of its tokens a dfs count of 1."""
    document = ['human', 'interface', 'computer']
    dictionary = HashDictionary([document], myhash=zlib.adler32)
    self.assertEqual(dictionary.dfs, {10608: 1, 12466: 1, 31002: 1})
def testFilter(self):
    """filter_extremes prunes dfs according to no_below/no_above/keep_n."""
    # with default thresholds nothing survives in this tiny corpus
    dictionary = HashDictionary(self.texts, myhash=zlib.adler32)
    dictionary.filter_extremes()
    self.assertEqual(dictionary.dfs, {})

    # no_below=0, no_above=0.3: exactly the df==2 tokens remain
    dictionary = HashDictionary(self.texts, myhash=zlib.adler32)
    dictionary.filter_extremes(no_below=0, no_above=0.3)
    remaining = {
        29104: 2, 31049: 2, 28591: 2, 5232: 2,
        10608: 2, 12466: 2, 15001: 2, 31002: 2,
    }
    self.assertEqual(dictionary.dfs, remaining)

    # no_below=3 with keep_n=4: the four df==3 tokens remain
    dictionary = HashDictionary(self.texts, myhash=zlib.adler32)
    dictionary.filter_extremes(no_below=3, no_above=1.0, keep_n=4)
    self.assertEqual(dictionary.dfs, {5798: 3, 12736: 3, 18451: 3, 23844: 3})