class TestProbabilityEstimation(unittest.TestCase):
    """Tests for the boolean estimators exposed by ``probability_estimation``."""

    def setUp(self):
        """Create the toy texts, their hash dictionary, the bag-of-words corpus
        and the segmented topics shared by every test."""
        self.texts = [
            ["human", "interface", "computer"],
            ["eps", "user", "interface", "system"],
            ["system", "human", "system", "eps"],
            ["user", "response", "time"],
            ["trees"],
            ["graph", "trees"],
        ]
        self.dictionary = HashDictionary(self.texts)
        # Following is the mapping:
        # {'computer': 10608,
        #  'eps': 31049,
        #  'graph': 18451,
        #  'human': 31002,
        #  'interface': 12466,
        #  'response': 5232,
        #  'system': 5798,
        #  'time': 29104,
        #  'trees': 23844,
        #  'user': 12736}
        self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
        # Suppose the segmented topics from s_one_pre are:
        self.segmented_topics = [
            [(5798, 18451), (10608, 18451), (10608, 5798)],
            [(10608, 18451), (12736, 18451), (12736, 10608)],
        ]

    def testPBooleanDocument(self):
        """Test p_boolean_document()"""
        # Unique topic ids are 5798, 10608, 12736 and 18451
        # Only the first returned value is checked; the second is ignored.
        obtained, _ = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics)
        # Per the mapping in setUp, each id maps to the indices of the texts
        # containing that word (e.g. 10608 / 'computer' only occurs in text 0).
        expected = {18451: {5}, 12736: {1, 3}, 5798: {1, 2}, 10608: {0}}
        # assertEqual reports the differing entries on failure, unlike
        # assertTrue(obtained == expected).
        self.assertEqual(obtained, expected)

    def testPBooleanSlidingWindow(self):
        """Test p_boolean_sliding_window()"""
        # Test with window size as 2. window_id is zero indexed.
        obtained, _ = probability_estimation.p_boolean_sliding_window(
            self.texts, self.segmented_topics, self.dictionary, 2
        )
        expected = {10608: {1}, 12736: {8, 2, 3}, 18451: {11}, 5798: {4, 5, 6, 7}}
        self.assertEqual(obtained, expected)
class TestProbabilityEstimation(unittest.TestCase):
    """Checks p_boolean_document() and p_boolean_sliding_window() on a tiny corpus."""

    def setUp(self):
        """Prepare the texts, hash dictionary, corpus and segmented topics."""
        self.texts = [
            ['human', 'interface', 'computer'],
            ['eps', 'user', 'interface', 'system'],
            ['system', 'human', 'system', 'eps'],
            ['user', 'response', 'time'],
            ['trees'],
            ['graph', 'trees'],
        ]
        self.dictionary = HashDictionary(self.texts)
        # Following is the mapping:
        # {'computer': 10608,
        #  'eps': 31049,
        #  'graph': 18451,
        #  'human': 31002,
        #  'interface': 12466,
        #  'response': 5232,
        #  'system': 5798,
        #  'time': 29104,
        #  'trees': 23844,
        #  'user': 12736}
        self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
        # Suppose the segmented topics from s_one_pre are:
        self.segmented_topics = [
            [(5798, 18451), (10608, 18451), (10608, 5798)],
            [(10608, 18451), (12736, 18451), (12736, 10608)],
        ]

    def testPBooleanDocument(self):
        """Test p_boolean_document()"""
        # Unique topic ids are 5798, 10608, 12736 and 18451
        obtained, _ = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics)
        expected = {18451: {5}, 12736: {1, 3}, 5798: {1, 2}, 10608: {0}}
        # Use assertEqual so a failure shows which entries differ instead of
        # the bare "False is not true" produced by assertTrue(a == b).
        self.assertEqual(obtained, expected)

    def testPBooleanSlidingWindow(self):
        """Test p_boolean_sliding_window()"""
        # Test with window size as 2. window_id is zero indexed.
        obtained, _ = probability_estimation.p_boolean_sliding_window(
            self.texts, self.segmented_topics, self.dictionary, 2)
        expected = {10608: {1}, 12736: {8, 2, 3}, 18451: {11}, 5798: {4, 5, 6, 7}}
        self.assertEqual(obtained, expected)
# Build the hash dictionary from ``texts`` (defined outside this excerpt)
# and keep a handle on its token -> id mapping.
dictionary = HashDictionary(texts)
w2id = dictionary.token2id

# Each topic is written as (word, word) pairs and translated to id pairs.
segmented_topics = [
    [(w2id[first], w2id[second]) for first, second in word_pairs]
    for word_pairs in (
        [('system', 'graph'), ('computer', 'graph'), ('computer', 'system')],
        [('computer', 'graph'), ('user', 'graph'), ('user', 'computer')],
    )
]

# Bag-of-words representation of every text.
corpus = [dictionary.doc2bow(text) for text in texts]

# Small training set for the word2vec model used below.
sentences = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
]
model = word2vec.Word2Vec(sentences, size=100, min_count=1)

accumulator = probability_estimation.p_word2vec(
    texts, segmented_topics, dictionary, 2, 1, model
)

# NOTE(review): presumably a leftover breakpoint anchor - confirm and remove.
a = 1
# Script: tokenise a text file (one document per line), build a HashDictionary
# and bag-of-words corpus from it, and pickle the results to the working
# directory.
#
# Usage: <script> INPUT_FILE

# The setdefaultencoding hack only exists on Python 2; Python 3 has no
# reload() builtin and already uses UTF-8 as its default encoding, so the
# call is skipped there instead of failing with a NameError.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - Python 2 builtin
    sys.setdefaultencoding('utf8')

# Get the command line arguments
inputFile = sys.argv[1]

# One whitespace-split token list per input line; undecodable bytes are
# dropped because of errors='ignore'.
with codecs.open(inputFile, encoding='utf-8', mode='r', errors='ignore') as inptFile:
    texts = [line.split() for line in inptFile]

dictionary = HashDictionary(texts)
w2id = dictionary.token2id
corpus = [dictionary.doc2bow(doc) for doc in texts]

# Write each artefact through a context manager so the file is flushed and
# closed deterministically instead of relying on garbage collection.
with open("wiki_dictionary.p", "wb") as out:
    pickle.dump(dictionary, out)
with open("wiki_w2id.p", "wb") as out:
    pickle.dump(w2id, out)
with open("wiki_corpus.p", "wb") as out:
    pickle.dump(corpus, out)

# To read the artefacts back (only unpickle files you trust - pickle can
# execute arbitrary code on load):
#
# dic = pickle.load( open( "wiki_dictionary.p", "rb" ) )
# w2id = pickle.load( open( "wiki_w2id.p", "rb" ) )
# data = pickle.load( open( "wiki_corpus.p", "rb" ) )
#
# print w2id
# print data