class TestProbabilityEstimation(unittest.TestCase):
    def setUp(self):
        """Build the toy corpus, its hashed dictionary and the segmented topics."""
        self.texts = [
            ["human", "interface", "computer"],
            ["eps", "user", "interface", "system"],
            ["system", "human", "system", "eps"],
            ["user", "response", "time"],
            ["trees"],
            ["graph", "trees"],
        ]
        self.dictionary = HashDictionary(self.texts)
        # HashDictionary assigns these hash ids to the tokens above:
        #   computer -> 10608, eps -> 31049, graph -> 18451, human -> 31002,
        #   interface -> 12466, response -> 5232, system -> 5798,
        #   time -> 29104, trees -> 23844, user -> 12736
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.texts]
        # Segmented topics as they would come out of s_one_pre:
        self.segmented_topics = [
            [(5798, 18451), (10608, 18451), (10608, 5798)],
            [(10608, 18451), (12736, 18451), (12736, 10608)],
        ]

    def testPBooleanDocument(self):
        """Test p_boolean_document()"""
        # The unique topic ids involved are 5798, 10608, 12736 and 18451.
        actual, _ = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics)
        reference = {18451: {5}, 12736: {1, 3}, 5798: {1, 2}, 10608: {0}}
        self.assertTrue(actual == reference)

    def testPBooleanSlidingWindow(self):
        """Test p_boolean_sliding_window()"""
        # Window size 2; window ids are zero indexed.
        actual, _ = probability_estimation.p_boolean_sliding_window(
            self.texts, self.segmented_topics, self.dictionary, 2
        )
        reference = {10608: {1}, 12736: {8, 2, 3}, 18451: {11}, 5798: {4, 5, 6, 7}}
        self.assertTrue(actual == reference)
class TestProbabilityEstimation(unittest.TestCase):
    def setUp(self):
        """Prepare the small document collection and derived fixtures."""
        documents = [
            ['human', 'interface', 'computer'],
            ['eps', 'user', 'interface', 'system'],
            ['system', 'human', 'system', 'eps'],
            ['user', 'response', 'time'],
            ['trees'],
            ['graph', 'trees'],
        ]
        self.texts = documents
        self.dictionary = HashDictionary(documents)
        # Token to hash-id mapping produced by HashDictionary:
        #   'computer': 10608, 'eps': 31049, 'graph': 18451, 'human': 31002,
        #   'interface': 12466, 'response': 5232, 'system': 5798,
        #   'time': 29104, 'trees': 23844, 'user': 12736
        self.corpus = [self.dictionary.doc2bow(document) for document in documents]
        # Segmented topics as produced by s_one_pre:
        first = [(5798, 18451), (10608, 18451), (10608, 5798)]
        second = [(10608, 18451), (12736, 18451), (12736, 10608)]
        self.segmented_topics = [first, second]

    def testPBooleanDocument(self):
        """Test p_boolean_document()"""
        # Unique topic ids here are 5798, 10608, 12736 and 18451.
        result, _ = probability_estimation.p_boolean_document(self.corpus, self.segmented_topics)
        wanted = {18451: set([5]), 12736: set([1, 3]), 5798: set([1, 2]), 10608: set([0])}
        self.assertTrue(result == wanted)

    def testPBooleanSlidingWindow(self):
        """Test p_boolean_sliding_window()"""
        # Sliding windows of size 2; window ids are zero indexed.
        result, _ = probability_estimation.p_boolean_sliding_window(
            self.texts, self.segmented_topics, self.dictionary, 2)
        wanted = {10608: set([1]), 12736: set([8, 2, 3]), 18451: set([11]), 5798: set([4, 5, 6, 7])}
        self.assertTrue(result == wanted)
# Example #3
# Build the hashed dictionary over the (externally defined) texts and look
# up token ids for the word pairs whose probabilities will be estimated.
dictionary = HashDictionary(texts)
w2id = dictionary.token2id

# Segmented topics: lists of (word_id, word_id) pairs.
segmented_topics = [
    [
        (w2id['system'], w2id['graph']),
        (w2id['computer'], w2id['graph']),
        (w2id['computer'], w2id['system']),
    ],
    [
        (w2id['computer'], w2id['graph']),
        (w2id['user'], w2id['graph']),
        (w2id['user'], w2id['computer']),
    ],
]

# Bag-of-words corpus for the same texts.
corpus = [dictionary.doc2bow(text) for text in texts]

# Small training set for the word2vec model used by p_word2vec.
sentences = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
]
model = word2vec.Word2Vec(sentences, size=100, min_count=1)

# window_size=2, context_window=1, using the trained model.
accumulator = probability_estimation.p_word2vec(texts, segmented_topics, dictionary, 2, 1, model)


a = 1
# Example #4
# Python 2 script: force UTF-8 as the process default encoding.
# NOTE(review): reload/setdefaultencoding is a known anti-pattern and does not
# exist on Python 3; kept only because the rest of this script is Python 2.
reload(sys)
sys.setdefaultencoding('utf8')

# Get the command line arguments: path of the input corpus file,
# one whitespace-tokenized document per line.
inputFile = sys.argv[1]

texts = []
with codecs.open(inputFile, encoding='utf-8', mode='r',
                 errors='ignore') as inptFile:
    for line in inptFile:
        # One document per line; line.split() replaces the original
        # word-by-word accumulation into a temporary `doc` list.
        texts.append(line.split())

# Hashed dictionary, token->id map and bag-of-words corpus.
dictionary = HashDictionary(texts)
w2id = dictionary.token2id
corpus = [dictionary.doc2bow(doc) for doc in texts]

# Persist the artifacts. Context managers close the handles; the original
# `pickle.dump(obj, open(path, "wb"))` form leaked three open files.
with open("wiki_dictionary.p", "wb") as out:
    pickle.dump(dictionary, out)
with open("wiki_w2id.p", "wb") as out:
    pickle.dump(w2id, out)
with open("wiki_corpus.p", "wb") as out:
    pickle.dump(corpus, out)
# Disabled snippet (Python 2 print syntax) showing how to reload the pickles:
'''
dic = pickle.load( open( "wiki_dictionary.p", "rb" ) )
w2id = pickle.load( open( "wiki_w2id.p", "rb" ) )
data = pickle.load( open( "wiki_corpus.p", "rb" ) )
print w2id
print data
'''