def setUp(self):
    """Build the shared fixtures used by the dimension-update tests."""
    # Raw phrase counts for a single synthetic document.
    self.phraseVector = {'project': 1, 'cluster': 1, 'highdimensional': 1, 'streams': 1}
    # Two-way map between phrase text and its assigned dimension id.
    self.phraseTextAndDimensionMap = TwoWayMap()
    self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'project', 0)
    self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'cluster', 1)
    # 'abcd' is deliberately stale (3x the inactivity window) so pruning
    # behavior can be exercised against it.
    staleTime = test_time - 3 * stream_settings['max_phrase_inactivity_time_in_seconds']
    self.phraseTextToPhraseObjectMap = {
        'project': Phrase('project', test_time, score=8),
        'cluster': Phrase('cluster', test_time, score=8),
        'abcd': Phrase('abcd', staleTime, score=8),
    }
    self.vector = Vector({0: 1, 1: 1, 2: 1, 3: 1})
    # Remember the configured dimension count; tests shrink it to 2.
    self.initial_max_dimensions = stream_settings['dimensions']
    stream_settings['dimensions'] = 2
def test_pruningConditionRandom(self):
    """A phrase inactive beyond the allowed window is prunable; a fresh one is not."""
    staleTime = test_time - 3 * stream_settings['max_phrase_inactivity_time_in_seconds']
    stalePhrase = Phrase('dsf', staleTime, 1)
    freshPhrase = Phrase('dsf', test_time, 1)
    self.assertTrue(
        UtilityMethods.pruningConditionRandom(stalePhrase, test_time, **stream_settings))
    self.assertFalse(
        UtilityMethods.pruningConditionRandom(freshPhrase, test_time, **stream_settings))
class PhraseTests(unittest.TestCase):
    """Unit tests for Phrase score updates and ordering."""

    def setUp(self):
        self.phrase1 = Phrase('abc', test_time, score=8)
        self.phrase2 = Phrase('xyz', test_time, score=7)

    def test_updateScore(self):
        """Updating 120s later decays the score and records the occurrence time."""
        later = test_time + timedelta(seconds=120)
        self.phrase1.updateScore(later, 0, **stream_settings)
        self.assertEqual(2, self.phrase1.score)
        self.assertEqual(later, self.phrase1.latestOccuranceTime)

    def test_sort(self):
        """Default sort is ascending by score; reverse=True gives descending."""
        self.assertEqual([self.phrase2, self.phrase1],
                         Phrase.sort([self.phrase1, self.phrase2]))
        self.assertEqual([self.phrase1, self.phrase2],
                         Phrase.sort([self.phrase1, self.phrase2], reverse=True))
def test_pruneUnnecessaryPhrases(self):
    """Pruning drops stale phrases from the map and keeps recently-seen ones."""
    staleTime = test_time - 3 * stream_settings['max_phrase_inactivity_time_in_seconds']
    phraseMap = {
        'dsf': Phrase('dsf', staleTime, 1),
        'abc': Phrase('abc', test_time, 1),
    }
    UtilityMethods.pruneUnnecessaryPhrases(
        phraseMap, test_time, UtilityMethods.pruningConditionRandom, **stream_settings)
    self.assertTrue('dsf' not in phraseMap)
    self.assertTrue('abc' in phraseMap)
def dimensionsEstimation(estimationObject, currentMessageTime): ''' This class is used to dimensionsEstimation dimensions in the stream. To dimensionsEstimation it we calculate the number of phrases that need to added every iteration for different dimensions. The dimension at which the number of phrases added stablizes is the number of dimensions for the stream. Why do we need this? The aim is to get dimensions, that dont change too often at the same time are not very huge. This experiments gives us an approximate idea of the number of dimensions. Randomly picking a small value will result in dimensions that are not good and picking too big a value will result in inefficiency. ''' def updatePhraseScore(phraseObject): phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings) return phraseObject topDimensionsDuringCurrentIteration = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)] oldList, newList = estimationObject.topDimensionsDuringPreviousIteration, topDimensionsDuringCurrentIteration if estimationObject.topDimensionsDuringPreviousIteration: dimensions_estimation = {} for boundary in estimationObject.boundaries: if boundary < len(estimationObject.phraseTextToPhraseObjectMap): dimensions_estimation[str(boundary)] = len(set(newList[:boundary]).difference(oldList[:boundary])) print currentMessageTime, len(estimationObject.phraseTextToPhraseObjectMap) iterationData = { 'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime), 'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap), 'settings': estimationObject.stream_settings.convertToSerializableObject(), ParameterEstimation.dimensionsEstimationId:dimensions_estimation } FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsEstimationFile) estimationObject.topDimensionsDuringPreviousIteration = topDimensionsDuringCurrentIteration[:]
def dimensionsUpdateFrequencyEstimation(estimationObject, currentMessageTime): ''' Observe the new dimensions that get added to current dimension if the dimensions are being updated at regular intervals. For example, number of dimensions being added after 10m, 20m,... 5 horus. As time increases the number of 'decayed' dimensions increase. The current dimensions has a lot of unwanted decayed dimensions. Using this information identify the time interval that is best suited to refresh dimensions. Tentative: We decide to pick the time interval at which the rate of decay is maximum. ''' def updatePhraseScore(phraseObject): phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings) return phraseObject dimensions = estimationObject.stream_settings['dimensions'] newList = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)][:dimensions] print currentMessageTime, len(newList) if len(newList) >= dimensions: idsOfDimensionsListToCompare = [(i, GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i)) for i in estimationObject.dimensionUpdateTimeDeltas if GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i) in estimationObject.dimensionListsMap] dimensionsUpdateFrequency = {} for td, id in idsOfDimensionsListToCompare: oldList = estimationObject.dimensionListsMap[id] dimensionsUpdateFrequency[str(td.seconds)] = len(set(newList).difference(oldList)) print len(estimationObject.dimensionListsMap), currentMessageTime, len(newList), [(k, dimensionsUpdateFrequency[k]) for k in sorted(dimensionsUpdateFrequency)] iterationData = { 'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime), 'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap), 'settings': pprint.pformat(estimationObject.stream_settings), ParameterEstimation.dimensionsUpdateFrequencyId:dimensionsUpdateFrequency } 
FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsUpdateFrequencyFile) estimationObject.dimensionListsMap[GeneralMethods.approximateToNearest5Minutes(currentMessageTime)] = newList[:] for key in estimationObject.dimensionListsMap.keys()[:]: if currentMessageTime - key > estimationObject.dimensionUpdateTimeDeltas[-1]: del estimationObject.dimensionListsMap[key]
class PhraseTests(unittest.TestCase):
    """Tests covering Phrase score decay and sort order."""

    def setUp(self):
        self.phrase1, self.phrase2 = (Phrase('abc', test_time, score=8),
                                      Phrase('xyz', test_time, score=7))

    def test_updateScore(self):
        """A score update two minutes on decays the score and stamps the time."""
        updateTime = test_time + timedelta(seconds=120)
        self.phrase1.updateScore(updateTime, 0, **stream_settings)
        self.assertEqual(2, self.phrase1.score)
        self.assertEqual(updateTime, self.phrase1.latestOccuranceTime)

    def test_sort(self):
        """Ascending by default; descending when reverse=True."""
        self.assertEqual([self.phrase2, self.phrase1],
                         Phrase.sort([self.phrase1, self.phrase2]))
        self.assertEqual([self.phrase1, self.phrase2],
                         Phrase.sort([self.phrase1, self.phrase2], reverse=True))
def test_updateDimensions_when_phraseTextToIdMap_is_filled_to_max_dimensions(
        self):
    """At capacity (dimensions=2), a stronger new phrase takes over a slot."""
    self.phraseTextToPhraseObjectMap['added'] = Phrase('added', test_time, score=10)
    UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap,
                                    self.phraseTextToPhraseObjectMap,
                                    test_time, **stream_settings)
    self.assertEqual({'project': 0, 'added': 1},
                     self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))
def test_updateDimensions_when_phrases_with_lower_id_are_removed_from_phraseTextToIdMap(
        self):
    """Freed lower dimension ids are reused, so ids stay within range(dimensions)."""
    stream_settings['dimensions'] = 3
    for score, phrase in enumerate(['new', 'phrases', 'are'], 100):
        self.phraseTextToPhraseObjectMap[phrase] = Phrase(phrase, test_time, score=score)
    self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'cluster', 2)
    # Keep 'cluster' alive with a top score so it retains its dimension.
    self.phraseTextToPhraseObjectMap['cluster'].score = 100
    UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap,
                                    self.phraseTextToPhraseObjectMap,
                                    test_time, **stream_settings)
    assignedIds = sorted(
        self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD).values())
    self.assertEqual(range(3), assignedIds)
def test_updateDimensions_when_phraseTextToIdMap_has_lesser_than_max_dimensions(
        self):
    """With spare capacity, the highest-scoring phrases fill all four slots."""
    stream_settings['dimensions'] = 4
    for score, phrase in enumerate(['new', 'phrases', 'are', 'added'], 7):
        self.phraseTextToPhraseObjectMap[phrase] = Phrase(phrase, test_time, score=score)
    UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap,
                                    self.phraseTextToPhraseObjectMap,
                                    test_time, **stream_settings)
    # NOTE(review): set() over a dict yields its keys, so this compares key
    # membership only, not the dimension ids — presumably intentional.
    expected = set({'project': 0, 'phrases': 1, 'are': 2, 'added': 3})
    self.assertEqual(expected,
                     set(self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD)))
    self.assertEqual(4, len(self.phraseTextAndDimensionMap))
def test_updateDimensions_when_dimensions_have_to_be_removed(self):
    """Stale phrases lose their dimensions even when capacity is not exceeded."""
    stream_settings['dimensions'] = 4
    self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'abcdx', 2)
    self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'abcdxy', 3)
    self.phraseTextToPhraseObjectMap['new_text'] = Phrase('new_text', test_time, score=7)
    # Make 'cluster' stale (3x the inactivity window) so it gets dropped.
    staleTime = test_time - 3 * stream_settings['max_phrase_inactivity_time_in_seconds']
    self.phraseTextToPhraseObjectMap['cluster'].latestOccuranceTime = staleTime
    UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap,
                                    self.phraseTextToPhraseObjectMap,
                                    test_time, **stream_settings)
    # NOTE(review): set() over a dict yields its keys, so this compares key
    # membership only, not the dimension ids — presumably intentional.
    self.assertEqual(set({'project': 0, 'new_text': 1}),
                     set(self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD)))
def test_sort(self):
    """Default sort is ascending by score; reverse=True flips the order."""
    ascending = Phrase.sort([self.phrase1, self.phrase2])
    descending = Phrase.sort([self.phrase1, self.phrase2], reverse=True)
    self.assertEqual([self.phrase2, self.phrase1], ascending)
    self.assertEqual([self.phrase1, self.phrase2], descending)
def setUp(self):
    """Create two phrases with distinct scores for the ordering tests."""
    self.phrase1, self.phrase2 = (Phrase('abc', test_time, score=8),
                                  Phrase('xyz', test_time, score=7))