def __init__(self, op, lea1, nTimes=2):
    ''' runs the base Lea initialization, then records the given op,
        lea1 and nTimes arguments on the instance;
        raises Lea.Error if nTimes is lower than or equal to zero
        (the check is made after the attributes have been assigned)
    '''
    Lea.__init__(self)
    (self._op, self._lea1, self._nTimes) = (op, lea1, nTimes)
    if nTimes <= 0:
        raise Lea.Error("times method requires a strictly positive integer")
def exact():
    """
    Iterate W = Lea.fastMax(W + U, 0) for k = 1..20, starting from
    Lea.fastMax(W0 + U, 0), and plot W.support() against W.pmf() for every
    k divisible by 5.

    Reads the module-level names W0, U, plt and Lea.

    :return: the tuple (W.support(), W.pmf()) of the last computed W
    """
    # starting point, before the first loop iteration
    W = Lea.fastMax(W0 + U, 0)
    for k in range(1, 21):
        if k % 5 == 0:
            # one curve per plotted step, labelled with the step number
            plt.plot(W.support(), W.pmf(), label="k={}".format(k))
        # NOTE(review): update is applied on every iteration (not only the plotted ones) - confirm nesting
        W = Lea.fastMax(W + U, 0)
    return W.support(), W.pmf()
def __init__(self,lea1,nTimes=2):
    ''' runs the base Lea initialization, then records lea1, a mapped
        version of lea1 where each value v is wrapped in the 1-tuple (v,),
        and nTimes;
        raises Lea.Error if nTimes is lower than or equal to zero
        (the check is made after the attributes have been assigned)
    '''
    Lea.__init__(self)
    wrapInTuple = lambda v: (v,)
    self._lea1 = lea1
    self._lea1Tuple = lea1.map(wrapInTuple)
    self._nTimes = nTimes
    if nTimes <= 0:
        raise Lea.Error("cprodTimes method requires a strictly positive integer")
def build(*clauses,**kwargs):
    ''' builds and returns a Blea instance from the given clauses;
        each clause is a tuple (cond,result); a clause whose cond is None
        is the 'other' (else) clause, whose result applies when none of the
        other conditions holds;
        the optional keyword argument priorLea gives prior probabilities
        from which the result of the 'other' clause is derived;
        raises Lea.Error if
        - more than one 'other' clause is given,
        - both priorLea and an 'other' clause are given,
        - two clause conditions can be true together,
        - priorLea is given although the conditions already cover all cases,
        - priorLea requires a probability outside the feasible range,
        - the conditions do not cover all cases and neither an 'other'
          clause nor priorLea is given
    '''
    priorLea = kwargs.get('priorLea',None)
    # TODO: check no other args !!
    # PY3: def build(*clauses,priorLea=None):
    # collect the results of the 'other' clauses (those having None as condition)
    elseClauseResults = tuple(result for (cond,result) in clauses if cond is None)
    if len(elseClauseResults) > 1:
        raise Lea.Error("impossible to define more than one 'other' clause")
    if len(elseClauseResults) == 1:
        if priorLea is not None:
            raise Lea.Error("impossible to define together prior probabilities and 'other' clause")
        elseClauseResult = elseClauseResults[0]
    else:
        elseClauseResult = None
    # normal clauses: both condition and result are coerced to Lea instances
    normClauseLeas = tuple((Lea.coerce(cond),Lea.coerce(result)) for (cond,result) in clauses if cond is not None)
    condLeas = tuple(condLea for (condLea,resultLea) in normClauseLeas)
    # check that conditions are disjoint
    for (condLea1,condLea2) in genPairs(condLeas):
        if (condLea1&condLea2).isFeasible():
            raise Lea.Error("clause conditions are not disjoint")
    # build the OR of all given conditions
    orCondsLea = Lea.reduce(or_,condLeas)
    isClauseSetComplete = orCondsLea.isTrue()
    if priorLea is not None:
        # prior distribution: determine elseClauseResult
        if isClauseSetComplete:
            # TODO check priorLea equivalent to self
            raise Lea.Error("forbidden to define prior probabilities for complete clause set")
        # pTrue/count is the probability that one of the given conditions is true
        (pTrue,count) = orCondsLea._p(True)
        pFalse = count - pTrue
        priorAleaDict = dict(priorLea.getAlea().genVPs())
        priorAleaCount = sum(priorAleaDict.values())
        normAleaDict = dict(Mlea(*(resultLea for (condLea,resultLea) in normClauseLeas)).getAlea().genVPs())
        normAleaCount = sum(normAleaDict.values())
        valuesSet = frozenset(chain(priorAleaDict.keys(),normAleaDict.keys()))
        vps = []
        for value in valuesSet:
            priorP = priorAleaDict.get(value,0)
            condP = normAleaDict.get(value,0)
            # weight of value in the 'other' clause, kept as an integer by
            # cross-multiplying with the denominators instead of dividing
            p = priorP*count*normAleaCount - condP*pTrue*priorAleaCount
            if not(0 <= p <= pFalse*normAleaCount*priorAleaCount):
                # Infeasible : probability represented by p goes outside range from 0 to 1
                priorPFraction = ProbFraction(priorP,priorAleaCount)
                lowerPFraction = ProbFraction(condP*pTrue,count*normAleaCount)
                upperPFraction = ProbFraction(condP*pTrue+pFalse*normAleaCount,count*normAleaCount)
                raise Lea.Error("prior probability of '%s' is %s, outside the range [ %s , %s ]"%(value,priorPFraction,lowerPFraction,upperPFraction))
            vps.append((value,p))
        elseClauseResult = Lea.fromValFreqs(*vps)
    elif elseClauseResult is None:
        # check that clause set is complete
        if not isClauseSetComplete:
            # TODO? : assume a uniform prior distribution ? ... which values ?
            raise Lea.Error("incomplete clause set requires 'other' clause or prior probabilities")
    if elseClauseResult is not None:
        # the 'other' clause is handled as a normal clause conditioned by the
        # negation of the OR of all given conditions
        elseCondLea = ~orCondsLea
        normClauseLeas += ((elseCondLea,Lea.coerce(elseClauseResult)),)
        # note that orCondsLea is NOT extended with orCondsLea |= elseCondLea
        # so, in case of else clause (and only in this case), orCondsLea is NOT certainly true
    return Blea(*(Ilea(resultLea,condLea) for (condLea,resultLea) in normClauseLeas))
def __init__(self,*iLeas):
    ''' runs the base Lea initialization, stores the given iLeas as a
        tuple and builds a Clea instance from the Alea leaves of these
        iLeas that have a _count greater than 1
    '''
    Lea.__init__(self)
    self._iLeas = tuple(iLeas)
    # the context Clea built below matters only when some clauses miss
    # variables that are present in other clauses (e.g. CPT with
    # context-specific independence): a rebalancing is then needed if
    # these missing variables admit multiple values (total probability
    # weight > 1)
    multiValuedLeaves = frozenset(leaf
                                  for clauseLea in iLeas
                                  for leaf in clauseLea.getAleaLeavesSet()
                                  if leaf._count > 1)
    self._ctxClea = Clea(*multiValuedLeaves)
def __init__(self,nextStateLeaPerState):
    ''' sets up the attributes of the Chain instance;
        nextStateLeaPerState is a sequence of (stateObj,nextStateLea)
        tuples: stateObj is a state object (e.g. a string) and
        nextStateLea is a Lea instance giving the probabilities of
        transition from stateObj to each state object
    '''
    object.__init__(self)
    self._stateObjs = tuple(stateObj for (stateObj,_) in nextStateLeaPerState)
    # one StateAlea per declared state
    self._stateAleaDict = {stateObj: StateAlea(Lea.coerce(stateObj),self)
                           for stateObj in self._stateObjs}
    # StateAlea built from all declared states, reused in the clause conditions below
    self._state = StateAlea(Lea.fromVals(*self._stateObjs),self)
    transitionClauses = [(self._state==stateObj,nextStateLea)
                         for (stateObj,nextStateLea) in nextStateLeaPerState]
    self._nextStateBlea = Blea.build(*transitionClauses)
def assign_realistic_ttls(bot_configs: dict):
    """
    Assigns a realistic TTL to each bot in bot_configs, in ascending order of bot id.

    Local bots get the fixed TTL 128. For every other bot the TTL is drawn from the TTL distribution that the
    statistics report for the bot's IP; if that distribution is empty, the most used TTL value of the dataset is
    taken instead (one of them, chosen at random, if the query returns a list).

    NOTE(review): `self`, `Lea` and `choice` are not parameters; they are resolved from the enclosing scope -
    presumably this function is nested in a method. Confirm against the surrounding code.

    :param bot_configs: dict mapping a bot id to its configuration dict; the keys "Type" and "IP" are read and
                        the key "TTL" is written in place.
    """
    for bot in sorted(bot_configs):
        bot_config = bot_configs[bot]

        if bot_config["Type"] == "local":
            # Set fixed TTL for local bots
            bot_config["TTL"] = 128
            continue

        # Set varying TTL for external bots, based on the TTL distribution of the IP address
        bot_ttl_dist = self.statistics.get_ttl_distribution(bot_config["IP"])
        if len(bot_ttl_dist) > 0:
            source_ttl_prob_dict = Lea.fromValFreqsDict(bot_ttl_dist)
            bot_config["TTL"] = source_ttl_prob_dict.random()
            continue

        # No distribution known for this IP: query the most used TTL once and reuse the result
        most_used_ttl = self.statistics.process_db_query("most_used(ttlValue)")
        if isinstance(most_used_ttl, list):
            bot_config["TTL"] = choice(most_used_ttl)
        else:
            bot_config["TTL"] = most_used_ttl
def markov(corpus, n_seq=1, start=None, length=42):
    """
    Learn word-to-word transition frequencies from corpus and print n_seq
    random sequences generated from them.

    Every non-blank sentence contributes a transition from BOS_TOKEN to its
    first word, one transition per pair of consecutive words and a
    transition from its last word to EOS_TOKEN. Blank sentences (no word
    after splitting on whitespace) are ignored.

    :param corpus: iterable of sentences (strings split on whitespace)
    :param n_seq: number of sequences to print
    :param start: first state of every sequence; BOS_TOKEN if None
    :param length: maximum number of states per sequence, markers included
    """
    # Counting occurrences
    next_one = defaultdict(Counter)
    next_one[EOS_TOKEN][EOS_TOKEN] = 1  # Last state is absorbing
    for sentence in corpus:
        words = sentence.split()
        if not words:
            # a blank sentence has no first word: nothing to count
            continue
        next_one[BOS_TOKEN][words[0]] += 1
        for current, following in zip(words, words[1:]):
            next_one[current][following] += 1
        next_one[words[-1]][EOS_TOKEN] += 1

    # Initializing states
    states = {state: Lea.fromValFreqsDict(freqs)
              for state, freqs in next_one.items()}

    # Outputting visited states
    markers = {BOS_TOKEN, EOS_TOKEN}
    for _ in range(n_seq):
        state = BOS_TOKEN if start is None else start
        seq = [state]
        # stop on the end marker, on the length limit, or on a state that
        # was never seen in the corpus (e.g. unknown start word)
        while len(seq) < length and state != EOS_TOKEN and state in states:
            state = states[state].random()
            seq.append(state)
        print(' '.join(token for token in seq if token not in markers))
def det_ext_and_local_ids(self, prob_rspnd_local: int=0):
    """
    Map the given IDs to a locality (i.e. local or external) considering the given probabilities.

    The initiator IDs keep their locality; every responder ID that is not yet assigned gets a randomly drawn one.
    The resulting sets are stored in self.local_ids and self.external_ids and returned as well.

    :param prob_rspnd_local: the probability that a responder is local
    :return: the tuple (local_ids, external_ids)
    """
    local_ids = self.local_init_ids.copy()

    # set up probabilistic chooser
    local_weight = prob_rspnd_local*100
    external_weight = (1-prob_rspnd_local)*100
    rspnd_locality = Lea.fromValFreqsDict({"local": local_weight, "external": external_weight})

    external_ids = set(self.external_init_ids)
    id_sets = {"local": local_ids, "external": external_ids}

    # determine responder localities
    for id_ in self.respnd_ids:
        if id_ not in local_ids and id_ not in external_ids:
            target = id_sets.get(rspnd_locality.random())
            if target is not None:
                target.add(id_)

    self.local_ids = local_ids
    self.external_ids = external_ids
    return self.local_ids, self.external_ids
def nextState(self,fromState=None,n=1):
    ''' returns the StateAlea instance reached after n transitions,
        starting from the given fromState;
        if fromState is None, the starting state is self._state, i.e. the
        distribution built from the declared states;
        if n = 0, the starting state itself is returned (as a StateAlea);
        raises Lea.Error if n is negative
    '''
    if n < 0:
        raise Lea.Error("nextState method requires a positive value for argument 'n'")
    startState = self._state if fromState is None else fromState
    currentAlea = Lea.coerce(startState).getAlea()
    remaining = n
    while remaining > 0:
        currentAlea = self._nextStateBlea.given(self._state==currentAlea).getAlea()
        remaining -= 1
    return StateAlea(currentAlea,self)
def markov(corpus, start, length):
    """
    Generate a random word sequence from the word-to-word transition
    frequencies observed in corpus.

    Generation stops early when the current word has no recorded successor
    (a word only seen at the end of sentences, or an unknown start word),
    so the result may be shorter than length.

    :param corpus: iterable of sentences (strings split on whitespace)
    :param start: first word of the generated sequence
    :param length: maximum number of words to generate (start is always
                   included, even if length is lower than 1)
    :return: the list of generated words, beginning with start
    """
    # Counting occurrences
    next_one = defaultdict(Counter)
    for sentence in corpus:
        tokens = sentence.split()
        for current, following in zip(tokens, tokens[1:]):
            next_one[current][following] += 1

    # Initializing states
    states = {word: Lea.fromValFreqsDict(freqs)
              for word, freqs in next_one.items()}

    # Outputting visited states
    word = start
    words = [word]
    for _ in range(length - 1):
        if word not in states:
            # dead end: no transition was ever observed from this word
            break
        word = states[word].random()
        words.append(word)
    return words
def markov(corpus, start, length):
    """
    Generate a random word sequence from the word-to-word transition
    frequencies observed in corpus.

    Generation stops early when the current word has no recorded successor
    (a word only seen at the end of sentences, or an unknown start word),
    so the result may be shorter than length.

    :param corpus: iterable of sentences (strings split on whitespace)
    :param start: first word of the generated sequence
    :param length: maximum number of words to generate (start is always
                   included, even if length is lower than 1)
    :return: the list of generated words, beginning with start
    """
    # Counting occurrences
    next_one = defaultdict(Counter)
    for sentence in corpus:
        tokens = sentence.split()
        for current, following in zip(tokens, tokens[1:]):
            next_one[current][following] += 1

    # Initializing states
    states = {word: Lea.fromValFreqsDict(freqs)
              for word, freqs in next_one.items()}

    # Outputting visited states
    word = start
    words = [word]
    for _ in range(length - 1):
        if word not in states:
            # dead end: no transition was ever observed from this word
            break
        word = states[word].random()
        words.append(word)
    return words
def __init__(self,f,cleaArgs):
    ''' runs the base Lea initialization, then records the given f and
        cleaArgs arguments on the instance
    '''
    Lea.__init__(self)
    (self._f, self._cleaArgs) = (f, cleaArgs)
def __init__(self,lea1,condLea):
    ''' runs the base Lea initialization, then records the given lea1
        and condLea arguments on the instance
    '''
    Lea.__init__(self)
    (self._lea1, self._condLea) = (lea1, condLea)
def __init__(self,*args):
    ''' runs the base Lea initialization, then stores the given args,
        each one passed through Lea.coerce, as a tuple
    '''
    Lea.__init__(self)
    coercedArgs = [Lea.coerce(arg) for arg in args]
    self._leaArgs = tuple(coercedArgs)
from __future__ import division, print_function

from lea import Lea


def _report(title, symbol, dist):
    # print a heading, the probability notation and the distribution shown
    # through its asPct() method, one item per line
    print(title, symbol, dist.asPct(), sep='\n')


# define cancer dist
cancer = Lea.fromValFreqs(('yes', 1), ('no', 99))
_report('\nCancer Distribution', 'P(C)', cancer)

# prob for mamm given cancer == yes
mamm_g_cancer = Lea.fromValFreqs(('pos', 80), ('neg', 20))
_report('\nProb for mammogram given cancer', 'P(M|C=yes)', mamm_g_cancer)

# prob for mamm given cancer == no
mamm_g_no_cancer = Lea.fromValFreqs(('pos', 96), ('neg', 1000 - 96))
_report('\nProb for mammogram given NO cancer', 'P(M|C=no)', mamm_g_no_cancer)
def __init__(self,lea1,nbValues):
    ''' checks that nbValues is strictly positive, raising Lea.Error
        otherwise; then runs the base Lea initialization and records the
        given lea1 and nbValues arguments on the instance
    '''
    if nbValues <= 0:
        raise Lea.Error("draw method requires a strictly positive integer")
    Lea.__init__(self)
    (self._lea1, self._nbValues) = (lea1, nbValues)
def largest_n_out_of(pmf: Lea, n: int, out_of: int) -> Distribution:
    """
    Build the distribution of the n largest outcomes among out_of
    repetitions of pmf.

    Every outcome of pmf is first wrapped in a 1-tuple; the tuples are then
    combined out_of times through pmf's times method with a combiner that
    concatenates two tuples, sorts the result in ascending order and keeps
    at most its n last (largest) items.

    NOTE(review): whether times() invokes the combiner when out_of is 1 is
    not visible here; if it does not, the 1-tuples are returned untrimmed.

    :param pmf: the Lea distribution of a single outcome
    :param n: number of largest outcomes to keep; 0 (or less) keeps none
    :param out_of: number of repetitions of pmf
    :return: what times() returns, with sorted tuples of at most n outcomes as values
    """
    def keep_largest(outcomes1, outcomes2):
        merged = sorted(outcomes1 + outcomes2)
        # a plain merged[-n:] would return everything for n == 0, since
        # -0 == 0; an explicit start index keeps "no outcome" meaning none
        return tuple(merged[max(len(merged) - n, 0):])

    return pmf.map(lambda outcome: (outcome,)).times(out_of, keep_largest)
def assign_realistic_timestamps(messages: list, external_ids: set, local_ids: set, avg_delay_local: list,
                                avg_delay_external: list, zero_reference: float):
    """
    Assigns realistic timestamps to a set of messages. The messages are updated in place; nothing is returned.

    :param messages: the set of messages to be updated
    :param external_ids: the set of bot ids, that are outside the network, i.e. external
    :param local_ids: the set of bot ids, that are inside the network, i.e. local (not read by this function)
    :param avg_delay_local: the avg_delay distribution between the dispatch and the reception of a packet between
                            local computers
    :param avg_delay_external: the avg_delay distribution between the dispatch and the reception of a packet between
                               a local and an external computer; if it is empty, the local distribution is used for
                               external communication as well
    :param zero_reference: the timestamp which is regarded as the beginning of the pcap_file and therefore handled
                           like a timestamp that resembles 0
    """
    updated_msgs = []

    # Dict, takes a tuple of 2 Bot_IDs as a key (requester, responder), returns the time of the last response the
    # requester received; needed to make sure that additional requests are sent only after the response to the
    # last one was received. -1 means that no response was received yet.
    last_response = {(m.src, m.dst): -1 for m in messages}

    # update all timestamps
    for req_msg in messages:
        if req_msg in updated_msgs:
            # message already updated
            continue

        pair = (req_msg.src, req_msg.dst)
        last_time = last_response[pair]

        # if req_msg.timestamp would be before the timestamp of the response to the last request, req_msg needs
        # to be sent later (else branch)
        if last_time == -1 or last_time < (zero_reference + req_msg.time - 0.05):
            # update req_msg timestamp with a variation of up to 50ms
            req_msg.time = zero_reference + req_msg.time + uniform(-0.05, 0.05)
            updated_msgs.append(req_msg)
        else:
            req_msg.time = last_time + 0.06 + uniform(-0.05, 0.05)

        # update response if necessary
        if req_msg.refer_msg_id != -1:
            respns_msg = messages[req_msg.refer_msg_id]

            # check for local or external communication and update response timestamp with the respective avg
            # delay; the parentheses matter: without them "and" binds tighter than "or", so an external source
            # would select the external distribution even when avg_delay_external is empty
            is_external = req_msg.src in external_ids or req_msg.dst in external_ids
            if is_external and avg_delay_external:
                # external communication
                delay_dist = Lea.fromSeq(avg_delay_external)
            else:
                # local communication
                delay_dist = Lea.fromSeq(avg_delay_local)
            # the drawn delay is scaled by 0.001 before being added to the request time
            respns_msg.time = req_msg.time + float(delay_dist.random()) * 0.001

            updated_msgs.append(respns_msg)
            last_response[pair] = respns_msg.time
from __future__ import division, print_function

from lea import Lea


def _report(title, symbol, dist):
    # print a heading, the probability notation and the distribution shown
    # through its asPct() method, one item per line
    print(title, symbol, dist.asPct(), sep='\n')


# define cancer dist
cancer = Lea.fromValFreqs(('yes', 1), ('no', 99))
_report('\nCancer Distribution', 'P(C)', cancer)

# prob for mamm given cancer == yes
mamm_g_cancer = Lea.fromValFreqs(('pos', 80), ('neg', 20))
_report('\nProb for mammogram given cancer', 'P(M|C=yes)', mamm_g_cancer)

# prob for mamm given cancer == no
mamm_g_no_cancer = Lea.fromValFreqs(('pos', 96), ('neg', 1000 - 96))
_report('\nProb for mammogram given NO cancer', 'P(M|C=no)', mamm_g_no_cancer)

# conditional probability table
cpt_clauses = (
    (cancer == 'yes', mamm_g_cancer),
    (cancer == 'no', mamm_g_no_cancer),
)
mammograms = Lea.buildCPT(*cpt_clauses)
_report('\nMammograms', 'P(M)', mammograms)

# get joint probs for all events
def __init__(self, *args):
    ''' runs the base Lea initialization, stores the given args (each one
        passed through Lea.coerce) as a tuple and computes one integer
        factor per argument: the result of calcLCM on the _count values of
        the arguments' Alea, divided by the argument's own _count
    '''
    Lea.__init__(self)
    self._leaArgs = tuple(Lea.coerce(arg) for arg in args)
    aleaCounts = tuple(coerced.getAlea()._count for coerced in self._leaArgs)
    commonMultiple = calcLCM(aleaCounts)
    self._factors = tuple(commonMultiple // aleaCount for aleaCount in aleaCounts)
from __future__ import division, print_function from lea import Lea # define coin coin = Lea.fromValFreqs(('H', 1), ('T', 1)) print('Coin Distribution', coin, sep='\n') # define six-sided die die6 = Lea.fromValFreqs(('1', 1), ('2', 1), ('3', 1), ('4', 1), ('5', 1), ('6', 1)) print('Six-sided Die Distribution', die6, sep='\n') # define four-side die die4 = Lea.fromValFreqs(('1', 1), ('2', 1), ('3', 1), ('4', 1)) print('Four-sided Die Distribution',
def __init__(self,*words):
    ''' stores in self.words the Lea instance built by Lea.fromVals
        from the given words
    '''
    wordsLea = Lea.fromVals(*words)
    self.words = wordsLea
from collections import Counter
from matplotlib.pylab import plt
from matplotlib2tikz import save as tikz_save
from matplotlib import style
style.use('ggplot')
from lea import Lea

# initial value added to the first draw of U
W0 = 5  # Lea.fromVals(0, 1, 2)
S = Lea.fromVals(1, 2, 3)
X = Lea.fromVals(1, 2, 4)
# difference of the two distributions; one value of it is drawn at every step
U = S - X


def simulate():
    """
    Run N = 1000 random steps of W = max(W + U.random(), 0), counting how often
    each value of W is visited, and plot the relative frequencies five times
    (every N // 5 steps).

    :return: the lists (x, y) of the last plot: the visited values of W and
             their relative frequencies
    """
    count = Counter()
    N = 1000
    # starting value, not counted
    W = max(W0 + U.random(), 0)
    for k in range(1, N + 1):
        W = max(W + U.random(), 0)
        count[W] += 1
        if k % (N // 5) == 0:  # make 5 plots
            x = [w for w in count]
            tot = sum(count.values())
            # relative frequency of each value visited so far
            y = [count[w] / tot for w in count]
            plt.plot(x, y, label="N={}".format(k))
    return x, y


def exact():
    # same starting point as simulate(), but built with Lea.fastMax on W0 + U
    # instead of a random draw
    W = Lea.fastMax(W0 + U, 0)
def assign_ttls_from_caida(bot_configs):
    """
    Assign realistic TTL values to bots with respect to their IP, based on the CAIDA dataset.
    If there exists an entry for a bot's IP, the TTL is chosen based on a distribution over all used TTLs by
    this IP. If there is no such entry, the TTL is chosen based on a distribution over all used TTLs and their
    respective frequency. Local bots always get the TTL 128.

    The TTLs read from the CSV files are kept as strings, so the value drawn for a non-local bot presumably is a
    string whereas local bots get an integer - NOTE(review): confirm that consumers convert the value.

    :param bot_configs: the existing bot configurations; the "TTL" entry of each configuration is set in place
    """

    def get_ip_ttl_distrib():
        """
        Parses the CSV file containing a mapping between IP and their used TTLs.

        :return: returns a dict with the IPs as keys and dicts for their TTL distribution as values
        """
        ip_based_distrib = {}
        with open("resources/CaidaTTL_perIP.csv", "r") as csv_file:
            # every line consists of: IP, TTL, Frequency
            next(csv_file, None)  # skip CSV header line (tolerates an empty file)
            for line in csv_file:
                if not line.strip():
                    # blank line, e.g. at the end of the file
                    continue
                ip_addr, ttl, freq = line.split(",")
                # the values for ip_based_distrib are dicts with key=TTL, value=Frequency
                ip_based_distrib.setdefault(ip_addr, {})[ttl] = int(freq)
        return ip_based_distrib

    def get_total_ttl_distrib():
        """
        Parses the CSV file containing an overview of all used TTLs and their respective frequency.

        :return: returns a dict with the TTLs as keys and their frequencies as values
        """
        total_ttl_distrib = {}
        with open("resources/CaidaTTL_total.csv", "r") as csv_file:
            # every line consists of: TTL, Frequency, Fraction
            next(csv_file, None)  # skip CSV header line (tolerates an empty file)
            for line in csv_file:
                if not line.strip():
                    # blank line, e.g. at the end of the file
                    continue
                ttl, freq, _ = line.split(",")
                total_ttl_distrib[ttl] = int(freq)
        return total_ttl_distrib

    # get the TTL distribution for every IP that is available in "resources/CaidaTTL_perIP.csv"
    ip_ttl_distrib = get_ip_ttl_distrib()
    # build a probability dict for the total TTL distribution
    total_ttl_prob_dict = Lea.fromValFreqsDict(get_total_ttl_distrib())

    # loop over every bot id and assign a TTL to the respective bot
    for bot_id in sorted(bot_configs):
        bot_type = bot_configs[bot_id]["Type"]
        bot_ip = bot_configs[bot_id]["IP"]

        if bot_type == "local":
            bot_configs[bot_id]["TTL"] = 128
        # if there exists detailed information about the TTL distribution of this IP
        elif bot_ip in ip_ttl_distrib:
            ip_ttl_freqs = ip_ttl_distrib[bot_ip]
            # build a probability dict from this IP's TTL distribution
            source_ttl_prob_dict = Lea.fromValFreqsDict(ip_ttl_freqs)
            bot_configs[bot_id]["TTL"] = source_ttl_prob_dict.random()
        # otherwise assign a random TTL based on the total TTL distribution
        else:
            bot_configs[bot_id]["TTL"] = total_ttl_prob_dict.random()
# PyBossa is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with PyBossa. If not, see <http://www.gnu.org/licenses/>.
from lea import Lea
import pandas as pd
import ngram


def lower(s):
    """Return s converted to lower case."""
    return s.lower()


# Weighted transcriptions: each tuple is (transcribed text, frequency)
task_runs = Lea.fromValFreqs(("hola mundo", 55),
                             ("HoLa mundos", 45),
                             ("algo horroroso", 10))

# Draw 30 random transcriptions and normalise their case
observation = task_runs.random(30)
a = [lower(w) for w in observation]

# describe() on a column of strings reports the most frequent one as 'top'
df = pd.DataFrame({'info': a})
desc = df.describe()
top_string = desc['info']['top']
# parenthesised single-argument form: same output with the Python 2 print
# statement and with the Python 3 print function
print("The top transcribed word is: %s" % top_string)

# the strings in a are already lower case, no need to convert them again
G = ngram.NGram(a)
def setTermsChoices(self,*termsChoices):
    ''' stores in self.termsChoices the Lea instance built by
        Lea.fromValFreqs from the given termsChoices
    '''
    choicesLea = Lea.fromValFreqs(*termsChoices)
    self.termsChoices = choicesLea