def init_ground(self):
     """ Predicts on testing data and updates self.current_detection_rate """
     data_y, data_x = h.compose(self.train_y, self.train_x, self.pol_y, self.pol_x)
     data_y = h.strip(data_y) # strip None
     data_x = h.strip(data_x)
     m = svm.train(data_y, data_x, self.params)
     p_label, p_acc, p_val = svm.predict(self.test_y, self.test_x, m)
     self.p_label = p_label
     self.p_val = h.delist(p_val)
     self.current_detection_rate = p_acc[0]
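If `svm` here is a thin wrapper over libsvm's `svmutil` bindings (an assumption; the snippet does not show the import), the train/predict pattern and the meaning of `p_acc[0]` can be reproduced in isolation like this:

# Hypothetical standalone sketch, assuming the libsvm Python bindings (svmutil);
# the repo's own svm wrapper may differ.
from svmutil import svm_train, svm_predict

train_y = [1, -1, 1, -1]                             # toy labels
train_x = [{1: 0.9}, {1: 0.1}, {1: 0.8}, {1: 0.2}]   # sparse {index: value} features
test_y = [1, -1]
test_x = [{1: 0.7}, {1: 0.3}]

m = svm_train(train_y, train_x, '-t 0 -c 1')         # linear kernel
p_label, p_acc, p_val = svm_predict(test_y, test_x, m)
print p_acc[0]  # accuracy percentage -- what init_ground stores as the detection rate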
Example #3
 def __init__(self, s, sett):
     self.sett = sett
     self.s = s
     self.words = h.strip(s).split()
     self.score = sett.calcScore(s)
     self.count = len(sett.canRegen)
     self.children = [None] * len(sett.canRegen)
     print "--Created node [", s, "]", self.score
Example #4
 def __init__(self, s, sett):
     self.s = s
     self.sett = sett
     self.isbad = False
     try:  # fixes unicode characters trying to sneak through; see https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
         self.words = h.strip(s).split()
     except Exception as e:
         #print s, e
         self.isbad = True
     self.score = None  #sett.calcScore
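The linked Stack Overflow question is commonly answered with `unicodedata` normalization; a minimal sketch of that approach (not necessarily what `h.strip` does internally) is:

# Common accent-stripping recipe from the linked SO question; h.strip may behave differently.
import unicodedata

def strip_accents(s):
    nfkd = unicodedata.normalize('NFKD', unicode(s))
    return ''.join(c for c in nfkd if not unicodedata.combining(c))

print strip_accents(u'caf\xe9')  # cafe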
Example #5
def oldeval(s):
	global temp

	if temp:
		return None
	s = h.strip(s)
	words = s.split()
	i = random.choice([0,1,2,5]) #TODO add 4
	ret = words
	print "EVAL REPLACE:",i,ret[i]
	ret[i] = None
	temp = True
	return ret
Example #6
	def parseValueFormat(self, value):
		value = helpers.strip(value)
		## Integer/floating portion of number = group 1
		## Metric prefix = group 2
		tokens = match(r'([0-9]+\.?[0-9]*)\s?([a-zA-Z]?)', value)
		try:
			componentValue = float(tokens.group(1))
			prefixValue = METRIC_PREFIX_VALUES[tokens.group(2)]
		except AttributeError as ae:
			alert = "Invalid value for the input '{0}'. Input should be of the form <number> <metric prefix> where the number is a float or integer. Quitting now."
			helpers.eclbPrint(alert.format(value))
			return None
		except KeyError as ke:
			alert = "Invalid metric prefix for the input '{0}'. Input should be of the form <number> <metric prefix> where the prefix is in the list {1}. Quitting now."
			helpers.eclbPrint(alert.format(value, [prefix for prefix in METRIC_PREFIX_VALUES]))
			return None

		return float(componentValue * prefixValue)
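The regex above splits the input into a numeric part (group 1) and an optional one-letter metric prefix (group 2); the sketch below exercises the same parsing with an assumed `METRIC_PREFIX_VALUES` table (the module's real table is not shown in the snippet):

# Standalone sketch of the value parsing above; the prefix table is an assumed subset.
from re import match

METRIC_PREFIX_VALUES = {'': 1.0, 'k': 1e3, 'M': 1e6, 'm': 1e-3, 'u': 1e-6}

def parse_value(value):
    tokens = match(r'([0-9]+\.?[0-9]*)\s?([a-zA-Z]?)', value.strip())
    return float(tokens.group(1)) * METRIC_PREFIX_VALUES[tokens.group(2)]

print parse_value('4.7 k')  # 4700.0
print parse_value('100')    # 100.0 (empty prefix maps to 1.0)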
Example #8
def cluster_remaining(au, working_set):
    """ This function is called if weighted_initial returns NO_CENTROIDS, meaning there are no more misabeled emails to use as centers.
    The remaining emails in the working set are then returned as one cluster.
    """

    print "No more cluster centroids, grouping all remaining emails into one cluster"

    first_state_rate = au.current_detection_rate

    size = len(h.strip(working_set[0] + working_set[2])) # get number of remaining emails
    init_email = None
    init_pos = None
    label = None
    data_y, data_x = h.compose_set(working_set)
    for i, l in enumerate(data_y): # loop to find the first email that is not None
        if l is not None:
            label = l
            init_pos = i
            init_email = data_x[i]
            break
    center = (init_email, init_pos)

    cluster = Cluster(center, size, au, label, au.distance_opt, working_set=working_set)

    au.unlearn(cluster)
    au.init_ground()
    new_detection_rate = au.current_detection_rate

    au.learn(cluster) # relearn cluster in real training space so deltas of future clusters are not influenced
    second_state_rate = au.current_detection_rate
    
    net_rate_change = second_state_rate - first_state_rate
    au.current_detection_rate = first_state_rate

    assert(au.current_detection_rate == first_state_rate), str(au.current_detection_rate) + " " + str(first_state_rate)
    print "clustered remaining with a net rate change of ", second_state_rate, " - ", first_state_rate, " = ", net_rate_change
    
    return net_rate_change, cluster
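The rate bookkeeping here follows an unlearn/re-evaluate/relearn pattern; a minimal stand-in sketch of just that pattern (using the same au method names that appear in the snippet) is:

# Minimal sketch of the unlearn -> re-evaluate -> relearn bookkeeping used above.
def impact_of(au, cluster):
    first_state_rate = au.current_detection_rate
    au.unlearn(cluster)
    au.init_ground()                               # retrain and rescore without the cluster
    second_state_rate = au.current_detection_rate
    au.learn(cluster)                              # restore the training space
    au.current_detection_rate = first_state_rate   # report the pre-unlearn rate
    return second_state_rate - first_state_rate    # net_rate_change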
Example #9
def doit(topic,noun,w2v,pens,retries=0):
	#if not stanford.check():
	#	print "START THE SERVER"
	#	raw_input('Press Enter...')
	f = random.choice(formats)
	form = f[0]
	axis = f[1]
	canRegen = f[2]
	s = form(topic,noun,w2v)
	regenf = lambda lock: form(topic,noun,w2v,lock)
	scoref = lambda x: h.getSkipScores(axis[0],axis[1][0],axis[1][1],x,pens)
	if s is None or isBad(h.getV(s)):
		if retries > 20:
			return None
		print "RETRYING"
		return doit(topic,noun,w2v,pens,retries+1)
	else:
		#instead of just randomly genning one story, randomly gen one for each verb (species) to get started?
		best = priority.best(s,regenf,canRegen,scoref)[0]
		raw = h.strip(best).split()[:3]
		notraw = best.split()
		best = ". ".join([h.firstCharUp(h.makePlural(r)) for r in raw])+". "+" ".join(notraw[3:])
		print best,"\n"
		return best
Example #10
def getIndex(story, i):
    return h.strip(story.split(' ')[i])
Example #11
 def __init__(self, s, sett):
     self.sett = sett
     self.s = s
     self.words = h.strip(s).split()
     self.score = None  #sett.calcScore
Example #12
        if c == p and i != j:
            interpos[i].append(j)

genss = []
finalgarbs = []
scoress = []
badis = []
import matplotlib.pyplot as plt
for i, g in enumerate(garbs):
    if not g:
        genss.append(None)
        continue
    grb = g[:]
    f = formats[i]
    axes = f[1]
    gs = [h.strip(' '.join(grb))]
    ss = h.strip(random.choice(newmicro.doit(formats, w2v, pens, f, False))[0])
    genss.append(ss)
    for j, w in enumerate(
            ss.split(' ')
    ):  #(f['words']): #use a generated story instead of f['words'] to avoid bias toward the axis story itself
        grb[j] = w
        gs.append(h.strip(' '.join(grb)))
    scs = h.getSkipScores(axes[0], axes[1], axes[2], gs, pens)
    finalgarbs.append(gs)
    scoress.append(scs)
    prev = -10000
    for s in scs:
        if s < prev:
            print i, f[3]['raw'], axes, "\n", [(gs[i], scs[i])
                                               for i in range(len(gs))], "\n"
Example #13
def makeFormats(w2v,
                pens,
                bestaxes=True,
                w2vmax=30,
                w2vmin=10,
                backoff=False,
                verbgen=False):
    ret = []
    ex = 0
    seen = set()
    for fraw in formatssw.makeAllRawForms():
        if fraw['raw'] in seen:
            continue
        seen.add(fraw['raw'])
        s = allIndices(fraw['root'])
        if s != set([0, 1, 2, 3, 4, 5]) or not checkChars(fraw['plug']):
            #print "SKIP:", fraw['raw'], s
            ex += 1
            continue
        processPOS(
            fraw['root'], w2v
        )  # Preprocess each node by checking whether word_pos is in w2v and massaging them where possible
        genf = lambda lock, fraw=fraw, w2v=w2v, w2vmax=w2vmax, w2vmin=w2vmin, verbgen=verbgen: gen(
            fraw, w2v, lock, w2vmax, w2vmin, verbgen)
        regen = range(6)
        del regen[fraw['root']['index']]
        goodstory = h.strip(" ".join(fraw['words']))
        ret.append([genf, [badstory, goodstory, goodstory, True], regen, fraw])
    if ex:
        print "Number of excluded (bad) formats:", ex, "(%d total, %f%%)" % (
            len(ret), (float(ex) / len(ret) * 100))

    poss = []
    for tup in ret:
        f = tup[3]
        poss.append(''.join(
            posListRec(f['root'], [None, None, None, None, None, None])))
    interpos = defaultdict(
        list
    )  #dictionary of format index to list of other indices that have same POS
    for i, c in enumerate(poss):
        for j, p in enumerate(poss):
            if c == p and i != j:
                interpos[i].append(j)

    if not bestaxes:
        for i, tup in enumerate(ret):
            sames = interpos[i]
            otheraxis = None
            axes = tup[1]
            if len(sames) < 1:
                otheraxis = axes[1]  #duplicate single good axis
            else:
                otheraxis = h.strip(ret[random.choice(sames)][3]['raw'])
            axes[2] = otheraxis
    else:
        #==========
        # calculate best axes for each cluster of 3+ stories (or read from file if stored there)
        # all 1- or 2-cluster formats will get 1 or 2 different axes, respectively, and be flagged (axes[3] == True) as needing the "10-20% cutoff" instead

        possets = []
        for k in interpos:
            found = False
            for s in possets:
                if k in s:
                    found = True
                    break
            if found:
                continue
            possets.append(set(interpos[k] + [k]))

        scoresfn = 'axesscores'
        axscores = {}
        with open(scoresfn, 'r') as f:
            for line in f:
                line = line.strip()
                parts = line.split('\t')
                axscores[parts[0]] = float(parts[1])

        for interis in possets:
            if len(interis) == 2:
                newaxes = [getstory(j, ret) for j in interis]
                for i in interis:
                    ret[i][1] = ret[i][1][:1] + newaxes + [
                        True
                    ]  #note: difference between l[:1] and l[0] is that the former returns a list!
                continue
            #else: use non-exemplar best axes
            candidates = {}
            for ai1, ai2 in combinations(interis, 2):
                k = getstory(ai1, ret) + "; " + getstory(ai2, ret)
                v = 0
                if k in axscores:
                    v = axscores[k]
                else:
                    v = testaxes(ai1, ai2, interis, pens, ret)
                    axscores[k] = v  #for posterity
                candidates[k] = v
            best = sorted(candidates.keys(),
                          key=lambda k: candidates[k],
                          reverse=True)
            for i in interis:
                exemplar = getstory(i, ret)
                besti = 0
                while exemplar in best[
                        besti]:  #pick the best axes that don't include the format's exemplar (avoid plagiarism)
                    besti += 1
                newaxes = best[besti].split('; ')
                ret[i][1] = ret[i][1][:1] + newaxes

        with open(scoresfn, 'w') as fout:
            for k in axscores:
                fout.write(k + "\t" + str(axscores[k]) + "\n")
    #==========

    if backoff:
        bests = []
        partial = []
        for f in ret:
            s = h.strip(f[3]['raw'])
            if s not in f[1]:
                bests.append(f)
            elif s != f[1][2]:
                partial.append(f)
        if bests:
            return bests
        if partial:
            return partial
    return ret
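The `interpos` bookkeeping above maps each format index to the other indices whose POS signature matches; it can be exercised on its own (the POS strings below are invented for illustration):

# Standalone sketch of the interpos grouping used above; POS signatures are made up.
from collections import defaultdict

poss = ['NVNNVN', 'NVNNVN', 'NNVNVN', 'NVNNVN']  # one POS signature per format

interpos = defaultdict(list)  # format index -> other indices with the same POS signature
for i, c in enumerate(poss):
    for j, p in enumerate(poss):
        if c == p and i != j:
            interpos[i].append(j)

print dict(interpos)  # {0: [1, 3], 1: [0, 3], 3: [0, 1]}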
Example #14
def getstory(i, fmts):
    return h.strip(fmts[i][3]['raw'])
Example #15
def getstory(i):
    return h.strip(formats[i][3]['raw'])
Example #16
def cluster_au(au, gold=True):
    """Clusters the training space of an ActiveUnlearner and returns the list of clusters."""
    
    print "\n----------------------Beginning the Clustering Process-----------------------\n"
    cluster_list = [] # list of tuples (net_rate_change, cluster)
    train_y = copy.deepcopy(au.train_y)
    train_x = copy.deepcopy(au.train_x)
    pol_y = copy.deepcopy(au.pol_y)
    pol_x = copy.deepcopy(au.pol_x)

    training = [train_y, train_x, pol_y, pol_x] # create the working set

    original_training_size = len(h.strip(pol_y)) + len(h.strip(train_y))

    print "\nResetting mislabeled...\n"
    mislabeled = au.get_mislabeled(update=True) # gets an array of all false positives, false negatives
    au.mislabeled_chosen = [] # reset set of clustered mislabeled emails in this instance of au

    print "\n Clustering...\n"
    pre_cluster_rate = au.current_detection_rate
    training_size = len(h.strip(pol_y)) + len(h.strip(train_y))
    while training_size > 0: # loop until all emails in phantom training space have been assigned
        print "\n-----------------------------------------------------\n"
        print "\n" + str(training_size) + " emails out of " + str(original_training_size) + \
              " still unclustered.\n"

        # Chooses an arbitrary email from the mislabeled emails and returns the training email closest to it.
        # The final call and source of current_seed is the mislabeled_initial() function
        # current_seed = cluster_methods(au, "mislabeled", training, mislabeled) 
        current_seed = None 
        label = None
        while current_seed is None:
            label, init_pos, current_seed = au.select_initial(mislabeled, "weighted", training) 

        if str(current_seed) == 'NO_CENTROIDS':
            cluster_result = cluster_remaining(au, training)
        else:
            cluster_result = determine_cluster(current_seed, au, label, init_pos, working_set=training, gold=gold) # if true, relearn clusters after returning them
        if cluster_result is None:
            print "!!!How did this happen?????"
            sys.exit(cluster_result)

        net_rate_change, cluster = cluster_result
        # After getting the cluster and net_rate_change, you relearn the cluster in the original dataset if impact=True

        post_cluster_rate = au.current_detection_rate

        # make sure the cluster was properly relearned
        # assert(post_cluster_rate == pre_cluster_rate), str(pre_cluster_rate) + " " + str(post_cluster_rate)
        # print "cluster relearned successfully: au detection rate back to ", post_cluster_rate

        cluster_list.append([net_rate_change, cluster])

        print "\nRemoving cluster from shuffled training set...\n"

        h.unlearn(training, cluster.cluster_set)
        training_size = len(h.strip(pol_y)) + len(h.strip(train_y))

    cluster_list.sort() # sorts by net_rate_change
    print "\nClustering process done and sorted.\n"
    return cluster_list
Example #17
def getindex(s):
    for i, f in enumerate(formats):
        if h.strip(f[3]['raw']) == s:
            return i