def setUp(self):
    """Build the fixture: an output buffer, scratch directories and a Porter.

    Steps, in order:
      1. store a fresh ``StringIO`` on ``self.out``;
      2. call ``self.cleanUp()`` (defined elsewhere in the class --
         presumably removes leftovers from an earlier run; confirm there);
      3. create the ``var`` and ``gremlin`` directories relative to the
         current working directory;
      4. construct ``Porter`` with the buffer as its ``stdout`` argument and
         set its ``gremlin`` attribute to the absolute path of ``gremlin``.
    """
    # In-memory sink handed to Porter below.
    self.out = StringIO()
    self.cleanUp()

    # os.mkdir raises if a directory already exists, so this relies on the
    # cleanUp() call above having left the working directory free of them.
    for dirname in ('var', 'gremlin'):
        os.mkdir(dirname)

    porter = Porter(stdout=self.out)
    self.porter = porter
    porter.gremlin = abspath('gremlin')
def weights_for_block(words, call_position, coefficient):
    """Compute a weight for every (normalised) word of one block.

    Each word is first passed through ``Porter().get`` (defined elsewhere in
    the project -- presumably a stemmer; confirm against its definition), and
    the returned values become the keys of the result.

    Two weighting schemes are available, selected by ``call_position``:

    * ``call_position`` truthy ("irregular text"): positional weighting.
      The word at zero-based index ``p`` in a list of length ``L``
      contributes ``(L - p) / (L * (L + 1) / 2.0)``, rounded to one decimal
      place. Contributions of repeated words are added together.
    * ``call_position`` falsy ("uniform text"): frequency weighting,
      ``N(word) / L`` where ``N(word)`` is the number of occurrences of the
      word in the list.

    NOTE(review): because every positional contribution is rounded to one
    decimal place before being accumulated, positional weights do not
    necessarily sum to exactly 1 (for long lists many contributions round
    to 0.0). The rounding is kept as-is to preserve existing results.

    :param words: An iterable of words, e.g. ``['word1', 'word2', 'word3']``.
        A falsy value (empty list, ``None``) yields the fallback result.
    :param call_position: Truthy selects positional weighting, falsy selects
        frequency weighting.
    :param coefficient: Opaque pass-through value; it is not used in the
        computation and is returned unchanged.
    :return: A tuple ``(weights, coefficient)`` where ``weights`` maps each
        normalised word to a float weight. When there are no words the
        mapping is ``{'': 1.0}``.
    """
    # Local import keeps this block self-contained within the file.
    from collections import Counter

    if not words:
        return {'': 1.0}, coefficient

    porter = Porter()
    # Build a real list: on Python 3 ``map`` returns a one-shot iterator that
    # supports neither ``len()`` nor ``.count()``, both of which the
    # weighting below relies on.
    stems = [porter.get(word) for word in words]
    if not stems:
        # ``words`` was truthy but produced no items (e.g. an exhausted
        # iterator); fall back exactly as for empty input.
        return {'': 1.0}, coefficient

    total = len(stems)

    if call_position:
        # Irregular text: earlier words weigh more. The denominator is the
        # sum 1 + 2 + ... + L, computed in floating point.
        denominator = (float(total) * (total + 1)) / 2.0
        weights = {}
        for position, stem in enumerate(stems):
            share = round((total - position) / denominator, 1)
            if stem in weights:
                weights[stem] += share
            else:
                weights[stem] = share
        return weights, coefficient

    # Uniform text: relative frequency of each word. Counter counts every
    # word in a single pass and keeps first-occurrence order.
    length = float(total)
    weights = {stem: count / length for stem, count in Counter(stems).items()}
    return weights, coefficient
os.path.join(droot, 'example.com')) os.symlink(os.path.join(sroot, 'bar', 'website_8810'), os.path.join(droot, 'example.net')) os.symlink(os.path.join(sroot, 'baz', 'website_8090'), os.path.join(droot, 'example.org')) # set up some domain aliases os.symlink(os.path.join(droot, 'example.com'), os.path.join(droot, 'alias.example.com')) os.symlink(os.path.join(droot, 'example.com'), os.path.join(droot, 'alias.example.net')) os.symlink(os.path.join(droot, 'example.org'), os.path.join(droot, 'alias.example.org')) # instantiate Porter with our fake domain root porter = Porter(droot) ## # ok, now run some tests! ## testosterone( """\ # if already canonical then we get NULL porter.canonicalize('example.com') == 'NULL' porter.canonicalize('example.net') == 'NULL' porter.canonicalize('example.org') == 'NULL' # test some real live aliases #print porter.canonicalize('alias.example.com') porter.canonicalize('alias.example.com') == 'example.com'