示例#1
0
    def setUp(self):
        """Build the fixture: an output buffer, two directories and a Porter.

        Output is captured in an in-memory ``StringIO`` kept on ``self.out``
        and handed to ``Porter`` as its ``stdout``. The ``var`` and
        ``gremlin`` directories are created relative to the current working
        directory, and the Porter's ``gremlin`` attribute is set to the
        absolute path of the latter.
        """
        self.out = StringIO()
        # cleanUp() is defined elsewhere; presumably it removes leftovers of
        # earlier runs so the mkdir calls below do not fail -- TODO confirm.
        self.cleanUp()

        # Directories the test works in.
        for directory in ('var', 'gremlin'):
            os.mkdir(directory)

        # Porter writes to the captured buffer instead of the real stdout.
        porter = Porter(stdout=self.out)
        self.porter = porter
        porter.gremlin = abspath('gremlin')
示例#2
0
def weights_for_block(words, call_position, coefficient):
    """
    Compute a weight for every word of one block.

    Every word is first passed through ``Porter().get`` (presumably a
    stemmer -- confirm against the Porter class); the returned dictionary is
    keyed by that normalised form, so words that normalise to the same value
    share one entry.

    One of two formulas is used, selected by ``call_position``:

    * ``call_position`` is true (irregular text):
      ``W = (L - Position) / (L * (L + 1) / 2.0)``, where ``L`` is the number
      of words and ``Position`` is the zero-based index of the word. Each
      contribution is rounded to one decimal place and contributions of
      repeated words are added up. Because of the rounding the weights are
      not guaranteed to sum to exactly 1 (for long lists they round to 0.0).
    * ``call_position`` is false (uniform text):
      ``W = N(word) / L``, where ``N(word)`` is the number of occurrences of
      the word in the list. These weights sum to 1.

    :param words: A list of words like ['word1', 'word2', 'word3'].
    :param call_position: Flag selecting the formula (see above).
    :param coefficient: Not used in the calculation; returned unchanged.
    :return: A tuple ``(weights, coefficient)`` where ``weights`` maps each
            normalised word to its weight as a float.
            For empty input the result is ``({'': 1.0}, coefficient)``.
    """
    fallback = {'': 1.0}
    if not words:
        return fallback, coefficient

    porter = Porter()
    # Build a real list: on Python 3 ``map`` returns a one-shot iterator that
    # supports neither len() nor the repeated passes needed below.
    stems = [porter.get(word) for word in words]
    total = len(stems)
    if not total:
        # Only reachable for a truthy but empty iterable (e.g. a generator).
        return fallback, coefficient

    if call_position:
        # Irregular text: earlier positions weigh more.
        normaliser = (float(total) * (total + 1)) / 2.0
        weights = {}
        for position, stem in enumerate(stems):
            share = round((total - position) / normaliser, 1)
            weights[stem] = weights.get(stem, 0) + share
        return weights, coefficient

    # Uniform text: relative frequency, counted in a single pass instead of
    # calling list.count() once per word.
    from collections import Counter
    occurrences = Counter(stems)
    weights = {}
    for stem, count in occurrences.items():
        weights[stem] = count / float(total)
    return weights, coefficient
示例#3
0
           os.path.join(droot, 'example.com'))
# link the example.net and example.org domains to their website directories
for _owner, _site, _domain in (
        ('bar', 'website_8810', 'example.net'),
        ('baz', 'website_8090', 'example.org'),
):
    os.symlink(os.path.join(sroot, _owner, _site),
               os.path.join(droot, _domain))
del _owner, _site, _domain

# set up some domain aliases (each alias is a symlink to a domain entry)
for _target, _alias in (
        ('example.com', 'alias.example.com'),
        ('example.com', 'alias.example.net'),
        ('example.org', 'alias.example.org'),
):
    os.symlink(os.path.join(droot, _target),
               os.path.join(droot, _alias))
del _target, _alias

# instantiate Porter with our fake domain root
porter = Porter(droot)

##
# ok, now run some tests!
##
testosterone(
    """\

# if already canonical then we get NULL
porter.canonicalize('example.com') == 'NULL'
porter.canonicalize('example.net') == 'NULL'
porter.canonicalize('example.org') == 'NULL'

# test some real live aliases
#print porter.canonicalize('alias.example.com')
porter.canonicalize('alias.example.com') == 'example.com'