예제 #1
0
    def setUp(self):
        """Prepare the output buffer, scratch directories and Porter for a test.

        Output written by the Porter is captured in ``self.out``; the Porter's
        ``gremlin`` attribute is pointed at the absolute path of the newly
        created ``gremlin`` directory.
        """
        # ready,... (cleanUp() runs before the directories are created)
        self.out = StringIO()
        self.cleanUp()

        # ...set,...
        for scratch_dir in ('var', 'gremlin'):
            os.mkdir(scratch_dir)
        self.porter = Porter(stdout=self.out)
        self.porter.gremlin = abspath('gremlin')
예제 #2
0
class TestPorter(unittest.TestCase):
    """Unit tests for the Porter state handling."""

    def setUp(self):
        """Create a fresh Porter for every test.

        NOTE(review): the meaning of the constructor argument ``1`` is not
        visible from this test -- confirm against Porter.__init__.
        """
        self.porter = Porter(1)

    def test_work(self):
        """work() must leave the porter in the ``pending`` state again.

        According to the original note, work() goes through all the porter
        states and returns to pending once complete.
        """
        simState = Mock()
        # Configure the stubbed env.timeout() to return 20.  The previous
        # statement, ``simState.env.timout(return_value=20)``, merely *called*
        # a misspelled auto-created mock attribute and configured nothing.
        simState.env.timeout.return_value = 20
        self.assertEqual(self.porter.state, Porter.pending)
        self.porter.work(simState)
        self.assertEqual(self.porter.state, Porter.pending)
예제 #3
0
def prepare(data, titles = False, porter = False):
    '''
        Build the feature matrix for the given messages.
        Arguments:
            data - DataFrame with the source messages; the "cluster" and
                "text" columns are read, plus "title" when titles is true.
            titles - when true, each message text is prefixed with its title
                and a single space.
            porter - when true, every message also gets a "lemmas" set built
                with Porter.stem from the first element of each token.
        Returns: DataFrame - the feature matrix, one row per pair produced by
            combinations(messages, 2).
    '''
    # Number the clusters in order of first appearance.
    cluster_nums = {}
    for number, label in enumerate(data["cluster"].unique()):
        cluster_nums[label] = number

    messages = []
    for ind, row in data.iterrows():
        prefix = (row["title"] + " ") if titles else ""
        messages.append({"id": ind,
                         "text": prefix + row["text"],
                         "cluster": cluster_nums[row["cluster"]]})

    tokens = mystem_parse([message["text"] for message in messages])
    assert len(tokens) == len(messages)
    for message, parsed in zip(messages, tokens):
        message["tokens"] = parsed
        if porter:
            message["lemmas"] = {Porter.stem(norm) for norm, _ in parsed}

    # The number of pairs does not change while iterating, so compute it once.
    pair_total = len(messages) * (len(messages) - 1) // 2
    rows = []
    for done, (m1, m2) in enumerate(combinations(messages, 2), 1):
        print("Прогресс в сочетаниях: {}/{}".format(done, pair_total),
              end='\r')
        features = {'id1': m1['id'], 'id2': m2['id'],
                    'similar': int(m1["cluster"] == m2["cluster"])}
        # Default every grammeme column to 0 before the comparison fills in
        # whatever it reports.
        features.update(dict.fromkeys(MYSTEM_GRAMMEMS, 0))
        features.update(texts_comparison(m1, m2))
        rows.append(features)
    print("")

    column_order = ['id1', 'id2'] + MYSTEM_GRAMMEMS
    column_order = column_order + ["semantic_repeats", "similar"]
    return ps.DataFrame(rows, columns=column_order)
예제 #4
0
def weights_for_block(words, call_position, coefficient):
    """
        Calculate a weight for each word in a list of words (one block).

        Every word is first passed through ``Porter().get`` (presumably a
        stemmer -- confirm against the Porter class); the weights are keyed by
        the values it returns.

        If call_position is true, each occurrence contributes
            W = (L - Position) / (L * (L + 1) / 2.0), rounded to one decimal,
            where L is the length of the list and Position is the zero-based
            index of the word. Contributions of repeated words are summed.
            Because of the rounding the weights only approximately sum to 1.

        If call_position is false, the formula W = N(word) / L is used,
            where N(word) is the frequency of the word in the list and L is
            the length of the list.

    :param words: A list of words like ['word1', 'word2', 'word3']
    :param call_position: Selects the formula (see above). True or False.
    :param coefficient: Not used by the calculation; returned unchanged.
    :return: Tuple ``(weights, coefficient)`` where weights is a dict mapping
            each processed word to its weight. For empty input the result is
            ``({'': 1.0}, coefficient)``.
    """
    fallback = {'': 1.0}
    if not words:
        return fallback, coefficient

    porter = Porter()
    # Materialize the result: on Python 3 map() is lazy, so the len() and
    # counting below would fail on a bare map object.
    stems = [porter.get(word) for word in words]
    total = len(stems)
    if not total:
        return fallback, coefficient

    weights = dict()
    if call_position:
        # Irregular text: earlier positions weigh more.
        denominator = (float(total) * (total + 1)) / 2.0
        for num, stem in enumerate(stems):
            share = round((total - num) / denominator, 1)
            weights[stem] = weights.get(stem, 0.0) + share
    else:
        # Uniform text: relative frequency, counted in a single pass instead
        # of calling list.count() once per word.
        counts = dict()
        for stem in stems:
            counts[stem] = counts.get(stem, 0) + 1
        for stem, occurrences in counts.items():
            weights[stem] = occurrences / float(total)
    return weights, coefficient
예제 #5
0
def prepare(data, titles=False, porter=False):
    '''
        Build the feature matrix for the given messages.
        Arguments:
            data - DataFrame with the source messages; the "cluster" and
                "text" columns are read, plus "title" when titles is true.
            titles - when true, the title and a space are put in front of
                every message text.
            porter - when true, a "lemmas" set (Porter.stem applied to the
                first element of each token) is stored on every message.
        Returns: DataFrame - the feature matrix, one row per pair produced by
            combinations(messages, 2).
    '''
    # Map every cluster label to its index in order of first appearance.
    cluster_nums = dict(
        (label, position)
        for position, label in enumerate(data["cluster"].unique()))

    messages = []
    for ind, row in data.iterrows():
        lead = (row["title"] + " ") if titles else ""
        record = {
            "id": ind,
            "text": lead + row["text"],
            "cluster": cluster_nums[row["cluster"]],
        }
        messages.append(record)

    texts = [record["text"] for record in messages]
    tokens = mystem_parse(texts)
    assert len(tokens) == len(messages)
    for record, parsed in zip(messages, tokens):
        record["tokens"] = parsed
        if not porter:
            continue
        record["lemmas"] = {Porter.stem(norm) for norm, _ in parsed}

    # Total number of pairs; constant for the whole loop below.
    n_messages = len(messages)
    n_pairs = n_messages * (n_messages - 1) // 2
    progress = "Прогресс в сочетаниях: {}/{}"

    rows = []
    for index, pair in enumerate(combinations(messages, 2)):
        m1, m2 = pair
        print(progress.format(index + 1, n_pairs), end='\r')
        same_cluster = m1["cluster"] == m2["cluster"]
        entry = {'id1': m1['id'], 'id2': m2['id'], 'similar': int(same_cluster)}
        # Every grammeme column starts at 0; the comparison result is applied
        # on top of these defaults.
        for grm in MYSTEM_GRAMMEMS:
            entry[grm] = 0
        entry.update(texts_comparison(m1, m2))
        rows.append(entry)
    print("")

    return ps.DataFrame(
        rows,
        columns=['id1', 'id2'] + MYSTEM_GRAMMEMS
        + ["semantic_repeats", "similar"])
예제 #6
0
 def setUp(self):
     """Create the Porter instance used by the test and store it on self.

     NOTE(review): the meaning of the constructor argument ``1`` is not
     visible here -- confirm against Porter.__init__.
     """
     self.porter = Porter(1)
예제 #7
0
           os.path.join(droot, 'example.com'))
# link two more fake domain names under droot to website directories under
# sroot (sroot and droot are defined earlier in the script)
os.symlink(os.path.join(sroot, 'bar', 'website_8810'),
           os.path.join(droot, 'example.net'))
os.symlink(os.path.join(sroot, 'baz', 'website_8090'),
           os.path.join(droot, 'example.org'))

# set up some domain aliases: each alias is a symlink that points at another
# domain entry inside droot (note that alias.example.net points at
# example.com, not example.net)
os.symlink(os.path.join(droot, 'example.com'),
           os.path.join(droot, 'alias.example.com'))
os.symlink(os.path.join(droot, 'example.com'),
           os.path.join(droot, 'alias.example.net'))
os.symlink(os.path.join(droot, 'example.org'),
           os.path.join(droot, 'alias.example.org'))

# instantiate Porter with our fake domain root; the tests that follow call
# porter.canonicalize() on the names created above
porter = Porter(droot)

##
# ok, now run some tests!
##
testosterone(
    """\

# if already canonical then we get NULL
porter.canonicalize('example.com') == 'NULL'
porter.canonicalize('example.net') == 'NULL'
porter.canonicalize('example.org') == 'NULL'

# test some real live aliases
#print porter.canonicalize('alias.example.com')
porter.canonicalize('alias.example.com') == 'example.com'
예제 #8
0
class TestCRUD(unittest.TestCase):
    """Exercise Porter's add/mk, ls, rm and mv commands through ``onecmd``.

    Every test runs against freshly created ``var`` and ``gremlin``
    directories in the current working directory. Porter output is captured
    in ``self.out``.
    """

    def setUp(self):
        """Create the scratch directories and a Porter writing to self.out."""
        # ready,...
        self.out = StringIO()
        self.cleanUp()

        # ...set,...
        os.mkdir('var')
        os.mkdir('gremlin')
        self.porter = Porter(stdout=self.out)
        self.porter.gremlin = abspath('gremlin')

        # ... go!

    def tearDown(self):
        """Remove the scratch directories again."""
        self.cleanUp()

    def cleanUp(self):
        """Delete the ``var`` and ``gremlin`` directories and their files."""
        # clean up our filesystem
        for directory in ('var', 'gremlin'):
            if isdir(directory):
                test_dir = abspath(directory)
                for datafile in os.listdir(test_dir):
                    os.remove(join(test_dir, datafile))
                os.rmdir(test_dir)

    # -- helpers -----------------------------------------------------------

    def _varFiles(self):
        """Return the file names in the Porter's var directory, sorted.

        os.listdir() returns entries in arbitrary order, so the listing is
        sorted before it is compared against an expected list.
        """
        return sorted(os.listdir(self.porter.var))

    def _addSampleDomains(self):
        """Register the seven domains shared by testList and testRemove."""
        for command in ("add zetaweb.com alpin 8010",
                        "mk thedwarf.com duder 8020",
                        "add malcontents.org duder 8020",
                        "mk christyanity.com duder 8020",
                        "add tesm.edu underbird 8310",
                        "add zoobaz.info dummy 80",
                        "add latebutlaughing.com dummy 80"):
            self.porter.onecmd(command)

    def _aliasedDomains(self):
        """Return every domain listed in porter.aliases as one flat list."""
        domains = []
        for names in self.porter.aliases.values():
            domains += names
        return domains

    def _assertRemoved(self, removed, remaining):
        """Check that ``removed`` are gone and ``remaining`` domains are left.

        Both porter.domains and the flattened porter.aliases are checked.
        """
        aliased = self._aliasedDomains()
        self.assertEqual(len(self.porter.domains), remaining)
        self.assertEqual(len(aliased), remaining)
        for name in removed:
            self.assertNotIn(name, self.porter.domains)
            self.assertNotIn(name, aliased)

    # -- tests -------------------------------------------------------------

    def testListWhenEmpty(self):
        """``ls`` on an empty Porter prints nothing."""
        self.porter.onecmd("ls")
        self.assertEqual(self.out.getvalue(), '')
        # db gets created when we try to read in data, frag file not until we
        #  write
        self.assertEqual(self._varFiles(), ['rewrite.db'])

    def testBadInput(self):
        """``add`` with too few arguments prints a usage message only."""
        self.porter.onecmd("add test")
        self.assertEqual(self.out.getvalue(),
                         "We need a domain name, a "
                         "server name, and a port "
                         "number.\n")
        # didn't write, so still just one file
        self.assertEqual(self._varFiles(), ['rewrite.db'])

    def testAddOneItem(self):
        """Adding one domain updates domains, aliases, ``ls`` and the files."""
        self.porter.onecmd("add zetaweb.com alpin 8010")
        self.porter.onecmd("ls")
        self.assertEqual(self.porter.domains, {'zetaweb.com': 'alpin:8010'})
        self.assertEqual(self.porter.aliases, {'alpin:8010': ['zetaweb.com']})
        self.assertEqual(self.out.getvalue(), 'zetaweb.com\n')
        # now we should have both files, plus a backup!
        # to be really thorough we should reload the backup and make sure it works
        self.assertEqual(self._varFiles(), ['rewrite.db', 'rewrite.db.old'])

    def testExtraInputIsIgnored(self):
        """Anything after the third ``add`` argument is dropped."""
        self.porter.onecmd(
            "add example.com server port Frank Sinatra sings the blues")
        self.assertEqual(self.porter.domains, {"example.com": "server:port"})

    def testAddMultipleItems(self):
        """Several add/mk commands group domains by their server:port."""
        self.porter.onecmd("add zetaweb.com alpin 8010")
        self.porter.onecmd("mk  thedwarf.com duder 8020")
        self.porter.onecmd("add malcontents.org duder 8020")
        self.porter.onecmd("mk  christyanity.com duder 8020")
        self.porter.onecmd("add tesm.edu underbird 8310")

        # sorted() instead of keys().sort(): dict views have no sort() on
        # Python 3.
        self.assertEqual(sorted(self.porter.domains.keys()), [
            'christyanity.com', 'malcontents.org', 'tesm.edu', 'thedwarf.com',
            'zetaweb.com'
        ])
        self.assertEqual(sorted(self.porter.aliases.keys()),
                         ['alpin:8010', 'duder:8020', 'underbird:8310'])

        # Sort a copy so the Porter's own list is not reordered by the test.
        self.assertEqual(
            sorted(self.porter.aliases['duder:8020']),
            ['christyanity.com', 'malcontents.org', 'thedwarf.com'])

        self.assertEqual(self.porter.aliases['alpin:8010'], ['zetaweb.com'])
        self.assertEqual(self.porter.aliases['underbird:8310'], ['tesm.edu'])

    def testList(self):
        """``ls`` prints all domains in sorted, column-aligned form."""
        self._addSampleDomains()

        expected = """\
christyanity.com     malcontents.org  thedwarf.com  zoobaz.info
latebutlaughing.com  tesm.edu         zetaweb.com \n"""

        self.porter.onecmd("ls")
        self.assertEqual(self.out.getvalue(), expected)

    def testRemove(self):
        """``rm`` removes one or several domains from domains and aliases."""
        self._addSampleDomains()

        self.porter.onecmd("rm zetaweb.com")
        self._assertRemoved(['zetaweb.com'], remaining=6)

        self.porter.onecmd("rm thedwarf.com malcontents.org christyanity.com")
        self._assertRemoved(
            ['thedwarf.com', 'malcontents.org', 'christyanity.com'],
            remaining=3)

        self.porter.onecmd("rm latebutlaughing.com")
        self._assertRemoved(['latebutlaughing.com'], remaining=2)

        self.assertEqual(sorted(self.porter.domains.keys()),
                         ['tesm.edu', 'zoobaz.info'])

    def testDoubleUpBug(self):
        """Moving a domain away and back must not list it twice."""
        self.porter.onecmd("add ugandapartners.org bridei 8010")
        self.porter.onecmd("mv ugandapartners.org bridei 8110")
        self.assertEqual(self.porter.aliases['bridei:8010'], [])

        self.porter.onecmd("mv ugandapartners.org bridei 8010")
        self.assertEqual(self.porter.aliases['bridei:8010'],
                         ['ugandapartners.org'])

    def testDoubleUpBugAgain(self):
        """Repeated moves keep the alias list for bridei:8090 consistent."""
        self.porter.onecmd("add zetaweb.com bridei 8090")
        self.porter.onecmd("add ugandapartners.org bridei 8090")
        self.assertEqual(self.porter.aliases['bridei:8090'],
                         ['ugandapartners.org', 'zetaweb.com'])

        self.porter.onecmd("mv zetaweb.com bridei 8080")
        self.assertEqual(self.porter.aliases['bridei:8090'],
                         ['ugandapartners.org'])

        self.porter.onecmd("mv zetaweb.com bridei 8090")
        self.assertEqual(self.porter.aliases['bridei:8090'],
                         ['ugandapartners.org', 'zetaweb.com'])

        self.porter.onecmd("mv zetaweb.com bridei 8080")
        self.assertEqual(self.porter.aliases['bridei:8090'],
                         ['ugandapartners.org'])