def setUp(self):
    """Prepare the output buffer, scratch directories and Porter instance."""
    # ready,...
    # The Porter below is handed this buffer as its stdout.
    self.out = StringIO()
    self.cleanUp()

    # ...set,...
    for scratch_dir in ('var', 'gremlin'):
        os.mkdir(scratch_dir)
    self.porter = Porter(stdout=self.out)
    self.porter.gremlin = abspath('gremlin')
class TestPorter(unittest.TestCase):
    """Tests for the state reported by a Porter before and after work()."""

    def setUp(self):
        """Create the Porter instance used by each test."""
        self.porter = Porter(1)

    def test_work(self):
        # work() is expected to go through all the porter states and be back
        # at ``pending`` once it has completed.
        sim_state = Mock()
        # NOTE(review): this *calls* the auto-created mock attribute `timout`
        # and discards the result, so it configures nothing. It looks like a
        # typo for setting up `env.timeout` -- confirm the intended behaviour.
        sim_state.env.timout(return_value=20)

        self.assertEqual(self.porter.state, Porter.pending)
        self.porter.work(sim_state)
        self.assertEqual(self.porter.state, Porter.pending)
def prepare(data, titles=False, porter=False):
    """
    Build the feature matrix for the given messages.

    Arguments:
        data - DataFrame with the source messages. The "cluster" and "text"
               columns are read, plus "title" when `titles` is true.
        titles - when true, each message text is prefixed with its title
                 followed by a single space.
        porter - when true, each message also gets a "lemmas" set holding
                 Porter.stem() of the first element of every parsed token.

    Returns:
        DataFrame - the feature matrix, one row per pair produced by
        combinations(messages, 2).
    """
    # Map every distinct cluster label to a small integer.
    cluster_nums = {}
    for number, cluster in enumerate(data["cluster"].unique()):
        cluster_nums[cluster] = number

    messages = []
    for ind, row in data.iterrows():
        prefix = (row["title"] + " ") if titles else ""
        messages.append({
            "id": ind,
            "text": prefix + row["text"],
            "cluster": cluster_nums[row["cluster"]],
        })

    tokens = mystem_parse([message["text"] for message in messages])
    assert len(tokens) == len(messages)
    for message, parsed in zip(messages, tokens):
        message["tokens"] = parsed
        if porter:
            message["lemmas"] = {Porter.stem(norm) for norm, _ in parsed}

    # Number of pairs; only used for the progress line.
    pair_total = len(messages) * (len(messages) - 1) // 2
    rows = []
    for pair_no, (m1, m2) in enumerate(combinations(messages, 2), start=1):
        print("Прогресс в сочетаниях: {}/{}".format(pair_no, pair_total),
              end='\r')
        row = {
            'id1': m1['id'],
            'id2': m2['id'],
            'similar': int(m1["cluster"] == m2["cluster"]),
        }
        # Every grammeme column starts at 0; the comparison result is applied
        # afterwards and overrides any key it also provides.
        row.update(dict.fromkeys(MYSTEM_GRAMMEMS, 0))
        row.update(texts_comparison(m1, m2))
        rows.append(row)
    print("")

    columns = ['id1', 'id2'] + MYSTEM_GRAMMEMS + ["semantic_repeats", "similar"]
    return ps.DataFrame(rows, columns=columns)
def weights_for_block(words, call_position, coefficient):
    """
    Calculate a weight for each word in one block of words.

    Every word is first passed through ``Porter().get``. One of two formulas
    is then applied, selected by ``call_position``:

    * ``call_position`` true (irregular text): the word at 0-based position
      ``P`` in a list of length ``L`` contributes
      ``(L - P) / (L * (L + 1) / 2.0)``, rounded to one decimal place.
      Contributions of a repeated word are added together. Because of the
      rounding the weights are not guaranteed to sum to exactly 1.
    * ``call_position`` false (uniform text): ``W = N(word) / L`` where
      ``N(word)`` is the number of occurrences of the word in the list.

    :param words: A list of words like ['word1', 'word2', 'word3'].
    :param call_position: Flag selecting the formula, see above.
    :param coefficient: Returned unchanged; not used in the calculation.
    :return: A tuple ``(weights, coefficient)`` where ``weights`` is a dict
        mapping each processed word to its weight. For empty (falsy)
        ``words`` the result is ``({'': 1.0}, coefficient)``.
    """
    from collections import Counter

    if not words:
        return {'': 1.0}, coefficient

    porter = Porter()
    # Build a real list: on Python 3 ``map`` returns a one-shot iterator that
    # supports neither ``len()`` nor ``.count()``, which the formulas need.
    words = [porter.get(word) for word in words]
    total = len(words)

    if call_position:
        # Irregular text: earlier positions get a larger share.
        denominator = (float(total) * (total + 1)) / 2.0
        weights = {}
        for position, word in enumerate(words):
            share = round((total - position) / denominator, 1)
            weights[word] = weights.get(word, 0) + share
    else:
        # Uniform text: relative frequency, counted in a single pass.
        weights = {
            word: count / float(total)
            for word, count in Counter(words).items()
        }
    return weights, coefficient
def prepare(data, titles=False, porter=False):
    """
    Build the feature matrix for the given messages.

    Arguments:
        data - DataFrame with the source messages. The "cluster" and "text"
               columns are read, plus "title" when `titles` is true.
        titles - when true, each message text is prefixed with its title
                 followed by a single space.
        porter - when true, each message also gets a "lemmas" set holding
                 Porter.stem() of the first element of every parsed token.

    Returns:
        DataFrame - the feature matrix, one row per pair produced by
        combinations(messages, 2).
    """
    # Map every distinct cluster label to a small integer.
    cluster_nums = {}
    for number, cluster in enumerate(data["cluster"].unique()):
        cluster_nums[cluster] = number

    messages = []
    for ind, row in data.iterrows():
        prefix = (row["title"] + " ") if titles else ""
        messages.append({
            "id": ind,
            "text": prefix + row["text"],
            "cluster": cluster_nums[row["cluster"]],
        })

    tokens = mystem_parse([message["text"] for message in messages])
    assert len(tokens) == len(messages)
    for message, parsed in zip(messages, tokens):
        message["tokens"] = parsed
        if porter:
            message["lemmas"] = {Porter.stem(norm) for norm, _ in parsed}

    # Number of pairs; only used for the progress line.
    pair_total = len(messages) * (len(messages) - 1) // 2
    rows = []
    for pair_no, (m1, m2) in enumerate(combinations(messages, 2), start=1):
        print("Прогресс в сочетаниях: {}/{}".format(pair_no, pair_total),
              end='\r')
        row = {
            'id1': m1['id'],
            'id2': m2['id'],
            'similar': int(m1["cluster"] == m2["cluster"]),
        }
        # Every grammeme column starts at 0; the comparison result is applied
        # afterwards and overrides any key it also provides.
        row.update(dict.fromkeys(MYSTEM_GRAMMEMS, 0))
        row.update(texts_comparison(m1, m2))
        rows.append(row)
    print("")

    columns = ['id1', 'id2'] + MYSTEM_GRAMMEMS + ["semantic_repeats", "similar"]
    return ps.DataFrame(rows, columns=columns)
def setUp(self):
    """Create the Porter instance used by each test."""
    self.porter = Porter(1)
os.path.join(droot, 'example.com')) os.symlink(os.path.join(sroot, 'bar', 'website_8810'), os.path.join(droot, 'example.net')) os.symlink(os.path.join(sroot, 'baz', 'website_8090'), os.path.join(droot, 'example.org')) # set up some domain aliases os.symlink(os.path.join(droot, 'example.com'), os.path.join(droot, 'alias.example.com')) os.symlink(os.path.join(droot, 'example.com'), os.path.join(droot, 'alias.example.net')) os.symlink(os.path.join(droot, 'example.org'), os.path.join(droot, 'alias.example.org')) # instantiate Porter with our fake domain root porter = Porter(droot) ## # ok, now run some tests! ## testosterone( """\ # if already canonical then we get NULL porter.canonicalize('example.com') == 'NULL' porter.canonicalize('example.net') == 'NULL' porter.canonicalize('example.org') == 'NULL' # test some real live aliases #print porter.canonicalize('alias.example.com') porter.canonicalize('alias.example.com') == 'example.com'
class TestCRUD(unittest.TestCase):
    """Exercise Porter's add/mk, ls, rm and mv commands.

    Each test runs against freshly created 'var' and 'gremlin' directories
    in the current working directory, with the Porter's stdout captured in
    ``self.out``.
    """

    def setUp(self):
        # ready,...
        self.out = StringIO()
        self.cleanUp()

        # ...set,...
        os.mkdir('var')
        os.mkdir('gremlin')
        self.porter = Porter(stdout=self.out)
        self.porter.gremlin = abspath('gremlin')

        # ... go!

    def tearDown(self):
        self.cleanUp()

    def cleanUp(self):
        # clean up our filesystem
        for directory in ('var', 'gremlin'):
            if isdir(directory):
                test_dir = abspath(directory)
                for datafile in os.listdir(test_dir):
                    os.remove(join(test_dir, datafile))
                os.rmdir(test_dir)

    def _aliased_domains(self):
        """Return a flat list of the domains stored under every alias key."""
        domains = []
        for website in self.porter.aliases:
            domains += self.porter.aliases[website]
        return domains

    def testListWhenEmpty(self):
        self.porter.onecmd("ls")
        self.assertEqual(self.out.getvalue(), '')
        # os.listdir() returns entries in arbitrary order, so sort first.
        self.assertEqual(sorted(os.listdir(self.porter.var)), ['rewrite.db'])
        # db gets created when we try to read in data, frag file not until we
        # write

    def testBadInput(self):
        self.porter.onecmd("add test")
        self.assertEqual(self.out.getvalue(), "We need a domain name, a " +
                                              "server name, and a port " +
                                              "number.\n")
        self.assertEqual(sorted(os.listdir(self.porter.var)), ['rewrite.db'])
        # didn't write, so still just one file

    def testAddOneItem(self):
        self.porter.onecmd("add zetaweb.com alpin 8010")
        self.porter.onecmd("ls")
        self.assertEqual(self.porter.domains, {'zetaweb.com': 'alpin:8010'})
        self.assertEqual(self.porter.aliases, {'alpin:8010': ['zetaweb.com']})
        self.assertEqual(self.out.getvalue(), 'zetaweb.com\n')
        self.assertEqual(sorted(os.listdir(self.porter.var)),
                         ['rewrite.db', 'rewrite.db.old'])
        # now we should have both files, plus a backup!
        # to be really thorough we should reload the backup and make sure it
        # works

    def testExtraInputIsIgnored(self):
        self.porter.onecmd(
            "add example.com server port Frank Sinatra sings the blues")
        self.assertEqual(self.porter.domains, {"example.com": "server:port"})

    def testAddMultipleItems(self):
        self.porter.onecmd("add zetaweb.com alpin 8010")
        self.porter.onecmd("mk thedwarf.com duder 8020")
        self.porter.onecmd("add malcontents.org duder 8020")
        self.porter.onecmd("mk christyanity.com duder 8020")
        self.porter.onecmd("add tesm.edu underbird 8310")

        # sorted() works on Python 2 lists and Python 3 key views alike,
        # and never reorders the Porter's own data in place.
        domains = sorted(self.porter.domains.keys())
        self.assertEqual(domains, [
            'christyanity.com', 'malcontents.org', 'tesm.edu', 'thedwarf.com',
            'zetaweb.com'
        ])

        aliases = sorted(self.porter.aliases.keys())
        self.assertEqual(aliases,
                         ['alpin:8010', 'duder:8020', 'underbird:8310'])

        multi_domains = sorted(self.porter.aliases['duder:8020'])
        self.assertEqual(
            multi_domains,
            ['christyanity.com', 'malcontents.org', 'thedwarf.com'])

        single_domain = self.porter.aliases['alpin:8010']
        self.assertEqual(single_domain, ['zetaweb.com'])

        single_domain = self.porter.aliases['underbird:8310']
        self.assertEqual(single_domain, ['tesm.edu'])

    def testList(self):
        self.porter.onecmd("add zetaweb.com alpin 8010")
        self.porter.onecmd("mk thedwarf.com duder 8020")
        self.porter.onecmd("add malcontents.org duder 8020")
        self.porter.onecmd("mk christyanity.com duder 8020")
        self.porter.onecmd("add tesm.edu underbird 8310")
        self.porter.onecmd("add zoobaz.info dummy 80")
        self.porter.onecmd("add latebutlaughing.com dummy 80")

        # NOTE(review): the line breaks / column spacing inside this expected
        # listing could not be confirmed during review; verify the literal
        # against the Porter's actual `ls` output before relying on it.
        expected = """\ christyanity.com malcontents.org thedwarf.com zoobaz.info latebutlaughing.com tesm.edu zetaweb.com \n"""
        self.porter.onecmd("ls")
        self.assertEqual(self.out.getvalue(), expected)

    def testRemove(self):
        self.porter.onecmd("add zetaweb.com alpin 8010")
        self.porter.onecmd("mk thedwarf.com duder 8020")
        self.porter.onecmd("add malcontents.org duder 8020")
        self.porter.onecmd("mk christyanity.com duder 8020")
        self.porter.onecmd("add tesm.edu underbird 8310")
        self.porter.onecmd("add zoobaz.info dummy 80")
        self.porter.onecmd("add latebutlaughing.com dummy 80")

        # remove a single domain
        self.porter.onecmd("rm zetaweb.com")
        self.assertEqual(len(self.porter.domains), 6)
        self.assertNotIn('zetaweb.com', self.porter.domains)
        domains = self._aliased_domains()
        self.assertEqual(len(domains), 6)
        self.assertNotIn('zetaweb.com', domains)

        # remove several domains with one command
        self.porter.onecmd("rm thedwarf.com malcontents.org christyanity.com")
        self.assertEqual(len(self.porter.domains), 3)
        self.assertNotIn('thedwarf.com', self.porter.domains)
        self.assertNotIn('malcontents.org', self.porter.domains)
        self.assertNotIn('christyanity.com', self.porter.domains)
        domains = self._aliased_domains()
        self.assertEqual(len(domains), 3)
        self.assertNotIn('thedwarf.com', domains)
        self.assertNotIn('malcontents.org', domains)
        self.assertNotIn('christyanity.com', domains)

        self.porter.onecmd("rm latebutlaughing.com")
        self.assertEqual(len(self.porter.domains), 2)
        self.assertNotIn('latebutlaughing.com', self.porter.domains)
        domains = self._aliased_domains()
        self.assertEqual(len(domains), 2)
        self.assertNotIn('latebutlaughing.com', domains)

        domains = sorted(self.porter.domains.keys())
        self.assertEqual(domains, ['tesm.edu', 'zoobaz.info'])

    def testDoubleUpBug(self):
        self.porter.onecmd("add ugandapartners.org bridei 8010")
        self.porter.onecmd("mv ugandapartners.org bridei 8110")
        self.assertEqual(self.porter.aliases['bridei:8010'], [])
        self.porter.onecmd("mv ugandapartners.org bridei 8010")
        self.assertEqual(self.porter.aliases['bridei:8010'],
                         ['ugandapartners.org'])

    def testDoubleUpBugAgain(self):
        self.porter.onecmd("add zetaweb.com bridei 8090")
        self.porter.onecmd("add ugandapartners.org bridei 8090")
        self.assertEqual(self.porter.aliases['bridei:8090'],
                         ['ugandapartners.org', 'zetaweb.com'])
        self.porter.onecmd("mv zetaweb.com bridei 8080")
        self.assertEqual(self.porter.aliases['bridei:8090'],
                         ['ugandapartners.org'])
        self.porter.onecmd("mv zetaweb.com bridei 8090")
        self.assertEqual(self.porter.aliases['bridei:8090'],
                         ['ugandapartners.org', 'zetaweb.com'])
        self.porter.onecmd("mv zetaweb.com bridei 8080")
        self.assertEqual(self.porter.aliases['bridei:8090'],
                         ['ugandapartners.org'])