Exemplo n.º 1
0
 def test_corpus_stop_list_entropy(self):
     """Check the stoplist built from ``self.test_corpus`` with basis 'entropy'.

     ``build_stoplist`` is called with ``size=10`` and ``inc_values=False``;
     the result must equal a fixed list of ten words.
     """
     builder = LatinCorpusStoplist()
     options = {"size": 10, "basis": "entropy", "inc_values": False}
     result = builder.build_stoplist(self.test_corpus, **options)
     expected = [
         "ac", "ad", "atque", "cum", "et",
         "in", "mihi", "qui", "rerum", "vel",
     ]
     self.assertEqual(result, expected)
Exemplo n.º 2
0
def main():
    """Build a stop-word list from a corpus and write it to ``../res/stopwords.txt``.

    Command-line arguments:
        -d / --dataset: path to the input dataset (required).
        -l / --length: requested length of the stop-word list (required, integer).

    Steps: read the corpus, lower-case each document, replace runs of
    non-word characters and runs of digits with a single space, collapse
    repeated spaces, lemmatize, build the stoplist, and write one entry per
    line to the output file (creating ``../res`` if it does not exist).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("-d",
                    "--dataset",
                    required=True,
                    help="path to input dataset")
    # type=int makes argparse reject a non-numeric length immediately,
    # instead of failing only after the corpus has been read and lemmatized.
    ap.add_argument("-l",
                    "--length",
                    required=True,
                    type=int,
                    help="length of sw list")
    args = vars(ap.parse_args())

    print("[INFO] Importing data...")
    C = Corpus(args["dataset"])
    data, _ = C.read()

    # None of these patterns use ^ or $, so no re.MULTILINE flag is needed.
    non_word = re.compile(r"\W+")
    digits = re.compile(r"\d+")
    spaces = re.compile(r"  +")
    data = [file.lower() for file in data]
    data = [non_word.sub(" ", file) for file in data]
    data = [digits.sub(" ", file) for file in data]
    data = [spaces.sub(" ", file) for file in data]

    print("[INFO] Lemmatization...")
    le = LemmatizerLatin(token=False)
    data = [le.preprocess(file) for file in data]
    S = CorpusStoplist()
    sw_list = S.build_stoplist(data, size=args["length"])

    print("[INFO] Writing list to file...")
    # exist_ok=True replaces the separate isdir() check and avoids the
    # check-then-create race between the test and the mkdir call.
    os.makedirs("../res", exist_ok=True)
    # Explicit UTF-8 so non-ASCII entries do not depend on the locale's
    # default encoding (which may be unable to encode them).
    with open("../res/stopwords.txt", "w", encoding="utf-8") as f:
        f.writelines("%s\n" % word for word in sw_list)
Exemplo n.º 3
0
 def test_corpus_stop_list_entropy(self):
     """Verify the 'entropy'-basis stoplist produced from ``self.test_corpus``.

     Requests ten entries without values (``inc_values=False``) and asserts
     equality with a hard-coded list of ten words.
     """
     expected = [
         "ac", "ad", "atque", "cum", "et",
         "in", "mihi", "qui", "rerum", "vel",
     ]
     result = LatinCorpusStoplist().build_stoplist(
         self.test_corpus,
         size=10,
         basis="entropy",
         inc_values=False,
     )
     self.assertEqual(result, expected)
Exemplo n.º 4
0
    def test_corpus_stop_list_freq_sort_words(self):
        """Check the 'frequency'-basis stoplist when ``sort_words=False``.

        ``build_stoplist`` is called on ``self.test_corpus`` with ``size=10``
        and ``inc_values=False``; the result must match the expected words
        in exactly the order listed below.
        """
        builder = LatinCorpusStoplist()
        options = {
            "size": 10,
            "basis": "frequency",
            "inc_values": False,
            "sort_words": False,
        }
        result = builder.build_stoplist(self.test_corpus, **options)
        expected = [
            "in", "et", "vel", "ac", "cum",
            "qui", "atque", "mihi", "ad", "neque",
        ]
        self.assertEqual(result, expected)
Exemplo n.º 5
0
    def test_corpus_stop_list_freq_sort_words(self):
        """Verify the unsorted ('sort_words=False') stoplist for basis 'frequency'.

        Ten entries without values are requested from ``self.test_corpus``;
        the assertion is order-sensitive because two lists are compared.
        """
        expected = [
            "in", "et", "vel", "ac", "cum",
            "qui", "atque", "mihi", "ad", "neque",
        ]
        result = LatinCorpusStoplist().build_stoplist(
            self.test_corpus,
            size=10,
            basis="frequency",
            inc_values=False,
            sort_words=False,
        )
        self.assertEqual(result, expected)
Exemplo n.º 6
0
    def test_corpus_latin(self):
        """Check the Latin stoplist built from ``self.latin_test_corpus``.

        Uses basis 'zou' with ``size=10`` and ``inc_values=False`` and
        compares the result with a fixed list of ten words.
        """
        builder = LatinCorpusStoplist()
        options = {"size": 10, "basis": "zou", "inc_values": False}
        result = builder.build_stoplist(self.latin_test_corpus, **options)
        expected = [
            "ac", "atque", "cum", "et", "in",
            "mihi", "neque", "qui", "rerum", "vel",
        ]
        self.assertEqual(result, expected)
Exemplo n.º 7
0
    def test_corpus_latin(self):
        """Verify the 'zou'-basis stoplist produced from ``self.latin_test_corpus``.

        Ten entries without values (``inc_values=False``) are requested and
        the returned list must equal the hard-coded expectation.
        """
        expected = [
            "ac", "atque", "cum", "et", "in",
            "mihi", "neque", "qui", "rerum", "vel",
        ]
        result = LatinCorpusStoplist().build_stoplist(
            self.latin_test_corpus,
            size=10,
            basis="zou",
            inc_values=False,
        )
        self.assertEqual(result, expected)
Exemplo n.º 8
0
 def test_corpus_stop_list_freq_inc_values(self):
     """Check the 'frequency'-basis stoplist when values are included.

     ``build_stoplist`` is called on ``self.test_corpus`` with ``size=10``
     and ``inc_values=True``; the result must equal a fixed list of
     ``(word, value)`` tuples.
     """
     builder = LatinCorpusStoplist()
     options = {"size": 10, "basis": "frequency", "inc_values": True}
     result = builder.build_stoplist(self.test_corpus, **options)
     expected = [
         ("ac", 8),
         ("ad", 5),
         ("atque", 6),
         ("cum", 8),
         ("et", 15),
         ("in", 18),
         ("mihi", 6),
         ("neque", 5),
         ("qui", 7),
         ("vel", 9),
     ]
     self.assertEqual(result, expected)
Exemplo n.º 9
0
 def test_corpus_stop_list_freq_inc_values(self):
     """Verify the stoplist with values for basis 'frequency'.

     Ten entries are requested from ``self.test_corpus`` with
     ``inc_values=True``; each expected entry is a ``(word, value)`` tuple.
     """
     expected = [
         ("ac", 8),
         ("ad", 5),
         ("atque", 6),
         ("cum", 8),
         ("et", 15),
         ("in", 18),
         ("mihi", 6),
         ("neque", 5),
         ("qui", 7),
         ("vel", 9),
     ]
     result = LatinCorpusStoplist().build_stoplist(
         self.test_corpus,
         size=10,
         basis="frequency",
         inc_values=True,
     )
     self.assertEqual(result, expected)
Exemplo n.º 10
0
 def test_corpus_stop_list_variance(self):
     """Check the stoplist built from ``self.test_corpus`` with basis 'variance'.

     Requests ten entries and asserts equality with a fixed list of ten
     words.
     """
     builder = LatinCorpusStoplist()
     options = {"size": 10, "basis": "variance"}
     result = builder.build_stoplist(self.test_corpus, **options)
     expected = [
         "ac", "atque", "cum", "et", "in",
         "mihi", "neque", "qui", "rerum", "vel",
     ]
     self.assertEqual(result, expected)
Exemplo n.º 11
0
 def test_corpus_stop_list_freq_include(self):
     """Check the 'frequency'-basis stoplist when ``include=['est']`` is passed.

     ``build_stoplist`` is called on ``self.test_corpus`` with ``size=10``;
     the expected list has eleven entries and contains 'est'.
     """
     builder = LatinCorpusStoplist()
     options = {"size": 10, "basis": "frequency", "include": ["est"]}
     result = builder.build_stoplist(self.test_corpus, **options)
     expected = [
         "ac", "ad", "atque", "cum", "est", "et",
         "in", "mihi", "neque", "qui", "vel",
     ]
     self.assertEqual(result, expected)
Exemplo n.º 12
0
 def test_corpus_stop_list_variance(self):
     """Verify the 'variance'-basis stoplist produced from ``self.test_corpus``.

     Ten entries are requested and the returned list must equal the
     hard-coded expectation.
     """
     expected = [
         "ac", "atque", "cum", "et", "in",
         "mihi", "neque", "qui", "rerum", "vel",
     ]
     result = LatinCorpusStoplist().build_stoplist(
         self.test_corpus,
         size=10,
         basis="variance",
     )
     self.assertEqual(result, expected)
Exemplo n.º 13
0
 def test_corpus_stop_list_freq_include(self):
     """Verify the 'frequency'-basis stoplist with an explicit ``include`` list.

     ``size=10`` and ``include=['est']`` are passed for ``self.test_corpus``;
     the expected result holds eleven words, 'est' among them.
     """
     expected = [
         "ac", "ad", "atque", "cum", "est", "et",
         "in", "mihi", "neque", "qui", "vel",
     ]
     result = LatinCorpusStoplist().build_stoplist(
         self.test_corpus,
         size=10,
         basis="frequency",
         include=["est"],
     )
     self.assertEqual(result, expected)