def __train_models(self):
    """Load the domain-specific corpus and train both similarity models.

    Creates fresh ``NGramPerplexity`` and ``TFIDF`` instances on ``self``,
    then reads every regular file directly inside ``self.input_dir`` line
    by line. Each line is converted with ``WordExtractor.get_words``; lines
    that yield no words are skipped, every other word list is appended to
    ``self.sentences`` and passed to ``train_from_text`` of both models.
    """
    self.ngp = NGramPerplexity()
    self.tfidf = TFIDF()
    print("Training models from specific corpora")
    for entry in os.listdir(self.input_dir):
        corpus_path = os.path.join(self.input_dir, entry)
        # os.listdir also returns sub-directories; open() would raise on
        # those, so only regular files are treated as corpus files.
        if not os.path.isfile(corpus_path):
            continue
        print("Training models from specific corpora: " + entry)
        with open(corpus_path, encoding="utf-8") as corpus_file:
            for line in corpus_file:
                words = WordExtractor.get_words(line)
                if len(words) == 0:
                    # Nothing to learn from a line without words.
                    continue
                self.sentences.append(words)
                self.ngp.train_from_text(words)
                self.tfidf.train_from_text(words)
 def __train_models(self):
     """Train the TFIDF and NGramPerplexity models from the corpus directory.

     New ``NGramPerplexity`` and ``TFIDF`` objects are stored on ``self``.
     Every regular file found directly in ``self.input_dir`` is then read
     line by line: ``WordExtractor.get_words`` extracts the words of a
     line, empty results are ignored, and each remaining word list is
     appended to ``self.sentences`` and used to train both models.
     """
     self.ngp = NGramPerplexity()
     self.tfidf = TFIDF()
     print("Training models from specific corpora")
     for entry in os.listdir(self.input_dir):
         corpus_path = os.path.join(self.input_dir, entry)
         # Directory listings include sub-directories, which cannot be
         # opened as text files; skip anything that is not a regular file.
         if not os.path.isfile(corpus_path):
             continue
         print("Training models from specific corpora: " + entry)
         with open(corpus_path, encoding="utf-8") as corpus_file:
             for line in corpus_file:
                 words = WordExtractor.get_words(line)
                 if len(words) == 0:
                     continue
                 self.sentences.append(words)
                 self.ngp.train_from_text(words)
                 self.tfidf.train_from_text(words)
class DomainSimilarity:
    """Decide whether a sentence is similar to a domain-specific corpus.

    On construction every regular file in ``input_dir`` is read line by
    line; each non-empty word list is stored in ``self.sentences`` and used
    to train an ``NGramPerplexity`` and a ``TFIDF`` model.
    ``accepts_sentence`` then checks candidates with three measures
    (n-gram perplexity, TF-IDF cosine similarity, normalized Levenshtein
    distance) and keeps running statistics that ``print_progress`` reports.
    """

    def __init__(self, input_dir, threshold_tfidf, threshold_perplexity_ngram,
                 threshold_edit_distance):
        """Store the thresholds, train the models and reset the statistics.

        Args:
            input_dir: Directory whose files hold the domain-specific corpus.
            threshold_tfidf: Stored as ``1 - threshold_tfidf``; a cosine
                similarity at or above the stored value accepts a sentence.
            threshold_perplexity_ngram: A perplexity at or below this value
                accepts a sentence.
            threshold_edit_distance: A normalized edit distance at or below
                this value accepts a sentence.

        Raises:
            Exception: If ``input_dir`` is not an existing directory.
        """
        self.threshold_tfidf = 1 - threshold_tfidf
        self.threshold_perplexity_ngram = threshold_perplexity_ngram
        self.threshold_edit_distance = threshold_edit_distance
        self.input_dir = input_dir
        self.sentences = []
        if not os.path.isdir(input_dir):
            raise Exception("The provided dir " + str(input_dir) +
                            " does not exist")
        self.__train_models()

        # Running statistics, updated by accepts_sentence.
        self.queries_asked = 0
        self.sentences_asked = 0
        self.accepted_by_tfidf = 0
        self.accepted_by_ngp = 0
        self.accepted_by_edit_distance = 0
        self.sum_tfidf = 0
        self.sum_ngp = 0
        self.sum_edit = 0

    def __train_models(self):
        """Load the domain-specific corpus and train both models.

        Lines that yield no words are skipped; every other word list is
        appended to ``self.sentences`` and fed to both models.
        """
        self.ngp = NGramPerplexity()
        self.tfidf = TFIDF()
        print("Training models from specific corpora")
        for entry in os.listdir(self.input_dir):
            corpus_path = os.path.join(self.input_dir, entry)
            # os.listdir also returns sub-directories; open() would raise
            # on those, so only regular files are treated as corpus files.
            if not os.path.isfile(corpus_path):
                continue
            print("Training models from specific corpora: " + entry)
            with open(corpus_path, encoding="utf-8") as corpus_file:
                for line in corpus_file:
                    words = WordExtractor.get_words(line)
                    if len(words) == 0:
                        continue
                    self.sentences.append(words)
                    self.ngp.train_from_text(words)
                    self.tfidf.train_from_text(words)

    @staticmethod
    def _ratio(numerator, denominator, default=None):
        """Return ``numerator / denominator``, or ``default`` if the denominator is zero."""
        if not denominator:
            return default
        return numerator / denominator

    def print_progress(self):
        """Print the running averages and acceptance rates.

        Safe to call before any sentence has been checked or when the
        corpus is empty: averages without data are printed as ``n/a`` and
        acceptance rates are computed from a ratio of ``0.0``.
        """
        avg_tfidf = self._ratio(self.sum_tfidf, self.queries_asked)
        avg_ngp = self._ratio(self.sum_ngp, self.sentences_asked)
        avg_edit = self._ratio(self.sum_edit, self.queries_asked)

        print("Average tfidf: " +
              ("n/a" if avg_tfidf is None else str(1 - avg_tfidf)))
        print("Average ngram-perplexity: " +
              ("n/a" if avg_ngp is None else str(avg_ngp)))
        print("Average edit-distance: " +
              ("n/a" if avg_edit is None else str(avg_edit)))
        print("Accept percent by tfidf extractor: " +
              Formatter.percent(
                  self._ratio(self.accepted_by_tfidf,
                              self.sentences_asked, 0.0)))
        print("Accept percent by ngram-perplexity extractor: " +
              Formatter.percent(
                  self._ratio(self.accepted_by_ngp,
                              self.sentences_asked, 0.0)))
        print("Accept percent by edit-distance extractor: " +
              Formatter.percent(
                  self._ratio(self.accepted_by_edit_distance,
                              self.sentences_asked, 0.0)))

    def accepts_sentence(self, words_general):
        """Return True if ``words_general`` passes at least one measure.

        A sentence is accepted when any of these holds:
          * its n-gram perplexity is <= ``threshold_perplexity_ngram``;
          * its TF-IDF cosine similarity to one stored sentence is
            >= the stored ``threshold_tfidf``;
          * its normalized Levenshtein distance to one stored sentence is
            <= ``threshold_edit_distance``.

        In ``MODE.TURBO`` the method returns as soon as one measure passes,
        without updating the ``accepted_by_*`` counters.

        Args:
            words_general: The candidate sentence; it is compared directly
                against the stored word lists, so presumably the output of
                ``WordExtractor.get_words`` (verify against callers).
        """
        self.sentences_asked += 1
        accept_ngp = False
        accept_tfidf = False
        accept_edit_distance = False

        perplexity = self.ngp.calc_perplexity(words_general)
        self.sum_ngp += perplexity
        if perplexity <= self.threshold_perplexity_ngram:
            if RUN_CONFIGURATION.mode == MODE.TURBO:
                return True
            self.accepted_by_ngp += 1
            accept_ngp = True

        for words_specific in self.sentences:
            # The counter is bumped before the early-exit check, so the
            # iteration that exits is still counted as a query.
            self.queries_asked += 1
            if accept_tfidf and accept_edit_distance:
                return True
            if not accept_tfidf:
                sim = self.tfidf.calc_cosine_similarity(
                    words_general, words_specific)
                self.sum_tfidf += sim
                if sim >= self.threshold_tfidf:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_tfidf += 1
                    accept_tfidf = True
            if not accept_edit_distance:
                edit_distance = Levenshtein.normalized_distance(
                    words_general, words_specific)
                self.sum_edit += edit_distance
                if edit_distance <= self.threshold_edit_distance:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_edit_distance += 1
                    accept_edit_distance = True

        return accept_tfidf or accept_ngp or accept_edit_distance
from ngramperplexity import NGramPerplexity
from wordextractor import WordExtractor

# The n-gram size is set on the class before the model is instantiated.
NGramPerplexity.ngram_size = 3
ngp = NGramPerplexity()

# Beach-themed sentences the perplexity model is trained on, in order.
_BEACH_TRAINING_SENTENCES = (
    "There are so many people at the beach",
    "The beach is so crowded with all these people, I wish they would just go to another beach",
    "It is summer and a great day to go to the beach.",
    "Let's go to the beach and enjoy the great weather we've got today.",
    "I think the first thing I will do at the beach is to buy an ice cream.",
    "There's many people at the beach today, I think they are enjoying their holidays.",
    "I think something is going on at the beach right now, there are literally people everywhere.",
)
for _training_sentence in _BEACH_TRAINING_SENTENCES:
    ngp.train_from_text(WordExtractor.get_words(_training_sentence))

#sentences with great similarity
class DomainSimilarity:
    """Decide whether a sentence is similar to a domain-specific corpus.

    On construction every regular file in ``input_dir`` is read line by
    line; each non-empty word list is stored in ``self.sentences`` and used
    to train an ``NGramPerplexity`` and a ``TFIDF`` model.
    ``accepts_sentence`` then checks candidates with three measures
    (n-gram perplexity, TF-IDF cosine similarity, normalized Levenshtein
    distance) and keeps running statistics that ``print_progress`` reports.
    """

    def __init__(self, input_dir, threshold_tfidf, threshold_perplexity_ngram, threshold_edit_distance):
        """Store the thresholds, train the models and reset the statistics.

        Args:
            input_dir: Directory whose files hold the domain-specific corpus.
            threshold_tfidf: Stored as ``1 - threshold_tfidf``; a cosine
                similarity at or above the stored value accepts a sentence.
            threshold_perplexity_ngram: A perplexity at or below this value
                accepts a sentence.
            threshold_edit_distance: A normalized edit distance at or below
                this value accepts a sentence.

        Raises:
            Exception: If ``input_dir`` is not an existing directory.
        """
        self.threshold_tfidf = 1 - threshold_tfidf
        self.threshold_perplexity_ngram = threshold_perplexity_ngram
        self.threshold_edit_distance = threshold_edit_distance
        self.input_dir = input_dir
        self.sentences = []
        if not os.path.isdir(input_dir):
            raise Exception("The provided dir " + str(input_dir) + " does not exist")
        self.__train_models()

        # Running statistics, updated by accepts_sentence.
        self.queries_asked = 0
        self.sentences_asked = 0
        self.accepted_by_tfidf = 0
        self.accepted_by_ngp = 0
        self.accepted_by_edit_distance = 0
        self.sum_tfidf = 0
        self.sum_ngp = 0
        self.sum_edit = 0

    def __train_models(self):
        """Load the domain-specific corpus and train both models.

        Lines that yield no words are skipped; every other word list is
        appended to ``self.sentences`` and fed to both models.
        """
        self.ngp = NGramPerplexity()
        self.tfidf = TFIDF()
        print("Training models from specific corpora")
        for entry in os.listdir(self.input_dir):
            corpus_path = os.path.join(self.input_dir, entry)
            # os.listdir also returns sub-directories; open() would raise
            # on those, so only regular files are treated as corpus files.
            if not os.path.isfile(corpus_path):
                continue
            print("Training models from specific corpora: " + entry)
            with open(corpus_path, encoding="utf-8") as corpus_file:
                for line in corpus_file:
                    words = WordExtractor.get_words(line)
                    if len(words) == 0:
                        continue
                    self.sentences.append(words)
                    self.ngp.train_from_text(words)
                    self.tfidf.train_from_text(words)

    @staticmethod
    def _ratio(numerator, denominator, default=None):
        """Return ``numerator / denominator``, or ``default`` if the denominator is zero."""
        if not denominator:
            return default
        return numerator / denominator

    def print_progress(self):
        """Print the running averages and acceptance rates.

        Safe to call before any sentence has been checked or when the
        corpus is empty: averages without data are printed as ``n/a`` and
        acceptance rates are computed from a ratio of ``0.0``.
        """
        avg_tfidf = self._ratio(self.sum_tfidf, self.queries_asked)
        avg_ngp = self._ratio(self.sum_ngp, self.sentences_asked)
        avg_edit = self._ratio(self.sum_edit, self.queries_asked)
        tfidf_rate = self._ratio(self.accepted_by_tfidf, self.sentences_asked, 0.0)
        ngp_rate = self._ratio(self.accepted_by_ngp, self.sentences_asked, 0.0)
        edit_rate = self._ratio(self.accepted_by_edit_distance, self.sentences_asked, 0.0)

        print("Average tfidf: " + ("n/a" if avg_tfidf is None else str(1 - avg_tfidf)))
        print("Average ngram-perplexity: " + ("n/a" if avg_ngp is None else str(avg_ngp)))
        print("Average edit-distance: " + ("n/a" if avg_edit is None else str(avg_edit)))
        print("Accept percent by tfidf extractor: " + Formatter.percent(tfidf_rate))
        print("Accept percent by ngram-perplexity extractor: " + Formatter.percent(ngp_rate))
        print("Accept percent by edit-distance extractor: " + Formatter.percent(edit_rate))

    def accepts_sentence(self, words_general):
        """Return True if ``words_general`` passes at least one measure.

        A sentence is accepted when any of these holds:
          * its n-gram perplexity is <= ``threshold_perplexity_ngram``;
          * its TF-IDF cosine similarity to one stored sentence is
            >= the stored ``threshold_tfidf``;
          * its normalized Levenshtein distance to one stored sentence is
            <= ``threshold_edit_distance``.

        In ``MODE.TURBO`` the method returns as soon as one measure passes,
        without updating the ``accepted_by_*`` counters.

        Args:
            words_general: The candidate sentence; it is compared directly
                against the stored word lists, so presumably the output of
                ``WordExtractor.get_words`` (verify against callers).
        """
        self.sentences_asked += 1
        accept_ngp = False
        accept_tfidf = False
        accept_edit_distance = False

        perplexity = self.ngp.calc_perplexity(words_general)
        self.sum_ngp += perplexity
        if perplexity <= self.threshold_perplexity_ngram:
            if RUN_CONFIGURATION.mode == MODE.TURBO:
                return True
            self.accepted_by_ngp += 1
            accept_ngp = True

        for words_specific in self.sentences:
            # The counter is bumped before the early-exit check, so the
            # iteration that exits is still counted as a query.
            self.queries_asked += 1
            if accept_tfidf and accept_edit_distance:
                return True
            if not accept_tfidf:
                sim = self.tfidf.calc_cosine_similarity(words_general, words_specific)
                self.sum_tfidf += sim
                if sim >= self.threshold_tfidf:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_tfidf += 1
                    accept_tfidf = True
            if not accept_edit_distance:
                edit_distance = Levenshtein.normalized_distance(words_general, words_specific)
                self.sum_edit += edit_distance
                if edit_distance <= self.threshold_edit_distance:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_edit_distance += 1
                    accept_edit_distance = True

        return accept_tfidf or accept_ngp or accept_edit_distance
from ngramperplexity import NGramPerplexity
from wordextractor import WordExtractor

# Configure the n-gram size on the class, then build the model under test.
NGramPerplexity.ngram_size = 3
ngp = NGramPerplexity()

# Training corpus: each sentence is split into words and fed to the model
# one at a time, in the order listed.
_TRAINING_CORPUS = [
    "There are so many people at the beach",
    "The beach is so crowded with all these people, I wish they would just go to another beach",
    "It is summer and a great day to go to the beach.",
    "Let's go to the beach and enjoy the great weather we've got today.",
    "I think the first thing I will do at the beach is to buy an ice cream.",
    "There's many people at the beach today, I think they are enjoying their holidays.",
    "I think something is going on at the beach right now, there are literally people everywhere.",
]
# map() is lazy, so word extraction and training stay interleaved per sentence.
for _corpus_words in map(WordExtractor.get_words, _TRAINING_CORPUS):
    ngp.train_from_text(_corpus_words)

# Sentences expected to be very similar to the training sentences.
very_similar_sentences = [
    WordExtractor.get_words(text)
    for text in (
        "The weather is great at the beach!",
        "something is probably going on at the beach right now.",
        "I think they would like an ice cream to enjoy at the beach.",
    )
]

# Sentences expected to be only slightly similar to the training sentences.
slightly_similar_sentences = [
    WordExtractor.get_words(text)
    for text in (
        "Great day today, I want to run a long distance",
        "It is expensive to buy too many things",
        "These people are new to me, I have never seen them before.",
    )
]

# Sentences expected to be different from the training sentences; the first
# entry is an empty word list.
different_sentences = [[]] + [
    WordExtractor.get_words(text)
    for text in (
        "I play guitar in a band, but I better like to play piano",
        "The Jaguar is a very dangerous animal, that can run very fast",
        "Dinosaurs are extinct and we will probably never see them alive.",
    )
]


#A very similar sentence must be more similar than a slightly similar sentence
for very_similar_sentence in very_similar_sentences:
    for slightly_similar_sentence in slightly_similar_sentences: