Example #1
def make_P_S(model,
             n_p,
             n_s,
             hard_stop=False,
             max_attempts=1e5,
             max_sample_length=np.inf):
    model = LanguageModel(model)
    P = set()
    S = set()
    attempts = 0
    while (len(P) < n_p or len(S) < n_s) and attempts < max_attempts:
        attempts += 1
        w = model.sample(cutoff=max_sample_length)
        if len(P) < n_p:  # to keep from making P bigger than expected when n_s > n_p
            for i in range(len(w) + 1):
                P.add(tuple(w[:i]))
                if hard_stop and len(P) >= n_p:
                    break
        if len(S) < n_s:  # same idea, when n_p>n_s
            for i in range(len(w), -1, -1):
                S.add(tuple(w[i:]))
                if hard_stop and len(S) >= n_s:
                    break
    P = sorted([list(w) for w in P], key=len)
    S = sorted([list(w) for w in S], key=len)
    if len(P) < n_p or len(S) < n_s:
        print("attempted", max_attempts, "samples, each of max length",
              max_sample_length,
              ", could not get enough P/S but cutting short")
    return P, S
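For intuition, each sampled sequence contributes all of its prefixes to P and all of its suffixes to S. A minimal illustration with a hypothetical token sequence:

w = [1, 2, 3]
prefixes = [tuple(w[:i]) for i in range(len(w) + 1)]       # (), (1,), (1, 2), (1, 2, 3)
suffixes = [tuple(w[i:]) for i in range(len(w), -1, -1)]   # (), (3,), (2, 3), (1, 2, 3)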
Example #2
    def __init__(self):

        CLEAN_TRAIN_AB_FILE = 'clean_corpus_train1ab.txt'
        INSULT_TRAIN_AB_FILE = 'insult_corpus_train1ab.txt'

        INSULT_TEST_FILE = 'insult_corpus_test.txt'
        CLEAN_TEST_FILE = 'clean_corpus_test.txt'

        self.punctuation = set([',', ';', '\'', '"', '.', '!', '?'])
        self.dictionary = enchant.Dict("en_US")

        #self.cleanTrainSents = LanguageModel(CLEAN_TRAIN_AB_FILE).getSents()
        #self.insultTrainSents = LanguageModel(INSULT_TRAIN_FILE).getSents()

        #self.cleanTestSents = LanguageModel(CLEAN_TEST_AB_FILE).getSents()
        #self.insultTestSents = LanguageModel(INSULT_TEST_FILE).getSents()

        self.cleanSplitSpaces = LanguageModel(
            CLEAN_TRAIN_AB_FILE).splitBySpaces()
        self.insultSplitSpaces = LanguageModel(
            INSULT_TRAIN_AB_FILE).splitBySpaces()

        self.cleanTestSplitSpaces = LanguageModel(
            CLEAN_TEST_FILE).splitBySpaces()
        self.insultTestSplitSpaces = LanguageModel(
            INSULT_TEST_FILE).splitBySpaces()
Example #3
 def load_language_model(self, fp):
     ''' Takes in a file pointer as input and should initialize the 
     SpellChecker object’s language_model data member to a default 
     LanguageModel and then load the stored language model (e.g. lm.pkl) 
     from fp into that data member. '''
     self.language_model = LanguageModel()
     self.language_model.load(fp)
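A minimal usage sketch for the method above, assuming the stored model is a pickle opened in binary mode and that the SpellChecker constructor shown later in this listing is used (the file name comes from the docstring):

checker = SpellChecker(max_distance=2)
with open("lm.pkl", "rb") as fp:
    checker.load_language_model(fp)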
Example #4
def main():

	precisions = []
	recalls = []
	for alpha in [0.3, 0.5, 0.7, 0.9, 0.95, 0.97, 0.99, 1.00, 1.01, 1.03, 1.05, 1.07, 1.1, 1.3, 1.5, 1.7, 2.0, 3.0, 5.0]:
		ALPHA = alpha

		cleanLM = LanguageModel(CLEAN_TRAIN_FILE)
		insultLM = LanguageModel(INSULT_TRAIN_FILE)
		
		cleanTestSents = LanguageModel(CLEAN_TEST_FILE).getSents()
		insultTestSents = LanguageModel(INSULT_TEST_FILE).getSents()

		NB = baselineNaiveBayes(cleanLM, insultLM)
		NB.train()
		#print NB.genProbs(cleanTestSents, insultTestSents)

		if (STUPID_BACKOFF):
			tp, tn, fp, fn = NB.testStupidBackoff(cleanTestSents, insultTestSents, ALPHA)
		else:
			tp, tn, fp, fn = NB.testImproved1(cleanTestSents, insultTestSents, ALPHA)

		interpretResults(tp, tn, fp, fn)

	print "Precisions:\n {}".format(precisions)
	print "Recalls:\n {}".format(recalls)
Example #5
def get_ndcg_samples_and_target():
    lm = LanguageModel(rnn)
    prefs = []
    while len(prefs) < args.ndcg_num_samples:
        s = lm.sample(cutoff=args.ndcg_max_len)
        prefs += [s[:i] for i in range(len(s) + 1)]
    # trim extras possibly added by the last sequence, just to keep reporting simple
    prefs = prefs[:args.ndcg_num_samples]
    prefs = list(prefs)
    with open(rnn_folder + "/ndcg_samples.txt", "w") as f:
        print(len(prefs), len(lm.input_alphabet), file=f)
        for p in prefs:
            # this is fine for the spices and the uhls, where the tokens are ints; make sure to read it back the same way
            print(len(p), " ".join([str(t) for t in p]), file=f)
    target_filename = rnn_folder + "/ndcg_target.txt"
    with open(target_filename, "w") as f:
        print(args.ndcg_k, file=f)  # store what ndcg_k is being made
        for p in prefs:
            d = lm.distribution_from_sequence(p)
            chars = sorted(list(d.keys()), key=lambda x: d[x], reverse=True)
            optimal = np.sum([
                d[c] / np.log2(i + 2)
                for i, c in enumerate(chars[:args.ndcg_k])
            ])
            #log2(i+2): ndcg wants i+1 where i is token index, but also remember enumerate starts from zero
            chars_weights = [
                v for pair in [(c, d[c]) for c in chars] for v in pair
            ]
            chars_weights = [(v if not v == lm.end_token else -1)
                             for v in chars_weights]
            # spice scoring expects "-1" for end-of-sequence character
            print(optimal, " ".join([str(t) for t in chars_weights]), file=f)
    return prefs, target_filename
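For reference, the value written per prefix is the ideal DCG@k under the true next-token distribution d. A scored ranking would then presumably be normalized against it roughly as follows (a sketch only; the actual scoring, e.g. modified_score_rankings in Example #15 below, is not shown here, and d is assumed to behave like a plain dict):

def ndcg_at_k_sketch(predicted_ranking, d, optimal, k):
    # DCG of the model's predicted ranking, using the true probabilities as relevance
    dcg = sum(d.get(c, 0.0) / np.log2(i + 2) for i, c in enumerate(predicted_ranking[:k]))
    return dcg / optimal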
Example #6
def main():

    print("Generating models....")
    print("\tcleanLM...")
    cleanLM = LanguageModel(CLEAN_TRAIN_FILE)
    print("\tinsultLM...")
    insultLM = LanguageModel(INSULT_TRAIN_FILE)

    print("\tcleanTestLM...")
    cleanTestSents = LanguageModel(CLEAN_TEST_FILE).getSents()
    print("\tinsultTestLM...")
    insultTestSents = LanguageModel(INSULT_TEST_FILE).getSents()

    NB = baselineNaiveBayes(cleanLM, insultLM)
    print("Training NB Model....")
    NB.train()
    #print NB.genProbs(cleanTestSents, insultTestSents)

    print("Testing NB Model....")
    if (STUPID_BACKOFF):
        tp, tn, fp, fn = NB.testStupidBackoff(cleanTestSents, insultTestSents)
    else:
        tp, tn, fp, fn = NB.testImproved1(cleanTestSents, insultTestSents)

    interpretResults(tp, tn, fp, fn)
Example #7
def ComputeJacardSim(doc_term_matrix):
    print "in get_jacard_sim"

    setOfWords = LanguageModel.getSetOfWords(doc_term_matrix)
    jacard_sim_dict = LanguageModel.get_jacard_sim(setOfWords)

    print "get tag sim okay,length is:" + str(len(jacard_sim_dict))
    return jacard_sim_dict
Example #8
def test():
    trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE)
    trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE)
        
    trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE)
    trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE)

    return make_training_vectors(trainAACleanLM, trainAAInsultLM,
        trainABCleanLM, trainABInsultLM)
Example #9
    def __init__(self):

        INSULT_TRAIN_FILE = 'insult_corpus_train.txt'
        CLEAN_TRAIN_FILE = 'clean_corpus_train.txt'

        INSULT_TEST_FILE = 'insult_corpus_test.txt'
        CLEAN_TEST_FILE = 'clean_corpus_test.txt'

        self.cleanSents = LanguageModel(CLEAN_TRAIN_FILE).splitBySpaces()
        self.insultSents = LanguageModel(INSULT_TRAIN_FILE).splitBySpaces()

        self.cleanTestSents = LanguageModel(CLEAN_TEST_FILE).splitBySpaces()
        self.insultTestSents = LanguageModel(INSULT_TEST_FILE).splitBySpaces()
Example #10
def loop_files(train_file, test_files):
    lm = LanguageModel('corpus/google-10000-english.txt')
    lm.load_weights('model/keras_char_rnn.500.h5')
    found = 0
    for test_file in test_files:
        print('-------------------------')
        print('Starting for ', test_file)
        result = pipeline(train_file, test_file, lm)
        print(result)
        actual = test_file.split('_')[0]
        if actual in result:
            print('Found', actual)
            found += 1

    print('Inference rate: ', found / len(test_files))
Example #11
    def __init__(self, dataset, sampleEach=1):
        self.path = '../data/' + dataset + '/'
        self.chars = codecs.open(self.path + 'chars.txt', 'r', 'utf8').read()
        self.wordChars = codecs.open(self.path + 'wordChars.txt', 'r',
                                     'utf8').read()
        self.lm = LanguageModel(
            codecs.open(self.path + 'corpus.txt', 'r', 'utf8').read(),
            self.chars, self.wordChars)
        self.mats = []
        self.gts = []
        self.fns = []

        i = 0
        while True:
            fnMat = self.path + 'mat_' + str(i) + '.csv'
            fnGT = self.path + 'gt_' + str(i) + '.txt'
            i += 1

            # file not found
            if (not os.path.isfile(fnMat)) or (not os.path.isfile(fnGT)):
                break

            # ignore this sample
            if (i - 1) % sampleEach != 0:
                continue

            # put into result
            self.mats.append(fnMat)
            self.gts.append(fnGT)
            self.fns.append(fnMat + '|' + fnGT)

        self.currIdx = 0
Example #12
    def __init__(self, dataset, sampleEach=1):
        self.path = "../data/" + dataset + "/"
        self.chars = codecs.open(self.path + "chars.txt", "r", "utf8").read()
        self.wordChars = codecs.open(self.path + "wordChars.txt", "r",
                                     "utf8").read()
        self.lm = LanguageModel(
            codecs.open(self.path + "corpus.txt", "r", "utf8").read(),
            self.chars,
            self.wordChars,
        )
        self.mats = []
        self.gts = []
        self.fns = []

        i = 0
        while True:
            fnMat = self.path + "mat_" + str(i) + ".csv"
            fnGT = self.path + "gt_" + str(i) + ".txt"
            i += 1

            # file not found
            if (not os.path.isfile(fnMat)) or (not os.path.isfile(fnGT)):
                break

            # ignore this sample
            if (i - 1) % sampleEach != 0:
                continue

            # put into result
            self.mats.append(fnMat)
            self.gts.append(fnGT)
            self.fns.append(fnMat + "|" + fnGT)

        self.currIdx = 0
Example #13
    def trainOnAllLM(self):
        
        # open clean text files for each book and join all lines
        text = ""
        books = ["AnitaBlake01GuiltyPleasures.clean.txt",
        "AnitaBlake02LaughingCorpse.really.clean.txt",
        "AnitaBlake03CircusOfTheDamned.really.clean.tx",
        "AnitaBlake04LunaticCafe.really.clean.txt",
        "AnitaBlake05BloodyBones.really.clean.txt",
        "AnitaBlake06TheKillingDance.really.clean.txt",
        "AnitaBlake07BurntOfferings.really.clean.txt",
        "AnitaBlake08BlueMoon.really.clean.txt",
        "AnitaBlake09ObsidianButterfly.really.clean.txt",
        "AnitaBlake10NarcissusInChains.really.clean.txt",
        "AnitaBlake11CeruleanSins.really.clean.txt",
        "AnitaBlake12IncubusDreams.really.clean.txt",
        "AnitaBlake16BloodNoir.really.clean.txt",
        "AnitaBlake17SkinTrade.really.clean.txt",
        "AnitaBlake18Flirt.really.clean.txt"]
        
        for book in books:
            text += ''.join(open(os.path.join(os.path.dirname(__file__), '..', 'data', book)).read()) 
        
        # sentencify text
        sentences = re.split(r' *[.?!][\'")\]]* *[(\["]*', text)
        
        # cut out the first 15 proper sentences - dev and test
        sentences = sentences[17:]

        # wordify the sentences
        for i, sentence in enumerate(sentences):
            sentences[i] = re.findall(r"[\w']+|[.,!?;]", sentence)
        
        # train LM on corpus
        self.LM = LanguageModel(sentences)    
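To see what the two regexes above do, a small illustration on made-up text:

import re
text = 'She smiled. "Hello," he said! Good?'
sentences = re.split(r' *[.?!][\'")\]]* *[(\["]*', text)
# -> ['She smiled', 'Hello," he said', 'Good', '']
tokens = re.findall(r"[\w']+|[.,!?;]", sentences[1])
# -> ['Hello', ',', 'he', 'said']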
Example #14
def get_wer_samples():
    def all_prefs(test_set):
        res = set()
        for p in test_set:
            # make hashable; a bit wonky to work like this, but consistent with LanguageModel expectations
            p = tuple(p)
            res.update(p[:i] for i in range(len(p) + 1))
        return list(res)

    lm = LanguageModel(rnn)
    samples = [
        lm.sample(cutoff=args.wer_max_len) for _ in range(args.wer_num_samples)
    ]
    gold_dict = lm.next_token_preds(all_prefs(samples))
    return samples, gold_dict
Example #15
 def print_metrics(name, model, metric):
     lm = LanguageModel(model)
     if metric == "NDCG":
         temporary_model_preds_file = lm.make_spice_preds(ndcg_samples)
         ndcg = modified_score_rankings(temporary_model_preds_file,
                                        ndcg_target_filename)
         os.remove(temporary_model_preds_file)
         print(name, "got ndcg against rnn:", clean_val(ndcg, 5), file=f)
     if metric == "WER":
         wer = lm.WER(wer_samples, gold_dict=wer_gold)
         print(name, "got wer against rnn:", clean_val(wer, 5), file=f)
     if metric == "TIME":
         print(name,
               "took:",
               lapse_str(model.creation_info["extraction time"], 1),
               "s",
               file=f)
Example #16
    def get_pkg_desp_dict(self):
        print "in get_merge_model"
        weight_title = 1.0
        weight_tag = 1.0
        weight_desp = 1.0

        weight_unigram = (1.0 / 1.4) * weight_desp
        weight_bigram = (0.4 / 1.4) * weight_desp
        #weight_trigram = (0.16/1.56)*weight_desp

        #pkg_title_count_dict = {}
        #pkg_tag_count_dict = {}
        pkg_unigram_count_dict = {}
        pkg_bigram_count_dict = {}
        #pkg_trigram_count_dict = {}

        #for pkg,temp_list in self.pkg_title_dict.items():
        #pkg_title_count_dict[pkg] = {k:weight_title for k in set(temp_list) if k.strip()}
        #for pkg,temp_list in self.pkg_tag_dict.items():
        #pkg_tag_count_dict[pkg] = {k:weight_tag for k in set(temp_list) if k.strip()}
        for pkg, temp_dict in LanguageModel.getBagOfWords(
                self.pkg_unigram_dict).items():
            pkg_unigram_count_dict[pkg] = {
                k: weight_unigram * v
                for k, v in temp_dict.items() if k.strip()
            }
        for pkg, temp_dict in LanguageModel.getBagOfWords(
                self.pkg_bigram_dict, ).items():
            pkg_bigram_count_dict[pkg] = {
                k: weight_bigram * v
                for k, v in temp_dict.items() if k.strip()
            }
        #for pkg,temp_dict in LanguageModel.getBagOfWords(self.pkg_trigram_dict).items():
        #pkg_trigram_count_dict[pkg] = {k:weight_trigram*v for k,v in temp_dict.items() if k.strip()}

        pkg_desp_dict = {}
        for pkg in self.pkg_title_dict:
            #single = Counter(pkg_title_count_dict[pkg]) + Counter(pkg_tag_count_dict[pkg]) + Counter(pkg_unigram_count_dict[pkg]) + Counter(pkg_bigram_count_dict[pkg]) #+ Counter(pkg_trigram_count_dict[pkg])
            single = Counter(pkg_unigram_count_dict[pkg]) + Counter(
                pkg_bigram_count_dict[pkg]
            )  #+ Counter(pkg_trigram_count_dict[pkg])
            pkg_desp_dict[pkg] = single

        print "get desp okay,pkg_desp_dict" + str(len(pkg_desp_dict))
        return pkg_desp_dict
Example #17
def do_ngram():
    print("~~~running ngram extraction~~~")
    print("making samples", end=" ... ")
    sample_start = process_time()
    samples = []
    length = 0
    lmrnn = LanguageModel(rnn)
    while length < args.ngram_total_sample_length:
        s = lmrnn.sample(cutoff=args.ngram_max_sample_length)
        samples.append(s)
        length += (len(s) + 1)  # ending the sequence is also a sample
    ngrams = {}
    ngrams_folder = rnn_folder + "/ngram"
    prepare_directory(ngrams_folder)
    sample_time = process_time() - sample_start
    print("done, that took:", clock_str(sample_start))
    print("making the actual ngrams", end=" ... ")
    with open(ngrams_folder + "/samples.txt", "w") as f:
        print(len(samples), len(rnn.internal_alphabet), file=f)
        for s in samples:
            print(len(s), *s, file=f)
    for n in args.ngram_ns:
        ngram_start = process_time()
        ngram = NGram(n, rnn.input_alphabet, samples)
        ngram.creation_info = {
            "extraction time": sample_time + process_time() - ngram_start,
            "size": len(ngram._state_probs_dist),
            "n": n,
            "total samples len (including EOS)": length,
            "num samples": len(samples),
            "samples cutoff len": args.ngram_max_sample_length
        }
        overwrite_file(ngram, ngrams_folder + "/" + str(n))
        ngrams[n] = ngram
    with open(ngrams_folder + "/creation_infos.txt", "w") as f:
        print("ngrams made from",
              len(samples),
              "samples, of total length",
              length,
              "(including EOSs)",
              file=f)
        for n in ngrams:
            print("===", n, "===\n", ngrams[n].creation_info, "\n\n", file=f)
    print("done, that took overall", clock_str(sample_start))
    return ngrams
Example #18
    def __init__(self,
                 fr,
                 en,
                 model_file_fr,
                 model_file_en,
                 lex_file_fr,
                 lex_file_en,
                 lex_weight=1):
        """Initialises the language model.

        Args:
            fr/en: Foreign/English language code.
            model_file_fr/en (str): Foreign/English LanguageModel file name.
            lex_file_fr/en (str): Foreign/English Lexicon (1 word + frequency per line).
            lex_weight (float): Weight of the lexicon vs. the character model.
        """
        self.lex_weight = lex_weight
        self.model = {}
        self.model[self.FR] = LanguageModel.load(model_file_fr, lex_file_fr,
                                                 lex_weight)
        self.model[self.EN] = LanguageModel.load(model_file_en, lex_file_en,
                                                 lex_weight)
Example #19
 def weight_avg_f1(self):
     classes = self.classified_tweets_by_class.keys()
     tuple_result_list = LanguageModel.flatten_list(
         self.classified_tweets_by_class.values())
     sum_weighted_f1 = 0
     for chosen_class in classes:
         chosen_class_correct_count = len([
             result for result in tuple_result_list if result[
                 ClassifyTupleResult.CorrectClass.value] == chosen_class
         ])
         sum_weighted_f1 = sum_weighted_f1 + self.per_class_f1[
             chosen_class] * chosen_class_correct_count
     return round(sum_weighted_f1 / len(tuple_result_list), 4)
Example #20
 def __init__(self,
              dataset,
              languages,
              max_length=50,
              languageModels=None,
              filter_token=2,
              device=None):
     self.dataset = dataset
     self.languages = languages
     self.filter_token = filter_token
     self.max_length = max_length
     self.loadFiles()
     if languageModels is not None:
         self.languageModels = languageModels
     else:
         self.languageModels = {
             self.languages[0]: LanguageModel(),
             self.languages[1]: LanguageModel()
         }
         self.prepareLanguageModels()
     self.filter_unk()
     self.device = device
Example #21
    def compute_field_jacardsim(self, doc_term_dict, filename="", offset=0):
        '''input dict is key:term_list '''
        print "in compute_field_jacardsim"
        if os.path.exists(filename):
            jacard_sim_dict = WriteTool.load_nested_dict(filename)
            return jacard_sim_dict

        weight_dict = {}
        if offset + s.log_min_dlcount > 0:
            weight_dict = {
                pkg:
                (offset + math.log(value)) / (offset + self.log_max_dlcount)
                for pkg, value in self.pkg_dlcount_dict.items()
            }
        field_setOfWords = LanguageModel.getSetOfWords(doc_term_dict)
        jacard_sim_dict = LanguageModel.get_jacard_sim(field_setOfWords,
                                                       weight_dict)

        if filename:
            WriteTool.write_nested_dict(jacard_sim_dict, filename,
                                        self.pkg_titlename_dict)

        print str(len(jacard_sim_dict))
        return jacard_sim_dict
Example #22
 def write_for_xiaoxi(self):
     temp_dict = {}
     s.pkg_unigram_tfidf_dict = LanguageModel.getTfidf(
         s.pkg_unigram_dict)
     for pkg, tempSet in s.pkg_setOfWords.items():
         if pkg in s.pkg_unigram_tfidf_dict:
             single = {
                 word.strip(): 1.0
                 for word in tempSet if word.strip()
             }
             single = WriteTool.merge_dict(single,
                                           s.pkg_unigram_tfidf_dict[pkg],
                                           "max")
             temp_dict[pkg] = dict(single)
     WriteTool.write_nested_dict(temp_dict, 'data/blend_word_list_max1')
Example #23
	def generate(self,hypothesis):
		assert set(hypothesis.internal_alphabet) == set(self.target.internal_alphabet) and (hypothesis.end_token == self.target.end_token) # order doesn't matter but they should have same letters and same EOS
		self.hypothesis = LanguageModel(hypothesis)
		self.checked = set()
		for n in range(self.n_cex_attempts):
			print("sample number:",n,"of",self.n_cex_attempts,file=self.prints_path)
			model = self.hypothesis if n%2 == 1 else self.target
			w = model.sample(cutoff=self.max_counterexample_length,empty_sequence=())
			pref = self._find_disagreeing_pref(w)
			if not None is pref:
				print("found cex on attempt",n,"of",self.n_cex_attempts,file=self.prints_path)
				print("found by sampling:",("hypothesis" if n%2==1 else "target"),file=self.prints_path)
				return pref
		print("no counterexamples found",file=self.prints_path)
		return None
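_find_disagreeing_pref is not shown in this snippet; a plausible reading (an assumption, not the original code) is that it walks the prefixes of w and compares the two models' next-token distributions:

# Assumed sketch only: return a prefix of w on which hypothesis and target disagree
def find_disagreeing_pref_sketch(hypothesis, target, w, tolerance=0.1):
    for i in range(len(w) + 1):
        pref = tuple(w[:i])
        d_h = hypothesis.distribution_from_sequence(pref)  # assumed to return a dict token -> probability
        d_t = target.distribution_from_sequence(pref)
        if any(abs(d_h.get(t, 0.0) - d_t.get(t, 0.0)) > tolerance for t in set(d_h) | set(d_t)):
            return pref
    return None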
Example #24
def main(argv):
    sentencesFile = "../pa6/es-en/dev/newstest2012.es"
    foreignFile = None
    nativeFile = None
    loadFile = "../pa6/save.model"
    ngramFile = None

    try:
        opts, args = getopt.getopt(argv, "is:f:n:l:g:")
    except getopt.GetoptError:
        print 'Wrong argument. Use -i for improved version'
        sys.exit(2)

    isImproved = False

    for opt, value in opts:
        if opt == '-i':
            isImproved = True
        elif opt == '-s':
            sentencesFile = value
        elif opt == '-f':
            foreignFile = value
        elif opt == '-n':
            nativeFile = value
        elif opt == '-l':
            loadFile = value
        elif opt == '-g':
            ngramFile = value

    # print "improved!" if isImproved else "Not improved!"
    # print sentencesFile
    # print foreignFile
    # print nativeFile
    # print loadFile

    if foreignFile and nativeFile:
        model = ModelOne(foreignFile, nativeFile)
    else:
        model = ModelOne(loadFile=loadFile)

    langModel = LanguageModel()

    if sentencesFile:
        sentences = []
        with open(sentencesFile) as f:
            for line in f:
                sentences.append(line.lower().strip().split())
        translated = translateSentences(sentences, model, langModel)
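Matching the getopt string "is:f:n:l:g:" above, a typical invocation might look like this (the script name is hypothetical):

# python translate_main.py -s ../pa6/es-en/dev/newstest2012.es -l ../pa6/save.model -i
#   -i       use the improved version
#   -s FILE  sentences to translate
#   -f/-n    foreign/native training files (trains ModelOne instead of loading it)
#   -l FILE  saved ModelOne to load
#   -g FILE  ngram file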
Example #25
    def trainLM(self):
        
        # open clean text and join all lines
        text = ''.join(open(os.path.join(os.path.dirname(__file__), '..', 'data', 'AnitaBlake01GuiltyPleasures.clean.txt')).read()) 
        
        # sentencify text
        sentences = re.split(r' *[.?!][\'")\]]* *[(\["]*', text)
        
        # cut out the first 15 proper sentences - dev and test
        sentences = sentences[17:]

        # wordify the sentences
        for i, sentence in enumerate(sentences):
            sentences[i] = re.findall(r"[\w']+|[.,!?;]", sentence)
        
        # train LM on corpus
        self.LM = LanguageModel(sentences)
Example #26
def spectral_reconstruct(model,
                         P,
                         S,
                         k_list,
                         ready_hankel_things=None,
                         print_file=None):
    f = print_file if not None is print_file else sys.stdout

    #make sure they start with the empty sequence
    assert len(P[0]) == 0
    assert len(S[0]) == 0
    model = LanguageModel(model)
    print("making spectral with P,S sizes:",
          len(P),
          len(S),
          file=f,
          flush=True)
    if None is ready_hankel_things:
        stuff = make_hankel_stuff(model, P, S, f)
    else:
        stuff = ready_hankel_things

    results = []
    total_times = []
    done_max = False
    for k in sorted(k_list):
        if k >= stuff["rank"]:
            if done_max:
                print("skipping", k, "onwards", file=f)
                break
            print("maxed out at", k, "so using k=rank=", stuff["rank"], file=f
                  )  # this allows using a k that is 'greater' than the rank,
            # which is important in the case the exact rank is missed (eg if the rank is 15 but its just
            # checking k=10,20,30, it will still do 20 but then skip 30)
            k = stuff[
                "rank"]  # don't make something higher than there actually is, making a WFA that thinks it has eg 5 states when it really has 2
            done_max = True
        start = process_time()
        results.append(their_algorithm(stuff, k))
        total_times.append(stuff["hankel_time"] + stuff["svd_time"] +
                           process_time() - start)
    return results, total_times, stuff["hankel_time"], stuff["svd_time"], stuff
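make_hankel_stuff is not shown here; in spectral reconstruction the block it builds is, roughly, a Hankel matrix indexed by the prefixes P and suffixes S, and "rank" comes from its SVD. A rough sketch under that assumption (the sequence-weight helper is hypothetical):

# Assumed sketch: Hankel block over prefixes P and suffixes S
def hankel_matrix_sketch(seq_weight, P, S):
    # seq_weight(w): the model's weight/probability for the full sequence w (hypothetical helper)
    return np.array([[seq_weight(tuple(p) + tuple(s)) for s in S] for p in P])

# stuff["rank"] would then be the numerical rank of this matrix, e.g.
#   rank = int(np.sum(np.linalg.svd(H, compute_uv=False) > tol))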
Example #27
 def calc_recall(self):
     # Order of classes: eu, ca, gl, es, en, pt
     per_class = dict()
     classes = self.classified_tweets_by_class.keys()
     tuple_result_list = LanguageModel.flatten_list(
         self.classified_tweets_by_class.values())
     for chosen_class in classes:
         tp = len([
             result for result in tuple_result_list
             if chosen_class == result[
                 ClassifyTupleResult.CalculatedClass.value] and chosen_class
             == result[ClassifyTupleResult.CorrectClass.value]
         ])
         fn = len([
             result for result in tuple_result_list
             if chosen_class != result[
                 ClassifyTupleResult.CalculatedClass.value] and chosen_class
             == result[ClassifyTupleResult.CorrectClass.value]
         ])
         recall = round(tp / (tp + fn), 4)
         per_class[chosen_class] = recall
     self.per_class_recall = per_class
Example #28
 def train(self):
     print('\nTraining our model using file: \'{0}\''.format(
         self.training_file_name))
     # Create a language model for each language found in the training corpus.
     tweets_by_lang = self.group_tweets_by_lang()
     num_languages = len(tweets_by_lang)
     count = 1
     start_time = time.time()
     for (language, tweets) in tweets_by_lang.items():
         model = LanguageModel(language, tweets, self.ngram_type,
                               self.vocabulary, self.smoothing_value)
         print(
             '[{0}% completed]: {1} language model created. {2} tweets parsed.'
             .format(round((count / num_languages) * 100),
                     Language.from_str(language).name, len(tweets)))
         count = count + 1
         self.language_models.append(model)
     end_time = time.time()
     print(
         'Training completed. A total of {0} tweets parsed. [{1}s elapsed]'.
         format(len(self.training_tweets_dict),
                round(end_time - start_time)))
Example #29
class SpellChecker:
    def __init__(self,
                 max_distance,
                 channel_model=None,
                 language_model=None,
                 threshold=15):
        self.nlp = nlp
        self.channel_model = channel_model
        self.language_model = language_model
        self.max_distance = max_distance
        self.unknown_words = dict()
        self.threshold = threshold

    def load_channel_model(self, fp):
        self.channel_model = EditDistance.EditDistanceFinder()
        self.channel_model.load(fp)

    def load_language_model(self, fp):
        self.language_model = LanguageModel()
        self.language_model.load(fp)

    def bigram_score(self, prev_word, focus_word, next_word):
        bigram_prob1 = self.language_model.bigram_prob(prev_word, focus_word)
        bigram_prob2 = self.language_model.bigram_prob(focus_word, next_word)
        return (bigram_prob1 + bigram_prob2) / 2

    def unigram_score(self, word):
        return self.language_model.unigram_prob(word)

    def cm_score(self, error_word, corrected_word):
        return self.channel_model.prob(error_word, corrected_word)

    def inserts(self, word):
        one_insert_away = []
        alphabet = string.ascii_lowercase
        for i in range(len(word) + 1):
            for letter in alphabet:
                new_word = word[0:i] + letter + word[i:]
                if new_word in self.language_model:
                    one_insert_away.append(new_word)
        return one_insert_away

    def deletes(self, word):
        one_delete_away = []
        for i in range(len(word)):
            new_word = word[0:i] + word[i + 1:]
            if new_word in self.language_model:
                one_delete_away.append(new_word)
        return one_delete_away

    def substitutes(self, word):
        one_sub_away = []
        alphabet = string.ascii_lowercase
        for i in range(len(word)):
            for letter in alphabet:
                if letter != word[i]:
                    new_word = word[0:i] + letter + word[i + 1:]
                    if new_word in self.language_model:
                        one_sub_away.append(new_word)
        return one_sub_away

    def generate_candidates_recurse(self, word_list, max_distance):
        if max_distance == 0:
            return word_list
        new_list = []
        for i in word_list:
            insert_words = self.inserts(i)
            delete_words = self.deletes(i)
            sub_words = self.substitutes(i)
            new_list += insert_words
            new_list += delete_words
            new_list += sub_words
        set_list = list(set(new_list))
        return self.generate_candidates_recurse(set_list, max_distance - 1)

    def generate_candidates(self, word):
        return self.generate_candidates_recurse([word], self.max_distance)

    def check_sentence(self, sentence, fallback=False):
        return_list = []
        for i in sentence:
            if i in self.language_model:
                return_list.append([i])
                continue
            if i in self.unknown_words:
                if self.unknown_words[i] > self.threshold:
                    return_list.append([i])
                    continue
                self.unknown_words[i] += 1
            elif i not in self.unknown_words:
                self.unknown_words[i] = 1
            candidates = self.generate_candidates(i)
            if candidates == []:
                if fallback:
                    return_list.append([i])
                    continue
                else:
                    return_list.append([])
                    continue
            candidates = sorted(
                candidates,
                key=lambda x: self.unigram_score(x) + self.cm_score(i, x),
                reverse=True)
            return_list.append(candidates)
        return return_list

    def check_line(self, text, fallback=False):
        sentence_doc = nlp(text)
        sentences = sentence_doc.sents
        sentences = [
            self.language_model.get_tokens(sentence) for sentence in sentences
        ]
        result = []
        for sentence in sentences:
            checked_sentence = self.check_sentence(sentence)
            result += checked_sentence
        return result

    def autocorrect_sentence(self, sentence):
        possibilities = self.check_sentence(sentence, True)
        possibilities = [x[0] for x in possibilities]
        return possibilities

    def autocorrect_line(self, line):
        doc = nlp(line)
        sentences = doc.sents
        sentences = [
            self.language_model.get_tokens(sentence) for sentence in sentences
        ]
        result = []
        for sentence in sentences:
            checked_sentence = self.autocorrect_sentence(sentence)
            result += checked_sentence
        return result

    def suggest_sentence(self, sentence, max_suggestions):
        possibilities = self.check_sentence(sentence, True)
        return_list = []
        for i in possibilities:
            if len(i) == 1:
                return_list.append(i[0])
            else:
                return_list.append(i[:max_suggestions])
        return return_list

    def suggest_line(self, text, max_suggestions):
        doc = nlp(text)
        sentences = doc.sents
        sentences = [
            self.language_model.get_tokens(sentence) for sentence in sentences
        ]
        result = []
        for sentence in sentences:
            checked_sentence = self.suggest_sentence(sentence, max_suggestions)
            result += checked_sentence
        return result
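A minimal usage sketch for the class above; the pre-built cm/lm objects and the input text are assumptions:

# Sketch only: cm (an EditDistanceFinder) and lm (a LanguageModel) are assumed already trained/loaded
checker = SpellChecker(max_distance=1, channel_model=cm, language_model=lm)
print(checker.generate_candidates("teh"))        # in-vocabulary words within one edit
print(checker.autocorrect_line("Teh cat sat."))  # best-scoring candidate per token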
Example #30
 def load_language_model(self, fp):
     self.language_model = LanguageModel()
     self.language_model.load(fp)
Example #31
def main():
    print ("Generating language models....")
    trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE)
    trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE)
        
    trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE)
    trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE)

    testCleanLM = LanguageModel(CLEAN_TEST_FILE)
    testInsultLM = LanguageModel(INSULT_TEST_FILE)

    trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) + ([1] * trainABInsultLM.getDocCount()))
    testLabels = np.array(([0] * testCleanLM.getDocCount()) + ([1] * testInsultLM.getDocCount()))


    ### Just baseline probabilities
    print ("Running baseline....")
    NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM)
    print ("\tTraining NB....") 
    NB.train()
    print ("\tTesting NB....")  
    totalNBMatrix = np.array(NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents()))

    trainMatrix = totalNBMatrix 

    testMatrix = np.array(NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents()))

    # clf = svm.SVC(kernel='linear')
    # print ("\tTraining SVM....")    
    # clf.fit(trainMatrix, trainLabels)
    # print ("\tTesting SVM....") 
    # output1 = clf.predict(testMatrix).tolist()

    ## Baseline + PoS Features
    print ("Running baseline + PoS Features....")
    cleanPosMatrix = trainABCleanLM.getPosMatrix()
    insultPosMatrix = trainABInsultLM.getPosMatrix()

    testCleanPosMatrix = testCleanLM.getPosMatrix()
    testInsultPosMatrix = testInsultLM.getPosMatrix()

    posFeatures = np.array(cleanPosMatrix + insultPosMatrix)
    testPosFeatures = np.array(testCleanPosMatrix + testInsultPosMatrix)
    trainMatrix = np.hstack((trainMatrix, posFeatures))
    testMatrix = np.hstack((testMatrix, testPosFeatures))

    # clf = svm.SVC(kernel='linear')
    # print ("\tTraining SVM....")    
    # clf.fit(trainMatrix, trainLabels)
    # print ("\tTesting SVM....") 
    # output2 = clf.predict(testMatrix).tolist()


    ### Baseline + PoS Features + TF-IDF Features (TODO Arun)
    print("Running baseline + PoS Features + TF-IDF Features")
    # generate list of features with TFIDF, using trainABCleanLM and trainABInsultLM
    # trainMatrix = np.hstack((trainMatrix, the new thing you just generated))
    # do same for testMatrix
    # clf = svm.SVC()
    # print ("\tTraining SVM....")  
    # clf.fit(trainMatrix, trainLabels)
    # print ("\tTesting SVM....")   
    # output3 = clf.predict(testMatrix).tolist()    
    # then update the output_file.txt thing below


    tfidf_train_features = tfidf.make_feature_vectors(trainAACleanLM,
            trainAAInsultLM, trainABCleanLM, trainABInsultLM)

    tfidf_test_features = tfidf.make_feature_vectors(trainAACleanLM,
            trainAAInsultLM, testCleanLM, testInsultLM)

    print tfidf_test_features.shape, tfidf_train_features.shape
    print testMatrix.shape, trainMatrix.shape

    trainMatrix = np.hstack((trainMatrix, tfidf_train_features))
    testMatrix = np.hstack((testMatrix, tfidf_test_features))


    # clf = svm.SVC(kernel='linear')
    # print ("\tTraining SVM....")  
    # clf.fit(trainMatrix, trainLabels)
    # print ("\tTesting SVM....")   
    # output3 = clf.predict(testMatrix).tolist()  

    ### SENTIMENT ###
    print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features")
    s = Sentiment()
    clean_train = np.array(s.get_clean_train_vector())
    insult_train = np.array(s.get_insult_train_vector())
    sentiment_train_features = np.concatenate((clean_train, insult_train), axis=0)
    shape = sentiment_train_features.shape
    sentiment_train_features = sentiment_train_features.reshape((shape[0], 1))
    print sentiment_train_features.shape

    clean_test = np.array(s.get_clean_test_vector())
    insult_test = np.array(s.get_insult_test_vector())
    sentiment_test_features = np.concatenate((clean_test, insult_test), axis=0)
    shape = sentiment_test_features.shape
    sentiment_test_features = sentiment_test_features.reshape((shape[0], 1))
    print sentiment_test_features.shape

    trainMatrix = np.hstack((trainMatrix, sentiment_train_features))
    testMatrix = np.hstack((testMatrix, sentiment_test_features))

    # clf = svm.SVC(kernel='linear')
    # print ("\tTraining SVM....")  
    # clf.fit(trainMatrix, trainLabels)
    # print ("\tTesting SVM....")   
    # output4 = clf.predict(testMatrix).tolist()  

    ### MISSPELLINGS ###
    print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features + Misspellings features")
    m = Misspellings()
    clean_train = np.array(m.get_clean_misspellings(False))
    insult_train = np.array(m.get_insult_misspellings(False))
    misspellings_train_features = np.concatenate((clean_train, insult_train), axis=0)
    shape = misspellings_train_features.shape
    misspellings_train_features = misspellings_train_features.reshape((shape[0], 1))
    print misspellings_train_features.shape

    clean_test = np.array(m.get_clean_misspellings())
    insult_test = np.array(m.get_insult_misspellings())
    misspellings_test_features = np.concatenate((clean_test, insult_test), axis=0)
    shape = misspellings_test_features.shape
    misspellings_test_features = misspellings_test_features.reshape((shape[0], 1))
    print misspellings_test_features.shape

    trainMatrix = np.hstack((trainMatrix, misspellings_train_features))
    testMatrix = np.hstack((testMatrix, misspellings_test_features))

    clf = svm.SVC(kernel='linear')
    print ("\tTraining SVM....")  
    clf.fit(trainMatrix, trainLabels)
    print ("\tTesting SVM....")   
    output5 = clf.predict(testMatrix).tolist()  


    index_shuf = range(len(trainMatrix))
    trainMatrix_shuf = []
    trainLabel_shuf = []
    shuffle(index_shuf)
    for i in index_shuf:
        trainMatrix_shuf.append(trainMatrix[i])
        trainLabel_shuf.append(trainLabels[i])

    train_sizes, train_scores, valid_scores = learning_curve(svm.SVC(), trainMatrix_shuf, trainLabel_shuf, train_sizes=[100, 300, 500, 700, 900], cv=2)
    average_train_scores = [sum(i)/float(len(i)) for i in train_scores]
    average_valid_scores = [sum(i)/float(len(i)) for i in valid_scores]
    plt.plot(train_sizes, average_train_scores)
    plt.plot(train_sizes, average_valid_scores)
    plt.legend(['Training score', 'Cross-validation score'], loc='center left', bbox_to_anchor=(0.85, 0.5))
    plt.ylabel('Score')
    plt.xlabel('Training examples')
    plt.show()
    
    # with open('SVM_output_file_with_SB.txt', 'w+') as f:
    #     f.write("Output 1\n")
    #     f.write("{}\n".format(output1))
    #     interpret_results(output1, testLabels, f)
    #     f.write("\nOutput 2\n") 
    #     f.write("{}\n".format(output2))
    #     interpret_results(output2, testLabels, f)
    #     f.write("\nOutput 3\n") 
    #     f.write("{}\n".format(output3))
    #     interpret_results(output3, testLabels, f)
    #     f.write("Output 4\n")
    #     f.write("{}\n".format(output4))
    #     interpret_results(output4, testLabels, f)
    #     f.write("Output 5\n")
    #     f.write("{}\n".format(output5))
    #     interpret_results(output5, testLabels, f)

    get_pca_graph(trainMatrix, trainLabels, "train_pca.png", title="PCA of Training Set")
    get_pca_graph(testMatrix, testLabels, "test_pca.png", title="PCA of Test Set")
    get_pca_graph(trainMatrix, trainLabels, "train_pca2.png", title="PCA of Training Set (Insults Only)", plot_negative=False)
    get_pca_graph(testMatrix, testLabels, "test_pca2.png", title="PCA of Test Set (Insults Only)", plot_negative=False)
Example #32
class HexoSpeller(MainloopFeedback):
    
    states = {
              "level_one": 1, # the hexagons contain groups of symbols, the group has to be picked first
              "level_two": 2, # the hexagons contain individual symbols
              }
    
    
    def init(self):
        self.send_parallel(Marker.feedback_init)
        self.logger.debug("HexoSpeller::init")
        self._last_tick_time = time.clock()
        self._state = self.states["level_one"]
        language_model_folder_path = self._create_language_model_folder_path()
        self.load_language_model(os.path.join(language_model_folder_path, PARAMS["language_model_file"]))
        self.spelled_text = []
        self._sub_list_probs = [] # probability values for each symbol sublist
        self._selected_symbol_idx = 0
        self._selected_symbol_sublist_idx = 0
        self._arrow_locked = False
        self._arrow_locked_time = None
        self.lock_arrow()
        self._control_signal = 0
        self._viz = None
        self._model = None
        
        
        
    def pre_mainloop(self):
        #print "HexoSpeller::pre_main_loop"
        self._model = HexoModel(PARAMS)
        self._model.add_arrow_length_observer(self)
        self._viz = HexoViz(self, VIZ_PARAMS)
        self._viz.hexo_controller = self
        self._viz.set_symbol_lists(self.symbol_list)
        if hasattr(ColorSchemes, VIZ_PARAMS["color_scheme"]):
            scheme_dictionary = getattr(ColorSchemes, VIZ_PARAMS["color_scheme"])
            VIZ_PARAMS.update(scheme_dictionary)
        # set some public variable that can be modified from the feedback controller GUI
        # set all variables for which there is a setter with the corresponding name
        for dict in [PARAMS, VIZ_PARAMS]:
            for key in dict.keys():
                if hasattr(self, 'set_'+key):
                    set_method = getattr(self, 'set_'+key)
                    set_method(dict[key])
    
    def post_mainloop(self):
        """ Tries to shut down the visualization. """
        self._viz.shut_down()
        
        
    def tick(self):
        """ Is called in each iteration of the main loop. This method determines how much time has passed
        between the current and the previous tick, and then delegates that information to the _model and the view
        via their tick(dt) methods. """
        if self._viz==None or self._model==None:
            return
        # determine how much time (in seconds) has passed between this and the previous tick
        current_time = time.clock()
        dt = current_time - self._last_tick_time
        self._last_tick_time = current_time
        # delegate the tick to the back end and the front end
        self._viz.tick(dt)
        self._model.tick(dt)
        self._model.set_control_signal(self.get_control_signal())
        # if the arrow is locked and the locking period is over, unlock it
        if self.is_arrow_locked():
            if current_time - self._arrow_locked_time > self.arrow_locked_duration:
                self.unlock_arrow()
        
    def play_tick(self):
        if not self.is_arrow_locked():
            self._model.play_tick()
        self._viz.play_tick()
    
    def pause_tick(self):
        self._viz.pause_tick()
        self._model.pause_tick()
        
    def on_control_event(self, data):
        self.logger.debug('on_control_event')
        self.set_control_signal(self._data)
        
    def on_interaction_event(self, data):
        self.logger.debug("on_interaction_event") 
        if type(data)==type({}):   
            # try to set the modified attributes
            for name in data.keys():
                # if we have the attribute and the respective setter
                if hasattr(self, name) and hasattr(self, "set_"+name):
                    set_method = getattr(self, "set_"+name)
                    new_value = data[name]
                    set_method(new_value)
    
    def on_play(self):
        self.send_parallel(Marker.status_change_to_play)
        MainloopFeedback.on_play(self)
    
    def on_pause(self):
        if self._MainloopFeedback__running and self._MainloopFeedback__paused:
            self.send_parallel(Marker.status_change_to_play)
        if self._MainloopFeedback__running and not self._MainloopFeedback__paused:
            self.send_parallel(Marker.status_change_to_pause)
        MainloopFeedback.on_pause(self)
        
    def on_stop(self):
        self.send_parallel(Marker.status_change_to_stop)
        MainloopFeedback.on_stop(self)
                    
        
    def get_selected_hexagon_index(self):
        """ Returns the hexagon that the arrow is currently pointing at. """
        return self._model.get_selected_hexagon_index()
    
    def get_arrow_length(self):
        return self._model.get_arrow_length()
    
    def get_phi_degrees(self):
        return self._model.get_phi_degrees()
    
    def arrow_at_max_length(self):
        """ To be called by the _model when the arrow has reached maximum length. """
        self.logger.debug("HexoFeedback::arrow_at_max_length")
        if self._state == self.states["level_one"]:
            self.send_parallel(Marker.hex_selected_level_one)
            # signal the GUI to change the content of the hexagons to single symbols
            selected_idx = self.get_selected_hexagon_index()
            self.send_parallel(Marker.selected_hex_level_one[selected_idx])
            self._viz.set_big_symbols(self.symbol_list[selected_idx], selected_idx)
            self._selected_symbol_sublist_idx = selected_idx
            # return the arrow to start length, but don't change the angle
            self.reset_arrow_model(reset_phi=False)
            # change to _state 'second selection'
            self._state = self.states["level_two"]
            self.lock_arrow()
            self._viz.start_state_change_animation()
        elif self._state == self.states["level_two"]:
            self.send_parallel(Marker.hex_selected_level_two)
            # get and store the selected symbol
            self.get_selected_symbol()
            self.update_symbol_list()
            # update the spelled word in the GUI
            self._viz.show_spelled_text(self.text_list_to_string(self.spelled_text))
            # signal the GUI to change the content of the hexagons back to multiple symbols
            self._viz.set_symbol_lists(self.symbol_list)
            # return the arrow to start angle and start length
            current_phi = self._model.get_phi_degrees()
            new_phi = (self.get_most_probable_hexagon_index()*60 + self.hex_pre_select_bias) % 360
            self.reset_arrow_model(reset_phi=True, phi=new_phi)
            self._state = self.states["level_one"]
            self.lock_arrow()
            self._viz.start_state_change_animation(rot_arrow=True, phi_start=current_phi, phi_end=new_phi)
        
    def reset_arrow_model(self, reset_phi=False, phi=0, control_signal=0):
        """ Resets the arrow length to initial length and the arrow angle and control signal value
         according to the given values. """
        self._model.reset_arrow_length()
        self._model.set_control_signal(control_signal)
        if reset_phi:
            self._model.reset_phi(phi)
        
    def get_selected_symbol(self):
        idx = self.get_selected_hexagon_index()
        self._selected_symbol_idx = idx
        self.send_parallel(Marker.selected_hex_level_two[idx])
        symbol = self._viz.get_selected_symbol(self._selected_symbol_sublist_idx, self._selected_symbol_idx)
        if symbol == self._language_model.delete_symbol:
            # if the delete symbol was selected and there is something to delete, pop the last character from the list
            if len(self.spelled_text) > 0:
                self.spelled_text.pop() 
        elif not symbol == None:
            # if the symbol is not None, attach it to the spelled Text
            self.spelled_text.append(symbol)
            # send a marker 
            idx = self._language_model.get_symbol_index(symbol)
            if not idx == None:
                self.send_parallel(Marker.selected_letter[idx])
            
    def get_most_probable_hexagon_index(self):
        """ Returns the index of the hexagon that contains the most probable next letter, based on what is already written. """
        return self._language_model.get_most_probable_symbol_sublist_index()
    
    def update_symbol_list(self):
        """ Update the order of symbols in the symbol list based on the spelled text. """
        spelled_text = self.text_list_to_string(self.spelled_text)
        self.symbol_list = self._language_model.update_symbol_list_sorting(spelled_text)

    def _create_language_model_folder_path(self):
        """ Creates a path that points to the folder that contains the language model file. 
        I assume that the lm files lie in a folder called "LanguageModels" which itself lies
        in the same folder as the HexoSpeller.py file, whose path is given by the __file__
        variable. """
        file_path = __file__ # file_path is now something like foo/bar/Feedbacks/HexoSpeller/HexoSpeller.py
        # remove the actual file name from the path by first reversing the string, then partitioning at the
        # last occurrence of the path separator and reversing the tail of the partitioning
        reversed_file_path = file_path[::-1]
        (_file_name, _sep, hexospeller_dir) = reversed_file_path.partition(os.path.sep)
        hexospeller_dir = hexospeller_dir[::-1] # reverse it, now in correct order
        # now complete the path to point to the language model directory
        lm_path = os.path.join(hexospeller_dir,"LanguageModels")
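        # Note (sketch, not in the original code): os.path.dirname(__file__) would give the same
        # directory without the reverse/partition trick, e.g.
        #   lm_path = os.path.join(os.path.dirname(__file__), "LanguageModels")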
        return lm_path
       
    def load_language_model(self, file_name):
        """ Get the language _model, preferably from file. The path should be specified in params... """
        self._language_model = LanguageModel(file_name)
        self.symbol_list = self._language_model.get_symbol_list()
        
    def text_list_to_string(self, text_list):
        text = ''
        for c in text_list:
            text = text + c
        return text
    
    def set_control_signal(self, value):
        self._control_signal = value
    
    def get_control_signal(self):
        return self._control_signal
    
    def lock_arrow(self):
        self._arrow_locked = True
        self._arrow_locked_time = time.clock()
    
    def unlock_arrow(self):
        self._arrow_locked = False
    
    def is_arrow_locked(self):
        return self._arrow_locked
    
#================================================================================
#  Setter for the variables that should be 
#  setable from the feedback controller GUI
#================================================================================
    
    def set_hexagon_default_color(self, rgb):
        self.hexagon_default_color = rgb
        if not self._viz == None:
            r,g,b, = rgb
            self._viz.set_hexagon_color(r,g,b)
    
    def set_hexagon_highlight_color(self, rgb):
        self.hexagon_highlight_color = rgb
        if not self._viz == None:
            r,g,b, = rgb
            self._viz.set_hexagon_highlight_color(r, g, b)
        
    def set_hexagon_text_color(self, rgb):
        self.hexagon_text_color = rgb
        if not self._viz == None:
            r,g,b, = rgb
            self._viz.set_hexagon_text_color(r, g, b, alpha=1)
        
    def set_arrow_color(self, rgb):
        self.arrow_color = rgb
        if not self._viz == None:
            r,g,b, = rgb
            self._viz.set_arrow_color(r,g,b)
    
    def set_state_change_animation_duration(self, dur):
        self.state_change_animation_duration = dur
        if not self._viz == None:
            self._viz.params['state_change_animation_duration'] = dur
    
    def set_arrow_growth_time(self, time):
        self.arrow_growth_time = time
        if not self._model == None:
            self._model.params['arrow_growth_time'] = time
        
    def set_arrow_shrinkage_time(self, time):
        self.arrow_shrinkage_time = time
        if not self._model == None:
            self._model.params['arrow_shrinkage_time'] = time
        
    def set_arrow_rotation_time(self, time):
        self.arrow_rotation_time = time
        if not self._model == None:
            self._model.params['arrow_rotation_time'] = time
        
    def set_arrow_locked_duration(self, duration):
        self.arrow_locked_duration = duration
    
    def set_control_signal_arrow_rotation_threshold(self, t):
        self.control_signal_arrow_rotation_threshold = t
        if not self._model == None:
            self._model.params["control_signal_arrow_rotation_threshold"] = t
        if not self._viz == None:
            self._viz.set_arrow_rotation_threshold(t)
    
    def set_control_signal_arrow_growth_threshold(self, t):
        self.control_signal_arrow_growth_threshold = t
        if not self._model == None:
            self._model.params["control_signal_arrow_growth_threshold"] = t
        if not self._viz == None:
            self._viz.set_arrow_growth_threshold(t)
        
    def set_control_signal_bar_frame_color(self, rgb):
        self.control_signal_bar_frame_color = rgb
        if not self._viz == None:
            r,g,b = rgb
            self._viz.set_control_signal_bar_frame_color(r,g,b)

    def set_control_signal_bar_color(self, rgb):
        self.control_signal_bar_color = rgb
        if not self._viz == None:
            r,g,b = rgb
            self._viz.set_control_signal_bar_color(r,g,b)
        
    def set_lm_head_factors(self, head_factors):
        self._language_model.head_factors = head_factors
        self.lm_head_factors = head_factors
    
    def set_lm_letter_factor(self, letter_factor):
        self._language_model.letter_factor = letter_factor
        self.lm_letter_factor = letter_factor
        
    def set_lm_n_pred(self, n_pred):
        self._language_model.n_pred = n_pred
        self.lm_n_pred = n_pred
        
    def set_textboard_background_color(self, rgb):
        self.textboard_background_color = rgb
        if not self._viz == None:
            r,g,b = rgb
            self._viz.set_textboard_background_color(r, g, b)
        
    def set_textboard_frame_color(self, rgb):
        self.textboard_frame_color = rgb
        if not self._viz == None:
            r,g,b = rgb
            self._viz.set_textboard_frame_color(r, g, b)
    
    def set_textboard_text_color(self, rgb):
        self.textboard_text_color = rgb
        if not self._viz == None:
            r,g,b = rgb
            self._viz.set_textboard_text_color(r, g, b)
        
    def set_background_color(self, rgb):
        self.backgroud_color = rgb
        if not self._viz == None:
            r,g,b = rgb
            self._viz.set_background_color(r,g,b)
        
    def set_hex_pre_select_bias(self, v):
        self.hex_pre_select_bias = v
Example #33
 def load_language_model(self, file_name):
     """ Get the language _model, preferably from file. The path should be specified in params... """
     self._language_model = LanguageModel(file_name)
     self.symbol_list = self._language_model.get_symbol_list()
Example #34
def main():
	global LM 

	# print "training language model"

	# trainingCorpus = HolbrookCorpus(brown.sents())
	# LM = LanguageModel(trainingCorpus)

	# print "training complete"

	# print "------------------"


	#testLanguageModel()

	#tagged_corpus = cess_esp.tagged_sents()
	#size = int(len(tagged_corpus) * .9)
	#training = tagged_corpus[:size]

	#print "training HiddenMarkovModelTagger"
	#hmm_tagger = HiddenMarkovModelTagger.train(training)
	#print "finished training"

	dict_file = "../data/dictionary.txt"
	sentences_file = "../corpus/corpus_test.txt"
	tagged_corpus_file = "../data/tagged_sentences_test.txt"
	dictionary_lists = loadList(dict_file)
	sentences_lists = loadList(sentences_file)
	tagged_sentences = loadList(tagged_corpus_file)

	print "training LM..."
	trainingCorpus = HolbrookCorpus(brown.sents())
	LM = LanguageModel(trainingCorpus)
	estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
	# unigram_model = NgramModel(1, brown.words(), True, False, estimator)
	unigram_model = None
	print "finished training LM"
	#print sentences_lists
	#print dictionary_lists
	dictionary = dict()
	for entry in dictionary_lists:
		entry_list = entry.split()
		key = ""
		translations = []
		for idx, word in enumerate(entry_list):
			if idx== 0: 
				key = word.lower()
			else:
				translations.append(word)
		dictionary[key]=translations

	#print dictionary
	#s = 0
	#for v in dictionary.values():
	#	s += v
	#s /= len(dictionary)
	#print "avg num values = " + str(s)
	#return

	# s = 0
	# for v in dictionary.values():
	# 	s += v
	# s /= len(dictionary)
	# print "avg num values = " + str(s)
	# get_translations_by_pos("momento/ncms000", {"momento": ['time/N', 'times/NP', 'moment/N', 'moments/NP']}) #testing tag method
	# return


	for idx, sentence in enumerate(sentences_lists):
		if sentence == "": continue

		#tagged_sentences.append(hmm_tagger.tag(sentence.split()))
		#print("")

		# tagged_sentences.append(hmm_tagger.tag(sentence.split()))
		print("")

		print("Sentence ",idx+1)

		# sentence_list = sentence.split()
		#tagged_list = hmm_tagger.tag(sentence.split())
		tagged_sentence = tagged_sentences[idx].split()

		#for pair in tagged_list:
		#	tagged_sentence.append("/".join(pair))
		#print tagged_sentence
		sentence_list = tagged_sentence


		demo_translation_list = []
		list_of_likely_translations = []

		list_of_likely_translations= likely_translations(sentence_list,dictionary,unigram_model)
		list_of_likely_translations_as_strings = []
		for lis in list_of_likely_translations:
			lis = noun_adjective_switch(lis)
			lis = noun_of_the_noun_switch(lis)
			clean_lis = remove_pos_tags_and_underscores(lis)

			string = ' '.join(clean_lis)
			list_of_likely_translations_as_strings.append(string)

		best = LM.n_most_likely(list_of_likely_translations_as_strings, 10)

		#print list_of_likely_translations
		##if idx==0:
		#	print list_of_likely_translations
			#for lis in list_of_likely_translations:
				#print lis

		for word in sentence.split():
			# print(word)
			word = word.replace('.','')
			word = word.replace(',','')
			word = word.replace(':','')
			word = word.replace('(','')
			word = word.replace(')','')
			word = word.replace('-','')
			word = word.lower()
			#print(word)
			if word != '':
				trans = dictionary.get(word) or [word]
				demo_translation_list.append(trans[(idx+17)%(len(trans))-1])

		#print demo_translation_list
		pos_free_translation_list = remove_pos_tags_and_underscores(demo_translation_list)
		print("Initial Translation: ",' '.join(pos_free_translation_list),)



		print("Final Translation: ",best[0])
Пример #35
0
    # get corpus directories
    corpus_root_xml = nltk.data.find(
        'C:\\Users\\James\\PycharmProjects\\FIT3036\\xml')
    corpus_root_plain = 'C:\\Users\\James\\PycharmProjects\\FIT3036\\plain_text'

    # get all xml and plain text files from specified directories
    corpus_xml = CHILDESCorpusReader(corpus_root_xml, '.*.xml')
    corpus_plain = PlaintextCorpusReader(corpus_root_plain, '.*.cha')

    # get all the words spoken by a child
    all_words = [w.lower() for w in corpus_xml.words(speaker=['CHI'])]

    # init wordnet and language model
    corpus_ic = wn.ic(corpus_xml, True, 1.0)
    lm = LanguageModel(all_words)

    # collect all the features for each corpus
    for j in range(len(corpus_xml.fileids())):
        current_features = []  # init empty array to store features
        # Text initialization
        text_xml = corpus_xml.fileids()[j]
        text_plain = corpus_plain.fileids()[j]

        # list of words spoken by the child in lowercase
        child_words_xml = [
            w.lower() for w in corpus_xml.words(text_xml, speaker=['CHI'])
        ]

        # list of words spoken by the child in lowercase with replaced words
        child_words_replaced_xml = [
Пример #36
0
                    else:
                        result = eachword  + ' ' + ''.join(seq_of_rest) 
            return result, max



if __name__ == '__main__':
    if len(sys.argv) < 4:
        print "Usage: python corrector.py <dev | test> <uniform | empirical> <queries file>"
        exit(0)
    queries_file = sys.argv[3]
    queries, gold, google = read_query_data(queries_file)
    kind_of_editmodel = sys.argv[2]
    #Read in unigram and bigram probs
    print >> sys.stderr, "Loading language model"
    languagemodel = LanguageModel('unigram_model','bigram_model')
    print >> sys.stderr, "Loading edit model"
    editmodel = EditModel(kind_of_editmodel,languagemodel)
    languagemodel.init_edit_model(editmodel)
    print >> sys.stderr,"Loading spell correct"
    spell_corrector = SpellCorrect(languagemodel, editmodel)
    answers = []
    qc = 0
    for eachquery in queries:
        answer = spell_corrector.spell_correct_query(eachquery)  
        print answer  
        print >> sys.stderr, "%d" % (qc)
        qc+=1
        answers.append(answer)
    #Accuracy evaluation
    wrong = 0
Пример #37
0
class SpellChecker():
    def __init__(self, max_distance, channel_model=None, language_model=None):
        self.nlp = spacy.load('en', pipeline=["tagger", "parser"])
        self.max_distance = max_distance
        # self.load_channel_model(channel_model)
        # self.load_language_model(language_model)

    def load_channel_model(self, fp):
        self.channel_model = EditDistanceFinder()
        self.channel_model.load(fp)

    def load_language_model(self, fp):
        self.language_model = LanguageModel()
        self.language_model.load(fp)

    def bigram_score(self, prev_word, focus_word, next_word):
        prevFocusScore = self.language_model.bigram_prob(prev_word, focus_word)
        focusNextScore = self.language_model.bigram_prob(focus_word, next_word)
        return (prevFocusScore + focusNextScore) / 2

    def unigram_score(self, word):
        return self.language_model.unigram_prob(word)

    def cm_score(self, error_word, corrected_word):
        return self.channel_model.prob(error_word, corrected_word)

    def inserts(self, word):
        '''
            Takes in word and return a list of words that are within one insert of word
        '''
        # Insert every letter
        possibleWords = []
        for letter in string.ascii_lowercase:
            # Every possible position
            for i in range(len(word) + 1):
                # Check if the resulting word is a word
                testWord = word[:i] + letter + word[i:]
                if testWord in self.language_model:
                    possibleWords.append(testWord)
        return possibleWords

    def deletes(self, word):
        # Delete the letter at each position (one past the end would leave the word unchanged)
        possibleWords = []
        for i in range(len(word)):
            # Check if the resulting word is a word
            testWord = word[:i] + word[i + 1:]
            if testWord in self.language_model:
                possibleWords.append(testWord)
        return possibleWords

    def substitutions(self, word):
        # Substitute every letter
        possibleWords = []
        for letter in string.ascii_lowercase:
            # At each position within the word (one past the end would be an
            # insertion, not a substitution)
            for i in range(len(word)):
                # Check if the resulting word is a word
                testWord = word[:i] + letter + word[i + 1:]
                if testWord in self.language_model:
                    possibleWords.append(testWord)
        return possibleWords

    def generate_candidates(self, word):
        '''
            Takes in a candidate word and returns words that are within self.max_distance edits of word
        '''
        for i in range(1, self.max_distance + 1):
            if i == 1:
                candidateWords = self.inserts(word) + self.deletes(
                    word) + self.substitutions(word)
            else:
                newWords = []
                for currentWord in candidateWords:
                    newWords += self.inserts(currentWord) + self.deletes(
                        currentWord) + self.substitutions(currentWord)
                candidateWords += newWords
        # Get rid of duplicates
        return list(set(candidateWords))

    def check_sentence(self, sentence, fallback=False):
        returnList = []
        for i in range(len(sentence)):
            if i == 0 and i == len(sentence) - 1:
                prevWord = '<s>'
                nextWord = '</s>'
            elif i == 0:
                prevWord = '<s>'
                nextWord = sentence[i + 1]
            elif i == len(sentence) - 1:
                nextWord = '</s>'
                prevWord = sentence[i - 1]
            else:
                prevWord = sentence[i - 1]
                nextWord = sentence[i + 1]
            word = sentence[i]
            # If it's in the language model, add just that word
            if word in self.language_model:
                returnList.append([word])
            else:
                # Get all the candidates for that word
                candidates = self.generate_candidates(word)
                candidateList = []
                if candidates == [] and fallback:
                    candidateList = [word]
                else:
                    for candidate in candidates:
                        unigramScore = self.unigram_score(candidate)
                        bigramScore = self.bigram_score(
                            prevWord, candidate, nextWord)
                        languageScore = (0.5*unigramScore) + \
                            (0.5 * bigramScore)
                        candidateScore = languageScore + \
                            self.cm_score(word, candidate)

                        candidateList.append([candidate, candidateScore])

                    # Sort the list by the second element
                    candidateList.sort(key=lambda x: x[1], reverse=True)
                    # Remove the second element, and append
                    candidateList = [x[0] for x in candidateList]
                returnList += [candidateList]

        return returnList

    def check_text(self, text, fallback=False):
        '''
        take a string as input, tokenize and sentence segment it with spacy, and then return the concatenation of the result of calling check_sentence on all of the resulting sentence objects.
        '''
        tokens = self.nlp(text)
        sentences = list(tokens.sents)

        processedSentences = []
        for sentence in sentences:
            # Convert sentence into list of lowercase words
            wordList = sentence.text.split()
            wordList = [x.lower() for x in wordList]
            processedSentences.append(self.check_sentence(wordList, fallback))

        return processedSentences

    def autocorrect_sentence(self, sentence):
        '''
         take a tokenized sentence (as a list of words) as input, call check_sentence on the sentence with fallback=True, and return a new list of tokens where each non-word has been replaced by its most likely spelling correction.
        '''
        corrections = self.check_sentence(sentence, fallback=True)
        return [x[0] for x in corrections]

    def autocorrect_line(self, line):
        '''
             take a string as input, tokenize and segment it with spacy, and then return the concatenation of the result of calling autocorrect_sentence on all of the resulting sentence objects.
        '''

        tokens = self.nlp(line)
        sentences = list(tokens.sents)

        processedSentences = []
        for sentence in sentences:
            # Convert sentence into list of lowercase words
            wordList = sentence.text.split()
            if len(wordList) == 0:
                continue
            wordList = [x.lower() for x in wordList]
            processedSentences.append(self.autocorrect_sentence(wordList))

        return processedSentences

    def suggest_sentence(self, sentence, max_suggestions):
        '''
            take a tokenized sentence (as a list of words) as input, call check_sentence on the sentence, and return a new list where:
            Real words are just strings in the list
            Non-words are lists of up to max_suggestions suggested spellings, ordered by your model’s preference for them.
        '''
        sentenceCorrections = self.check_sentence(sentence)

        returnList = []
        for word in sentenceCorrections:
            if len(word) == 1:
                returnList += word
            else:
                returnList.append(word[:max_suggestions])

        return returnList

    def suggest_text(self, text, max_suggestions):
        '''
            take a string as input, tokenize and segment it with spacy, and then return the concatenation of the result of calling suggest_sentence on all of the resulting sentence objects
        '''
        tokens = self.nlp(text)
        sentences = list(tokens.sents)

        processedSentences = []
        for sentence in sentences:
            # Convert sentence into list of lowercase words
            wordList = sentence.text.split()
            if len(wordList) == 0:
                continue
            wordList = [x.lower() for x in wordList]
            # Get rid of the period
            if wordList[-1][-1] == '.':
                wordList[-1] = wordList[-1][:-1]
            processedSentences.append(
                self.suggest_sentence(wordList, max_suggestions))

        return processedSentences
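To make the ranking inside check_sentence concrete: each candidate gets 0.5 * unigram + 0.5 * bigram as its language score, plus the channel-model score, and candidates are sorted by the total. A standalone illustration with made-up log-probabilities (not output from the real EditDistanceFinder or LanguageModel):

# Hypothetical log-probabilities for two candidate corrections of "teh".
candidates = {
    "the": {"unigram": -2.0, "bigram": -1.5, "channel": -0.7},
    "ten": {"unigram": -6.0, "bigram": -5.5, "channel": -1.2},
}

scored = []
for word, s in candidates.items():
    language = 0.5 * s["unigram"] + 0.5 * s["bigram"]   # same mix as check_sentence
    scored.append((word, language + s["channel"]))

scored.sort(key=lambda x: x[1], reverse=True)           # best candidate first
print([w for w, _ in scored])                           # ['the', 'ten']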
Пример #38
0
def main():
    print ("Generating language models....")
    trainAACleanLM = LanguageModel(CLEAN_TRAIN_AA_FILE)
    trainAAInsultLM = LanguageModel(INSULT_TRAIN_AA_FILE)
        
    trainABCleanLM = LanguageModel(CLEAN_TRAIN_AB_FILE)
    trainABInsultLM = LanguageModel(INSULT_TRAIN_AB_FILE)

    testCleanLM = LanguageModel(CLEAN_TEST_FILE)
    testInsultLM = LanguageModel(INSULT_TEST_FILE)

    trainLabels = np.array(([0] * trainABCleanLM.getDocCount()) + ([1] * trainABInsultLM.getDocCount()))
    testLabels = np.array(([0] * testCleanLM.getDocCount()) + ([1] * testInsultLM.getDocCount()))


    ### Just baseline probabilities
    print ("Running baseline....")
    NB = baselineNaiveBayes(trainAACleanLM, trainAAInsultLM)
    print ("\tTraining NB....") 
    NB.train()
    print ("\tTesting NB....")  
    totalNBMatrix = np.array(NB.genProbs(trainABCleanLM.getSents(), trainABInsultLM.getSents()))

    trainMatrix = totalNBMatrix 

    testMatrix = np.array(NB.genProbs(testCleanLM.getSents(), testInsultLM.getSents()))

    clf = LogisticRegression()
    print ("\tTraining SVM....")    
    clf.fit(trainMatrix, trainLabels)
    print ("\tTesting SVM....") 
    output1 = clf.predict(testMatrix).tolist()


    ### Baseline + PoS Features
    print ("Running baseline + PoS Features....")
    cleanPosMatrix = trainABCleanLM.getPosMatrix()
    insultPosMatrix = trainABInsultLM.getPosMatrix()

    testCleanPosMatrix = testCleanLM.getPosMatrix()
    testInsultPosMatrix = testInsultLM.getPosMatrix()

    posFeatures = np.array(cleanPosMatrix + insultPosMatrix)
    testPosFeatures = np.array(testCleanPosMatrix + testInsultPosMatrix)
    trainMatrix = np.hstack((trainMatrix, posFeatures))
    testMatrix = np.hstack((testMatrix, testPosFeatures))

    clf = LogisticRegression()
    print ("\tTraining SVM....")    
    clf.fit(trainMatrix, trainLabels)
    print ("\tTesting SVM....") 
    output2 = clf.predict(testMatrix).tolist()


    ### Baseline + PoS Features + TF-IDF Features (TODO Arun)
    print("Running baseline + PoS Features + TF-IDF Features")
    # generate list of features with TFIDF, using trainABCleanLM and trainABInsultLM
    # trainMatrix = np.hstack((trainMatrix, the new thing you just generated))
    # do same for testMatrix
    # clf = svm.SVC()
    # print ("\tTraining SVM....")  
    # clf.fit(trainMatrix, trainLabels)
    # print ("\tTesting SVM....")   
    # output3 = clf.predict(testMatrix).tolist()    
    # then update the output_file.txt thing below


    tfidf_train_features = tfidf.make_feature_vectors(trainAACleanLM,
            trainAAInsultLM, trainABCleanLM, trainABInsultLM)

    tfidf_test_features = tfidf.make_feature_vectors(trainAACleanLM,
            trainAAInsultLM, testCleanLM, testInsultLM)

    print(tfidf_test_features.shape, tfidf_train_features.shape)
    print(testMatrix.shape, trainMatrix.shape)

    trainMatrix = np.hstack((trainMatrix, tfidf_train_features))
    testMatrix = np.hstack((testMatrix, tfidf_test_features))


    clf = LogisticRegression()
    print ("\tTraining SVM....")  
    clf.fit(trainMatrix, trainLabels)
    print ("\tTesting SVM....")   
    output3 = clf.predict(testMatrix).tolist()  

    ### SENTIMENT ###
    print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features")
    s = Sentiment()
    clean_train = np.array(s.get_clean_train_vector())
    insult_train = np.array(s.get_insult_train_vector())
    sentiment_train_features = np.concatenate((clean_train, insult_train), axis=0)
    shape = sentiment_train_features.shape
    sentiment_train_features = sentiment_train_features.reshape((shape[0], 1))
    print(sentiment_train_features.shape)

    clean_test = np.array(s.get_clean_test_vector())
    insult_test = np.array(s.get_insult_test_vector())
    sentiment_test_features = np.concatenate((clean_test, insult_test), axis=0)
    shape = sentiment_test_features.shape
    sentiment_test_features = sentiment_test_features.reshape((shape[0], 1))
    print(sentiment_test_features.shape)

    trainMatrix = np.hstack((trainMatrix, sentiment_train_features))
    testMatrix = np.hstack((testMatrix, sentiment_test_features))

    clf = LogisticRegression()
    print ("\tTraining SVM....")  
    clf.fit(trainMatrix, trainLabels)
    print ("\tTesting SVM....")   
    output4 = clf.predict(testMatrix).tolist()  

    ### MISSPELLINGS ###
    print("Running baseline + PoS Features + TF-IDF Features + Sentiment Features + Misspellings features")
    m = Misspellings()
    clean_train = np.array(m.get_clean_misspellings(False))
    insult_train = np.array(m.get_insult_misspellings(False))
    misspellings_train_features = np.concatenate((clean_train, insult_train), axis=0)
    shape = misspellings_train_features.shape
    misspellings_train_features = misspellings_train_features.reshape((shape[0], 1))
    print(misspellings_train_features.shape)

    clean_test = np.array(m.get_clean_misspellings())
    insult_test = np.array(m.get_insult_misspellings())
    misspellings_test_features = np.concatenate((clean_test, insult_test), axis=0)
    shape = misspellings_test_features.shape
    misspellings_test_features = misspellings_test_features.reshape((shape[0], 1))
    print(misspellings_test_features.shape)

    trainMatrix = np.hstack((trainMatrix, misspellings_train_features))
    testMatrix = np.hstack((testMatrix, misspellings_test_features))

    clf = LogisticRegression()
    print ("\tTraining SVM....")  
    clf.fit(trainMatrix, trainLabels)
    print ("\tTesting SVM....")   
    output5 = clf.predict(testMatrix).tolist()  

    with open('LOG_REG_output_file_w_SB.txt', 'w+') as f:
        f.write("Output 1\n")
        f.write("{}\n".format(output1))
        interpret_results(output1, testLabels, f)
        f.write("\nOutput 2\n") 
        f.write("{}\n".format(output2))
        interpret_results(output2, testLabels, f)
        f.write("\nOutput 3\n") 
        f.write("{}\n".format(output3))
        interpret_results(output3, testLabels, f)
        f.write("Output 4\n")
        f.write("{}\n".format(output4))
        interpret_results(output4, testLabels, f)
        f.write("Output 5\n")
        f.write("{}\n".format(output5))
        interpret_results(output5, testLabels, f)
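The main() above repeats one pattern five times: hstack a new feature block onto the matrices, refit a LogisticRegression, and record the predictions. A compact sketch of that pattern on toy data (numpy/scikit-learn); the random blocks stand in for the NB, PoS, TF-IDF, sentiment and misspelling features and are not the project's real features:

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
labels = np.array([0] * 50 + [1] * 50)

# Toy stand-ins for the successive feature blocks.
feature_blocks = [rng.normal(size=(100, k)) for k in (2, 5, 20, 1, 1)]

train = np.empty((100, 0))
for block in feature_blocks:
    train = np.hstack((train, block))                 # grow the feature matrix
    clf = LogisticRegression(max_iter=1000).fit(train, labels)
    print(train.shape, clf.score(train, labels))      # shape and train accuracy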
Пример #39
0
class MT:

  
    def __init__(self, file, ngrams):
        
        self.dictionary = self.read_json(file)
        self.ngrams = self.read_json(ngrams)
        self.stemmer = SnowballStemmer("german") 
        self.tagger = POStagger() 
        self.trainOnAllLM()
        
        
    def translate(self, file):
        
        translated_sentences = []        
        sentences = self.read_json(file)
        dev = self.tagger.tag(sentences['test']) 
        
#        print dev
        for line in dev:
            clauses = self.split_line(line)
            translated_clauses = []
            
            for clause in clauses:
                LL = []
                clause = self.reorder_dependent_clause(clause)
                clause = self.reorder_obj_subj(clause)
                clause = self.reorder_participles(clause)
                clause = self.reorder_modals(clause)
                clause = self.recombine_sep_prefixes(clause)
#                words = self.reorder_adverbs(words)
                clause = self.interpolate_idioms(clause)
                clause = self.split_compounds(clause)

                for word in clause:
                    word = word.replace('.', '')
                    LL.append(self.lookup(word))
#                    print LL[-1]

                translated_clauses.append(self.refine_word_choice(LL))
                    
            translated_sentences.append(translated_clauses)
#            engSent.append(LL)

        translation = []
        for sentence in translated_sentences:
            trans = ""
            for clause in sentence:
                clauz = ""
                for word in clause:
                    clauz += " "+word
                trans += ", "+clauz  
            translation.append(trans)      

        return translation
        #return translated_sentences
  
  
    def refine_word_choice(self, LL):
        
        output = [[]]
        
        for wordList in LL:
            numPrefix = len(output)
            numWords = len(wordList)
            
            if len(wordList) > 1:
                tmp = [None]*(numWords*numPrefix)
                
                for i in range(numWords):
                    for k in range(numPrefix):
                        tmp[i*numPrefix+k] = copy.copy(output[k])
                        
                output = tmp
            for i, word in enumerate(wordList):
                for itr in range(numPrefix):
                    output[i*numPrefix+itr].append(word)
                    
        bestScore = float("-inf")
        index = 0
        
        for i, sent in enumerate(output):
            currScore = self.LM.score(sent)
            if currScore > bestScore:
                bestScore = currScore
                index = i
                
        print (bestScore)
        
        return output[index]
    
        
    def reorder_dependent_clause(self, words):
        
        pairs = [x.split('_') for x in words]

        if re.match('V[VAM]FIN', pairs[-1][-1]):
            changed = False
            
            """
            First look for an auxiliary verb to which to attach our end verb.
            """
#            for i in range(len(pairs) - 1, -1, -1):
#                if '_VAFIN' in pairs[i][-1]:
#                    words = words[:i + 1] + [words[-1]] + words[i + 1:-1]
#                    
#                    changed = True
#                    
#                    break
#                elif '_KON' in pairs[i][-1]:
#                    # Give up at the first conjunction
#                    break
            
            if not changed:
                """
                Find where to put it.  Look for a pair of noun phrases or articles
                or prepositions and put it between them.  Note that we can't cross a
                conjuction, though.
                """ 
                first = -1
                second = -1
                conjunction = -1

                # First we assume that articles do not substitute for nouns
                for i, pair in enumerate(pairs[:-1]):
                    if pair[-1] == 'PPER' or pair[-1] == 'PPOSS' or pair[-1] == 'PWS' or \
                        pair[-1] == 'NN' or pair[-1] == 'NE' or pair[-1] == 'PDS':
                        if first < 0:
                            first = i
                        elif second < 0:
                            second = i
                        else:
                            print ("Three subjects/objects in " + str(words))
                            break
                    elif pair[-1] == 'KON' or pair[-1] == 'KOUS' or pair[-1] == 'KOKOM':
                        first = -1
                        second = -1
                        conjunction = i

                if first >= 0 and second < 0:
                    first = -1
                    conjunction = -1
                    first_is_article = False
                    second_is_article = False

                    # An article may be substituting for the second noun.  Try again
                    for i, pair in enumerate(pairs[:-1]):
                        if pair[-1] == 'ART' or pair[-1] == 'CARD':
                            if first < 0:
                                first = i
                                first_is_article = True
                            elif second < 0:
                                second = i
                                second_is_article = True
                            else:
                                print ("Three subjects/objects in " + str(words))
                                break
                        elif pair[-1] == 'PPER' or pair[-1] == 'PPOSS' or pair[-1] == 'PWS' or \
                            pair[-1] == 'NN' or pair[-1] == 'NE' or pair[-1] == 'PDS':
                            if second < 0 and (first < 0 or first_is_article):
                                first = i
                                first_is_article = False
                            elif second < 0 or second_is_article:
                                second = i
                                second_is_article = False
                            else:
                                print ("Three subjects/objects in " + str(words))
                                break
                        elif pair[-1] == 'KON' or pair[-1] == 'KOUS' or pair[-1] == 'KOKOM':
                            first = -1
                            second = -1
                            conjunction = i

                    if second < 0:
                        # Nope. Maybe an article is subbing for the first noun.  Try again.
                        first = -1
                        conjunction = -1
                        first_is_article = False

                        # An article may be substituting for the second noun.  Try again
                        for i, pair in enumerate(pairs[:-1]):
                            if pair[-1] == 'ART' or pair[-1] == 'CARD':
                                if first < 0:
                                    first = i
                                elif second < 0:
                                    second = i
                                    second_is_article = True
                                else:
                                    print ("Three subjects/objects in " + str(words))
                                    break
                            elif pair[-1] == 'PPER' or pair[-1] == 'PPOSS' or pair[-1] == 'PWS' or \
                                pair[-1] == 'NN' or pair[-1] == 'NE' or pair[-1] == 'PDS':
                                if first < 0:
                                    first = i
                                elif second < 0 or second_is_article:
                                    second = i
                                    second_is_article = False
                                else:
                                    print ("Three subjects/objects in " + str(words))
                                    break
                            elif pair[-1] == 'KON' or pair[-1] == 'KOUS' or pair[-1] == 'KOKOM':
                                first = -1
                                second = -1
                                conjunction = i

                if first < 0 and conjunction < 0:
                    # Move the verb into the first position
                    words = words[-1:] + words[:-1]
                elif first < 0:
                    # Move the verb to just after the conjunction
                    words = words[:conjunction + 1] + words[-1:] + words[conjunction + 1:-1]
                else:
                    words = words[:first + 1] + words[-1:] + words[first + 1:-1]

        return words
        
        
    @staticmethod
    def read_json(file):

        with codecs.open(file, encoding='utf8') as f:
            sentences = json.load(f, encoding='utf8')
            
        return sentences
   
    
    @staticmethod
    def split_line(line):
  
        phrases = []
        words = []  
        
        for word in re.findall(r'[^\s]+_(?:[A-Z]+|\$[.,(])', line):
            if len(word) > 0 and word[-2] == '$':
                phrases.append(words)
                words = []
            elif len(word) > 0:
                words.append(word)
        
        if len(words) > 0:
            phrases.append(words)
        
        return phrases


    """
    Find phrases in the source sentence and replace them with idiomatic
    translations.
    """
    def interpolate_idioms(self, words):
 
        new = words
        changed = True
        
        while changed:
            changed = False
            
            for i in range(len(new) - 1):
                for j in range(len(new), i + 1, -1):
                    phrase = ' '.join([x.split('_')[0] for x in new[i:j]])
                    
                    if phrase in self.dictionary['idioms']:
                        new = new[:i] + [self.dictionary['idioms'][phrase] + '_IDIOM'] + new[j:]
                        changed = True
                        break
                            
                    if changed:
                        break
                        
                if changed:
                    break
            
        return new
    
    
    def lookup(self, word):
        
        parts = word.split('_')
        translation = [parts[0]]
        
        if parts[1] != 'IDIOM':
            if parts[0] in self.dictionary['words']:
                translation = self.dictionary['words'][parts[0]]
            elif parts[0].lower() in self.dictionary['words']:
                translation = self.dictionary['words'][parts[0].lower()]
                
        if parts[1].startswith('V'):
            translation = [self.from_tense(w, self.get_tense(parts[0], parts[1])) for w in translation]

        return translation

    def verb_stem(self, verb):

        stem = verb
        
        m = REGULAR_PATTERN.match(verb)
        
        if m:
            stem = m.group(1)
        else:
            # Must be irregular present or past (1S or 3S), but how did we
            # not already find it in the dictionary?
            raise Exception("Didn't find %s in the dictionary" % verb)
        
        return stem
    
    def from_tense(self, verb, tense):
        words = verb.split(' ')
        rest = []
        
        if len(words) > 1:
            verb = words[0]
            rest = words[1:]
            
        new = verb
        
        if verb in self.dictionary['verbs'] and len(self.dictionary['verbs'][verb]) == 3 and tense == '1':
            new = self.dictionary['verbs'][verb][0][0]
        elif verb in self.dictionary['verbs'] and len(self.dictionary['verbs'][verb]) == 3 and tense == '2':
            new = self.dictionary['verbs'][verb][0][1]
        elif verb in self.dictionary['verbs'] and len(self.dictionary['verbs'][verb]) == 3 and tense == '3':
            new = self.dictionary['verbs'][verb][0][2]
        elif verb in self.dictionary['verbs'] and len(self.dictionary['verbs'][verb]) == 3 and tense[-1] == '+':
            new = self.dictionary['verbs'][verb][0][3]
        elif verb in self.dictionary['verbs'] and tense == 'PP':
            new = self.dictionary['verbs'][verb][-1]
        elif verb in self.dictionary['verbs'] and type(self.dictionary['verbs'][verb][-2]) == list and tense == '1P':
            new = self.dictionary['verbs'][verb][-2][0]
        elif verb in self.dictionary['verbs'] and type(self.dictionary['verbs'][verb][-2]) == list and tense == '2P':
            new = self.dictionary['verbs'][verb][-2][1]
        elif verb in self.dictionary['verbs'] and type(self.dictionary['verbs'][verb][-2]) == list and tense == '3P':
            new = self.dictionary['verbs'][verb][-2][2]
        elif verb in self.dictionary['verbs'] and type(self.dictionary['verbs'][verb][-2]) == list and tense[-2:] == '+P':
            new = self.dictionary['verbs'][verb][-2][3]
        elif verb in self.dictionary['verbs'] and tense[-1] == 'P':
            new = self.dictionary['verbs'][verb][-2]
        elif tense[-1] == 'P' and verb[-1] == 'e':
            new = verb + 'd'
        elif tense[-1] == 'P' and re.match('.*[^aeiou][aeiou][b-df-hj-np-tvwyz]$', verb):
            new = verb + verb[-1] + 'ed'
        elif tense[-1] == 'P':
            new = verb + 'ed'
#        elif tense == 'I' and verb[-1] == 'e':
#            new = verb[:-1] + 'ing'
#        elif tense == 'I':
#            new = verb + 'ing'
        elif tense == '2' and verb[-1] == 'o':
            new = verb + 'es'
        elif tense == '2':
            new = verb + 's'
        
        return ' '.join([new] + rest)
    
    def get_tense(self, verb, tag):
        words = verb.split(' ')
        
        if len(words) > 1:
            verb = words[0]
            
        tense = None
        
        if tag.endswith('PP'):
            tense = 'P'
        elif tag.endswith('INF'):
            tense = 'I'
        elif verb in self.dictionary['tenses']:
            tense = self.dictionary['tenses'][verb]
            
            if tense == 'P':
                tense = '1P' #1st or 3rd, we don't care which
            elif tense == 'I':
                tense = '1+' #1st or 3rd, we don't care which
        else:
            m = REGULAR_PATTERN.match(verb)
            
            if m and m.group(1) in self.dictionary['tenses']:
                # If it's in the dictionary, it must be strong simple past
                tense = self.dictionary['tenses'][m.group(1)]
                
                if tense == 'P' and m.group(2) == 't':
                    tense = '2+P'
                elif tense == 'P' and m.group(2) == 'st':
                    tense = '2P'
                elif tense == 'P' and (m.group(2) == 'en' or m.group(2) == 'n'):
                    tense = '1+P' #1st or 3rd, we don't care which
            elif m:
                # And now we're left with present or weak simple past
                if m.group(2) == 'e':
                    tense = '1'
                elif m.group(2) == 't':
                    # This could also be 2nd plural, in which case we'll get it wrong
                    tense = '3'
                elif m.group(2) == 'st':
                    tense = '2'
                elif m.group(2) == 'en':
                    tense = '1+' #1st or 3rd, we don't care which
                elif m.group(2) == 'ete':
                    tense = '1P'
                elif m.group(2) == 'etet':
                    # This could also be 2nd plural, in which case we'll get it wrong for weak verbs!
                    tense = '3P' 
                elif m.group(2) == 'etest':
                    tense = '2P'
                elif m.group(2) == 'eten':
                    tense = '1+P' #1st or 3rd, we don't care which
                else:
                    raise Exception('unexpected verb ending: %s/%s' % (m.group(1), m.group(2)))
            else:
                raise Exception("verb doesn't match pattern: %s/%s" % (verb, m.group(1)))
            
        return tense

    def split_compounds(self, words):
        
        split = []
        for word in words:
            parts = word.split('_')
            
            if parts[0] not in self.dictionary['words'] and parts[0].lower() not in self.dictionary['words']:
                if parts[1] == 'NN':
                    split.extend(self.split_noun(parts[0], parts[1]))
                elif parts[1] == 'APPRART':
                    split.extend(self.split_preposition(parts[0], parts[1]))
                else:
                    split.append(word)
            else:
                split.append(word)
                 
        return split
              
                
    def split_noun(self, word, tag):
 
        words = []
        i = len(word)

        while i > 0:
            if word[:i] in self.dictionary['words']:
                if i < len(word):
                    rest = self.split_noun(word[i].upper() + word[i+1:], tag)

                    if rest:
                        words.append(word[0:i] + '_' + tag)
                        words.extend(rest)
                        break
                else:
                    words.append(word[0:i] + '_' + tag)
                    break

            i -= 1

        if not words:
            words.append(word + '_' + tag)
            
        return words
    
    
    def split_preposition(self, preposition, tag):
 
        split = []
        
        if preposition in COMPOUND_PREPOSITIONS:
            split.extend(COMPOUND_PREPOSITIONS[preposition])
        else:
            split.append(preposition + '_' + tag)
            
        return split
        
        
    def reorder_participles(self, words):
 
        # find clause that ends with VVPP, VAPP, or VMPP
        new_words = words
        check = ["VVPP", "VAPP", "VMPP"]
        for c in check:
            if c in words[-1]:
                
                # find the preceding VA*
                for i, word in enumerate(words[:-1]):
                    if "VA" in word:
                
                        # move last word into pos after prec. VA*
                        new_words = words[:i+1]
                        new_words.append(words[-1])
                        new_words.extend(words[i+1:-1])
                        break
        
        return new_words
        
        
    def reorder_modals(self, words):
 
        # find clause that ends with VVPP, VAPP, or VMPP
        new_words = words
        check = ["_VVINF", "_VAINF", "_VMINF"]
        
        for c in check:
            if c in words[-1]:
                
                # find the preceding VM*
                for i, word in enumerate(words[:-1]):
                    if "_VM" in word:
                
                        # move last word into pos after prec. VM*
                        new_words = words[:i+1]
                        new_words.append(words[-1])
                        new_words.extend(words[i+1:-1])
                        break
        
        return new_words
       

    def reorder_obj_subj(self, words):
 
        new_words = words

        # find first verb
        for i, word in enumerate(words[:-2]):
            if '_V' in word:
                # We can only be certain about ich, du, and er.
                if words[i + 1] == 'ich_PPER' or words[i + 1] == 'du_PPER' or words[i + 1] == 'er_PPER':
                    new_words = [words[i + 1]] + [words[i]] + words[:i] + words[i + 2:]
                    
                break
        
        return new_words
       

    def recombine_sep_prefixes(self, words):
        
        # find clause ends with PTKVZ
        new_words = words
        if "_PTKVZ" in words[-1]:
                # find the preceding VVFIN
                for i, word in enumerate(words[:-1]):
                    if "_VVFIN" in word:
            
                        # move the last word into pos after prec. VVFIN
                        new_words = words[:-1]
                        new_words[i] = words[i].split('_')[0] + ' ' + words[-1].split('_')[0] + '_VVFIN'
#                        new_words = words[:i + 1]
#                        new_words.append(words[-1])
#                        new_words.extend(words[i + 1:-1])

                        break
        
        
        return new_words

            
    def reorder_adverbs(self, words):   
        
        # find any ADV that follows any V*
        new_words = words
        for i, word in enumerate(words):
            if "ADV" in word:
                for w in words[:i]:
                    if "_V" in w:
                            
                        # find the preceding VV*
                        for j, wurd in enumerate(words[:i]):
                            if "VV" in wurd: 

                                # move the ADV into before prec. VV*
                                new_words = words[:j-1]
                                new_words.append(words[i])
                                new_words.extend(words[j-1:i])
                                new_words.extend(words[i+1:])
                                break
                        break            

        return new_words
        
        
    def trainLM(self):
        
        # open clean text and join all lines
        text = ''.join(open(os.path.join(os.path.dirname(__file__), '..', 'data', 'AnitaBlake01GuiltyPleasures.clean.txt')).read()) 
        
        # sentencify text
        sentences = re.split(r' *[.?!][\'")\]]* *[(\["]*', text)
        
        # cut out the first 17 proper sentences - dev and test
        sentences = sentences[17:]

        # wordify the sentences
        for i, sentence in enumerate(sentences):
            sentences[i] = re.findall(r"[\w']+|[.,!?;]", sentence)
        
        # train LM on corpus
        self.LM = LanguageModel(sentences)
        
    
    def trainOnAllLM(self):
        
        # open clean text files for each book and join all lines
        text = ""
        books = ["AnitaBlake01GuiltyPleasures.clean.txt",
        "AnitaBlake02LaughingCorpse.really.clean.txt",
        "AnitaBlake03CircusOfTheDamned.really.clean.tx",
        "AnitaBlake04LunaticCafe.really.clean.txt",
        "AnitaBlake05BloodyBones.really.clean.txt",
        "AnitaBlake06TheKillingDance.really.clean.txt",
        "AnitaBlake07BurntOfferings.really.clean.txt",
        "AnitaBlake08BlueMoon.really.clean.txt",
        "AnitaBlake09ObsidianButterfly.really.clean.txt",
        "AnitaBlake10NarcissusInChains.really.clean.txt",
        "AnitaBlake11CeruleanSins.really.clean.txt",
        "AnitaBlake12IncubusDreams.really.clean.txt",
        "AnitaBlake16BloodNoir.really.clean.txt",
        "AnitaBlake17SkinTrade.really.clean.txt",
        "AnitaBlake18Flirt.really.clean.txt"]
        
        for book in books:
            text += ''.join(open(os.path.join(os.path.dirname(__file__), '..', 'data', book)).read()) 
        
        # sentencify text
        sentences = re.split(r' *[.?!][\'")\]]* *[(\["]*', text)
        
        # cut out the first 17 proper sentences - dev and test
        sentences = sentences[17:]

        # wordify the sentences
        for i, sentence in enumerate(sentences):
            sentences[i] = re.findall(r"[\w']+|[.,!?;]", sentence)
        
        # train LM on corpus
        self.LM = LanguageModel(sentences)    
        
       
    def permutationTester(self, sentence):
        
        # generate all order permutations of words in the sentence  
        orig = sentence        
        sentences = list(itertools.permutations(orig, len(orig)))

        # score each sentence and pick the best
        max = [self.LM.score(orig), orig]
        for sentence in sentences:
            
            score = self.LM.score(sentence)
            if score > max[0]:
                max = [score, sentence]
              
        print ("\n Best Sentence:")
        print (max)
        return (max[1])
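permutationTester above scores every ordering of the clause with the trained LanguageModel, which grows factorially with clause length. A self-contained sketch of the same rerank-by-score idea, using a toy scoring function in place of the real LM (names and data are illustrative):

import itertools

def toy_score(words):
    # Stand-in scorer: reward orderings where "the" directly precedes a noun.
    return sum(1 for a, b in zip(words, words[1:]) if a == "the" and b in {"dog", "cat"})

def best_permutation(words, score=toy_score):
    best = (score(words), tuple(words))
    for cand in itertools.permutations(words):   # n! candidates: keep n small
        s = score(cand)
        if s > best[0]:
            best = (s, cand)
    return list(best[1])

print(best_permutation(["dog", "the", "barked"]))  # ['the', 'dog', 'barked']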