Пример #1
0
 def __init__(self):
     """Create the unigram, bigram and trigram models and their lambdas.

     The models are constructed in unigram, bigram, trigram order. The
     lambda coefficients are fixed at 0.25, 0.25 and 0.5 respectively.
     """
     models = (Unigram(), Bigram(), Trigram())
     self.unigram_model, self.bigram_model, self.trigram_model = models

     # Fixed coefficients, one per model, in the same order as above.
     lambdas = (.25, .25, .5)
     self.unigram_lambda, self.bigram_lambda, self.trigram_lambda = lambdas
Пример #2
0
def guess_language(entry):
    """Guess whether an Atom entry is English or French and tag it.

    The text of the entry's ``atom:title``, ``atom:summary`` and
    ``atom:content`` nodes is concatenated and parsed into a ``Trigram``
    profile, which is compared with the profiles returned by ``tri('fr')``
    and ``tri('en')``. The entry's ``xml:lang`` attribute is then set to
    ``u'en'`` or ``u'fr'``.

    Args:
        entry: Node exposing ``xml_xpath`` and ``xml_set_attribute``
            (presumably an Amara XML binding -- confirm against the caller).
    """
    # Every node's text is prefixed with a single space, exactly as the
    # original incremental concatenation produced it.
    text = u''.join(
        u' ' + child.__unicode__()
        for child in entry.xml_xpath(u'atom:title|atom:summary|atom:content')
    )
    t = Trigram()
    t.parseString(text)
    # NOTE(review): assumes ``profile - t`` is a distance (larger means less
    # similar), so a larger French distance selects English -- confirm
    # against Trigram.__sub__.
    if tri('fr') - t > tri('en') - t:
        lang = u'en'
    else:
        lang = u'fr'
    entry.xml_set_attribute((u'xml:lang', XML_NS), lang)
Пример #3
0
def best_match(query, items, min_score=0.5):
    """Return the index and score of the item that best matches *query*.

    Each truthy item is scored with ``Trigram(query).score(item)``; falsy
    items score 0. A score of exactly 1.0 ends the search immediately.

    Args:
        query: Text to match; a falsy query skips scoring entirely.
        items: Iterable of candidates.
        min_score: A candidate is only accepted when its score is strictly
            greater than this threshold.

    Returns:
        A ``(index, score)`` tuple. ``(None, 0)`` when the query is falsy or
        no candidate beats both ``min_score`` and the best score so far.
    """
    best_index, best_score = None, 0
    if not query:
        return (best_index, best_score)

    matcher = Trigram(query)
    for position, candidate in enumerate(items):
        current = matcher.score(candidate) if candidate else 0
        if current == 1.0:
            # Perfect match: no later candidate can do better.
            return (position, 1.0)
        if not (current > best_score and current > min_score):
            continue
        best_index, best_score = position, current
    return (best_index, best_score)
Пример #4
0
class Interpolation(LanguageModel):
    """Language model combining unigram, bigram and trigram models.

    Word probabilities are the lambda-weighted sum of the three component
    models' probabilities. The lambdas are fixed, arbitrarily chosen values.
    """

    def __init__(self):
        """Create the three component models and their fixed lambdas."""
        self.unigram_model = Unigram()
        self.bigram_model = Bigram()
        self.trigram_model = Trigram()
        # Arbitrary lambdas: 0.25 unigram, 0.25 bigram, 0.5 trigram.
        self.unigram_lambda = .25
        self.bigram_lambda = .25
        self.trigram_lambda = .5

    def train(self, trainingSentences):
        """Train every component model on the same *trainingSentences*."""
        for model in (self.unigram_model, self.bigram_model, self.trigram_model):
            model.train(trainingSentences)

    def getWordProbability(self, sentence, index):
        """Return the lambda-weighted sum of the three models' probabilities.

        Each component model is queried with the same *sentence* and *index*.
        """
        trigram_part = self.trigram_lambda * self.trigram_model.getWordProbability(sentence, index)
        bigram_part = self.bigram_lambda * self.bigram_model.getWordProbability(sentence, index)
        unigram_part = self.unigram_lambda * self.unigram_model.getWordProbability(sentence, index)
        return trigram_part + bigram_part + unigram_part

    def getVocabulary(self, context):
        """Return the vocabulary for *context*, taken from the trigram model.

        Per the original author's note, the vocabulary is the same for all
        three models, so it does not matter which one is asked.
        """
        return self.trigram_model.getVocabulary(context)

    def generateSentence(self, max_length=20):
        """Generate a sentence, picking a component model per word.

        For each word a uniform random number selects the model: the trigram
        model when it is at most ``trigram_lambda``, the bigram model when it
        is at most ``trigram_lambda + bigram_lambda``, otherwise the unigram
        model. These are the same lambdas used in ``getWordProbability``.

        Args:
            max_length: Maximum number of words to generate (default 20,
                the value that used to be hard-coded).

        Returns:
            The list of generated words. Generation stops early once
            ``LanguageModel.STOP`` is produced; that token is included.
        """
        sentence = []
        prev_previous = LanguageModel.START
        # The randomly chosen seed word only serves as context for the first
        # generated word; it is not itself appended to the sentence.
        previous = random.choice(list(self.trigram_model.word_count.keys()))
        bigram_cutoff = self.trigram_lambda + self.bigram_lambda
        for _ in range(max_length):
            model_choice = random.random()
            if model_choice <= self.trigram_lambda:
                word = self.trigram_model.generateWord(prev_previous, previous)
            elif model_choice <= bigram_cutoff:
                # NOTE(review): the bigram model is called via
                # ``generate_word`` while the other two models use
                # ``generateWord`` -- confirm the name against Bigram.
                word = self.bigram_model.generate_word(previous)
            else:
                word = self.unigram_model.generateWord()
            sentence.append(word)
            prev_previous = previous
            previous = word
            if word == LanguageModel.STOP:
                break
        return sentence
Пример #5
0
class TestTrigram(TestCase):
    """Tests for ``Trigram`` built from a short fixed sentence."""

    def setUp(self):
        """Build the Trigram fixture used by every test."""
        sample_text = "I wish I may I wish I might"
        self.trigram = Trigram(sample_text)

    def test_find_trigrams(self):
        """findTrigrams() maps each adjacent word pair to its followers."""
        expected = {
            "I wish": ["I", "I"],
            "wish I": ["may", "might"],
            "may I": ["wish"],
            "I may": ["I"],
        }
        self.assertEqual(expected, self.trigram.findTrigrams())
Пример #6
0
    def on_success(self, data):
        """Count the word trigrams of an incoming message in the database.

        Messages without a ``'text'`` key are ignored. Otherwise the text is
        split into lower-cased words, every run of three consecutive words is
        formed, and the matching ``Trigram`` record has its ``count``
        incremented (a new record is created when none exists).

        Args:
            data: Mapping describing the message; only ``data['text']`` is
                read.
        """
        if 'text' not in data:
            return

        print("analyzing '" + data['text'] + "'")
        # The pattern only captures words preceded by a space, so a word at
        # the very start of the text is not captured. The ``$`` alternative
        # yields an empty match, which is dropped below.
        matches = re.findall(r"$| ([a-zA-Z-']+)", data['text'].encode('utf-8'))
        ws = [match.lower() for match in matches if match != '']

        # Sliding window over three consecutive words.
        trigrams = list(zip(ws, ws[1:], ws[2:]))
        print("found trigrams: " + str(trigrams))

        for first, second, third in trigrams:
            pair_key = first + ',' + second
            try:
                t_rec = Trigram.get(pair_key, third)
            except DynamoDBKeyNotFoundError:
                # First occurrence of this trigram: start a fresh record.
                t_rec = Trigram()
                t_rec.w12 = pair_key
                t_rec.w3 = third
            t_rec.count += 1
            t_rec.save()
Пример #7
0
from unigram import Unigram
from bigram import Bigram
from trigram import Trigram

# The first line returned by read('input.txt') holds five space-separated
# fields: V, N, S_FACTOR, the training file and the test file.
inputs = read('input.txt')[0].strip().split(" ")
V = int(inputs[0])
N = int(inputs[1])
S_FACTOR = float(inputs[2])
TRAINING_FILE = inputs[3]
TEST_FILE = inputs[4]
OUTPUT_FILE_NAME = f"./results/trace_{V}_{N}_{S_FACTOR}.txt"

# Every model takes the same constructor arguments.
_model_args = (V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)

# Time the selected model's run. V == 3 takes precedence over N.
t1 = time()
if V == 3:
    print(f"BYOM: V = {V} n = 3 d = {S_FACTOR}")
    # NOTE(review): this rebinds the name BYOM from the class to its
    # instance, as the original did.
    BYOM = BYOM(*_model_args)
    BYOM.execute()
elif N == 1:
    print(f"unigram: V = {V} d = {S_FACTOR}")
    UNIGRAM = Unigram(*_model_args)
    UNIGRAM.execute()
elif N == 2:
    print(f"bigram: V = {V} d = {S_FACTOR}")
    BIGRAM = Bigram(*_model_args)
    BIGRAM.execute()
elif N == 3:
    print(f"trigram: V = {V} d = {S_FACTOR}")
    TRIGRAM = Trigram(*_model_args)
    TRIGRAM.execute()
# Any other N runs no model; only the elapsed time is printed.
t2 = time()

print(f"execution time: {t2 - t1}s")
Пример #8
0
# Set the region and the AWS credentials (read from the settings module) on
# a ConnectionBorg instance.
# NOTE(review): presumably this is dynamodb-mapper's shared connection
# object used by Trigram -- confirm against the imports.
cb = ConnectionBorg()
cb.set_region('eu-west-1')
cb.set_credentials(settings.aws_access_key_id, settings.aws_secret_access_key)

# Twython client built from the application and OAuth credentials in the
# settings module.
tw = Twython(settings.app_key,
        settings.app_secret,
        settings.oauth_token,
        settings.oauth_token_secret)

while True:
    try:
        tweet = ''
        last_word = ''

        # get all digrams in db
        ts = Trigram.scan()
        # count total number of digrams on the db
        total_count = 0
        for t in ts:
            total_count += t.count
        # pick a random digram
        r = randint(1, total_count)
        print 'r = '+str(r)
        ts = Trigram.scan()
        csum = 0
        for t in ts:
            csum += t.count
            print 'csum = '+str(csum)
            if r <= csum:
                tweet += ' '.join(t.w12.split(','))
                last_pair = (t.w12.split(',')[1], t.w3)
Пример #9
0
 def setUp(self):
     """Build the Trigram fixture from a fixed sample sentence."""
     sample_text = "I wish I may I wish I might"
     self.trigram = Trigram(sample_text)
Пример #10
0
def main():
    """Build a Trigram from ``argv[1]`` and dump it to the file ``argv[2]``.

    The Trigram is constructed before the output file is opened, so a
    failure while building it does not create or truncate the output file.
    """
    tri = Trigram(argv[1])
    # The context manager closes the file even when dump() raises, which the
    # original explicit close() did not guarantee.
    # NOTE(review): if ``dump`` is pickle.dump under Python 3, the file must
    # be opened in binary mode ('wb') -- confirm against the imports.
    with open(argv[2], 'w') as out:
        dump(tri, out)
Пример #11
0
from __future__ import print_function
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from trigram import Trigram
import random

# Chance that a given middle token is considered for replacement.
replace_rate = 0.2

if __name__ == '__main__':
    # Read the corpus through a context manager so the file handle is closed
    # as soon as its content has been loaded.
    with open('temp/positive.review') as review_file:
        markup = review_file.read()
    reviews = BeautifulSoup(markup, 'lxml').findAll('review_text')
    documents = [review.text for review in reviews]
    print("Data Loaded")

    model = Trigram()
    model.fit(documents)

    # Pick one review of fewer than 30 whitespace-separated words, uniformly
    # at random. Filtering first avoids the endless retry loop the original
    # fell into when no review was that short; random.choice raises
    # IndexError in that case instead.
    short_documents = [doc for doc in documents if len(doc.split()) < 30]
    review = random.choice(short_documents)
    string = review.lower().strip()
    print(string, end='\n\n')

    tokens = word_tokenize(string)
    # Index-based on purpose: a replaced middle token becomes part of the
    # (left, right) keys examined at later positions.
    for i in range(len(tokens) - 2):
        if random.random() < replace_rate:
            key = (tokens[i], tokens[i + 2])
            if key in model.trigram2proba:
                next_word = model.predict(key)
                tokens[i + 1] = next_word
Пример #12
0
from twitter import Twitter
from trigram import Trigram
from corpus import create_corpus, reconstruct_corpus

if __name__ == "__main__":
    """Main program"""

    tw = Twitter()
    corpus = tw.fecth_tweets(5000, wait=True)
    create_corpus(corpus, 'tweets')

    corpus = reconstruct_corpus('tweets.pickle')

    trigram = Trigram(corpus=corpus)

    print('以下の単語群から文章を開始することができます')
    for usage in sorted(list(trigram.usage_)):
        print(usage, end=' / ')

    try:
        while True:
            word = input('\n開始単語を1語入力してください:')
            sentence = trigram.generate(word, 0.08)
            print(sentence)
            print('============================')
            tweet = input('ツイートしますか? (Y/n)')
            if tweet == 'Y':
                tw.update_tweet(sentence)

            input('プログラム終了次は`Ctrl+C`を押してください\n\
            続ける場合はそのほかのキーを押してください')
Пример #13
0
import time
from trigram import Trigram

app = Flask(__name__, static_url_path='/static')

# Collect the first tab-separated column of every line of the word file.
word_list = []
AoA = []
with open('word_search.tsv', 'r') as tsv:
    AoA.extend(row.strip().partition('\t')[0] for row in tsv)

# An earlier trie-based index (TrieNode) is disabled; every word is added to
# a Trigram index instead.
trig = Trigram()
for word in AoA:
    trig.add(word)


@app.route('/')
def hello_world():
    """Serve the root URL by rendering the ``index.html`` template."""
    landing_page = render_template('index.html')
    return landing_page


@app.route('/search')
def search1():
    query = request.args.get('query')
    print("getting search results")
    start = time.time()
    result = trig.search(query)