def create_unigram_count_db(lang, langmethod=lambda x: x, db="sqlite:///:memory:"):
    """Count the unigrams of one language column and store them in a table.

    Every row of the sentence table is read, the text of the selected
    language is passed through ``langmethod`` and split on whitespace, and
    the resulting tokens are counted.  The table ``lang<lang>unigram`` is
    dropped if it already exists, re-created, and filled with one row per
    distinct token.

    Args:
        lang: 1 to read ``lang1`` or 2 to read ``lang2`` of each sentence row.
        langmethod: Callable applied to the raw text before it is split.
        db: Database URL handed to ``create_engine``.

    Raises:
        ValueError: If ``lang`` is neither 1 nor 2.
    """
    # Reject an unsupported language before touching the database; otherwise
    # the table would be dropped and re-created and the failure would only
    # show up later as an unbound (or stale) token list inside the loop.
    if lang not in (1, 2):
        raise ValueError("lang must be 1 or 2, got {!r}".format(lang))
    column = "lang1" if lang == 1 else "lang2"

    engine = create_engine(db)
    # create session
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        # unigram table
        tablename = "lang{}unigram".format(lang)
        Sentence = Tables().get_sentence_table()
        Unigram = Tables().get_unigram_table(tablename)
        # start from an empty table so counts of earlier runs do not linger
        Unigram.__table__.drop(engine, checkfirst=True)
        Unigram.__table__.create(engine)

        ngram_dic = collections.defaultdict(int)
        for item in session.query(Sentence):
            words = langmethod(getattr(item, column)).split()
            # ngram(words, 1) is consumed as 1-tuples (unpacked below)
            for tpl in ngram(words, 1):
                ngram_dic[tpl] += 1

        # insert items
        for (first,), count in ngram_dic.items():
            print(u"inserting {}: {}".format(first, count))
            session.add(Unigram(first=first, count=count))
        session.commit()
    finally:
        # release the connection even when counting or inserting fails
        session.close()
def create_unigram_count_db(lang, langmethod=lambda x: x, db="sqlite:///:memory:"):
    """Count the unigrams of one language column and store them in a table.

    Every row of the sentence table is read, the text of the selected
    language is passed through ``langmethod`` and split on whitespace, and
    the resulting tokens are counted.  The table ``lang<lang>unigram`` is
    dropped if it already exists, re-created, and filled with one row per
    distinct token.

    Args:
        lang: 1 to read ``lang1`` or 2 to read ``lang2`` of each sentence row.
        langmethod: Callable applied to the raw text before it is split.
        db: Database URL handed to ``create_engine``.

    Raises:
        ValueError: If ``lang`` is neither 1 nor 2.
    """
    # Reject an unsupported language before touching the database; otherwise
    # the table would be dropped and re-created and the failure would only
    # show up later as an unbound (or stale) token list inside the loop.
    if lang not in (1, 2):
        raise ValueError("lang must be 1 or 2, got {!r}".format(lang))
    column = "lang1" if lang == 1 else "lang2"

    engine = create_engine(db)
    # create session
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        # unigram table
        tablename = "lang{}unigram".format(lang)
        Sentence = Tables().get_sentence_table()
        Unigram = Tables().get_unigram_table(tablename)
        # start from an empty table so counts of earlier runs do not linger
        Unigram.__table__.drop(engine, checkfirst=True)
        Unigram.__table__.create(engine)

        ngram_dic = collections.defaultdict(int)
        for item in session.query(Sentence):
            words = langmethod(getattr(item, column)).split()
            # ngram(words, 1) is consumed as 1-tuples (unpacked below)
            for tpl in ngram(words, 1):
                ngram_dic[tpl] += 1

        # insert items
        for (first,), count in ngram_dic.items():
            print(u"inserting {}: {}".format(first, count))
            session.add(Unigram(first=first, count=count))
        session.commit()
    finally:
        # release the connection even when counting or inserting fails
        session.close()
def test_ngram_3(self):
    """Trigrams of boundary-padded sentences equal the expected tuples."""
    cases = [
        (
            "I am teacher",
            [
                ("</s>", "<s>", "I"),
                ("<s>", "I", "am"),
                ("I", "am", "teacher"),
                ("am", "teacher", "</s>"),
            ],
        ),
        (
            "I am",
            [
                ("</s>", "<s>", "I"),
                ("<s>", "I", "am"),
                ("I", "am", "</s>"),
            ],
        ),
        (
            "I",
            [
                ("</s>", "<s>", "I"),
                ("<s>", "I", "</s>"),
            ],
        ),
        (
            "",
            [
                ("</s>", "<s>", "</s>"),
            ],
        ),
    ]
    for text, expected in cases:
        # pad each sentence with the same boundary markers before windowing
        padded = ["</s>", "<s>"] + text.split() + ["</s>"]
        trigrams = ngram(padded, 3)
        self.assertEqual(list(trigrams), expected)
def _create_ngram_count_db(lang, langmethod=lambda x: x, n=3, db="sqlite:///:memory:"):
    """Count the n-grams of one language column of the sentence table.

    Every row of the sentence table is read, the text of the selected
    language is passed through ``langmethod`` and split on whitespace, the
    token list is padded to ``["</s>", "<s>"] + tokens + ["</s>"]``, and the
    items produced by ``ngram(tokens, n)`` are counted.

    Args:
        lang: 1 to read ``lang1`` or 2 to read ``lang2`` of each sentence row.
        langmethod: Callable applied to the raw text before it is split.
        n: Order passed to ``ngram``.
        db: Database URL handed to ``create_engine``.

    Returns:
        A ``collections.defaultdict(float)`` mapping each item yielded by
        ``ngram`` to the number of times it occurred.

    Raises:
        ValueError: If ``lang`` is neither 1 nor 2.
    """
    # Fail early: with an unsupported language the loop below would hit an
    # unbound (or stale) token list instead of reporting the real problem.
    if lang not in (1, 2):
        raise ValueError("lang must be 1 or 2, got {!r}".format(lang))
    column = "lang1" if lang == 1 else "lang2"

    engine = create_engine(db)
    # create session
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        Sentence = Tables().get_sentence_table()
        # float counts are kept as in the original return type
        ngram_dic = collections.defaultdict(float)
        for item in session.query(Sentence):
            words = langmethod(getattr(item, column)).split()
            # add the boundary markers around every sentence
            words = ["</s>", "<s>"] + words + ["</s>"]
            for tpl in ngram(words, n):
                ngram_dic[tpl] += 1
    finally:
        # release the connection even when counting fails
        session.close()
    return ngram_dic
def test_ngram_3(self):
    """Check ngram(..., 3) on padded sentences, down to the empty sentence."""
    start, stop = "<s>", "</s>"
    raw_texts = ["I am teacher", "I am", "I", ""]
    expected_by_text = [
        [(stop, start, "I"), (start, "I", "am"), ("I", "am", "teacher"), ("am", "teacher", stop)],
        [(stop, start, "I"), (start, "I", "am"), ("I", "am", stop)],
        [(stop, start, "I"), (start, "I", stop)],
        [(stop, start, stop)],
    ]
    # every input is wrapped as: </s> <s> tokens... </s>
    padded_inputs = [[stop, start] + text.split() + [stop] for text in raw_texts]
    for tokens, expected in zip(padded_inputs, expected_by_text):
        produced = list(ngram(tokens, 3))
        self.assertEqual(produced, expected)