Пример #1
0
def main():
    """Collect character n-grams of the words around known names.

    The corpus is loaded with ``get_corpus`` (passing ``sys.argv[1]`` when a
    command-line argument is given) and split into phrases on blank lines and
    on ``.``, ``;``, ``?``, ``!``.  Each phrase is split into words, and every
    triple of consecutive words whose middle word (lower-cased) is in
    ``get_names()`` contributes the 3-, 4- and 5-grams of its previous and
    next words -- but only for words longer than three characters.

    The result is written to ``ngrams.dat``: the first element of every entry
    returned by ``top`` for the "previous word" counters, a ``--`` separator
    line, then the same for the "next word" counters.
    """
    triplewise = ngrams_gen(3)
    if len(sys.argv) > 1:
        content = get_corpus(sys.argv[1])
    else:
        content = get_corpus()
    names = get_names()

    sizes = (3, 4, 5)
    # One n-gram generator per size, built once instead of twice per match.
    # NOTE(review): assumes ngrams_gen(n) returns a reusable callable, as the
    # reuse of ``triplewise`` across phrases already relies on.
    splitters = [ngrams_gen(size) for size in sizes]
    prevs = [arity_dict() for _ in sizes]
    nexts = [arity_dict() for _ in sizes]

    # The hyphen is escaped on purpose: the previous class "[ ,-^]" formed a
    # range from "," to "^", which also split on digits, "A"-"Z" and most
    # ASCII punctuation, breaking capitalised words apart.
    word_sep = re.compile(r"[ ,\-^]")

    for phrase in re.split(r"\n\n|[.;?!]", content):
        words = no_empties(word_sep.split(phrase))
        for prev_word, middle, next_word in triplewise(words):
            if middle.lower() not in names:
                continue
            # ``before``/``after`` avoid shadowing the builtin ``next``.
            for before, after, ngrams in zip(prevs, nexts, splitters):
                if len(prev_word) > 3:
                    before.count(ngrams(prev_word.lower()))
                if len(next_word) > 3:
                    after.count(ngrams(next_word.lower()))

    with open("ngrams.dat", "w") as f:
        for before in prevs:
            for line in top(before):
                print >> f, line[0].encode("utf-8")
        print >> f, "--"
        for after in nexts:
            for line in top(after):
                print >> f, line[0].encode("utf-8")
Пример #2
0
def check_ngrams(text, ngrams_set=None):
    """Return the positions of items in *text* that contain a known n-gram.

    *text* is enumerated item by item; each item is measured with ``len`` and
    sliced.  For every size in 3, 4, 5 that the item is long enough for:

    * the position is recorded once if the item's first ``size`` characters
      paired with ``START``, or its last ``size`` characters paired with
      ``END``, are in *ngrams_set*;
    * the position is recorded once more for every n-gram of the item with
      its first and last character removed that, paired with ``MIDDLE``, is
      in *ngrams_set*.

    The same position can therefore appear several times in the result.
    When *ngrams_set* is ``None`` it is loaded with ``retrieve_ngrams()``.
    """
    known = retrieve_ngrams() if ngrams_set is None else ngrams_set
    hits = []
    for index, token in enumerate(text):
        for size in (3, 4, 5):
            split = ngrams_gen(size)
            if len(token) < size:
                continue
            if (token[:size], START) in known or (token[-size:], END) in known:
                hits.append(index)
            # One entry per matching inner n-gram, duplicates included.
            hits.extend(
                index
                for inner in split(token[1:-1])
                if (inner, MIDDLE) in known
            )
    return hits