def analize_text(text: str, *, exact_words: bool = False) -> tuple:
    """Tokenize, POS-tag, (optionally) lemmatize and chunk *text*.

    Parameters
    ----------
    text : str
        Raw text to analyse.
    exact_words : bool, keyword-only
        When True, skip lemmatization so searches match exact surface forms.

    Returns
    -------
    tuple
        ``(*setup_search_structure(...), sentences)`` on success, or an
        empty tuple when no sentences were found (a message is printed).
    """
    sentences = array(split_into_sentences(text, True))
    # array-like container: test emptiness via len(), not plain truthiness
    if not len(sentences):
        print("Nothing found")
        # BUG FIX: the annotation promises a tuple, but the original
        # returned a list [] here; return an empty tuple for consistency.
        return ()
    tags = pos_tag_sents(map(word_tokenize, sentences))
    # Lemmatize unless the caller asked for exact surface-form matching.
    lemmatized = tags if exact_words else lemmatize_sents(tags)
    # Chunk grammar: AC = numeric/range expressions, AN = proper-noun
    # phrases (determiners chinked out), PH = any remaining tagged run
    # with determiners/conjunctions/pronouns chinked out.
    chunker = RegexpParser("AC: {(<CD>?<TO|IN>?<CD>)+}\n "
                           "AN: {(<NPP>+<DT|NPP|JJ>*)+}\n "
                           "}<DT>+{\n "
                           "PH: {<[B-Z]+>+}\n "
                           "}<DT|CC|PRP|EX|WDT>+{")
    chunked = list(chunker.parse_sents(lemmatized))
    return (*setup_search_structure(chunked, tuple), sentences)
def main():
    """Interactive driver: read ``test.txt``, prompt for a search term,
    analyse the text and optionally print the matching sentences."""
    with open("test.txt", 'r', encoding="utf-8") as f:
        text = f.read()

    # NOTE(review): a large `if (False):` block was removed here — it
    # duplicated the analysis pipeline of analize_text plus a debug loop
    # dumping sentences/tags/lemmas/chunks, and was unreachable dead code.

    ### Search params
    to_search = input("Search: ") or "work"
    # Map the menu choice to a WordNet-style POS letter; None when the
    # user presses enter or types anything else.
    tag = {
        '1': 'n',
        '2': 'v',
        '3': 'a',
        '4': 'r'
    }.get(
        input(f"\nWhat '{to_search}'?\n"
              "[1]: Noun\n"
              "[2]: Verb\n"
              "[3]: Adjective\n"
              "[4]: Adverb\n\n"
              "> "), None)
    syn = 'y' in input("\nFind related words too? ").lower()
    exact = 'y' in input("\nFind exact word? ").lower()
    print()

    _, ph_num_ls, sentences = analize_text(text, exact_words=exact)

    if to_search:
        # Optionally expand the query with related words (synonyms etc.).
        w_rel = words_related(to_search, tag) if syn else to_search
        ph_nums = find(w_rel, ph_num_ls)
        print()
        if not len(ph_nums):
            print(f"{to_search} not in text.")
            exit(0)
        # (Two unreachable `if (False):` timeit benchmark blocks removed,
        # along with the `num`/`num2` locals only they referenced.)
        if "y" in input("Show found instances?[No] "):
            # Imported lazily so colorama is only required when coloring.
            from colorama import init as color_init
            color_init()
            print()
            if ph_nums is not None:  # Unnecessary, but clean
                for ph in ph_nums:
                    print(_color_sent(sentences[ph], w_rel))
                    print()
    else:
        print("You did not specify any search param")