def highlight(text, language="en", iknow=None):
    """Print *text* to stdout with per-entity colouring from an iKnow index.

    The text is indexed with ``iknow.index(text, language)``; every entity of
    every sentence in ``iknow.m_index['sentences']`` is then printed, followed
    by a single space, using a colorama foreground colour and style:

    * colour: ``Fore.RED`` for entities covered by a "Negation" path
      attribute, ``Fore.CYAN`` for "Certainty", ``Fore.BLACK`` otherwise.
      When several attributes cover the same entity, the last one wins.
    * style: ``Style.BRIGHT`` for 'Concept' entities, ``Style.DIM`` for
      'NonRelevant' / 'PathRelevant' entities, ``Style.NORMAL`` otherwise.

    Parameters
    --------------
    text (str) - the text to index and print
    language (str) - language code handed to the engine (default "en")
    iknow - engine object exposing ``index()`` and ``m_index``; when omitted a
        new ``iknowpy.iKnowEngine`` is created for this call. Callers that
        highlight many texts should pass one engine and reuse it.

    Returns
    --------------
    None; the result is written to stdout.
    """
    # The engine used to be built in the signature, i.e. once at definition
    # time: importing the module constructed an engine and every default call
    # shared that single stateful object. Build it lazily instead.
    if iknow is None:
        iknow = iknowpy.iKnowEngine()
    iknow.index(text, language)

    attribute_colours = {"Negation": Fore.RED, "Certainty": Fore.CYAN}

    for s in iknow.m_index['sentences']:
        # First work out which entities fall inside a negation/certainty span.
        # Path attributes are expressed as positions within s['path'], which
        # in turn keys into the s['entities'] array.
        entity_colours = {}
        for a in s['path_attributes']:
            tint = attribute_colours.get(a['type'])
            if tint is None:
                continue
            first = s['path'][a['pos']]
            last = s['path'][a['pos'] + a['span'] - 1]
            for ent in range(first, last + 1):
                entity_colours[ent] = tint

        for position, e in enumerate(s['entities']):
            colour = entity_colours.get(position, Fore.BLACK)
            if e['type'] in ('NonRelevant', 'PathRelevant'):
                style = Style.DIM
            elif e['type'] == 'Concept':
                style = Style.BRIGHT
            else:
                style = Style.NORMAL
            print(colour + style + text[e['offset_start']:e['offset_stop']], end=' ')
        print("\n")
def __init__(self, corpus_path, use_iknow_entities, tokenize_concepts):
    """Record the corpus location and the iKnow-related options.

    Parameters
    --------------
    corpus_path (str) - path of the corpus; ``self.is_dir`` records whether it
        is a directory according to ``os.path.isdir``
    use_iknow_entities (bool) - whether iKnow entities are to be used; an
        ``iknowpy.iKnowEngine`` is only created when this is truthy
    tokenize_concepts (bool) - stored as given on ``self.tokenize_concepts``
    """
    self.corpus_path = corpus_path
    self.is_dir = os.path.isdir(corpus_path)
    # Always define these attributes. Previously they were only assigned when
    # use_iknow_entities was truthy, so reading self.use_iknow_entities on an
    # instance created with False raised AttributeError.
    self.use_iknow_entities = use_iknow_entities
    self.tokenize_concepts = tokenize_concepts
    # The engine is only needed for entity-based processing.
    self.engine = iknowpy.iKnowEngine() if use_iknow_entities else None
def synonym_dict_from_file(self, source_text, use_iknow_entities=True, num_similar=5):
    """
    Uses currently loaded model to determine a dictionary of synonyms for each
    word or entity in a provided text file.

    Parameters
    --------------
    source_text (str) - The path to a file containing the source text

    use_iknow_entities (bool) - whether to find synonyms for iKnow entities
    (as opposed to words)

    num_similar (int) - Number of similar words that will be returned for each
    term in the source text (if exist). Higher num_similar ~ less strict
    similarity, lower num_similar ~ more strict similarity

    Returns
    --------------
    a dictionary of synonyms for each entity or word in the source. Each value
    is the result of ``self.most_similar`` for that term (wrapped in a
    one-element list when num_similar == 1). Terms for which
    ``self.most_similar`` raises KeyError are left out.

    NOTE: Right now, using iKnow entities will only check for synonyms of the
    iKnow entities, not for their individual components. So it is one or the
    other.
    """
    dictionary = {}

    def add_term(term):
        # Look each distinct term up once; terms most_similar does not know
        # (KeyError) are skipped on purpose.
        if term in dictionary:
            return
        try:
            similar = self.most_similar(term, num_similar=num_similar)
        except KeyError:
            return
        dictionary[term] = [similar] if num_similar == 1 else similar

    if use_iknow_entities:
        # index the source with iknow entities
        engine = iknowpy.iKnowEngine()
        with open(source_text, 'r') as source:
            for line in source:
                # Process each line's index right after indexing it; the
                # sentences are read back from engine.m_index per line.
                engine.index(line, 'en')
                for s in engine.m_index['sentences']:
                    for e in s['entities']:
                        if e['type'] in ('PathRelevant', 'NonRelevant'):
                            continue
                        add_term(e['index'])
    else:
        # use words instead of entities
        with open(source_text, 'r') as source:
            for line in source:
                # split() rather than split(' '): the latter left the newline
                # glued to the last word of every line (so that word was never
                # found) and produced empty tokens for repeated spaces.
                for word in line.split():
                    add_term(word)

    return dictionary
def strip_negation(text, language="en", iknow=None):
    """Return *text* with every entity inside a negation span removed.

    The text is indexed with ``iknow.index(text, language)``. For each
    sentence in ``iknow.m_index['sentences']``, entities covered by a path
    attribute of type "Negation" are dropped; every remaining entity's source
    slice (``text[offset_start:offset_stop]``) is emitted followed by one
    space, so a non-empty result ends with a trailing space.

    Parameters
    --------------
    text (str) - the text to process
    language (str) - language code handed to the engine (default "en")
    iknow - engine object exposing ``index()`` and ``m_index``; when omitted a
        new ``iknowpy.iKnowEngine`` is created for this call. Callers that
        process many texts should pass one engine and reuse it.

    Returns
    --------------
    str - the surviving entities joined as described above ("" when the index
    contains no sentences). The engine's index entries are not modified.
    """
    # The engine used to be built in the signature, i.e. once at definition
    # time: importing the module constructed an engine and every default call
    # shared that single stateful object. Build it lazily instead.
    if iknow is None:
        iknow = iknowpy.iKnowEngine()
    iknow.index(text, language)

    kept = []
    for s in iknow.m_index['sentences']:
        # First figure out which entities lie inside a negation span.
        # Path attributes are expressed as positions within s['path'], which
        # in turn keys into the s['entities'] array.
        negated = set()
        for a in s['path_attributes']:
            if a['type'] != "Negation":
                continue
            first = s['path'][a['pos']]
            last = s['path'][a['pos'] + a['span'] - 1]
            negated.update(range(first, last + 1))

        kept.extend(
            text[e['offset_start']:e['offset_stop']]
            for position, e in enumerate(s['entities'])
            if position not in negated
        )

    # One trailing space per kept entity, exactly as the concatenating loop did.
    return "".join(piece + " " for piece in kept)
def collect_files_recursive(in_path_par):
    """Append the path of every ``.txt`` file under *in_path_par* to ``f_rec``.

    The directory tree is visited top-down (a directory's own files before
    those of its subdirectories) and symbolic links to directories are
    followed. ``f_rec`` is a list defined outside this function; nothing is
    returned.

    Parameters
    --------------
    in_path_par (str) - root directory to scan, with or without a trailing
        path separator
    """
    # Function-scope import so this block does not depend on how the file
    # imports ``os`` (the previous code relied on a bare ``walk`` name).
    import os

    # os.walk already descends into subdirectories, so no manual recursion is
    # needed. os.path.join replaces plain string concatenation, which only
    # produced valid paths when in_path_par ended with "/".
    for dirpath, _dirnames, filenames in os.walk(in_path_par, followlinks=True):
        for single_file in filenames:
            if single_file.endswith('.txt'):
                f_rec.append(os.path.join(dirpath, single_file))


collect_files_recursive(in_path_par)

engine = iknowpy.iKnowEngine()


def read_udct_file(file_, udct_):
    # NOTE(review): this definition is cut off in the reviewed chunk; only its
    # opening statements are visible and they are reproduced unchanged. The
    # nesting below is reconstructed from a whitespace-collapsed line - confirm
    # against the complete source before relying on it.
    f_udct = open(file_, "r", True, "utf8")
    for txt_line in f_udct:
        # print('txt_line: ' + txt_line)
        txt_line = txt_line.rstrip()
        # Only comma-separated lines that do not start with "/*" are parsed.
        if ',' in txt_line and txt_line[0:2] != '/*':
            txt_list = txt_line.split(',')
            lexrep, action = txt_list[0], txt_list[1]
            if (lexrep[0] == '@'):
                literal = lexrep[1:]
                if action == "UDCertainty":
                    level = txt_list[2]
# NOTE(review): the next five lines are the tail of a function whose header
# lies before this chunk (presumably strip_negation - confirm). They are kept
# unchanged, indented as they would be inside that function.
        for e in s['entities']:
            # entities carrying a "neg" key are left out of the result
            if "neg" in e:
                continue
            stripped += text[e['offset_start']:e['offset_stop']] + " "
    return stripped


# command-line processing
import sys, glob

# The language defaults to "en"; an optional second argument overrides it.
lang = "en"
if len(sys.argv) > 2:
    lang = sys.argv[2]

# A single engine instance is created here and passed to every call below.
iknow = iknowpy.iKnowEngine()

# read file pattern argument and process the contents, writing directly to stdout (for piping)
# note that file patterns need to be wrapped in quotes or they will be "applied" before this hits python
# usage:
#   $ python strip_negation.py test.txt
#   $ python strip_negation.py test.txt "fr"
#   $ python strip_negation.py "*.txt" | grep fix
# NOTE(review): sys.argv[1] is read unconditionally, so running the script
# without arguments raises IndexError.
for path in glob.glob(sys.argv[1]):
    with open(path, 'r') as file:
        # each input line is processed and printed independently
        for line in file:
            print(strip_negation(line, lang, iknow))

#
# variation: this reads piped text straight from stdin
# usage