def genia_tokenizer(self):
    '''Tokenize pair text with the GENIA tagger.'''
    tagger = GeniaTagger('./tools/geniatagger-3.0.2/geniatagger')
    with open('./chemprot_test_gs/new_testing_examples.json', 'r') as f:
        training_examples = json.load(f)
    # print(len(training_examples))
    for i in training_examples:
        tokenized_tuple = tagger.parse(i['sentence'])
        token_list = []
        for output in tokenized_tuple:
            # skip bare punctuation tokens
            if output[0] in string.punctuation:
                continue
            pos = output[2]
            # skip tokens whose surface form equals their POS tag
            if output[0] == pos:
                continue
            if output[0].endswith('..'):
                token = output[0][:-2]
            elif pos == 'CD':
                # mask cardinal numbers with a placeholder
                token = 'NUM'
            else:
                token = output[0]
            token_list.append(token)
        i['sentence'] = ' '.join(token_list)
    # with open('./chemprot_training/train_tokenized.json', 'w+') as j:
    with open('./chemprot_test_gs/testing_tokenized.json', 'w+') as j:
        json.dump(training_examples, j, indent=4)
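# Usage sketch for the tokenizer above (the example sentence is made up; the
# tagger path is copied from the method). GeniaTagger.parse returns one
# (word, base_form, pos_tag, chunk_tag, ne_tag) tuple per token, so output[0]
# is the surface token and output[2] the POS tag tested above.
demo_tagger = GeniaTagger('./tools/geniatagger-3.0.2/geniatagger')
for word, base, pos, chunk, ne in demo_tagger.parse('Aspirin inhibits 2 enzymes.'):
    print(word, pos)  # '2' carries the 'CD' tag that genia_tokenizer masks to 'NUM'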
def load_tagger(fp: Path):
    """Load the GENIA Tagger.

    :param fp: file path of the GENIA Tagger executable
    :return: a GeniaTagger instance
    """
    tagger = GeniaTagger(fp)
    return tagger
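# A minimal usage sketch (the path below is an assumption; point it at your
# own GENIA Tagger build):
from pathlib import Path

tagger = load_tagger(Path('./geniatagger-3.0.2/geniatagger'))
parsed = tagger.parse('p53 regulates apoptosis.')  # list of (word, base, POS, chunk, NE) tuples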
def loadParsingTools():
    global tagger, sentencesplitter
    # Get the locations of geniatagger and lingpipe relative to this script
    scriptPath = os.path.dirname(os.path.realpath(__file__))
    geniaPath = os.path.join(scriptPath, "../../Geniatagger/geniatagger-3.0.1/geniatagger")
    lingpipePath = os.path.join(scriptPath, "../../Lingpipe/LingpipeSentenceSplitter/run.sh")
    # Check they are there
    if not os.path.isfile(geniaPath):
        raise RuntimeError("Cannot access GeniaTagger. Tried: " + geniaPath)
    elif not os.path.isfile(lingpipePath):
        raise RuntimeError("Cannot access LingPipe. Tried: " + lingpipePath)
    tagger = GeniaTagger(geniaPath)
    sentencesplitter = LingPipe(lingpipePath)
def __init__(self, model, topn, alpha, tagger, complex_freq, simple_freq,
             freq_t, char_ngram):
    logger.info("Instantiating Simple Science Simplifier...")
    self.model = unpickle(model)
    logger.info("Loaded embeddings model from: `{}`".format(model))
    self.topn = topn
    self.alpha = alpha
    self.tagger = GeniaTagger(tagger)
    logger.info("Loaded Genia PoS tagger from: `{}`".format(tagger))
    self.complex_freq = unpickle(complex_freq)
    logger.info("Loaded Complex Word Frequencies from: `{}`".format(complex_freq))
    self.simple_freq = unpickle(simple_freq)
    logger.info("Loaded Simple Word Frequencies from: `{}`".format(simple_freq))
    self.freq_t = freq_t
    self.char_ngram = char_ngram
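# Hypothetical instantiation (the class name and all argument values are
# assumptions for illustration; the signature is the one defined above):
#
#   simplifier = SimpleScienceSimplifier(
#       model='embeddings.pkl', topn=10, alpha=0.5,
#       tagger='./geniatagger-3.0.2/geniatagger',
#       complex_freq='complex_freq.pkl', simple_freq='simple_freq.pkl',
#       freq_t=1e-5, char_ngram=4)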
from geniatagger import GeniaTagger
import codecs
import sys

tagger = GeniaTagger('/Users/ruichen/DL/geniatagger-3.0.2/geniatagger')
read_dir = '/Users/ruichen/DL/bio_de_mt/pubmed_en_fr_separate/'
en_fn = read_dir + 'pubmed_en.txt'
fr_fn = read_dir + 'pubmed_fr.txt'

# UTF8Writer = codecs.getwriter('utf8')
# sys.stdout = UTF8Writer(sys.stdout)


def decode_list(l):
    # decode each element of each byte-string tuple to unicode
    return [tuple(word.decode('utf-8') for word in tp) for tp in l]

# sys.stdout = codecs.getwriter('utf8')(sys.stdout)


def tag():
    out_fn = "test.txt"
    # out_fn = "genia_raw_tag_pubmed_en_fr.txt"
    out_file = codecs.open(out_fn, encoding='utf-8', mode='w+')
    with codecs.open(en_fn, encoding='utf-8') as ef:
        with codecs.open(fr_fn, encoding='utf-8') as ff:
            for en, fr in zip(ef, ff):
                # parse_result = tagger.parse(line.encode('utf-8'))
                # print parse_result
                pass  # loop body truncated in the source snippet
from nltk.corpus import sentiwordnet as swn
from nltk.wsd import lesk
from pywsd.lesk import simple_lesk
from geniatagger import GeniaTagger
import pandas as pd
import nltk
from general_functions import *

geni = GeniaTagger('/home/aoguntuga/myVirtualEnvs/sent2/geniatagger/geniatagger')

xl_file = pd.ExcelFile("/home/aoguntuga/myVirtualEnvs/sent2/Test/Test Cases.xlsx")
sent_df = xl_file.parse('Sheet1')
sentences = list(sent_df['Sentence'].values)

polarity_list_1 = []
polarity_list_2 = []
for s in sentences:
    syn1 = return_synset_list_1(s)
    syn2 = return_synset_list_2(s)
    p1 = polarity_score(syn1)
    p2 = polarity_score(syn2)
    polarity_list_1.append(p1)
    polarity_list_2.append(p2)

sent_df['lesk_wsd_polarity'] = polarity_list_1
sent_df['simplelesk_wsd_polarity'] = polarity_list_2
sent_df.to_excel("output_path.xlsx")
from sklearn.cross_validation import KFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
import numpy as np
import sklearn as sk
import csv
import re
import pickle

from geniatagger import GeniaTagger
from nltk.tokenize import WordPunctTokenizer
from cnn_train import *

tagger = GeniaTagger(
    '/home/sunilnew/python_packages/geniatagger-3.0.2/geniatagger')
tokenizer = WordPunctTokenizer()


def preProcess(sent):
    # lowercase, tokenize, mask digits, then re-tokenize with the GENIA tagger
    sent = sent.lower()
    sent = tokenizer.tokenize(sent)
    sent = ' '.join(sent)
    sent = re.sub(r'\d', 'dg', sent)
    sent_list, _, _, _, _ = zip(*tagger.parse(sent))
    sent = ' '.join(sent_list)
    return sent


def find_sub_list(sl, l):
    sll = len(sl)
    for ind in (i for i, e in enumerate(l) if e == sl[0]):
        if l[ind:ind + sll] == sl:
            # assumption: the truncated original returned the match position;
            # start/end indices of the first occurrence are returned here
            return ind, ind + sll - 1
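# A short demo of preProcess (hypothetical sentence; the exact token stream
# depends on how the GENIA tagger re-tokenizes the digit-masked text):
if __name__ == '__main__':
    print(preProcess('Aspirin 100 mg inhibits COX-2 activity.'))
    # roughly: 'aspirin dgdgdg mg inhibits cox - dg activity .'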
from sklearn.feature_extraction import DictVectorizer
from sklearn import datasets
from sklearn import svm
from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from nltk.tokenize import WordPunctTokenizer
from sklearn.metrics import classification_report
import nltk
import numpy as np
import re
import pickle
from geniatagger import GeniaTagger

tagger = GeniaTagger('/home/desh/geniatagger-3.0.2/geniatagger')
tokenizer = WordPunctTokenizer()


def preProcess(sent):
    # mask integers and decimals with 'num', tokenize, then re-tokenize
    # with the GENIA tagger
    sent = re.sub(r"[-+]?\d*\.\d+|\d+", "num", sent.lower())
    sent = tokenizer.tokenize(sent)
    sent = ' '.join(sent)
    sent_list, _, _, _, _ = zip(*tagger.parse(sent))
    sent = ' '.join(sent_list)
    return sent


def find_sub_list(sl, l):
    sll = len(sl)
import glob
import itertools
import pickle
from geniatagger import GeniaTagger

tagger = GeniaTagger('/home/asada.13003/ddi_cnn/geniatagger-3.0.2/geniatagger')

train_path = 'Divide/train/*.ann'
dev_path = 'Divide/dev/*.ann'
p_tr_path = 'Pickle/train'
p_dev_path = 'Pickle/dev'


def sent_label_pe(path):
    sent = []
    label = []
    y = []
    y_minus = []
    id = []
    for i in glob.glob(path):
        # each .ann annotation file is paired with the .txt it annotates
        f = open(i, 'r')
        f_txt = open(i.replace('Divide', 'Brat').replace('.ann', '.txt'), 'r')
        sentID = i.split('/')[-1].replace('.ann', '')
        line = f.readlines()
        text = f_txt.read()
        entity = []
def loadParsingTools():
    global tagger, lingpipe
    # Currently point to scripts. TODO: use $PATH instead to find them
    tagger = GeniaTagger('/home/jlever/apps/geniatagger-3.0.1/geniatagger')
    lingpipe = LingPipe('/projects/jlever/megaTextProject/nounphrasePipeline/lingpipeSentenceSplitter/run.sh')
import os
import nltk
from nltk import *
from nltk.corpus import *
import string
from nltk.corpus import brown
from geniatagger import GeniaTagger

#######################################################################
corpusDir = ["/GI_TAGGING/"]
corpusRootList = [os.getcwd() + directory for directory in corpusDir]
codec = "utf8"

tagger = GeniaTagger("/usr/local/geniatagger/geniatagger")

# Unigram/bigram backoff chain trained on the Brown corpus (treebank tagset)
train_sents = brown.tagged_sents(categories="learned", tagset="treebank")
t0 = nltk.DefaultTagger("UNK")
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)

# Genia + Backoff
for corpusRoot in corpusRootList:
    corpusReader = PlaintextCorpusReader(corpusRoot, ".*.txt", encoding=codec)
    outFile = open(corpusRoot + "genia_and_backoff.txt", "w")
    for journal in corpusReader.fileids():
        pass  # per-file tagging loop truncated in the source snippet
from nltk.stem.lancaster import LancasterStemmer
from collections import Counter, defaultdict
import geniatagger
from geniatagger import GeniaTagger
from nltk import word_tokenize, pos_tag
from nltk.tokenize import sent_tokenize
import requests
import json
from flask import Flask, render_template  # needed for the app and route below
from linggle_api import Linggle
from classify_error_type import *

GEC_API = 'https://whisky.nlplab.cc/translate/?text={}'

app = Flask(__name__)
tagger = GeniaTagger('/home/nlplab/yeema/geniataggerPython/geniatagger-3.0.2/geniatagger', ['-nt'])
ling = Linggle()

dictWord = defaultdict(lambda: defaultdict(list))
phraseV = defaultdict(lambda: defaultdict(list))
dictPhrase = defaultdict(lambda: defaultdict(list))
dictDef = defaultdict(lambda: defaultdict(list))
miniparCol = defaultdict(lambda: defaultdict(lambda: Counter()))
pw = defaultdict(lambda: defaultdict(lambda: Counter()))
pw_ratio = defaultdict(lambda: defaultdict(lambda: Counter()))
LCE = eval(open('/home/nlplab/yeema/ErrorExplaination/LCE.json').read())
dictSimilar = defaultdict()


@app.route('/')
def index():
    return render_template('template.html')
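# Assumed entry point for local development (host/port/debug are assumptions,
# not taken from the original snippet):
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=True)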
import os

from nltk.tokenize import PunktSentenceTokenizer
from geniatagger import GeniaTagger

# expand '~' explicitly; it is a shell feature, not expanded when the tagger
# binary is launched directly
tagger = GeniaTagger(os.path.expanduser('~/qwerty/shashank/geniatagger-3.0.2/geniatagger'))
print(tagger.parse('This is a pen.'))
# print(tagger.parse('tis is pen'))
# print(data)

# train_data (raw training text for the Punkt model) must be defined elsewhere
med_tokenizer = PunktSentenceTokenizer(train_data)
def annotate_text(tager=''):
    genia = GeniaTagger('../genia-tagger/geniatagger-3.0.2/geniatagger')
    medpost = spacy.load(os.path.abspath('trained_tagger'))
    stanford = StanfordCoreNLP('http://localhost:9000')

    main_dir = 'corrected_outcomes'
    data_dir = os.path.abspath(os.path.join(main_dir, 'aggregated'))
    create_storage_dirs([data_dir])
    sub_dir = os.path.abspath(os.path.join(data_dir, 'test'))
    if not os.path.exists(os.path.dirname(sub_dir)):
        os.makedirs(os.path.dirname(sub_dir))

    turker, ebm_extract = e.read_anns('hierarchical_labels', 'outcomes',
                                      ann_type='aggregated', model_phase='train')
    seq_dir = os.path.abspath(os.path.join(os.path.curdir, 'corrected_outcomes', 'test'))
    create_storage_dirs([seq_dir])

    ebm_csv = []
    start = time.time()
    with open(os.path.join(seq_dir, 'test_medpost.bmes'), 'w') as f:
        for pmid, doc in ebm_extract.items():
            abstract = ' '.join(i for i in doc.tokens)
            # pprint(abstract)
            u = doc.anns['AGGREGATED']
            v = doc.tokens
            o = []
            corr_outcomes = []
            temp, temp_2 = [], []
            t = 0
            m = 0
            # extract outcomes from the abstract being examined,
            # [(Outcome-type, Outcome), (Outcome-type, Outcome2)]
            o_come = e.print_labeled_spans_2(doc)[0]
            # store the annotations and the index of the annotations for each abstract
            for x in range(len(u)):
                if x == t:
                    if u[x] != 0:
                        for ff in o_come:
                            for j in range(len(u)):
                                if j < len(ff[1].split()):
                                    o.append((t, u[x]))
                                    t += 1
                            break
                        o_come.pop(0)
                        txt_toks = [v[i[0]] for i in o]
                        text_wrds = ' '.join(i for i in txt_toks)
                        corr = correcting_spans.correct_text()
                        text_wrds = corr.statTerm_keyWord_punct_remove(text_wrds)
                        if tager.lower() == 'genia':
                            tagged = genia.parse(text_wrds)
                            pos = [i[2] for i in tagged]
                        elif tager.lower() == 'medpost':
                            tagged = medpost(text_wrds)
                            pos = [i.tag_ for i in tagged]
                        elif tager.lower() == 'stanford':
                            pos = []
                            for elem in word_tokenize(text_wrds):
                                stan = stanford.annotate(
                                    elem, properties={'annotators': 'pos',
                                                      'outputFormat': 'json'})
                                pos.append(stan['sentences'][0]['tokens'][0]['pos'])
                        text_pos = ' '.join(i for i in pos)
                        label = core_outcome[u[x]]
                        corrected_spans = corr.pos_co_occurrence_cleaning(text_wrds, text_pos, label)
                        if len(corrected_spans) == 0:
                            v[o[0][0]:(o[-1][0] + 1)] = txt_toks
                            u[o[0][0]:(o[-1][0] + 1)] = [0 for i in range(len(txt_toks))]
                        elif len(corrected_spans) < 2:
                            span = corrected_spans[0]
                            s = [i for i in span[1].split()]
                            ll = [o[0][1] if i in s else 0 for i in txt_toks]
                            v[o[0][0]:(o[-1][0] + 1)] = txt_toks
                            u[o[0][0]:(o[-1][0] + 1)] = ll
                        else:
                            s = [i for j in corrected_spans for i in j[1].split()]
                            ll = [o[0][1] if i in s else 0 for i in txt_toks]
                            v[o[0][0]:(o[-1][0] + 1)] = txt_toks
                            u[o[0][0]:(o[-1][0] + 1)] = ll
                        p = [i for i in corrected_spans]
                        if len(p) > 0:
                            for i in p:
                                corr_outcomes.append(i)
                        o.clear()
                    else:
                        t += 1
            if corr_outcomes:
                temp_2 = build_sequence_model(v, u, core_outcome, corr_outcomes)
                qq = 1
                for i in temp_2:
                    print(qq, i)
                    f.write('{}\n'.format(i))
                    qq += 1
                f.write('\n')
                for k in corr_outcomes:
                    ebm_csv.append(k)
    ebm_csv_df = pd.DataFrame(ebm_csv, columns=['Label', 'Outcome'])
    ebm_csv_df.to_csv(os.path.join(os.path.abspath(os.path.curdir),
                                   'corrected_outcomes/test/labels_outcomes_medpost.csv'))
    print("Duration {}".format(time.time() - start))
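# Usage note: the backend is selected via the `tager` keyword defined above
# (each branch assumes the corresponding tool is installed or running):
#
#   annotate_text(tager='genia')     # local GENIA tagger binary
#   annotate_text(tager='medpost')   # spaCy model in ./trained_tagger
#   annotate_text(tager='stanford')  # CoreNLP server on localhost:9000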