def stem_string(line):
    """Return a lower-cased copy of line in which every alphabetic run is
    replaced by its Porter stem; non-alphabetic characters pass through."""
    if line == "":
        return ""
    p = PorterStemmer()
    word = ""
    output = ""
    for c in line:
        if c.isalpha():
            word += c.lower()
        else:
            # flush the accumulated word before emitting the non-alpha char
            if word:
                output += p.stem(word, 0, len(word) - 1)
                word = ''
            output += c.lower()
    # flush the trailing word, if any
    if word:
        output += p.stem(word, 0, len(word) - 1)
    return output
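# A minimal usage sketch for stem_string (hedged: assumes PorterStemmer is
# importable in this module's environment, as in the function above).
# Stemming collapses inflectional variants, so e.g. "binding" and "binds"
# map to the same dictionary key when counting occurrences.
if __name__ == "__main__":
    # with the standard Porter algorithm this should print: bind activ wa induc.
    print stem_string("Binding activity was induced.")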
def test(self, test_name):
    Stemmer = PorterStemmer()
    if test_name == "loading":
        words = ["induction", "inducted", "restimulated", "binding",
                 "binds", "up-regulate", "mRNA expression",
                 "binding activity"]
        self.load("dev")
        print "\n\n----------------------------"
        print "Using original string"
        for w in words:
            cnt = self.count(w)
            print w, cnt
        print "\n\n----------------------------"
        print "Using stem version of string"
        for w in words:
            w = Stemmer.stem(w, 0, len(w) - 1)
            cnt = self.count(w)
            print w, cnt
        print "\n\n"
class GeniaReader(object):
    '''
    Read all necessary files, convert them to the internal format,
    then save the result for later feature extraction.
    '''
    # file extensions
    TXT_EXT = ".txt"
    PROTEIN_EXT = ".a1"
    TRIGGER_REL_EXT = ".a2"
    CHUNK_EXT = ".chk"
    MCCCJ_TREE_EXT = ".txt.ss.mcccjtok.mcccj"
    MCCCJ_SD_EXT = ".txt.ss.mcccjtok.mcccj.basic.sd"

    CORPUS_DIR = ["dev", "train", "test"]

    # this folder contains the original BioNLP 2011 corpus: txt, a1, and a2 files
    ORIGINAL_DIR = "original"
    # this folder contains the parsed corpus (tree and dependency) and the chunk files
    PARSED_DIR = "parse"
    # this folder contains pre-processed text in which all proteins are replaced by placeholders
    PREPROCESS_DIR = "preprocess"
    # this folder contains the data source for all docs
    DATA_DIR = "data"

    # bracket char coding
    #BRACKET_CHAR = {'-LRB-':'(', '-RRB-':')', '-RSB-':']', '-LSB-':'[', '-LCB-':'{', '-RCB-':'}'}

    def __init__(self, source, dest):
        '''
        Constructor
        '''
        self.src = source
        self.dest = dest
        self.Dep = DependencyReader()
        self.Tree = ParseTreeReader()
        self.Chunk = ChunkReader()
        self.Stemmer = PorterStemmer()

    def run(self):
        # read all files from each corpus directory
        for cdir in self.CORPUS_DIR:
            print "reading content of " + cdir + " directory"
            doc_ids = self.load_save_data(cdir)
            print str(len(doc_ids)) + " docs have been read"
            # write doc ids to file
            fpath = self.dest + '/' + cdir + '_doc_ids.json'
            self.write_to_file(doc_ids, fpath)

    def load_save_data(self, cdir):
        '''
        Return the list of doc_ids for the given cdir, and for each doc:
        1. list the files under cdir
        2. load all necessary files for the doc_id
        3. check their consistency
        4. save the result as json
        '''
        ext = self.TXT_EXT
        doc_ids = self.get_doc_list(
            self.get_full_path(self.ORIGINAL_DIR, cdir), ext)

        # there is one document that cannot be parsed by the mcccj parser;
        # it contains no event, so we can safely skip it
        if cdir == "train":
            print "Skip PMID-8632999"
            doc_ids.remove("PMID-8632999")

        for doc_id in doc_ids:
            # load doc data
            doc = self.load_doc(cdir, doc_id)
            # check consistency
            self.check_consistency(doc)
            # save to file
            fpath = self.dest + '/' + self.DATA_DIR + '/' + doc_id + '.json'
            self.write_to_file(doc, fpath)

        return doc_ids

    def load_doc(self, cdir, doc_id):
        '''
        Read document data and return the doc representation.
        '''
        # init variables
        triggers = []
        events = []
        equivs = []
        is_test = True

        # paths for original files
        ori_fpath = self.get_full_path(self.ORIGINAL_DIR, cdir) + '/' + doc_id
        txt = self.get_text(ori_fpath + self.TXT_EXT)
        proteins = self.get_protein(ori_fpath + self.PROTEIN_EXT)
        if cdir != 'test':
            triggers, events, equivs = self.get_trigger_relation(
                ori_fpath + self.TRIGGER_REL_EXT)
            is_test = False

        # paths for parsed files
        parsed_fpath = self.get_full_path(self.PARSED_DIR, cdir) + '/' + doc_id
        chunks = self.get_chunk(parsed_fpath + self.CHUNK_EXT)
        tree = self.get_tree_mcccj(parsed_fpath + self.MCCCJ_TREE_EXT)
        dep = self.get_dependency(parsed_fpath + self.MCCCJ_SD_EXT)

        # path for the preprocessed file
        pre_fpath = self.get_full_path(self.PREPROCESS_DIR, cdir) + '/' + doc_id
        sentences = self.get_sentences(pre_fpath + self.TXT_EXT)

        # create doc representation
        doc = {"doc_id": doc_id,
               "test": is_test,
               "path1": ori_fpath,
               "path2": parsed_fpath,
               "nsen": len(tree),
               "txt": txt,
               "sen": sentences,
               "protein": proteins,
               "equiv": equivs,
               "trigger": triggers,
               "event": events,
               "chunk": chunks,
               "tree": tree,
               "dep": dep}
        return doc

    def update_word_info(self, txt, sentences):
        '''
        Add the offset of each word (its position relative to the abstract)
        and the Porter stem of each word.
        '''
        search_offset = 0
        for sentence in sentences:
            for word in sentence:
                string = word["string"]
                #if string in self.BRACKET_CHAR:
                #    string = self.BRACKET_CHAR[string]
                start = txt.find(string, search_offset)
                if start == -1:
                    # try to convert " back into ''
                    # the parser represents " as '', so we need to convert back
                    # to the original character; note that in some cases '' really
                    # is '' in the text, e.g. 4'',6-Diamidino-2-phenylindole
                    string = string.replace("\"", "\'\'")
                    start = txt.find(string, search_offset)
                    if start == -1:
                        print string
                        print sentence
                        raise ValueError("string not found")
                    word["string"] = string
                end = start + len(string)
                # add offset
                word["start"] = start
                word["end"] = end
                # add stem using porter stemmer
                word["stem"] = self.Stemmer.stem(string, 0, len(string) - 1)
                search_offset = end
        return sentences

    def write_to_file(self, doc_to_write, fpath):
        '''
        Write a json-serializable object to file.
        '''
        with open(fpath, 'w') as fout:
            fout.write(json.dumps(doc_to_write))

    def check_consistency(self, doc):
        '''
        Check consistency of the chunk, tree, and dep data: they must have
        the same number of sentences and the same number of words per sentence.
        '''
        chunk = doc["chunk"]
        tree = doc["tree"]
        dep = doc["dep"]
        # check number of sentences
        if len(chunk) != len(tree) or len(tree) != len(dep):
            raise ValueError(
                "chunk, tree, and dep have different numbers of sentences")
        # check number of words per sentence
        for i in range(0, len(chunk)):
            if chunk[i]["nword"] != tree[i]["nword"] or tree[i]["nword"] != dep[i]["nword"]:
                print tree[i]
                print chunk[i]
                print dep[i]
                raise ValueError("different number of words in sentence "
                                 + str(i) + " doc: " + doc["doc_id"])

    def print_doc(self, doc):
        '''
        Print the document representation to the screen.
        '''
        print "doc id: ", doc["doc_id"]
        print "is test: ", doc["test"]
        print "ori path: ", doc["path1"]
        print "parsed path: ", doc["path2"]
        print "number of sentences:", doc["nsen"]
        print doc["txt"]
        print "Sentences:"
        for sen in doc["sen"]:
            print sen
        print "Proteins:"
        for line in doc["protein"].values():
            print line
        print "Equivs:"
        for line in doc["equiv"]:
            print line
        print "Triggers:"
        for line in doc["trigger"].values():
            print line
        print "Events:"
        for line in doc["event"].values():
            print line
        print "Chunks:"
        for line in doc["chunk"]:
            print line
        print "Trees:"
        for line in doc["tree"]:
            print line
        print "Dependencies:"
        for line in doc["dep"]:
            print line

    def get_doc_list(self, cdir, ext_filter):
        '''
        Return the list of file names (without the extension) in cdir.
        '''
        # note: str.rstrip(ext_filter) strips a *set of characters*, not a
        # suffix, so slice the extension off instead
        return [d[:-len(ext_filter)] for d in os.listdir(cdir)
                if d.endswith(ext_filter)]

    def get_text(self, fpath):
        '''
        Return the text of the txt file at the given fpath.
        '''
        with open(fpath, 'r') as fin:
            txt = fin.read()
        return txt

    def get_protein(self, fpath):
        '''
        Return a protein dict; a protein is represented as:
        'T84' : ['T84', 'Negative_regulation', '2665', '2673', 'decrease']
        '''
        proteins = {}
        with open(fpath, 'r') as fin:
            for line in fin:
                line = line.rstrip('\n')
                p = re.split("\\t|\\s+", line, 4)
                proteins[p[0]] = p
        return proteins

    def get_trigger_relation(self, fpath):
        '''
        Return the trigger dictionary, event dictionary, and equiv list:
        trigger id : (id, trigger type, start idx, end idx, trigger text)
        event id   : (id, event type, trigger id, theme1 id, theme2 id, cause id)
        equiv tuple: (protein1, protein2)
        '''
        triggers = {}
        events = {}
        equivs = []
        with open(fpath, 'r') as fin:
            for line in fin:
                line = line.rstrip('\n')
                # process trigger
                if line[0] == 'T':
                    t = re.split("\\t|\\s+", line, 4)
                    triggers[t[0]] = t
                # process event
                elif line[0] == 'E':
                    evt = re.split("\\t|\\s+", line)
                    eid = evt[0]
                    etype, _, trigid = evt[1].partition(':')
                    theme1 = evt[2].split(':')[1]
                    theme2 = ""
                    cause = ""
                    if len(evt) > 3:
                        argtype, _, argid = evt[3].partition(':')
                        if argtype == 'Theme2':
                            theme2 = argid
                            cause = ""
                        elif argtype == 'Cause':
                            theme2 = ""
                            cause = argid
                    events[eid] = list((eid, etype, trigid, theme1, theme2, cause))
                # process equiv
                elif line[0] == '*':
                    equiv = re.split("\\t|\\s", line)
                    equivs.append(tuple(equiv[2:]))
        return triggers, events, equivs

    def get_chunk(self, fpath):
        '''
        Return chunk data.
        '''
        return self.Chunk.read(fpath)

    def get_tree_mcccj(self, fpath):
        '''
        Return parse tree data.
        '''
        return self.Tree.read(fpath)

    def get_sentences(self, preprocess_txt_fpath, check=False):
        '''
        Return the sentences of a document.
        get_tree_mcccj must be called first.
        '''
        pre_txt = self.get_text(preprocess_txt_fpath)
        sentences = self.update_word_info(pre_txt, self.Tree.sentences)
        if check:
            for sentence in sentences:
                for word in sentence:
                    if pre_txt[word["start"]:word["end"]] != word["string"]:
                        print pre_txt[word["start"]:word["end"]], "=>", word
                        #raise ValueError("string in txt and word do not match")
        return sentences

    def get_dependency(self, fpath):
        '''
        Return dependency data.
        '''
        return self.Dep.read(fpath)

    def get_full_path(self, ctype, cdir):
        '''
        ctype: original, parse, or preprocess
        cdir: dev, train, or test
        '''
        return self.src + '/' + ctype + '/' + cdir
class TriggerDictionary(Dictionary):
    def __init__(self, source):
        super(TriggerDictionary, self).__init__(source)
        self.Stemmer = PorterStemmer()

    def load(self, corpus_dir):
        """
        Load trigger dictionary data.
        corpus_dir is dev, train, or mix.
        """
        if corpus_dir not in self.CORPUS_DIR:
            raise ValueError("wrong value. choose 'dev', 'train', or 'mix'")

        fpath = self.src + '/' + self.DICT_DIR + '/' + corpus_dir + self.TDICT_SUFIX_EXT
        if not os.path.exists(fpath):
            print "Trigger dictionary data does not exist"
            print "Now building new trigger dictionary data ..."
            self.build()

        with open(fpath, 'r') as f:
            self.data = json.loads(f.read())

    def count(self, word, ttype=""):
        """
        Return the number of occurrences of word in the dictionary.
        """
        if self.data == {}:
            raise ValueError("Dictionary data has not been loaded")

        # get counter for this word
        ttype_cnt = self.data.get(word.lower(), None)
        retval = 0
        if ttype_cnt is not None:
            if ttype != "":
                retval = ttype_cnt.get(ttype, 0)
            else:
                # sum counts across all event types
                for v in ttype_cnt.itervalues():
                    retval += v
        return retval

    def build(self):
        """
        Build all trigger dictionaries (dev, train, and mix)
        and save them to the dict folder.
        """
        for cdir in self.CORPUS_DIR:
            print "building " + cdir + " trigger dictionary"
            fpath = self.src + '/' + self.DICT_DIR + '/' + cdir + self.TDICT_SUFIX_EXT
            with open(fpath, 'w') as f:
                f.write(json.dumps(self.build_dict(cdir)))

    def build_dict(self, corpus_dir):
        """
        Add trigger words to the dictionary, including multi-word triggers.
        e.g. triggers: "Negative Regulator", "Expression"
        stored in dict as:
            "Negative Regulator"  <== bigram version
            "Negative"            <== head of bigram
            "Expression"          <== standard unigram
        """
        if corpus_dir not in self.CORPUS_DIR:
            raise ValueError("wrong value. choose 'dev', 'train', or 'mix'")

        # init document builder
        doc_builder = DocumentBuilder(self.src)
        # init default dict of counters
        td = defaultdict(Counter)
        # get list of documents
        doc_ids = self.get_doc_ids(corpus_dir)
        for doc_id in doc_ids:
            o_doc = doc_builder.build(doc_id)
            for i in range(0, len(o_doc.sen)):
                o_sen = o_doc.sen[i]
                for twn in o_sen.trigger:
                    w = o_sen.words[twn]
                    ttype = w['type']
                    string = w['string'].lower()
                    stem = w['stem'].lower()
                    # skip short words
                    if len(string) < 4:
                        continue
                    # add unigram text and stem to the dictionary
                    td[string][ttype] += 1
                    td[stem][ttype] += 1
                    # check the full trigger text and
                    # add it to the dict if it is multi-word
                    full_string = o_sen.trigger_text[twn]
                    if ' ' in full_string:
                        td[full_string][ttype] += 1

        print "the dictionary contains:", len(td), "trigger words"
        return dict(td)

    def get_triggers(self, doc_id):
        """
        Return the list of triggers of a document.
        """
        fpath = self.src + '/' + self.DATA_DIR + '/' + doc_id + self.DATA_EXT
        with open(fpath, 'r') as f:
            doc = json.loads(f.read())
        return doc["trigger"]

    def test(self, test_name):
        if test_name == "loading":
            trigger = {"Negative Regulator": "Negative_regulation",
                       "Negative": "Negative_regulation",
                       "Regulator": "Negative_regulation",
                       "binds": "Binding",
                       "mRNA expression": "Transcription",
                       "mRNA": "Transcription"}
            print "\n\n----------------------------"
            print "Using original string"
            self.load("mix")
            for t, ttype in trigger.iteritems():
                cnt1 = self.count(t)
                cnt2 = self.count(t, ttype)
                print t, "All", cnt1
                print t, ttype, cnt2
            print "\n\n----------------------------"
            print "Using stem version of string"
            for t, ttype in trigger.iteritems():
                t = self.Stemmer.stem(t, 0, len(t) - 1)
                cnt1 = self.count(t)
                cnt2 = self.count(t, ttype)
                print t, "All", cnt1
                print t, ttype, cnt2
            print "\n\n"
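# A minimal usage sketch for TriggerDictionary (hedged: the source path is a
# hypothetical placeholder; Dictionary, DocumentBuilder, and PorterStemmer
# are assumed to come from this project, as in the class above).
# load() falls back to build() when no serialized dictionary exists yet, so
# the first use on a fresh checkout builds the dev, train, and mix versions.
if __name__ == "__main__":
    td = TriggerDictionary("/path/to/output")
    td.load("mix")
    print td.count("binding")             # count across all event types
    print td.count("binding", "Binding")  # count for one event type only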