Example #1
    def __init__(self, source, dest):
        '''
        Constructor
        '''
        self.src = source
        self.dest = dest

        self.Dep = DependencyReader()
        self.Tree = ParseTreeReader()
        self.Chunk = ChunkReader()
        self.Stemmer = PorterStemmer()
Example #2
def stem_string(line):
    if line == "":
        return ""
    p = PorterStemmer()
    word = ""
    output = ""
    for c in line:
        if c.isalpha():
            word += c.lower()
        else:
            if word:
                output += p.stem(word, 0, len(word) - 1)
                word = ''
            output += c.lower()
    if word:
        output += p.stem(word, 0, len(word) - 1)
    return output
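A minimal usage sketch for stem_string, assuming PorterStemmer is the classic porter.py reference implementation whose stem(word, start, end) signature matches the calls above; the import path below is an assumption, not part of the original code:

from porter import PorterStemmer  # assumed module name for the reference Porter stemmer

# stem_string lowercases the input and stems each alphabetic run,
# passing punctuation and whitespace through unchanged
print stem_string("Binding activities were restimulated")
# expected to print something along the lines of: bind activ were restimul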
Example #3
def stem_string(line):
    if line == "":
        return ""
    p = PorterStemmer()
    word = ""
    output = ""
    for c in line:
        if c.isalpha():
            word += c.lower()
        else:
            if word:
                output += p.stem(word, 0, len(word) - 1)
                word = ''
            output += c.lower()
    if word:
        output += p.stem(word, 0, len(word) - 1)
    return output
Example #4
    def __init__(self, source, dest):
        '''
        Constructor
        '''
        self.src = source
        self.dest = dest

        self.Dep = DependencyReader()
        self.Tree = ParseTreeReader()
        self.Chunk = ChunkReader()
        self.Stemmer = PorterStemmer()
Example #5
    def test(self, test_name):
        Stemmer = PorterStemmer()
        if test_name == "loading":
            words = [
                "induction", "inducted", "restimulated", "binding", "binds",
                "up-regulate", "mRNA expression", "binding activity"
            ]
            self.load("dev")

            print "\n\n----------------------------"
            print "Using original string"
            for w in words:
                cnt = self.count(w)
                print w, cnt

            print "\n\n----------------------------"
            print "Using stem version of string"
            for w in words:
                w = Stemmer.stem(w, 0, len(w) - 1)
                cnt = self.count(w)
                print w, cnt

            print "\n\n"
Example #6
class GeniaReader(object):
    '''
    read all necessary files and convert them to the internal format, then save them for later feature extraction
    '''
    ''' list of extensions '''
    TXT_EXT = ".txt"

    PROTEIN_EXT = ".a1"

    TRIGGER_REL_EXT = ".a2"

    CHUNK_EXT = ".chk"

    MCCCJ_TREE_EXT = ".txt.ss.mcccjtok.mcccj"

    MCCCJ_SD_EXT = ".txt.ss.mcccjtok.mcccj.basic.sd"

    CORPUS_DIR = ["dev", "train", "test"]

    # this folder contains original corpus of bionlp2011: txt, a1, and a2 files
    ORIGINAL_DIR = "original"

    # this folder contains parsed corpus (tree and dependency) and also chunk file
    PARSED_DIR = "parse"

    # this folder contains pre-processed text, all proteins are replaced by placeholder
    PREPROCESS_DIR = "preprocess"

    # this folder contains the data source for all docs
    DATA_DIR = "data"

    # bracket char coding
    #BRACKET_CHAR = {'-LRB-':'(', '-RRB-':')', '-RSB-':']', '-LSB-':'[', '-LCB-':'{', '-RCB-':'}'}

    def __init__(self, source, dest):
        '''
        Constructor
        '''
        self.src = source
        self.dest = dest

        self.Dep = DependencyReader()
        self.Tree = ParseTreeReader()
        self.Chunk = ChunkReader()
        self.Stemmer = PorterStemmer()

    def run(self):
        # read all files from dir
        for cdir in self.CORPUS_DIR:
            print "reading content of " + cdir + " directory"
            doc_ids = self.load_save_data(cdir)
            print str(len(doc_ids)) + " docs have been read"

            # write doc ids to file
            fpath = self.dest + '/' + cdir + '_doc_ids.json'
            self.write_to_file(doc_ids, fpath)

    '''
    this function returns the list of doc_ids for a given cdir
    and does the following:
    1. list files under cdir
    2. load all necessary files for a given doc_id
    3. check their consistency
    4. save them as json
    '''

    def load_save_data(self, cdir):

        ext = self.TXT_EXT
        doc_ids = self.get_doc_list(
            self.get_full_path(self.ORIGINAL_DIR, cdir), ext)

        # there is a document that cannot be parsed by mcccj parser
        # no event in that document, so we can just skip it
        if cdir == "train":
            print "Skip PMID-8632999"
            doc_ids.remove("PMID-8632999")

        for doc_id in doc_ids:
            # load doc data
            doc = self.load_doc(cdir, doc_id)

            # check consistency
            self.check_consistency(doc)

            # save to file
            fpath = self.dest + '/' + self.DATA_DIR + '/' + doc_id + '.json'
            self.write_to_file(doc, fpath)

        return doc_ids

    '''
    read document data
    and return doc representation
    '''

    def load_doc(self, cdir, doc_id):
        # init variable
        triggers = []
        events = []
        equivs = []
        is_test = True

        # path for original file
        ori_fpath = self.get_full_path(self.ORIGINAL_DIR, cdir) + '/' + doc_id

        txt = self.get_text(ori_fpath + self.TXT_EXT)
        proteins = self.get_protein(ori_fpath + self.PROTEIN_EXT)
        if cdir != 'test':
            triggers, events, equivs = self.get_trigger_relation(
                ori_fpath + self.TRIGGER_REL_EXT)
            is_test = False

        # path for parsed file
        parsed_fpath = self.get_full_path(self.PARSED_DIR, cdir) + '/' + doc_id

        chunks = self.get_chunk(parsed_fpath + self.CHUNK_EXT)
        tree = self.get_tree_mcccj(parsed_fpath + self.MCCCJ_TREE_EXT)
        dep = self.get_dependency(parsed_fpath + self.MCCCJ_SD_EXT)

        # path for preprocess file
        pre_fpath = self.get_full_path(self.PREPROCESS_DIR,
                                       cdir) + '/' + doc_id
        sentences = self.get_sentences(pre_fpath + self.TXT_EXT)

        # create doc representation
        doc = {
            "doc_id": doc_id,
            "test": is_test,
            "path1": ori_fpath,
            "path2": parsed_fpath,
            "nsen": len(tree),
            "txt": txt,
            "sen": sentences,
            "protein": proteins,
            "equiv": equivs,
            "trigger": triggers,
            "event": events,
            "chunk": chunks,
            "tree": tree,
            "dep": dep
        }

        return doc

    '''
    update/add each word's offset (its position relative to the abstract text)
    and add its stem
    '''

    def update_word_info(self, txt, sentences):

        search_offset = 0

        for sentence in sentences:
            for word in sentence:
                string = word["string"]
                #if string in self.BRACKET_CHAR:
                #    string = self.BRACKET_CHAR[string]
                start = txt.find(string, search_offset)
                if start == -1:

                    # the parse output uses " where the original text has '',
                    # so convert " back to '' and search again;
                    # some texts genuinely contain '' as in => 4'',6-Diamidino-2-phenylindole
                    string = string.replace("\"", "\'\'")
                    start = txt.find(string, search_offset)

                    if start == -1:
                        print string
                        print sentence
                        raise ValueError("string not found")

                    word["string"] = string

                end = start + len(string)
                # add offset
                word["start"] = start
                word["end"] = end
                # add stem using porter stemmer
                word["stem"] = self.Stemmer.stem(string, 0, len(string) - 1)
                search_offset = end

        return sentences

    '''
    write to file
    '''

    def write_to_file(self, doc_to_write, fpath):
        with open(fpath, 'w') as fout:
            fout.write(json.dumps(doc_to_write))

    '''
    check consistency of the chunk, tree, and dep data:
    they must have the same number of sentences and the same number of words per sentence
    '''

    def check_consistency(self, doc):
        chunck = doc["chunk"]
        tree = doc["tree"]
        dep = doc["dep"]

        # check number of sentences
        #print "number of sentence:", len(chunck), len(tree),len(dep)
        if len(chunck) != len(tree) or len(tree) != len(dep):
            raise ValueError(
                "Chunk, tree and dep have different numbers of sentences")

        # check number of words in each sentence
        for i in range(0, len(chunck)):
            #print i, "number of words: ", chunck[i]["nword"], tree[i]["nword"], dep[i]["nword"]
            if chunck[i]["nword"] != tree[i]["nword"] or tree[i][
                    "nword"] != dep[i]["nword"]:
                print tree[i]
                print chunck[i]
                print dep[i]
                raise ValueError("Different number of word in sentence " +
                                 str(i) + " doc: " + doc["doc_id"])

    '''
    print to screen document representation
    '''

    def print_doc(self, doc):

        print "doc id: ", doc["doc_id"]
        print "is test: ", doc["test"]
        print "ori path: ", doc["path1"]
        print "parsed path: ", doc["path2"]
        print "number of sentence:", doc["nsen"]
        print doc["txt"]
        print "Sentences:"
        for sen in doc["sen"]:
            print sen
        print "Proteins:"
        for line in doc["protein"].values():
            print line
        print "Equivs:"
        for line in doc["equiv"]:
            print line
        print "Triggers:"
        for line in doc["trigger"].values():
            print line
        print "Events:"
        for line in doc["event"].values():
            print line
        print "Chunks:"
        for line in doc["chunk"]:
            print line
        print "Trees:"
        for line in doc["tree"]:
            print line
        print "Dependencies:"
        for line in doc["dep"]:
            print line

    '''
    return list of file names in cdir directory
    '''

    def get_doc_list(self, cdir, ext_filter):
        # str.rstrip strips characters, not a suffix, so slice off the extension instead
        return [
            d[:-len(ext_filter)] for d in os.listdir(cdir)
            if d.endswith(ext_filter)
        ]

    '''
    return text from txt file of given fpath
    '''

    def get_text(self, fpath):
        with open(fpath, 'r') as fin:
            txt = fin.read()
        return txt

    '''
    return protein dict
    protein representation:
    'T84' : ['T84', 'Negative_regulation', '2665', '2673', 'decrease']
    '''

    def get_protein(self, fpath):
        proteins = {}
        with open(fpath, 'r') as fin:
            for line in fin:
                line = line.rstrip('\n')
                p = re.split("\\t|\\s+", line, 4)
                proteins[p[0]] = p
        return proteins

    '''
    return trigger dictionary, event dictionary and list of equiv tuples
    trigger entry
    id : (id, trigger type, start idx, end idx, trigger text)
    event entry
    id : (id, event type, trigger_id, theme1 id, theme2 id, cause id)
    equiv tuple: (protein1, protein2)
    '''

    def get_trigger_relation(self, fpath):
        triggers = {}
        events = {}
        equivs = []
        with open(fpath, 'r') as fin:
            for line in fin:
                line = line.rstrip('\n')
                # process trigger
                if line[0] == 'T':
                    t = re.split("\\t|\\s+", line, 4)
                    triggers[t[0]] = t

                # process event
                elif line[0] == 'E':
                    evt = re.split("\\t|\\s+", line)
                    eid = evt[0]
                    etype, _, trigid = evt[1].partition(':')
                    theme1 = evt[2].split(':')[1]
                    theme2 = ""
                    cause = ""
                    if len(evt) > 3:
                        argtype, _, argid = evt[3].partition(':')
                        if argtype == 'Theme2':
                            theme2 = argid
                            cause = ""
                        elif argtype == 'Cause':
                            theme2 = ""
                            cause = argid
                    events[eid] = list(
                        (eid, etype, trigid, theme1, theme2, cause))

                # process equiv
                elif line[0] == '*':
                    equiv = re.split("\\t|\\s", line)
                    equivs.append(tuple(equiv[2:]))

        return triggers, events, equivs

    '''
    return chunk data
    '''

    def get_chunk(self, fpath):
        return self.Chunk.read(fpath)

    '''
    return parse tree data
    '''

    def get_tree_mcccj(self, fpath):
        return self.Tree.read(fpath)

    '''
    return sentences of a document
    get_tree_mcccj must be called first
    '''

    def get_sentences(self, preprocess_txt_fpath, check=False):
        pre_txt = self.get_text(preprocess_txt_fpath)
        sentences = self.update_word_info(pre_txt, self.Tree.sentences)

        if check:
            for sentence in sentences:
                for word in sentence:
                    if pre_txt[word["start"]:word["end"]] != word["string"]:
                        print pre_txt[word["start"]:word["end"]], "=>", word
                        #raise ValueError("string in txt and word do not match")

        return sentences

    '''
    return dependency data
    '''

    def get_dependency(self, fpath):
        return self.Dep.read(fpath)

    '''
    cdir: dev, train, or test
    ctype: original or parsed
    '''

    def get_full_path(self, ctype, cdir):
        return self.src + '/' + ctype + '/' + cdir
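A hedged driver sketch for GeniaReader, assuming the directory layout implied by the class constants (source/original, source/parse and source/preprocess each holding dev, train and test splits, and a destination that already contains a data subfolder). The paths are placeholders, and the module is assumed to import json, os, re and the reader classes it uses:

# hypothetical locations; adjust to the actual corpus layout
SRC = "/path/to/bionlp2011"
DEST = "/path/to/output"

reader = GeniaReader(SRC, DEST)
# run() reads every doc of each split, checks consistency, and writes
# <dest>/data/<doc_id>.json plus <dest>/<cdir>_doc_ids.json
reader.run()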
Example #7
class GeniaReader(object):
    '''
    read all necessary files and convert them to the internal format, then save them for later feature extraction
    '''

    ''' list of extensions '''
    TXT_EXT = ".txt"
    
    PROTEIN_EXT = ".a1"
    
    TRIGGER_REL_EXT = ".a2"
    
    CHUNK_EXT = ".chk"
    
    MCCCJ_TREE_EXT = ".txt.ss.mcccjtok.mcccj"
    
    MCCCJ_SD_EXT = ".txt.ss.mcccjtok.mcccj.basic.sd"
    

    CORPUS_DIR = ["dev","train","test"]    
    
    # this folder contains original corpus of bionlp2011: txt, a1, and a2 files 
    ORIGINAL_DIR = "original"
    
    # this folder contains parsed corpus (tree and dependency) and also chunk file
    PARSED_DIR = "parse"    
    
    # this folder contains pre-processed text, all proteins are replaced by placeholder
    PREPROCESS_DIR = "preprocess"
    
    # this folder contains the data source for all docs
    DATA_DIR = "data"
    
    # bracket char coding
    #BRACKET_CHAR = {'-LRB-':'(', '-RRB-':')', '-RSB-':']', '-LSB-':'[', '-LCB-':'{', '-RCB-':'}'}

    def __init__(self, source, dest):
        '''
        Constructor
        '''
        self.src = source
        self.dest = dest
        
        self.Dep = DependencyReader()
        self.Tree = ParseTreeReader()
        self.Chunk = ChunkReader()
        self.Stemmer = PorterStemmer()
        
        
    def run(self):
        # read all files from dir
        for cdir in self.CORPUS_DIR:
            print "reading content of " + cdir + " directory"
            doc_ids = self.load_save_data(cdir)
            print str(len(doc_ids)) + " docs have been read"
            
            # write doc ids to file
            fpath = self.dest + '/' + cdir + '_doc_ids.json'
            self.write_to_file(doc_ids, fpath)
    
    '''
    this function returns the list of doc_ids for a given cdir
    and does the following:
    1. list files under cdir
    2. load all necessary files for a given doc_id
    3. check their consistency
    4. save them as json
    '''
    def load_save_data(self, cdir):
                
        ext = self.TXT_EXT
        doc_ids = self.get_doc_list(self.get_full_path(self.ORIGINAL_DIR,cdir), ext)
        
        # there is a document that cannot be parsed by mcccj parser
        # no event in that document, so we can just skip it
        if cdir == "train":
            print "Skip PMID-8632999"
            doc_ids.remove("PMID-8632999")
        
        for doc_id in doc_ids:            
            # load doc data
            doc = self.load_doc(cdir, doc_id)
            
            # check consistency
            self.check_consistency(doc)
            
            # save to file
            fpath = self.dest + '/' + self.DATA_DIR + '/' + doc_id + '.json'
            self.write_to_file(doc, fpath)

        return doc_ids
    
    '''
    read document data
    and return doc representation
    '''
    def load_doc(self, cdir, doc_id):
        # init variable
        triggers = []
        events = []
        equivs = []
        is_test = True
        
        # path for original file
        ori_fpath = self.get_full_path(self.ORIGINAL_DIR,cdir) + '/' + doc_id
        
        txt = self.get_text(ori_fpath + self.TXT_EXT)        
        proteins = self.get_protein(ori_fpath + self.PROTEIN_EXT)                
        if cdir != 'test':            
            triggers, events, equivs = self.get_trigger_relation(ori_fpath + self.TRIGGER_REL_EXT)
            is_test = False
        
        # path for parsed file
        parsed_fpath = self.get_full_path(self.PARSED_DIR,cdir) + '/' + doc_id
        
        chunks = self.get_chunk(parsed_fpath + self.CHUNK_EXT)
        tree = self.get_tree_mcccj(parsed_fpath + self.MCCCJ_TREE_EXT)
        dep = self.get_dependency(parsed_fpath + self.MCCCJ_SD_EXT)
        
        # path for preprocess file
        pre_fpath = self.get_full_path(self.PREPROCESS_DIR, cdir) + '/' + doc_id
        sentences = self.get_sentences(pre_fpath + self.TXT_EXT)        
        
        
        # create doc representation
        doc = {"doc_id": doc_id,
               "test": is_test,
               "path1": ori_fpath,
               "path2": parsed_fpath,
               "nsen": len(tree),
               "txt":txt,
               "sen": sentences,
               "protein":proteins,
               "equiv":equivs,
               "trigger":triggers,
               "event":events,
               "chunk":chunks,
               "tree":tree,
               "dep":dep}
        
        return doc

    '''
    update/add each word's offset (its position relative to the abstract text)
    and add its stem
    '''
    def update_word_info(self, txt, sentences):
        
        search_offset = 0
        
        for sentence in sentences:
            for word in sentence:
                string = word["string"]
                #if string in self.BRACKET_CHAR:
                #    string = self.BRACKET_CHAR[string]
                start = txt.find(string, search_offset)
                if start == -1:
                    
                    # the parse output uses " where the original text has '',
                    # so convert " back to '' and search again;
                    # some texts genuinely contain '' as in => 4'',6-Diamidino-2-phenylindole
                    string = string.replace("\"","\'\'")
                    start = txt.find(string, search_offset)
                    
                    if start == -1:                        
                        print string
                        print sentence
                        raise ValueError("string not found")
                    
                    word["string"] = string
                    
                end = start + len(string)  
                # add offset
                word["start"] = start
                word["end"] = end
                # add stem using porter stemmer
                word["stem"] = self.Stemmer.stem(string, 0, len(string)-1)
                search_offset = end                
            
        return sentences
        
    
    '''
    write to file
    '''
    def write_to_file(self, doc_to_write, fpath):
        with open(fpath, 'w') as fout:
            fout.write(json.dumps(doc_to_write))
    
    '''
    check consistency of the chunk, tree, and dep data:
    they must have the same number of sentences and the same number of words per sentence
    '''
    def check_consistency(self, doc):
        chunck = doc["chunk"]
        tree = doc["tree"]
        dep = doc["dep"]
        
        # check number of sentences
        #print "number of sentence:", len(chunck), len(tree),len(dep)
        if len(chunck) != len(tree) or len(tree) != len(dep):
            raise ValueError("Chunk, tree and dep have different numbers of sentences")
        
        # check number of words in each sentence
        for i in range(0,len(chunck)):
            #print i, "number of words: ", chunck[i]["nword"], tree[i]["nword"], dep[i]["nword"]
            if chunck[i]["nword"] != tree[i]["nword"] or tree[i]["nword"] != dep[i]["nword"]:                
                print tree[i]
                print chunck[i]
                print dep[i]
                raise ValueError("Different number of word in sentence " + str(i) + " doc: " + doc["doc_id"])
    
    '''
    print to screen document representation
    '''
    def print_doc(self, doc):  
        
        print "doc id: ", doc["doc_id"]
        print "is test: ", doc["test"]
        print "ori path: ", doc["path1"]
        print "parsed path: ", doc["path2"]        
        print "number of sentence:", doc["nsen"]
        print doc["txt"]
        print "Sentences:"
        for sen in doc["sen"]:
            print sen
        print "Proteins:"
        for line in doc["protein"].values():
            print line
        print "Equivs:"
        for line in doc["equiv"]:
            print line
        print "Triggers:"
        for line in doc["trigger"].values():
            print line   
        print "Events:"  
        for line in doc["event"].values():
            print line
        print "Chunks:"
        for line in doc["chunk"]:
            print line
        print "Trees:"   
        for line in doc["tree"]:
            print line
        print "Dependencies:"
        for line in doc["dep"]:
            print line
    
    '''
    return list of file names in cdir directory
    '''
    def get_doc_list(self, cdir, ext_filter):
        # str.rstrip strips characters, not a suffix, so slice off the extension instead
        return [d[:-len(ext_filter)] for d in os.listdir(cdir) if d.endswith(ext_filter)]
            
    '''
    return text from txt file of given fpath
    '''
    def get_text(self, fpath):
        with open(fpath, 'r') as fin:
            txt = fin.read()
        return txt
            
    '''
    return protein dict
    protein representation:
    'T84' : ['T84', 'Negative_regulation', '2665', '2673', 'decrease']
    '''
    def get_protein(self, fpath):
        proteins = {}
        with open(fpath, 'r') as fin:
            for line in fin:
                line = line.rstrip('\n')
                p = re.split("\\t|\\s+",line,4)
                proteins[p[0]] = p
        return proteins
    
    '''
    return trigger dictionary, event dictionary and list of equiv tuples
    trigger entry
    id : (id, trigger type, start idx, end idx, trigger text)
    event entry
    id : (id, event type, trigger_id, theme1 id, theme2 id, cause id)
    equiv tuple: (protein1, protein2)
    '''
    def get_trigger_relation(self, fpath):
        triggers = {}
        events = {}
        equivs = []
        with open(fpath, 'r') as fin:
            for line in fin:
                line = line.rstrip('\n')
                # process trigger
                if line[0] == 'T':
                    t = re.split("\\t|\\s+",line,4)
                    triggers[t[0]] = t
                
                # process event
                elif line[0] == 'E':                    
                    evt = re.split("\\t|\\s+",line)
                    eid = evt[0]
                    etype,_,trigid = evt[1].partition(':')
                    theme1 = evt[2].split(':')[1]
                    theme2 = ""
                    cause = ""
                    if len(evt) > 3:
                        argtype,_,argid = evt[3].partition(':')
                        if argtype == 'Theme2':
                            theme2 = argid
                            cause = ""
                        elif argtype == 'Cause':
                            theme2 = ""
                            cause = argid                        
                    events[eid] = list((eid, etype, trigid, theme1, theme2, cause))
                
                # process equiv
                elif line[0] == '*':
                    equiv = re.split("\\t|\\s",line)
                    equivs.append(tuple(equiv[2:]))

        return triggers, events, equivs
        
    '''
    return chunk data
    '''
    def get_chunk(self, fpath):        
        return self.Chunk.read(fpath)
    
    '''
    return parse tree data
    '''
    def get_tree_mcccj(self, fpath):
        return self.Tree.read(fpath)
    
    '''
    return sentences of a document
    get_tree_mcccj must be called first
    '''
    def get_sentences(self, preprocess_txt_fpath, check = False):
        pre_txt = self.get_text(preprocess_txt_fpath)
        sentences = self.update_word_info(pre_txt,  self.Tree.sentences)     
        
        if check:
            for sentence in sentences:
                for word in sentence:
                    if pre_txt[word["start"]:word["end"]] != word["string"]:
                        print pre_txt[word["start"]:word["end"]], "=>", word
                        #raise ValueError("string in txt and word do not match")
        
        return sentences
    
    '''
    return dependency data
    '''
    def get_dependency(self, fpath):
        return self.Dep.read(fpath)
    
    '''
    cdir: dev, train, or test
    ctype: original or parsed
    '''
    def get_full_path(self, ctype, cdir):
        return self.src + '/' + ctype + '/' + cdir
Example #8
    def __init__(self, source):

        super(TriggerDictionary, self).__init__(source)
        self.Stemmer = PorterStemmer()
Example #9
class TriggerDictionary(Dictionary):
    def __init__(self, source):

        super(TriggerDictionary, self).__init__(source)
        self.Stemmer = PorterStemmer()

    def load(self, corpus_dir):
        """
        Load trigger dictionary data
        corpus_dir is 'dev', 'train', or 'mix'
        """
        if corpus_dir not in self.CORPUS_DIR:
            raise ValueError("wrong value. choose 'dev', 'train', or 'mix'")

        fpath = self.src + '/' + self.DICT_DIR + '/' + corpus_dir + self.TDICT_SUFIX_EXT
        if not os.path.exists(fpath):
            print "Trigger dictionary data is not exist"
            print "Now building new trigger dictionary data ..."
            self.build()

        with open(fpath, 'r') as f:
            self.data = json.loads(f.read())

    def count(self, word, ttype=""):
        """
        return the number of occurrences of word in the dictionary
        """
        if self.data == {}:
            raise ValueError("Dictionary data has not been loaded")
        # get counter
        ttype_cnt = self.data.get(word.lower(), None)
        retval = 0
        if ttype_cnt is not None:
            if ttype != "":
                retval = ttype_cnt.get(ttype, 0)
            else:
                # get count for all event types
                for v in ttype_cnt.itervalues():
                    retval += v
        return retval

    def build(self):
        """
        build all trigger dictionaries (dev, train, and mix)
        and save them to the dict folder
        """
        for cdir in self.CORPUS_DIR:
            print "building " + cdir + " trigger dictionary"
            fpath = self.src + '/' + self.DICT_DIR + '/' + cdir + self.TDICT_SUFIX_EXT
            with open(fpath, 'w') as f:
                f.write(json.dumps(self.build_dict(cdir)))

    def build_dict(self, corpus_dir):
        """
        add trigger word to dictionary, including multi-word trigger
        ex. 
        triggers: "Negative Regulator", "Expression"
        store in dict:
        "Negative Regulator" <== bigram version 
        "Negative" <== head of bigram
        "Expression"  <== standard unigram
        """
        if corpus_dir not in self.CORPUS_DIR:
            raise ValueError("wrong value. choose 'dev', 'train', or 'mix'")

        # init document builder
        doc_builder = DocumentBuilder(self.src)

        # init default dict with counter
        td = defaultdict(Counter)

        # get list of document
        doc_ids = self.get_doc_ids(corpus_dir)

        for doc_id in doc_ids:
            o_doc = doc_builder.build(doc_id)

            for i in range(0, len(o_doc.sen)):
                o_sen = o_doc.sen[i]
                for twn in o_sen.trigger:
                    w = o_sen.words[twn]
                    ttype = w['type']
                    string = w['string'].lower()
                    stem = w['stem'].lower()

                    # skip short word
                    if len(string) < 4: continue

                    # adding unigram text and stem to dictionary
                    td[string][ttype] += 1
                    td[stem][ttype] += 1

                    # check full text
                    # add to dict if it's multi-word
                    full_string = o_sen.trigger_text[twn]
                    if ' ' in full_string:
                        td[full_string][ttype] += 1

        print "the dictionary contains:", len(td), "trigger words"

        return dict(td)

    def get_triggers(self, doc_id):
        """
        return the list of triggers from a document
        """
        fpath = self.src + '/' + self.DATA_DIR + '/' + doc_id + self.DATA_EXT
        with open(fpath, 'r') as f:
            doc = json.loads(f.read())

        return doc["trigger"]

    def test(self, test_name):

        if test_name == "loading":
            trigger = {
                "Negative Regulator": "Negative_regulation",
                "Negative": "Negative_regulation",
                "Regulator": "Negative_regulation",
                "binds": "Binding",
                "mRNA expression": "Transcription",
                "mRNA": "Transcription"
            }

            print "\n\n----------------------------"
            print "Using original string"
            self.load("mix")
            for t, ttype in trigger.iteritems():
                cnt1 = self.count(t)
                cnt2 = self.count(t, ttype)
                print t, "All", cnt1
                print t, ttype, cnt2

            print "\n\n----------------------------"
            print "Using stem version of string"
            for t, ttype in trigger.iteritems():
                t = self.Stemmer.stem(t, 0, len(t) - 1)
                cnt1 = self.count(t)
                cnt2 = self.count(t, ttype)
                print t, "All", cnt1
                print t, ttype, cnt2

            print "\n\n"