Exemplo n.º 1
0
class GeniaA2WriterTest(object):
    '''
    Manual test driver for GeniaA2Writer: builds documents (directly or via
    Prediction) and writes their events out as a2 files.
    '''

    # root directory of the BioNLP 2011 corpus data
    source = "E:/corpus/bionlp2011/project_data/"

    def __init__(self):
        '''
        Load word and trigger dictionaries for the "train" split, then
        prepare the document builder and the a2 writer under test.
        '''
        # FIX: removed an accidentally duplicated '''Constructor''' docstring
        out_path = "E:/corpus/bionlp2011/project_test/result/model1"

        WD = WordDictionary(self.source)
        WD.load("train")

        TD = TriggerDictionary(self.source)
        TD.load("train")

        self.builder = DocumentBuilder(self.source, WD, TD)
        self.a2writter = GeniaA2Writer(out_path)

    def test1(self):
        # write a2 output for a document built directly from the corpus
        doc_id = 'PMC-2222968-04-Results-03'
        o_doc = self.builder.build(doc_id)

        self.a2writter.write(o_doc)

    def test2(self):
        # write a2 output for a document processed through Prediction
        dir_name_eval = "test-model-002-cause"
        doc_ids = ['PMC-2222968-04-Results-03']
        dict_type = 'train'

        prediction = Prediction(self.source, dir_name_eval, dict_type)
        prediction.predict(doc_ids)

        o_doc = prediction.docs[doc_ids[0]]
        for sen in o_doc.sen:
            sen.test()
        self.a2writter.write(o_doc)

    def test3(self):
        # predict without letting Prediction write its own result files;
        # the a2 output is written here instead
        dir_name_eval = "test-model-013"
        doc_ids = ['PMID-1763325']
        dict_type = 'train'

        prediction = Prediction(self.source, dir_name_eval, dict_type)
        prediction.predict(doc_ids, write_result=False)

        o_doc = prediction.docs[doc_ids[0]]
        for sen in o_doc.sen:
            sen.test()

        self.a2writter.write(o_doc)
Exemplo n.º 2
0
class GeniaA2WriterTest(object):
    '''
    Manual test driver for GeniaA2Writer: builds documents (directly or via
    Prediction) and writes their events out as a2 files.
    '''

    # root directory of the BioNLP 2011 corpus data
    source = "E:/corpus/bionlp2011/project_data/"

    def __init__(self):
        '''
        Load word and trigger dictionaries for the "train" split, then
        prepare the document builder and the a2 writer under test.
        '''
        # FIX: removed an accidentally duplicated '''Constructor''' docstring
        # and stripped trailing whitespace throughout the class
        out_path = "E:/corpus/bionlp2011/project_test/result/model1"

        WD = WordDictionary(self.source)
        WD.load("train")

        TD = TriggerDictionary(self.source)
        TD.load("train")

        self.builder = DocumentBuilder(self.source, WD, TD)
        self.a2writter = GeniaA2Writer(out_path)

    def test1(self):
        # write a2 output for a document built directly from the corpus
        doc_id = 'PMC-2222968-04-Results-03'
        o_doc = self.builder.build(doc_id)

        self.a2writter.write(o_doc)

    def test2(self):
        # write a2 output for a document processed through Prediction
        dir_name_eval = "test-model-002-cause"
        doc_ids = ['PMC-2222968-04-Results-03']
        dict_type = 'train'

        prediction = Prediction(self.source, dir_name_eval, dict_type)
        prediction.predict(doc_ids)

        o_doc = prediction.docs[doc_ids[0]]
        for sen in o_doc.sen:
            sen.test()
        self.a2writter.write(o_doc)

    def test3(self):
        # predict without letting Prediction write its own result files;
        # the a2 output is written here instead
        dir_name_eval = "test-model-013"
        doc_ids = ['PMID-1763325']
        dict_type = 'train'

        prediction = Prediction(self.source, dir_name_eval, dict_type)
        prediction.predict(doc_ids, write_result=False)

        o_doc = prediction.docs[doc_ids[0]]
        for sen in o_doc.sen:
            sen.test()

        self.a2writter.write(o_doc)
Exemplo n.º 3
0
    def build_dict(self, corpus_dir):
        """
        add trigger word to dictionary, including multi-word trigger
        ex. 
        triggers: "Negative Regulator", "Expression"
        store in dict:
        "Negative Regulator" <== bigram version 
        "Negative" <== head of bigram
        "Expression"  <== standard unigram
        """
        if corpus_dir not in self.CORPUS_DIR:
            raise ValueError("wrong value. choose 'dev', 'train', or 'mix'")

        # init document builder
        doc_builder = DocumentBuilder(self.src)

        # init default dict with counter
        td = defaultdict(Counter)

        # get list of document
        doc_ids = self.get_doc_ids(corpus_dir)

        for doc_id in doc_ids:
            o_doc = doc_builder.build(doc_id)

            for i in range(0, len(o_doc.sen)):
                o_sen = o_doc.sen[i]
                for twn in o_sen.trigger:
                    w = o_sen.words[twn]
                    ttype = w['type']
                    string = w['string'].lower()
                    stem = w['stem'].lower()

                    # skip short word
                    if len(string) < 4: continue

                    # adding unigram text and stem to dictionary
                    td[string][ttype] += 1
                    td[stem][ttype] += 1

                    # check full text
                    # add to dict if it's multi-word
                    full_string = o_sen.trigger_text[twn]
                    if ' ' in full_string:
                        td[full_string][ttype] += 1

        print "the dictionary contains:", len(td), "trigger words"

        return dict(td)
Exemplo n.º 4
0
class DocumentBuilderTest(object):
    '''
    classdocs
    '''


    def __init__(self):
        '''
        Constructor
        '''
        self.source = "E:/corpus/bionlp2011/project_data/"        
        
        WD = WordDictionary(self.source)    
        WD.load("train")
               
        TD = TriggerDictionary(self.source)
        TD.load("train")
        
        self.builder = DocumentBuilder(self.source, WD, TD)
        
    def run(self):
        self.test1()
        self.test2()
        self.test3()
    
    def test1(self):
        doc_id = "PMID-2160380"
        o_doc = self.builder.build(doc_id, is_test = False)
        
        print "Test 1: document from test corpus\n================================================="
        self.print_info(o_doc)
    
    def test2(self):        
        doc_id = "PMID-2083253"
        o_doc = self.builder.build(doc_id, is_test = True)
        
        print "\n\nTest 2: document from train corpus set is_test=True\n================================================="
        self.print_info(o_doc)
        
    def test3(self):        
        doc_id = "PMC-2222968-04-Results-03"
        o_doc = self.builder.build(doc_id)
        
        print "\n\nTest 3: document from dev corpus\n================================================="
        self.print_info(o_doc)
        
    def test4(self):
        # document builder without dictionary
        doc_id = 'PMID-8978306'
        
        o_doc = self.builder.build(doc_id)
        print "\n\nTest 4: document with multi word trigger\n================================================="
        self.print_info(o_doc)
        
    def print_info(self, o_doc):
        print "doc id:", o_doc.doc_id
        print "is test:", o_doc.is_test
        
        
        for i in range(0, len(o_doc.sen)):
            o_sen = o_doc.sen[i]
            print "sen:", i
            print "-------------------------------"
            for j in range(0,o_sen.nwords):
                w = o_sen.words[j]
                print j, w['start'], w['string'], w['pos_tag'], w['type'], w['score']
            # entity maps
            print "entity maps"
            print o_sen.entity_map
            # list of word number which is marked as trigger candidate
            print "trigger candidate:"
            print o_sen.trigger_candidate            
            # list of protein word number
            print "protein:"
            print o_sen.protein            
            # list of trigger word number
            print "trigger:"
            print o_sen.trigger      
            print o_sen.trigger_text      
            # dependency
            print "dependency"
            print o_sen.dep.root
            print o_sen.dep.graph
            print o_sen.dep.pair            
            # chunk
            print "chunk"
            print o_sen.chunk.chunk_map
            print o_sen.chunk.chunk_type
            
            # tree
            
            # relation representation
            print "relation:"
            if o_sen.rel != None:
                print o_sen.rel.data 
Exemplo n.º 5
0
class Prediction(object):
    '''
    classdocs
    '''
    
    # suffix and extension of id file
    DOCID_SUFFIX_EXT = "_doc_ids.json"
    
    # directory for saving svm model
    MODEL_DIR = "/model"
    
    # directory for saving output a2 file
    OUT_DIR = "/result"

    # list of event name
    EVENT_NAME = ["None",
                  "Gene_expression",
                  "Transcription",
                  "Protein_catabolism",
                  "Phosphorylation",
                  "Localization",
                  "Binding",
                  "Regulation",
                  "Positive_regulation",
                  "Negative_regulation"]
        

    def __init__(self, source, dir_name, dict_type):
        '''
        Constructor
        '''
        self.src = source
        self._model_path = '' 
        self._out_path = ''
        self.set_path(source, dir_name)
        
        self.dict_type = dict_type
        self.wdict = None
        self.tdict = None
        self.doc_builder = None
        self.extraction = None      
        
        self.docs = {}          
        
        self._set(dict_type)
    
    def _set(self, dict_type):
        """
        initialize dictionary type to be used in feature extraction process
        initialize document builder
        initialize feature extraction
        """       
        
        self.wdict = WordDictionary(self.src)    
        self.wdict.load(dict_type)
               
        self.tdict = TriggerDictionary(self.src)
        self.tdict.load(dict_type)
        
        self.doc_builder = DocumentBuilder(self.src, self.wdict, self.tdict)         
        self.extraction = FeatureExtraction(self.src, self.wdict, self.tdict)
        
        self.a2 = A2Writter(self._out_path)
        
    def set_path(self, source, dir_name):
        """
        check whether given dir_name is exist
        raise error if it does not exist
        return full _model_path of dir_name
        """
        # model path
        path = source + self.MODEL_DIR + '/' + dir_name
        if not os.path.exists(path):
            raise ValueError(path + "does not exist!!, chose another dir_name for prediction")        
        self._model_path = path
        
        # output path
        path = source + self.OUT_DIR + '/' + dir_name
        if not os.path.exists(path):
            os.makedirs(path)        
        self._out_path = path 
       
        
    def get_feature(self, step):
        """
        extract feature and return X, Y for a given step
        step are either one of these:
        'tp' => trigger-protein relation
        'tt' => trigger-trigger relation to predict regulation event with trigger argument  
        'tc' => trigger-theme-cause relation to predict regulation event with theme and cause (binary)
        't2' => trigger-theme1-theme2 relation to predict theme2 in binding (binary)
        """
        if step not in ['tt','tp','tc','t2']:
            raise ValueError("only support step for tt, tp, tc and t2")
        
        X = []
        Y = []
        info = []
        
        dt_start = dt.now()        
        
        # reset statistic of extraction
        self.extraction.reset_statistic()
                      
        # init feature
        print "now extracting", len(self.docs), "docs"
        for doc_id in self.docs.keys():             
            o_doc = self.docs[doc_id]
            if step == 'tp':
                samples = self.extraction.extract_tp(o_doc)
            elif step == 'tt':
                samples = self.extraction.extract_tt(o_doc)
            elif step == 'tc':
                samples = self.extraction.extract_tc(o_doc)
            elif step == 't2':
                samples = self.extraction.extract_t2(o_doc)
            
            for sample in samples:
                X.append(sample[2])
                Y.append(sample[1])      
                info.append(sample[0])             
                
        print "time to extract feature", dt.now() - dt_start
        
        return X,Y, info
    
    def set_prediction_docs(self,docid_list_fname, is_test = True):
        """
        build a document to be predicted
        """
        dt_start = dt.now()      
        self.docs = {}
        # get list of file
        doc_ids = self.get_docid_list(docid_list_fname)
        
        print "now building", len(doc_ids), "docs"
        for doc_id in doc_ids:
            self.docs[doc_id] = self.doc_builder.build(doc_id, is_test)
            
        print "finish built docs in:", dt.now() - dt_start

    def update_doc_info(self, list_info, list_target, arg_name, arg_type):
        """
        update trigger and relation of document
        """
        for i in range(0,len(list_info)):
            target = list_target[i]
            if target < 1: continue
            info = list_info[i]
            doc_id = info["doc"]
            self.docs[doc_id].update(info['sen'], info['t'], self.EVENT_NAME[target], info['a'], arg_name, arg_type)
            
    def update_doc_relation(self, rel_type, list_info, list_target):
        """
        update only relation of document
        """
        for i in range(0,len(list_info)):
            target = list_target[i]
            if target == 1:
                info = list_info[i]
                doc_id = info["doc"]
                
                if rel_type == 'cause':
                    arg = info['c']
                else:
                    arg = info['a2']
                self.docs[doc_id].update_relation(rel_type, info['sen'], info['t'], arg)

    
    def get_docid_list(self, docid_list_fname):
        """
        return list of file
        """
        if not isinstance(docid_list_fname, list):
            # get list of doc ids from file
            path = self.src + '/' + docid_list_fname + self.DOCID_SUFFIX_EXT
            if not os.path.exists(path):
                raise ValueError(path + " is not exist")
            with open(path, 'r') as f: 
                doc_ids = json.loads(f.read())
        else:
            doc_ids = docid_list_fname
        
        return doc_ids
    
    def predict_tp(self, grid_search = True):
        """
        return prediction of given docid_list
        """
        if self.docs == {}:
            raise ValueError("docs have not been created. call set_prediction_docs first!")
        # get list of file
        #doc_ids = self.get_docid_list(docid_list_fname)
        
        # get features and target
        X, Y, info = self.get_feature('tp')
        
        # init svm classifier
        svm = SVM(self._model_path, "trig-prot", "linear", grid_search = grid_search, class_weight = 'auto')
        svm.load()
        
        return svm.predict(X), Y, info
        
    def predict_tt(self, grid_search = True):
        """
        return prediction of given docid_list
        """
        if self.docs == {}:
            raise ValueError("docs have not been created. call set_prediction_docs first!")
        # get list of file
        #doc_ids = self.get_docid_list(docid_list_fname)
        
        # get features and target
        X, Y, info = self.get_feature('tt')
        
        # init svm classifier
        svm = SVM(self._model_path, "trig-trig", "linear", grid_search = grid_search, class_weight = 'auto')
        svm.load()
        
        return svm.predict(X), Y, info
    
    def predict_tc(self, grid_search = True):
        if self.docs == {}:
            raise ValueError("docs have not been created. call set_prediction_docs first!")
        # get list of file
        #doc_ids = self.get_docid_list(docid_list_fname)
        
        # get features and target
        X, Y, info = self.get_feature('tc')
        
        # init svm classifier
        svm = SVM(self._model_path, "trig-theme-cause", "linear", grid_search = grid_search, class_weight = 'auto')
        svm.load()
        
        return svm.predict(X), Y, info
    
    def predict_t2(self, grid_search = True):
        if self.docs == {}:
            raise ValueError("docs have not been created. call set_prediction_docs first!")

        # get features and target
        X, Y, info = self.get_feature('t2')
        
        # init svm classifier
        svm = SVM(self._model_path, "trig-theme1-2", "linear", grid_search = grid_search, class_weight = 'auto')
        svm.load()
        
        return svm.predict(X), Y, info
        
    def predict(self, docid_list_fname, write_result = True):
        
        # create document object for prediction
        self.set_prediction_docs(docid_list_fname)
        
        # predict trigger-protein relation
        Ypred, _, info = self.predict_tp(grid_search = True)
        # update document
        self.update_doc_info(info, Ypred, "Theme", "P")
        
        # predict trigger-trigger relation
        for _ in range(0,2):
            Ypred, _, info = self.predict_tt(grid_search = True)
            self.update_doc_info(info, Ypred, "Theme", "E")
        
        # predict trigger-theme-cause relation
        Ypred, _, info = self.predict_tc(grid_search = True)
        self.update_doc_relation('cause', info, Ypred)
        
        # predict theme2 relation
        Ypred, _, info = self.predict_t2(grid_search = True)
        self.update_doc_relation('theme2', info, Ypred)
        
        # write a2
        if write_result:
            self.write_result()
        
    def write_result(self):
        print "now writing", len(self.docs), "docs result to", self._out_path
        for doc in self.docs.itervalues():
            self.a2.write(doc)
Exemplo n.º 6
0
class Learning(object):
    '''
    Learning steps:
    1. define docs for learning
    2. extract features
    3. build input data for classifier
    4. build a model and save it
    '''

    # suffix and extension of id file
    DOCID_SUFFIX_EXT = "_doc_ids.json"

    # directory for saving svm model
    MODEL_DIR = "/model"

    def __init__(self, source, dir_name, dict_type):
        '''
        Constructor
        '''
        self.src = source
        self.path = self.set_path(source, dir_name)

        self.dict_type = dict_type
        self.wdict = None
        self.tdict = None
        self.doc_builder = None
        self.extraction = None

        self._set(dict_type)

    def set_path(self, source, dir_name):
        """
        check whether given dir_name is exist
        raise error if exist, otherwise create new one
        return full path of dir_name
        """
        path = source + self.MODEL_DIR + '/' + dir_name
        if os.path.exists(path):
            raise ValueError(path +
                             "exist!!, chose anoher dir_name for learning")
        else:
            # create dir_name
            os.makedirs(path)
        return path

    def _set(self, dict_type):
        """
        initialize dictionary type to be used in learning process
        initialize document builder
        initialize feature extraction
        """

        self.wdict = WordDictionary(self.src)
        self.wdict.load(dict_type)

        self.tdict = TriggerDictionary(self.src)
        self.tdict.load(dict_type)

        self.doc_builder = DocumentBuilder(self.src, self.wdict, self.tdict)
        self.extraction = FeatureExtraction(self.src, self.wdict, self.tdict)

    def get_docid_list(self, docid_list_fname):
        """
        return list of file
        """
        if not isinstance(docid_list_fname, list):
            # get list of doc ids from file
            path = self.src + '/' + docid_list_fname + self.DOCID_SUFFIX_EXT
            if not os.path.exists(path):
                raise ValueError(path + " is not exist")
            with open(path, 'r') as f:
                doc_ids = json.loads(f.read())
        else:
            doc_ids = docid_list_fname

        return doc_ids

    def get_feature(self, doc_ids, step):
        """
        extract feature and return X, Y for a given step
        step are either one of these:
        'tp' => trigger-protein relation
        'tt' => trigger-trigger relation to predict regulation event with trigger argument  
        'tc' => trigger-theme-cause relation to predict regulation event with theme and cause (binary)
        't2' => trigger-theme1-theme2 relation to predict theme2 in binding (binary)
        """
        if step not in ['tt', 'tp', 'tc', 't2']:
            raise ValueError("only support step for tt, tp, tc and t2")

        X = []
        Y = []

        dt_start = dt.now()

        # reset statistic of extraction
        self.extraction.reset_statistic()

        # init feature
        print "now extracting", len(doc_ids), "docs"
        for doc_id in doc_ids:
            o_doc = self.doc_builder.build(doc_id)
            if step == 'tp':
                samples = self.extraction.extract_tp(o_doc)
            elif step == 'tt':
                samples = self.extraction.extract_tt(o_doc)
            elif step == 'tc':
                samples = self.extraction.extract_tc(o_doc)
            elif step == 't2':
                samples = self.extraction.extract_t2(o_doc)

            for sample in samples:
                X.append(sample[2])
                Y.append(sample[1])

        # print statistic
        pos = self.extraction.sample_pos
        neg = self.extraction.sample_neg
        stat = (pos, neg, pos + neg)
        print stat
        print "percentege of positif data:", pos * 100.0 / (pos + neg)
        print "time to extract feature", dt.now() - dt_start

        return X, Y

    def learn_tp(self, docid_list_fname, grid_search):

        # get list of file
        doc_ids = self.get_docid_list(docid_list_fname)

        # get features and target
        X, Y = self.get_feature(doc_ids, 'tp')

        # init svm classifier
        svm = SVM(self.path,
                  'trig-prot',
                  'linear',
                  grid_search=grid_search,
                  class_weight='auto')
        svm.create()

        # fit training data
        svm.learn(X, Y)

    def learn_tt(self, docid_list_fname, grid_search):
        # get list of file
        doc_ids = self.get_docid_list(docid_list_fname)

        # get features and target
        X, Y = self.get_feature(doc_ids, 'tt')

        # init svm classifier
        svm = SVM(self.path,
                  'trig-trig',
                  'linear',
                  grid_search=grid_search,
                  class_weight='auto')
        svm.create()

        # fit training data
        svm.learn(X, Y)

    def learn_tc(self, docid_list_fname, grid_search):
        # get list of file
        doc_ids = self.get_docid_list(docid_list_fname)

        # get features and target
        X, Y = self.get_feature(doc_ids, 'tc')

        # init svm classifier
        svm = SVM(self.path,
                  'trig-theme-cause',
                  'linear',
                  grid_search=grid_search,
                  class_weight='auto')
        svm.create()

        # fit training data
        svm.learn(X, Y)

    def learn_t2(self, docid_list_fname, grid_search):
        # get list of file
        doc_ids = self.get_docid_list(docid_list_fname)

        # get features and target
        X, Y = self.get_feature(doc_ids, 't2')

        # init svm classifier
        svm = SVM(self.path,
                  'trig-theme1-2',
                  'linear',
                  grid_search=grid_search,
                  class_weight='auto')
        svm.create()

        # fit training data
        svm.learn(X, Y)
Exemplo n.º 7
0
class DocumentBuilderTest(object):
    '''
    classdocs
    '''
    def __init__(self):
        '''
        Constructor
        '''
        self.source = "E:/corpus/bionlp2011/project_data/"

        WD = WordDictionary(self.source)
        WD.load("train")

        TD = TriggerDictionary(self.source)
        TD.load("train")

        self.builder = DocumentBuilder(self.source, WD, TD)

    def run(self):
        self.test1()
        self.test2()
        self.test3()

    def test1(self):
        doc_id = "PMID-2160380"
        o_doc = self.builder.build(doc_id, is_test=False)

        print "Test 1: document from test corpus\n================================================="
        self.print_info(o_doc)

    def test2(self):
        doc_id = "PMID-2083253"
        o_doc = self.builder.build(doc_id, is_test=True)

        print "\n\nTest 2: document from train corpus set is_test=True\n================================================="
        self.print_info(o_doc)

    def test3(self):
        doc_id = "PMC-2222968-04-Results-03"
        o_doc = self.builder.build(doc_id)

        print "\n\nTest 3: document from dev corpus\n================================================="
        self.print_info(o_doc)

    def test4(self):
        # document builder without dictionary
        doc_id = 'PMID-8978306'

        o_doc = self.builder.build(doc_id)
        print "\n\nTest 4: document with multi word trigger\n================================================="
        self.print_info(o_doc)

    def print_info(self, o_doc):
        print "doc id:", o_doc.doc_id
        print "is test:", o_doc.is_test

        for i in range(0, len(o_doc.sen)):
            o_sen = o_doc.sen[i]
            print "sen:", i
            print "-------------------------------"
            for j in range(0, o_sen.nwords):
                w = o_sen.words[j]
                print j, w['start'], w['string'], w['pos_tag'], w['type'], w[
                    'score']
            # entity maps
            print "entity maps"
            print o_sen.entity_map
            # list of word number which is marked as trigger candidate
            print "trigger candidate:"
            print o_sen.trigger_candidate
            # list of protein word number
            print "protein:"
            print o_sen.protein
            # list of trigger word number
            print "trigger:"
            print o_sen.trigger
            print o_sen.trigger_text
            # dependency
            print "dependency"
            print o_sen.dep.root
            print o_sen.dep.graph
            print o_sen.dep.pair
            # chunk
            print "chunk"
            print o_sen.chunk.chunk_map
            print o_sen.chunk.chunk_type

            # tree

            # relation representation
            print "relation:"
            if o_sen.rel != None:
                print o_sen.rel.data