Пример #1
0
    def initialize_files(self, entity_type):
        """
        Point self.tagger at the CRF model for the given entity type.

        The CRFPP tagger for each supported entity type ('city', 'date')
        is created lazily and cached in a module-level global, so each
        model is loaded at most once per process.

        Args:
            entity_type: type of entity to tag
        """
        global CITY_MODEL_OBJECT, DATE_MODEL_OBJECT
        if entity_type == CITY_ENTITY_TYPE:
            self._model_path = CITY_MODEL_PATH
            if not CITY_MODEL_OBJECT:
                # First use: load and cache the city model.
                CITY_MODEL_OBJECT = CRFPP.Tagger(
                    "-m %s -v 3 -n2" % self._model_path)
                ner_logger.debug('CITY CRF model loaded %s' % self._model_path)
            self.tagger = CITY_MODEL_OBJECT
        elif entity_type == DATE_ENTITY_TYPE:
            self._model_path = DATE_MODEL_PATH
            if not DATE_MODEL_OBJECT:
                # First use: load and cache the date model.
                DATE_MODEL_OBJECT = CRFPP.Tagger(
                    "-m %s -v 3 -n2" % self._model_path)
                ner_logger.debug('date CRF model loaded %s' % self._model_path)
            self.tagger = DATE_MODEL_OBJECT
Пример #2
0
def create_model(seg_file=None, pos_file=None):
    """Load the segmentation and POS CRF++ models.

    Either path may be omitted, in which case the model bundled under
    this package's ``data`` directory is used.

    Returns:
        (seg_model, pos_model) tuple of CRFPP.Model objects.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    seg_file = seg_file or os.path.join(base_dir, 'data', 'crf_seg.model')
    pos_file = pos_file or os.path.join(base_dir, 'data', 'crf_pos.model')
    return (CRFPP.Model("-m %s" % seg_file),
            CRFPP.Model("-m %s" % pos_file))
Пример #3
0
def locationNER(text):
    """Extract location words from *text* with a character-level CRF model.

    Each character is tagged B/M/E/S; B starts a word, M and E extend it
    (E also closes it), and S is a single-character word.  Characters and
    tags are echoed to stdout as the tagger output is walked.
    """
    tagger = CRFPP.Tagger('-m model -v 3 -n2')
    for char in text:
        tagger.add(char)
    found = []
    tagger.parse()
    current = ''
    for row in range(tagger.size()):
        for col in range(tagger.xsize()):
            token = tagger.x(row, col)
            print(token)
            label = tagger.y2(row)
            print(label)
            if label == 'B':
                current = token
            elif label == 'M':
                current += token
            elif label == 'E':
                current += token
                found.append(current)
            elif label == 'S':
                current = token
                found.append(current)
    return found
Пример #4
0
def getCx(content_words):
    """
    Get the part-of-speech tag for each word.

    :param content_words: words, in sentence order
    :return: list of predicted tags, one per non-empty word; whatever was
             collected so far if the CRF tagger raises at runtime
    """
    tags = []
    # Bug fix: the original had `try:` with its except clause commented
    # out, which is a SyntaxError; restore a proper handler.
    try:
        tagger = CRFPP.Tagger("-m " + "crf_model_1")
        tagger.clear()
        for word in content_words:
            word = word.strip()
            if word:
                tagger.add(word.encode('utf-8'))
        tagger.parse()
        size = tagger.size()    # number of rows fed via tagger.add
        xsize = tagger.xsize()  # number of feature columns per row
        for i in range(0, size):
            for j in range(0, xsize):
                tag = tagger.y2(i)
                tags.append(tag)
    except RuntimeError as e:
        # CRF++ reports model/IO problems as RuntimeError; keep the
        # best-effort contract and return what we have.
        print("RuntimeError: ", e)
    return tags
Пример #5
0
 def __init__(self, model_path=None, args="-v 3 -n1"):
     """Create a CRF++ tagger, defaulting to the bundled segmentation model."""
     if model_path is None:
         model_path = os.path.join(os.path.dirname(__file__),
                                   "./../model/seg.crf.model")
     # Callers may pass args=None explicitly; restore the default flags.
     if args is None:
         args = "-v 3 -n1"
     self.model = CRFPP.Tagger("-m " + model_path + " " + args)
Пример #6
0
 def __init__(self, name):
     """Load the label set and CRF++ tagger for the model named *name*."""
     base_dir = train_path + name + "/"
     model_file = base_dir + "model"
     self.labels = read_labels(base_dir)
     self.tagger = CRFPP.Tagger("-m " + model_file + " -v 3 -n2")
     self.name = name
Пример #7
0
def load_model(path):
    """Return a CRFPP.Tagger for the model at *path*, or None if absent.

    -v 3 exposes deep information (alpha, beta, prob); -n2 enables
    n-best output.
    """
    import os, CRFPP
    if not os.path.exists(path):
        return None
    return CRFPP.Tagger('-m {0} -v 3 -n2'.format(path))
Пример #8
0
def selectDocByEntropy(pool_sentence_index, index_dic, datalines, model, n):
    """Rank pooled sentences by CRF tag entropy (active learning).

    Each sentence (a row range into *datalines*, looked up via
    *index_dic*) is fed to the tagger without its gold label column;
    the summed per-token tag-distribution entropy scores it.

    Returns the indices of the top *n* sentences (all of them when
    fewer than n are available).
    """
    tagger = CRFPP.Tagger("-m " + model + " -v 3 -n2")
    scored = []
    for idx in pool_sentence_index:
        begin, end = index_dic[idx]
        tagger.clear()
        # Strip the last (label) column from each feature row.
        for row in datalines[begin:end]:
            fields = row.split('\t')
            tagger.add('\t'.join(fields[:-1]))
        tagger.parse()

        tag_count = tagger.ysize()    # candidate tags per token
        token_count = tagger.size()   # tokens in this sentence

        ent = 0.0
        for i in range(token_count):
            for j in range(tag_count):
                p = tagger.prob(i, j)
                ent -= p * math.log(p) / math.log(2)
        scored.append((idx, ent))

    scored.sort(key=lambda pair: pair[1], reverse=True)

    chosen = scored[:n] if len(scored) > n else scored
    return [pair[0] for pair in chosen]
Пример #9
0
    def __init__(self, pathModel=r"./crfpp_model.bin"):
        """
        Initialize the CRF++ tagger and derive the tag set.

        Args:
            pathModel: string, path to the CRF++ model file

        Return:
            None
        """
        # -v 3: access deep information like alpha, beta, prob
        # -n N: enable N best output. N should be >= 2
        self.tagger = CRFPP.Tagger("-m " + pathModel + " -v 3 -n 2")
        # Clear internal context before any input is added.
        self.tagger.clear()

        # Number of input feature columns.
        print("column size: {}".format(self.tagger.xsize()))
        # Number of distinct output tags.
        print("tag size: {}".format(self.tagger.ysize()))
        self.tagsList = [self.tagger.yname(i)
                         for i in range(self.tagger.ysize())]
        print("tag set: {}".format(self.tagsList))
        self.classes_ = self.tagsList
        return
Пример #10
0
def processing():
    """Tag every document in the 'test' collection with the NER CRF model
    and store the tokens predicted as 'Y' back on the document as 'view'.

    Relies on module-level `jieba`, `pseg`, `db` and `CRFPP`.
    Bug fix: the original used Python 2 `except RuntimeError, e:` and the
    `print` statement, which are SyntaxErrors under Python 3.
    """
    jieba.load_userdict("../../../data/userdict.txt")
    tagger = CRFPP.Tagger("-m ../../NER/train/model")
    TestCol = db["test"]
    data = TestCol.find()
    try:
        for d in data:
            view = []
            content = d["content"]
            # Feed "word POS" rows to the CRF tagger.
            for w in pseg.cut(content):
                tag = w.word.encode("utf-8") + " " + w.flag.encode("utf-8")
                tagger.add(tag)
            tagger.parse()
            size = tagger.size()
            # Collect the surface tokens whose predicted label is 'Y'.
            for i in range(size):
                if tagger.y2(i) == "Y":
                    view.append(tagger.x(i, 0))
            d["view"] = view
            TestCol.save(d)
            tagger.clear()

    except RuntimeError as e:
        print("RuntimeError: ", e, end=' ')
Пример #11
0
    def __init__(self, table, sym, conf):
        """Build an LTGenCRF: read the model path and middle-label strategy
        from *conf* and create the CRF++ tagger."""
        super(LTGenCRF, self).__init__(table, sym)
        model_name = conf.get("log_template_crf", "model_filename")
        self._crf = CRFPP.Tagger("-m " + model_name + " -v 3 -n2")
        self._middle = conf.get("log_template_crf", "middle_label")
        # The "re" strategy delegates middle-token labelling to LabelWord.
        if self._middle == "re":
            import label_word
            self._lwobj = label_word.LabelWord(conf)
Пример #12
0
 def __init__(self, model_file, verb_level=2, best_out_n=2):
     """Initialize a CRF++ tagger wrapper.

     Args:
         model_file: path to the trained CRF++ model.
         verb_level: -v verbosity level (3 exposes alpha/beta/prob).
         best_out_n: -n n-best output size (should be >= 2).

     Bug fix: the original used Python 2 `except RuntimeError, e:` and
     the `print` statement, which are SyntaxErrors under Python 3.
     """
     try:
         self.m, self.v, self.bn = model_file, verb_level, best_out_n
         self.tagger = CRFPP.Tagger("-m %s -v %i -n%i" %
                                    (model_file, verb_level, best_out_n))
         logger.info("CRFPP Tagger initialized with command %s" %
                     ("-m %s -v %i -n%i" % (self.m, self.v, self.bn)))
     except RuntimeError as e:
         print("RuntimeError: ", e, end=' ')
Пример #13
0
    def use_model(self, model_path):
        """
        Use an already-trained CRF++ model.

        @type model_path: string
        @param model_path: the path of the model file
        """
        self.tagger = CRFPP.Tagger("-m %s -v3" % model_path)
Пример #14
0
 def __init__(self):
     """Load the mention-extraction CRF model, an English stemmer, and
     the tmVar mutation regular-expression lists."""
     self.regex_dna_mutation_str = utils.readlines(
         '/app/models/tmvar_regexes/DNAMutation.RegEx.txt')
     self.regex_protein_mutation_str = utils.readlines(
         '/app/models/tmvar_regexes/ProteinMutation.RegEx.txt')
     self.regex_snp_mutation_str = utils.readlines(
         '/app/models/tmvar_regexes/SNP.RegEx.txt')
     self.stemmer = SnowballStemmer('english')
     self.tagger = CRFPP.Tagger("-m /app/models/MentionExtractionUB.Model")
Пример #15
0
def crfTest(lst, model_name):
    """Run the CRF model selected by *model_name* over feature rows *lst*.

    Args:
        lst: sequence of feature rows; each row is tab-joined for the
             tagger, and the row's first element is the surface token.
        model_name: one of "DNA", "RNA" or "cell".

    Returns:
        list of [token, predicted_tag] pairs, one per input row.

    Raises:
        ValueError: for an unknown model_name (previously this fell
            through and crashed later with NameError on `tagger`).
    """
    model_paths = {
        "DNA": "bioNER/static/model-5-20-DNA-protein",
        "RNA": "bioNER/static/model-5-20-RNA-protein",
        "cell": "bioNER/static/model-5-20-cell-cell",
    }
    if model_name not in model_paths:
        raise ValueError("unknown model_name: %r" % (model_name,))
    tagger = CRFPP.Tagger("-m " + model_paths[model_name])
    for line in lst:
        tagger.add('\t'.join(map(str, line)))
    tagger.parse()
    taglst = [tagger.y2(i) for i in range(len(lst))]
    return [[row[0], tag] for row, tag in zip(lst, taglst)]
Пример #16
0
def main():
    """Parse command-line arguments, build the tagger, then run the
    requested test and/or evaluation passes."""
    parser = argparse.ArgumentParser()
    parser.add_argument('model', help='model path.')
    parser.add_argument('-n', '--num', help='top n result.')
    parser.add_argument('-e', '--evaldata', help='evaluation data with ground truth.')
    parser.add_argument('-t', '--testdata', help='test data without ground truth.')
    args = parser.parse_args()
    tagger = CRFPP.Tagger("-m {} -n {} ".format(args.model, args.num))
    if args.testdata:
        test(tagger, args.testdata)
    if args.evaldata:
        evaluate(tagger, args.evaldata)
Пример #17
0
def selectDocByTfidf(trainset, poolset, model, n):
    """Rank pool documents by average character-rate-weighted CRF entropy
    and return the top *n* filenames (all of them when fewer exist).

    Args:
        trainset: unused here; kept for interface compatibility.
        poolset: iterable of candidate document basenames.
        model: path to the CRF++ model file.
        n: maximum number of filenames to return.

    Fixes: removed the unused `chosen` and `tagCurr` locals, and the
    document file is now closed via a context manager even on error.
    """
    char_dic, char_filename_dic = calculateCharRate(poolset)

    tagger = CRFPP.Tagger("-m " + model + " -v 3 -n2")
    dic_filename_entropy = []
    for filename in poolset:
        tagger.clear()
        entropy_sum = 0.0
        line_num = 0
        # Each document is a CRF feature file; sentences are separated by
        # (near-)blank lines, which trigger a parse of the buffered rows.
        with open(os.path.join(samplefolder, filename + '.crf'), 'r') as f:
            for line in f:
                if len(line.strip()) <= 1:
                    tagger.parse()
                    ysize = tagger.ysize()  # candidate tags per token
                    size = tagger.size()    # buffered tokens

                    entropy = 0.0
                    for i in range(0, size):
                        wordCurr = tagger.x(i, 0)
                        # Weight each token's tag entropy by how many pool
                        # files contain that character.
                        rate = len(char_filename_dic[wordCurr])
                        word_entropy = 0.0
                        for j in range(0, ysize):
                            prob = tagger.prob(i, j)
                            word_entropy -= prob * math.log(prob) / math.log(2)
                        entropy += word_entropy * rate

                    entropy /= size

                    tagger.clear()
                    line_num += 1
                    entropy_sum += entropy
                else:
                    # Buffer the row without its gold label column.
                    words = line.split('\t')
                    tagger.add('\t'.join(words[:-1]))
        entropy_sum /= line_num
        dic_filename_entropy.append((filename, entropy_sum))

    dic_filename_entropy.sort(key=lambda x: x[1], reverse=True)

    if len(dic_filename_entropy) > n:
        return [x[0] for x in dic_filename_entropy[:n]]
    return [x[0] for x in dic_filename_entropy]
Пример #18
0
def segment(sentence, model):
    """Segment *sentence* with a CRF++ model.

    Tokens come from split_enumerate on '.', are normalized, and are fed
    to the tagger with positional feature labels: the first two tokens
    get 'LF<i>', the middle tokens 'MID', and the final token 'RHT'.
    Returns the anchors computed by crf_anchors, or [(0, 0)] for empty
    input; prints the error and returns None on a CRF++ RuntimeError.
    """
    try:
        tagger = CRFPP.Tagger("-m " + model)

        # clear internal context
        tagger.clear()

        l_features = features

        splitted = split_enumerate(sentence, '.')
        raws = [word for word, span in splitted]
        words = [(normalize(word), span) for word, span in splitted]
        words_length = len(words)

        raws = iter(raws)

        if words_length == 0:
            return [(0, 0)]
        else:
            # The last token is held out so it can receive the 'RHT' label.
            lastword, last_span = words.pop()
            words = iter(words)
            # NOTE(review): both tuple members use last_span[0]; the second
            # was possibly meant to be last_span[1].  last_span is never
            # read after this line, so behavior is unaffected — confirm
            # intent before relying on it.
            last_span = str(last_span[0]), str(last_span[0])

            # add context: the first (up to) two tokens get the
            # position-indexed 'LF%s' feature label.
            for i, (w, span) in enumerate(words):
                nextline = '\t'.join((next(raws), ) +
                                     l_features(w, u'LF%s' %
                                                (i, ))).encode('utf-8')
                tagger.add(nextline)

                if i >= 1:
                    break

            # Remaining tokens (iterator continues where the loop above
            # stopped) get the 'MID' feature label.
            for w, span in words:
                nextline = '\t'.join((next(raws), ) +
                                     l_features(w, u'MID')).encode('utf-8')
                tagger.add(nextline)

            # Finally, the held-out last token gets the 'RHT' label.
            nextline = '\t'.join((next(raws), ) +
                                 l_features(lastword, u'RHT')).encode('utf-8')
            tagger.add(nextline)

            # Parse and change internal stated as 'parsed'
            tagger.parse()
            anchors = crf_anchors(tagger, splitted)
            return anchors

    except RuntimeError as e:
        print("RuntimeError: ", e, end=' ')
Пример #19
0
 def __init__(
     self,
     modelfile=None,
     nbest=None,
 ):
     """Create a CRF++ tagger wrapper.

     Args:
         modelfile: path to the CRF++ model file (required).
         nbest: -n n-best output size; defaults to 1.

     Raises:
         ValueError: if modelfile is not given.  (The original used
             `assert False`, which silently disappears under `python -O`.)
     """
     if not nbest:
         nbest = 1
     if not modelfile:
         raise ValueError("modelfile is required")
     self.tagger = CRFPP.Tagger('-n ' + str(nbest) + ' -m ' + modelfile)
     self.tagger.clear()
     # NOTE(review): self.end reuses the "#SENT_BEG#" marker; possibly
     # "#SENT_END#" was intended — confirm against the training data
     # before changing it.
     self.begin = "#SENT_BEG#\tbegin\tOUT"
     self.end = "#SENT_BEG#\tend\tOUT"
     self.terms = []
Пример #20
0
def parse_sentence(pharse):
    """POS-tag *pharse* with NLTK, feed "word POS" rows to a CRF++ tagger,
    and parse them; prints the error on a CRF++ RuntimeError.

    Bug fix: the original used Python 2 `except RuntimeError, e:` and the
    `print` statement, which are SyntaxErrors under Python 3.
    """
    try:
        text = nltk.word_tokenize(pharse)
        POS = nltk.pos_tag(text)  # get the POS for CRF++
        tagger = CRFPP.Tagger("-m model_file -v 3 -n2")
        # clear internal context
        tagger.clear()

        # Add each word together with its POS as one CRF input row.
        for i in range(0, len(POS)):
            word = (POS[i][0] + ' ' + POS[i][1])
            tagger.add(word)
        tagger.parse()  # parse the sentence
    except RuntimeError as e:
        print("RuntimeError: ", e, end=' ')
Пример #21
0
    def initialize_files(self, entity_type):
        """
        Set up the CRF tagger and model path for *entity_type*.

        Only entity_type='city' is handled at the moment: it loads the
        city CRF model and initializes self.tagger and self._model_path.

        Args:
            entity_type: type of entity
        """
        if entity_type != CITY_ENTITY_TYPE:
            return
        self._model_path = CITY_MODEL_PATH
        self.tagger = CRFPP.Tagger("-m %s -v 3 -n2" % self._model_path)
Пример #22
0
    def chunk(self, string):
        """Chunk *string*: POS-tag its tokens, label them with a freshly
        loaded CRF++ chunking model, and return (indexes, tokens, labels).

        NOTE(review): self.tagger (used for POS tagging) and the local
        CRF++ chunk tagger are intentionally distinct objects here —
        confirm self.tagger is the POS tagger.

        Fix: removed a dead per-token loop whose computed label was
        discarded; the labels list below is the real output.
        """
        import CRFPP
        tagger = CRFPP.Tagger("-m " + self.modelFile)
        indexes, tokens = tokenize(string)
        tags = self.tagger.tag(tokens)
        tagger.clear()

        for word, posTag in tags:
            tagger.add(str("%s %s" % (word, posTag)))
        tagger.parse()
        labels = [tagger.y2(i) for i in range(len(tokens))]
        return indexes, tokens, labels
Пример #23
0
 def add_tagger(self, tag_data):
     """
     Build a parsed CRF++ tagger from the characters of *tag_data*.

     :param tag_data: raw text; each character becomes one tagger row
     :return: the parsed tagger (exits the process if self.model is missing)
     """
     text = tag_data.strip()
     if not os.path.exists(self.model):
         print('模型不存在,请确认模型路径是否正确!')
         exit()
     tagger = CRFPP.Tagger("-m {} -v 3 -n2".format(self.model))
     tagger.clear()
     for ch in text:
         tagger.add(ch)
     tagger.parse()
     return tagger
Пример #24
0
    def __init__(self, model_dir=__default_model_dir__):
        """Load the MI table, the word dictionary, and the CRF model.

        Args:
            model_dir: directory containing DefaultMI.pickle,
                WordDic.pickle and the DefaultModel CRF file.

        Fix: the two pickle files were opened without ever being closed;
        use context managers so the handles are released promptly.
        """
        MIDic = path.join(model_dir, 'DefaultMI.pickle')
        WordDic = path.join(model_dir, 'WordDic.pickle')
        CRFModel = path.join(model_dir, 'DefaultModel')

        # Load char-bigram mutual information.
        with open(MIDic, 'rb') as f:
            self.MIDic = pickle.load(f)

        # Load the simple word dictionary.
        with open(WordDic, 'rb') as f:
            self.WordDic = pickle.load(f)

        # CRF model filename and the tagger built from it.
        self.CRFModel = CRFModel
        self.tagger = CRFPP.Tagger("-m " + self.CRFModel)

        self._to_tokens_rule = re.compile(Segmentor._to_tokens_pattern)
Пример #25
0
 def add_tagger(self, tag_data_file):
     """
     Build a parsed CRF++ tagger from a corpus file.

     :param tag_data_file: path of the corpus file; each line is stripped
         and added as one tagger row
     :return: the parsed tagger (exits the process if self.model is missing)

     Fixes: the file handle was never closed and the path argument was
     shadowed by the open file object; use a context manager instead.
     """
     if not os.path.exists(self.model):
         print('模型不存在,请确认模型路径是否正确!')
         exit()
     tagger = CRFPP.Tagger("-m {} -v 3 -n2".format(self.model))
     tagger.clear()
     with open(tag_data_file, 'r') as corpus:
         for line in corpus:
             tagger.add(line.strip())
     tagger.parse()
     return tagger
 def locationNER(text):
     """Extract location words from *text* using B/M/E/S CRF tags:
     B starts a word, M extends it, E closes it, S is a one-char word."""
     tagger = CRFPP.Tagger("-m {0} -v 3 -n2".format("data/modelwithflag"))
     for char in text:
         tagger.add(char)
     tagger.parse()
     found = []
     current = ""
     for row in range(tagger.size()):
         for col in range(tagger.xsize()):
             token = tagger.x(row, col)
             label = tagger.y2(row)
             if label == "B":
                 current = token
             elif label == "M":
                 current += token
             elif label == "E":
                 current += token
                 found.append(current)
             elif label == "S":
                 current = token
                 found.append(current)
     return found
Пример #27
0
def crf_segmenter(input_file, output_file, tagger):
    """Segment *input_file* with a CRF B/M/E/S tagger and write the
    space-separated result to *output_file* (both UTF-8).

    Every character of each input line is fed to *tagger* as a
    "<char>\\to\\tB" row; after parsing, a space is written after each
    'E'- (word end) or 'S'- (single-char word) tagged character.

    Fix: the original had the __main__ guard and sys.exit() nested
    INSIDE this function, so any call exited the process or recursed;
    it also used a bare `print` followed by an orphan string literal.
    The CLI handling now lives in a proper module-level guard.
    """
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
    for line in input_data.readlines():
        tagger.clear()
        for word in line.strip():
            word = word.strip()
            if word:
                tagger.add((word + "\to\tB").encode('utf-8'))
        tagger.parse()
        size = tagger.size()
        xsize = tagger.xsize()
        for i in range(0, size):
            for j in range(0, xsize):
                char = tagger.x(i, j).decode('utf-8')
                tag = tagger.y2(i)
                # 'E' and 'S' close a word: emit a trailing space.
                if tag in ('E', 'S'):
                    output_data.write(char + ' ')
                else:  # 'B' or 'M': still inside a word
                    output_data.write(char)
        output_data.write('\n')
    input_data.close()
    output_data.close()


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("pls use: python crf_segmenter.py model input output")
        sys.exit()
    crf_model = sys.argv[1]
    input_file = sys.argv[2]
    output_file = sys.argv[3]
    tagger = CRFPP.Tagger("-m " + crf_model)
    crf_segmenter(input_file, output_file, tagger)
Пример #28
0
                if word:
                    tagger.add((word + '\to\tB'))
            tagger.parse()
            size = tagger.size()
            xsize = tagger.xsize()
            for i in range(size):
                for j in range(xsize):
                    char = tagger.x(i, j)
                    tag = tagger.y2(i)
                    if tag == 'B':
                        test_result_file.write(' ' + char)
                    elif tag == 'M':
                        test_result_file.write(char)
                    elif tag == 'E':
                        test_result_file.write(char + ' ')
                    else:
                        test_result_file.write(' ' + char + ' ')
            test_result_file.write('\n')

        test_file.close()
        test_result_file.close()


if __name__ == '__main__':
    # Build the CRF wrapper, segment the test data with a CRF++ tagger,
    # then run the evaluation and print its result.
    crf = CRF()
    # NOTE(review): '-m' is concatenated directly with crf_model (no
    # space), and crf_model must be defined at module level — confirm
    # both against the surrounding file.
    tagger = CRFPP.Tagger('-m' + crf_model)
    crf.segment(tagger)
    e = Evaluate()
    e.evaluate()
    e.result()
Пример #29
0
def load_model(path):
    """Return a CRFPP tagger for *path*, or None when the file is absent."""
    if not os.path.exists(path):
        return None
    return CRFPP.Tagger('-m{0} -v 3 -n 2'.format(path))
Пример #30
0
 def __setstate__(self, dict):
     """Restore pickled state, then rebuild the (unpicklable) CRF++ tagger
     from the saved model path, verbosity, and n-best settings."""
     self.__dict__.update(dict)
     command = "-m %s -v %i -n%i" % (self.m, self.v, self.bn)
     self.tagger = CRFPP.Tagger(command)